Diffstat (limited to 'arch/s390')
-rw-r--r-- arch/s390/Kbuild | 8
-rw-r--r-- arch/s390/Kconfig | 513
-rw-r--r-- arch/s390/Kconfig.debug | 29
-rw-r--r-- arch/s390/Makefile | 110
-rw-r--r-- arch/s390/appldata/appldata_base.c | 160
-rw-r--r-- arch/s390/appldata/appldata_mem.c | 6
-rw-r--r-- arch/s390/appldata/appldata_net_sum.c | 4
-rw-r--r-- arch/s390/appldata/appldata_os.c | 12
-rw-r--r-- arch/s390/boot/.gitignore | 4
-rw-r--r-- arch/s390/boot/Makefile | 90
-rw-r--r-- arch/s390/boot/als.c | 6
-rw-r--r-- arch/s390/boot/boot.h | 91
-rw-r--r-- arch/s390/boot/clz_ctz.c | 2
-rw-r--r-- arch/s390/boot/compressed/.gitignore | 2
-rw-r--r-- arch/s390/boot/compressed/Makefile | 68
-rw-r--r-- arch/s390/boot/compressed/decompressor.h | 30
-rw-r--r-- arch/s390/boot/decompressor.c (renamed from arch/s390/boot/compressed/decompressor.c) | 25
-rw-r--r-- arch/s390/boot/decompressor.h | 12
-rw-r--r-- arch/s390/boot/head.S | 461
-rw-r--r-- arch/s390/boot/head_kdump.S | 14
-rwxr-xr-x[-rw-r--r--] arch/s390/boot/install.sh | 23
-rw-r--r-- arch/s390/boot/ipl_data.c | 84
-rw-r--r-- arch/s390/boot/ipl_parm.c | 185
-rw-r--r-- arch/s390/boot/ipl_report.c | 100
-rw-r--r-- arch/s390/boot/kaslr.c | 176
-rw-r--r-- arch/s390/boot/mem_detect.c | 175
-rw-r--r-- arch/s390/boot/pgm_check_info.c | 217
-rw-r--r-- arch/s390/boot/physmem_info.c | 328
-rw-r--r-- arch/s390/boot/sclp_early_core.c | 9
-rw-r--r-- arch/s390/boot/startup.c | 408
-rw-r--r-- arch/s390/boot/string.c | 1
-rw-r--r-- arch/s390/boot/text_dma.S | 184
-rw-r--r-- arch/s390/boot/uv.c | 73
-rw-r--r-- arch/s390/boot/uv.h | 22
-rw-r--r-- arch/s390/boot/version.c | 1
-rw-r--r-- arch/s390/boot/vmem.c | 474
-rw-r--r-- arch/s390/boot/vmlinux.lds.S (renamed from arch/s390/boot/compressed/vmlinux.lds.S) | 88
-rw-r--r-- arch/s390/configs/btf.config | 2
-rw-r--r-- arch/s390/configs/debug_defconfig | 315
-rw-r--r-- arch/s390/configs/defconfig | 266
-rw-r--r-- arch/s390/configs/kasan.config | 4
-rw-r--r-- arch/s390/configs/zfcpdump_defconfig | 41
-rw-r--r-- arch/s390/crypto/Kconfig | 135
-rw-r--r-- arch/s390/crypto/Makefile | 4
-rw-r--r-- arch/s390/crypto/aes_s390.c | 44
-rw-r--r-- arch/s390/crypto/arch_random.c | 108
-rw-r--r-- arch/s390/crypto/chacha-glue.c | 130
-rw-r--r-- arch/s390/crypto/chacha-s390.S | 908
-rw-r--r-- arch/s390/crypto/chacha-s390.h | 14
-rw-r--r-- arch/s390/crypto/crc32-vx.c | 10
-rw-r--r-- arch/s390/crypto/crc32be-vx.S | 23
-rw-r--r-- arch/s390/crypto/crc32le-vx.S | 30
-rw-r--r-- arch/s390/crypto/des_s390.c | 4
-rw-r--r-- arch/s390/crypto/ghash_s390.c | 6
-rw-r--r-- arch/s390/crypto/paes_s390.c | 286
-rw-r--r-- arch/s390/crypto/prng.c | 77
-rw-r--r-- arch/s390/crypto/sha.h | 3
-rw-r--r-- arch/s390/crypto/sha1_s390.c | 16
-rw-r--r-- arch/s390/crypto/sha256_s390.c | 4
-rw-r--r-- arch/s390/crypto/sha3_256_s390.c | 3
-rw-r--r-- arch/s390/crypto/sha3_512_s390.c | 3
-rw-r--r-- arch/s390/crypto/sha512_s390.c | 36
-rw-r--r-- arch/s390/hypfs/Makefile | 11
-rw-r--r-- arch/s390/hypfs/hypfs.h | 10
-rw-r--r-- arch/s390/hypfs/hypfs_dbfs.c | 31
-rw-r--r-- arch/s390/hypfs/hypfs_diag.c | 463
-rw-r--r-- arch/s390/hypfs/hypfs_diag.h | 35
-rw-r--r-- arch/s390/hypfs/hypfs_diag0c.c | 14
-rw-r--r-- arch/s390/hypfs/hypfs_diag_fs.c | 393
-rw-r--r-- arch/s390/hypfs/hypfs_sprp.c | 13
-rw-r--r-- arch/s390/hypfs/hypfs_vm.c | 182
-rw-r--r-- arch/s390/hypfs/hypfs_vm.h | 50
-rw-r--r-- arch/s390/hypfs/hypfs_vm_fs.c | 139
-rw-r--r-- arch/s390/hypfs/inode.c | 50
-rw-r--r-- arch/s390/include/asm/Kbuild | 19
-rw-r--r-- arch/s390/include/asm/abs_lowcore.h | 27
-rw-r--r-- arch/s390/include/asm/airq.h | 8
-rw-r--r-- arch/s390/include/asm/alternative-asm.h | 70
-rw-r--r-- arch/s390/include/asm/alternative.h | 102
-rw-r--r-- arch/s390/include/asm/ap.h | 436
-rw-r--r-- arch/s390/include/asm/appldata.h | 2
-rw-r--r-- arch/s390/include/asm/archrandom.h | 49
-rw-r--r-- arch/s390/include/asm/asm-const.h | 12
-rw-r--r-- arch/s390/include/asm/asm-extable.h | 87
-rw-r--r-- arch/s390/include/asm/asm-prototypes.h | 4
-rw-r--r-- arch/s390/include/asm/atomic.h | 110
-rw-r--r-- arch/s390/include/asm/atomic_ops.h | 76
-rw-r--r-- arch/s390/include/asm/barrier.h | 24
-rw-r--r-- arch/s390/include/asm/bitops.h | 191
-rw-r--r-- arch/s390/include/asm/bug.h | 19
-rw-r--r-- arch/s390/include/asm/bugs.h | 21
-rw-r--r-- arch/s390/include/asm/cache.h | 2
-rw-r--r-- arch/s390/include/asm/ccwdev.h | 28
-rw-r--r-- arch/s390/include/asm/ccwgroup.h | 19
-rw-r--r-- arch/s390/include/asm/checksum.h | 144
-rw-r--r-- arch/s390/include/asm/chsc.h | 69
-rw-r--r-- arch/s390/include/asm/cio.h | 9
-rw-r--r-- arch/s390/include/asm/clocksource.h | 7
-rw-r--r-- arch/s390/include/asm/clp.h | 3
-rw-r--r-- arch/s390/include/asm/cmpxchg.h | 223
-rw-r--r-- arch/s390/include/asm/compat.h | 158
-rw-r--r-- arch/s390/include/asm/cpacf.h | 215
-rw-r--r-- arch/s390/include/asm/cpu.h | 3
-rw-r--r-- arch/s390/include/asm/cpu_mcf.h | 126
-rw-r--r-- arch/s390/include/asm/cpu_mf.h | 99
-rw-r--r-- arch/s390/include/asm/cpufeature.h | 23
-rw-r--r-- arch/s390/include/asm/cputime.h | 19
-rw-r--r-- arch/s390/include/asm/crw.h | 1
-rw-r--r-- arch/s390/include/asm/css_chars.h | 4
-rw-r--r-- arch/s390/include/asm/ctl_reg.h | 120
-rw-r--r-- arch/s390/include/asm/ctlreg.h | 255
-rw-r--r-- arch/s390/include/asm/debug.h | 149
-rw-r--r-- arch/s390/include/asm/delay.h | 11
-rw-r--r-- arch/s390/include/asm/diag.h | 45
-rw-r--r-- arch/s390/include/asm/dma.h | 10
-rw-r--r-- arch/s390/include/asm/eadm.h | 4
-rw-r--r-- arch/s390/include/asm/elf.h | 98
-rw-r--r-- arch/s390/include/asm/entry-common.h | 63
-rw-r--r-- arch/s390/include/asm/extable.h | 52
-rw-r--r-- arch/s390/include/asm/facility.h | 37
-rw-r--r-- arch/s390/include/asm/fault.h | 28
-rw-r--r-- arch/s390/include/asm/fcx.h | 6
-rw-r--r-- arch/s390/include/asm/fpu/api.h | 40
-rw-r--r-- arch/s390/include/asm/fpu/internal.h | 15
-rw-r--r-- arch/s390/include/asm/ftrace.h | 148
-rw-r--r-- arch/s390/include/asm/ftrace.lds.h | 21
-rw-r--r-- arch/s390/include/asm/futex.h | 12
-rw-r--r-- arch/s390/include/asm/gmap.h | 45
-rw-r--r-- arch/s390/include/asm/hardirq.h | 1
-rw-r--r-- arch/s390/include/asm/hugetlb.h | 54
-rw-r--r-- arch/s390/include/asm/hw_irq.h | 1
-rw-r--r-- arch/s390/include/asm/idals.h | 14
-rw-r--r-- arch/s390/include/asm/idle.h | 15
-rw-r--r-- arch/s390/include/asm/io.h | 16
-rw-r--r-- arch/s390/include/asm/ipl.h | 37
-rw-r--r-- arch/s390/include/asm/irq.h | 32
-rw-r--r-- arch/s390/include/asm/irq_work.h | 10
-rw-r--r-- arch/s390/include/asm/irqflags.h | 16
-rw-r--r-- arch/s390/include/asm/jump_label.h | 11
-rw-r--r-- arch/s390/include/asm/kasan.h | 14
-rw-r--r-- arch/s390/include/asm/kdebug.h | 2
-rw-r--r-- arch/s390/include/asm/kexec.h | 32
-rw-r--r-- arch/s390/include/asm/kfence.h | 42
-rw-r--r-- arch/s390/include/asm/kprobes.h | 7
-rw-r--r-- arch/s390/include/asm/kvm_host.h | 247
-rw-r--r-- arch/s390/include/asm/kvm_para.h | 229
-rw-r--r-- arch/s390/include/asm/linkage.h | 31
-rw-r--r-- arch/s390/include/asm/livepatch.h | 21
-rw-r--r-- arch/s390/include/asm/lowcore.h | 109
-rw-r--r-- arch/s390/include/asm/maccess.h | 20
-rw-r--r-- arch/s390/include/asm/mem_detect.h | 94
-rw-r--r-- arch/s390/include/asm/mem_encrypt.h | 6
-rw-r--r-- arch/s390/include/asm/mmu.h | 21
-rw-r--r-- arch/s390/include/asm/mmu_context.h | 105
-rw-r--r-- arch/s390/include/asm/module.h | 14
-rw-r--r-- arch/s390/include/asm/msi.h | 17
-rw-r--r-- arch/s390/include/asm/nmi.h | 13
-rw-r--r-- arch/s390/include/asm/nospec-branch.h | 5
-rw-r--r-- arch/s390/include/asm/nospec-insn.h | 156
-rw-r--r-- arch/s390/include/asm/numa.h | 13
-rw-r--r-- arch/s390/include/asm/os_info.h | 10
-rw-r--r-- arch/s390/include/asm/page-states.h | 59
-rw-r--r-- arch/s390/include/asm/page.h | 96
-rw-r--r-- arch/s390/include/asm/pai.h | 84
-rw-r--r-- arch/s390/include/asm/pci.h | 110
-rw-r--r-- arch/s390/include/asm/pci_clp.h | 44
-rw-r--r-- arch/s390/include/asm/pci_debug.h | 7
-rw-r--r-- arch/s390/include/asm/pci_dma.h | 130
-rw-r--r-- arch/s390/include/asm/pci_insn.h | 29
-rw-r--r-- arch/s390/include/asm/pci_io.h | 41
-rw-r--r-- arch/s390/include/asm/percpu.h | 59
-rw-r--r-- arch/s390/include/asm/perf_event.h | 2
-rw-r--r-- arch/s390/include/asm/pfault.h | 26
-rw-r--r-- arch/s390/include/asm/pgalloc.h | 59
-rw-r--r-- arch/s390/include/asm/pgtable.h | 691
-rw-r--r-- arch/s390/include/asm/physmem_info.h | 172
-rw-r--r-- arch/s390/include/asm/pkey.h | 4
-rw-r--r-- arch/s390/include/asm/preempt.h | 39
-rw-r--r-- arch/s390/include/asm/processor.h | 243
-rw-r--r-- arch/s390/include/asm/ptdump.h | 14
-rw-r--r-- arch/s390/include/asm/ptrace.h | 105
-rw-r--r-- arch/s390/include/asm/qdio.h | 151
-rw-r--r-- arch/s390/include/asm/rwonce.h | 31
-rw-r--r-- arch/s390/include/asm/sclp.h | 38
-rw-r--r-- arch/s390/include/asm/scsw.h | 89
-rw-r--r-- arch/s390/include/asm/seccomp.h | 9
-rw-r--r-- arch/s390/include/asm/sections.h | 20
-rw-r--r-- arch/s390/include/asm/serial.h | 7
-rw-r--r-- arch/s390/include/asm/set_memory.h | 74
-rw-r--r-- arch/s390/include/asm/setup.h | 96
-rw-r--r-- arch/s390/include/asm/shmparam.h | 12
-rw-r--r-- arch/s390/include/asm/sigp.h | 14
-rw-r--r-- arch/s390/include/asm/smp.h | 14
-rw-r--r-- arch/s390/include/asm/softirq_stack.h | 14
-rw-r--r-- arch/s390/include/asm/spinlock.h | 13
-rw-r--r-- arch/s390/include/asm/spinlock_types.h | 4
-rw-r--r-- arch/s390/include/asm/stacktrace.h | 230
-rw-r--r-- arch/s390/include/asm/stp.h | 100
-rw-r--r-- arch/s390/include/asm/string.h | 78
-rw-r--r-- arch/s390/include/asm/syscall.h | 94
-rw-r--r-- arch/s390/include/asm/syscall_wrapper.h | 139
-rw-r--r-- arch/s390/include/asm/sysinfo.h | 10
-rw-r--r-- arch/s390/include/asm/termios.h | 26
-rw-r--r-- arch/s390/include/asm/text-patching.h | 16
-rw-r--r-- arch/s390/include/asm/thread_info.h | 22
-rw-r--r-- arch/s390/include/asm/timex.h | 124
-rw-r--r-- arch/s390/include/asm/tlb.h | 37
-rw-r--r-- arch/s390/include/asm/tlbflush.h | 8
-rw-r--r-- arch/s390/include/asm/topology.h | 13
-rw-r--r-- arch/s390/include/asm/tpi.h | 37
-rw-r--r-- arch/s390/include/asm/types.h | 19
-rw-r--r-- arch/s390/include/asm/uaccess.h | 689
-rw-r--r-- arch/s390/include/asm/unistd.h | 1
-rw-r--r-- arch/s390/include/asm/unwind.h | 23
-rw-r--r-- arch/s390/include/asm/user.h | 4
-rw-r--r-- arch/s390/include/asm/uv.h | 409
-rw-r--r-- arch/s390/include/asm/vdso.h | 71
-rw-r--r-- arch/s390/include/asm/vdso/clocksource.h | 8
-rw-r--r-- arch/s390/include/asm/vdso/data.h | 13
-rw-r--r-- arch/s390/include/asm/vdso/gettimeofday.h | 63
-rw-r--r-- arch/s390/include/asm/vdso/processor.h | 7
-rw-r--r-- arch/s390/include/asm/vdso/vsyscall.h | 26
-rw-r--r-- arch/s390/include/asm/vga.h | 7
-rw-r--r-- arch/s390/include/asm/vmalloc.h | 4
-rw-r--r-- arch/s390/include/asm/vtime.h | 15
-rw-r--r-- arch/s390/include/asm/vtimer.h | 2
-rw-r--r-- arch/s390/include/asm/vx-insn-asm.h | 681
-rw-r--r-- arch/s390/include/asm/vx-insn.h | 554
-rw-r--r-- arch/s390/include/asm/word-at-a-time.h | 64
-rw-r--r-- arch/s390/include/uapi/asm/cmb.h | 2
-rw-r--r-- arch/s390/include/uapi/asm/dasd.h | 18
-rw-r--r-- arch/s390/include/uapi/asm/debug.h | 35
-rw-r--r-- arch/s390/include/uapi/asm/fs3270.h | 25
-rw-r--r-- arch/s390/include/uapi/asm/hwctrset.h | 51
-rw-r--r-- arch/s390/include/uapi/asm/ipl.h | 54
-rw-r--r-- arch/s390/include/uapi/asm/kvm.h | 24
-rw-r--r-- arch/s390/include/uapi/asm/pkey.h | 145
-rw-r--r-- arch/s390/include/uapi/asm/ptrace.h | 128
-rw-r--r-- arch/s390/include/uapi/asm/raw3270.h | 75
-rw-r--r-- arch/s390/include/uapi/asm/schid.h | 3
-rw-r--r-- arch/s390/include/uapi/asm/setup.h | 13
-rw-r--r-- arch/s390/include/uapi/asm/sie.h | 2
-rw-r--r-- arch/s390/include/uapi/asm/signal.h | 26
-rw-r--r-- arch/s390/include/uapi/asm/statfs.h | 4
-rw-r--r-- arch/s390/include/uapi/asm/termios.h | 50
-rw-r--r-- arch/s390/include/uapi/asm/types.h | 15
-rw-r--r-- arch/s390/include/uapi/asm/uvdevice.h | 102
-rw-r--r-- arch/s390/include/uapi/asm/zcrypt.h | 187
-rw-r--r-- arch/s390/kernel/.gitignore | 1
-rw-r--r-- arch/s390/kernel/Makefile | 40
-rw-r--r-- arch/s390/kernel/abs_lowcore.c | 46
-rw-r--r-- arch/s390/kernel/alternative.c | 84
-rw-r--r-- arch/s390/kernel/asm-offsets.c | 130
-rw-r--r-- arch/s390/kernel/audit.c | 12
-rw-r--r-- arch/s390/kernel/base.S | 63
-rw-r--r-- arch/s390/kernel/cache.c | 9
-rw-r--r-- arch/s390/kernel/cert_store.c | 812
-rw-r--r-- arch/s390/kernel/compat_audit.c | 13
-rw-r--r-- arch/s390/kernel/compat_linux.h | 89
-rw-r--r-- arch/s390/kernel/compat_signal.c | 34
-rw-r--r-- arch/s390/kernel/cpcmd.c | 42
-rw-r--r-- arch/s390/kernel/cpufeature.c | 46
-rw-r--r-- arch/s390/kernel/crash_dump.c | 188
-rw-r--r-- arch/s390/kernel/ctlreg.c | 121
-rw-r--r-- arch/s390/kernel/debug.c | 350
-rw-r--r-- arch/s390/kernel/diag.c | 108
-rw-r--r-- arch/s390/kernel/dis.c | 31
-rw-r--r-- arch/s390/kernel/dumpstack.c | 69
-rw-r--r-- arch/s390/kernel/early.c | 155
-rw-r--r-- arch/s390/kernel/early_printk.c | 2
-rw-r--r-- arch/s390/kernel/earlypgm.S | 23
-rw-r--r-- arch/s390/kernel/ebcdic.c | 2
-rw-r--r-- arch/s390/kernel/entry.S | 1475
-rw-r--r-- arch/s390/kernel/entry.h | 62
-rw-r--r-- arch/s390/kernel/facility.c | 21
-rw-r--r-- arch/s390/kernel/fpu.c | 94
-rw-r--r-- arch/s390/kernel/ftrace.c | 335
-rw-r--r-- arch/s390/kernel/ftrace.h | 24
-rw-r--r-- arch/s390/kernel/guarded_storage.c | 6
-rw-r--r-- arch/s390/kernel/head64.S | 26
-rw-r--r-- arch/s390/kernel/idle.c | 130
-rw-r--r-- arch/s390/kernel/ipl.c | 874
-rw-r--r-- arch/s390/kernel/ipl_vmparm.c | 2
-rw-r--r-- arch/s390/kernel/irq.c | 161
-rw-r--r-- arch/s390/kernel/jump_label.c | 42
-rw-r--r-- arch/s390/kernel/kprobes.c | 343
-rw-r--r-- arch/s390/kernel/kprobes.h | 9
-rw-r--r-- arch/s390/kernel/kprobes_insn_page.S | 22
-rw-r--r-- arch/s390/kernel/lgr.c | 5
-rw-r--r-- arch/s390/kernel/machine_kexec.c | 112
-rw-r--r-- arch/s390/kernel/machine_kexec_file.c | 114
-rw-r--r-- arch/s390/kernel/machine_kexec_reloc.c | 1
-rw-r--r-- arch/s390/kernel/mcount.S | 183
-rw-r--r-- arch/s390/kernel/module.c | 273
-rw-r--r-- arch/s390/kernel/nmi.c | 381
-rw-r--r-- arch/s390/kernel/nospec-branch.c | 50
-rw-r--r-- arch/s390/kernel/nospec-sysfs.c | 4
-rw-r--r-- arch/s390/kernel/numa.c | 35
-rw-r--r-- arch/s390/kernel/os_info.c | 23
-rw-r--r-- arch/s390/kernel/perf_cpum_cf.c | 1792
-rw-r--r-- arch/s390/kernel/perf_cpum_cf_common.c | 201
-rw-r--r-- arch/s390/kernel/perf_cpum_cf_diag.c | 705
-rw-r--r-- arch/s390/kernel/perf_cpum_cf_events.c | 277
-rw-r--r-- arch/s390/kernel/perf_cpum_sf.c | 445
-rw-r--r-- arch/s390/kernel/perf_event.c | 64
-rw-r--r-- arch/s390/kernel/perf_pai_crypto.c | 773
-rw-r--r-- arch/s390/kernel/perf_pai_ext.c | 671
-rw-r--r-- arch/s390/kernel/perf_regs.c | 9
-rw-r--r-- arch/s390/kernel/pgm_check.S | 147
-rw-r--r-- arch/s390/kernel/process.c | 120
-rw-r--r-- arch/s390/kernel/processor.c | 228
-rw-r--r-- arch/s390/kernel/ptrace.c | 539
-rw-r--r-- arch/s390/kernel/reipl.S | 10
-rw-r--r-- arch/s390/kernel/relocate_kernel.S | 100
-rw-r--r-- arch/s390/kernel/rethook.c | 34
-rw-r--r-- arch/s390/kernel/rethook.h | 7
-rw-r--r-- arch/s390/kernel/runtime_instr.c | 2
-rw-r--r-- arch/s390/kernel/setup.c | 741
-rw-r--r-- arch/s390/kernel/signal.c | 72
-rw-r--r-- arch/s390/kernel/smp.c | 463
-rw-r--r-- arch/s390/kernel/stacktrace.c | 60
-rw-r--r-- arch/s390/kernel/sthyi.c | 23
-rw-r--r-- arch/s390/kernel/suspend.c | 240
-rw-r--r-- arch/s390/kernel/swsusp.S | 276
-rw-r--r-- arch/s390/kernel/syscall.c (renamed from arch/s390/kernel/sys_s390.c) | 68
-rw-r--r-- arch/s390/kernel/syscalls/Makefile | 3
-rw-r--r-- arch/s390/kernel/syscalls/syscall.tbl | 58
-rw-r--r-- arch/s390/kernel/sysinfo.c | 30
-rw-r--r-- arch/s390/kernel/text_amode31.S | 159
-rw-r--r-- arch/s390/kernel/time.c | 503
-rw-r--r-- arch/s390/kernel/topology.c | 143
-rw-r--r-- arch/s390/kernel/trace.c | 2
-rw-r--r-- arch/s390/kernel/traps.c | 200
-rw-r--r-- arch/s390/kernel/unwind_bc.c | 27
-rw-r--r-- arch/s390/kernel/uprobes.c | 23
-rw-r--r-- arch/s390/kernel/uv.c | 717
-rw-r--r-- arch/s390/kernel/vdso.c | 406
-rw-r--r-- arch/s390/kernel/vdso32/.gitignore | 2
-rw-r--r-- arch/s390/kernel/vdso32/Makefile | 70
-rwxr-xr-x arch/s390/kernel/vdso32/gen_vdso_offsets.sh | 15
-rw-r--r-- arch/s390/kernel/vdso32/note.S | 13
-rw-r--r-- arch/s390/kernel/vdso32/vdso32.lds.S | 142
-rw-r--r-- arch/s390/kernel/vdso32/vdso32_wrapper.S | 15
-rw-r--r-- arch/s390/kernel/vdso32/vdso_user_wrapper.S | 22
-rw-r--r-- arch/s390/kernel/vdso64/.gitignore | 1
-rw-r--r-- arch/s390/kernel/vdso64/Makefile | 55
-rw-r--r-- arch/s390/kernel/vdso64/clock_getres.S | 50
-rw-r--r-- arch/s390/kernel/vdso64/clock_gettime.S | 163
-rwxr-xr-x arch/s390/kernel/vdso64/gen_vdso_offsets.sh | 15
-rw-r--r-- arch/s390/kernel/vdso64/getcpu.S | 31
-rw-r--r-- arch/s390/kernel/vdso64/getcpu.c | 21
-rw-r--r-- arch/s390/kernel/vdso64/gettimeofday.S | 71
-rw-r--r-- arch/s390/kernel/vdso64/vdso.h | 14
-rw-r--r-- arch/s390/kernel/vdso64/vdso64.lds.S | 14
-rw-r--r-- arch/s390/kernel/vdso64/vdso64_generic.c | 19
-rw-r--r-- arch/s390/kernel/vdso64/vdso_user_wrapper.S | 57
-rw-r--r-- arch/s390/kernel/vmlinux.lds.S | 60
-rw-r--r-- arch/s390/kernel/vtime.c | 64
-rw-r--r-- arch/s390/kvm/Kconfig | 16
-rw-r--r-- arch/s390/kvm/Makefile | 8
-rw-r--r-- arch/s390/kvm/diag.c | 56
-rw-r--r-- arch/s390/kvm/gaccess.c | 746
-rw-r--r-- arch/s390/kvm/gaccess.h | 153
-rw-r--r-- arch/s390/kvm/guestdbg.c | 12
-rw-r--r-- arch/s390/kvm/intercept.c | 239
-rw-r--r-- arch/s390/kvm/interrupt.c | 661
-rw-r--r-- arch/s390/kvm/irq.h | 19
-rw-r--r-- arch/s390/kvm/kvm-s390.c | 2479
-rw-r--r-- arch/s390/kvm/kvm-s390.h | 117
-rw-r--r-- arch/s390/kvm/pci.c | 704
-rw-r--r-- arch/s390/kvm/pci.h | 87
-rw-r--r-- arch/s390/kvm/priv.c | 195
-rw-r--r-- arch/s390/kvm/pv.c | 894
-rw-r--r-- arch/s390/kvm/sigp.c | 48
-rw-r--r-- arch/s390/kvm/trace-s390.h | 23
-rw-r--r-- arch/s390/kvm/vsie.c | 234
-rw-r--r-- arch/s390/lib/Makefile | 11
-rw-r--r-- arch/s390/lib/delay.c | 114
-rw-r--r-- arch/s390/lib/error-inject.c | 14
-rw-r--r-- arch/s390/lib/expoline/Makefile | 3
-rw-r--r-- arch/s390/lib/expoline/expoline.S | 12
-rw-r--r-- arch/s390/lib/mem.S | 30
-rw-r--r-- arch/s390/lib/spinlock.c | 9
-rw-r--r-- arch/s390/lib/string.c | 182
-rw-r--r-- arch/s390/lib/test_kprobes.c | 75
-rw-r--r-- arch/s390/lib/test_kprobes.h | 10
-rw-r--r-- arch/s390/lib/test_kprobes_asm.S | 45
-rw-r--r-- arch/s390/lib/test_modules.c | 32
-rw-r--r-- arch/s390/lib/test_modules.h | 53
-rw-r--r-- arch/s390/lib/test_modules_helpers.c | 13
-rw-r--r-- arch/s390/lib/test_unwind.c | 419
-rw-r--r-- arch/s390/lib/tishift.S | 63
-rw-r--r-- arch/s390/lib/uaccess.c | 524
-rw-r--r-- arch/s390/lib/xor.c | 26
-rw-r--r-- arch/s390/mm/Makefile | 8
-rw-r--r-- arch/s390/mm/cmm.c | 87
-rw-r--r-- arch/s390/mm/dump_pagetables.c | 413
-rw-r--r-- arch/s390/mm/extable.c | 104
-rw-r--r-- arch/s390/mm/extmem.c | 55
-rw-r--r-- arch/s390/mm/fault.c | 871
-rw-r--r-- arch/s390/mm/gmap.c | 634
-rw-r--r-- arch/s390/mm/hugetlbpage.c | 173
-rw-r--r-- arch/s390/mm/init.c | 136
-rw-r--r-- arch/s390/mm/kasan_init.c | 382
-rw-r--r-- arch/s390/mm/maccess.c | 227
-rw-r--r-- arch/s390/mm/mmap.c | 69
-rw-r--r-- arch/s390/mm/page-states.c | 255
-rw-r--r-- arch/s390/mm/pageattr.c | 188
-rw-r--r-- arch/s390/mm/pfault.c | 248
-rw-r--r-- arch/s390/mm/pgalloc.c | 406
-rw-r--r-- arch/s390/mm/pgtable.c | 251
-rw-r--r-- arch/s390/mm/vmem.c | 844
-rw-r--r-- arch/s390/net/bpf_jit_comp.c | 1459
-rw-r--r-- arch/s390/numa/Makefile | 4
-rw-r--r-- arch/s390/numa/mode_emu.c | 577
-rw-r--r-- arch/s390/numa/numa.c | 171
-rw-r--r-- arch/s390/numa/numa_mode.h | 25
-rw-r--r-- arch/s390/numa/toptree.c | 351
-rw-r--r-- arch/s390/numa/toptree.h | 61
-rw-r--r-- arch/s390/oprofile/Makefile | 10
-rw-r--r-- arch/s390/oprofile/init.c | 37
-rw-r--r-- arch/s390/pci/Makefile | 6
-rw-r--r-- arch/s390/pci/pci.c | 689
-rw-r--r-- arch/s390/pci/pci_bus.c | 374
-rw-r--r-- arch/s390/pci/pci_bus.h | 42
-rw-r--r-- arch/s390/pci/pci_clp.c | 304
-rw-r--r-- arch/s390/pci/pci_debug.c | 14
-rw-r--r-- arch/s390/pci/pci_dma.c | 684
-rw-r--r-- arch/s390/pci/pci_event.c | 350
-rw-r--r-- arch/s390/pci/pci_insn.c | 177
-rw-r--r-- arch/s390/pci/pci_iov.c | 99
-rw-r--r-- arch/s390/pci/pci_iov.h | 30
-rw-r--r-- arch/s390/pci/pci_irq.c | 150
-rw-r--r-- arch/s390/pci/pci_kvm_hook.c | 11
-rw-r--r-- arch/s390/pci/pci_mmio.c | 292
-rw-r--r-- arch/s390/pci/pci_sysfs.c | 127
-rw-r--r-- arch/s390/purgatory/.gitignore | 2
-rw-r--r-- arch/s390/purgatory/Makefile | 31
-rw-r--r-- arch/s390/purgatory/head.S | 103
-rw-r--r-- arch/s390/purgatory/kexec-purgatory.S | 14
-rw-r--r-- arch/s390/purgatory/purgatory.c | 2
-rw-r--r-- arch/s390/purgatory/string.c | 3
-rw-r--r-- arch/s390/scripts/Makefile.chkbss | 20
-rw-r--r-- arch/s390/tools/.gitignore | 1
-rw-r--r-- arch/s390/tools/Makefile | 4
-rwxr-xr-x arch/s390/tools/gcc-thunk-extern.sh | 24
-rw-r--r-- arch/s390/tools/gen_facilities.c | 14
-rw-r--r-- arch/s390/tools/opcodes.txt | 21
448 files changed, 36368 insertions, 22315 deletions
diff --git a/arch/s390/Kbuild b/arch/s390/Kbuild
index e63940bb57cd..a5d3503b353c 100644
--- a/arch/s390/Kbuild
+++ b/arch/s390/Kbuild
@@ -3,9 +3,11 @@ obj-y += kernel/
obj-y += mm/
obj-$(CONFIG_KVM) += kvm/
obj-y += crypto/
-obj-$(CONFIG_S390_HYPFS_FS) += hypfs/
+obj-$(CONFIG_S390_HYPFS) += hypfs/
obj-$(CONFIG_APPLDATA_BASE) += appldata/
obj-y += net/
obj-$(CONFIG_PCI) += pci/
-obj-$(CONFIG_NUMA) += numa/
-obj-$(CONFIG_ARCH_HAS_KEXEC_PURGATORY) += purgatory/
+obj-$(CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY) += purgatory/
+
+# for cleaning
+subdir- += boot tools
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index d4051e88e625..fe565f3a3a91 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -2,9 +2,6 @@
config MMU
def_bool y
-config ZONE_DMA
- def_bool y
-
config CPU_BIG_ENDIAN
def_bool y
@@ -30,14 +27,11 @@ config GENERIC_BUG_RELATIVE_POINTERS
def_bool y
config GENERIC_LOCKBREAK
- def_bool y if PREEMPT
+ def_bool y if PREEMPTION
config PGSTE
def_bool y if KVM
-config ARCH_SUPPORTS_DEBUG_PAGEALLOC
- def_bool y
-
config AUDIT_ARCH
def_bool y
@@ -53,25 +47,43 @@ config ARCH_SUPPORTS_UPROBES
config KASAN_SHADOW_OFFSET
hex
depends on KASAN
- default 0x18000000000000 if KASAN_S390_4_LEVEL_PAGING
- default 0x30000000000
+ default 0x1C000000000000
config S390
def_bool y
+ #
+ # Note: keep this list sorted alphabetically
+ #
+ imply IMA_SECURE_AND_OR_TRUSTED_BOOT
+ select ALTERNATE_USER_ADDRESS_SPACE
+ select ARCH_32BIT_USTAT_F_TINODE
select ARCH_BINFMT_ELF_STATE
+ select ARCH_CORRECT_STACKTRACE_ON_KRETPROBE
+ select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM
+ select ARCH_ENABLE_MEMORY_HOTREMOVE
+ select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2
+ select ARCH_HAS_CURRENT_STACK_POINTER
+ select ARCH_HAS_DEBUG_VM_PGTABLE
+ select ARCH_HAS_DEBUG_WX
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE
+ select ARCH_HAS_FORCE_DMA_UNENCRYPTED
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_GIGANTIC_PAGE
select ARCH_HAS_KCOV
+ select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_MEM_ENCRYPT
+ select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
select ARCH_HAS_PTE_SPECIAL
+ select ARCH_HAS_SCALED_CPUTIME
+ select ARCH_HAS_SET_DIRECT_MAP
select ARCH_HAS_SET_MEMORY
select ARCH_HAS_STRICT_KERNEL_RWX
select ARCH_HAS_STRICT_MODULE_RWX
select ARCH_HAS_SYSCALL_WRAPPER
select ARCH_HAS_UBSAN_SANITIZE_ALL
+ select ARCH_HAS_VDSO_DATA
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_INLINE_READ_LOCK
select ARCH_INLINE_READ_LOCK_BH
@@ -101,52 +113,75 @@ config S390
select ARCH_INLINE_WRITE_UNLOCK_BH
select ARCH_INLINE_WRITE_UNLOCK_IRQ
select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
- select ARCH_KEEP_MEMBLOCK
- select ARCH_SAVE_PAGE_KEYS if HIBERNATION
select ARCH_STACKWALK
select ARCH_SUPPORTS_ATOMIC_RMW
+ select ARCH_SUPPORTS_DEBUG_PAGEALLOC
+ select ARCH_SUPPORTS_HUGETLBFS
+ select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 && CC_IS_CLANG
select ARCH_SUPPORTS_NUMA_BALANCING
+ select ARCH_SUPPORTS_PER_VMA_LOCK
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_CMPXCHG_LOCKREF
- select ARCH_WANTS_DYNAMIC_TASK_STRUCT
+ select ARCH_USE_SYM_ANNOTATIONS
+ select ARCH_WANTS_NO_INSTR
+ select ARCH_WANT_DEFAULT_BPF_JIT
select ARCH_WANT_IPC_PARSE_VERSION
- select BUILDTIME_EXTABLE_SORT
+ select ARCH_WANT_KERNEL_PMD_MKWRITE
+ select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
+ select BUILDTIME_TABLE_SORT
select CLONE_BACKWARDS2
+ select DCACHE_WORD_ACCESS if !KMSAN
+ select DMA_OPS if PCI
select DYNAMIC_FTRACE if FUNCTION_TRACER
- select GENERIC_CLOCKEVENTS
+ select FUNCTION_ALIGNMENT_8B if CC_IS_GCC
+ select FUNCTION_ALIGNMENT_16B if !CC_IS_GCC
+ select GENERIC_ALLOCATOR
select GENERIC_CPU_AUTOPROBE
select GENERIC_CPU_VULNERABILITIES
- select GENERIC_FIND_FIRST_BIT
+ select GENERIC_ENTRY
+ select GENERIC_GETTIMEOFDAY
+ select GENERIC_PTDUMP
select GENERIC_SMP_IDLE_THREAD
select GENERIC_TIME_VSYSCALL
- select HAVE_ALIGNED_STRUCT_PAGE if SLUB
+ select GENERIC_VDSO_TIME_NS
+ select GENERIC_IOREMAP if PCI
+ select HAVE_ALIGNED_STRUCT_PAGE
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_JUMP_LABEL
select HAVE_ARCH_JUMP_LABEL_RELATIVE
select HAVE_ARCH_KASAN
- select CPU_NO_EFFICIENT_FFS if !HAVE_MARCH_Z9_109_FEATURES
+ select HAVE_ARCH_KASAN_VMALLOC
+ select HAVE_ARCH_KCSAN
+ select HAVE_ARCH_KFENCE
+ select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_SOFT_DIRTY
+ select HAVE_ARCH_STACKLEAK
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select HAVE_ARCH_VMAP_STACK
select HAVE_ASM_MODVERSIONS
- select HAVE_EBPF_JIT if PACK_STACK && HAVE_MARCH_Z196_FEATURES
select HAVE_CMPXCHG_DOUBLE
select HAVE_CMPXCHG_LOCAL
- select HAVE_COPY_THREAD_TLS
select HAVE_DEBUG_KMEMLEAK
select HAVE_DMA_CONTIGUOUS
select HAVE_DYNAMIC_FTRACE
+ select HAVE_DYNAMIC_FTRACE_WITH_ARGS
+ select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
select HAVE_DYNAMIC_FTRACE_WITH_REGS
- select HAVE_FAST_GUP
+ select HAVE_EBPF_JIT if HAVE_MARCH_Z196_FEATURES
select HAVE_EFFICIENT_UNALIGNED_ACCESS
+ select HAVE_FAST_GUP
select HAVE_FENTRY
select HAVE_FTRACE_MCOUNT_RECORD
+ select HAVE_FUNCTION_ARG_ACCESS_API
+ select HAVE_FUNCTION_ERROR_INJECTION
+ select HAVE_FUNCTION_GRAPH_RETVAL
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER
- select HAVE_FUTEX_CMPXCHG if FUTEX
select HAVE_GCC_PLUGINS
+ select HAVE_GENERIC_VDSO
+ select HAVE_IOREMAP_PROT if PCI
select HAVE_KERNEL_BZIP2
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_LZ4
@@ -154,46 +189,55 @@ config S390
select HAVE_KERNEL_LZO
select HAVE_KERNEL_UNCOMPRESSED
select HAVE_KERNEL_XZ
+ select HAVE_KERNEL_ZSTD
select HAVE_KPROBES
+ select HAVE_KPROBES_ON_FTRACE
select HAVE_KRETPROBES
select HAVE_KVM
select HAVE_LIVEPATCH
- select HAVE_PERF_REGS
- select HAVE_PERF_USER_STACK_DUMP
- select HAVE_MEMBLOCK_NODE_MAP
select HAVE_MEMBLOCK_PHYS_MAP
- select HAVE_MMU_GATHER_NO_GATHER
select HAVE_MOD_ARCH_SPECIFIC
+ select HAVE_NMI
select HAVE_NOP_MCOUNT
- select HAVE_OPROFILE
select HAVE_PCI
select HAVE_PERF_EVENTS
- select HAVE_RCU_TABLE_FREE
+ select HAVE_PERF_REGS
+ select HAVE_PERF_USER_STACK_DUMP
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE
+ select HAVE_RETHOOK
select HAVE_RSEQ
+ select HAVE_SAMPLE_FTRACE_DIRECT
+ select HAVE_SAMPLE_FTRACE_DIRECT_MULTI
+ select HAVE_SETUP_PER_CPU_AREA
+ select HAVE_SOFTIRQ_ON_OWN_STACK
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_VIRT_CPU_ACCOUNTING
+ select HAVE_VIRT_CPU_ACCOUNTING_IDLE
select IOMMU_HELPER if PCI
select IOMMU_SUPPORT if PCI
+ select MMU_GATHER_MERGE_VMAS
+ select MMU_GATHER_NO_GATHER
+ select MMU_GATHER_RCU_TABLE_FREE
select MODULES_USE_ELF_RELA
select NEED_DMA_MAP_STATE if PCI
+ select NEED_PER_CPU_EMBED_FIRST_CHUNK
select NEED_SG_DMA_LENGTH if PCI
select OLD_SIGACTION
select OLD_SIGSUSPEND3
select PCI_DOMAINS if PCI
select PCI_MSI if PCI
+ select PCI_MSI_ARCH_FALLBACKS if PCI_MSI
select SPARSE_IRQ
+ select SWIOTLB
select SYSCTL_EXCEPTION_TRACE
select THREAD_INFO_IN_TASK
+ select TRACE_IRQFLAGS_SUPPORT
select TTY
+ select USER_STACKTRACE_SUPPORT
select VIRT_CPU_ACCOUNTING
- select ARCH_HAS_SCALED_CPUTIME
- select HAVE_NMI
- select ARCH_HAS_FORCE_DMA_UNENCRYPTED
- select SWIOTLB
- select GENERIC_ALLOCATOR
-
+ select ZONE_DMA
+ # Note: keep the above list sorted alphabetically
config SCHED_OMIT_FRAME_POINTER
def_bool y
@@ -204,22 +248,29 @@ config PGTABLE_LEVELS
source "kernel/livepatch/Kconfig"
-menu "Processor type and features"
+config ARCH_SUPPORTS_KEXEC
+ def_bool y
-config HAVE_MARCH_Z900_FEATURES
- def_bool n
+config ARCH_SUPPORTS_KEXEC_FILE
+ def_bool y
-config HAVE_MARCH_Z990_FEATURES
- def_bool n
- select HAVE_MARCH_Z900_FEATURES
+config ARCH_SUPPORTS_KEXEC_SIG
+ def_bool MODULE_SIG_FORMAT
-config HAVE_MARCH_Z9_109_FEATURES
- def_bool n
- select HAVE_MARCH_Z990_FEATURES
+config ARCH_SUPPORTS_KEXEC_PURGATORY
+ def_bool y
+
+config ARCH_SUPPORTS_CRASH_DUMP
+ def_bool y
+ help
+ Refer to <file:Documentation/arch/s390/zfcpdump.rst> for more details on this.
+ This option also enables s390 zfcpdump.
+ See also <file:Documentation/arch/s390/zfcpdump.rst>
+
+menu "Processor type and features"
config HAVE_MARCH_Z10_FEATURES
def_bool n
- select HAVE_MARCH_Z9_109_FEATURES
config HAVE_MARCH_Z196_FEATURES
def_bool n
@@ -241,45 +292,21 @@ config HAVE_MARCH_Z15_FEATURES
def_bool n
select HAVE_MARCH_Z14_FEATURES
+config HAVE_MARCH_Z16_FEATURES
+ def_bool n
+ select HAVE_MARCH_Z15_FEATURES
+
choice
prompt "Processor type"
default MARCH_Z196
-config MARCH_Z900
- bool "IBM zSeries model z800 and z900"
- select HAVE_MARCH_Z900_FEATURES
- depends on $(cc-option,-march=z900)
- help
- Select this to enable optimizations for model z800/z900 (2064 and
- 2066 series). This will enable some optimizations that are not
- available on older ESA/390 (31 Bit) only CPUs.
-
-config MARCH_Z990
- bool "IBM zSeries model z890 and z990"
- select HAVE_MARCH_Z990_FEATURES
- depends on $(cc-option,-march=z990)
- help
- Select this to enable optimizations for model z890/z990 (2084 and
- 2086 series). The kernel will be slightly faster but will not work
- on older machines.
-
-config MARCH_Z9_109
- bool "IBM System z9"
- select HAVE_MARCH_Z9_109_FEATURES
- depends on $(cc-option,-march=z9-109)
- help
- Select this to enable optimizations for IBM System z9 (2094 and
- 2096 series). The kernel will be slightly faster but will not work
- on older machines.
-
config MARCH_Z10
bool "IBM System z10"
select HAVE_MARCH_Z10_FEATURES
depends on $(cc-option,-march=z10)
help
- Select this to enable optimizations for IBM System z10 (2097 and
- 2098 series). The kernel will be slightly faster but will not work
- on older machines.
+ Select this to enable optimizations for IBM System z10 (2097 and 2098
+ series). This is the oldest machine generation currently supported.
config MARCH_Z196
bool "IBM zEnterprise 114 and 196"
@@ -326,16 +353,15 @@ config MARCH_Z15
and 8561 series). The kernel will be slightly faster but will not
work on older machines.
-endchoice
-
-config MARCH_Z900_TUNE
- def_bool TUNE_Z900 || MARCH_Z900 && TUNE_DEFAULT
-
-config MARCH_Z990_TUNE
- def_bool TUNE_Z990 || MARCH_Z990 && TUNE_DEFAULT
+config MARCH_Z16
+ bool "IBM z16"
+ select HAVE_MARCH_Z16_FEATURES
+ depends on $(cc-option,-march=z16)
+ help
+ Select this to enable optimizations for IBM z16 (3931 and
+ 3932 series).
-config MARCH_Z9_109_TUNE
- def_bool TUNE_Z9_109 || MARCH_Z9_109 && TUNE_DEFAULT
+endchoice
config MARCH_Z10_TUNE
def_bool TUNE_Z10 || MARCH_Z10 && TUNE_DEFAULT
@@ -355,6 +381,9 @@ config MARCH_Z14_TUNE
config MARCH_Z15_TUNE
def_bool TUNE_Z15 || MARCH_Z15 && TUNE_DEFAULT
+config MARCH_Z16_TUNE
+ def_bool TUNE_Z16 || MARCH_Z16 && TUNE_DEFAULT
+
choice
prompt "Tune code generation"
default TUNE_DEFAULT
@@ -372,21 +401,8 @@ config TUNE_DEFAULT
Tune the generated code for the target processor for which the kernel
will be compiled.
-config TUNE_Z900
- bool "IBM zSeries model z800 and z900"
- depends on $(cc-option,-mtune=z900)
-
-config TUNE_Z990
- bool "IBM zSeries model z890 and z990"
- depends on $(cc-option,-mtune=z990)
-
-config TUNE_Z9_109
- bool "IBM System z9"
- depends on $(cc-option,-mtune=z9-109)
-
config TUNE_Z10
bool "IBM System z10"
- depends on $(cc-option,-mtune=z10)
config TUNE_Z196
bool "IBM zEnterprise 114 and 196"
@@ -408,27 +424,38 @@ config TUNE_Z15
bool "IBM z15"
depends on $(cc-option,-mtune=z15)
+config TUNE_Z16
+ bool "IBM z16"
+ depends on $(cc-option,-mtune=z16)
+
endchoice
config 64BIT
def_bool y
+config COMMAND_LINE_SIZE
+ int "Maximum size of kernel command line"
+ default 4096
+ range 896 1048576
+ help
+ This allows you to specify the maximum length of the kernel command
+ line.
+
config COMPAT
- def_bool y
+ def_bool n
prompt "Kernel support for 31 bit emulation"
- select COMPAT_BINFMT_ELF if BINFMT_ELF
select ARCH_WANT_OLD_COMPAT_IPC
select COMPAT_OLD_SIGACTION
select HAVE_UID16
depends on MULTIUSER
+ depends on !CC_IS_CLANG
help
Select this option if you want to enable your system kernel to
handle system-calls from ELF binaries for 31 bit ESA. This option
(and some other stuff like libraries and such) is needed for
- executing 31 bit applications. It is safe to say "Y".
+ executing 31 bit applications.
-config SYSVIPC_COMPAT
- def_bool y if COMPAT && SYSVIPC
+ If unsure say N.
config SMP
def_bool y
@@ -448,14 +475,6 @@ config NR_CPUS
config HOTPLUG_CPU
def_bool y
-# Some NUMA nodes have memory ranges that span
-# other nodes. Even though a pfn is valid and
-# between a node's start and end pfns, it may not
-# reside on that node. See memmap_init_zone()
-# for details. <- They meant memory holes!
-config NODES_SPAN_OTHER_NODES
- def_bool NUMA
-
config NUMA
bool "NUMA support"
depends on SCHED_TOPOLOGY
@@ -465,58 +484,10 @@ config NUMA
This option adds NUMA support to the kernel.
- An operation mode can be selected by appending
- numa=<method> to the kernel command line.
-
- The default behaviour is identical to appending numa=plain to
- the command line. This will create just one node with all
- available memory and all CPUs in it.
-
config NODES_SHIFT
- int "Maximum NUMA nodes (as a power of 2)"
- range 1 10
- depends on NUMA
- default "4"
- help
- Specify the maximum number of NUMA nodes available on the target
- system. Increases memory reserved to accommodate various tables.
-
-menu "Select NUMA modes"
+ int
depends on NUMA
-
-config NUMA_EMU
- bool "NUMA emulation"
- default y
- help
- Numa emulation mode will split the available system memory into
- equal chunks which then are distributed over the configured number
- of nodes in a round-robin manner.
-
- The number of fake nodes is limited by the number of available memory
- chunks (i.e. memory size / fake size) and the number of supported
- nodes in the kernel.
-
- The CPUs are assigned to the nodes in a way that partially respects
- the original machine topology (if supported by the machine).
- Fair distribution of the CPUs is not guaranteed.
-
-config EMU_SIZE
- hex "NUMA emulation memory chunk size"
- default 0x10000000
- range 0x400000 0x100000000
- depends on NUMA_EMU
- help
- Select the default size by which the memory is chopped and then
- assigned to emulated NUMA nodes.
-
- This can be overridden by specifying
-
- emu_size=<n>
-
- on the kernel command line where also suffixes K, M, G, and T are
- supported.
-
-endmenu
+ default "1"
config SCHED_SMT
def_bool n
@@ -524,19 +495,11 @@ config SCHED_SMT
config SCHED_MC
def_bool n
-config SCHED_BOOK
- def_bool n
-
-config SCHED_DRAWER
- def_bool n
-
config SCHED_TOPOLOGY
def_bool y
prompt "Topology scheduler support"
select SCHED_SMT
select SCHED_MC
- select SCHED_BOOK
- select SCHED_DRAWER
help
Topology scheduler support improves the CPU scheduler's decision
making when dealing with machines that have multi-threading,
@@ -544,51 +507,16 @@ config SCHED_TOPOLOGY
source "kernel/Kconfig.hz"
-config KEXEC
- def_bool y
- select KEXEC_CORE
-
-config KEXEC_FILE
- bool "kexec file based system call"
- select KEXEC_CORE
- select BUILD_BIN2C
- depends on CRYPTO
- depends on CRYPTO_SHA256
- depends on CRYPTO_SHA256_S390
- help
- Enable the kexec file based system call. In contrast to the normal
- kexec system call this system call takes file descriptors for the
- kernel and initramfs as arguments.
-
-config ARCH_HAS_KEXEC_PURGATORY
- def_bool y
- depends on KEXEC_FILE
-
-config KEXEC_SIG
- bool "Verify kernel signature during kexec_file_load() syscall"
- depends on KEXEC_FILE && MODULE_SIG_FORMAT
+config CERT_STORE
+ bool "Get user certificates via DIAG320"
+ depends on KEYS
+ select CRYPTO_LIB_SHA256
help
- This option makes kernel signature verification mandatory for
- the kexec_file_load() syscall.
-
- In addition to that option, you need to enable signature
- verification for the corresponding kernel image type being
- loaded in order for this to work.
+ Enable this option if you want to access user-provided secure boot
+ certificates via DIAG 0x320.
-config ARCH_RANDOM
- def_bool y
- prompt "s390 architectural random number generation API"
- help
- Enable the s390 architectural random number generation API
- to provide random data for all consumers within the Linux
- kernel.
-
- When enabled the arch_random_* functions declared in linux/random.h
- are implemented. The implementation is based on the s390 CPACF
- instruction subfunction TRNG which provides a real true random
- number generator.
-
- If unsure, say Y.
+ These certificates will be made available via the keyring named
+ 'cert_store'.
config KERNEL_NOBP
def_bool n
@@ -609,6 +537,7 @@ config KERNEL_NOBP
config EXPOLINE
def_bool n
+ depends on $(cc-option,-mindirect-branch=thunk)
prompt "Avoid speculative indirect branches in the kernel"
help
Compile the kernel with the expoline compiler options to guard
@@ -619,6 +548,19 @@ config EXPOLINE
If unsure, say N.
+config EXPOLINE_EXTERN
+ def_bool n
+ depends on EXPOLINE
+ depends on CC_IS_GCC && GCC_VERSION >= 110200
+ depends on $(success,$(srctree)/arch/s390/tools/gcc-thunk-extern.sh $(CC))
+ prompt "Generate expolines as extern functions."
+ help
+ This option is required for some tooling like kpatch. The kernel is
+ compiled with -mindirect-branch=thunk-extern and requires a newer
+ compiler.
+
+ If unsure, say N.
+
choice
prompt "Expoline default"
depends on EXPOLINE
@@ -636,9 +578,7 @@ config EXPOLINE_FULL
endchoice
config RELOCATABLE
- bool "Build a relocatable kernel"
- select MODULE_REL_CRCS if MODVERSIONS
- default y
+ def_bool y
help
This builds a kernel image that retains relocation information
so it can be loaded at an arbitrary address.
@@ -647,10 +587,11 @@ config RELOCATABLE
bootup process.
The relocations make the kernel image about 15% larger (compressed
10%), but are discarded at runtime.
+ Note: this option exists only for documentation purposes, please do
+ not remove it.
config RANDOMIZE_BASE
bool "Randomize the address of the kernel image (KASLR)"
- depends on RELOCATABLE
default y
help
In support of Kernel Address Space Layout Randomization (KASLR),
@@ -670,19 +611,6 @@ config ARCH_SPARSEMEM_ENABLE
config ARCH_SPARSEMEM_DEFAULT
def_bool y
-config ARCH_ENABLE_MEMORY_HOTPLUG
- def_bool y if SPARSEMEM
-
-config ARCH_ENABLE_MEMORY_HOTREMOVE
- def_bool y
-
-config ARCH_ENABLE_SPLIT_PMD_PTLOCK
- def_bool y
-
-config FORCE_MAX_ZONEORDER
- int
- default "9"
-
config MAX_PHYSMEM_BITS
int "Maximum size of supported physical memory in bits (42-53)"
range 42 53
@@ -693,20 +621,6 @@ config MAX_PHYSMEM_BITS
Increasing the number of bits also increases the kernel image size.
By default 46 bits (64TB) are supported.
-config PACK_STACK
- def_bool y
- prompt "Pack kernel stack"
- help
- This option enables the compiler option -mkernel-backchain if it
- is available. If the option is available the compiler supports
- the new stack layout which dramatically reduces the minimum stack
- frame size. With an old compiler a non-leaf function needs a
- minimum of 96 bytes on 31 bit and 160 bytes on 64 bit. With
- -mkernel-backchain the minimum size drops to 16 byte on 31 bit
- and 24 byte on 64 bit.
-
- Say Y if you are unsure.
-
config CHECK_STACK
def_bool y
depends on !VMAP_STACK
@@ -733,16 +647,6 @@ config STACK_GUARD
The minimum size for the stack guard should be 256 for 31 bit and
512 for 64 bit.
-config WARN_DYNAMIC_STACK
- def_bool n
- prompt "Emit compiler warnings for function with dynamic stack usage"
- help
- This option enables the compiler option -mwarn-dynamicstack. If the
- compiler supports this options generates warnings for functions
- that dynamically allocate stack space using alloca.
-
- Say N if you are unsure.
-
endmenu
menu "I/O subsystem"
@@ -750,7 +654,7 @@ menu "I/O subsystem"
config QDIO
def_tristate y
prompt "QDIO support"
- ---help---
+ help
This driver provides the Queued Direct I/O base support for
IBM System z.
@@ -764,7 +668,7 @@ if PCI
config PCI_NR_FUNCTIONS
int "Maximum number of PCI functions (1-4096)"
range 1 4096
- default "128"
+ default "512"
help
This allows you to specify the maximum number of PCI functions which
this kernel will support.
@@ -811,7 +715,8 @@ config EADM_SCH
config VFIO_CCW
def_tristate n
prompt "Support for VFIO-CCW subchannels"
- depends on S390_CCW_IOMMU && VFIO_MDEV
+ depends on VFIO
+ select VFIO_MDEV
help
This driver allows usage of I/O subchannels via VFIO-CCW.
@@ -821,55 +726,16 @@ config VFIO_CCW
config VFIO_AP
def_tristate n
prompt "VFIO support for AP devices"
- depends on S390_AP_IOMMU && VFIO_MDEV_DEVICE && KVM
- help
- This driver grants access to Adjunct Processor (AP) devices
- via the VFIO mediated device interface.
-
- To compile this driver as a module, choose M here: the module
- will be called vfio_ap.
-
-endmenu
-
-menu "Dump support"
-
-config CRASH_DUMP
- bool "kernel crash dumps"
- select KEXEC
- help
- Generate crash dump after being started by kexec.
- Crash dump kernels are loaded in the main kernel with kexec-tools
- into a specially reserved region and then later executed after
- a crash by kdump/kexec.
- Refer to <file:Documentation/s390/zfcpdump.rst> for more details on this.
- This option also enables s390 zfcpdump.
- See also <file:Documentation/s390/zfcpdump.rst>
-
-endmenu
-
-config SECCOMP
- def_bool y
- prompt "Enable seccomp to safely compute untrusted bytecode"
- depends on PROC_FS
+ depends on KVM
+ depends on VFIO
+ depends on ZCRYPT
+ select VFIO_MDEV
help
- This kernel feature is useful for number crunching applications
- that may need to compute untrusted bytecode during their
- execution. By using pipes or other transports made available to
- the process as file descriptors supporting the read/write
- syscalls, it's possible to isolate those applications in
- their own address space using seccomp. Once seccomp is
- enabled via /proc/<pid>/seccomp, it cannot be disabled
- and the task is only allowed to execute a few safe syscalls
- defined by each seccomp mode.
-
- If unsure, say Y.
-
-menu "Power Management"
-
-config ARCH_HIBERNATION_POSSIBLE
- def_bool y
+ This driver grants access to Adjunct Processor (AP) devices
+ via the VFIO mediated device interface.
-source "kernel/power/Kconfig"
+ To compile this driver as a module, choose M here: the module
+ will be called vfio_ap.
endmenu
@@ -930,7 +796,7 @@ config CMM_IUCV
config APPLDATA_BASE
def_bool n
prompt "Linux - VM Monitor Stream, base infrastructure"
- depends on PROC_FS
+ depends on PROC_SYSCTL
help
This provides a kernel interface for creating and updating z/VM APPLDATA
monitor records. The monitor records are updated at certain time
@@ -991,13 +857,24 @@ config APPLDATA_NET_SUM
This can also be compiled as a module, which will be called
appldata_net_sum.o.
-config S390_HYPFS_FS
+config S390_HYPFS
def_bool y
+ prompt "s390 hypervisor information"
+ help
+ This provides several binary files at (debugfs)/s390_hypfs/ to
+ provide accounting information in an s390 hypervisor environment.
+
+config S390_HYPFS_FS
+ def_bool n
prompt "s390 hypervisor file system support"
select SYS_HYPERVISOR
+ depends on S390_HYPFS
help
This is a virtual file system intended to provide accounting
- information in an s390 hypervisor environment.
+ information in an s390 hypervisor environment. This file system
+ is deprecated and should not be used.
+
+ Say N if you are unsure.
source "arch/s390/kvm/Kconfig"
@@ -1007,7 +884,6 @@ config S390_GUEST
select TTY
select VIRTUALIZATION
select VIRTIO
- select VIRTIO_CONSOLE
help
Enabling this option adds support for virtio based paravirtual device
drivers on s390.
@@ -1017,10 +893,15 @@ config S390_GUEST
endmenu
+config S390_MODULES_SANITY_TEST_HELPERS
+ def_bool n
+
menu "Selftests"
config S390_UNWIND_SELFTEST
def_tristate n
+ depends on KUNIT
+ default KUNIT_ALL_TESTS
prompt "Test unwind functions"
help
This option enables s390 specific stack unwinder testing kernel
@@ -1029,4 +910,28 @@ config S390_UNWIND_SELFTEST
Say N if you are unsure.
+config S390_KPROBES_SANITY_TEST
+ def_tristate n
+ prompt "Enable s390 specific kprobes tests"
+ depends on KPROBES
+ depends on KUNIT
+ help
+ This option enables an s390 specific kprobes test module. This option
+ is not useful for distributions or general kernels, but only for kernel
+ developers working on architecture code.
+
+ Say N if you are unsure.
+
+config S390_MODULES_SANITY_TEST
+ def_tristate n
+ depends on KUNIT
+ default KUNIT_ALL_TESTS
+ prompt "Enable s390 specific modules tests"
+ select S390_MODULES_SANITY_TEST_HELPERS
+ help
+ This option enables an s390 specific modules test. This option is
+ not useful for distributions or general kernels, but only for
+ kernel developers working on architecture code.
+
+ Say N if you are unsure.
endmenu
diff --git a/arch/s390/Kconfig.debug b/arch/s390/Kconfig.debug
index 190527560b2c..c4300ea4abf8 100644
--- a/arch/s390/Kconfig.debug
+++ b/arch/s390/Kconfig.debug
@@ -1,19 +1,22 @@
# SPDX-License-Identifier: GPL-2.0
-config TRACE_IRQFLAGS_SUPPORT
+config EARLY_PRINTK
def_bool y
-config S390_PTDUMP
- bool "Export kernel pagetable layout to userspace via debugfs"
+config DEBUG_ENTRY
+ bool "Debug low-level entry code"
depends on DEBUG_KERNEL
- select DEBUG_FS
- ---help---
- Say Y here if you want to show the kernel pagetable layout in a
- debugfs file. This information is only useful for kernel developers
- who are working in architecture specific areas of the kernel.
- It is probably not a good idea to enable this feature in a production
- kernel.
- If in doubt, say "N"
+ help
+ This option enables sanity checks in s390 low-level entry code.
+ Some of these sanity checks may slow down kernel entries and
+ exits or otherwise impact performance.
-config EARLY_PRINTK
- def_bool y
+ If unsure, say N.
+
+config CIO_INJECT
+ bool "CIO Inject interfaces"
+ depends on DEBUG_KERNEL && DEBUG_FS
+ help
+ This option provides a debugging facility to inject certain artificial events
+ and instruction responses to the CIO layer of Linux kernel. The newly created
+ debugfs user-interfaces will be at /sys/kernel/debug/s390/cio/*
diff --git a/arch/s390/Makefile b/arch/s390/Makefile
index ba8556bb0fb1..73873e451686 100644
--- a/arch/s390/Makefile
+++ b/arch/s390/Makefile
@@ -3,9 +3,7 @@
# s390/Makefile
#
# This file is included by the global makefile so that you can add your own
-# architecture-specific flags and dependencies. Remember to do have actions
-# for "archclean" and "archdep" for cleaning up and making dependencies for
-# this architecture
+# architecture-specific flags and dependencies.
#
# Copyright (C) 1994 by Linus Torvalds
#
@@ -16,51 +14,51 @@ KBUILD_AFLAGS_MODULE += -fPIC
KBUILD_CFLAGS_MODULE += -fPIC
KBUILD_AFLAGS += -m64
KBUILD_CFLAGS += -m64
-ifeq ($(CONFIG_RELOCATABLE),y)
KBUILD_CFLAGS += -fPIE
LDFLAGS_vmlinux := -pie
-endif
aflags_dwarf := -Wa,-gdwarf-2
KBUILD_AFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -D__ASSEMBLY__
+ifndef CONFIG_AS_IS_LLVM
KBUILD_AFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),$(aflags_dwarf))
-KBUILD_CFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -O2
+endif
+KBUILD_CFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -O2 -mpacked-stack
KBUILD_CFLAGS_DECOMPRESSOR += -DDISABLE_BRANCH_PROFILING -D__NO_FORTIFY
-KBUILD_CFLAGS_DECOMPRESSOR += -fno-delete-null-pointer-checks -msoft-float
+KBUILD_CFLAGS_DECOMPRESSOR += -fno-delete-null-pointer-checks -msoft-float -mbackchain
KBUILD_CFLAGS_DECOMPRESSOR += -fno-asynchronous-unwind-tables
-KBUILD_CFLAGS_DECOMPRESSOR += $(call cc-option,-ffreestanding)
+KBUILD_CFLAGS_DECOMPRESSOR += -ffreestanding
+KBUILD_CFLAGS_DECOMPRESSOR += -fno-stack-protector
+KBUILD_CFLAGS_DECOMPRESSOR += -fPIE
KBUILD_CFLAGS_DECOMPRESSOR += $(call cc-disable-warning, address-of-packed-member)
KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),-g)
KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO_DWARF4), $(call cc-option, -gdwarf-4,))
+KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_CC_NO_ARRAY_BOUNDS),-Wno-array-bounds)
+
UTS_MACHINE := s390x
STACK_SIZE := $(if $(CONFIG_KASAN),65536,16384)
CHECKFLAGS += -D__s390__ -D__s390x__
export LD_BFD
-mflags-$(CONFIG_MARCH_Z900) := -march=z900
-mflags-$(CONFIG_MARCH_Z990) := -march=z990
-mflags-$(CONFIG_MARCH_Z9_109) := -march=z9-109
mflags-$(CONFIG_MARCH_Z10) := -march=z10
mflags-$(CONFIG_MARCH_Z196) := -march=z196
mflags-$(CONFIG_MARCH_ZEC12) := -march=zEC12
mflags-$(CONFIG_MARCH_Z13) := -march=z13
mflags-$(CONFIG_MARCH_Z14) := -march=z14
mflags-$(CONFIG_MARCH_Z15) := -march=z15
+mflags-$(CONFIG_MARCH_Z16) := -march=z16
export CC_FLAGS_MARCH := $(mflags-y)
aflags-y += $(mflags-y)
cflags-y += $(mflags-y)
-cflags-$(CONFIG_MARCH_Z900_TUNE) += -mtune=z900
-cflags-$(CONFIG_MARCH_Z990_TUNE) += -mtune=z990
-cflags-$(CONFIG_MARCH_Z9_109_TUNE) += -mtune=z9-109
cflags-$(CONFIG_MARCH_Z10_TUNE) += -mtune=z10
cflags-$(CONFIG_MARCH_Z196_TUNE) += -mtune=z196
cflags-$(CONFIG_MARCH_ZEC12_TUNE) += -mtune=zEC12
cflags-$(CONFIG_MARCH_Z13_TUNE) += -mtune=z13
cflags-$(CONFIG_MARCH_Z14_TUNE) += -mtune=z14
cflags-$(CONFIG_MARCH_Z15_TUNE) += -mtune=z15
+cflags-$(CONFIG_MARCH_Z16_TUNE) += -mtune=z16
cflags-y += -Wa,-I$(srctree)/arch/$(ARCH)/include
@@ -69,44 +67,38 @@ cflags-y += -Wa,-I$(srctree)/arch/$(ARCH)/include
#
cflags-$(CONFIG_FRAME_POINTER) += -fno-optimize-sibling-calls
-ifeq ($(call cc-option-yn,-mpacked-stack),y)
-cflags-$(CONFIG_PACK_STACK) += -mpacked-stack -D__PACK_STACK
-aflags-$(CONFIG_PACK_STACK) += -D__PACK_STACK
-endif
-
KBUILD_AFLAGS_DECOMPRESSOR += $(aflags-y)
KBUILD_CFLAGS_DECOMPRESSOR += $(cflags-y)
-ifeq ($(call cc-option-yn,-mstack-size=8192 -mstack-guard=128),y)
-cflags-$(CONFIG_CHECK_STACK) += -mstack-size=$(STACK_SIZE)
-ifneq ($(call cc-option-yn,-mstack-size=8192),y)
-cflags-$(CONFIG_CHECK_STACK) += -mstack-guard=$(CONFIG_STACK_GUARD)
-endif
-endif
-
-ifdef CONFIG_WARN_DYNAMIC_STACK
- ifeq ($(call cc-option-yn,-mwarn-dynamicstack),y)
- KBUILD_CFLAGS += -mwarn-dynamicstack
- KBUILD_CFLAGS_DECOMPRESSOR += -mwarn-dynamicstack
+ifneq ($(call cc-option,-mstack-size=8192 -mstack-guard=128),)
+ CC_FLAGS_CHECK_STACK := -mstack-size=$(STACK_SIZE)
+ ifeq ($(call cc-option,-mstack-size=8192),)
+ CC_FLAGS_CHECK_STACK += -mstack-guard=$(CONFIG_STACK_GUARD)
endif
+ export CC_FLAGS_CHECK_STACK
+ cflags-$(CONFIG_CHECK_STACK) += $(CC_FLAGS_CHECK_STACK)
endif
ifdef CONFIG_EXPOLINE
- ifeq ($(call cc-option-yn,$(CC_FLAGS_MARCH) -mindirect-branch=thunk),y)
+ ifdef CONFIG_EXPOLINE_EXTERN
+ KBUILD_LDFLAGS_MODULE += arch/s390/lib/expoline/expoline.o
+ CC_FLAGS_EXPOLINE := -mindirect-branch=thunk-extern
+ CC_FLAGS_EXPOLINE += -mfunction-return=thunk-extern
+ else
CC_FLAGS_EXPOLINE := -mindirect-branch=thunk
CC_FLAGS_EXPOLINE += -mfunction-return=thunk
- CC_FLAGS_EXPOLINE += -mindirect-branch-table
- export CC_FLAGS_EXPOLINE
- cflags-y += $(CC_FLAGS_EXPOLINE) -DCC_USING_EXPOLINE
- aflags-y += -DCC_USING_EXPOLINE
endif
+ CC_FLAGS_EXPOLINE += -mindirect-branch-table
+ export CC_FLAGS_EXPOLINE
+ cflags-y += $(CC_FLAGS_EXPOLINE) -DCC_USING_EXPOLINE
+ aflags-y += -DCC_USING_EXPOLINE
endif
ifdef CONFIG_FUNCTION_TRACER
- ifeq ($(call cc-option-yn,-mfentry -mnop-mcount),n)
+ ifeq ($(call cc-option,-mfentry -mnop-mcount),)
# make use of hotpatch feature if the compiler supports it
cc_hotpatch := -mhotpatch=0,3
- ifeq ($(call cc-option-yn,$(cc_hotpatch)),y)
+ ifneq ($(call cc-option,$(cc_hotpatch)),)
CC_FLAGS_FTRACE := $(cc_hotpatch)
KBUILD_AFLAGS += -DCC_USING_HOTPATCH
KBUILD_CFLAGS += -DCC_USING_HOTPATCH
@@ -117,7 +109,7 @@ endif
# Test CFI features of binutils
cfi := $(call as-instr,.cfi_startproc\n.cfi_val_offset 15$(comma)-160\n.cfi_endproc,-DCONFIG_AS_CFI_VAL_OFFSET=1)
-KBUILD_CFLAGS += -mbackchain -msoft-float $(cflags-y)
+KBUILD_CFLAGS += -mpacked-stack -mbackchain -msoft-float $(cflags-y)
KBUILD_CFLAGS += -pipe -Wno-sign-compare
KBUILD_CFLAGS += -fno-asynchronous-unwind-tables $(cfi)
KBUILD_AFLAGS += $(aflags-y) $(cfi)
@@ -126,16 +118,7 @@ export KBUILD_CFLAGS_DECOMPRESSOR
OBJCOPYFLAGS := -O binary
-head-y := arch/s390/kernel/head64.o
-
-# See arch/s390/Kbuild for content of core part of the kernel
-core-y += arch/s390/
-
libs-y += arch/s390/lib/
-drivers-y += drivers/s390/
-
-# must be linked after kernel
-drivers-$(CONFIG_OPROFILE) += arch/s390/oprofile/
boot := arch/s390/boot
syscalls := arch/s390/kernel/syscalls
@@ -146,8 +129,8 @@ all: bzImage
#KBUILD_IMAGE is necessary for packaging targets like rpm-pkg, deb-pkg...
KBUILD_IMAGE := $(boot)/bzImage
-install: vmlinux
- $(Q)$(MAKE) $(build)=$(boot) $@
+install:
+ $(call cmd,install)
bzImage: vmlinux
$(Q)$(MAKE) $(build)=$(boot) $(boot)/$@
@@ -155,19 +138,34 @@ bzImage: vmlinux
zfcpdump:
$(Q)$(MAKE) $(build)=$(boot) $(boot)/$@
-vdso_install:
- $(Q)$(MAKE) $(build)=arch/$(ARCH)/kernel/vdso64 $@
-
-archclean:
- $(Q)$(MAKE) $(clean)=$(boot)
- $(Q)$(MAKE) $(clean)=$(tools)
-
archheaders:
$(Q)$(MAKE) $(build)=$(syscalls) uapi
archprepare:
$(Q)$(MAKE) $(build)=$(syscalls) kapi
$(Q)$(MAKE) $(build)=$(tools) kapi
+ifeq ($(KBUILD_EXTMOD),)
+# We need to generate vdso-offsets.h before compiling certain files in kernel/.
+# In order to do that, we should use the archprepare target, but we can't since
+# asm-offsets.h is included in some files used to generate vdso-offsets.h, and
+# asm-offsets.h is built in prepare0, for which archprepare is a dependency.
+# Therefore we need to generate the header after prepare0 has been made, hence
+# this hack.
+prepare: vdso_prepare
+vdso_prepare: prepare0
+ $(Q)$(MAKE) $(build)=arch/s390/kernel/vdso64 include/generated/vdso64-offsets.h
+ $(if $(CONFIG_COMPAT),$(Q)$(MAKE) \
+ $(build)=arch/s390/kernel/vdso32 include/generated/vdso32-offsets.h)
+
+vdso-install-y += arch/s390/kernel/vdso64/vdso64.so.dbg
+vdso-install-$(CONFIG_COMPAT) += arch/s390/kernel/vdso32/vdso32.so.dbg
+
+ifdef CONFIG_EXPOLINE_EXTERN
+modules_prepare: expoline_prepare
+expoline_prepare: scripts
+ $(Q)$(MAKE) $(build)=arch/s390/lib/expoline arch/s390/lib/expoline/expoline.o
+endif
+endif
# Don't use tabs in echo arguments
define archhelp
diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c
index aa738cad1338..c2978cb03b36 100644
--- a/arch/s390/appldata/appldata_base.c
+++ b/arch/s390/appldata/appldata_base.c
@@ -26,12 +26,10 @@
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/workqueue.h>
-#include <linux/suspend.h>
-#include <linux/platform_device.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
#include <asm/appldata.h>
#include <asm/vtimer.h>
-#include <linux/uaccess.h>
-#include <asm/io.h>
#include <asm/smp.h>
#include "appldata.h"
@@ -44,17 +42,14 @@
#define TOD_MICRO 0x01000 /* nr. of TOD clock units
for 1 microsecond */
-static struct platform_device *appldata_pdev;
-
/*
* /proc entries (sysctl)
*/
static const char appldata_proc_name[APPLDATA_PROC_NAME_LENGTH] = "appldata";
static int appldata_timer_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
+ void *buffer, size_t *lenp, loff_t *ppos);
static int appldata_interval_handler(struct ctl_table *ctl, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos);
+ void *buffer, size_t *lenp, loff_t *ppos);
static struct ctl_table_header *appldata_sysctl_header;
static struct ctl_table appldata_table[] = {
@@ -68,17 +63,6 @@ static struct ctl_table appldata_table[] = {
.mode = S_IRUGO | S_IWUSR,
.proc_handler = appldata_interval_handler,
},
- { },
-};
-
-static struct ctl_table appldata_dir_table[] = {
- {
- .procname = appldata_proc_name,
- .maxlen = 0,
- .mode = S_IRUGO | S_IXUGO,
- .child = appldata_table,
- },
- { },
};
/*
@@ -89,7 +73,6 @@ static struct vtimer_list appldata_timer;
static DEFINE_SPINLOCK(appldata_timer_lock);
static int appldata_interval = APPLDATA_CPU_INTERVAL;
static int appldata_timer_active;
-static int appldata_timer_suspended = 0;
/*
* Work queue
@@ -217,7 +200,7 @@ static void __appldata_vtimer_setup(int cmd)
*/
static int
appldata_timer_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int timer_active = appldata_timer_active;
int rc;
@@ -250,7 +233,7 @@ appldata_timer_handler(struct ctl_table *ctl, int write,
*/
static int
appldata_interval_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int interval = appldata_interval;
int rc;
@@ -280,7 +263,7 @@ appldata_interval_handler(struct ctl_table *ctl, int write,
*/
static int
appldata_generic_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct appldata_ops *ops = NULL, *tmp_ops;
struct list_head *lh;
@@ -297,7 +280,7 @@ appldata_generic_handler(struct ctl_table *ctl, int write,
mutex_lock(&appldata_ops_mutex);
list_for_each(lh, &appldata_ops_list) {
tmp_ops = list_entry(lh, struct appldata_ops, list);
- if (&tmp_ops->ctl_table[2] == ctl) {
+ if (&tmp_ops->ctl_table[0] == ctl) {
found = 1;
}
}
@@ -367,7 +350,7 @@ int appldata_register_ops(struct appldata_ops *ops)
if (ops->size > APPLDATA_MAX_REC_SIZE)
return -EINVAL;
- ops->ctl_table = kcalloc(4, sizeof(struct ctl_table), GFP_KERNEL);
+ ops->ctl_table = kcalloc(1, sizeof(struct ctl_table), GFP_KERNEL);
if (!ops->ctl_table)
return -ENOMEM;
@@ -375,17 +358,12 @@ int appldata_register_ops(struct appldata_ops *ops)
list_add(&ops->list, &appldata_ops_list);
mutex_unlock(&appldata_ops_mutex);
- ops->ctl_table[0].procname = appldata_proc_name;
- ops->ctl_table[0].maxlen = 0;
- ops->ctl_table[0].mode = S_IRUGO | S_IXUGO;
- ops->ctl_table[0].child = &ops->ctl_table[2];
-
- ops->ctl_table[2].procname = ops->name;
- ops->ctl_table[2].mode = S_IRUGO | S_IWUSR;
- ops->ctl_table[2].proc_handler = appldata_generic_handler;
- ops->ctl_table[2].data = ops;
+ ops->ctl_table[0].procname = ops->name;
+ ops->ctl_table[0].mode = S_IRUGO | S_IWUSR;
+ ops->ctl_table[0].proc_handler = appldata_generic_handler;
+ ops->ctl_table[0].data = ops;
- ops->sysctl_header = register_sysctl_table(ops->ctl_table);
+ ops->sysctl_header = register_sysctl_sz(appldata_proc_name, ops->ctl_table, 1);
if (!ops->sysctl_header)
goto out;
return 0;
@@ -413,88 +391,6 @@ void appldata_unregister_ops(struct appldata_ops *ops)
/********************** module-ops management <END> **************************/
-/**************************** suspend / resume *******************************/
-static int appldata_freeze(struct device *dev)
-{
- struct appldata_ops *ops;
- int rc;
- struct list_head *lh;
-
- spin_lock(&appldata_timer_lock);
- if (appldata_timer_active) {
- __appldata_vtimer_setup(APPLDATA_DEL_TIMER);
- appldata_timer_suspended = 1;
- }
- spin_unlock(&appldata_timer_lock);
-
- mutex_lock(&appldata_ops_mutex);
- list_for_each(lh, &appldata_ops_list) {
- ops = list_entry(lh, struct appldata_ops, list);
- if (ops->active == 1) {
- rc = appldata_diag(ops->record_nr, APPLDATA_STOP_REC,
- (unsigned long) ops->data, ops->size,
- ops->mod_lvl);
- if (rc != 0)
- pr_err("Stopping the data collection for %s "
- "failed with rc=%d\n", ops->name, rc);
- }
- }
- mutex_unlock(&appldata_ops_mutex);
- return 0;
-}
-
-static int appldata_restore(struct device *dev)
-{
- struct appldata_ops *ops;
- int rc;
- struct list_head *lh;
-
- spin_lock(&appldata_timer_lock);
- if (appldata_timer_suspended) {
- __appldata_vtimer_setup(APPLDATA_ADD_TIMER);
- appldata_timer_suspended = 0;
- }
- spin_unlock(&appldata_timer_lock);
-
- mutex_lock(&appldata_ops_mutex);
- list_for_each(lh, &appldata_ops_list) {
- ops = list_entry(lh, struct appldata_ops, list);
- if (ops->active == 1) {
- ops->callback(ops->data); // init record
- rc = appldata_diag(ops->record_nr,
- APPLDATA_START_INTERVAL_REC,
- (unsigned long) ops->data, ops->size,
- ops->mod_lvl);
- if (rc != 0) {
- pr_err("Starting the data collection for %s "
- "failed with rc=%d\n", ops->name, rc);
- }
- }
- }
- mutex_unlock(&appldata_ops_mutex);
- return 0;
-}
-
-static int appldata_thaw(struct device *dev)
-{
- return appldata_restore(dev);
-}
-
-static const struct dev_pm_ops appldata_pm_ops = {
- .freeze = appldata_freeze,
- .thaw = appldata_thaw,
- .restore = appldata_restore,
-};
-
-static struct platform_driver appldata_pdrv = {
- .driver = {
- .name = "appldata",
- .pm = &appldata_pm_ops,
- },
-};
-/************************* suspend / resume <END> ****************************/
-
-
/******************************* init / exit *********************************/
/*
@@ -504,36 +400,14 @@ static struct platform_driver appldata_pdrv = {
*/
static int __init appldata_init(void)
{
- int rc;
-
init_virt_timer(&appldata_timer);
appldata_timer.function = appldata_timer_function;
appldata_timer.data = (unsigned long) &appldata_work;
-
- rc = platform_driver_register(&appldata_pdrv);
- if (rc)
- return rc;
-
- appldata_pdev = platform_device_register_simple("appldata", -1, NULL,
- 0);
- if (IS_ERR(appldata_pdev)) {
- rc = PTR_ERR(appldata_pdev);
- goto out_driver;
- }
appldata_wq = alloc_ordered_workqueue("appldata", 0);
- if (!appldata_wq) {
- rc = -ENOMEM;
- goto out_device;
- }
-
- appldata_sysctl_header = register_sysctl_table(appldata_dir_table);
+ if (!appldata_wq)
+ return -ENOMEM;
+ appldata_sysctl_header = register_sysctl(appldata_proc_name, appldata_table);
return 0;
-
-out_device:
- platform_device_unregister(appldata_pdev);
-out_driver:
- platform_driver_unregister(&appldata_pdrv);
- return rc;
}
__initcall(appldata_init);
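
The appldata hunks above switch from the old two-level sysctl setup (a parent directory entry with a .child pointer, registered via register_sysctl_table()) to a flat table registered under a path with register_sysctl()/register_sysctl_sz(), with no sentinel entry. The following is a minimal illustrative sketch of that pattern, not part of the patch; the module, the "example" directory, and the tunable are hypothetical.

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical example module: creates /proc/sys/example/interval using the
 * flat-table register_sysctl() pattern shown in the hunks above.
 */
#include <linux/module.h>
#include <linux/sysctl.h>

static int example_interval = 10;		/* hypothetical tunable */

static struct ctl_table example_table[] = {
	{
		.procname	= "interval",
		.data		= &example_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	/* no sentinel entry needed with register_sysctl()/register_sysctl_sz() */
};

static struct ctl_table_header *example_header;

static int __init example_init(void)
{
	example_header = register_sysctl("example", example_table);
	return example_header ? 0 : -ENOMEM;
}

static void __exit example_exit(void)
{
	unregister_sysctl_table(example_header);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
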
diff --git a/arch/s390/appldata/appldata_mem.c b/arch/s390/appldata/appldata_mem.c
index e68136c3c23a..fc608f9b79ab 100644
--- a/arch/s390/appldata/appldata_mem.c
+++ b/arch/s390/appldata/appldata_mem.c
@@ -15,7 +15,7 @@
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/slab.h>
-#include <asm/io.h>
+#include <linux/io.h>
#include "appldata.h"
@@ -29,10 +29,6 @@
* the structure version (product ID, see appldata_base.c) needs to be changed
* as well and all documentation and z/VM applications using it must be
* updated.
- *
- * The record layout is documented in the Linux for zSeries Device Drivers
- * book:
- * http://oss.software.ibm.com/developerworks/opensource/linux390/index.shtml
*/
struct appldata_mem_data {
u64 timestamp;
diff --git a/arch/s390/appldata/appldata_net_sum.c b/arch/s390/appldata/appldata_net_sum.c
index 8bc14b0d1def..59c282ca002f 100644
--- a/arch/s390/appldata/appldata_net_sum.c
+++ b/arch/s390/appldata/appldata_net_sum.c
@@ -25,10 +25,6 @@
* This is accessed as binary data by z/VM. If changes to it can't be avoided,
* the structure version (product ID, see appldata_base.c) needs to be changed
* as well and all documentation and z/VM applications using it must be updated.
- *
- * The record layout is documented in the Linux for zSeries Device Drivers
- * book:
- * http://oss.software.ibm.com/developerworks/opensource/linux390/index.shtml
*/
struct appldata_net_sum_data {
u64 timestamp;
diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c
index 54f375627532..a363d30ce739 100644
--- a/arch/s390/appldata/appldata_os.c
+++ b/arch/s390/appldata/appldata_os.c
@@ -32,10 +32,6 @@
* the structure version (product ID, see appldata_base.c) needs to be changed
* as well and all documentation and z/VM applications using it must be
* updated.
- *
- * The record layout is documented in the Linux for zSeries Device Drivers
- * book:
- * http://oss.software.ibm.com/developerworks/opensource/linux390/index.shtml
*/
struct appldata_os_per_cpu {
u32 per_cpu_user; /* timer ticks spent in user mode */
@@ -75,7 +71,7 @@ struct appldata_os_data {
(waiting for I/O) */
/* per cpu data */
- struct appldata_os_per_cpu os_cpu[0];
+ struct appldata_os_per_cpu os_cpu[];
} __attribute__((packed));
static struct appldata_os_data *appldata_os_data;
@@ -133,8 +129,7 @@ static void appldata_get_os_data(void *data)
os_data->nr_cpus = j;
- new_size = sizeof(struct appldata_os_data) +
- (os_data->nr_cpus * sizeof(struct appldata_os_per_cpu));
+ new_size = struct_size(os_data, os_cpu, os_data->nr_cpus);
if (ops.size != new_size) {
if (ops.active) {
rc = appldata_diag(APPLDATA_RECORD_OS_ID,
@@ -169,8 +164,7 @@ static int __init appldata_os_init(void)
{
int rc, max_size;
- max_size = sizeof(struct appldata_os_data) +
- (num_possible_cpus() * sizeof(struct appldata_os_per_cpu));
+ max_size = struct_size(appldata_os_data, os_cpu, num_possible_cpus());
if (max_size > APPLDATA_MAX_REC_SIZE) {
pr_err("Maximum OS record size %i exceeds the maximum "
"record size %i\n", max_size, APPLDATA_MAX_REC_SIZE);
diff --git a/arch/s390/boot/.gitignore b/arch/s390/boot/.gitignore
index 16ff906e4610..f56591bc0897 100644
--- a/arch/s390/boot/.gitignore
+++ b/arch/s390/boot/.gitignore
@@ -1,3 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
image
bzImage
section_cmp.*
+vmlinux
+vmlinux.lds
+vmlinux.syms
diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile
index e2c47d3a1c89..c7c81e5f9218 100644
--- a/arch/s390/boot/Makefile
+++ b/arch/s390/boot/Makefile
@@ -7,6 +7,7 @@ KCOV_INSTRUMENT := n
GCOV_PROFILE := n
UBSAN_SANITIZE := n
KASAN_SANITIZE := n
+KCSAN_SANITIZE := n
KBUILD_AFLAGS := $(KBUILD_AFLAGS_DECOMPRESSOR)
KBUILD_CFLAGS := $(KBUILD_CFLAGS_DECOMPRESSOR)
@@ -34,16 +35,24 @@ endif
CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char
-obj-y := head.o als.o startup.o mem_detect.o ipl_parm.o ipl_report.o
+obj-y := head.o als.o startup.o physmem_info.o ipl_parm.o ipl_report.o vmem.o
obj-y += string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o
-obj-y += version.o pgm_check_info.o ctype.o text_dma.o
-obj-$(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) += uv.o
-obj-$(CONFIG_RELOCATABLE) += machine_kexec_reloc.o
+obj-y += version.o pgm_check_info.o ctype.o ipl_data.o machine_kexec_reloc.o
+obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o
obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
-targets := bzImage startup.a section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y)
-subdir- := compressed
+obj-y += $(if $(CONFIG_KERNEL_UNCOMPRESSED),,decompressor.o) info.o
+obj-$(CONFIG_KERNEL_ZSTD) += clz_ctz.o
+obj-all := $(obj-y) piggy.o syms.o
+
+targets := bzImage section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y)
+targets += vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2
+targets += vmlinux.bin.xz vmlinux.bin.lzma vmlinux.bin.lzo vmlinux.bin.lz4
+targets += vmlinux.bin.zst info.bin syms.bin vmlinux.syms $(obj-all)
OBJECTS := $(addprefix $(obj)/,$(obj-y))
+OBJECTS_ALL := $(addprefix $(obj)/,$(obj-all))
+
+clean-files += vmlinux.map
quiet_cmd_section_cmp = SECTCMP $*
define cmd_section_cmp
@@ -58,22 +67,67 @@ define cmd_section_cmp
touch $@
endef
-$(obj)/bzImage: $(obj)/compressed/vmlinux $(obj)/section_cmp.boot.data $(obj)/section_cmp.boot.preserved.data FORCE
+$(obj)/bzImage: $(obj)/vmlinux $(obj)/section_cmp.boot.data $(obj)/section_cmp.boot.preserved.data FORCE
$(call if_changed,objcopy)
-$(obj)/section_cmp%: vmlinux $(obj)/compressed/vmlinux FORCE
+$(obj)/section_cmp%: vmlinux $(obj)/vmlinux FORCE
$(call if_changed,section_cmp)
-$(obj)/compressed/vmlinux: $(obj)/startup.a FORCE
- $(Q)$(MAKE) $(build)=$(obj)/compressed $@
+LDFLAGS_vmlinux := --oformat $(LD_BFD) -e startup $(if $(CONFIG_VMLINUX_MAP),-Map=$(obj)/vmlinux.map) --build-id=sha1 -T
+$(obj)/vmlinux: $(obj)/vmlinux.lds $(OBJECTS_ALL) FORCE
+ $(call if_changed,ld)
+
+LDFLAGS_vmlinux.syms := --oformat $(LD_BFD) -e startup -T
+$(obj)/vmlinux.syms: $(obj)/vmlinux.lds $(OBJECTS) FORCE
+ $(call if_changed,ld)
+
+quiet_cmd_dumpsyms = DUMPSYMS $<
+define cmd_dumpsyms
+ $(NM) -n -S --format=bsd "$<" | sed -nE 's/^0*([0-9a-fA-F]+) 0*([0-9a-fA-F]+) [tT] ([^ ]*)$$/\1 \2 \3/p' | tr '\n' '\0' > "$@"
+endef
+
+$(obj)/syms.bin: $(obj)/vmlinux.syms FORCE
+ $(call if_changed,dumpsyms)
-$(obj)/startup.a: $(OBJECTS) FORCE
- $(call if_changed,ar)
+OBJCOPYFLAGS_syms.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section .data=.decompressor.syms
+$(obj)/syms.o: $(obj)/syms.bin FORCE
+ $(call if_changed,objcopy)
-install: $(CONFIGURE) $(obj)/bzImage
- sh -x $(srctree)/$(obj)/install.sh $(KERNELRELEASE) $(obj)/bzImage \
- System.map "$(INSTALL_PATH)"
+OBJCOPYFLAGS_info.bin := -O binary --only-section=.vmlinux.info --set-section-flags .vmlinux.info=load
+$(obj)/info.bin: vmlinux FORCE
+ $(call if_changed,objcopy)
-chkbss := $(obj-y)
-chkbss-target := startup.a
-include $(srctree)/arch/s390/scripts/Makefile.chkbss
+OBJCOPYFLAGS_info.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section .data=.vmlinux.info
+$(obj)/info.o: $(obj)/info.bin FORCE
+ $(call if_changed,objcopy)
+
+OBJCOPYFLAGS_vmlinux.bin := -O binary --remove-section=.comment --remove-section=.vmlinux.info -S
+$(obj)/vmlinux.bin: vmlinux FORCE
+ $(call if_changed,objcopy)
+
+suffix-$(CONFIG_KERNEL_GZIP) := .gz
+suffix-$(CONFIG_KERNEL_BZIP2) := .bz2
+suffix-$(CONFIG_KERNEL_LZ4) := .lz4
+suffix-$(CONFIG_KERNEL_LZMA) := .lzma
+suffix-$(CONFIG_KERNEL_LZO) := .lzo
+suffix-$(CONFIG_KERNEL_XZ) := .xz
+suffix-$(CONFIG_KERNEL_ZSTD) := .zst
+
+$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
+ $(call if_changed,gzip)
+$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE
+ $(call if_changed,bzip2_with_size)
+$(obj)/vmlinux.bin.lz4: $(obj)/vmlinux.bin FORCE
+ $(call if_changed,lz4_with_size)
+$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE
+ $(call if_changed,lzma_with_size)
+$(obj)/vmlinux.bin.lzo: $(obj)/vmlinux.bin FORCE
+ $(call if_changed,lzo_with_size)
+$(obj)/vmlinux.bin.xz: $(obj)/vmlinux.bin FORCE
+ $(call if_changed,xzkern_with_size)
+$(obj)/vmlinux.bin.zst: $(obj)/vmlinux.bin FORCE
+ $(call if_changed,zstd22_with_size)
+
+OBJCOPYFLAGS_piggy.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section .data=.vmlinux.bin.compressed
+$(obj)/piggy.o: $(obj)/vmlinux.bin$(suffix-y) FORCE
+ $(call if_changed,objcopy)
diff --git a/arch/s390/boot/als.c b/arch/s390/boot/als.c
index ff6801d401c4..47c48fbfb563 100644
--- a/arch/s390/boot/als.c
+++ b/arch/s390/boot/als.c
@@ -68,7 +68,7 @@ void print_missing_facilities(void)
first = 1;
for (i = 0; i < ARRAY_SIZE(als); i++) {
- val = ~S390_lowcore.stfle_fac_list[i] & als[i];
+ val = ~stfle_fac_list[i] & als[i];
for (j = 0; j < BITS_PER_LONG; j++) {
if (!(val & (1UL << (BITS_PER_LONG - 1 - j))))
continue;
@@ -106,9 +106,9 @@ void verify_facilities(void)
{
int i;
- __stfle(S390_lowcore.stfle_fac_list, ARRAY_SIZE(S390_lowcore.stfle_fac_list));
+ __stfle(stfle_fac_list, ARRAY_SIZE(stfle_fac_list));
for (i = 0; i < ARRAY_SIZE(als); i++) {
- if ((S390_lowcore.stfle_fac_list[i] & als[i]) != als[i])
+ if ((stfle_fac_list[i] & als[i]) != als[i])
facility_mismatch();
}
}
diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h
index 2ea603f70c3b..222c6886acf6 100644
--- a/arch/s390/boot/boot.h
+++ b/arch/s390/boot/boot.h
@@ -2,20 +2,101 @@
#ifndef BOOT_BOOT_H
#define BOOT_BOOT_H
+#include <linux/types.h>
+
+#define IPL_START 0x200
+
+#ifndef __ASSEMBLY__
+
+#include <asm/physmem_info.h>
+
+struct machine_info {
+ unsigned char has_edat1 : 1;
+ unsigned char has_edat2 : 1;
+ unsigned char has_nx : 1;
+};
+
+struct vmlinux_info {
+ unsigned long default_lma;
+ unsigned long entry;
+ unsigned long image_size; /* does not include .bss */
+ unsigned long bss_size; /* uncompressed image .bss size */
+ unsigned long bootdata_off;
+ unsigned long bootdata_size;
+ unsigned long bootdata_preserved_off;
+ unsigned long bootdata_preserved_size;
+ unsigned long dynsym_start;
+ unsigned long rela_dyn_start;
+ unsigned long rela_dyn_end;
+ unsigned long amode31_size;
+ unsigned long init_mm_off;
+ unsigned long swapper_pg_dir_off;
+ unsigned long invalid_pg_dir_off;
+#ifdef CONFIG_KASAN
+ unsigned long kasan_early_shadow_page_off;
+ unsigned long kasan_early_shadow_pte_off;
+ unsigned long kasan_early_shadow_pmd_off;
+ unsigned long kasan_early_shadow_pud_off;
+ unsigned long kasan_early_shadow_p4d_off;
+#endif
+};
+
void startup_kernel(void);
-void detect_memory(void);
+unsigned long detect_max_physmem_end(void);
+void detect_physmem_online_ranges(unsigned long max_physmem_end);
+void physmem_set_usable_limit(unsigned long limit);
+void physmem_reserve(enum reserved_range_type type, unsigned long addr, unsigned long size);
+void physmem_free(enum reserved_range_type type);
+/* for continuous/multiple allocations per type */
+unsigned long physmem_alloc_top_down(enum reserved_range_type type, unsigned long size,
+ unsigned long align);
+/* for single allocations, 1 per type */
+unsigned long physmem_alloc_range(enum reserved_range_type type, unsigned long size,
+ unsigned long align, unsigned long min, unsigned long max,
+ bool die_on_oom);
+unsigned long get_physmem_alloc_pos(void);
+bool ipl_report_certs_intersects(unsigned long addr, unsigned long size,
+ unsigned long *intersection_start);
+bool is_ipl_block_dump(void);
void store_ipl_parmblock(void);
+int read_ipl_report(void);
+void save_ipl_cert_comp_list(void);
void setup_boot_command_line(void);
void parse_boot_command_line(void);
-void setup_memory_end(void);
void verify_facilities(void);
void print_missing_facilities(void);
+void sclp_early_setup_buffer(void);
void print_pgm_check_info(void);
-unsigned long get_random_base(unsigned long safe_addr);
+unsigned long randomize_within_range(unsigned long size, unsigned long align,
+ unsigned long min, unsigned long max);
+void setup_vmem(unsigned long asce_limit);
+void __printf(1, 2) decompressor_printk(const char *fmt, ...);
+void print_stacktrace(unsigned long sp);
+void error(char *m);
+
+extern struct machine_info machine;
-extern int kaslr_enabled;
+/* Symbols defined by linker scripts */
extern const char kernel_version[];
+extern unsigned long memory_limit;
+extern unsigned long vmalloc_size;
+extern int vmalloc_size_set;
+extern char __boot_data_start[], __boot_data_end[];
+extern char __boot_data_preserved_start[], __boot_data_preserved_end[];
+extern char _decompressor_syms_start[], _decompressor_syms_end[];
+extern char _stack_start[], _stack_end[];
+extern char _end[], _decompressor_end[];
+extern unsigned char _compressed_start[];
+extern unsigned char _compressed_end[];
+extern struct vmlinux_info _vmlinux_info;
+#define vmlinux _vmlinux_info
-unsigned long read_ipl_report(unsigned long safe_offset);
+#define __abs_lowcore_pa(x) (((unsigned long)(x) - __abs_lowcore) % sizeof(struct lowcore))
+static inline bool intersects(unsigned long addr0, unsigned long size0,
+ unsigned long addr1, unsigned long size1)
+{
+ return addr0 + size0 > addr1 && addr1 + size1 > addr0;
+}
+#endif /* __ASSEMBLY__ */
#endif /* BOOT_BOOT_H */
diff --git a/arch/s390/boot/clz_ctz.c b/arch/s390/boot/clz_ctz.c
new file mode 100644
index 000000000000..c3ebf248596b
--- /dev/null
+++ b/arch/s390/boot/clz_ctz.c
@@ -0,0 +1,2 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "../../../../lib/clz_ctz.c"
diff --git a/arch/s390/boot/compressed/.gitignore b/arch/s390/boot/compressed/.gitignore
deleted file mode 100644
index e72fcd7ecebb..000000000000
--- a/arch/s390/boot/compressed/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-vmlinux
-vmlinux.lds
diff --git a/arch/s390/boot/compressed/Makefile b/arch/s390/boot/compressed/Makefile
deleted file mode 100644
index fa529c5b4486..000000000000
--- a/arch/s390/boot/compressed/Makefile
+++ /dev/null
@@ -1,68 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# linux/arch/s390/boot/compressed/Makefile
-#
-# create a compressed vmlinux image from the original vmlinux
-#
-
-KCOV_INSTRUMENT := n
-GCOV_PROFILE := n
-UBSAN_SANITIZE := n
-KASAN_SANITIZE := n
-
-obj-y := $(if $(CONFIG_KERNEL_UNCOMPRESSED),,decompressor.o) piggy.o info.o
-targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2
-targets += vmlinux.bin.xz vmlinux.bin.lzma vmlinux.bin.lzo vmlinux.bin.lz4
-targets += info.bin $(obj-y)
-
-KBUILD_AFLAGS := $(KBUILD_AFLAGS_DECOMPRESSOR)
-KBUILD_CFLAGS := $(KBUILD_CFLAGS_DECOMPRESSOR)
-OBJCOPYFLAGS :=
-
-OBJECTS := $(addprefix $(obj)/,$(obj-y))
-
-LDFLAGS_vmlinux := --oformat $(LD_BFD) -e startup -T
-$(obj)/vmlinux: $(obj)/vmlinux.lds $(objtree)/arch/s390/boot/startup.a $(OBJECTS) FORCE
- $(call if_changed,ld)
-
-OBJCOPYFLAGS_info.bin := -O binary --only-section=.vmlinux.info --set-section-flags .vmlinux.info=load
-$(obj)/info.bin: vmlinux FORCE
- $(call if_changed,objcopy)
-
-OBJCOPYFLAGS_info.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section .data=.vmlinux.info
-$(obj)/info.o: $(obj)/info.bin FORCE
- $(call if_changed,objcopy)
-
-OBJCOPYFLAGS_vmlinux.bin := -O binary --remove-section=.comment --remove-section=.vmlinux.info -S
-$(obj)/vmlinux.bin: vmlinux FORCE
- $(call if_changed,objcopy)
-
-vmlinux.bin.all-y := $(obj)/vmlinux.bin
-
-suffix-$(CONFIG_KERNEL_GZIP) := .gz
-suffix-$(CONFIG_KERNEL_BZIP2) := .bz2
-suffix-$(CONFIG_KERNEL_LZ4) := .lz4
-suffix-$(CONFIG_KERNEL_LZMA) := .lzma
-suffix-$(CONFIG_KERNEL_LZO) := .lzo
-suffix-$(CONFIG_KERNEL_XZ) := .xz
-
-$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE
- $(call if_changed,gzip)
-$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
- $(call if_changed,bzip2)
-$(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE
- $(call if_changed,lz4)
-$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE
- $(call if_changed,lzma)
-$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE
- $(call if_changed,lzo)
-$(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE
- $(call if_changed,xzkern)
-
-OBJCOPYFLAGS_piggy.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section .data=.vmlinux.bin.compressed
-$(obj)/piggy.o: $(obj)/vmlinux.bin$(suffix-y) FORCE
- $(call if_changed,objcopy)
-
-chkbss := $(filter-out piggy.o info.o, $(obj-y))
-chkbss-target := vmlinux.bin
-include $(srctree)/arch/s390/scripts/Makefile.chkbss
diff --git a/arch/s390/boot/compressed/decompressor.h b/arch/s390/boot/compressed/decompressor.h
deleted file mode 100644
index c15eb7114d83..000000000000
--- a/arch/s390/boot/compressed/decompressor.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef BOOT_COMPRESSED_DECOMPRESSOR_H
-#define BOOT_COMPRESSED_DECOMPRESSOR_H
-
-#ifdef CONFIG_KERNEL_UNCOMPRESSED
-static inline void *decompress_kernel(void) {}
-#else
-void *decompress_kernel(void);
-#endif
-unsigned long mem_safe_offset(void);
-void error(char *m);
-
-struct vmlinux_info {
- unsigned long default_lma;
- void (*entry)(void);
- unsigned long image_size; /* does not include .bss */
- unsigned long bss_size; /* uncompressed image .bss size */
- unsigned long bootdata_off;
- unsigned long bootdata_size;
- unsigned long bootdata_preserved_off;
- unsigned long bootdata_preserved_size;
- unsigned long dynsym_start;
- unsigned long rela_dyn_start;
- unsigned long rela_dyn_end;
-};
-
-extern char _vmlinux_info[];
-#define vmlinux (*(struct vmlinux_info *)_vmlinux_info)
-
-#endif /* BOOT_COMPRESSED_DECOMPRESSOR_H */
diff --git a/arch/s390/boot/compressed/decompressor.c b/arch/s390/boot/decompressor.c
index 45046630c56a..d762733a0753 100644
--- a/arch/s390/boot/compressed/decompressor.c
+++ b/arch/s390/boot/decompressor.c
@@ -11,12 +11,12 @@
#include <linux/string.h>
#include <asm/page.h>
#include "decompressor.h"
+#include "boot.h"
/*
* gzip declarations
*/
#define STATIC static
-#define STATIC_RW_DATA static __section(.data)
#undef memset
#undef memcpy
@@ -24,19 +24,16 @@
#define memmove memmove
#define memzero(s, n) memset((s), 0, (n))
-/* Symbols defined by linker scripts */
-extern char _end[];
-extern unsigned char _compressed_start[];
-extern unsigned char _compressed_end[];
-
-#ifdef CONFIG_HAVE_KERNEL_BZIP2
-#define HEAP_SIZE 0x400000
+#if defined(CONFIG_KERNEL_BZIP2)
+#define BOOT_HEAP_SIZE 0x400000
+#elif defined(CONFIG_KERNEL_ZSTD)
+#define BOOT_HEAP_SIZE 0x30000
#else
-#define HEAP_SIZE 0x10000
+#define BOOT_HEAP_SIZE 0x10000
#endif
static unsigned long free_mem_ptr = (unsigned long) _end;
-static unsigned long free_mem_end_ptr = (unsigned long) _end + HEAP_SIZE;
+static unsigned long free_mem_end_ptr = (unsigned long) _end + BOOT_HEAP_SIZE;
#ifdef CONFIG_KERNEL_GZIP
#include "../../../../lib/decompress_inflate.c"
@@ -62,7 +59,11 @@ static unsigned long free_mem_end_ptr = (unsigned long) _end + HEAP_SIZE;
#include "../../../../lib/decompress_unxz.c"
#endif
-#define decompress_offset ALIGN((unsigned long)_end + HEAP_SIZE, PAGE_SIZE)
+#ifdef CONFIG_KERNEL_ZSTD
+#include "../../../../lib/decompress_unzstd.c"
+#endif
+
+#define decompress_offset ALIGN((unsigned long)_end + BOOT_HEAP_SIZE, PAGE_SIZE)
unsigned long mem_safe_offset(void)
{
@@ -80,6 +81,6 @@ void *decompress_kernel(void)
void *output = (void *)decompress_offset;
__decompress(_compressed_start, _compressed_end - _compressed_start,
- NULL, NULL, output, 0, NULL, error);
+ NULL, NULL, output, vmlinux.image_size, NULL, error);
return output;
}
diff --git a/arch/s390/boot/decompressor.h b/arch/s390/boot/decompressor.h
new file mode 100644
index 000000000000..92b81d2ea35d
--- /dev/null
+++ b/arch/s390/boot/decompressor.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef BOOT_COMPRESSED_DECOMPRESSOR_H
+#define BOOT_COMPRESSED_DECOMPRESSOR_H
+
+#ifdef CONFIG_KERNEL_UNCOMPRESSED
+static inline void *decompress_kernel(void) { return NULL; }
+#else
+void *decompress_kernel(void);
+#endif
+unsigned long mem_safe_offset(void);
+
+#endif /* BOOT_COMPRESSED_DECOMPRESSOR_H */
diff --git a/arch/s390/boot/head.S b/arch/s390/boot/head.S
index 4b86a8d3c121..637c29c3f6e3 100644
--- a/arch/s390/boot/head.S
+++ b/arch/s390/boot/head.S
@@ -5,7 +5,6 @@
* Author(s): Hartmut Penner <hp@de.ibm.com>
* Martin Schwidefsky <schwidefsky@de.ibm.com>
* Rob van der Heij <rvdhei@iae.nl>
- * Heiko Carstens <heiko.carstens@de.ibm.com>
*
* There are 5 different IPL methods
* 1) load the image directly into ram at address 0 and do an PSW restart
@@ -25,259 +24,201 @@
#include <linux/init.h>
#include <linux/linkage.h>
#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
#include <asm/page.h>
#include <asm/ptrace.h>
+#include <asm/sclp.h>
+#include "boot.h"
-#define ARCH_OFFSET 4
+#define EP_OFFSET 0x10008
+#define EP_STRING "S390EP"
+#define IPL_BS 0x730
__HEAD
-
-#define IPL_BS 0x730
- .org 0
- .long 0x00080000,0x80000000+iplstart # The first 24 bytes are loaded
- .long 0x02000018,0x60000050 # by ipl to addresses 0-23.
- .long 0x02000068,0x60000050 # (a PSW and two CCWs).
- .fill 80-24,1,0x40 # bytes 24-79 are discarded !!
- .long 0x020000f0,0x60000050 # The next 160 byte are loaded
- .long 0x02000140,0x60000050 # to addresses 0x18-0xb7
- .long 0x02000190,0x60000050 # They form the continuation
- .long 0x020001e0,0x60000050 # of the CCW program started
- .long 0x02000230,0x60000050 # by ipl and load the range
- .long 0x02000280,0x60000050 # 0x0f0-0x730 from the image
- .long 0x020002d0,0x60000050 # to the range 0x0f0-0x730
- .long 0x02000320,0x60000050 # in memory. At the end of
- .long 0x02000370,0x60000050 # the channel program the PSW
- .long 0x020003c0,0x60000050 # at location 0 is loaded.
- .long 0x02000410,0x60000050 # Initial processing starts
- .long 0x02000460,0x60000050 # at 0x200 = iplstart.
- .long 0x020004b0,0x60000050
- .long 0x02000500,0x60000050
- .long 0x02000550,0x60000050
- .long 0x020005a0,0x60000050
- .long 0x020005f0,0x60000050
- .long 0x02000640,0x60000050
- .long 0x02000690,0x60000050
- .long 0x020006e0,0x20000050
-
- .org __LC_RST_NEW_PSW # 0x1a0
- .quad 0,iplstart
- .org __LC_PGM_NEW_PSW # 0x1d0
- .quad 0x0000000180000000,startup_pgm_check_handler
-
- .org 0x200
-
+ipl_start:
+ mvi __LC_AR_MODE_ID,1 # set esame flag
+ slr %r0,%r0 # set cpuid to zero
+ lhi %r1,2 # mode 2 = esame (dump)
+ sigp %r1,%r0,0x12 # switch to esame mode
+ sam64 # switch to 64 bit addressing mode
+ lgh %r1,__LC_SUBCHANNEL_ID # test if subchannel number
+ brctg %r1,.Lnoload # is valid
+ llgf %r1,__LC_SUBCHANNEL_ID # load ipl subchannel number
+ lghi %r2,IPL_BS # load start address
+ bras %r14,.Lloader # load rest of ipl image
+ larl %r12,parmarea # pointer to parameter area
+ stg %r1,IPL_DEVICE-PARMAREA(%r12) # save ipl device number
+#
+# load parameter file from ipl device
+#
+.Lagain1:
+ larl %r2,_end # ramdisk loc. is temp
+ bras %r14,.Lloader # load parameter file
+ ltgr %r2,%r2 # got anything ?
+ jz .Lnopf
+ lg %r3,MAX_COMMAND_LINE_SIZE-PARMAREA(%r12)
+ aghi %r3,-1
+ clgr %r2,%r3
+ jl .Lnotrunc
+ lgr %r2,%r3
+.Lnotrunc:
+ larl %r4,_end
+ larl %r13,.L_hdr
+ clc 0(3,%r4),0(%r13) # if it is HDRx
+ jz .Lagain1 # skip dataset header
+ larl %r13,.L_eof
+ clc 0(3,%r4),0(%r13) # if it is EOFx
+ jz .Lagain1 # skip data set trailer
+ lgr %r5,%r2
+ la %r6,COMMAND_LINE-PARMAREA(%r12)
+ lgr %r7,%r2
+ aghi %r7,1
+ mvcl %r6,%r4
+.Lnopf:
+#
+# load ramdisk from ipl device
+#
+.Lagain2:
+ larl %r2,_end # addr of ramdisk
+ stg %r2,INITRD_START-PARMAREA(%r12)
+ bras %r14,.Lloader # load ramdisk
+ stg %r2,INITRD_SIZE-PARMAREA(%r12) # store size of rd
+ ltgr %r2,%r2
+ jnz .Lrdcont
+ stg %r2,INITRD_START-PARMAREA(%r12) # no ramdisk found
+.Lrdcont:
+ larl %r2,_end
+ larl %r13,.L_hdr # skip HDRx and EOFx
+ clc 0(3,%r2),0(%r13)
+ jz .Lagain2
+ larl %r13,.L_eof
+ clc 0(3,%r2),0(%r13)
+ jz .Lagain2
+#
+# reset files in VM reader
+#
+ larl %r13,.Lcpuid
+ stidp 0(%r13) # store cpuid
+ tm 0(%r13),0xff # running VM ?
+ jno .Lnoreset
+ larl %r2,.Lreset
+ lghi %r3,26
+ diag %r2,%r3,8
+ larl %r5,.Lirb
+ stsch 0(%r5) # check if irq is pending
+ tm 30(%r5),0x0f # by verifying if any of the
+ jnz .Lwaitforirq # activity or status control
+ tm 31(%r5),0xff # bits is set in the schib
+ jz .Lnoreset
+.Lwaitforirq:
+ bras %r14,.Lirqwait # wait for IO interrupt
+ c %r1,__LC_SUBCHANNEL_ID # compare subchannel number
+ jne .Lwaitforirq
+ larl %r5,.Lirb
+ tsch 0(%r5)
+.Lnoreset:
+ j .Lnoload
+#
+# everything loaded, go for it
+#
+.Lnoload:
+ jg startup
#
# subroutine to wait for end I/O
#
.Lirqwait:
- mvc __LC_IO_NEW_PSW(16),.Lnewpsw # set up IO interrupt psw
- lpsw .Lwaitpsw
+ larl %r13,.Lnewpswmask # set up IO interrupt psw
+ mvc __LC_IO_NEW_PSW(8),0(%r13)
+ stg %r14,__LC_IO_NEW_PSW+8
+ larl %r13,.Lwaitpsw
+ lpswe 0(%r13)
.Lioint:
- br %r14
- .align 8
-.Lnewpsw:
- .quad 0x0000000080000000,.Lioint
-.Lwaitpsw:
- .long 0x020a0000,0x80000000+.Lioint
-
#
# subroutine for loading cards from the reader
#
.Lloader:
- la %r4,0(%r14)
- la %r3,.Lorb # r2 = address of orb into r2
- la %r5,.Lirb # r4 = address of irb
- la %r6,.Lccws
- la %r7,20
+ lgr %r4,%r14
+	larl	%r3,.Lorb		# r3 = address of orb
+	larl	%r5,.Lirb		# r5 = address of irb
+ larl %r6,.Lccws
+ lghi %r7,20
.Linit:
st %r2,4(%r6) # initialize CCW data addresses
la %r2,0x50(%r2)
la %r6,8(%r6)
- bct 7,.Linit
-
- lctl %c6,%c6,.Lcr6 # set IO subclass mask
- slr %r2,%r2
+ brctg %r7,.Linit
+ larl %r13,.Lcr6
+ lctlg %c6,%c6,0(%r13)
+ xgr %r2,%r2
.Lldlp:
ssch 0(%r3) # load chunk of 1600 bytes
- bnz .Llderr
+ jnz .Llderr
.Lwait4irq:
- bas %r14,.Lirqwait
+ bras %r14,.Lirqwait
c %r1,__LC_SUBCHANNEL_ID # compare subchannel number
- bne .Lwait4irq
+ jne .Lwait4irq
tsch 0(%r5)
-
- slr %r0,%r0
+ xgr %r0,%r0
ic %r0,8(%r5) # get device status
- chi %r0,8 # channel end ?
- be .Lcont
- chi %r0,12 # channel end + device end ?
- be .Lcont
-
- l %r0,4(%r5)
- s %r0,8(%r3) # r0/8 = number of ccws executed
- mhi %r0,10 # *10 = number of bytes in ccws
- lh %r3,10(%r5) # get residual count
- sr %r0,%r3 # #ccws*80-residual=#bytes read
- ar %r2,%r0
-
+ cghi %r0,8 # channel end ?
+ je .Lcont
+ cghi %r0,12 # channel end + device end ?
+ je .Lcont
+ llgf %r0,4(%r5)
+ sgf %r0,8(%r3) # r0/8 = number of ccws executed
+ mghi %r0,10 # *10 = number of bytes in ccws
+ llgh %r3,10(%r5) # get residual count
+ sgr %r0,%r3 # #ccws*80-residual=#bytes read
+ agr %r2,%r0
br %r4 # r2 contains the total size
-
.Lcont:
- ahi %r2,0x640 # add 0x640 to total size
- la %r6,.Lccws
- la %r7,20
+ aghi %r2,0x640 # add 0x640 to total size
+ larl %r6,.Lccws
+ lghi %r7,20
.Lincr:
l %r0,4(%r6) # update CCW data addresses
- ahi %r0,0x640
+ aghi %r0,0x640
st %r0,4(%r6)
- ahi %r6,8
- bct 7,.Lincr
-
- b .Lldlp
+ aghi %r6,8
+ brctg %r7,.Lincr
+ j .Lldlp
.Llderr:
- lpsw .Lcrash
+ larl %r13,.Lcrash
+ lpsw 0(%r13)
- .align 8
+ .balign 8
+.Lwaitpsw:
+ .quad 0x0202000180000000,.Lioint
+.Lnewpswmask:
+ .quad 0x0000000180000000
+ .balign 8
.Lorb: .long 0x00000000,0x0080ff00,.Lccws
.Lirb: .long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-.Lcr6: .long 0xff000000
-.Lloadp:.long 0,0
- .align 8
+ .balign 8
+.Lcr6: .quad 0x00000000ff000000
+ .balign 8
.Lcrash:.long 0x000a0000,0x00000000
-
- .align 8
+ .balign 8
.Lccws: .rept 19
.long 0x02600050,0x00000000
.endr
.long 0x02200050,0x00000000
-
-iplstart:
- mvi __LC_AR_MODE_ID,1 # set esame flag
- slr %r0,%r0 # set cpuid to zero
- lhi %r1,2 # mode 2 = esame (dump)
- sigp %r1,%r0,0x12 # switch to esame mode
- bras %r13,0f
- .fill 16,4,0x0
-0: lmh %r0,%r15,0(%r13) # clear high-order half of gprs
- sam31 # switch to 31 bit addressing mode
- lh %r1,__LC_SUBCHANNEL_ID # test if subchannel number
- bct %r1,.Lnoload # is valid
- l %r1,__LC_SUBCHANNEL_ID # load ipl subchannel number
- la %r2,IPL_BS # load start address
- bas %r14,.Lloader # load rest of ipl image
- l %r12,.Lparm # pointer to parameter area
- st %r1,IPL_DEVICE+ARCH_OFFSET-PARMAREA(%r12) # save ipl device number
-
-#
-# load parameter file from ipl device
-#
-.Lagain1:
- l %r2,.Linitrd # ramdisk loc. is temp
- bas %r14,.Lloader # load parameter file
- ltr %r2,%r2 # got anything ?
- bz .Lnopf
- chi %r2,895
- bnh .Lnotrunc
- la %r2,895
-.Lnotrunc:
- l %r4,.Linitrd
- clc 0(3,%r4),.L_hdr # if it is HDRx
- bz .Lagain1 # skip dataset header
- clc 0(3,%r4),.L_eof # if it is EOFx
- bz .Lagain1 # skip dateset trailer
- la %r5,0(%r4,%r2)
- lr %r3,%r2
- la %r3,COMMAND_LINE-PARMAREA(%r12) # load adr. of command line
- mvc 0(256,%r3),0(%r4)
- mvc 256(256,%r3),256(%r4)
- mvc 512(256,%r3),512(%r4)
- mvc 768(122,%r3),768(%r4)
- slr %r0,%r0
- b .Lcntlp
-.Ldelspc:
- ic %r0,0(%r2,%r3)
- chi %r0,0x20 # is it a space ?
- be .Lcntlp
- ahi %r2,1
- b .Leolp
-.Lcntlp:
- brct %r2,.Ldelspc
-.Leolp:
- slr %r0,%r0
- stc %r0,0(%r2,%r3) # terminate buffer
-.Lnopf:
-
-#
-# load ramdisk from ipl device
-#
-.Lagain2:
- l %r2,.Linitrd # addr of ramdisk
- st %r2,INITRD_START+ARCH_OFFSET-PARMAREA(%r12)
- bas %r14,.Lloader # load ramdisk
- st %r2,INITRD_SIZE+ARCH_OFFSET-PARMAREA(%r12) # store size of rd
- ltr %r2,%r2
- bnz .Lrdcont
- st %r2,INITRD_START+ARCH_OFFSET-PARMAREA(%r12) # no ramdisk found
-.Lrdcont:
- l %r2,.Linitrd
-
- clc 0(3,%r2),.L_hdr # skip HDRx and EOFx
- bz .Lagain2
- clc 0(3,%r2),.L_eof
- bz .Lagain2
-
-#
-# reset files in VM reader
-#
- stidp .Lcpuid # store cpuid
- tm .Lcpuid,0xff # running VM ?
- bno .Lnoreset
- la %r2,.Lreset
- lhi %r3,26
- diag %r2,%r3,8
- la %r5,.Lirb
- stsch 0(%r5) # check if irq is pending
- tm 30(%r5),0x0f # by verifying if any of the
- bnz .Lwaitforirq # activity or status control
- tm 31(%r5),0xff # bits is set in the schib
- bz .Lnoreset
-.Lwaitforirq:
- bas %r14,.Lirqwait # wait for IO interrupt
- c %r1,__LC_SUBCHANNEL_ID # compare subchannel number
- bne .Lwaitforirq
- la %r5,.Lirb
- tsch 0(%r5)
-.Lnoreset:
- b .Lnoload
-
-#
-# everything loaded, go for it
-#
-.Lnoload:
- l %r1,.Lstartup
- br %r1
-
-.Linitrd:.long _end # default address of initrd
-.Lparm: .long PARMAREA
-.Lstartup: .long startup
.Lreset:.byte 0xc3,0xc8,0xc1,0xd5,0xc7,0xc5,0x40,0xd9,0xc4,0xd9,0x40
.byte 0xc1,0xd3,0xd3,0x40,0xd2,0xc5,0xc5,0xd7,0x40,0xd5,0xd6
.byte 0xc8,0xd6,0xd3,0xc4 # "change rdr all keep nohold"
.L_eof: .long 0xc5d6c600 /* C'EOF' */
.L_hdr: .long 0xc8c4d900 /* C'HDR' */
- .align 8
+ .balign 8
.Lcpuid:.fill 8,1,0
#
-# startup-code at 0x10000, running in absolute addressing mode
+# normal startup-code, running in absolute addressing mode
# this is called either by the ipl loader or directly by PSW restart
# or linload or SALIPL
#
- .org 0x10000
-ENTRY(startup)
- j .Lep_startup_normal
- .org EP_OFFSET
+ .org STARTUP_NORMAL_OFFSET - IPL_START
+SYM_CODE_START(startup)
+ j startup_normal
+ .org EP_OFFSET - IPL_START
#
# This is a list of s390 kernel entry points. At address 0x1000f the number of
# valid entry points is stored.
@@ -287,12 +228,12 @@ ENTRY(startup)
.ascii EP_STRING
.byte 0x00,0x01
#
-# kdump startup-code at 0x10010, running in 64 bit absolute addressing mode
+# kdump startup-code, running in 64 bit absolute addressing mode
#
- .org 0x10010
-ENTRY(startup_kdump)
- j .Lep_startup_kdump
-.Lep_startup_normal:
+ .org STARTUP_KDUMP_OFFSET - IPL_START
+ j startup_kdump
+SYM_CODE_END(startup)
+SYM_CODE_START_LOCAL(startup_normal)
mvi __LC_AR_MODE_ID,1 # set esame flag
slr %r0,%r0 # set cpuid to zero
lhi %r1,2 # mode 2 = esame (dump)
@@ -301,55 +242,53 @@ ENTRY(startup_kdump)
.fill 16,4,0x0
0: lmh %r0,%r15,0(%r13) # clear high-order half of gprs
sam64 # switch to 64 bit addressing mode
- basr %r13,0 # get base
-.LPG0:
+ larl %r13,.Lext_new_psw
+ mvc __LC_EXT_NEW_PSW(16),0(%r13)
+ larl %r13,.Lpgm_new_psw
+ mvc __LC_PGM_NEW_PSW(16),0(%r13)
+ larl %r13,.Lio_new_psw
+ mvc __LC_IO_NEW_PSW(16),0(%r13)
xc 0x200(256),0x200 # partially clear lowcore
xc 0x300(256),0x300
xc 0xe00(256),0xe00
xc 0xf00(256),0xf00
- lctlg %c0,%c15,.Lctl-.LPG0(%r13) # load control registers
+ larl %r13,.Lctl
+ lctlg %c0,%c15,0(%r13) # load control registers
stcke __LC_BOOT_CLOCK
mvc __LC_LAST_UPDATE_CLOCK(8),__LC_BOOT_CLOCK+1
- spt 6f-.LPG0(%r13)
- mvc __LC_LAST_UPDATE_TIMER(8),6f-.LPG0(%r13)
- l %r15,.Lstack-.LPG0(%r13)
+ larl %r13,6f
+ spt 0(%r13)
+ mvc __LC_LAST_UPDATE_TIMER(8),0(%r13)
+ larl %r15,_stack_end-STACK_FRAME_OVERHEAD
+ brasl %r14,sclp_early_setup_buffer
brasl %r14,verify_facilities
brasl %r14,startup_kernel
+SYM_CODE_END(startup_normal)
-.Lstack:
- .long 0x8000 + (1<<(PAGE_SHIFT+BOOT_STACK_ORDER)) - STACK_FRAME_OVERHEAD
- .align 8
+ .balign 8
6: .long 0x7fffffff,0xffffffff
-
+.Lext_new_psw:
+ .quad 0x0002000180000000,0x1b0 # disabled wait
+.Lpgm_new_psw:
+ .quad 0x0000000180000000,startup_pgm_check_handler
+.Lio_new_psw:
+ .quad 0x0002000180000000,0x1f0 # disabled wait
.Lctl: .quad 0x04040000 # cr0: AFP registers & secondary space
.quad 0 # cr1: primary space segment table
- .quad .Lduct # cr2: dispatchable unit control table
+ .quad 0 # cr2: dispatchable unit control table
.quad 0 # cr3: instruction authorization
.quad 0xffff # cr4: instruction authorization
- .quad .Lduct # cr5: primary-aste origin
+ .quad 0 # cr5: primary-aste origin
.quad 0 # cr6: I/O interrupts
.quad 0 # cr7: secondary space segment table
- .quad 0 # cr8: access registers translation
+ .quad 0x0000000000008000 # cr8: access registers translation
.quad 0 # cr9: tracing off
.quad 0 # cr10: tracing off
.quad 0 # cr11: tracing off
.quad 0 # cr12: tracing off
.quad 0 # cr13: home space segment table
.quad 0xc0000000 # cr14: machine check handling off
- .quad .Llinkage_stack # cr15: linkage stack operations
-
- .section .dma.data,"aw",@progbits
-.Lduct: .long 0,.Laste,.Laste,0,.Lduald,0,0,0
- .long 0,0,0,0,0,0,0,0
-.Llinkage_stack:
- .long 0,0,0x89000000,0,0,0,0x8a000000,0
- .align 64
-.Laste: .quad 0,0xffffffffffffffff,0,0,0,0,0,0
- .align 128
-.Lduald:.rept 8
- .long 0x80000000,0,0,0 # invalid access-list entries
- .endr
- .previous
+ .quad 0 # cr15: linkage stack operations
#include "head_kdump.S"
@@ -359,45 +298,23 @@ ENTRY(startup_kdump)
# It simply saves general/control registers and psw in
# the save area and does disabled wait with a faulty address.
#
-ENTRY(startup_pgm_check_handler)
- stmg %r0,%r15,__LC_SAVE_AREA_SYNC
- la %r1,4095
- stctg %c0,%c15,__LC_CREGS_SAVE_AREA-4095(%r1)
- mvc __LC_GPREGS_SAVE_AREA-4095(128,%r1),__LC_SAVE_AREA_SYNC
- mvc __LC_PSW_SAVE_AREA-4095(16,%r1),__LC_PGM_OLD_PSW
+SYM_CODE_START_LOCAL(startup_pgm_check_handler)
+ stmg %r8,%r15,__LC_SAVE_AREA_SYNC
+ la %r8,4095
+ stctg %c0,%c15,__LC_CREGS_SAVE_AREA-4095(%r8)
+ stmg %r0,%r7,__LC_GPREGS_SAVE_AREA-4095(%r8)
+ mvc __LC_GPREGS_SAVE_AREA-4095+64(64,%r8),__LC_SAVE_AREA_SYNC
+ mvc __LC_PSW_SAVE_AREA-4095(16,%r8),__LC_PGM_OLD_PSW
mvc __LC_RETURN_PSW(16),__LC_PGM_OLD_PSW
ni __LC_RETURN_PSW,0xfc # remove IO and EX bits
ni __LC_RETURN_PSW+1,0xfb # remove MCHK bit
oi __LC_RETURN_PSW+1,0x2 # set wait state bit
- larl %r2,.Lold_psw_disabled_wait
- stg %r2,__LC_PGM_NEW_PSW+8
- l %r15,.Ldump_info_stack-.Lold_psw_disabled_wait(%r2)
+ larl %r9,.Lold_psw_disabled_wait
+ stg %r9,__LC_PGM_NEW_PSW+8
+ larl %r15,_dump_info_stack_end-STACK_FRAME_OVERHEAD
brasl %r14,print_pgm_check_info
.Lold_psw_disabled_wait:
- la %r1,4095
- lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)
+ la %r8,4095
+ lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r8)
lpswe __LC_RETURN_PSW # disabled wait
-.Ldump_info_stack:
- .long 0x5000 + PAGE_SIZE - STACK_FRAME_OVERHEAD
-ENDPROC(startup_pgm_check_handler)
-
-#
-# params at 10400 (setup.h)
-# Must be keept in sync with struct parmarea in setup.h
-#
- .org PARMAREA
- .quad 0 # IPL_DEVICE
- .quad 0 # INITRD_START
- .quad 0 # INITRD_SIZE
- .quad 0 # OLDMEM_BASE
- .quad 0 # OLDMEM_SIZE
- .quad kernel_version # points to kernel version string
-
- .org COMMAND_LINE
- .byte "root=/dev/ram0 ro"
- .byte 0
-
- .org EARLY_SCCB_OFFSET
- .fill 4096
-
- .org HEAD_END
+SYM_CODE_END(startup_pgm_check_handler)
diff --git a/arch/s390/boot/head_kdump.S b/arch/s390/boot/head_kdump.S
index 174d6959bf5b..f7107c76258c 100644
--- a/arch/s390/boot/head_kdump.S
+++ b/arch/s390/boot/head_kdump.S
@@ -19,8 +19,7 @@
# Note: This code has to be position independent
#
-.align 2
-.Lep_startup_kdump:
+SYM_CODE_START_LOCAL(startup_kdump)
lhi %r1,2 # mode 2 = esame (dump)
sigp %r1,%r0,SIGP_SET_ARCHITECTURE # Switch to esame mode
sam64 # Switch to 64 bit addressing
@@ -83,19 +82,20 @@
#
# Startup of kdump (relocated new kernel)
#
-.align 2
+ .balign 2
startup_kdump_relocated:
basr %r13,0
0: lpswe .Lrestart_psw-0b(%r13) # Start new kernel...
-.align 8
+SYM_CODE_END(startup_kdump)
+ .balign 8
.Lrestart_psw:
.quad 0x0000000080000000,0x0000000000000000 + startup
#else
-.align 2
-.Lep_startup_kdump:
+SYM_CODE_START_LOCAL(startup_kdump)
larl %r13,startup_kdump_crash
lpswe 0(%r13)
-.align 8
+SYM_CODE_END(startup_kdump)
+ .balign 8
startup_kdump_crash:
.quad 0x0002000080000000,0x0000000000000000 + startup_kdump_crash
#endif /* CONFIG_CRASH_DUMP */
diff --git a/arch/s390/boot/install.sh b/arch/s390/boot/install.sh
index bed227f267ae..a13dd2f2aa1c 100644..100755
--- a/arch/s390/boot/install.sh
+++ b/arch/s390/boot/install.sh
@@ -14,22 +14,11 @@
# $2 - kernel image file
# $3 - kernel map file
# $4 - default install path (blank if root directory)
-#
-
-# User may have a custom install script
-
-if [ -x ~/bin/${INSTALLKERNEL} ]; then exec ~/bin/${INSTALLKERNEL} "$@"; fi
-if [ -x /sbin/${INSTALLKERNEL} ]; then exec /sbin/${INSTALLKERNEL} "$@"; fi
-
-# Default install - same as make zlilo
-
-if [ -f $4/vmlinuz ]; then
- mv $4/vmlinuz $4/vmlinuz.old
-fi
-if [ -f $4/System.map ]; then
- mv $4/System.map $4/System.old
-fi
+echo "Warning: '${INSTALLKERNEL}' command not available - additional " \
+ "bootloader config required" >&2
+if [ -f "$4/vmlinuz-$1" ]; then mv -- "$4/vmlinuz-$1" "$4/vmlinuz-$1.old"; fi
+if [ -f "$4/System.map-$1" ]; then mv -- "$4/System.map-$1" "$4/System.map-$1.old"; fi
-cat $2 > $4/vmlinuz
-cp $3 $4/System.map
+cat -- "$2" > "$4/vmlinuz-$1"
+cp -- "$3" "$4/System.map-$1"
diff --git a/arch/s390/boot/ipl_data.c b/arch/s390/boot/ipl_data.c
new file mode 100644
index 000000000000..0846e2b249c6
--- /dev/null
+++ b/arch/s390/boot/ipl_data.c
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/compat.h>
+#include <linux/ptrace.h>
+#include <asm/cio.h>
+#include <asm/asm-offsets.h>
+#include "boot.h"
+
+#define CCW0(cmd, addr, cnt, flg) \
+ { .cmd_code = cmd, .cda = addr, .count = cnt, .flags = flg, }
+
+#define PSW_MASK_DISABLED (PSW_MASK_WAIT | PSW_MASK_EA | PSW_MASK_BA)
+
+struct ipl_lowcore {
+ psw_t32 ipl_psw; /* 0x0000 */
+ struct ccw0 ccwpgm[2]; /* 0x0008 */
+ u8 fill[56]; /* 0x0018 */
+ struct ccw0 ccwpgmcc[20]; /* 0x0050 */
+ u8 pad_0xf0[0x01a0-0x00f0]; /* 0x00f0 */
+ psw_t restart_psw; /* 0x01a0 */
+ psw_t external_new_psw; /* 0x01b0 */
+ psw_t svc_new_psw; /* 0x01c0 */
+ psw_t program_new_psw; /* 0x01d0 */
+ psw_t mcck_new_psw; /* 0x01e0 */
+ psw_t io_new_psw; /* 0x01f0 */
+};
+
+/*
+ * Initial lowcore for IPL: the first 24 bytes are loaded by IPL to
+ * addresses 0-23 (a PSW and two CCWs). Bytes 24-79 are discarded.
+ * The next 160 bytes are loaded to addresses 0x18-0xb7. They form
+ * the continuation of the CCW program started by IPL and load the
+ * range 0x0f0-0x730 from the image to the range 0x0f0-0x730 in
+ * memory. At the end of the channel program the PSW at location 0 is
+ * loaded.
+ * Initial processing starts at 0x200 = iplstart.
+ *
+ * The restart psw points to iplstart, which allows loading a kernel
+ * image into memory and starting it by a psw restart on any cpu. All
+ * other default psw new locations contain a disabled wait psw where
+ * the address indicates which psw was loaded.
+ *
+ * Note that the 'file' utility can detect s390 kernel images. For
+ * that to succeed the two initial CCWs, and the 0x40 fill bytes must
+ * be present.
+ */
+static struct ipl_lowcore ipl_lowcore __used __section(".ipldata") = {
+ .ipl_psw = { .mask = PSW32_MASK_BASE, .addr = PSW32_ADDR_AMODE | IPL_START },
+ .ccwpgm = {
+ [ 0] = CCW0(CCW_CMD_READ_IPL, 0x018, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [ 1] = CCW0(CCW_CMD_READ_IPL, 0x068, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ },
+ .fill = {
+ [ 0 ... 55] = 0x40,
+ },
+ .ccwpgmcc = {
+ [ 0] = CCW0(CCW_CMD_READ_IPL, 0x0f0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [ 1] = CCW0(CCW_CMD_READ_IPL, 0x140, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [ 2] = CCW0(CCW_CMD_READ_IPL, 0x190, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [ 3] = CCW0(CCW_CMD_READ_IPL, 0x1e0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [ 4] = CCW0(CCW_CMD_READ_IPL, 0x230, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [ 5] = CCW0(CCW_CMD_READ_IPL, 0x280, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [ 6] = CCW0(CCW_CMD_READ_IPL, 0x2d0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [ 7] = CCW0(CCW_CMD_READ_IPL, 0x320, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [ 8] = CCW0(CCW_CMD_READ_IPL, 0x370, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [ 9] = CCW0(CCW_CMD_READ_IPL, 0x3c0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [10] = CCW0(CCW_CMD_READ_IPL, 0x410, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [11] = CCW0(CCW_CMD_READ_IPL, 0x460, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [12] = CCW0(CCW_CMD_READ_IPL, 0x4b0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [13] = CCW0(CCW_CMD_READ_IPL, 0x500, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [14] = CCW0(CCW_CMD_READ_IPL, 0x550, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [15] = CCW0(CCW_CMD_READ_IPL, 0x5a0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [16] = CCW0(CCW_CMD_READ_IPL, 0x5f0, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [17] = CCW0(CCW_CMD_READ_IPL, 0x640, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [18] = CCW0(CCW_CMD_READ_IPL, 0x690, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
+ [19] = CCW0(CCW_CMD_READ_IPL, 0x6e0, 0x50, CCW_FLAG_SLI),
+ },
+ .restart_psw = { .mask = 0, .addr = IPL_START, },
+ .external_new_psw = { .mask = PSW_MASK_DISABLED, .addr = __LC_EXT_NEW_PSW, },
+ .svc_new_psw = { .mask = PSW_MASK_DISABLED, .addr = __LC_SVC_NEW_PSW, },
+ .program_new_psw = { .mask = PSW_MASK_DISABLED, .addr = __LC_PGM_NEW_PSW, },
+ .mcck_new_psw = { .mask = PSW_MASK_DISABLED, .addr = __LC_MCK_NEW_PSW, },
+ .io_new_psw = { .mask = PSW_MASK_DISABLED, .addr = __LC_IO_NEW_PSW, },
+};
diff --git a/arch/s390/boot/ipl_parm.c b/arch/s390/boot/ipl_parm.c
index 24ef67eb1cef..b24de9aabf7d 100644
--- a/arch/s390/boot/ipl_parm.c
+++ b/arch/s390/boot/ipl_parm.c
@@ -2,48 +2,62 @@
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/ctype.h>
+#include <linux/pgtable.h>
+#include <asm/page-states.h>
#include <asm/ebcdic.h>
#include <asm/sclp.h>
#include <asm/sections.h>
#include <asm/boot_data.h>
#include <asm/facility.h>
-#include <asm/pgtable.h>
+#include <asm/setup.h>
#include <asm/uv.h>
#include "boot.h"
+struct parmarea parmarea __section(".parmarea") = {
+ .kernel_version = (unsigned long)kernel_version,
+ .max_command_line_size = COMMAND_LINE_SIZE,
+ .command_line = "root=/dev/ram0 ro",
+};
+
char __bootdata(early_command_line)[COMMAND_LINE_SIZE];
+
+unsigned int __bootdata_preserved(zlib_dfltcc_support) = ZLIB_DFLTCC_FULL;
struct ipl_parameter_block __bootdata_preserved(ipl_block);
int __bootdata_preserved(ipl_block_valid);
+int __bootdata_preserved(__kaslr_enabled);
+int __bootdata_preserved(cmma_flag) = 1;
-unsigned long __bootdata(vmalloc_size) = VMALLOC_DEFAULT_SIZE;
-unsigned long __bootdata(memory_end);
-int __bootdata(memory_end_set);
-int __bootdata(noexec_disabled);
-
-int kaslr_enabled __section(.data);
+unsigned long vmalloc_size = VMALLOC_DEFAULT_SIZE;
+unsigned long memory_limit;
+int vmalloc_size_set;
static inline int __diag308(unsigned long subcode, void *addr)
{
- register unsigned long _addr asm("0") = (unsigned long)addr;
- register unsigned long _rc asm("1") = 0;
unsigned long reg1, reg2;
- psw_t old = S390_lowcore.program_new_psw;
+ union register_pair r1;
+ psw_t old;
+ r1.even = (unsigned long) addr;
+ r1.odd = 0;
asm volatile(
- " epsw %0,%1\n"
- " st %0,%[psw_pgm]\n"
- " st %1,%[psw_pgm]+4\n"
- " larl %0,1f\n"
- " stg %0,%[psw_pgm]+8\n"
- " diag %[addr],%[subcode],0x308\n"
- "1: nopr %%r7\n"
- : "=&d" (reg1), "=&a" (reg2),
- [psw_pgm] "=Q" (S390_lowcore.program_new_psw),
- [addr] "+d" (_addr), "+d" (_rc)
- : [subcode] "d" (subcode)
+ " mvc 0(16,%[psw_old]),0(%[psw_pgm])\n"
+ " epsw %[reg1],%[reg2]\n"
+ " st %[reg1],0(%[psw_pgm])\n"
+ " st %[reg2],4(%[psw_pgm])\n"
+ " larl %[reg1],1f\n"
+ " stg %[reg1],8(%[psw_pgm])\n"
+ " diag %[r1],%[subcode],0x308\n"
+ "1: mvc 0(16,%[psw_pgm]),0(%[psw_old])\n"
+ : [r1] "+&d" (r1.pair),
+ [reg1] "=&d" (reg1),
+ [reg2] "=&a" (reg2),
+ "+Q" (S390_lowcore.program_new_psw),
+ "=Q" (old)
+ : [subcode] "d" (subcode),
+ [psw_old] "a" (&old),
+ [psw_pgm] "a" (&S390_lowcore.program_new_psw)
: "cc", "memory");
- S390_lowcore.program_new_psw = old;
- return _rc;
+ return r1.odd;
}
void store_ipl_parmblock(void)
@@ -56,6 +70,20 @@ void store_ipl_parmblock(void)
ipl_block_valid = 1;
}
+bool is_ipl_block_dump(void)
+{
+ if (ipl_block.pb0_hdr.pbt == IPL_PBT_FCP &&
+ ipl_block.fcp.opt == IPL_PB0_FCP_OPT_DUMP)
+ return true;
+ if (ipl_block.pb0_hdr.pbt == IPL_PBT_NVME &&
+ ipl_block.nvme.opt == IPL_PB0_NVME_OPT_DUMP)
+ return true;
+ if (ipl_block.pb0_hdr.pbt == IPL_PBT_ECKD &&
+ ipl_block.eckd.opt == IPL_PB0_ECKD_OPT_DUMP)
+ return true;
+ return false;
+}
+
static size_t scpdata_length(const u8 *buf, size_t count)
{
while (count) {
@@ -69,30 +97,49 @@ static size_t scpdata_length(const u8 *buf, size_t count)
static size_t ipl_block_get_ascii_scpdata(char *dest, size_t size,
const struct ipl_parameter_block *ipb)
{
- size_t count;
- size_t i;
+ const __u8 *scp_data;
+ __u32 scp_data_len;
int has_lowercase;
+ size_t count = 0;
+ size_t i;
+
+ switch (ipb->pb0_hdr.pbt) {
+ case IPL_PBT_FCP:
+ scp_data_len = ipb->fcp.scp_data_len;
+ scp_data = ipb->fcp.scp_data;
+ break;
+ case IPL_PBT_NVME:
+ scp_data_len = ipb->nvme.scp_data_len;
+ scp_data = ipb->nvme.scp_data;
+ break;
+ case IPL_PBT_ECKD:
+ scp_data_len = ipb->eckd.scp_data_len;
+ scp_data = ipb->eckd.scp_data;
+ break;
+
+ default:
+ goto out;
+ }
- count = min(size - 1, scpdata_length(ipb->fcp.scp_data,
- ipb->fcp.scp_data_len));
+ count = min(size - 1, scpdata_length(scp_data, scp_data_len));
if (!count)
goto out;
has_lowercase = 0;
for (i = 0; i < count; i++) {
- if (!isascii(ipb->fcp.scp_data[i])) {
+ if (!isascii(scp_data[i])) {
count = 0;
goto out;
}
- if (!has_lowercase && islower(ipb->fcp.scp_data[i]))
+ if (!has_lowercase && islower(scp_data[i]))
has_lowercase = 1;
}
if (has_lowercase)
- memcpy(dest, ipb->fcp.scp_data, count);
+ memcpy(dest, scp_data, count);
else
for (i = 0; i < count; i++)
- dest[i] = tolower(ipb->fcp.scp_data[i]);
+ dest[i] = tolower(scp_data[i]);
out:
dest[count] = '\0';
return count;
@@ -114,6 +161,8 @@ static void append_ipl_block_parm(void)
parm, COMMAND_LINE_SIZE - len - 1, &ipl_block);
break;
case IPL_PBT_FCP:
+ case IPL_PBT_NVME:
+ case IPL_PBT_ECKD:
rc = ipl_block_get_ascii_scpdata(
parm, COMMAND_LINE_SIZE - len - 1, &ipl_block);
break;
@@ -138,12 +187,12 @@ static inline int has_ebcdic_char(const char *str)
void setup_boot_command_line(void)
{
- COMMAND_LINE[ARCH_COMMAND_LINE_SIZE - 1] = 0;
+ parmarea.command_line[COMMAND_LINE_SIZE - 1] = 0;
/* convert arch command line to ascii if necessary */
- if (has_ebcdic_char(COMMAND_LINE))
- EBCASC(COMMAND_LINE, ARCH_COMMAND_LINE_SIZE);
+ if (has_ebcdic_char(parmarea.command_line))
+ EBCASC(parmarea.command_line, COMMAND_LINE_SIZE);
/* copy arch command line */
- strcpy(early_command_line, strim(COMMAND_LINE));
+ strcpy(early_command_line, strim(parmarea.command_line));
/* append IPL PARM data to the boot command line */
if (!is_prot_virt_guest() && ipl_block_valid)
@@ -153,9 +202,9 @@ void setup_boot_command_line(void)
static void modify_facility(unsigned long nr, bool clear)
{
if (clear)
- __clear_facility(nr, S390_lowcore.stfle_fac_list);
+ __clear_facility(nr, stfle_fac_list);
else
- __set_facility(nr, S390_lowcore.stfle_fac_list);
+ __set_facility(nr, stfle_fac_list);
}
static void check_cleared_facilities(void)
@@ -164,7 +213,7 @@ static void check_cleared_facilities(void)
int i;
for (i = 0; i < ARRAY_SIZE(als); i++) {
- if ((S390_lowcore.stfle_fac_list[i] & als[i]) != als[i]) {
+ if ((stfle_fac_list[i] & als[i]) != als[i]) {
sclp_early_printk("Warning: The Linux kernel requires facilities cleared via command line option\n");
print_missing_facilities();
break;
@@ -208,7 +257,7 @@ static void modify_fac_list(char *str)
check_cleared_facilities();
}
-static char command_line_buf[COMMAND_LINE_SIZE] __section(.data);
+static char command_line_buf[COMMAND_LINE_SIZE];
void parse_boot_command_line(void)
{
char *param, *val;
@@ -216,44 +265,50 @@ void parse_boot_command_line(void)
char *args;
int rc;
- kaslr_enabled = IS_ENABLED(CONFIG_RANDOMIZE_BASE);
+ __kaslr_enabled = IS_ENABLED(CONFIG_RANDOMIZE_BASE);
args = strcpy(command_line_buf, early_command_line);
while (*args) {
args = next_arg(args, &param, &val);
- if (!strcmp(param, "mem") && val) {
- memory_end = round_down(memparse(val, NULL), PAGE_SIZE);
- memory_end_set = 1;
- }
+ if (!strcmp(param, "mem") && val)
+ memory_limit = round_down(memparse(val, NULL), PAGE_SIZE);
- if (!strcmp(param, "vmalloc") && val)
- vmalloc_size = round_up(memparse(val, NULL), PAGE_SIZE);
+ if (!strcmp(param, "vmalloc") && val) {
+ vmalloc_size = round_up(memparse(val, NULL), _SEGMENT_SIZE);
+ vmalloc_size_set = 1;
+ }
- if (!strcmp(param, "noexec")) {
- rc = kstrtobool(val, &enabled);
- if (!rc && !enabled)
- noexec_disabled = 1;
+ if (!strcmp(param, "dfltcc") && val) {
+ if (!strcmp(val, "off"))
+ zlib_dfltcc_support = ZLIB_DFLTCC_DISABLED;
+ else if (!strcmp(val, "on"))
+ zlib_dfltcc_support = ZLIB_DFLTCC_FULL;
+ else if (!strcmp(val, "def_only"))
+ zlib_dfltcc_support = ZLIB_DFLTCC_DEFLATE_ONLY;
+ else if (!strcmp(val, "inf_only"))
+ zlib_dfltcc_support = ZLIB_DFLTCC_INFLATE_ONLY;
+ else if (!strcmp(val, "always"))
+ zlib_dfltcc_support = ZLIB_DFLTCC_FULL_DEBUG;
}
if (!strcmp(param, "facilities") && val)
modify_fac_list(val);
if (!strcmp(param, "nokaslr"))
- kaslr_enabled = 0;
- }
-}
+ __kaslr_enabled = 0;
-void setup_memory_end(void)
-{
-#ifdef CONFIG_CRASH_DUMP
- if (OLDMEM_BASE) {
- kaslr_enabled = 0;
- } else if (ipl_block_valid &&
- ipl_block.pb0_hdr.pbt == IPL_PBT_FCP &&
- ipl_block.fcp.opt == IPL_PB0_FCP_OPT_DUMP) {
- kaslr_enabled = 0;
- if (!sclp_early_get_hsa_size(&memory_end) && memory_end)
- memory_end_set = 1;
- }
+ if (!strcmp(param, "cmma")) {
+ rc = kstrtobool(val, &enabled);
+ if (!rc && !enabled)
+ cmma_flag = 0;
+ }
+
+#if IS_ENABLED(CONFIG_KVM)
+ if (!strcmp(param, "prot_virt")) {
+ rc = kstrtobool(val, &enabled);
+ if (!rc && enabled)
+ prot_virt_host = 1;
+ }
#endif
+ }
}
diff --git a/arch/s390/boot/ipl_report.c b/arch/s390/boot/ipl_report.c
index 0b4965573656..1803035e68d2 100644
--- a/arch/s390/boot/ipl_report.c
+++ b/arch/s390/boot/ipl_report.c
@@ -5,6 +5,7 @@
#include <asm/sclp.h>
#include <asm/sections.h>
#include <asm/boot_data.h>
+#include <asm/physmem_info.h>
#include <uapi/asm/ipl.h>
#include "boot.h"
@@ -16,20 +17,16 @@ unsigned long __bootdata_preserved(ipl_cert_list_size);
unsigned long __bootdata(early_ipl_comp_list_addr);
unsigned long __bootdata(early_ipl_comp_list_size);
+static struct ipl_rb_certificates *certs;
+static struct ipl_rb_components *comps;
+static bool ipl_report_needs_saving;
+
#define for_each_rb_entry(entry, rb) \
for (entry = rb->entries; \
(void *) entry + sizeof(*entry) <= (void *) rb + rb->len; \
entry++)
-static inline bool intersects(unsigned long addr0, unsigned long size0,
- unsigned long addr1, unsigned long size1)
-{
- return addr0 + size0 > addr1 && addr1 + size1 > addr0;
-}
-
-static unsigned long find_bootdata_space(struct ipl_rb_components *comps,
- struct ipl_rb_certificates *certs,
- unsigned long safe_addr)
+static unsigned long get_cert_comp_list_size(void)
{
struct ipl_rb_certificate_entry *cert;
struct ipl_rb_component_entry *comp;
@@ -44,36 +41,27 @@ static unsigned long find_bootdata_space(struct ipl_rb_components *comps,
ipl_cert_list_size = 0;
for_each_rb_entry(cert, certs)
ipl_cert_list_size += sizeof(unsigned int) + cert->len;
- size = ipl_cert_list_size + early_ipl_comp_list_size;
+ return ipl_cert_list_size + early_ipl_comp_list_size;
+}
- /*
- * Start from safe_addr to find a free memory area large
- * enough for the IPL report boot data. This area is used
- * for ipl_cert_list_addr/ipl_cert_list_size and
- * early_ipl_comp_list_addr/early_ipl_comp_list_size. It must
- * not overlap with any component or any certificate.
- */
-repeat:
- if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && INITRD_START && INITRD_SIZE &&
- intersects(INITRD_START, INITRD_SIZE, safe_addr, size))
- safe_addr = INITRD_START + INITRD_SIZE;
- for_each_rb_entry(comp, comps)
- if (intersects(safe_addr, size, comp->addr, comp->len)) {
- safe_addr = comp->addr + comp->len;
- goto repeat;
- }
- for_each_rb_entry(cert, certs)
- if (intersects(safe_addr, size, cert->addr, cert->len)) {
- safe_addr = cert->addr + cert->len;
- goto repeat;
- }
- early_ipl_comp_list_addr = safe_addr;
- ipl_cert_list_addr = safe_addr + early_ipl_comp_list_size;
+bool ipl_report_certs_intersects(unsigned long addr, unsigned long size,
+ unsigned long *intersection_start)
+{
+ struct ipl_rb_certificate_entry *cert;
- return safe_addr + size;
+ if (!ipl_report_needs_saving)
+ return false;
+
+ for_each_rb_entry(cert, certs) {
+ if (intersects(addr, size, cert->addr, cert->len)) {
+ *intersection_start = cert->addr;
+ return true;
+ }
+ }
+ return false;
}
-static void copy_components_bootdata(struct ipl_rb_components *comps)
+static void copy_components_bootdata(void)
{
struct ipl_rb_component_entry *comp, *ptr;
@@ -82,7 +70,7 @@ static void copy_components_bootdata(struct ipl_rb_components *comps)
memcpy(ptr++, comp, sizeof(*ptr));
}
-static void copy_certificates_bootdata(struct ipl_rb_certificates *certs)
+static void copy_certificates_bootdata(void)
{
struct ipl_rb_certificate_entry *cert;
void *ptr;
@@ -96,10 +84,8 @@ static void copy_certificates_bootdata(struct ipl_rb_certificates *certs)
}
}
-unsigned long read_ipl_report(unsigned long safe_addr)
+int read_ipl_report(void)
{
- struct ipl_rb_certificates *certs;
- struct ipl_rb_components *comps;
struct ipl_pl_hdr *pl_hdr;
struct ipl_rl_hdr *rl_hdr;
struct ipl_rb_hdr *rb_hdr;
@@ -112,7 +98,7 @@ unsigned long read_ipl_report(unsigned long safe_addr)
*/
if (!ipl_block_valid ||
!(ipl_block.hdr.flags & IPL_PL_FLAG_IPLSR))
- return safe_addr;
+ return -1;
ipl_secure_flag = !!(ipl_block.hdr.flags & IPL_PL_FLAG_SIPL);
/*
* There is an IPL report, to find it load the pointer to the
@@ -150,16 +136,30 @@ unsigned long read_ipl_report(unsigned long safe_addr)
* With either the component list or the certificate list
* missing the kernel will stay ignorant of secure IPL.
*/
- if (!comps || !certs)
- return safe_addr;
+ if (!comps || !certs) {
+ certs = NULL;
+ return -1;
+ }
- /*
- * Copy component and certificate list to a safe area
- * where the decompressed kernel can find them.
- */
- safe_addr = find_bootdata_space(comps, certs, safe_addr);
- copy_components_bootdata(comps);
- copy_certificates_bootdata(certs);
+ ipl_report_needs_saving = true;
+ physmem_reserve(RR_IPLREPORT, (unsigned long)pl_hdr,
+ (unsigned long)rl_end - (unsigned long)pl_hdr);
+ return 0;
+}
+
+void save_ipl_cert_comp_list(void)
+{
+ unsigned long size;
+
+ if (!ipl_report_needs_saving)
+ return;
+
+ size = get_cert_comp_list_size();
+ early_ipl_comp_list_addr = physmem_alloc_top_down(RR_CERT_COMP_LIST, size, sizeof(int));
+ ipl_cert_list_addr = early_ipl_comp_list_addr + early_ipl_comp_list_size;
- return safe_addr;
+ copy_components_bootdata();
+ copy_certificates_bootdata();
+ physmem_free(RR_IPLREPORT);
+ ipl_report_needs_saving = false;
}
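
Note that ipl_report_certs_intersects() relies on an intersects() helper that is no longer defined in this file; it is presumably the same half-open range overlap check that the removed hunk above provided, roughly:

	static inline bool intersects(unsigned long addr0, unsigned long size0,
				      unsigned long addr1, unsigned long size1)
	{
		/* ranges [addr0, addr0 + size0) and [addr1, addr1 + size1) overlap */
		return addr0 + size0 > addr1 && addr1 + size1 > addr0;
	}
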
diff --git a/arch/s390/boot/kaslr.c b/arch/s390/boot/kaslr.c
index 5d12352545c5..90602101e2ae 100644
--- a/arch/s390/boot/kaslr.c
+++ b/arch/s390/boot/kaslr.c
@@ -2,12 +2,13 @@
/*
* Copyright IBM Corp. 2019
*/
-#include <asm/mem_detect.h>
-#include <asm/pgtable.h>
+#include <linux/pgtable.h>
+#include <asm/physmem_info.h>
#include <asm/cpacf.h>
#include <asm/timex.h>
#include <asm/sclp.h>
-#include "compressed/decompressor.h"
+#include <asm/kasan.h>
+#include "decompressor.h"
#include "boot.h"
#define PRNG_MODE_TDES 1
@@ -42,7 +43,7 @@ static int check_prng(void)
return PRNG_MODE_TDES;
}
-static unsigned long get_random(unsigned long limit)
+static int get_random(unsigned long limit, unsigned long *value)
{
struct prng_parm prng = {
/* initial parameter block for tdes mode, copied from libica */
@@ -75,7 +76,7 @@ static unsigned long get_random(unsigned long limit)
*(unsigned long *) prng.parm_block ^= seed;
for (i = 0; i < 16; i++) {
cpacf_kmc(CPACF_KMC_PRNG, prng.parm_block,
- (char *) entropy, (char *) entropy,
+ (u8 *) entropy, (u8 *) entropy,
sizeof(entropy));
memcpy(prng.parm_block, entropy, sizeof(entropy));
}
@@ -84,87 +85,114 @@ static unsigned long get_random(unsigned long limit)
(u8 *) &random, sizeof(random));
break;
default:
- random = 0;
+ return -1;
}
- return random % limit;
+ *value = random % limit;
+ return 0;
}
-unsigned long get_random_base(unsigned long safe_addr)
+static void sort_reserved_ranges(struct reserved_range *res, unsigned long size)
{
- unsigned long memory_limit = memory_end_set ? memory_end : 0;
- unsigned long base, start, end, kernel_size;
- unsigned long block_sum, offset;
- unsigned long kasan_needs;
- int i;
+ struct reserved_range tmp;
+ int i, j;
- if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && INITRD_START && INITRD_SIZE) {
- if (safe_addr < INITRD_START + INITRD_SIZE)
- safe_addr = INITRD_START + INITRD_SIZE;
+ for (i = 1; i < size; i++) {
+ tmp = res[i];
+ for (j = i - 1; j >= 0 && res[j].start > tmp.start; j--)
+ res[j + 1] = res[j];
+ res[j + 1] = tmp;
}
- safe_addr = ALIGN(safe_addr, THREAD_SIZE);
+}
- if ((IS_ENABLED(CONFIG_KASAN))) {
- /*
- * Estimate kasan memory requirements, which it will reserve
- * at the very end of available physical memory. To estimate
- * that, we take into account that kasan would require
- * 1/8 of available physical memory (for shadow memory) +
- * creating page tables for the whole memory + shadow memory
- * region (1 + 1/8). To keep page tables estimates simple take
- * the double of combined ptes size.
- */
- memory_limit = get_mem_detect_end();
- if (memory_end_set && memory_limit > memory_end)
- memory_limit = memory_end;
+static unsigned long iterate_valid_positions(unsigned long size, unsigned long align,
+ unsigned long _min, unsigned long _max,
+ struct reserved_range *res, size_t res_count,
+ bool pos_count, unsigned long find_pos)
+{
+ unsigned long start, end, tmp_end, range_pos, pos = 0;
+ struct reserved_range *res_end = res + res_count;
+ struct reserved_range *skip_res;
+ int i;
- /* for shadow memory */
- kasan_needs = memory_limit / 8;
- /* for paging structures */
- kasan_needs += (memory_limit + kasan_needs) / PAGE_SIZE /
- _PAGE_ENTRIES * _PAGE_TABLE_SIZE * 2;
- memory_limit -= kasan_needs;
- }
+ align = max(align, 8UL);
+ _min = round_up(_min, align);
+ for_each_physmem_usable_range(i, &start, &end) {
+ if (_min >= end)
+ continue;
+ start = round_up(start, align);
+ if (start >= _max)
+ break;
+ start = max(_min, start);
+ end = min(_max, end);
- kernel_size = vmlinux.image_size + vmlinux.bss_size;
- block_sum = 0;
- for_each_mem_detect_block(i, &start, &end) {
- if (memory_limit) {
- if (start >= memory_limit)
+ while (start + size <= end) {
+ /* skip reserved ranges below the start */
+ while (res && res->end <= start) {
+ res++;
+ if (res >= res_end)
+ res = NULL;
+ }
+ skip_res = NULL;
+ tmp_end = end;
+ /* has intersecting reserved range */
+ if (res && res->start < end) {
+ skip_res = res;
+ tmp_end = res->start;
+ }
+ if (start + size <= tmp_end) {
+ range_pos = (tmp_end - start - size) / align + 1;
+ if (pos_count) {
+ pos += range_pos;
+ } else {
+ if (range_pos >= find_pos)
+ return start + (find_pos - 1) * align;
+ find_pos -= range_pos;
+ }
+ }
+ if (!skip_res)
break;
- if (end > memory_limit)
- end = memory_limit;
+ start = round_up(skip_res->end, align);
}
- if (end - start < kernel_size)
- continue;
- block_sum += end - start - kernel_size;
- }
- if (!block_sum) {
- sclp_early_printk("KASLR disabled: not enough memory\n");
- return 0;
}
- base = get_random(block_sum);
- if (base == 0)
+ return pos_count ? pos : 0;
+}
+
+/*
+ * Two types of decompressor memory allocations/reserves are considered
+ * differently.
+ *
+ * "Static" or "single" allocations are done via physmem_alloc_range() and
+ * physmem_reserve(), and they are listed in physmem_info.reserved[]. Each
+ * "static" allocation type can have at most one allocation and cannot be
+ * chained.
+ *
+ * On the other hand, "dynamic" or "repetitive" allocations are done via
+ * physmem_alloc_top_down(). These allocations are tightly packed together
+ * top down from the end of online memory. physmem_alloc_pos represents
+ * the current position where those allocations start.
+ *
+ * Functions randomize_within_range() and iterate_valid_positions()
+ * only consider "dynamic" allocations by never looking above
+ * physmem_alloc_pos. "Static" allocations, however, are explicitly
+ * considered by checking the "res" (reserves) array. The first
+ * reserved_range of a "dynamic" allocation may also be checked along the
+ * way, but it will always be above the maximum value anyway.
+ */
+unsigned long randomize_within_range(unsigned long size, unsigned long align,
+ unsigned long min, unsigned long max)
+{
+ struct reserved_range res[RR_MAX];
+ unsigned long max_pos, pos;
+
+ memcpy(res, physmem_info.reserved, sizeof(res));
+ sort_reserved_ranges(res, ARRAY_SIZE(res));
+ max = min(max, get_physmem_alloc_pos());
+
+ max_pos = iterate_valid_positions(size, align, min, max, res, ARRAY_SIZE(res), true, 0);
+ if (!max_pos)
return 0;
- if (base < safe_addr)
- base = safe_addr;
- block_sum = offset = 0;
- for_each_mem_detect_block(i, &start, &end) {
- if (memory_limit) {
- if (start >= memory_limit)
- break;
- if (end > memory_limit)
- end = memory_limit;
- }
- if (end - start < kernel_size)
- continue;
- block_sum += end - start - kernel_size;
- if (base <= block_sum) {
- base = start + base - offset;
- base = ALIGN_DOWN(base, THREAD_SIZE);
- break;
- }
- offset = block_sum;
- }
- return base;
+ if (get_random(max_pos, &pos))
+ return 0;
+ return iterate_valid_positions(size, align, min, max, res, ARRAY_SIZE(res), false, pos + 1);
}
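
As a usage sketch (mirroring the caller added to startup.c further down): the first iterate_valid_positions() pass counts all aligned, non-reserved positions, a random index is drawn from that count, and the second pass converts the index back into an address:

	/* hypothetical caller; the real one lives in startup_kernel() */
	unsigned long kernel_size = vmlinux.image_size + vmlinux.bss_size;
	unsigned long lma;

	lma = randomize_within_range(kernel_size, THREAD_SIZE,
				     vmlinux.default_lma, ident_map_size);
	if (!lma)	/* no valid position or PRNG unavailable */
		lma = vmlinux.default_lma;
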
diff --git a/arch/s390/boot/mem_detect.c b/arch/s390/boot/mem_detect.c
deleted file mode 100644
index 62e7c13ce85c..000000000000
--- a/arch/s390/boot/mem_detect.c
+++ /dev/null
@@ -1,175 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <asm/sclp.h>
-#include <asm/sections.h>
-#include <asm/mem_detect.h>
-#include <asm/sparsemem.h>
-#include "compressed/decompressor.h"
-#include "boot.h"
-
-unsigned long __bootdata(max_physmem_end);
-struct mem_detect_info __bootdata(mem_detect);
-
-/* up to 256 storage elements, 1020 subincrements each */
-#define ENTRIES_EXTENDED_MAX \
- (256 * (1020 / 2) * sizeof(struct mem_detect_block))
-
-/*
- * To avoid corrupting old kernel memory during dump, find lowest memory
- * chunk possible either right after the kernel end (decompressed kernel) or
- * after initrd (if it is present and there is no hole between the kernel end
- * and initrd)
- */
-static void *mem_detect_alloc_extended(void)
-{
- unsigned long offset = ALIGN(mem_safe_offset(), sizeof(u64));
-
- if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && INITRD_START && INITRD_SIZE &&
- INITRD_START < offset + ENTRIES_EXTENDED_MAX)
- offset = ALIGN(INITRD_START + INITRD_SIZE, sizeof(u64));
-
- return (void *)offset;
-}
-
-static struct mem_detect_block *__get_mem_detect_block_ptr(u32 n)
-{
- if (n < MEM_INLINED_ENTRIES)
- return &mem_detect.entries[n];
- if (unlikely(!mem_detect.entries_extended))
- mem_detect.entries_extended = mem_detect_alloc_extended();
- return &mem_detect.entries_extended[n - MEM_INLINED_ENTRIES];
-}
-
-/*
- * sequential calls to add_mem_detect_block with adjacent memory areas
- * are merged together into single memory block.
- */
-void add_mem_detect_block(u64 start, u64 end)
-{
- struct mem_detect_block *block;
-
- if (mem_detect.count) {
- block = __get_mem_detect_block_ptr(mem_detect.count - 1);
- if (block->end == start) {
- block->end = end;
- return;
- }
- }
-
- block = __get_mem_detect_block_ptr(mem_detect.count);
- block->start = start;
- block->end = end;
- mem_detect.count++;
-}
-
-static int __diag260(unsigned long rx1, unsigned long rx2)
-{
- register unsigned long _rx1 asm("2") = rx1;
- register unsigned long _rx2 asm("3") = rx2;
- register unsigned long _ry asm("4") = 0x10; /* storage configuration */
- int rc = -1; /* fail */
- unsigned long reg1, reg2;
- psw_t old = S390_lowcore.program_new_psw;
-
- asm volatile(
- " epsw %0,%1\n"
- " st %0,%[psw_pgm]\n"
- " st %1,%[psw_pgm]+4\n"
- " larl %0,1f\n"
- " stg %0,%[psw_pgm]+8\n"
- " diag %[rx],%[ry],0x260\n"
- " ipm %[rc]\n"
- " srl %[rc],28\n"
- "1:\n"
- : "=&d" (reg1), "=&a" (reg2),
- [psw_pgm] "=Q" (S390_lowcore.program_new_psw),
- [rc] "+&d" (rc), [ry] "+d" (_ry)
- : [rx] "d" (_rx1), "d" (_rx2)
- : "cc", "memory");
- S390_lowcore.program_new_psw = old;
- return rc == 0 ? _ry : -1;
-}
-
-static int diag260(void)
-{
- int rc, i;
-
- struct {
- unsigned long start;
- unsigned long end;
- } storage_extents[8] __aligned(16); /* VM supports up to 8 extends */
-
- memset(storage_extents, 0, sizeof(storage_extents));
- rc = __diag260((unsigned long)storage_extents, sizeof(storage_extents));
- if (rc == -1)
- return -1;
-
- for (i = 0; i < min_t(int, rc, ARRAY_SIZE(storage_extents)); i++)
- add_mem_detect_block(storage_extents[i].start, storage_extents[i].end + 1);
- return 0;
-}
-
-static int tprot(unsigned long addr)
-{
- unsigned long pgm_addr;
- int rc = -EFAULT;
- psw_t old = S390_lowcore.program_new_psw;
-
- S390_lowcore.program_new_psw.mask = __extract_psw();
- asm volatile(
- " larl %[pgm_addr],1f\n"
- " stg %[pgm_addr],%[psw_pgm_addr]\n"
- " tprot 0(%[addr]),0\n"
- " ipm %[rc]\n"
- " srl %[rc],28\n"
- "1:\n"
- : [pgm_addr] "=&d"(pgm_addr),
- [psw_pgm_addr] "=Q"(S390_lowcore.program_new_psw.addr),
- [rc] "+&d"(rc)
- : [addr] "a"(addr)
- : "cc", "memory");
- S390_lowcore.program_new_psw = old;
- return rc;
-}
-
-static void search_mem_end(void)
-{
- unsigned long range = 1 << (MAX_PHYSMEM_BITS - 20); /* in 1MB blocks */
- unsigned long offset = 0;
- unsigned long pivot;
-
- while (range > 1) {
- range >>= 1;
- pivot = offset + range;
- if (!tprot(pivot << 20))
- offset = pivot;
- }
-
- add_mem_detect_block(0, (offset + 1) << 20);
-}
-
-void detect_memory(void)
-{
- sclp_early_get_memsize(&max_physmem_end);
-
- if (!sclp_early_read_storage_info()) {
- mem_detect.info_source = MEM_DETECT_SCLP_STOR_INFO;
- return;
- }
-
- if (!diag260()) {
- mem_detect.info_source = MEM_DETECT_DIAG260;
- return;
- }
-
- if (max_physmem_end) {
- add_mem_detect_block(0, max_physmem_end);
- mem_detect.info_source = MEM_DETECT_SCLP_READ_INFO;
- return;
- }
-
- search_mem_end();
- mem_detect.info_source = MEM_DETECT_BIN_SEARCH;
- max_physmem_end = get_mem_detect_end();
-}
diff --git a/arch/s390/boot/pgm_check_info.c b/arch/s390/boot/pgm_check_info.c
index 83b5b7915c32..97244cd7a206 100644
--- a/arch/s390/boot/pgm_check_info.c
+++ b/arch/s390/boot/pgm_check_info.c
@@ -1,90 +1,179 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
+#include <linux/stdarg.h>
#include <linux/string.h>
+#include <linux/ctype.h>
+#include <asm/stacktrace.h>
+#include <asm/boot_data.h>
#include <asm/lowcore.h>
+#include <asm/setup.h>
#include <asm/sclp.h>
+#include <asm/uv.h>
#include "boot.h"
const char hex_asc[] = "0123456789abcdef";
-#define add_val_as_hex(dst, val) \
- __add_val_as_hex(dst, (const unsigned char *)&val, sizeof(val))
+static char *as_hex(char *dst, unsigned long val, int pad)
+{
+ char *p, *end = p = dst + max(pad, (int)__fls(val | 1) / 4 + 1);
+
+ for (*p-- = 0; p >= dst; val >>= 4)
+ *p-- = hex_asc[val & 0x0f];
+ return end;
+}
-static char *__add_val_as_hex(char *dst, const unsigned char *src, size_t count)
+static char *symstart(char *p)
{
- while (count--)
- dst = hex_byte_pack(dst, *src++);
- return dst;
+ while (*p)
+ p--;
+ return p + 1;
}
-static char *add_str(char *dst, char *src)
+static noinline char *findsym(unsigned long ip, unsigned short *off, unsigned short *len)
{
- strcpy(dst, src);
- return dst + strlen(dst);
+	/* symbol entries are in the form "10000 c4 startup\0" */
+ char *a = _decompressor_syms_start;
+ char *b = _decompressor_syms_end;
+ unsigned long start;
+ unsigned long size;
+ char *pivot;
+ char *endp;
+
+ while (a < b) {
+ pivot = symstart(a + (b - a) / 2);
+ start = simple_strtoull(pivot, &endp, 16);
+ size = simple_strtoull(endp + 1, &endp, 16);
+ if (ip < start) {
+ b = pivot;
+ continue;
+ }
+ if (ip > start + size) {
+ a = pivot + strlen(pivot) + 1;
+ continue;
+ }
+ *off = ip - start;
+ *len = size;
+ return endp + 1;
+ }
+ return NULL;
}
-void print_pgm_check_info(void)
+static noinline char *strsym(void *ip)
{
- struct psw_bits *psw = &psw_bits(S390_lowcore.psw_save_area);
- unsigned short ilc = S390_lowcore.pgm_ilc >> 1;
- char buf[256];
- int row, col;
+ static char buf[64];
+ unsigned short off;
+ unsigned short len;
char *p;
- add_str(buf, "Linux version ");
- strlcat(buf, kernel_version, sizeof(buf));
- sclp_early_printk(buf);
+ p = findsym((unsigned long)ip, &off, &len);
+ if (p) {
+ strncpy(buf, p, sizeof(buf));
+ /* reserve 15 bytes for offset/len in symbol+0x1234/0x1234 */
+ p = buf + strnlen(buf, sizeof(buf) - 15);
+ strcpy(p, "+0x");
+ p = as_hex(p + 3, off, 0);
+ strcpy(p, "/0x");
+ as_hex(p + 3, len, 0);
+ } else {
+ as_hex(buf, (unsigned long)ip, 16);
+ }
+ return buf;
+}
- p = add_str(buf, "Kernel fault: interruption code ");
- p = add_val_as_hex(buf + strlen(buf), S390_lowcore.pgm_code);
- p = add_str(p, " ilc:");
- *p++ = hex_asc_lo(ilc);
- add_str(p, "\n");
- sclp_early_printk(buf);
+void decompressor_printk(const char *fmt, ...)
+{
+ char buf[1024] = { 0 };
+ char *end = buf + sizeof(buf) - 1; /* make sure buf is 0 terminated */
+ unsigned long pad;
+ char *p = buf;
+ va_list args;
- p = add_str(buf, "PSW : ");
- p = add_val_as_hex(p, S390_lowcore.psw_save_area.mask);
- p = add_str(p, " ");
- p = add_val_as_hex(p, S390_lowcore.psw_save_area.addr);
- add_str(p, "\n");
+ va_start(args, fmt);
+ for (; p < end && *fmt; fmt++) {
+ if (*fmt != '%') {
+ *p++ = *fmt;
+ continue;
+ }
+ pad = isdigit(*++fmt) ? simple_strtol(fmt, (char **)&fmt, 10) : 0;
+ switch (*fmt) {
+ case 's':
+ p = buf + strlcat(buf, va_arg(args, char *), sizeof(buf));
+ break;
+ case 'p':
+ if (*++fmt != 'S')
+ goto out;
+ p = buf + strlcat(buf, strsym(va_arg(args, void *)), sizeof(buf));
+ break;
+ case 'l':
+ if (*++fmt != 'x' || end - p <= max(sizeof(long) * 2, pad))
+ goto out;
+ p = as_hex(p, va_arg(args, unsigned long), pad);
+ break;
+ case 'x':
+ if (end - p <= max(sizeof(int) * 2, pad))
+ goto out;
+ p = as_hex(p, va_arg(args, unsigned int), pad);
+ break;
+ default:
+ goto out;
+ }
+ }
+out:
+ va_end(args);
sclp_early_printk(buf);
+}
- p = add_str(buf, " R:");
- *p++ = hex_asc_lo(psw->per);
- p = add_str(p, " T:");
- *p++ = hex_asc_lo(psw->dat);
- p = add_str(p, " IO:");
- *p++ = hex_asc_lo(psw->io);
- p = add_str(p, " EX:");
- *p++ = hex_asc_lo(psw->ext);
- p = add_str(p, " Key:");
- *p++ = hex_asc_lo(psw->key);
- p = add_str(p, " M:");
- *p++ = hex_asc_lo(psw->mcheck);
- p = add_str(p, " W:");
- *p++ = hex_asc_lo(psw->wait);
- p = add_str(p, " P:");
- *p++ = hex_asc_lo(psw->pstate);
- p = add_str(p, " AS:");
- *p++ = hex_asc_lo(psw->as);
- p = add_str(p, " CC:");
- *p++ = hex_asc_lo(psw->cc);
- p = add_str(p, " PM:");
- *p++ = hex_asc_lo(psw->pm);
- p = add_str(p, " RI:");
- *p++ = hex_asc_lo(psw->ri);
- p = add_str(p, " EA:");
- *p++ = hex_asc_lo(psw->eaba);
- add_str(p, "\n");
- sclp_early_printk(buf);
+void print_stacktrace(unsigned long sp)
+{
+ struct stack_info boot_stack = { STACK_TYPE_TASK, (unsigned long)_stack_start,
+ (unsigned long)_stack_end };
+ bool first = true;
- for (row = 0; row < 4; row++) {
- p = add_str(buf, row == 0 ? "GPRS:" : " ");
- for (col = 0; col < 4; col++) {
- p = add_str(p, " ");
- p = add_val_as_hex(p, S390_lowcore.gpregs_save_area[row * 4 + col]);
- }
- add_str(p, "\n");
- sclp_early_printk(buf);
+ decompressor_printk("Call Trace:\n");
+ while (!(sp & 0x7) && on_stack(&boot_stack, sp, sizeof(struct stack_frame))) {
+ struct stack_frame *sf = (struct stack_frame *)sp;
+
+ decompressor_printk(first ? "(sp:%016lx [<%016lx>] %pS)\n" :
+ " sp:%016lx [<%016lx>] %pS\n",
+ sp, sf->gprs[8], (void *)sf->gprs[8]);
+ if (sf->back_chain <= sp)
+ break;
+ sp = sf->back_chain;
+ first = false;
}
}
+
+void print_pgm_check_info(void)
+{
+ unsigned long *gpregs = (unsigned long *)S390_lowcore.gpregs_save_area;
+ struct psw_bits *psw = &psw_bits(S390_lowcore.psw_save_area);
+
+ decompressor_printk("Linux version %s\n", kernel_version);
+ if (!is_prot_virt_guest() && early_command_line[0])
+ decompressor_printk("Kernel command line: %s\n", early_command_line);
+ decompressor_printk("Kernel fault: interruption code %04x ilc:%x\n",
+ S390_lowcore.pgm_code, S390_lowcore.pgm_ilc >> 1);
+ if (kaslr_enabled())
+ decompressor_printk("Kernel random base: %lx\n", __kaslr_offset);
+ decompressor_printk("PSW : %016lx %016lx (%pS)\n",
+ S390_lowcore.psw_save_area.mask,
+ S390_lowcore.psw_save_area.addr,
+ (void *)S390_lowcore.psw_save_area.addr);
+ decompressor_printk(
+ " R:%x T:%x IO:%x EX:%x Key:%x M:%x W:%x P:%x AS:%x CC:%x PM:%x RI:%x EA:%x\n",
+ psw->per, psw->dat, psw->io, psw->ext, psw->key, psw->mcheck,
+ psw->wait, psw->pstate, psw->as, psw->cc, psw->pm, psw->ri,
+ psw->eaba);
+ decompressor_printk("GPRS: %016lx %016lx %016lx %016lx\n",
+ gpregs[0], gpregs[1], gpregs[2], gpregs[3]);
+ decompressor_printk(" %016lx %016lx %016lx %016lx\n",
+ gpregs[4], gpregs[5], gpregs[6], gpregs[7]);
+ decompressor_printk(" %016lx %016lx %016lx %016lx\n",
+ gpregs[8], gpregs[9], gpregs[10], gpregs[11]);
+ decompressor_printk(" %016lx %016lx %016lx %016lx\n",
+ gpregs[12], gpregs[13], gpregs[14], gpregs[15]);
+ print_stacktrace(S390_lowcore.gpregs_save_area[15]);
+ decompressor_printk("Last Breaking-Event-Address:\n");
+ decompressor_printk(" [<%016lx>] %pS\n", (unsigned long)S390_lowcore.pgm_last_break,
+ (void *)S390_lowcore.pgm_last_break);
+}
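
decompressor_printk() above intentionally supports only %s, %pS, %lx and %x (the latter two with an optional field width), so callers are limited to lines such as the ones print_pgm_check_info() itself emits:

	decompressor_printk("Kernel fault: interruption code %04x ilc:%x\n",
			    S390_lowcore.pgm_code, S390_lowcore.pgm_ilc >> 1);
	decompressor_printk("PSW : %016lx %016lx (%pS)\n",
			    S390_lowcore.psw_save_area.mask,
			    S390_lowcore.psw_save_area.addr,
			    (void *)S390_lowcore.psw_save_area.addr);
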
diff --git a/arch/s390/boot/physmem_info.c b/arch/s390/boot/physmem_info.c
new file mode 100644
index 000000000000..0cf79826eef9
--- /dev/null
+++ b/arch/s390/boot/physmem_info.c
@@ -0,0 +1,328 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/processor.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <asm/physmem_info.h>
+#include <asm/stacktrace.h>
+#include <asm/boot_data.h>
+#include <asm/sparsemem.h>
+#include <asm/sections.h>
+#include <asm/setup.h>
+#include <asm/sclp.h>
+#include <asm/uv.h>
+#include "decompressor.h"
+#include "boot.h"
+
+struct physmem_info __bootdata(physmem_info);
+static unsigned int physmem_alloc_ranges;
+static unsigned long physmem_alloc_pos;
+
+/* up to 256 storage elements, 1020 subincrements each */
+#define ENTRIES_EXTENDED_MAX \
+ (256 * (1020 / 2) * sizeof(struct physmem_range))
+
+static struct physmem_range *__get_physmem_range_ptr(u32 n)
+{
+ if (n < MEM_INLINED_ENTRIES)
+ return &physmem_info.online[n];
+ if (unlikely(!physmem_info.online_extended)) {
+ physmem_info.online_extended = (struct physmem_range *)physmem_alloc_range(
+ RR_MEM_DETECT_EXTENDED, ENTRIES_EXTENDED_MAX, sizeof(long), 0,
+ physmem_alloc_pos, true);
+ }
+ return &physmem_info.online_extended[n - MEM_INLINED_ENTRIES];
+}
+
+/*
+ * Sequential calls to add_physmem_online_range() with adjacent memory ranges
+ * are merged together into a single memory range.
+ */
+void add_physmem_online_range(u64 start, u64 end)
+{
+ struct physmem_range *range;
+
+ if (physmem_info.range_count) {
+ range = __get_physmem_range_ptr(physmem_info.range_count - 1);
+ if (range->end == start) {
+ range->end = end;
+ return;
+ }
+ }
+
+ range = __get_physmem_range_ptr(physmem_info.range_count);
+ range->start = start;
+ range->end = end;
+ physmem_info.range_count++;
+}
+
+static int __diag260(unsigned long rx1, unsigned long rx2)
+{
+ unsigned long reg1, reg2, ry;
+ union register_pair rx;
+ psw_t old;
+ int rc;
+
+ rx.even = rx1;
+ rx.odd = rx2;
+ ry = 0x10; /* storage configuration */
+ rc = -1; /* fail */
+ asm volatile(
+ " mvc 0(16,%[psw_old]),0(%[psw_pgm])\n"
+ " epsw %[reg1],%[reg2]\n"
+ " st %[reg1],0(%[psw_pgm])\n"
+ " st %[reg2],4(%[psw_pgm])\n"
+ " larl %[reg1],1f\n"
+ " stg %[reg1],8(%[psw_pgm])\n"
+ " diag %[rx],%[ry],0x260\n"
+ " ipm %[rc]\n"
+ " srl %[rc],28\n"
+ "1: mvc 0(16,%[psw_pgm]),0(%[psw_old])\n"
+ : [reg1] "=&d" (reg1),
+ [reg2] "=&a" (reg2),
+ [rc] "+&d" (rc),
+ [ry] "+&d" (ry),
+ "+Q" (S390_lowcore.program_new_psw),
+ "=Q" (old)
+ : [rx] "d" (rx.pair),
+ [psw_old] "a" (&old),
+ [psw_pgm] "a" (&S390_lowcore.program_new_psw)
+ : "cc", "memory");
+ return rc == 0 ? ry : -1;
+}
+
+static int diag260(void)
+{
+ int rc, i;
+
+ struct {
+ unsigned long start;
+ unsigned long end;
+	} storage_extents[8] __aligned(16); /* VM supports up to 8 extents */
+
+ memset(storage_extents, 0, sizeof(storage_extents));
+ rc = __diag260((unsigned long)storage_extents, sizeof(storage_extents));
+ if (rc == -1)
+ return -1;
+
+ for (i = 0; i < min_t(int, rc, ARRAY_SIZE(storage_extents)); i++)
+ add_physmem_online_range(storage_extents[i].start, storage_extents[i].end + 1);
+ return 0;
+}
+
+static int tprot(unsigned long addr)
+{
+ unsigned long reg1, reg2;
+ int rc = -EFAULT;
+ psw_t old;
+
+ asm volatile(
+ " mvc 0(16,%[psw_old]),0(%[psw_pgm])\n"
+ " epsw %[reg1],%[reg2]\n"
+ " st %[reg1],0(%[psw_pgm])\n"
+ " st %[reg2],4(%[psw_pgm])\n"
+ " larl %[reg1],1f\n"
+ " stg %[reg1],8(%[psw_pgm])\n"
+ " tprot 0(%[addr]),0\n"
+ " ipm %[rc]\n"
+ " srl %[rc],28\n"
+ "1: mvc 0(16,%[psw_pgm]),0(%[psw_old])\n"
+ : [reg1] "=&d" (reg1),
+ [reg2] "=&a" (reg2),
+ [rc] "+&d" (rc),
+ "=Q" (S390_lowcore.program_new_psw.addr),
+ "=Q" (old)
+ : [psw_old] "a" (&old),
+ [psw_pgm] "a" (&S390_lowcore.program_new_psw),
+ [addr] "a" (addr)
+ : "cc", "memory");
+ return rc;
+}
+
+static unsigned long search_mem_end(void)
+{
+ unsigned long range = 1 << (MAX_PHYSMEM_BITS - 20); /* in 1MB blocks */
+ unsigned long offset = 0;
+ unsigned long pivot;
+
+ while (range > 1) {
+ range >>= 1;
+ pivot = offset + range;
+ if (!tprot(pivot << 20))
+ offset = pivot;
+ }
+ return (offset + 1) << 20;
+}
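+
+/*
+ * A short worked example of the probe above, assuming the default
+ * MAX_PHYSMEM_BITS of 46: the search starts with range = 1 << 26
+ * one-megabyte blocks and halves it every iteration, so after 26 tprot
+ * probes "offset" is the index of the highest accessible 1 MB block and
+ * (offset + 1) << 20 is returned as the detected memory end.
+ */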
+
+unsigned long detect_max_physmem_end(void)
+{
+ unsigned long max_physmem_end = 0;
+
+ if (!sclp_early_get_memsize(&max_physmem_end)) {
+ physmem_info.info_source = MEM_DETECT_SCLP_READ_INFO;
+ } else {
+ max_physmem_end = search_mem_end();
+ physmem_info.info_source = MEM_DETECT_BIN_SEARCH;
+ }
+ return max_physmem_end;
+}
+
+void detect_physmem_online_ranges(unsigned long max_physmem_end)
+{
+ if (!sclp_early_read_storage_info()) {
+ physmem_info.info_source = MEM_DETECT_SCLP_STOR_INFO;
+ } else if (!diag260()) {
+ physmem_info.info_source = MEM_DETECT_DIAG260;
+ } else if (max_physmem_end) {
+ add_physmem_online_range(0, max_physmem_end);
+ }
+}
+
+void physmem_set_usable_limit(unsigned long limit)
+{
+ physmem_info.usable = limit;
+ physmem_alloc_pos = limit;
+}
+
+static void die_oom(unsigned long size, unsigned long align, unsigned long min, unsigned long max)
+{
+ unsigned long start, end, total_mem = 0, total_reserved_mem = 0;
+ struct reserved_range *range;
+ enum reserved_range_type t;
+ int i;
+
+ decompressor_printk("Linux version %s\n", kernel_version);
+ if (!is_prot_virt_guest() && early_command_line[0])
+ decompressor_printk("Kernel command line: %s\n", early_command_line);
+ decompressor_printk("Out of memory allocating %lx bytes %lx aligned in range %lx:%lx\n",
+ size, align, min, max);
+ decompressor_printk("Reserved memory ranges:\n");
+ for_each_physmem_reserved_range(t, range, &start, &end) {
+ decompressor_printk("%016lx %016lx %s\n", start, end, get_rr_type_name(t));
+ total_reserved_mem += end - start;
+ }
+ decompressor_printk("Usable online memory ranges (info source: %s [%x]):\n",
+ get_physmem_info_source(), physmem_info.info_source);
+ for_each_physmem_usable_range(i, &start, &end) {
+ decompressor_printk("%016lx %016lx\n", start, end);
+ total_mem += end - start;
+ }
+ decompressor_printk("Usable online memory total: %lx Reserved: %lx Free: %lx\n",
+ total_mem, total_reserved_mem,
+ total_mem > total_reserved_mem ? total_mem - total_reserved_mem : 0);
+ print_stacktrace(current_frame_address());
+ sclp_early_printk("\n\n -- System halted\n");
+ disabled_wait();
+}
+
+void physmem_reserve(enum reserved_range_type type, unsigned long addr, unsigned long size)
+{
+ physmem_info.reserved[type].start = addr;
+ physmem_info.reserved[type].end = addr + size;
+}
+
+void physmem_free(enum reserved_range_type type)
+{
+ physmem_info.reserved[type].start = 0;
+ physmem_info.reserved[type].end = 0;
+}
+
+static bool __physmem_alloc_intersects(unsigned long addr, unsigned long size,
+ unsigned long *intersection_start)
+{
+ unsigned long res_addr, res_size;
+ int t;
+
+ for (t = 0; t < RR_MAX; t++) {
+ if (!get_physmem_reserved(t, &res_addr, &res_size))
+ continue;
+ if (intersects(addr, size, res_addr, res_size)) {
+ *intersection_start = res_addr;
+ return true;
+ }
+ }
+ return ipl_report_certs_intersects(addr, size, intersection_start);
+}
+
+static unsigned long __physmem_alloc_range(unsigned long size, unsigned long align,
+ unsigned long min, unsigned long max,
+ unsigned int from_ranges, unsigned int *ranges_left,
+ bool die_on_oom)
+{
+ unsigned int nranges = from_ranges ?: physmem_info.range_count;
+ unsigned long range_start, range_end;
+ unsigned long intersection_start;
+ unsigned long addr, pos = max;
+
+ align = max(align, 8UL);
+ while (nranges) {
+ __get_physmem_range(nranges - 1, &range_start, &range_end, false);
+ pos = min(range_end, pos);
+
+ if (round_up(min, align) + size > pos)
+ break;
+ addr = round_down(pos - size, align);
+ if (range_start > addr) {
+ nranges--;
+ continue;
+ }
+ if (__physmem_alloc_intersects(addr, size, &intersection_start)) {
+ pos = intersection_start;
+ continue;
+ }
+
+ if (ranges_left)
+ *ranges_left = nranges;
+ return addr;
+ }
+ if (die_on_oom)
+ die_oom(size, align, min, max);
+ return 0;
+}
+
+unsigned long physmem_alloc_range(enum reserved_range_type type, unsigned long size,
+ unsigned long align, unsigned long min, unsigned long max,
+ bool die_on_oom)
+{
+ unsigned long addr;
+
+ max = min(max, physmem_alloc_pos);
+ addr = __physmem_alloc_range(size, align, min, max, 0, NULL, die_on_oom);
+ if (addr)
+ physmem_reserve(type, addr, size);
+ return addr;
+}
+
+unsigned long physmem_alloc_top_down(enum reserved_range_type type, unsigned long size,
+ unsigned long align)
+{
+ struct reserved_range *range = &physmem_info.reserved[type];
+ struct reserved_range *new_range;
+ unsigned int ranges_left;
+ unsigned long addr;
+
+ addr = __physmem_alloc_range(size, align, 0, physmem_alloc_pos, physmem_alloc_ranges,
+ &ranges_left, true);
+	/* not a consecutive allocation of the same type, or the first allocation */
+ if (range->start != addr + size) {
+ if (range->end) {
+ physmem_alloc_pos = __physmem_alloc_range(
+ sizeof(struct reserved_range), 0, 0, physmem_alloc_pos,
+ physmem_alloc_ranges, &ranges_left, true);
+ new_range = (struct reserved_range *)physmem_alloc_pos;
+ *new_range = *range;
+ range->chain = new_range;
+ addr = __physmem_alloc_range(size, align, 0, physmem_alloc_pos,
+ ranges_left, &ranges_left, true);
+ }
+ range->end = addr + size;
+ }
+ range->start = addr;
+ physmem_alloc_pos = addr;
+ physmem_alloc_ranges = ranges_left;
+ return addr;
+}
+
+unsigned long get_physmem_alloc_pos(void)
+{
+ return physmem_alloc_pos;
+}
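
A brief usage sketch of the allocator introduced here, assembled from calls that appear elsewhere in this series (error handling is implicit: die_on_oom == true ends up in die_oom()):

	/* one-off reservation, tracked in physmem_info.reserved[RR_DECOMPRESSOR] */
	physmem_reserve(RR_DECOMPRESSOR, 0, mem_safe_offset());

	/* one-off allocation below a given limit */
	physmem_info.online_extended = (struct physmem_range *)physmem_alloc_range(
		RR_MEM_DETECT_EXTENDED, ENTRIES_EXTENDED_MAX, sizeof(long), 0,
		get_physmem_alloc_pos(), true);

	/* repetitive allocations, tightly packed top down from physmem_alloc_pos */
	early_ipl_comp_list_addr = physmem_alloc_top_down(RR_CERT_COMP_LIST, size, sizeof(int));
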
diff --git a/arch/s390/boot/sclp_early_core.c b/arch/s390/boot/sclp_early_core.c
index 5a19fd7020b5..6f30646afbd0 100644
--- a/arch/s390/boot/sclp_early_core.c
+++ b/arch/s390/boot/sclp_early_core.c
@@ -1,2 +1,11 @@
// SPDX-License-Identifier: GPL-2.0
+#include "boot.h"
#include "../../../drivers/s390/char/sclp_early_core.c"
+
+/* SCLP early buffer must stay page-aligned and below 2GB */
+static char __sclp_early_sccb[EXT_SCCB_READ_SCP] __aligned(PAGE_SIZE);
+
+void sclp_early_setup_buffer(void)
+{
+ sclp_early_set_buffer(&__sclp_early_sccb);
+}
diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c
index 3b3a11f95269..9cc76e631759 100644
--- a/arch/s390/boot/startup.c
+++ b/arch/s390/boot/startup.c
@@ -1,55 +1,41 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/string.h>
#include <linux/elf.h>
+#include <asm/page-states.h>
+#include <asm/boot_data.h>
#include <asm/sections.h>
+#include <asm/maccess.h>
+#include <asm/cpu_mf.h>
#include <asm/setup.h>
+#include <asm/kasan.h>
#include <asm/kexec.h>
#include <asm/sclp.h>
#include <asm/diag.h>
#include <asm/uv.h>
-#include "compressed/decompressor.h"
+#include <asm/abs_lowcore.h>
+#include <asm/physmem_info.h>
+#include "decompressor.h"
#include "boot.h"
+#include "uv.h"
-extern char __boot_data_start[], __boot_data_end[];
-extern char __boot_data_preserved_start[], __boot_data_preserved_end[];
unsigned long __bootdata_preserved(__kaslr_offset);
+unsigned long __bootdata_preserved(__abs_lowcore);
+unsigned long __bootdata_preserved(__memcpy_real_area);
+pte_t *__bootdata_preserved(memcpy_real_ptep);
+unsigned long __bootdata_preserved(VMALLOC_START);
+unsigned long __bootdata_preserved(VMALLOC_END);
+struct page *__bootdata_preserved(vmemmap);
+unsigned long __bootdata_preserved(vmemmap_size);
+unsigned long __bootdata_preserved(MODULES_VADDR);
+unsigned long __bootdata_preserved(MODULES_END);
+unsigned long __bootdata_preserved(max_mappable);
+unsigned long __bootdata(ident_map_size);
-/*
- * Some code and data needs to stay below 2 GB, even when the kernel would be
- * relocated above 2 GB, because it has to use 31 bit addresses.
- * Such code and data is part of the .dma section, and its location is passed
- * over to the decompressed / relocated kernel via the .boot.preserved.data
- * section.
- */
-extern char _sdma[], _edma[];
-extern char _stext_dma[], _etext_dma[];
-extern struct exception_table_entry _start_dma_ex_table[];
-extern struct exception_table_entry _stop_dma_ex_table[];
-unsigned long __bootdata_preserved(__sdma) = __pa(&_sdma);
-unsigned long __bootdata_preserved(__edma) = __pa(&_edma);
-unsigned long __bootdata_preserved(__stext_dma) = __pa(&_stext_dma);
-unsigned long __bootdata_preserved(__etext_dma) = __pa(&_etext_dma);
-struct exception_table_entry *
- __bootdata_preserved(__start_dma_ex_table) = _start_dma_ex_table;
-struct exception_table_entry *
- __bootdata_preserved(__stop_dma_ex_table) = _stop_dma_ex_table;
-
-int _diag210_dma(struct diag210 *addr);
-int _diag26c_dma(void *req, void *resp, enum diag26c_sc subcode);
-int _diag14_dma(unsigned long rx, unsigned long ry1, unsigned long subcode);
-void _diag0c_dma(struct hypfs_diag0c_entry *entry);
-void _diag308_reset_dma(void);
-struct diag_ops __bootdata_preserved(diag_dma_ops) = {
- .diag210 = _diag210_dma,
- .diag26c = _diag26c_dma,
- .diag14 = _diag14_dma,
- .diag0c = _diag0c_dma,
- .diag308_reset = _diag308_reset_dma
-};
-static struct diag210 _diag210_tmp_dma __section(.dma.data);
-struct diag210 *__bootdata_preserved(__diag210_tmp_dma) = &_diag210_tmp_dma;
-void _swsusp_reset_dma(void);
-unsigned long __bootdata_preserved(__swsusp_reset_dma) = __pa(_swsusp_reset_dma);
+u64 __bootdata_preserved(stfle_fac_list[16]);
+u64 __bootdata_preserved(alt_stfle_fac_list[16]);
+struct oldmem_data __bootdata_preserved(oldmem_data);
+
+struct machine_info machine;
void error(char *x)
{
@@ -60,6 +46,68 @@ void error(char *x)
disabled_wait();
}
+static void detect_facilities(void)
+{
+ if (test_facility(8)) {
+ machine.has_edat1 = 1;
+ local_ctl_set_bit(0, CR0_EDAT_BIT);
+ }
+ if (test_facility(78))
+ machine.has_edat2 = 1;
+ if (test_facility(130))
+ machine.has_nx = 1;
+}
+
+static int cmma_test_essa(void)
+{
+ unsigned long reg1, reg2, tmp = 0;
+ int rc = 1;
+ psw_t old;
+
+ /* Test ESSA_GET_STATE */
+ asm volatile(
+ " mvc 0(16,%[psw_old]),0(%[psw_pgm])\n"
+ " epsw %[reg1],%[reg2]\n"
+ " st %[reg1],0(%[psw_pgm])\n"
+ " st %[reg2],4(%[psw_pgm])\n"
+ " larl %[reg1],1f\n"
+ " stg %[reg1],8(%[psw_pgm])\n"
+ " .insn rrf,0xb9ab0000,%[tmp],%[tmp],%[cmd],0\n"
+ " la %[rc],0\n"
+ "1: mvc 0(16,%[psw_pgm]),0(%[psw_old])\n"
+ : [reg1] "=&d" (reg1),
+ [reg2] "=&a" (reg2),
+ [rc] "+&d" (rc),
+ [tmp] "=&d" (tmp),
+ "+Q" (S390_lowcore.program_new_psw),
+ "=Q" (old)
+ : [psw_old] "a" (&old),
+ [psw_pgm] "a" (&S390_lowcore.program_new_psw),
+ [cmd] "i" (ESSA_GET_STATE)
+ : "cc", "memory");
+ return rc;
+}
+
+static void cmma_init(void)
+{
+ if (!cmma_flag)
+ return;
+ if (cmma_test_essa()) {
+ cmma_flag = 0;
+ return;
+ }
+ if (test_facility(147))
+ cmma_flag = 2;
+}
+
+static void setup_lpp(void)
+{
+ S390_lowcore.current_pid = 0;
+ S390_lowcore.lpp = LPP_MAGIC;
+ if (test_facility(40))
+ lpp(&S390_lowcore.lpp);
+}
+
#ifdef CONFIG_KERNEL_UNCOMPRESSED
unsigned long mem_safe_offset(void)
{
@@ -67,16 +115,20 @@ unsigned long mem_safe_offset(void)
}
#endif
-static void rescue_initrd(unsigned long addr)
+static void rescue_initrd(unsigned long min, unsigned long max)
{
+ unsigned long old_addr, addr, size;
+
if (!IS_ENABLED(CONFIG_BLK_DEV_INITRD))
return;
- if (!INITRD_START || !INITRD_SIZE)
+ if (!get_physmem_reserved(RR_INITRD, &addr, &size))
return;
- if (addr <= INITRD_START)
+ if (addr >= min && addr + size <= max)
return;
- memmove((void *)addr, (void *)INITRD_START, INITRD_SIZE);
- INITRD_START = addr;
+ old_addr = addr;
+ physmem_free(RR_INITRD);
+ addr = physmem_alloc_top_down(RR_INITRD, size, 0);
+ memmove((void *)addr, (void *)old_addr, size);
}
static void copy_bootdata(void)
@@ -120,64 +172,252 @@ static void handle_relocs(unsigned long offset)
}
}
-static void clear_bss_section(void)
+/*
+ * Merge information from several sources into a single ident_map_size value.
+ * "ident_map_size" represents the upper limit of physical memory we may ever
+ * reach. It may cover not only online memory but also standby (offline)
+ * memory. "ident_map_size" can be lower than the standby or even online
+ * memory actually present, due to limiting factors. We should never go above this limit.
+ * It is the size of our identity mapping.
+ *
+ * Consider the following factors:
+ * 1. max_physmem_end - end of physical memory online or standby.
+ * Always >= end of the last online memory range (get_physmem_online_end()).
+ * 2. CONFIG_MAX_PHYSMEM_BITS - the maximum size of physical memory the
+ * kernel is able to support.
+ * 3. "mem=" kernel command line option which limits physical memory usage.
+ * 4. OLDMEM_BASE, which is the kdump memory limit when the kernel is executed
+ * as a crash kernel.
+ * 5. The "hsa" size, which is a memory limit when the kernel is executed
+ * during a zfcp/nvme dump.
+ */
+static void setup_ident_map_size(unsigned long max_physmem_end)
+{
+ unsigned long hsa_size;
+
+ ident_map_size = max_physmem_end;
+ if (memory_limit)
+ ident_map_size = min(ident_map_size, memory_limit);
+ ident_map_size = min(ident_map_size, 1UL << MAX_PHYSMEM_BITS);
+
+#ifdef CONFIG_CRASH_DUMP
+ if (oldmem_data.start) {
+ __kaslr_enabled = 0;
+ ident_map_size = min(ident_map_size, oldmem_data.size);
+ } else if (ipl_block_valid && is_ipl_block_dump()) {
+ __kaslr_enabled = 0;
+ if (!sclp_early_get_hsa_size(&hsa_size) && hsa_size)
+ ident_map_size = min(ident_map_size, hsa_size);
+ }
+#endif
+}
+
+static unsigned long setup_kernel_memory_layout(void)
+{
+ unsigned long vmemmap_start;
+ unsigned long asce_limit;
+ unsigned long rte_size;
+ unsigned long pages;
+ unsigned long vsize;
+ unsigned long vmax;
+
+ pages = ident_map_size / PAGE_SIZE;
+ /* vmemmap contains a multiple of PAGES_PER_SECTION struct pages */
+ vmemmap_size = SECTION_ALIGN_UP(pages) * sizeof(struct page);
+
+ /* choose kernel address space layout: 4 or 3 levels. */
+ vsize = round_up(ident_map_size, _REGION3_SIZE) + vmemmap_size +
+ MODULES_LEN + MEMCPY_REAL_SIZE + ABS_LOWCORE_MAP_SIZE;
+ vsize = size_add(vsize, vmalloc_size);
+ if (IS_ENABLED(CONFIG_KASAN) || (vsize > _REGION2_SIZE)) {
+ asce_limit = _REGION1_SIZE;
+ rte_size = _REGION2_SIZE;
+ } else {
+ asce_limit = _REGION2_SIZE;
+ rte_size = _REGION3_SIZE;
+ }
+
+ /*
+	 * Force modules and the vmalloc area below the ultravisor
+	 * secure storage limit, so that any vmalloc allocation
+	 * we do can be used to back secure guest storage.
+ */
+ vmax = adjust_to_uv_max(asce_limit);
+#ifdef CONFIG_KASAN
+ /* force vmalloc and modules below kasan shadow */
+ vmax = min(vmax, KASAN_SHADOW_START);
+#endif
+ __memcpy_real_area = round_down(vmax - MEMCPY_REAL_SIZE, PAGE_SIZE);
+ __abs_lowcore = round_down(__memcpy_real_area - ABS_LOWCORE_MAP_SIZE,
+ sizeof(struct lowcore));
+ MODULES_END = round_down(__abs_lowcore, _SEGMENT_SIZE);
+ MODULES_VADDR = MODULES_END - MODULES_LEN;
+ VMALLOC_END = MODULES_VADDR;
+
+	/* allow vmalloc area to occupy up to about half of the remaining virtual space */
+ vsize = round_down(VMALLOC_END / 2, _SEGMENT_SIZE);
+ vmalloc_size = min(vmalloc_size, vsize);
+ VMALLOC_START = VMALLOC_END - vmalloc_size;
+
+ /* split remaining virtual space between 1:1 mapping & vmemmap array */
+ pages = VMALLOC_START / (PAGE_SIZE + sizeof(struct page));
+ pages = SECTION_ALIGN_UP(pages);
+ /* keep vmemmap_start aligned to a top level region table entry */
+ vmemmap_start = round_down(VMALLOC_START - pages * sizeof(struct page), rte_size);
+ vmemmap_start = min(vmemmap_start, 1UL << MAX_PHYSMEM_BITS);
+ /* maximum mappable address as seen by arch_get_mappable_range() */
+ max_mappable = vmemmap_start;
+	/* make sure identity map doesn't overlap with vmemmap */
+ ident_map_size = min(ident_map_size, vmemmap_start);
+ vmemmap_size = SECTION_ALIGN_UP(ident_map_size / PAGE_SIZE) * sizeof(struct page);
+	/* make sure vmemmap doesn't overlap with vmalloc area */
+ VMALLOC_START = max(vmemmap_start + vmemmap_size, VMALLOC_START);
+ vmemmap = (struct page *)vmemmap_start;
+
+ return asce_limit;
+}
+
+/*
+ * This function clears the BSS section of the decompressed Linux kernel,
+ * NOT the decompressor's.
+ */
+static void clear_bss_section(unsigned long vmlinux_lma)
+{
+ memset((void *)vmlinux_lma + vmlinux.image_size, 0, vmlinux.bss_size);
+}
+
+/*
+ * Set the vmalloc area size to an eighth of (potential) physical memory
+ * size, unless it has been set via a kernel command line parameter.
+ */
+static void setup_vmalloc_size(void)
+{
+ unsigned long size;
+
+ if (vmalloc_size_set)
+ return;
+ size = round_up(ident_map_size / 8, _SEGMENT_SIZE);
+ vmalloc_size = max(size, vmalloc_size);
+}
+
+static void offset_vmlinux_info(unsigned long offset)
{
- memset((void *)vmlinux.default_lma + vmlinux.image_size, 0, vmlinux.bss_size);
+ *(unsigned long *)(&vmlinux.entry) += offset;
+ vmlinux.bootdata_off += offset;
+ vmlinux.bootdata_preserved_off += offset;
+ vmlinux.rela_dyn_start += offset;
+ vmlinux.rela_dyn_end += offset;
+ vmlinux.dynsym_start += offset;
+ vmlinux.init_mm_off += offset;
+ vmlinux.swapper_pg_dir_off += offset;
+ vmlinux.invalid_pg_dir_off += offset;
+#ifdef CONFIG_KASAN
+ vmlinux.kasan_early_shadow_page_off += offset;
+ vmlinux.kasan_early_shadow_pte_off += offset;
+ vmlinux.kasan_early_shadow_pmd_off += offset;
+ vmlinux.kasan_early_shadow_pud_off += offset;
+ vmlinux.kasan_early_shadow_p4d_off += offset;
+#endif
}
void startup_kernel(void)
{
- unsigned long random_lma;
+ unsigned long max_physmem_end;
+ unsigned long vmlinux_lma = 0;
+ unsigned long amode31_lma = 0;
+ unsigned long asce_limit;
unsigned long safe_addr;
void *img;
+ psw_t psw;
- store_ipl_parmblock();
+ setup_lpp();
safe_addr = mem_safe_offset();
- safe_addr = read_ipl_report(safe_addr);
+
+ /*
+	 * Reserve decompressor memory together with the decompression heap, buffer
+	 * and memory which might be occupied by the uncompressed kernel at the
+	 * default 1 MB position (if KASLR is off or has failed).
+ */
+ physmem_reserve(RR_DECOMPRESSOR, 0, safe_addr);
+ if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && parmarea.initrd_size)
+ physmem_reserve(RR_INITRD, parmarea.initrd_start, parmarea.initrd_size);
+ oldmem_data.start = parmarea.oldmem_base;
+ oldmem_data.size = parmarea.oldmem_size;
+
+ store_ipl_parmblock();
+ read_ipl_report();
uv_query_info();
- rescue_initrd(safe_addr);
sclp_early_read_info();
setup_boot_command_line();
parse_boot_command_line();
- setup_memory_end();
- detect_memory();
-
- random_lma = __kaslr_offset = 0;
- if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && kaslr_enabled) {
- random_lma = get_random_base(safe_addr);
- if (random_lma) {
- __kaslr_offset = random_lma - vmlinux.default_lma;
- img = (void *)vmlinux.default_lma;
- vmlinux.default_lma += __kaslr_offset;
- vmlinux.entry += __kaslr_offset;
- vmlinux.bootdata_off += __kaslr_offset;
- vmlinux.bootdata_preserved_off += __kaslr_offset;
- vmlinux.rela_dyn_start += __kaslr_offset;
- vmlinux.rela_dyn_end += __kaslr_offset;
- vmlinux.dynsym_start += __kaslr_offset;
+ detect_facilities();
+ cmma_init();
+ sanitize_prot_virt_host();
+ max_physmem_end = detect_max_physmem_end();
+ setup_ident_map_size(max_physmem_end);
+ setup_vmalloc_size();
+ asce_limit = setup_kernel_memory_layout();
+	/* got final ident_map_size, physmem allocations can be performed now */
+ physmem_set_usable_limit(ident_map_size);
+ detect_physmem_online_ranges(max_physmem_end);
+ save_ipl_cert_comp_list();
+ rescue_initrd(safe_addr, ident_map_size);
+
+ if (kaslr_enabled()) {
+ vmlinux_lma = randomize_within_range(vmlinux.image_size + vmlinux.bss_size,
+ THREAD_SIZE, vmlinux.default_lma,
+ ident_map_size);
+ if (vmlinux_lma) {
+ __kaslr_offset = vmlinux_lma - vmlinux.default_lma;
+ offset_vmlinux_info(__kaslr_offset);
}
}
+ vmlinux_lma = vmlinux_lma ?: vmlinux.default_lma;
+ physmem_reserve(RR_VMLINUX, vmlinux_lma, vmlinux.image_size + vmlinux.bss_size);
if (!IS_ENABLED(CONFIG_KERNEL_UNCOMPRESSED)) {
img = decompress_kernel();
- memmove((void *)vmlinux.default_lma, img, vmlinux.image_size);
- } else if (__kaslr_offset)
- memcpy((void *)vmlinux.default_lma, img, vmlinux.image_size);
+ memmove((void *)vmlinux_lma, img, vmlinux.image_size);
+ } else if (__kaslr_offset) {
+ img = (void *)vmlinux.default_lma;
+ memmove((void *)vmlinux_lma, img, vmlinux.image_size);
+ memset(img, 0, vmlinux.image_size);
+ }
- clear_bss_section();
+ /* vmlinux decompression is done, shrink reserved low memory */
+ physmem_reserve(RR_DECOMPRESSOR, 0, (unsigned long)_decompressor_end);
+ if (kaslr_enabled())
+ amode31_lma = randomize_within_range(vmlinux.amode31_size, PAGE_SIZE, 0, SZ_2G);
+ amode31_lma = amode31_lma ?: vmlinux.default_lma - vmlinux.amode31_size;
+ physmem_reserve(RR_AMODE31, amode31_lma, vmlinux.amode31_size);
+
+ /*
+ * The order of the following operations is important:
+ *
+ * - handle_relocs() must follow clear_bss_section() to establish static
+ * memory references to data in .bss to be used by setup_vmem()
+	 *   (i.e. init_mm.pgd)
+ *
+	 * - setup_vmem() must follow handle_relocs() to be able to use
+	 *   static memory references to data in .bss (i.e. init_mm.pgd)
+ *
+ * - copy_bootdata() must follow setup_vmem() to propagate changes to
+ * bootdata made by setup_vmem()
+ */
+ clear_bss_section(vmlinux_lma);
+ handle_relocs(__kaslr_offset);
+ setup_vmem(asce_limit);
copy_bootdata();
- if (IS_ENABLED(CONFIG_RELOCATABLE))
- handle_relocs(__kaslr_offset);
-
- if (__kaslr_offset) {
- /*
- * Save KASLR offset for early dumps, before vmcore_info is set.
- * Mark as uneven to distinguish from real vmcore_info pointer.
- */
- S390_lowcore.vmcore_info = __kaslr_offset | 0x1UL;
- /* Clear non-relocated kernel */
- if (IS_ENABLED(CONFIG_KERNEL_UNCOMPRESSED))
- memset(img, 0, vmlinux.image_size);
- }
- vmlinux.entry();
+
+ /*
+ * Save KASLR offset for early dumps, before vmcore_info is set.
+	 * Mark it as uneven (odd) to distinguish it from a real vmcore_info pointer.
+ */
+ S390_lowcore.vmcore_info = __kaslr_offset ? __kaslr_offset | 0x1UL : 0;
+
+ /*
+ * Jump to the decompressed kernel entry point and switch DAT mode on.
+ */
+ psw.addr = vmlinux.entry;
+ psw.mask = PSW_KERNEL_BITS;
+ __load_psw(psw);
}
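
One detail in setup_kernel_memory_layout() that is easy to miss is the split of the address space below VMALLOC_START between the identity mapping and the vmemmap array: every mapped page costs PAGE_SIZE bytes of identity map plus one struct page of vmemmap, which is why the number of mappable pages is computed as VMALLOC_START / (PAGE_SIZE + sizeof(struct page)). A rough worked example, assuming a 4 KB page and a 64-byte struct page (the actual struct page size depends on the configuration):

	pages        = VMALLOC_START / (4096 + 64)
	identity map = pages * 4096	/* ~98.5% of the space below VMALLOC_START */
	vmemmap      = pages * 64	/* ~1.5%, rounded up to section granularity */
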
diff --git a/arch/s390/boot/string.c b/arch/s390/boot/string.c
index b11e8108773a..faccb33b462c 100644
--- a/arch/s390/boot/string.c
+++ b/arch/s390/boot/string.c
@@ -3,6 +3,7 @@
#include <linux/kernel.h>
#include <linux/errno.h>
#undef CONFIG_KASAN
+#undef CONFIG_KASAN_GENERIC
#include "../lib/string.c"
int strncmp(const char *cs, const char *ct, size_t count)
diff --git a/arch/s390/boot/text_dma.S b/arch/s390/boot/text_dma.S
deleted file mode 100644
index 9715715c4c28..000000000000
--- a/arch/s390/boot/text_dma.S
+++ /dev/null
@@ -1,184 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Code that needs to run below 2 GB.
- *
- * Copyright IBM Corp. 2019
- */
-
-#include <linux/linkage.h>
-#include <asm/errno.h>
-#include <asm/sigp.h>
-
-#ifdef CC_USING_EXPOLINE
- .pushsection .dma.text.__s390_indirect_jump_r14,"axG"
-__dma__s390_indirect_jump_r14:
- larl %r1,0f
- ex 0,0(%r1)
- j .
-0: br %r14
- .popsection
-#endif
-
- .section .dma.text,"ax"
-/*
- * Simplified version of expoline thunk. The normal thunks can not be used here,
- * because they might be more than 2 GB away, and not reachable by the relative
- * branch. No comdat, exrl, etc. optimizations used here, because it only
- * affects a few functions that are not performance-relevant.
- */
- .macro BR_EX_DMA_r14
-#ifdef CC_USING_EXPOLINE
- jg __dma__s390_indirect_jump_r14
-#else
- br %r14
-#endif
- .endm
-
-/*
- * int _diag14_dma(unsigned long rx, unsigned long ry1, unsigned long subcode)
- */
-ENTRY(_diag14_dma)
- lgr %r1,%r2
- lgr %r2,%r3
- lgr %r3,%r4
- lhi %r5,-EIO
- sam31
- diag %r1,%r2,0x14
-.Ldiag14_ex:
- ipm %r5
- srl %r5,28
-.Ldiag14_fault:
- sam64
- lgfr %r2,%r5
- BR_EX_DMA_r14
- EX_TABLE_DMA(.Ldiag14_ex, .Ldiag14_fault)
-ENDPROC(_diag14_dma)
-
-/*
- * int _diag210_dma(struct diag210 *addr)
- */
-ENTRY(_diag210_dma)
- lgr %r1,%r2
- lhi %r2,-1
- sam31
- diag %r1,%r0,0x210
-.Ldiag210_ex:
- ipm %r2
- srl %r2,28
-.Ldiag210_fault:
- sam64
- lgfr %r2,%r2
- BR_EX_DMA_r14
- EX_TABLE_DMA(.Ldiag210_ex, .Ldiag210_fault)
-ENDPROC(_diag210_dma)
-
-/*
- * int _diag26c_dma(void *req, void *resp, enum diag26c_sc subcode)
- */
-ENTRY(_diag26c_dma)
- lghi %r5,-EOPNOTSUPP
- sam31
- diag %r2,%r4,0x26c
-.Ldiag26c_ex:
- sam64
- lgfr %r2,%r5
- BR_EX_DMA_r14
- EX_TABLE_DMA(.Ldiag26c_ex, .Ldiag26c_ex)
-ENDPROC(_diag26c_dma)
-
-/*
- * void _diag0c_dma(struct hypfs_diag0c_entry *entry)
- */
-ENTRY(_diag0c_dma)
- sam31
- diag %r2,%r2,0x0c
- sam64
- BR_EX_DMA_r14
-ENDPROC(_diag0c_dma)
-
-/*
- * void _swsusp_reset_dma(void)
- */
-ENTRY(_swsusp_reset_dma)
- larl %r1,restart_entry
- larl %r2,.Lrestart_diag308_psw
- og %r1,0(%r2)
- stg %r1,0(%r0)
- lghi %r0,0
- diag %r0,%r0,0x308
-restart_entry:
- lhi %r1,1
- sigp %r1,%r0,SIGP_SET_ARCHITECTURE
- sam64
- BR_EX_DMA_r14
-ENDPROC(_swsusp_reset_dma)
-
-/*
- * void _diag308_reset_dma(void)
- *
- * Calls diag 308 subcode 1 and continues execution
- */
-ENTRY(_diag308_reset_dma)
- larl %r4,.Lctlregs # Save control registers
- stctg %c0,%c15,0(%r4)
- lg %r2,0(%r4) # Disable lowcore protection
- nilh %r2,0xefff
- larl %r4,.Lctlreg0
- stg %r2,0(%r4)
- lctlg %c0,%c0,0(%r4)
- larl %r4,.Lfpctl # Floating point control register
- stfpc 0(%r4)
- larl %r4,.Lprefix # Save prefix register
- stpx 0(%r4)
- larl %r4,.Lprefix_zero # Set prefix register to 0
- spx 0(%r4)
- larl %r4,.Lcontinue_psw # Save PSW flags
- epsw %r2,%r3
- stm %r2,%r3,0(%r4)
- larl %r4,restart_part2 # Setup restart PSW at absolute 0
- larl %r3,.Lrestart_diag308_psw
- og %r4,0(%r3) # Save PSW
- lghi %r3,0
- sturg %r4,%r3 # Use sturg, because of large pages
- lghi %r1,1
- lghi %r0,0
- diag %r0,%r1,0x308
-restart_part2:
- lhi %r0,0 # Load r0 with zero
- lhi %r1,2 # Use mode 2 = ESAME (dump)
- sigp %r1,%r0,SIGP_SET_ARCHITECTURE # Switch to ESAME mode
- sam64 # Switch to 64 bit addressing mode
- larl %r4,.Lctlregs # Restore control registers
- lctlg %c0,%c15,0(%r4)
- larl %r4,.Lfpctl # Restore floating point ctl register
- lfpc 0(%r4)
- larl %r4,.Lprefix # Restore prefix register
- spx 0(%r4)
- larl %r4,.Lcontinue_psw # Restore PSW flags
- lpswe 0(%r4)
-.Lcontinue:
- BR_EX_DMA_r14
-ENDPROC(_diag308_reset_dma)
-
- .section .dma.data,"aw",@progbits
-.align 8
-.Lrestart_diag308_psw:
- .long 0x00080000,0x80000000
-
-.align 8
-.Lcontinue_psw:
- .quad 0,.Lcontinue
-
-.align 8
-.Lctlreg0:
- .quad 0
-.Lctlregs:
- .rept 16
- .quad 0
- .endr
-.Lfpctl:
- .long 0
-.Lprefix:
- .long 0
-.Lprefix_zero:
- .long 0
diff --git a/arch/s390/boot/uv.c b/arch/s390/boot/uv.c
index ed007f4a6444..1e66d2cbb096 100644
--- a/arch/s390/boot/uv.c
+++ b/arch/s390/boot/uv.c
@@ -1,9 +1,20 @@
// SPDX-License-Identifier: GPL-2.0
#include <asm/uv.h>
+#include <asm/boot_data.h>
#include <asm/facility.h>
#include <asm/sections.h>
+#include "boot.h"
+#include "uv.h"
+
+/* will be used in arch/s390/kernel/uv.c */
+#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
int __bootdata_preserved(prot_virt_guest);
+#endif
+#if IS_ENABLED(CONFIG_KVM)
+int __bootdata_preserved(prot_virt_host);
+#endif
+struct uv_info __bootdata_preserved(uv_info);
void uv_query_info(void)
{
@@ -15,10 +26,70 @@ void uv_query_info(void)
if (!test_facility(158))
return;
- if (uv_call(0, (uint64_t)&uvcb))
+ /* rc==0x100 means that there is additional data we do not process */
+ if (uv_call(0, (uint64_t)&uvcb) && uvcb.header.rc != 0x100)
return;
+ if (IS_ENABLED(CONFIG_KVM)) {
+ memcpy(uv_info.inst_calls_list, uvcb.inst_calls_list, sizeof(uv_info.inst_calls_list));
+ uv_info.uv_base_stor_len = uvcb.uv_base_stor_len;
+ uv_info.guest_base_stor_len = uvcb.conf_base_phys_stor_len;
+ uv_info.guest_virt_base_stor_len = uvcb.conf_base_virt_stor_len;
+ uv_info.guest_virt_var_stor_len = uvcb.conf_virt_var_stor_len;
+ uv_info.guest_cpu_stor_len = uvcb.cpu_stor_len;
+ uv_info.max_sec_stor_addr = ALIGN(uvcb.max_guest_stor_addr, PAGE_SIZE);
+ uv_info.max_num_sec_conf = uvcb.max_num_sec_conf;
+ uv_info.max_guest_cpu_id = uvcb.max_guest_cpu_id;
+ uv_info.uv_feature_indications = uvcb.uv_feature_indications;
+ uv_info.supp_se_hdr_ver = uvcb.supp_se_hdr_versions;
+ uv_info.supp_se_hdr_pcf = uvcb.supp_se_hdr_pcf;
+ uv_info.conf_dump_storage_state_len = uvcb.conf_dump_storage_state_len;
+ uv_info.conf_dump_finalize_len = uvcb.conf_dump_finalize_len;
+ uv_info.supp_att_req_hdr_ver = uvcb.supp_att_req_hdr_ver;
+ uv_info.supp_att_pflags = uvcb.supp_att_pflags;
+ uv_info.supp_add_secret_req_ver = uvcb.supp_add_secret_req_ver;
+ uv_info.supp_add_secret_pcf = uvcb.supp_add_secret_pcf;
+ uv_info.supp_secret_types = uvcb.supp_secret_types;
+ uv_info.max_secrets = uvcb.max_secrets;
+ }
+
+#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
if (test_bit_inv(BIT_UVC_CMD_SET_SHARED_ACCESS, (unsigned long *)uvcb.inst_calls_list) &&
test_bit_inv(BIT_UVC_CMD_REMOVE_SHARED_ACCESS, (unsigned long *)uvcb.inst_calls_list))
prot_virt_guest = 1;
+#endif
+}
+
+#if IS_ENABLED(CONFIG_KVM)
+unsigned long adjust_to_uv_max(unsigned long limit)
+{
+ if (is_prot_virt_host() && uv_info.max_sec_stor_addr)
+ limit = min_t(unsigned long, limit, uv_info.max_sec_stor_addr);
+ return limit;
+}
+
+static int is_prot_virt_host_capable(void)
+{
+ /* disable if no prot_virt=1 given on command-line */
+ if (!is_prot_virt_host())
+ return 0;
+ /* disable if protected guest virtualization is enabled */
+ if (is_prot_virt_guest())
+ return 0;
+ /* disable if no hardware support */
+ if (!test_facility(158))
+ return 0;
+ /* disable if kdump */
+ if (oldmem_data.start)
+ return 0;
+ /* disable if stand-alone dump */
+ if (ipl_block_valid && is_ipl_block_dump())
+ return 0;
+ return 1;
+}
+
+void sanitize_prot_virt_host(void)
+{
+ prot_virt_host = is_prot_virt_host_capable();
}
+#endif
diff --git a/arch/s390/boot/uv.h b/arch/s390/boot/uv.h
new file mode 100644
index 000000000000..0f3070856f8d
--- /dev/null
+++ b/arch/s390/boot/uv.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef BOOT_UV_H
+#define BOOT_UV_H
+
+#if IS_ENABLED(CONFIG_KVM)
+unsigned long adjust_to_uv_max(unsigned long limit);
+void sanitize_prot_virt_host(void);
+#else
+static inline unsigned long adjust_to_uv_max(unsigned long limit)
+{
+ return limit;
+}
+static inline void sanitize_prot_virt_host(void) {}
+#endif
+
+#if defined(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) || IS_ENABLED(CONFIG_KVM)
+void uv_query_info(void);
+#else
+static inline void uv_query_info(void) {}
+#endif
+
+#endif /* BOOT_UV_H */
diff --git a/arch/s390/boot/version.c b/arch/s390/boot/version.c
index d32e58bdda6a..fd32f038777f 100644
--- a/arch/s390/boot/version.c
+++ b/arch/s390/boot/version.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+#include <generated/utsversion.h>
#include <generated/utsrelease.h>
#include <generated/compile.h>
#include "boot.h"
diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c
new file mode 100644
index 000000000000..e3a4500a5a75
--- /dev/null
+++ b/arch/s390/boot/vmem.c
@@ -0,0 +1,474 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/sched/task.h>
+#include <linux/pgtable.h>
+#include <linux/kasan.h>
+#include <asm/page-states.h>
+#include <asm/pgalloc.h>
+#include <asm/facility.h>
+#include <asm/sections.h>
+#include <asm/ctlreg.h>
+#include <asm/physmem_info.h>
+#include <asm/maccess.h>
+#include <asm/abs_lowcore.h>
+#include "decompressor.h"
+#include "boot.h"
+
+struct ctlreg __bootdata_preserved(s390_invalid_asce);
+
+#ifdef CONFIG_PROC_FS
+atomic_long_t __bootdata_preserved(direct_pages_count[PG_DIRECT_MAP_MAX]);
+#endif
+
+#define init_mm (*(struct mm_struct *)vmlinux.init_mm_off)
+#define swapper_pg_dir vmlinux.swapper_pg_dir_off
+#define invalid_pg_dir vmlinux.invalid_pg_dir_off
+
+enum populate_mode {
+ POPULATE_NONE,
+ POPULATE_DIRECT,
+ POPULATE_ABS_LOWCORE,
+#ifdef CONFIG_KASAN
+ POPULATE_KASAN_MAP_SHADOW,
+ POPULATE_KASAN_ZERO_SHADOW,
+ POPULATE_KASAN_SHALLOW
+#endif
+};
+
+static void pgtable_populate(unsigned long addr, unsigned long end, enum populate_mode mode);
+
+#ifdef CONFIG_KASAN
+
+#define kasan_early_shadow_page vmlinux.kasan_early_shadow_page_off
+#define kasan_early_shadow_pte ((pte_t *)vmlinux.kasan_early_shadow_pte_off)
+#define kasan_early_shadow_pmd ((pmd_t *)vmlinux.kasan_early_shadow_pmd_off)
+#define kasan_early_shadow_pud ((pud_t *)vmlinux.kasan_early_shadow_pud_off)
+#define kasan_early_shadow_p4d ((p4d_t *)vmlinux.kasan_early_shadow_p4d_off)
+#define __sha(x) ((unsigned long)kasan_mem_to_shadow((void *)x))
+
+static pte_t pte_z;
+
+static inline void kasan_populate(unsigned long start, unsigned long end, enum populate_mode mode)
+{
+ start = PAGE_ALIGN_DOWN(__sha(start));
+ end = PAGE_ALIGN(__sha(end));
+ pgtable_populate(start, end, mode);
+}
+
+static void kasan_populate_shadow(void)
+{
+ pmd_t pmd_z = __pmd(__pa(kasan_early_shadow_pte) | _SEGMENT_ENTRY);
+ pud_t pud_z = __pud(__pa(kasan_early_shadow_pmd) | _REGION3_ENTRY);
+ p4d_t p4d_z = __p4d(__pa(kasan_early_shadow_pud) | _REGION2_ENTRY);
+ unsigned long memgap_start = 0;
+ unsigned long untracked_end;
+ unsigned long start, end;
+ int i;
+
+ pte_z = __pte(__pa(kasan_early_shadow_page) | pgprot_val(PAGE_KERNEL_RO));
+ if (!machine.has_nx)
+ pte_z = clear_pte_bit(pte_z, __pgprot(_PAGE_NOEXEC));
+ crst_table_init((unsigned long *)kasan_early_shadow_p4d, p4d_val(p4d_z));
+ crst_table_init((unsigned long *)kasan_early_shadow_pud, pud_val(pud_z));
+ crst_table_init((unsigned long *)kasan_early_shadow_pmd, pmd_val(pmd_z));
+ memset64((u64 *)kasan_early_shadow_pte, pte_val(pte_z), PTRS_PER_PTE);
+ __arch_set_page_dat(kasan_early_shadow_p4d, 1UL << CRST_ALLOC_ORDER);
+ __arch_set_page_dat(kasan_early_shadow_pud, 1UL << CRST_ALLOC_ORDER);
+ __arch_set_page_dat(kasan_early_shadow_pmd, 1UL << CRST_ALLOC_ORDER);
+ __arch_set_page_dat(kasan_early_shadow_pte, 1);
+
+ /*
+ * Current memory layout:
+ * +- 0 -------------+ +- shadow start -+
+ * |1:1 ident mapping| /|1/8 of ident map|
+ * | | / | |
+ * +-end of ident map+ / +----------------+
+ * | ... gap ... | / | kasan |
+ * | | / | zero page |
+ * +- vmalloc area -+ / | mapping |
+ * | vmalloc_size | / | (untracked) |
+ * +- modules vaddr -+ / +----------------+
+ * | 2Gb |/ | unmapped | allocated per module
+ * +- shadow start -+ +----------------+
+ * | 1/8 addr space | | zero pg mapping| (untracked)
+ * +- shadow end ----+---------+- shadow end ---+
+ *
+ * Current memory layout (KASAN_VMALLOC):
+ * +- 0 -------------+ +- shadow start -+
+ * |1:1 ident mapping| /|1/8 of ident map|
+ * | | / | |
+ * +-end of ident map+ / +----------------+
+ * | ... gap ... | / | kasan zero page| (untracked)
+ * | | / | mapping |
+ * +- vmalloc area -+ / +----------------+
+ * | vmalloc_size | / |shallow populate|
+ * +- modules vaddr -+ / +----------------+
+ * | 2Gb |/ |shallow populate|
+ * +- shadow start -+ +----------------+
+ * | 1/8 addr space | | zero pg mapping| (untracked)
+ * +- shadow end ----+---------+- shadow end ---+
+ */
+
+ for_each_physmem_usable_range(i, &start, &end) {
+ kasan_populate(start, end, POPULATE_KASAN_MAP_SHADOW);
+ if (memgap_start && physmem_info.info_source == MEM_DETECT_DIAG260)
+ kasan_populate(memgap_start, start, POPULATE_KASAN_ZERO_SHADOW);
+ memgap_start = end;
+ }
+ if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) {
+ untracked_end = VMALLOC_START;
+ /* shallowly populate kasan shadow for vmalloc and modules */
+ kasan_populate(VMALLOC_START, MODULES_END, POPULATE_KASAN_SHALLOW);
+ } else {
+ untracked_end = MODULES_VADDR;
+ }
+ /* populate kasan shadow for untracked memory */
+ kasan_populate(ident_map_size, untracked_end, POPULATE_KASAN_ZERO_SHADOW);
+ kasan_populate(MODULES_END, _REGION1_SIZE, POPULATE_KASAN_ZERO_SHADOW);
+}
+
+static bool kasan_pgd_populate_zero_shadow(pgd_t *pgd, unsigned long addr,
+ unsigned long end, enum populate_mode mode)
+{
+ if (mode == POPULATE_KASAN_ZERO_SHADOW &&
+ IS_ALIGNED(addr, PGDIR_SIZE) && end - addr >= PGDIR_SIZE) {
+ pgd_populate(&init_mm, pgd, kasan_early_shadow_p4d);
+ return true;
+ }
+ return false;
+}
+
+static bool kasan_p4d_populate_zero_shadow(p4d_t *p4d, unsigned long addr,
+ unsigned long end, enum populate_mode mode)
+{
+ if (mode == POPULATE_KASAN_ZERO_SHADOW &&
+ IS_ALIGNED(addr, P4D_SIZE) && end - addr >= P4D_SIZE) {
+ p4d_populate(&init_mm, p4d, kasan_early_shadow_pud);
+ return true;
+ }
+ return false;
+}
+
+static bool kasan_pud_populate_zero_shadow(pud_t *pud, unsigned long addr,
+ unsigned long end, enum populate_mode mode)
+{
+ if (mode == POPULATE_KASAN_ZERO_SHADOW &&
+ IS_ALIGNED(addr, PUD_SIZE) && end - addr >= PUD_SIZE) {
+ pud_populate(&init_mm, pud, kasan_early_shadow_pmd);
+ return true;
+ }
+ return false;
+}
+
+static bool kasan_pmd_populate_zero_shadow(pmd_t *pmd, unsigned long addr,
+ unsigned long end, enum populate_mode mode)
+{
+ if (mode == POPULATE_KASAN_ZERO_SHADOW &&
+ IS_ALIGNED(addr, PMD_SIZE) && end - addr >= PMD_SIZE) {
+ pmd_populate(&init_mm, pmd, kasan_early_shadow_pte);
+ return true;
+ }
+ return false;
+}
+
+static bool kasan_pte_populate_zero_shadow(pte_t *pte, enum populate_mode mode)
+{
+ if (mode == POPULATE_KASAN_ZERO_SHADOW) {
+ set_pte(pte, pte_z);
+ return true;
+ }
+ return false;
+}
+#else
+
+static inline void kasan_populate_shadow(void) {}
+
+static inline bool kasan_pgd_populate_zero_shadow(pgd_t *pgd, unsigned long addr,
+ unsigned long end, enum populate_mode mode)
+{
+ return false;
+}
+
+static inline bool kasan_p4d_populate_zero_shadow(p4d_t *p4d, unsigned long addr,
+ unsigned long end, enum populate_mode mode)
+{
+ return false;
+}
+
+static inline bool kasan_pud_populate_zero_shadow(pud_t *pud, unsigned long addr,
+ unsigned long end, enum populate_mode mode)
+{
+ return false;
+}
+
+static inline bool kasan_pmd_populate_zero_shadow(pmd_t *pmd, unsigned long addr,
+ unsigned long end, enum populate_mode mode)
+{
+ return false;
+}
+
+static bool kasan_pte_populate_zero_shadow(pte_t *pte, enum populate_mode mode)
+{
+ return false;
+}
+
+#endif
+
+/*
+ * Mimic virt_to_kpte() for lack of an init_mm symbol. Skip the pmd NULL check though.
+ */
+static inline pte_t *__virt_to_kpte(unsigned long va)
+{
+ return pte_offset_kernel(pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va), va);
+}
+
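+/*
+ * Descriptive comment (not in the original patch): allocate a region or
+ * segment (CRST) table for the boot page tables, preset all entries with
+ * @val and mark its pages for DAT use.
+ */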
+static void *boot_crst_alloc(unsigned long val)
+{
+ unsigned long size = PAGE_SIZE << CRST_ALLOC_ORDER;
+ unsigned long *table;
+
+ table = (unsigned long *)physmem_alloc_top_down(RR_VMEM, size, size);
+ crst_table_init(table, val);
+ __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER);
+ return table;
+}
+
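+/*
+ * Descriptive comment (not in the original patch): page tables are only half
+ * a page (_PAGE_TABLE_SIZE) in size, so a full page is allocated at once,
+ * its upper half is handed out first and the lower half is kept as leftover
+ * for the next call.
+ */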
+static pte_t *boot_pte_alloc(void)
+{
+ static void *pte_leftover;
+ pte_t *pte;
+
+ /*
+ * handling pte_leftovers this way helps to avoid memory fragmentation
+ * during POPULATE_KASAN_MAP_SHADOW when EDAT is off
+ */
+ if (!pte_leftover) {
+ pte_leftover = (void *)physmem_alloc_top_down(RR_VMEM, PAGE_SIZE, PAGE_SIZE);
+ pte = pte_leftover + _PAGE_TABLE_SIZE;
+ __arch_set_page_dat(pte, 1);
+ } else {
+ pte = pte_leftover;
+ pte_leftover = NULL;
+ }
+
+ memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE);
+ return pte;
+}
+
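+/*
+ * Descriptive comment (not in the original patch): return the physical
+ * address to be mapped for @addr in the given populate mode, or -1 if the
+ * mode does not map real memory; the kasan map-shadow case maps a freshly
+ * allocated and cleared block instead.
+ */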
+static unsigned long _pa(unsigned long addr, unsigned long size, enum populate_mode mode)
+{
+ switch (mode) {
+ case POPULATE_NONE:
+ return -1;
+ case POPULATE_DIRECT:
+ return addr;
+ case POPULATE_ABS_LOWCORE:
+ return __abs_lowcore_pa(addr);
+#ifdef CONFIG_KASAN
+ case POPULATE_KASAN_MAP_SHADOW:
+ addr = physmem_alloc_top_down(RR_VMEM, size, size);
+ memset((void *)addr, 0, size);
+ return addr;
+#endif
+ default:
+ return -1;
+ }
+}
+
+static bool can_large_pud(pud_t *pu_dir, unsigned long addr, unsigned long end)
+{
+ return machine.has_edat2 &&
+ IS_ALIGNED(addr, PUD_SIZE) && (end - addr) >= PUD_SIZE;
+}
+
+static bool can_large_pmd(pmd_t *pm_dir, unsigned long addr, unsigned long end)
+{
+ return machine.has_edat1 &&
+ IS_ALIGNED(addr, PMD_SIZE) && (end - addr) >= PMD_SIZE;
+}
+
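+/*
+ * Descriptive comment (not in the original patch): fill missing ptes for
+ * [addr, end), either with the kasan zero shadow or with 4K mappings, and
+ * account the 4K pages added to the direct mapping.
+ */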
+static void pgtable_pte_populate(pmd_t *pmd, unsigned long addr, unsigned long end,
+ enum populate_mode mode)
+{
+ unsigned long pages = 0;
+ pte_t *pte, entry;
+
+ pte = pte_offset_kernel(pmd, addr);
+ for (; addr < end; addr += PAGE_SIZE, pte++) {
+ if (pte_none(*pte)) {
+ if (kasan_pte_populate_zero_shadow(pte, mode))
+ continue;
+ entry = __pte(_pa(addr, PAGE_SIZE, mode));
+ entry = set_pte_bit(entry, PAGE_KERNEL);
+ if (!machine.has_nx)
+ entry = clear_pte_bit(entry, __pgprot(_PAGE_NOEXEC));
+ set_pte(pte, entry);
+ pages++;
+ }
+ }
+ if (mode == POPULATE_DIRECT)
+ update_page_count(PG_DIRECT_MAP_4K, pages);
+}
+
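+/*
+ * Descriptive comment (not in the original patch): populate missing pmd
+ * entries, using 1M segment mappings where EDAT1 is available and the range
+ * allows it, otherwise allocate a page table and descend to the pte level.
+ */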
+static void pgtable_pmd_populate(pud_t *pud, unsigned long addr, unsigned long end,
+ enum populate_mode mode)
+{
+ unsigned long next, pages = 0;
+ pmd_t *pmd, entry;
+ pte_t *pte;
+
+ pmd = pmd_offset(pud, addr);
+ for (; addr < end; addr = next, pmd++) {
+ next = pmd_addr_end(addr, end);
+ if (pmd_none(*pmd)) {
+ if (kasan_pmd_populate_zero_shadow(pmd, addr, next, mode))
+ continue;
+ if (can_large_pmd(pmd, addr, next)) {
+ entry = __pmd(_pa(addr, _SEGMENT_SIZE, mode));
+ entry = set_pmd_bit(entry, SEGMENT_KERNEL);
+ if (!machine.has_nx)
+ entry = clear_pmd_bit(entry, __pgprot(_SEGMENT_ENTRY_NOEXEC));
+ set_pmd(pmd, entry);
+ pages++;
+ continue;
+ }
+ pte = boot_pte_alloc();
+ pmd_populate(&init_mm, pmd, pte);
+ } else if (pmd_large(*pmd)) {
+ continue;
+ }
+ pgtable_pte_populate(pmd, addr, next, mode);
+ }
+ if (mode == POPULATE_DIRECT)
+ update_page_count(PG_DIRECT_MAP_1M, pages);
+}
+
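+/*
+ * Descriptive comment (not in the original patch): populate missing pud
+ * entries, using 2G (region-third) mappings where EDAT2 is available and the
+ * range allows it, otherwise descend to the pmd level.
+ */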
+static void pgtable_pud_populate(p4d_t *p4d, unsigned long addr, unsigned long end,
+ enum populate_mode mode)
+{
+ unsigned long next, pages = 0;
+ pud_t *pud, entry;
+ pmd_t *pmd;
+
+ pud = pud_offset(p4d, addr);
+ for (; addr < end; addr = next, pud++) {
+ next = pud_addr_end(addr, end);
+ if (pud_none(*pud)) {
+ if (kasan_pud_populate_zero_shadow(pud, addr, next, mode))
+ continue;
+ if (can_large_pud(pud, addr, next)) {
+ entry = __pud(_pa(addr, _REGION3_SIZE, mode));
+ entry = set_pud_bit(entry, REGION3_KERNEL);
+ if (!machine.has_nx)
+ entry = clear_pud_bit(entry, __pgprot(_REGION_ENTRY_NOEXEC));
+ set_pud(pud, entry);
+ pages++;
+ continue;
+ }
+ pmd = boot_crst_alloc(_SEGMENT_ENTRY_EMPTY);
+ pud_populate(&init_mm, pud, pmd);
+ } else if (pud_large(*pud)) {
+ continue;
+ }
+ pgtable_pmd_populate(pud, addr, next, mode);
+ }
+ if (mode == POPULATE_DIRECT)
+ update_page_count(PG_DIRECT_MAP_2G, pages);
+}
+
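+/*
+ * Descriptive comment (not in the original patch): allocate missing pud
+ * tables (or attach the kasan zero shadow) and descend to the pud level.
+ */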
+static void pgtable_p4d_populate(pgd_t *pgd, unsigned long addr, unsigned long end,
+ enum populate_mode mode)
+{
+ unsigned long next;
+ p4d_t *p4d;
+ pud_t *pud;
+
+ p4d = p4d_offset(pgd, addr);
+ for (; addr < end; addr = next, p4d++) {
+ next = p4d_addr_end(addr, end);
+ if (p4d_none(*p4d)) {
+ if (kasan_p4d_populate_zero_shadow(p4d, addr, next, mode))
+ continue;
+ pud = boot_crst_alloc(_REGION3_ENTRY_EMPTY);
+ p4d_populate(&init_mm, p4d, pud);
+ }
+ pgtable_pud_populate(p4d, addr, next, mode);
+ }
+}
+
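+/*
+ * Descriptive comment (not in the original patch): top level of the boot
+ * page table walker; allocates missing lower level tables and descends,
+ * except for POPULATE_KASAN_SHALLOW where only the top level is populated.
+ */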
+static void pgtable_populate(unsigned long addr, unsigned long end, enum populate_mode mode)
+{
+ unsigned long next;
+ pgd_t *pgd;
+ p4d_t *p4d;
+
+ pgd = pgd_offset(&init_mm, addr);
+ for (; addr < end; addr = next, pgd++) {
+ next = pgd_addr_end(addr, end);
+ if (pgd_none(*pgd)) {
+ if (kasan_pgd_populate_zero_shadow(pgd, addr, next, mode))
+ continue;
+ p4d = boot_crst_alloc(_REGION2_ENTRY_EMPTY);
+ pgd_populate(&init_mm, pgd, p4d);
+ }
+#ifdef CONFIG_KASAN
+ if (mode == POPULATE_KASAN_SHALLOW)
+ continue;
+#endif
+ pgtable_p4d_populate(pgd, addr, next, mode);
+ }
+}
+
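+/*
+ * Descriptive comment (not in the original patch): create the boot page
+ * tables - mark online memory no-dat, map the lowcore with 4K pages,
+ * establish the identity mapping, the absolute lowcore mapping, page tables
+ * for the memcpy_real area, populate the kasan shadow and load the new ASCEs
+ * into control registers 1, 7 and 13.
+ */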
+void setup_vmem(unsigned long asce_limit)
+{
+ unsigned long start, end;
+ unsigned long asce_type;
+ unsigned long asce_bits;
+ int i;
+
+ /*
+ * Mark whole memory as no-dat. This must be done before any
+ * page tables are allocated, or kernel image builtin pages
+ * are marked as dat tables.
+ */
+ for_each_physmem_online_range(i, &start, &end)
+ __arch_set_page_nodat((void *)start, (end - start) >> PAGE_SHIFT);
+
+ if (asce_limit == _REGION1_SIZE) {
+ asce_type = _REGION2_ENTRY_EMPTY;
+ asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
+ } else {
+ asce_type = _REGION3_ENTRY_EMPTY;
+ asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
+ }
+ s390_invalid_asce.val = invalid_pg_dir | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
+
+ crst_table_init((unsigned long *)swapper_pg_dir, asce_type);
+ crst_table_init((unsigned long *)invalid_pg_dir, _REGION3_ENTRY_EMPTY);
+ __arch_set_page_dat((void *)swapper_pg_dir, 1UL << CRST_ALLOC_ORDER);
+ __arch_set_page_dat((void *)invalid_pg_dir, 1UL << CRST_ALLOC_ORDER);
+
+ /*
+ * To allow prefixing, the lowcore must be mapped with 4KB pages.
+ * To prevent creation of a large page at address 0, first map the
+ * lowcore and create the identity mapping only afterwards.
+ */
+ pgtable_populate(0, sizeof(struct lowcore), POPULATE_DIRECT);
+ for_each_physmem_usable_range(i, &start, &end)
+ pgtable_populate(start, end, POPULATE_DIRECT);
+ pgtable_populate(__abs_lowcore, __abs_lowcore + sizeof(struct lowcore),
+ POPULATE_ABS_LOWCORE);
+ pgtable_populate(__memcpy_real_area, __memcpy_real_area + PAGE_SIZE,
+ POPULATE_NONE);
+ memcpy_real_ptep = __virt_to_kpte(__memcpy_real_area);
+
+ kasan_populate_shadow();
+
+ S390_lowcore.kernel_asce.val = swapper_pg_dir | asce_bits;
+ S390_lowcore.user_asce = s390_invalid_asce;
+
+ local_ctl_load(1, &S390_lowcore.kernel_asce);
+ local_ctl_load(7, &S390_lowcore.user_asce);
+ local_ctl_load(13, &S390_lowcore.kernel_asce);
+
+ init_mm.context.asce = S390_lowcore.kernel_asce.val;
+}
diff --git a/arch/s390/boot/compressed/vmlinux.lds.S b/arch/s390/boot/vmlinux.lds.S
index 44561b2c3712..389df0e0d9e5 100644
--- a/arch/s390/boot/compressed/vmlinux.lds.S
+++ b/arch/s390/boot/vmlinux.lds.S
@@ -1,6 +1,10 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <asm-generic/vmlinux.lds.h>
#include <asm/vmlinux.lds.h>
+#include <asm/thread_info.h>
+#include <asm/page.h>
+#include <asm/sclp.h>
+#include "boot.h"
OUTPUT_FORMAT("elf64-s390", "elf64-s390", "elf64-s390")
OUTPUT_ARCH(s390:64-bit)
@@ -10,11 +14,19 @@ ENTRY(startup)
SECTIONS
{
. = 0;
+ .ipldata : {
+ *(.ipldata)
+ }
+ . = IPL_START;
.head.text : {
_head = . ;
HEAD_TEXT
_ehead = . ;
}
+ . = PARMAREA;
+ .parmarea : {
+ *(.parmarea)
+ }
.text : {
_text = .; /* Text */
*(.text)
@@ -27,38 +39,42 @@ SECTIONS
*(.rodata.*)
_erodata = . ;
}
+ NOTES
.data : {
_data = . ;
*(.data)
*(.data.*)
_edata = . ;
}
- /*
- * .dma section for code, data, ex_table that need to stay below 2 GB,
- * even when the kernel is relocate: above 2 GB.
- */
- . = ALIGN(PAGE_SIZE);
- _sdma = .;
- .dma.text : {
- _stext_dma = .;
- *(.dma.text)
- . = ALIGN(PAGE_SIZE);
- _etext_dma = .;
- }
- . = ALIGN(16);
- .dma.ex_table : {
- _start_dma_ex_table = .;
- KEEP(*(.dma.ex_table))
- _stop_dma_ex_table = .;
- }
- .dma.data : { *(.dma.data) }
- . = ALIGN(PAGE_SIZE);
- _edma = .;
BOOT_DATA
BOOT_DATA_PRESERVED
/*
+ * This is the BSS section of the decompressor and not of the decompressed Linux kernel.
+ * It will consume space in the decompressor's image.
+ */
+ . = ALIGN(8);
+ .bss : {
+ _bss = . ;
+ *(.bss)
+ *(.bss.*)
+ *(COMMON)
+ /*
+ * Stacks for the decompressor
+ */
+ . = ALIGN(PAGE_SIZE);
+ _dump_info_stack_start = .;
+ . += PAGE_SIZE;
+ _dump_info_stack_end = .;
+ . = ALIGN(PAGE_SIZE);
+ _stack_start = .;
+ . += BOOT_STACK_SIZE;
+ _stack_end = .;
+ _ebss = .;
+ }
+
+ /*
 * uncompressed image info used by the decompressor; it should match
 * struct vmlinux_info. It comes from .vmlinux.info section of
 * uncompressed vmlinux in the form of info.o
@@ -69,6 +85,16 @@ SECTIONS
*(.vmlinux.info)
}
+ .decompressor.syms : {
+ . += 1; /* make sure we have \0 before the first entry */
+ . = ALIGN(2);
+ _decompressor_syms_start = .;
+ *(.decompressor.syms)
+ _decompressor_syms_end = .;
+ }
+
+ _decompressor_end = .;
+
#ifdef CONFIG_KERNEL_UNCOMPRESSED
. = 0x100000;
#else
@@ -78,17 +104,17 @@ SECTIONS
_compressed_start = .;
*(.vmlinux.bin.compressed)
_compressed_end = .;
- FILL(0xff);
- . = ALIGN(4096);
}
- . = ALIGN(256);
- .bss : {
- _bss = . ;
- *(.bss)
- *(.bss.*)
- *(COMMON)
- . = ALIGN(8); /* For convenience during zeroing */
- _ebss = .;
+
+#define SB_TRAILER_SIZE 32
+ /* Trailer needed for Secure Boot */
+ . += SB_TRAILER_SIZE; /* make sure .sb.trailer does not overwrite the previous section */
+ . = ALIGN(4096) - SB_TRAILER_SIZE;
+ .sb.trailer : {
+ QUAD(0)
+ QUAD(0)
+ QUAD(0)
+ QUAD(0x000000207a49504c)
}
_end = .;
diff --git a/arch/s390/configs/btf.config b/arch/s390/configs/btf.config
new file mode 100644
index 000000000000..eb7f84f5925c
--- /dev/null
+++ b/arch/s390/configs/btf.config
@@ -0,0 +1,2 @@
+# Help: Enable BTF debug info
+CONFIG_DEBUG_INFO_BTF=y
diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig
index 2e60c80395ab..cae2dd34fbb4 100644
--- a/arch/s390/configs/debug_defconfig
+++ b/arch/s390/configs/debug_defconfig
@@ -1,9 +1,16 @@
+CONFIG_UAPI_HEADER_TEST=y
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
+CONFIG_WATCH_QUEUE=y
CONFIG_AUDIT=y
CONFIG_NO_HZ_IDLE=y
CONFIG_HIGH_RES_TIMERS=y
+CONFIG_BPF_SYSCALL=y
+CONFIG_BPF_JIT=y
+CONFIG_BPF_JIT_ALWAYS_ON=y
+CONFIG_BPF_LSM=y
CONFIG_PREEMPT=y
+CONFIG_SCHED_CORE=y
CONFIG_BSD_PROCESS_ACCT=y
CONFIG_BSD_PROCESS_ACCT_V3=y
CONFIG_TASKSTATS=y
@@ -14,10 +21,8 @@ CONFIG_IKCONFIG=y
CONFIG_IKCONFIG_PROC=y
CONFIG_NUMA_BALANCING=y
CONFIG_MEMCG=y
-CONFIG_MEMCG_SWAP=y
CONFIG_BLK_CGROUP=y
CONFIG_CFS_BANDWIDTH=y
-CONFIG_RT_GROUP_SCHED=y
CONFIG_CGROUP_PIDS=y
CONFIG_CGROUP_RDMA=y
CONFIG_CGROUP_FREEZER=y
@@ -27,55 +32,57 @@ CONFIG_CGROUP_DEVICE=y
CONFIG_CGROUP_CPUACCT=y
CONFIG_CGROUP_PERF=y
CONFIG_CGROUP_BPF=y
+CONFIG_CGROUP_MISC=y
CONFIG_NAMESPACES=y
CONFIG_USER_NS=y
CONFIG_CHECKPOINT_RESTORE=y
CONFIG_SCHED_AUTOGROUP=y
-CONFIG_BLK_DEV_INITRD=y
CONFIG_EXPERT=y
# CONFIG_SYSFS_SYSCALL is not set
-CONFIG_BPF_SYSCALL=y
-CONFIG_USERFAULTFD=y
-# CONFIG_COMPAT_BRK is not set
CONFIG_PROFILING=y
+CONFIG_KEXEC=y
+CONFIG_KEXEC_FILE=y
+CONFIG_KEXEC_SIG=y
+CONFIG_CRASH_DUMP=y
CONFIG_LIVEPATCH=y
-CONFIG_TUNE_ZEC12=y
+CONFIG_MARCH_Z13=y
CONFIG_NR_CPUS=512
CONFIG_NUMA=y
CONFIG_HZ_100=y
-CONFIG_KEXEC_FILE=y
-CONFIG_KEXEC_SIG=y
+CONFIG_CERT_STORE=y
CONFIG_EXPOLINE=y
CONFIG_EXPOLINE_AUTO=y
CONFIG_CHSC_SCH=y
CONFIG_VFIO_CCW=m
CONFIG_VFIO_AP=m
-CONFIG_CRASH_DUMP=y
-CONFIG_HIBERNATION=y
-CONFIG_PM_DEBUG=y
+CONFIG_PROTECTED_VIRTUALIZATION_GUEST=y
CONFIG_CMM=m
CONFIG_APPLDATA_BASE=y
+CONFIG_S390_HYPFS_FS=y
CONFIG_KVM=m
-CONFIG_VHOST_NET=m
-CONFIG_VHOST_VSOCK=m
-CONFIG_OPROFILE=m
+CONFIG_S390_UNWIND_SELFTEST=m
+CONFIG_S390_KPROBES_SANITY_TEST=m
+CONFIG_S390_MODULES_SANITY_TEST=m
CONFIG_KPROBES=y
CONFIG_JUMP_LABEL=y
CONFIG_STATIC_KEYS_SELFTEST=y
+CONFIG_SECCOMP_CACHE_DEBUG=y
CONFIG_LOCK_EVENT_COUNTS=y
+# CONFIG_GCC_PLUGINS is not set
CONFIG_MODULES=y
CONFIG_MODULE_FORCE_LOAD=y
CONFIG_MODULE_UNLOAD=y
CONFIG_MODULE_FORCE_UNLOAD=y
+CONFIG_MODULE_UNLOAD_TAINT_TRACKING=y
CONFIG_MODVERSIONS=y
CONFIG_MODULE_SRCVERSION_ALL=y
-CONFIG_MODULE_SIG_SHA256=y
-CONFIG_UNUSED_SYMBOLS=y
-CONFIG_BLK_DEV_INTEGRITY=y
CONFIG_BLK_DEV_THROTTLING=y
CONFIG_BLK_WBT=y
CONFIG_BLK_CGROUP_IOLATENCY=y
CONFIG_BLK_CGROUP_IOCOST=y
+CONFIG_BLK_CGROUP_IOPRIO=y
+CONFIG_BLK_INLINE_ENCRYPTION=y
+CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK=y
CONFIG_PARTITION_ADVANCED=y
CONFIG_IBM_PARTITION=y
CONFIG_BSD_DISKLABEL=y
@@ -83,25 +90,27 @@ CONFIG_MINIX_SUBPARTITION=y
CONFIG_SOLARIS_X86_PARTITION=y
CONFIG_UNIXWARE_DISKLABEL=y
CONFIG_IOSCHED_BFQ=y
-CONFIG_BFQ_GROUP_IOSCHED=y
CONFIG_BINFMT_MISC=m
+CONFIG_ZSWAP=y
+CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y
+CONFIG_ZSMALLOC_STAT=y
+CONFIG_SLUB_STATS=y
+# CONFIG_COMPAT_BRK is not set
CONFIG_MEMORY_HOTPLUG=y
CONFIG_MEMORY_HOTREMOVE=y
CONFIG_KSM=y
CONFIG_TRANSPARENT_HUGEPAGE=y
-CONFIG_CLEANCACHE=y
-CONFIG_FRONTSWAP=y
CONFIG_CMA_DEBUG=y
CONFIG_CMA_DEBUGFS=y
+CONFIG_CMA_SYSFS=y
+CONFIG_CMA_AREAS=7
CONFIG_MEM_SOFT_DIRTY=y
-CONFIG_ZSWAP=y
-CONFIG_ZBUD=m
-CONFIG_ZSMALLOC=m
-CONFIG_ZSMALLOC_STAT=y
CONFIG_DEFERRED_STRUCT_PAGE_INIT=y
CONFIG_IDLE_PAGE_TRACKING=y
CONFIG_PERCPU_STATS=y
-CONFIG_GUP_BENCHMARK=y
+CONFIG_GUP_TEST=y
+CONFIG_ANON_VMA_NAME=y
+CONFIG_USERFAULTFD=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_PACKET_DIAG=m
@@ -129,6 +138,7 @@ CONFIG_SYN_COOKIES=y
CONFIG_NET_IPVTI=m
CONFIG_INET_AH=m
CONFIG_INET_ESP=m
+CONFIG_INET_ESPINTCP=y
CONFIG_INET_IPCOMP=m
CONFIG_INET_DIAG=m
CONFIG_INET_UDP_DIAG=m
@@ -143,6 +153,7 @@ CONFIG_TCP_CONG_ILLINOIS=m
CONFIG_IPV6_ROUTER_PREF=y
CONFIG_INET6_AH=m
CONFIG_INET6_ESP=m
+CONFIG_INET6_ESPINTCP=y
CONFIG_INET6_IPCOMP=m
CONFIG_IPV6_MIP6=m
CONFIG_IPV6_VTI=m
@@ -150,9 +161,14 @@ CONFIG_IPV6_SIT=m
CONFIG_IPV6_GRE=m
CONFIG_IPV6_MULTIPLE_TABLES=y
CONFIG_IPV6_SUBTREES=y
+CONFIG_IPV6_RPL_LWTUNNEL=y
+CONFIG_MPTCP=y
CONFIG_NETFILTER=y
+CONFIG_BRIDGE_NETFILTER=m
+CONFIG_NETFILTER_NETLINK_HOOK=m
CONFIG_NF_CONNTRACK=m
CONFIG_NF_CONNTRACK_SECMARK=y
+CONFIG_NF_CONNTRACK_PROCFS=y
CONFIG_NF_CONNTRACK_EVENTS=y
CONFIG_NF_CONNTRACK_TIMEOUT=y
CONFIG_NF_CONNTRACK_TIMESTAMP=y
@@ -169,13 +185,16 @@ CONFIG_NF_CONNTRACK_TFTP=m
CONFIG_NF_CT_NETLINK=m
CONFIG_NF_CT_NETLINK_TIMEOUT=m
CONFIG_NF_TABLES=m
+CONFIG_NF_TABLES_INET=y
CONFIG_NFT_CT=m
-CONFIG_NFT_COUNTER=m
CONFIG_NFT_LOG=m
CONFIG_NFT_LIMIT=m
CONFIG_NFT_NAT=m
+CONFIG_NFT_REJECT=m
CONFIG_NFT_COMPAT=m
CONFIG_NFT_HASH=m
+CONFIG_NFT_FIB_INET=m
+CONFIG_NETFILTER_XTABLES_COMPAT=y
CONFIG_NETFILTER_XT_SET=m
CONFIG_NETFILTER_XT_TARGET_AUDIT=m
CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
@@ -264,10 +283,12 @@ CONFIG_IP_VS_DH=m
CONFIG_IP_VS_SH=m
CONFIG_IP_VS_SED=m
CONFIG_IP_VS_NQ=m
+CONFIG_IP_VS_TWOS=m
CONFIG_IP_VS_FTP=m
CONFIG_IP_VS_PE_SIP=m
-CONFIG_NF_TABLES_IPV4=y
+CONFIG_NFT_FIB_IPV4=m
CONFIG_NF_TABLES_ARP=y
+CONFIG_NF_LOG_IPV4=m
CONFIG_IP_NF_IPTABLES=m
CONFIG_IP_NF_MATCH_AH=m
CONFIG_IP_NF_MATCH_ECN=m
@@ -278,7 +299,6 @@ CONFIG_IP_NF_TARGET_REJECT=m
CONFIG_IP_NF_NAT=m
CONFIG_IP_NF_TARGET_MASQUERADE=m
CONFIG_IP_NF_MANGLE=m
-CONFIG_IP_NF_TARGET_CLUSTERIP=m
CONFIG_IP_NF_TARGET_ECN=m
CONFIG_IP_NF_TARGET_TTL=m
CONFIG_IP_NF_RAW=m
@@ -286,7 +306,7 @@ CONFIG_IP_NF_SECURITY=m
CONFIG_IP_NF_ARPTABLES=m
CONFIG_IP_NF_ARPFILTER=m
CONFIG_IP_NF_ARP_MANGLE=m
-CONFIG_NF_TABLES_IPV6=y
+CONFIG_NFT_FIB_IPV6=m
CONFIG_IP6_NF_IPTABLES=m
CONFIG_IP6_NF_MATCH_AH=m
CONFIG_IP6_NF_MATCH_EUI64=m
@@ -315,11 +335,11 @@ CONFIG_L2TP_DEBUGFS=m
CONFIG_L2TP_V3=y
CONFIG_L2TP_IP=m
CONFIG_L2TP_ETH=m
-CONFIG_BRIDGE=m
+CONFIG_BRIDGE=y
+CONFIG_BRIDGE_MRP=y
CONFIG_VLAN_8021Q=m
CONFIG_VLAN_8021Q_GVRP=y
CONFIG_NET_SCHED=y
-CONFIG_NET_SCH_CBQ=m
CONFIG_NET_SCH_HTB=m
CONFIG_NET_SCH_HFSC=m
CONFIG_NET_SCH_PRIO=m
@@ -330,7 +350,6 @@ CONFIG_NET_SCH_SFQ=m
CONFIG_NET_SCH_TEQL=m
CONFIG_NET_SCH_TBF=m
CONFIG_NET_SCH_GRED=m
-CONFIG_NET_SCH_DSMARK=m
CONFIG_NET_SCH_NETEM=m
CONFIG_NET_SCH_DRR=m
CONFIG_NET_SCH_MQPRIO=m
@@ -340,15 +359,13 @@ CONFIG_NET_SCH_CODEL=m
CONFIG_NET_SCH_FQ_CODEL=m
CONFIG_NET_SCH_INGRESS=m
CONFIG_NET_SCH_PLUG=m
+CONFIG_NET_SCH_ETS=m
CONFIG_NET_CLS_BASIC=m
-CONFIG_NET_CLS_TCINDEX=m
CONFIG_NET_CLS_ROUTE4=m
CONFIG_NET_CLS_FW=m
CONFIG_NET_CLS_U32=m
CONFIG_CLS_U32_PERF=y
CONFIG_CLS_U32_MARK=y
-CONFIG_NET_CLS_RSVP=m
-CONFIG_NET_CLS_RSVP6=m
CONFIG_NET_CLS_FLOW=m
CONFIG_NET_CLS_CGROUP=y
CONFIG_NET_CLS_BPF=m
@@ -357,30 +374,33 @@ CONFIG_NET_ACT_POLICE=m
CONFIG_NET_ACT_GACT=m
CONFIG_GACT_PROB=y
CONFIG_NET_ACT_MIRRED=m
-CONFIG_NET_ACT_IPT=m
CONFIG_NET_ACT_NAT=m
CONFIG_NET_ACT_PEDIT=m
CONFIG_NET_ACT_SIMP=m
CONFIG_NET_ACT_SKBEDIT=m
CONFIG_NET_ACT_CSUM=m
+CONFIG_NET_ACT_GATE=m
+CONFIG_NET_TC_SKB_EXT=y
CONFIG_DNS_RESOLVER=y
CONFIG_OPENVSWITCH=m
CONFIG_VSOCKETS=m
CONFIG_VIRTIO_VSOCKETS=m
CONFIG_NETLINK_DIAG=m
+CONFIG_NET_SWITCHDEV=y
CONFIG_CGROUP_NET_PRIO=y
-CONFIG_BPF_JIT=y
CONFIG_NET_PKTGEN=m
-# CONFIG_NET_DROP_MONITOR is not set
CONFIG_PCI=y
+# CONFIG_PCIEASPM is not set
CONFIG_PCI_DEBUG=y
+CONFIG_PCI_IOV=y
CONFIG_HOTPLUG_PCI=y
CONFIG_HOTPLUG_PCI_S390=y
CONFIG_DEVTMPFS=y
+CONFIG_DEVTMPFS_SAFE=y
+# CONFIG_FW_LOADER is not set
CONFIG_CONNECTOR=y
-CONFIG_ZRAM=m
+CONFIG_ZRAM=y
CONFIG_BLK_DEV_LOOP=m
-CONFIG_BLK_DEV_CRYPTOLOOP=m
CONFIG_BLK_DEV_DRBD=m
CONFIG_BLK_DEV_NBD=m
CONFIG_BLK_DEV_RAM=y
@@ -401,12 +421,12 @@ CONFIG_SCSI_ENCLOSURE=m
CONFIG_SCSI_CONSTANTS=y
CONFIG_SCSI_LOGGING=y
CONFIG_SCSI_SPI_ATTRS=m
-CONFIG_SCSI_FC_ATTRS=y
+CONFIG_SCSI_FC_ATTRS=m
CONFIG_SCSI_SAS_LIBSAS=m
CONFIG_SCSI_SRP_ATTRS=m
CONFIG_ISCSI_TCP=m
CONFIG_SCSI_DEBUG=m
-CONFIG_ZFCP=y
+CONFIG_ZFCP=m
CONFIG_SCSI_VIRTIO=m
CONFIG_SCSI_DH=y
CONFIG_SCSI_DH_RDAC=m
@@ -415,12 +435,13 @@ CONFIG_SCSI_DH_EMC=m
CONFIG_SCSI_DH_ALUA=m
CONFIG_MD=y
CONFIG_BLK_DEV_MD=y
+# CONFIG_MD_BITMAP_FILE is not set
CONFIG_MD_LINEAR=m
CONFIG_MD_MULTIPATH=m
CONFIG_MD_FAULTY=m
CONFIG_MD_CLUSTER=m
CONFIG_BCACHE=m
-CONFIG_BLK_DEV_DM=m
+CONFIG_BLK_DEV_DM=y
CONFIG_DM_UNSTRIPED=m
CONFIG_DM_CRYPT=m
CONFIG_DM_SNAPSHOT=m
@@ -434,12 +455,16 @@ CONFIG_DM_ZERO=m
CONFIG_DM_MULTIPATH=m
CONFIG_DM_MULTIPATH_QL=m
CONFIG_DM_MULTIPATH_ST=m
+CONFIG_DM_MULTIPATH_HST=m
+CONFIG_DM_MULTIPATH_IOA=m
CONFIG_DM_DELAY=m
+CONFIG_DM_INIT=y
CONFIG_DM_UEVENT=y
CONFIG_DM_FLAKEY=m
CONFIG_DM_VERITY=m
CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG=y
CONFIG_DM_SWITCH=m
+CONFIG_DM_INTEGRITY=m
CONFIG_NETDEVICES=y
CONFIG_BONDING=m
CONFIG_DUMMY=m
@@ -447,6 +472,9 @@ CONFIG_EQUALIZER=m
CONFIG_IFB=m
CONFIG_MACVLAN=m
CONFIG_MACVTAP=m
+CONFIG_VXLAN=m
+CONFIG_BAREUDP=m
+CONFIG_AMT=m
CONFIG_TUN=m
CONFIG_VETH=m
CONFIG_VIRTIO_NET=m
@@ -460,41 +488,44 @@ CONFIG_NLMON=m
# CONFIG_NET_VENDOR_AMD is not set
# CONFIG_NET_VENDOR_AQUANTIA is not set
# CONFIG_NET_VENDOR_ARC is not set
+# CONFIG_NET_VENDOR_ASIX is not set
# CONFIG_NET_VENDOR_ATHEROS is not set
-# CONFIG_NET_VENDOR_AURORA is not set
# CONFIG_NET_VENDOR_BROADCOM is not set
-# CONFIG_NET_VENDOR_BROCADE is not set
# CONFIG_NET_VENDOR_CADENCE is not set
# CONFIG_NET_VENDOR_CAVIUM is not set
# CONFIG_NET_VENDOR_CHELSIO is not set
# CONFIG_NET_VENDOR_CISCO is not set
# CONFIG_NET_VENDOR_CORTINA is not set
+# CONFIG_NET_VENDOR_DAVICOM is not set
# CONFIG_NET_VENDOR_DEC is not set
# CONFIG_NET_VENDOR_DLINK is not set
# CONFIG_NET_VENDOR_EMULEX is not set
+# CONFIG_NET_VENDOR_ENGLEDER is not set
# CONFIG_NET_VENDOR_EZCHIP is not set
+# CONFIG_NET_VENDOR_FUNGIBLE is not set
# CONFIG_NET_VENDOR_GOOGLE is not set
-# CONFIG_NET_VENDOR_HP is not set
# CONFIG_NET_VENDOR_HUAWEI is not set
# CONFIG_NET_VENDOR_INTEL is not set
+# CONFIG_NET_VENDOR_LITEX is not set
# CONFIG_NET_VENDOR_MARVELL is not set
CONFIG_MLX4_EN=m
CONFIG_MLX5_CORE=m
CONFIG_MLX5_CORE_EN=y
-# CONFIG_MLXFW is not set
# CONFIG_NET_VENDOR_MICREL is not set
# CONFIG_NET_VENDOR_MICROCHIP is not set
# CONFIG_NET_VENDOR_MICROSEMI is not set
+# CONFIG_NET_VENDOR_MICROSOFT is not set
# CONFIG_NET_VENDOR_MYRI is not set
+# CONFIG_NET_VENDOR_NI is not set
# CONFIG_NET_VENDOR_NATSEMI is not set
# CONFIG_NET_VENDOR_NETERION is not set
# CONFIG_NET_VENDOR_NETRONOME is not set
-# CONFIG_NET_VENDOR_NI is not set
# CONFIG_NET_VENDOR_NVIDIA is not set
# CONFIG_NET_VENDOR_OKI is not set
# CONFIG_NET_VENDOR_PACKET_ENGINES is not set
# CONFIG_NET_VENDOR_PENSANDO is not set
# CONFIG_NET_VENDOR_QLOGIC is not set
+# CONFIG_NET_VENDOR_BROCADE is not set
# CONFIG_NET_VENDOR_QUALCOMM is not set
# CONFIG_NET_VENDOR_RDC is not set
# CONFIG_NET_VENDOR_REALTEK is not set
@@ -502,9 +533,9 @@ CONFIG_MLX5_CORE_EN=y
# CONFIG_NET_VENDOR_ROCKER is not set
# CONFIG_NET_VENDOR_SAMSUNG is not set
# CONFIG_NET_VENDOR_SEEQ is not set
-# CONFIG_NET_VENDOR_SOLARFLARE is not set
# CONFIG_NET_VENDOR_SILAN is not set
# CONFIG_NET_VENDOR_SIS is not set
+# CONFIG_NET_VENDOR_SOLARFLARE is not set
# CONFIG_NET_VENDOR_SMSC is not set
# CONFIG_NET_VENDOR_SOCIONEXT is not set
# CONFIG_NET_VENDOR_STMICRO is not set
@@ -512,8 +543,11 @@ CONFIG_MLX5_CORE_EN=y
# CONFIG_NET_VENDOR_SYNOPSYS is not set
# CONFIG_NET_VENDOR_TEHUTI is not set
# CONFIG_NET_VENDOR_TI is not set
+# CONFIG_NET_VENDOR_VERTEXCOM is not set
# CONFIG_NET_VENDOR_VIA is not set
+# CONFIG_NET_VENDOR_WANGXUN is not set
# CONFIG_NET_VENDOR_WIZNET is not set
+# CONFIG_NET_VENDOR_XILINX is not set
CONFIG_PPP=m
CONFIG_PPP_BSDCOMP=m
CONFIG_PPP_DEFLATE=m
@@ -531,9 +565,9 @@ CONFIG_INPUT_EVDEV=y
# CONFIG_INPUT_MOUSE is not set
# CONFIG_SERIO is not set
CONFIG_LEGACY_PTY_COUNT=0
-CONFIG_NULL_TTY=m
+# CONFIG_LEGACY_TIOCSTI is not set
+CONFIG_VIRTIO_CONSOLE=m
CONFIG_HW_RANDOM_VIRTIO=m
-CONFIG_RAW_DRIVER=m
CONFIG_HANGCHECK_TIMER=m
CONFIG_TN3270_FS=y
CONFIG_PPS=m
@@ -543,10 +577,12 @@ CONFIG_WATCHDOG=y
CONFIG_WATCHDOG_NOWAYOUT=y
CONFIG_SOFT_WATCHDOG=m
CONFIG_DIAG288_WATCHDOG=m
+# CONFIG_DRM_DEBUG_MODESET_LOCK is not set
CONFIG_FB=y
+# CONFIG_FB_DEVICE is not set
CONFIG_FRAMEBUFFER_CONSOLE=y
CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y
-# CONFIG_HID is not set
+# CONFIG_HID_SUPPORT is not set
# CONFIG_USB_SUPPORT is not set
CONFIG_INFINIBAND=m
CONFIG_INFINIBAND_USER_ACCESS=m
@@ -555,13 +591,12 @@ CONFIG_MLX5_INFINIBAND=m
CONFIG_SYNC_FILE=y
CONFIG_VFIO=m
CONFIG_VFIO_PCI=m
-CONFIG_VFIO_MDEV=m
-CONFIG_VFIO_MDEV_DEVICE=m
+CONFIG_MLX5_VFIO_PCI=m
CONFIG_VIRTIO_PCI=m
CONFIG_VIRTIO_BALLOON=m
CONFIG_VIRTIO_INPUT=y
-CONFIG_S390_CCW_IOMMU=y
-CONFIG_S390_AP_IOMMU=y
+CONFIG_VHOST_NET=m
+CONFIG_VHOST_VSOCK=m
CONFIG_EXT4_FS=y
CONFIG_EXT4_FS_POSIX_ACL=y
CONFIG_EXT4_FS_SECURITY=y
@@ -583,6 +618,9 @@ CONFIG_BTRFS_FS_POSIX_ACL=y
CONFIG_BTRFS_DEBUG=y
CONFIG_BTRFS_ASSERT=y
CONFIG_NILFS2_FS=m
+CONFIG_BCACHEFS_FS=y
+CONFIG_BCACHEFS_QUOTA=y
+CONFIG_BCACHEFS_POSIX_ACL=y
CONFIG_FS_DAX=y
CONFIG_EXPORTFS_BLOCK_OPS=y
CONFIG_FS_ENCRYPTION=y
@@ -594,12 +632,14 @@ CONFIG_QUOTA_NETLINK_INTERFACE=y
CONFIG_QUOTA_DEBUG=y
CONFIG_QFMT_V1=m
CONFIG_QFMT_V2=m
-CONFIG_AUTOFS4_FS=m
+CONFIG_AUTOFS_FS=m
CONFIG_FUSE_FS=y
CONFIG_CUSE=m
CONFIG_VIRTIO_FS=m
CONFIG_OVERLAY_FS=m
-CONFIG_FSCACHE=m
+CONFIG_NETFS_SUPPORT=m
+CONFIG_NETFS_STATS=y
+CONFIG_FSCACHE=y
CONFIG_CACHEFILES=m
CONFIG_ISO9660_FS=y
CONFIG_JOLIET=y
@@ -607,16 +647,19 @@ CONFIG_ZISOFS=y
CONFIG_UDF_FS=m
CONFIG_MSDOS_FS=m
CONFIG_VFAT_FS=m
+CONFIG_EXFAT_FS=m
CONFIG_NTFS_FS=m
CONFIG_NTFS_RW=y
CONFIG_PROC_KCORE=y
CONFIG_TMPFS=y
CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_TMPFS_INODE64=y
+CONFIG_TMPFS_QUOTA=y
CONFIG_HUGETLBFS=y
-CONFIG_CONFIGFS_FS=m
CONFIG_ECRYPT_FS=m
CONFIG_CRAMFS=m
CONFIG_SQUASHFS=m
+CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT=y
CONFIG_SQUASHFS_XATTR=y
CONFIG_SQUASHFS_LZ4=y
CONFIG_SQUASHFS_LZO=y
@@ -632,13 +675,12 @@ CONFIG_NFSD_V3_ACL=y
CONFIG_NFSD_V4=y
CONFIG_NFSD_V4_SECURITY_LABEL=y
CONFIG_CIFS=m
-CONFIG_CIFS_STATS2=y
-CONFIG_CIFS_WEAK_PW_HASH=y
CONFIG_CIFS_UPCALL=y
CONFIG_CIFS_XATTR=y
CONFIG_CIFS_POSIX=y
# CONFIG_CIFS_DEBUG is not set
CONFIG_CIFS_DFS_UPCALL=y
+CONFIG_CIFS_SWN_UPCALL=y
CONFIG_NLS_DEFAULT="utf8"
CONFIG_NLS_CODEPAGE_437=m
CONFIG_NLS_CODEPAGE_850=m
@@ -649,23 +691,26 @@ CONFIG_NLS_UTF8=m
CONFIG_DLM=m
CONFIG_UNICODE=y
CONFIG_PERSISTENT_KEYRINGS=y
-CONFIG_BIG_KEYS=y
CONFIG_ENCRYPTED_KEYS=m
+CONFIG_KEY_NOTIFICATIONS=y
CONFIG_SECURITY=y
-CONFIG_SECURITY_NETWORK=y
+CONFIG_HARDENED_USERCOPY=y
CONFIG_FORTIFY_SOURCE=y
CONFIG_SECURITY_SELINUX=y
CONFIG_SECURITY_SELINUX_BOOTPARAM=y
-CONFIG_SECURITY_SELINUX_DISABLE=y
CONFIG_SECURITY_LOCKDOWN_LSM=y
CONFIG_SECURITY_LOCKDOWN_LSM_EARLY=y
+CONFIG_SECURITY_LANDLOCK=y
CONFIG_INTEGRITY_SIGNATURE=y
CONFIG_INTEGRITY_ASYMMETRIC_KEYS=y
+CONFIG_INTEGRITY_PLATFORM_KEYRING=y
CONFIG_IMA=y
CONFIG_IMA_DEFAULT_HASH_SHA256=y
CONFIG_IMA_WRITE_POLICY=y
CONFIG_IMA_APPRAISE=y
CONFIG_LSM="yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor"
+CONFIG_INIT_STACK_NONE=y
+CONFIG_BUG_ON_DATA_CORRUPTION=y
CONFIG_CRYPTO_USER=m
# CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set
CONFIG_CRYPTO_PCRYPT=m
@@ -673,42 +718,45 @@ CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_DH=m
CONFIG_CRYPTO_ECDH=m
+CONFIG_CRYPTO_ECDSA=m
CONFIG_CRYPTO_ECRDSA=m
-CONFIG_CRYPTO_CHACHA20POLY1305=m
-CONFIG_CRYPTO_AEGIS128=m
-CONFIG_CRYPTO_CFB=m
-CONFIG_CRYPTO_LRW=m
-CONFIG_CRYPTO_PCBC=m
-CONFIG_CRYPTO_KEYWRAP=m
-CONFIG_CRYPTO_ADIANTUM=m
-CONFIG_CRYPTO_XCBC=m
-CONFIG_CRYPTO_VMAC=m
-CONFIG_CRYPTO_CRC32=m
-CONFIG_CRYPTO_XXHASH=m
-CONFIG_CRYPTO_MICHAEL_MIC=m
-CONFIG_CRYPTO_RMD128=m
-CONFIG_CRYPTO_RMD160=m
-CONFIG_CRYPTO_RMD256=m
-CONFIG_CRYPTO_RMD320=m
-CONFIG_CRYPTO_SHA3=m
-CONFIG_CRYPTO_SM3=m
-CONFIG_CRYPTO_TGR192=m
-CONFIG_CRYPTO_WP512=m
+CONFIG_CRYPTO_SM2=m
+CONFIG_CRYPTO_CURVE25519=m
CONFIG_CRYPTO_AES_TI=m
CONFIG_CRYPTO_ANUBIS=m
-CONFIG_CRYPTO_ARC4=m
+CONFIG_CRYPTO_ARIA=m
CONFIG_CRYPTO_BLOWFISH=m
CONFIG_CRYPTO_CAMELLIA=m
CONFIG_CRYPTO_CAST5=m
CONFIG_CRYPTO_CAST6=m
+CONFIG_CRYPTO_DES=m
CONFIG_CRYPTO_FCRYPT=m
CONFIG_CRYPTO_KHAZAD=m
-CONFIG_CRYPTO_SALSA20=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
-CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SM4_GENERIC=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
+CONFIG_CRYPTO_ADIANTUM=m
+CONFIG_CRYPTO_ARC4=m
+CONFIG_CRYPTO_CFB=m
+CONFIG_CRYPTO_HCTR2=m
+CONFIG_CRYPTO_KEYWRAP=m
+CONFIG_CRYPTO_LRW=m
+CONFIG_CRYPTO_PCBC=m
+CONFIG_CRYPTO_AEGIS128=m
+CONFIG_CRYPTO_CHACHA20POLY1305=m
+CONFIG_CRYPTO_GCM=y
+CONFIG_CRYPTO_SEQIV=y
+CONFIG_CRYPTO_MD4=m
+CONFIG_CRYPTO_MD5=y
+CONFIG_CRYPTO_MICHAEL_MIC=m
+CONFIG_CRYPTO_RMD160=m
+CONFIG_CRYPTO_SM3_GENERIC=m
+CONFIG_CRYPTO_VMAC=m
+CONFIG_CRYPTO_WP512=m
+CONFIG_CRYPTO_XCBC=m
+CONFIG_CRYPTO_CRC32=m
CONFIG_CRYPTO_842=m
CONFIG_CRYPTO_LZ4=m
CONFIG_CRYPTO_LZ4HC=m
@@ -719,41 +767,45 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
CONFIG_CRYPTO_STATS=y
-CONFIG_ZCRYPT=m
-CONFIG_PKEY=m
-CONFIG_CRYPTO_PAES_S390=m
+CONFIG_CRYPTO_CRC32_S390=y
+CONFIG_CRYPTO_SHA512_S390=m
CONFIG_CRYPTO_SHA1_S390=m
CONFIG_CRYPTO_SHA256_S390=m
-CONFIG_CRYPTO_SHA512_S390=m
CONFIG_CRYPTO_SHA3_256_S390=m
CONFIG_CRYPTO_SHA3_512_S390=m
-CONFIG_CRYPTO_DES_S390=m
-CONFIG_CRYPTO_AES_S390=m
CONFIG_CRYPTO_GHASH_S390=m
-CONFIG_CRYPTO_CRC32_S390=y
+CONFIG_CRYPTO_AES_S390=m
+CONFIG_CRYPTO_DES_S390=m
+CONFIG_CRYPTO_CHACHA_S390=m
+CONFIG_ZCRYPT=m
+CONFIG_PKEY=m
+CONFIG_CRYPTO_PAES_S390=m
+CONFIG_CRYPTO_DEV_VIRTIO=m
+CONFIG_SYSTEM_BLACKLIST_KEYRING=y
CONFIG_CORDIC=m
+CONFIG_CRYPTO_LIB_CURVE25519=m
+CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m
CONFIG_CRC32_SELFTEST=y
CONFIG_CRC4=m
CONFIG_CRC7=m
CONFIG_CRC8=m
CONFIG_RANDOM32_SELFTEST=y
+CONFIG_XZ_DEC_MICROLZMA=y
CONFIG_DMA_CMA=y
CONFIG_CMA_SIZE_MBYTES=0
-CONFIG_DMA_API_DEBUG=y
-CONFIG_STRING_SELFTEST=y
CONFIG_PRINTK_TIME=y
CONFIG_DYNAMIC_DEBUG=y
-CONFIG_DEBUG_INFO=y
CONFIG_DEBUG_INFO_DWARF4=y
CONFIG_GDB_SCRIPTS=y
-CONFIG_FRAME_WARN=1024
CONFIG_HEADERS_INSTALL=y
-CONFIG_HEADERS_CHECK=y
CONFIG_DEBUG_SECTION_MISMATCH=y
CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_PAGEALLOC=y
+CONFIG_SLUB_DEBUG_ON=y
CONFIG_PAGE_OWNER=y
CONFIG_DEBUG_RODATA_TEST=y
+CONFIG_DEBUG_WX=y
+CONFIG_PTDUMP_DEBUGFS=y
CONFIG_DEBUG_OBJECTS=y
CONFIG_DEBUG_OBJECTS_SELFTEST=y
CONFIG_DEBUG_OBJECTS_FREE=y
@@ -761,58 +813,77 @@ CONFIG_DEBUG_OBJECTS_TIMERS=y
CONFIG_DEBUG_OBJECTS_WORK=y
CONFIG_DEBUG_OBJECTS_RCU_HEAD=y
CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER=y
-CONFIG_SLUB_DEBUG_ON=y
-CONFIG_SLUB_STATS=y
CONFIG_DEBUG_STACK_USAGE=y
CONFIG_DEBUG_VM=y
-CONFIG_DEBUG_VM_VMACACHE=y
-CONFIG_DEBUG_VM_RB=y
CONFIG_DEBUG_VM_PGFLAGS=y
CONFIG_DEBUG_MEMORY_INIT=y
CONFIG_MEMORY_NOTIFIER_ERROR_INJECT=m
CONFIG_DEBUG_PER_CPU_MAPS=y
+CONFIG_KFENCE=y
+CONFIG_KFENCE_DEFERRABLE=y
+CONFIG_KFENCE_STATIC_KEYS=y
CONFIG_DEBUG_SHIRQ=y
+CONFIG_PANIC_ON_OOPS=y
CONFIG_DETECT_HUNG_TASK=y
CONFIG_WQ_WATCHDOG=y
-CONFIG_PANIC_ON_OOPS=y
-CONFIG_DEBUG_TIMEKEEPING=y
+CONFIG_TEST_LOCKUP=m
+CONFIG_DEBUG_PREEMPT=y
CONFIG_PROVE_LOCKING=y
CONFIG_LOCK_STAT=y
-CONFIG_DEBUG_LOCKDEP=y
CONFIG_DEBUG_ATOMIC_SLEEP=y
CONFIG_DEBUG_LOCKING_API_SELFTESTS=y
+CONFIG_DEBUG_IRQFLAGS=y
+CONFIG_DEBUG_LIST=y
CONFIG_DEBUG_SG=y
CONFIG_DEBUG_NOTIFIERS=y
-CONFIG_DEBUG_CREDENTIALS=y
CONFIG_RCU_TORTURE_TEST=m
+CONFIG_RCU_REF_SCALE_TEST=m
CONFIG_RCU_CPU_STALL_TIMEOUT=300
+# CONFIG_RCU_TRACE is not set
+CONFIG_LATENCYTOP=y
+CONFIG_BOOTTIME_TRACING=y
+CONFIG_FUNCTION_GRAPH_RETVAL=y
+CONFIG_FPROBE=y
+CONFIG_FUNCTION_PROFILER=y
+CONFIG_STACK_TRACER=y
+CONFIG_IRQSOFF_TRACER=y
+CONFIG_PREEMPT_TRACER=y
+CONFIG_SCHED_TRACER=y
+CONFIG_FTRACE_SYSCALLS=y
+CONFIG_BLK_DEV_IO_TRACE=y
+CONFIG_USER_EVENTS=y
+CONFIG_HIST_TRIGGERS=y
+CONFIG_FTRACE_STARTUP_TEST=y
+# CONFIG_EVENT_TRACE_STARTUP_TEST is not set
+CONFIG_SAMPLES=y
+CONFIG_SAMPLE_TRACE_PRINTK=m
+CONFIG_SAMPLE_FTRACE_DIRECT=m
+CONFIG_SAMPLE_FTRACE_DIRECT_MULTI=m
+CONFIG_SAMPLE_FTRACE_OPS=m
+CONFIG_DEBUG_ENTRY=y
+CONFIG_CIO_INJECT=y
+CONFIG_KUNIT=m
+CONFIG_KUNIT_DEBUGFS=y
CONFIG_NOTIFIER_ERROR_INJECTION=m
CONFIG_NETDEV_NOTIFIER_ERROR_INJECT=m
CONFIG_FAULT_INJECTION=y
CONFIG_FAILSLAB=y
CONFIG_FAIL_PAGE_ALLOC=y
+CONFIG_FAULT_INJECTION_USERCOPY=y
CONFIG_FAIL_MAKE_REQUEST=y
CONFIG_FAIL_IO_TIMEOUT=y
CONFIG_FAIL_FUTEX=y
CONFIG_FAULT_INJECTION_DEBUG_FS=y
+CONFIG_FAULT_INJECTION_CONFIGFS=y
CONFIG_FAULT_INJECTION_STACKTRACE_FILTER=y
-CONFIG_LATENCYTOP=y
-CONFIG_IRQSOFF_TRACER=y
-CONFIG_PREEMPT_TRACER=y
-CONFIG_SCHED_TRACER=y
-CONFIG_FTRACE_SYSCALLS=y
-CONFIG_STACK_TRACER=y
-CONFIG_BLK_DEV_IO_TRACE=y
-CONFIG_FUNCTION_PROFILER=y
-CONFIG_HIST_TRIGGERS=y
CONFIG_LKDTM=m
-CONFIG_TEST_LIST_SORT=y
-CONFIG_TEST_SORT=y
-CONFIG_KPROBES_SANITY_TEST=y
+CONFIG_TEST_MIN_HEAP=y
+CONFIG_KPROBES_SANITY_TEST=m
CONFIG_RBTREE_TEST=y
CONFIG_INTERVAL_TREE_TEST=m
CONFIG_PERCPU_TEST=m
CONFIG_ATOMIC64_SELFTEST=y
+CONFIG_STRING_SELFTEST=y
+CONFIG_TEST_BITOPS=m
CONFIG_TEST_BPF=m
-CONFIG_BUG_ON_DATA_CORRUPTION=y
-CONFIG_S390_PTDUMP=y
+CONFIG_TEST_LIVEPATCH=m
diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig
index 25f799849582..42b988873e54 100644
--- a/arch/s390/configs/defconfig
+++ b/arch/s390/configs/defconfig
@@ -1,8 +1,14 @@
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
+CONFIG_WATCH_QUEUE=y
CONFIG_AUDIT=y
CONFIG_NO_HZ_IDLE=y
CONFIG_HIGH_RES_TIMERS=y
+CONFIG_BPF_SYSCALL=y
+CONFIG_BPF_JIT=y
+CONFIG_BPF_JIT_ALWAYS_ON=y
+CONFIG_BPF_LSM=y
+CONFIG_SCHED_CORE=y
CONFIG_BSD_PROCESS_ACCT=y
CONFIG_BSD_PROCESS_ACCT_V3=y
CONFIG_TASKSTATS=y
@@ -13,10 +19,8 @@ CONFIG_IKCONFIG=y
CONFIG_IKCONFIG_PROC=y
CONFIG_NUMA_BALANCING=y
CONFIG_MEMCG=y
-CONFIG_MEMCG_SWAP=y
CONFIG_BLK_CGROUP=y
CONFIG_CFS_BANDWIDTH=y
-CONFIG_RT_GROUP_SCHED=y
CONFIG_CGROUP_PIDS=y
CONFIG_CGROUP_RDMA=y
CONFIG_CGROUP_FREEZER=y
@@ -26,53 +30,54 @@ CONFIG_CGROUP_DEVICE=y
CONFIG_CGROUP_CPUACCT=y
CONFIG_CGROUP_PERF=y
CONFIG_CGROUP_BPF=y
+CONFIG_CGROUP_MISC=y
CONFIG_NAMESPACES=y
CONFIG_USER_NS=y
CONFIG_CHECKPOINT_RESTORE=y
CONFIG_SCHED_AUTOGROUP=y
-CONFIG_BLK_DEV_INITRD=y
CONFIG_EXPERT=y
# CONFIG_SYSFS_SYSCALL is not set
-CONFIG_BPF_SYSCALL=y
-CONFIG_USERFAULTFD=y
-# CONFIG_COMPAT_BRK is not set
CONFIG_PROFILING=y
+CONFIG_KEXEC=y
+CONFIG_KEXEC_FILE=y
+CONFIG_KEXEC_SIG=y
+CONFIG_CRASH_DUMP=y
CONFIG_LIVEPATCH=y
-CONFIG_TUNE_ZEC12=y
+CONFIG_MARCH_Z13=y
CONFIG_NR_CPUS=512
CONFIG_NUMA=y
-# CONFIG_NUMA_EMU is not set
CONFIG_HZ_100=y
-CONFIG_KEXEC_FILE=y
-CONFIG_KEXEC_SIG=y
+CONFIG_CERT_STORE=y
CONFIG_EXPOLINE=y
CONFIG_EXPOLINE_AUTO=y
CONFIG_CHSC_SCH=y
CONFIG_VFIO_CCW=m
CONFIG_VFIO_AP=m
-CONFIG_CRASH_DUMP=y
-CONFIG_HIBERNATION=y
-CONFIG_PM_DEBUG=y
+CONFIG_PROTECTED_VIRTUALIZATION_GUEST=y
CONFIG_CMM=m
CONFIG_APPLDATA_BASE=y
+CONFIG_S390_HYPFS_FS=y
CONFIG_KVM=m
-CONFIG_VHOST_NET=m
-CONFIG_VHOST_VSOCK=m
-CONFIG_OPROFILE=m
+CONFIG_S390_UNWIND_SELFTEST=m
+CONFIG_S390_KPROBES_SANITY_TEST=m
+CONFIG_S390_MODULES_SANITY_TEST=m
CONFIG_KPROBES=y
CONFIG_JUMP_LABEL=y
+# CONFIG_GCC_PLUGINS is not set
CONFIG_MODULES=y
CONFIG_MODULE_FORCE_LOAD=y
CONFIG_MODULE_UNLOAD=y
CONFIG_MODULE_FORCE_UNLOAD=y
+CONFIG_MODULE_UNLOAD_TAINT_TRACKING=y
CONFIG_MODVERSIONS=y
CONFIG_MODULE_SRCVERSION_ALL=y
-CONFIG_MODULE_SIG_SHA256=y
-CONFIG_UNUSED_SYMBOLS=y
CONFIG_BLK_DEV_THROTTLING=y
CONFIG_BLK_WBT=y
CONFIG_BLK_CGROUP_IOLATENCY=y
CONFIG_BLK_CGROUP_IOCOST=y
+CONFIG_BLK_CGROUP_IOPRIO=y
+CONFIG_BLK_INLINE_ENCRYPTION=y
+CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK=y
CONFIG_PARTITION_ADVANCED=y
CONFIG_IBM_PARTITION=y
CONFIG_BSD_DISKLABEL=y
@@ -80,23 +85,23 @@ CONFIG_MINIX_SUBPARTITION=y
CONFIG_SOLARIS_X86_PARTITION=y
CONFIG_UNIXWARE_DISKLABEL=y
CONFIG_IOSCHED_BFQ=y
-CONFIG_BFQ_GROUP_IOSCHED=y
CONFIG_BINFMT_MISC=m
+CONFIG_ZSWAP=y
+CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y
+CONFIG_ZSMALLOC_STAT=y
+# CONFIG_COMPAT_BRK is not set
CONFIG_MEMORY_HOTPLUG=y
CONFIG_MEMORY_HOTREMOVE=y
CONFIG_KSM=y
CONFIG_TRANSPARENT_HUGEPAGE=y
-CONFIG_CLEANCACHE=y
-CONFIG_FRONTSWAP=y
+CONFIG_CMA_SYSFS=y
+CONFIG_CMA_AREAS=7
CONFIG_MEM_SOFT_DIRTY=y
-CONFIG_ZSWAP=y
-CONFIG_ZBUD=m
-CONFIG_ZSMALLOC=m
-CONFIG_ZSMALLOC_STAT=y
CONFIG_DEFERRED_STRUCT_PAGE_INIT=y
CONFIG_IDLE_PAGE_TRACKING=y
CONFIG_PERCPU_STATS=y
-CONFIG_GUP_BENCHMARK=y
+CONFIG_ANON_VMA_NAME=y
+CONFIG_USERFAULTFD=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_PACKET_DIAG=m
@@ -124,6 +129,7 @@ CONFIG_SYN_COOKIES=y
CONFIG_NET_IPVTI=m
CONFIG_INET_AH=m
CONFIG_INET_ESP=m
+CONFIG_INET_ESPINTCP=y
CONFIG_INET_IPCOMP=m
CONFIG_INET_DIAG=m
CONFIG_INET_UDP_DIAG=m
@@ -138,6 +144,7 @@ CONFIG_TCP_CONG_ILLINOIS=m
CONFIG_IPV6_ROUTER_PREF=y
CONFIG_INET6_AH=m
CONFIG_INET6_ESP=m
+CONFIG_INET6_ESPINTCP=y
CONFIG_INET6_IPCOMP=m
CONFIG_IPV6_MIP6=m
CONFIG_IPV6_VTI=m
@@ -145,9 +152,14 @@ CONFIG_IPV6_SIT=m
CONFIG_IPV6_GRE=m
CONFIG_IPV6_MULTIPLE_TABLES=y
CONFIG_IPV6_SUBTREES=y
+CONFIG_IPV6_RPL_LWTUNNEL=y
+CONFIG_MPTCP=y
CONFIG_NETFILTER=y
+CONFIG_BRIDGE_NETFILTER=m
+CONFIG_NETFILTER_NETLINK_HOOK=m
CONFIG_NF_CONNTRACK=m
CONFIG_NF_CONNTRACK_SECMARK=y
+CONFIG_NF_CONNTRACK_PROCFS=y
CONFIG_NF_CONNTRACK_EVENTS=y
CONFIG_NF_CONNTRACK_TIMEOUT=y
CONFIG_NF_CONNTRACK_TIMESTAMP=y
@@ -164,13 +176,16 @@ CONFIG_NF_CONNTRACK_TFTP=m
CONFIG_NF_CT_NETLINK=m
CONFIG_NF_CT_NETLINK_TIMEOUT=m
CONFIG_NF_TABLES=m
+CONFIG_NF_TABLES_INET=y
CONFIG_NFT_CT=m
-CONFIG_NFT_COUNTER=m
CONFIG_NFT_LOG=m
CONFIG_NFT_LIMIT=m
CONFIG_NFT_NAT=m
+CONFIG_NFT_REJECT=m
CONFIG_NFT_COMPAT=m
CONFIG_NFT_HASH=m
+CONFIG_NFT_FIB_INET=m
+CONFIG_NETFILTER_XTABLES_COMPAT=y
CONFIG_NETFILTER_XT_SET=m
CONFIG_NETFILTER_XT_TARGET_AUDIT=m
CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
@@ -259,10 +274,12 @@ CONFIG_IP_VS_DH=m
CONFIG_IP_VS_SH=m
CONFIG_IP_VS_SED=m
CONFIG_IP_VS_NQ=m
+CONFIG_IP_VS_TWOS=m
CONFIG_IP_VS_FTP=m
CONFIG_IP_VS_PE_SIP=m
-CONFIG_NF_TABLES_IPV4=y
+CONFIG_NFT_FIB_IPV4=m
CONFIG_NF_TABLES_ARP=y
+CONFIG_NF_LOG_IPV4=m
CONFIG_IP_NF_IPTABLES=m
CONFIG_IP_NF_MATCH_AH=m
CONFIG_IP_NF_MATCH_ECN=m
@@ -273,7 +290,6 @@ CONFIG_IP_NF_TARGET_REJECT=m
CONFIG_IP_NF_NAT=m
CONFIG_IP_NF_TARGET_MASQUERADE=m
CONFIG_IP_NF_MANGLE=m
-CONFIG_IP_NF_TARGET_CLUSTERIP=m
CONFIG_IP_NF_TARGET_ECN=m
CONFIG_IP_NF_TARGET_TTL=m
CONFIG_IP_NF_RAW=m
@@ -281,7 +297,7 @@ CONFIG_IP_NF_SECURITY=m
CONFIG_IP_NF_ARPTABLES=m
CONFIG_IP_NF_ARPFILTER=m
CONFIG_IP_NF_ARP_MANGLE=m
-CONFIG_NF_TABLES_IPV6=y
+CONFIG_NFT_FIB_IPV6=m
CONFIG_IP6_NF_IPTABLES=m
CONFIG_IP6_NF_MATCH_AH=m
CONFIG_IP6_NF_MATCH_EUI64=m
@@ -309,11 +325,11 @@ CONFIG_L2TP_DEBUGFS=m
CONFIG_L2TP_V3=y
CONFIG_L2TP_IP=m
CONFIG_L2TP_ETH=m
-CONFIG_BRIDGE=m
+CONFIG_BRIDGE=y
+CONFIG_BRIDGE_MRP=y
CONFIG_VLAN_8021Q=m
CONFIG_VLAN_8021Q_GVRP=y
CONFIG_NET_SCHED=y
-CONFIG_NET_SCH_CBQ=m
CONFIG_NET_SCH_HTB=m
CONFIG_NET_SCH_HFSC=m
CONFIG_NET_SCH_PRIO=m
@@ -324,7 +340,6 @@ CONFIG_NET_SCH_SFQ=m
CONFIG_NET_SCH_TEQL=m
CONFIG_NET_SCH_TBF=m
CONFIG_NET_SCH_GRED=m
-CONFIG_NET_SCH_DSMARK=m
CONFIG_NET_SCH_NETEM=m
CONFIG_NET_SCH_DRR=m
CONFIG_NET_SCH_MQPRIO=m
@@ -334,15 +349,13 @@ CONFIG_NET_SCH_CODEL=m
CONFIG_NET_SCH_FQ_CODEL=m
CONFIG_NET_SCH_INGRESS=m
CONFIG_NET_SCH_PLUG=m
+CONFIG_NET_SCH_ETS=m
CONFIG_NET_CLS_BASIC=m
-CONFIG_NET_CLS_TCINDEX=m
CONFIG_NET_CLS_ROUTE4=m
CONFIG_NET_CLS_FW=m
CONFIG_NET_CLS_U32=m
CONFIG_CLS_U32_PERF=y
CONFIG_CLS_U32_MARK=y
-CONFIG_NET_CLS_RSVP=m
-CONFIG_NET_CLS_RSVP6=m
CONFIG_NET_CLS_FLOW=m
CONFIG_NET_CLS_CGROUP=y
CONFIG_NET_CLS_BPF=m
@@ -351,35 +364,37 @@ CONFIG_NET_ACT_POLICE=m
CONFIG_NET_ACT_GACT=m
CONFIG_GACT_PROB=y
CONFIG_NET_ACT_MIRRED=m
-CONFIG_NET_ACT_IPT=m
CONFIG_NET_ACT_NAT=m
CONFIG_NET_ACT_PEDIT=m
CONFIG_NET_ACT_SIMP=m
CONFIG_NET_ACT_SKBEDIT=m
CONFIG_NET_ACT_CSUM=m
+CONFIG_NET_ACT_GATE=m
+CONFIG_NET_TC_SKB_EXT=y
CONFIG_DNS_RESOLVER=y
CONFIG_OPENVSWITCH=m
CONFIG_VSOCKETS=m
CONFIG_VIRTIO_VSOCKETS=m
CONFIG_NETLINK_DIAG=m
+CONFIG_NET_SWITCHDEV=y
CONFIG_CGROUP_NET_PRIO=y
-CONFIG_BPF_JIT=y
CONFIG_NET_PKTGEN=m
-# CONFIG_NET_DROP_MONITOR is not set
CONFIG_PCI=y
+# CONFIG_PCIEASPM is not set
+CONFIG_PCI_IOV=y
CONFIG_HOTPLUG_PCI=y
CONFIG_HOTPLUG_PCI_S390=y
CONFIG_UEVENT_HELPER=y
CONFIG_DEVTMPFS=y
+CONFIG_DEVTMPFS_SAFE=y
+# CONFIG_FW_LOADER is not set
CONFIG_CONNECTOR=y
-CONFIG_ZRAM=m
+CONFIG_ZRAM=y
CONFIG_BLK_DEV_LOOP=m
-CONFIG_BLK_DEV_CRYPTOLOOP=m
CONFIG_BLK_DEV_DRBD=m
CONFIG_BLK_DEV_NBD=m
CONFIG_BLK_DEV_RAM=y
CONFIG_BLK_DEV_RAM_SIZE=32768
-# CONFIG_BLK_DEV_XPRAM is not set
CONFIG_VIRTIO_BLK=y
CONFIG_BLK_DEV_RBD=m
CONFIG_BLK_DEV_NVME=m
@@ -396,12 +411,12 @@ CONFIG_SCSI_ENCLOSURE=m
CONFIG_SCSI_CONSTANTS=y
CONFIG_SCSI_LOGGING=y
CONFIG_SCSI_SPI_ATTRS=m
-CONFIG_SCSI_FC_ATTRS=y
+CONFIG_SCSI_FC_ATTRS=m
CONFIG_SCSI_SAS_LIBSAS=m
CONFIG_SCSI_SRP_ATTRS=m
CONFIG_ISCSI_TCP=m
CONFIG_SCSI_DEBUG=m
-CONFIG_ZFCP=y
+CONFIG_ZFCP=m
CONFIG_SCSI_VIRTIO=m
CONFIG_SCSI_DH=y
CONFIG_SCSI_DH_RDAC=m
@@ -410,12 +425,13 @@ CONFIG_SCSI_DH_EMC=m
CONFIG_SCSI_DH_ALUA=m
CONFIG_MD=y
CONFIG_BLK_DEV_MD=y
+# CONFIG_MD_BITMAP_FILE is not set
CONFIG_MD_LINEAR=m
CONFIG_MD_MULTIPATH=m
CONFIG_MD_FAULTY=m
CONFIG_MD_CLUSTER=m
CONFIG_BCACHE=m
-CONFIG_BLK_DEV_DM=m
+CONFIG_BLK_DEV_DM=y
CONFIG_DM_UNSTRIPED=m
CONFIG_DM_CRYPT=m
CONFIG_DM_SNAPSHOT=m
@@ -429,7 +445,10 @@ CONFIG_DM_ZERO=m
CONFIG_DM_MULTIPATH=m
CONFIG_DM_MULTIPATH_QL=m
CONFIG_DM_MULTIPATH_ST=m
+CONFIG_DM_MULTIPATH_HST=m
+CONFIG_DM_MULTIPATH_IOA=m
CONFIG_DM_DELAY=m
+CONFIG_DM_INIT=y
CONFIG_DM_UEVENT=y
CONFIG_DM_FLAKEY=m
CONFIG_DM_VERITY=m
@@ -443,6 +462,9 @@ CONFIG_EQUALIZER=m
CONFIG_IFB=m
CONFIG_MACVLAN=m
CONFIG_MACVTAP=m
+CONFIG_VXLAN=m
+CONFIG_BAREUDP=m
+CONFIG_AMT=m
CONFIG_TUN=m
CONFIG_VETH=m
CONFIG_VIRTIO_NET=m
@@ -456,41 +478,44 @@ CONFIG_NLMON=m
# CONFIG_NET_VENDOR_AMD is not set
# CONFIG_NET_VENDOR_AQUANTIA is not set
# CONFIG_NET_VENDOR_ARC is not set
+# CONFIG_NET_VENDOR_ASIX is not set
# CONFIG_NET_VENDOR_ATHEROS is not set
-# CONFIG_NET_VENDOR_AURORA is not set
# CONFIG_NET_VENDOR_BROADCOM is not set
-# CONFIG_NET_VENDOR_BROCADE is not set
# CONFIG_NET_VENDOR_CADENCE is not set
# CONFIG_NET_VENDOR_CAVIUM is not set
# CONFIG_NET_VENDOR_CHELSIO is not set
# CONFIG_NET_VENDOR_CISCO is not set
# CONFIG_NET_VENDOR_CORTINA is not set
+# CONFIG_NET_VENDOR_DAVICOM is not set
# CONFIG_NET_VENDOR_DEC is not set
# CONFIG_NET_VENDOR_DLINK is not set
# CONFIG_NET_VENDOR_EMULEX is not set
+# CONFIG_NET_VENDOR_ENGLEDER is not set
# CONFIG_NET_VENDOR_EZCHIP is not set
+# CONFIG_NET_VENDOR_FUNGIBLE is not set
# CONFIG_NET_VENDOR_GOOGLE is not set
-# CONFIG_NET_VENDOR_HP is not set
# CONFIG_NET_VENDOR_HUAWEI is not set
# CONFIG_NET_VENDOR_INTEL is not set
+# CONFIG_NET_VENDOR_LITEX is not set
# CONFIG_NET_VENDOR_MARVELL is not set
CONFIG_MLX4_EN=m
CONFIG_MLX5_CORE=m
CONFIG_MLX5_CORE_EN=y
-# CONFIG_MLXFW is not set
# CONFIG_NET_VENDOR_MICREL is not set
# CONFIG_NET_VENDOR_MICROCHIP is not set
# CONFIG_NET_VENDOR_MICROSEMI is not set
+# CONFIG_NET_VENDOR_MICROSOFT is not set
# CONFIG_NET_VENDOR_MYRI is not set
+# CONFIG_NET_VENDOR_NI is not set
# CONFIG_NET_VENDOR_NATSEMI is not set
# CONFIG_NET_VENDOR_NETERION is not set
# CONFIG_NET_VENDOR_NETRONOME is not set
-# CONFIG_NET_VENDOR_NI is not set
# CONFIG_NET_VENDOR_NVIDIA is not set
# CONFIG_NET_VENDOR_OKI is not set
# CONFIG_NET_VENDOR_PACKET_ENGINES is not set
# CONFIG_NET_VENDOR_PENSANDO is not set
# CONFIG_NET_VENDOR_QLOGIC is not set
+# CONFIG_NET_VENDOR_BROCADE is not set
# CONFIG_NET_VENDOR_QUALCOMM is not set
# CONFIG_NET_VENDOR_RDC is not set
# CONFIG_NET_VENDOR_REALTEK is not set
@@ -498,9 +523,9 @@ CONFIG_MLX5_CORE_EN=y
# CONFIG_NET_VENDOR_ROCKER is not set
# CONFIG_NET_VENDOR_SAMSUNG is not set
# CONFIG_NET_VENDOR_SEEQ is not set
-# CONFIG_NET_VENDOR_SOLARFLARE is not set
# CONFIG_NET_VENDOR_SILAN is not set
# CONFIG_NET_VENDOR_SIS is not set
+# CONFIG_NET_VENDOR_SOLARFLARE is not set
# CONFIG_NET_VENDOR_SMSC is not set
# CONFIG_NET_VENDOR_SOCIONEXT is not set
# CONFIG_NET_VENDOR_STMICRO is not set
@@ -508,8 +533,11 @@ CONFIG_MLX5_CORE_EN=y
# CONFIG_NET_VENDOR_SYNOPSYS is not set
# CONFIG_NET_VENDOR_TEHUTI is not set
# CONFIG_NET_VENDOR_TI is not set
+# CONFIG_NET_VENDOR_VERTEXCOM is not set
# CONFIG_NET_VENDOR_VIA is not set
+# CONFIG_NET_VENDOR_WANGXUN is not set
# CONFIG_NET_VENDOR_WIZNET is not set
+# CONFIG_NET_VENDOR_XILINX is not set
CONFIG_PPP=m
CONFIG_PPP_BSDCOMP=m
CONFIG_PPP_DEFLATE=m
@@ -527,9 +555,9 @@ CONFIG_INPUT_EVDEV=y
# CONFIG_INPUT_MOUSE is not set
# CONFIG_SERIO is not set
CONFIG_LEGACY_PTY_COUNT=0
-CONFIG_NULL_TTY=m
+# CONFIG_LEGACY_TIOCSTI is not set
+CONFIG_VIRTIO_CONSOLE=m
CONFIG_HW_RANDOM_VIRTIO=m
-CONFIG_RAW_DRIVER=m
CONFIG_HANGCHECK_TIMER=m
CONFIG_TN3270_FS=y
# CONFIG_PTP_1588_CLOCK is not set
@@ -540,9 +568,10 @@ CONFIG_WATCHDOG_NOWAYOUT=y
CONFIG_SOFT_WATCHDOG=m
CONFIG_DIAG288_WATCHDOG=m
CONFIG_FB=y
+# CONFIG_FB_DEVICE is not set
CONFIG_FRAMEBUFFER_CONSOLE=y
CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y
-# CONFIG_HID is not set
+# CONFIG_HID_SUPPORT is not set
# CONFIG_USB_SUPPORT is not set
CONFIG_INFINIBAND=m
CONFIG_INFINIBAND_USER_ACCESS=m
@@ -551,13 +580,12 @@ CONFIG_MLX5_INFINIBAND=m
CONFIG_SYNC_FILE=y
CONFIG_VFIO=m
CONFIG_VFIO_PCI=m
-CONFIG_VFIO_MDEV=m
-CONFIG_VFIO_MDEV_DEVICE=m
+CONFIG_MLX5_VFIO_PCI=m
CONFIG_VIRTIO_PCI=m
CONFIG_VIRTIO_BALLOON=m
CONFIG_VIRTIO_INPUT=y
-CONFIG_S390_CCW_IOMMU=y
-CONFIG_S390_AP_IOMMU=y
+CONFIG_VHOST_NET=m
+CONFIG_VHOST_VSOCK=m
CONFIG_EXT4_FS=y
CONFIG_EXT4_FS_POSIX_ACL=y
CONFIG_EXT4_FS_SECURITY=y
@@ -576,6 +604,9 @@ CONFIG_OCFS2_FS=m
CONFIG_BTRFS_FS=y
CONFIG_BTRFS_FS_POSIX_ACL=y
CONFIG_NILFS2_FS=m
+CONFIG_BCACHEFS_FS=m
+CONFIG_BCACHEFS_QUOTA=y
+CONFIG_BCACHEFS_POSIX_ACL=y
CONFIG_FS_DAX=y
CONFIG_EXPORTFS_BLOCK_OPS=y
CONFIG_FS_ENCRYPTION=y
@@ -586,12 +617,14 @@ CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
CONFIG_QFMT_V1=m
CONFIG_QFMT_V2=m
-CONFIG_AUTOFS4_FS=m
+CONFIG_AUTOFS_FS=m
CONFIG_FUSE_FS=y
CONFIG_CUSE=m
CONFIG_VIRTIO_FS=m
CONFIG_OVERLAY_FS=m
-CONFIG_FSCACHE=m
+CONFIG_NETFS_SUPPORT=m
+CONFIG_NETFS_STATS=y
+CONFIG_FSCACHE=y
CONFIG_CACHEFILES=m
CONFIG_ISO9660_FS=y
CONFIG_JOLIET=y
@@ -599,16 +632,20 @@ CONFIG_ZISOFS=y
CONFIG_UDF_FS=m
CONFIG_MSDOS_FS=m
CONFIG_VFAT_FS=m
+CONFIG_EXFAT_FS=m
CONFIG_NTFS_FS=m
CONFIG_NTFS_RW=y
CONFIG_PROC_KCORE=y
CONFIG_TMPFS=y
CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_TMPFS_INODE64=y
+CONFIG_TMPFS_QUOTA=y
CONFIG_HUGETLBFS=y
CONFIG_CONFIGFS_FS=m
CONFIG_ECRYPT_FS=m
CONFIG_CRAMFS=m
CONFIG_SQUASHFS=m
+CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT=y
CONFIG_SQUASHFS_XATTR=y
CONFIG_SQUASHFS_LZ4=y
CONFIG_SQUASHFS_LZO=y
@@ -624,13 +661,12 @@ CONFIG_NFSD_V3_ACL=y
CONFIG_NFSD_V4=y
CONFIG_NFSD_V4_SECURITY_LABEL=y
CONFIG_CIFS=m
-CONFIG_CIFS_STATS2=y
-CONFIG_CIFS_WEAK_PW_HASH=y
CONFIG_CIFS_UPCALL=y
CONFIG_CIFS_XATTR=y
CONFIG_CIFS_POSIX=y
# CONFIG_CIFS_DEBUG is not set
CONFIG_CIFS_DFS_UPCALL=y
+CONFIG_CIFS_SWN_UPCALL=y
CONFIG_NLS_DEFAULT="utf8"
CONFIG_NLS_CODEPAGE_437=m
CONFIG_NLS_CODEPAGE_850=m
@@ -641,22 +677,24 @@ CONFIG_NLS_UTF8=m
CONFIG_DLM=m
CONFIG_UNICODE=y
CONFIG_PERSISTENT_KEYRINGS=y
-CONFIG_BIG_KEYS=y
CONFIG_ENCRYPTED_KEYS=m
+CONFIG_KEY_NOTIFICATIONS=y
CONFIG_SECURITY=y
-CONFIG_SECURITY_NETWORK=y
CONFIG_SECURITY_SELINUX=y
CONFIG_SECURITY_SELINUX_BOOTPARAM=y
-CONFIG_SECURITY_SELINUX_DISABLE=y
CONFIG_SECURITY_LOCKDOWN_LSM=y
CONFIG_SECURITY_LOCKDOWN_LSM_EARLY=y
+CONFIG_SECURITY_LANDLOCK=y
CONFIG_INTEGRITY_SIGNATURE=y
CONFIG_INTEGRITY_ASYMMETRIC_KEYS=y
+CONFIG_INTEGRITY_PLATFORM_KEYRING=y
CONFIG_IMA=y
CONFIG_IMA_DEFAULT_HASH_SHA256=y
CONFIG_IMA_WRITE_POLICY=y
CONFIG_IMA_APPRAISE=y
CONFIG_LSM="yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor"
+CONFIG_INIT_STACK_NONE=y
+CONFIG_BUG_ON_DATA_CORRUPTION=y
CONFIG_CRYPTO_FIPS=y
CONFIG_CRYPTO_USER=m
# CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set
@@ -665,43 +703,46 @@ CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_DH=m
CONFIG_CRYPTO_ECDH=m
+CONFIG_CRYPTO_ECDSA=m
CONFIG_CRYPTO_ECRDSA=m
-CONFIG_CRYPTO_CHACHA20POLY1305=m
-CONFIG_CRYPTO_AEGIS128=m
-CONFIG_CRYPTO_CFB=m
-CONFIG_CRYPTO_LRW=m
-CONFIG_CRYPTO_OFB=m
-CONFIG_CRYPTO_PCBC=m
-CONFIG_CRYPTO_KEYWRAP=m
-CONFIG_CRYPTO_ADIANTUM=m
-CONFIG_CRYPTO_XCBC=m
-CONFIG_CRYPTO_VMAC=m
-CONFIG_CRYPTO_CRC32=m
-CONFIG_CRYPTO_XXHASH=m
-CONFIG_CRYPTO_MICHAEL_MIC=m
-CONFIG_CRYPTO_RMD128=m
-CONFIG_CRYPTO_RMD160=m
-CONFIG_CRYPTO_RMD256=m
-CONFIG_CRYPTO_RMD320=m
-CONFIG_CRYPTO_SHA3=m
-CONFIG_CRYPTO_SM3=m
-CONFIG_CRYPTO_TGR192=m
-CONFIG_CRYPTO_WP512=m
+CONFIG_CRYPTO_SM2=m
+CONFIG_CRYPTO_CURVE25519=m
CONFIG_CRYPTO_AES_TI=m
CONFIG_CRYPTO_ANUBIS=m
-CONFIG_CRYPTO_ARC4=m
+CONFIG_CRYPTO_ARIA=m
CONFIG_CRYPTO_BLOWFISH=m
CONFIG_CRYPTO_CAMELLIA=m
CONFIG_CRYPTO_CAST5=m
CONFIG_CRYPTO_CAST6=m
+CONFIG_CRYPTO_DES=m
CONFIG_CRYPTO_FCRYPT=m
CONFIG_CRYPTO_KHAZAD=m
-CONFIG_CRYPTO_SALSA20=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
-CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SM4_GENERIC=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
+CONFIG_CRYPTO_ADIANTUM=m
+CONFIG_CRYPTO_ARC4=m
+CONFIG_CRYPTO_CFB=m
+CONFIG_CRYPTO_HCTR2=m
+CONFIG_CRYPTO_KEYWRAP=m
+CONFIG_CRYPTO_LRW=m
+CONFIG_CRYPTO_OFB=m
+CONFIG_CRYPTO_PCBC=m
+CONFIG_CRYPTO_AEGIS128=m
+CONFIG_CRYPTO_CHACHA20POLY1305=m
+CONFIG_CRYPTO_GCM=y
+CONFIG_CRYPTO_SEQIV=y
+CONFIG_CRYPTO_MD4=m
+CONFIG_CRYPTO_MD5=y
+CONFIG_CRYPTO_MICHAEL_MIC=m
+CONFIG_CRYPTO_RMD160=m
+CONFIG_CRYPTO_SM3_GENERIC=m
+CONFIG_CRYPTO_VMAC=m
+CONFIG_CRYPTO_WP512=m
+CONFIG_CRYPTO_XCBC=m
+CONFIG_CRYPTO_CRC32=m
CONFIG_CRYPTO_842=m
CONFIG_CRYPTO_LZ4=m
CONFIG_CRYPTO_LZ4HC=m
@@ -712,45 +753,66 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
CONFIG_CRYPTO_STATS=y
-CONFIG_ZCRYPT=m
-CONFIG_PKEY=m
-CONFIG_CRYPTO_PAES_S390=m
+CONFIG_CRYPTO_CRC32_S390=y
+CONFIG_CRYPTO_SHA512_S390=m
CONFIG_CRYPTO_SHA1_S390=m
CONFIG_CRYPTO_SHA256_S390=m
-CONFIG_CRYPTO_SHA512_S390=m
CONFIG_CRYPTO_SHA3_256_S390=m
CONFIG_CRYPTO_SHA3_512_S390=m
-CONFIG_CRYPTO_DES_S390=m
-CONFIG_CRYPTO_AES_S390=m
CONFIG_CRYPTO_GHASH_S390=m
-CONFIG_CRYPTO_CRC32_S390=y
+CONFIG_CRYPTO_AES_S390=m
+CONFIG_CRYPTO_DES_S390=m
+CONFIG_CRYPTO_CHACHA_S390=m
+CONFIG_ZCRYPT=m
+CONFIG_PKEY=m
+CONFIG_CRYPTO_PAES_S390=m
+CONFIG_CRYPTO_DEV_VIRTIO=m
+CONFIG_SYSTEM_BLACKLIST_KEYRING=y
CONFIG_CORDIC=m
+CONFIG_PRIME_NUMBERS=m
+CONFIG_CRYPTO_LIB_CURVE25519=m
+CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m
CONFIG_CRC4=m
CONFIG_CRC7=m
CONFIG_CRC8=m
+CONFIG_XZ_DEC_MICROLZMA=y
CONFIG_DMA_CMA=y
CONFIG_CMA_SIZE_MBYTES=0
CONFIG_PRINTK_TIME=y
-CONFIG_DEBUG_INFO=y
+CONFIG_DYNAMIC_DEBUG=y
CONFIG_DEBUG_INFO_DWARF4=y
CONFIG_GDB_SCRIPTS=y
-CONFIG_FRAME_WARN=1024
CONFIG_DEBUG_SECTION_MISMATCH=y
CONFIG_MAGIC_SYSRQ=y
+CONFIG_DEBUG_WX=y
+CONFIG_PTDUMP_DEBUGFS=y
CONFIG_DEBUG_MEMORY_INIT=y
CONFIG_PANIC_ON_OOPS=y
+CONFIG_TEST_LOCKUP=m
CONFIG_RCU_TORTURE_TEST=m
+CONFIG_RCU_REF_SCALE_TEST=m
CONFIG_RCU_CPU_STALL_TIMEOUT=60
CONFIG_LATENCYTOP=y
+CONFIG_BOOTTIME_TRACING=y
+CONFIG_FUNCTION_GRAPH_RETVAL=y
+CONFIG_FPROBE=y
+CONFIG_FUNCTION_PROFILER=y
+CONFIG_STACK_TRACER=y
CONFIG_SCHED_TRACER=y
CONFIG_FTRACE_SYSCALLS=y
-CONFIG_STACK_TRACER=y
CONFIG_BLK_DEV_IO_TRACE=y
-CONFIG_FUNCTION_PROFILER=y
+CONFIG_USER_EVENTS=y
CONFIG_HIST_TRIGGERS=y
+CONFIG_SAMPLES=y
+CONFIG_SAMPLE_TRACE_PRINTK=m
+CONFIG_SAMPLE_FTRACE_DIRECT=m
+CONFIG_SAMPLE_FTRACE_DIRECT_MULTI=m
+CONFIG_SAMPLE_FTRACE_OPS=m
+CONFIG_KUNIT=m
+CONFIG_KUNIT_DEBUGFS=y
CONFIG_LKDTM=m
+CONFIG_KPROBES_SANITY_TEST=m
CONFIG_PERCPU_TEST=m
CONFIG_ATOMIC64_SELFTEST=y
CONFIG_TEST_BPF=m
-CONFIG_BUG_ON_DATA_CORRUPTION=y
-CONFIG_S390_PTDUMP=y
+CONFIG_TEST_LIVEPATCH=m
diff --git a/arch/s390/configs/kasan.config b/arch/s390/configs/kasan.config
new file mode 100644
index 000000000000..84c2b551e992
--- /dev/null
+++ b/arch/s390/configs/kasan.config
@@ -0,0 +1,4 @@
+# Help: Enable KASan for debugging
+CONFIG_KASAN=y
+CONFIG_KASAN_INLINE=y
+CONFIG_KASAN_VMALLOC=y
diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig
index 20c51e5d9353..30d2a1687665 100644
--- a/arch/s390/configs/zfcpdump_defconfig
+++ b/arch/s390/configs/zfcpdump_defconfig
@@ -1,38 +1,39 @@
-# CONFIG_SWAP is not set
CONFIG_NO_HZ_IDLE=y
CONFIG_HIGH_RES_TIMERS=y
+CONFIG_BPF_SYSCALL=y
# CONFIG_CPU_ISOLATION is not set
# CONFIG_UTS_NS is not set
+# CONFIG_TIME_NS is not set
# CONFIG_PID_NS is not set
# CONFIG_NET_NS is not set
CONFIG_BLK_DEV_INITRD=y
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-# CONFIG_COMPAT_BRK is not set
-CONFIG_TUNE_ZEC12=y
-# CONFIG_COMPAT is not set
+CONFIG_CRASH_DUMP=y
+CONFIG_MARCH_Z13=y
CONFIG_NR_CPUS=2
CONFIG_HZ_100=y
-# CONFIG_ARCH_RANDOM is not set
-# CONFIG_RELOCATABLE is not set
# CONFIG_CHSC_SCH is not set
# CONFIG_SCM_BUS is not set
-CONFIG_CRASH_DUMP=y
-# CONFIG_SECCOMP is not set
# CONFIG_PFAULT is not set
-# CONFIG_S390_HYPFS_FS is not set
+# CONFIG_S390_HYPFS is not set
# CONFIG_VIRTUALIZATION is not set
# CONFIG_S390_GUEST is not set
+# CONFIG_SECCOMP is not set
+# CONFIG_GCC_PLUGINS is not set
+# CONFIG_BLOCK_LEGACY_AUTOLOAD is not set
CONFIG_PARTITION_ADVANCED=y
-CONFIG_IBM_PARTITION=y
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+# CONFIG_SWAP is not set
+# CONFIG_COMPAT_BRK is not set
# CONFIG_COMPACTION is not set
# CONFIG_MIGRATION is not set
-# CONFIG_BOUNCE is not set
CONFIG_NET=y
# CONFIG_IUCV is not set
+# CONFIG_PCPU_DEV_REFCNT is not set
+# CONFIG_ETHTOOL_NETLINK is not set
CONFIG_DEVTMPFS=y
+CONFIG_DEVTMPFS_SAFE=y
CONFIG_BLK_DEV_RAM=y
-# CONFIG_BLK_DEV_XPRAM is not set
# CONFIG_DCSSBLK is not set
# CONFIG_DASD is not set
CONFIG_ENCLOSURE_SERVICES=y
@@ -46,28 +47,34 @@ CONFIG_ZFCP=y
# CONFIG_INPUT_KEYBOARD is not set
# CONFIG_INPUT_MOUSE is not set
# CONFIG_SERIO is not set
+# CONFIG_LEGACY_TIOCSTI is not set
# CONFIG_HVC_IUCV is not set
# CONFIG_HW_RANDOM_S390 is not set
-CONFIG_RAW_DRIVER=y
# CONFIG_HMC_DRV is not set
# CONFIG_S390_TAPE is not set
# CONFIG_VMCP is not set
# CONFIG_MONWRITER is not set
# CONFIG_S390_VMUR is not set
-# CONFIG_HID is not set
+# CONFIG_HID_SUPPORT is not set
+# CONFIG_VIRTIO_MENU is not set
+# CONFIG_VHOST_MENU is not set
# CONFIG_IOMMU_SUPPORT is not set
# CONFIG_DNOTIFY is not set
# CONFIG_INOTIFY_USER is not set
-CONFIG_CONFIGFS_FS=y
# CONFIG_MISC_FILESYSTEMS is not set
# CONFIG_NETWORK_FILESYSTEMS is not set
CONFIG_LSM="yama,loadpin,safesetid,integrity"
+CONFIG_INIT_STACK_NONE=y
+# CONFIG_ZLIB_DFLTCC is not set
+CONFIG_XZ_DEC_MICROLZMA=y
CONFIG_PRINTK_TIME=y
-CONFIG_DEBUG_INFO=y
-CONFIG_DEBUG_FS=y
+# CONFIG_SYMBOLIC_ERRNAME is not set
CONFIG_DEBUG_KERNEL=y
+CONFIG_DEBUG_INFO_DWARF4=y
+CONFIG_DEBUG_FS=y
CONFIG_PANIC_ON_OOPS=y
# CONFIG_SCHED_DEBUG is not set
CONFIG_RCU_CPU_STALL_TIMEOUT=60
+# CONFIG_RCU_TRACE is not set
# CONFIG_FTRACE is not set
# CONFIG_RUNTIME_TESTING_MENU is not set
diff --git a/arch/s390/crypto/Kconfig b/arch/s390/crypto/Kconfig
new file mode 100644
index 000000000000..06ee706b0d78
--- /dev/null
+++ b/arch/s390/crypto/Kconfig
@@ -0,0 +1,135 @@
+# SPDX-License-Identifier: GPL-2.0
+
+menu "Accelerated Cryptographic Algorithms for CPU (s390)"
+
+config CRYPTO_CRC32_S390
+ tristate "CRC32c and CRC32"
+ depends on S390
+ select CRYPTO_HASH
+ select CRC32
+ help
+ CRC32c and CRC32 CRC algorithms
+
+ Architecture: s390
+
+ It is available with IBM z13 or later.
+
+config CRYPTO_SHA512_S390
+ tristate "Hash functions: SHA-384 and SHA-512"
+ depends on S390
+ select CRYPTO_HASH
+ help
+ SHA-384 and SHA-512 secure hash algorithms (FIPS 180)
+
+ Architecture: s390
+
+ It is available as of z10.
+
+config CRYPTO_SHA1_S390
+ tristate "Hash functions: SHA-1"
+ depends on S390
+ select CRYPTO_HASH
+ help
+ SHA-1 secure hash algorithm (FIPS 180)
+
+ Architecture: s390
+
+ It is available as of z990.
+
+config CRYPTO_SHA256_S390
+ tristate "Hash functions: SHA-224 and SHA-256"
+ depends on S390
+ select CRYPTO_HASH
+ help
+ SHA-224 and SHA-256 secure hash algorithms (FIPS 180)
+
+ Architecture: s390
+
+ It is available as of z9.
+
+config CRYPTO_SHA3_256_S390
+ tristate "Hash functions: SHA3-224 and SHA3-256"
+ depends on S390
+ select CRYPTO_HASH
+ help
+ SHA3-224 and SHA3-256 secure hash algorithms (FIPS 202)
+
+ Architecture: s390
+
+ It is available as of z14.
+
+config CRYPTO_SHA3_512_S390
+ tristate "Hash functions: SHA3-384 and SHA3-512"
+ depends on S390
+ select CRYPTO_HASH
+ help
+ SHA3-384 and SHA3-512 secure hash algorithms (FIPS 202)
+
+ Architecture: s390
+
+ It is available as of z14.
+
+config CRYPTO_GHASH_S390
+ tristate "Hash functions: GHASH"
+ depends on S390
+ select CRYPTO_HASH
+ help
+ GCM GHASH hash function (NIST SP800-38D)
+
+ Architecture: s390
+
+ It is available as of z196.
+
+config CRYPTO_AES_S390
+ tristate "Ciphers: AES, modes: ECB, CBC, CTR, XTS, GCM"
+ depends on S390
+ select CRYPTO_ALGAPI
+ select CRYPTO_SKCIPHER
+ help
+ Block cipher: AES cipher algorithms (FIPS 197)
+ AEAD cipher: AES with GCM
+ Length-preserving ciphers: AES with ECB, CBC, XTS, and CTR modes
+
+ Architecture: s390
+
+ As of z9 the ECB and CBC modes are hardware accelerated
+ for 128 bit keys.
+
+ As of z10 the ECB and CBC modes are hardware accelerated
+ for all AES key sizes.
+
+ As of z196 the CTR mode is hardware accelerated for all AES
+ key sizes and XTS mode is hardware accelerated for 256 and
+ 512 bit keys.
+
+config CRYPTO_DES_S390
+ tristate "Ciphers: DES and Triple DES EDE, modes: ECB, CBC, CTR"
+ depends on S390
+ select CRYPTO_ALGAPI
+ select CRYPTO_SKCIPHER
+ select CRYPTO_LIB_DES
+ help
+ Block ciphers: DES (FIPS 46-2) cipher algorithm
+ Block ciphers: Triple DES EDE (FIPS 46-3) cipher algorithm
+ Length-preserving ciphers: DES with ECB, CBC, and CTR modes
+ Length-preserving ciphers: Triple DES EDE with ECB, CBC, and CTR modes
+
+ Architecture: s390
+
+ As of z990 the ECB and CBC modes are hardware accelerated.
+ As of z196 the CTR mode is hardware accelerated.
+
+config CRYPTO_CHACHA_S390
+ tristate "Ciphers: ChaCha20"
+ depends on S390
+ select CRYPTO_SKCIPHER
+ select CRYPTO_LIB_CHACHA_GENERIC
+ select CRYPTO_ARCH_HAVE_LIB_CHACHA
+ help
+ Length-preserving cipher: ChaCha20 stream cipher (RFC 7539)
+
+ Architecture: s390
+
+ It is available as of z13.
+
+endmenu
diff --git a/arch/s390/crypto/Makefile b/arch/s390/crypto/Makefile
index 12889d4652cc..1b1cc478fa94 100644
--- a/arch/s390/crypto/Makefile
+++ b/arch/s390/crypto/Makefile
@@ -11,9 +11,11 @@ obj-$(CONFIG_CRYPTO_SHA3_512_S390) += sha3_512_s390.o sha_common.o
obj-$(CONFIG_CRYPTO_DES_S390) += des_s390.o
obj-$(CONFIG_CRYPTO_AES_S390) += aes_s390.o
obj-$(CONFIG_CRYPTO_PAES_S390) += paes_s390.o
+obj-$(CONFIG_CRYPTO_CHACHA_S390) += chacha_s390.o
obj-$(CONFIG_S390_PRNG) += prng.o
obj-$(CONFIG_CRYPTO_GHASH_S390) += ghash_s390.o
obj-$(CONFIG_CRYPTO_CRC32_S390) += crc32-vx_s390.o
-obj-$(CONFIG_ARCH_RANDOM) += arch_random.o
+obj-y += arch_random.o
crc32-vx_s390-y := crc32-vx.o crc32le-vx.o crc32be-vx.o
+chacha_s390-y := chacha-glue.o chacha-s390.o
diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c
index ead0b2c9881d..c6fe5405de4a 100644
--- a/arch/s390/crypto/aes_s390.c
+++ b/arch/s390/crypto/aes_s390.c
@@ -21,6 +21,7 @@
#include <crypto/algapi.h>
#include <crypto/ghash.h>
#include <crypto/internal/aead.h>
+#include <crypto/internal/cipher.h>
#include <crypto/internal/skcipher.h>
#include <crypto/scatterwalk.h>
#include <linux/err.h>
@@ -72,19 +73,12 @@ static int setkey_fallback_cip(struct crypto_tfm *tfm, const u8 *in_key,
unsigned int key_len)
{
struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm);
- int ret;
sctx->fallback.cip->base.crt_flags &= ~CRYPTO_TFM_REQ_MASK;
sctx->fallback.cip->base.crt_flags |= (tfm->crt_flags &
CRYPTO_TFM_REQ_MASK);
- ret = crypto_cipher_setkey(sctx->fallback.cip, in_key, key_len);
- if (ret) {
- tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK;
- tfm->crt_flags |= (sctx->fallback.cip->base.crt_flags &
- CRYPTO_TFM_RES_MASK);
- }
- return ret;
+ return crypto_cipher_setkey(sctx->fallback.cip, in_key, key_len);
}
static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
@@ -182,18 +176,13 @@ static int setkey_fallback_skcipher(struct crypto_skcipher *tfm, const u8 *key,
unsigned int len)
{
struct s390_aes_ctx *sctx = crypto_skcipher_ctx(tfm);
- int ret;
crypto_skcipher_clear_flags(sctx->fallback.skcipher,
CRYPTO_TFM_REQ_MASK);
crypto_skcipher_set_flags(sctx->fallback.skcipher,
crypto_skcipher_get_flags(tfm) &
CRYPTO_TFM_REQ_MASK);
- ret = crypto_skcipher_setkey(sctx->fallback.skcipher, key, len);
- crypto_skcipher_set_flags(tfm,
- crypto_skcipher_get_flags(sctx->fallback.skcipher) &
- CRYPTO_TFM_RES_MASK);
- return ret;
+ return crypto_skcipher_setkey(sctx->fallback.skcipher, key, len);
}
static int fallback_skcipher_crypt(struct s390_aes_ctx *sctx,
@@ -354,6 +343,7 @@ static int cbc_aes_crypt(struct skcipher_request *req, unsigned long modifier)
memcpy(walk.iv, param.iv, AES_BLOCK_SIZE);
ret = skcipher_walk_done(&walk, nbytes - n);
}
+ memzero_explicit(&param, sizeof(param));
return ret;
}
@@ -389,17 +379,12 @@ static int xts_fallback_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int len)
{
struct s390_xts_ctx *xts_ctx = crypto_skcipher_ctx(tfm);
- int ret;
crypto_skcipher_clear_flags(xts_ctx->fallback, CRYPTO_TFM_REQ_MASK);
crypto_skcipher_set_flags(xts_ctx->fallback,
crypto_skcipher_get_flags(tfm) &
CRYPTO_TFM_REQ_MASK);
- ret = crypto_skcipher_setkey(xts_ctx->fallback, key, len);
- crypto_skcipher_set_flags(tfm,
- crypto_skcipher_get_flags(xts_ctx->fallback) &
- CRYPTO_TFM_RES_MASK);
- return ret;
+ return crypto_skcipher_setkey(xts_ctx->fallback, key, len);
}
static int xts_aes_set_key(struct crypto_skcipher *tfm, const u8 *in_key,
@@ -413,12 +398,6 @@ static int xts_aes_set_key(struct crypto_skcipher *tfm, const u8 *in_key,
if (err)
return err;
- /* In fips mode only 128 bit or 256 bit keys are valid */
- if (fips_enabled && key_len != 32 && key_len != 64) {
- crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
- return -EINVAL;
- }
-
/* Pick the correct function code based on the key length */
fc = (key_len == 32) ? CPACF_KM_XTS_128 :
(key_len == 64) ? CPACF_KM_XTS_256 : 0;
@@ -489,6 +468,8 @@ static int xts_aes_crypt(struct skcipher_request *req, unsigned long modifier)
walk.dst.virt.addr, walk.src.virt.addr, n);
ret = skcipher_walk_done(&walk, nbytes - n);
}
+ memzero_explicit(&pcc_param, sizeof(pcc_param));
+ memzero_explicit(&xts_param, sizeof(xts_param));
return ret;
}
@@ -616,7 +597,9 @@ static int ctr_aes_crypt(struct skcipher_request *req)
* final block may be < AES_BLOCK_SIZE, copy only nbytes
*/
if (nbytes) {
- cpacf_kmctr(sctx->fc, sctx->key, buf, walk.src.virt.addr,
+ memset(buf, 0, AES_BLOCK_SIZE);
+ memcpy(buf, walk.src.virt.addr, nbytes);
+ cpacf_kmctr(sctx->fc, sctx->key, buf, buf,
AES_BLOCK_SIZE, walk.iv);
memcpy(walk.dst.virt.addr, buf, nbytes);
crypto_inc(walk.iv, AES_BLOCK_SIZE);
@@ -716,7 +699,7 @@ static inline void _gcm_sg_unmap_and_advance(struct gcm_sg_walk *gw,
unsigned int nbytes)
{
gw->walk_bytes_remain -= nbytes;
- scatterwalk_unmap(&gw->walk);
+ scatterwalk_unmap(gw->walk_ptr);
scatterwalk_advance(&gw->walk, nbytes);
scatterwalk_done(&gw->walk, 0, gw->walk_bytes_remain);
gw->walk_ptr = NULL;
@@ -791,7 +774,7 @@ static int gcm_out_walk_go(struct gcm_sg_walk *gw, unsigned int minbytesneeded)
goto out;
}
- scatterwalk_unmap(&gw->walk);
+ scatterwalk_unmap(gw->walk_ptr);
gw->walk_ptr = NULL;
gw->ptr = gw->buf;
@@ -1064,10 +1047,11 @@ out_err:
return ret;
}
-module_cpu_feature_match(MSA, aes_s390_init);
+module_cpu_feature_match(S390_CPU_FEATURE_MSA, aes_s390_init);
module_exit(aes_s390_fini);
MODULE_ALIAS_CRYPTO("aes-all");
MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm");
MODULE_LICENSE("GPL");
+MODULE_IMPORT_NS(CRYPTO_INTERNAL);
diff --git a/arch/s390/crypto/arch_random.c b/arch/s390/crypto/arch_random.c
index dd95cdbd22ce..a8a2407381af 100644
--- a/arch/s390/crypto/arch_random.c
+++ b/arch/s390/crypto/arch_random.c
@@ -2,122 +2,18 @@
/*
* s390 arch random implementation.
*
- * Copyright IBM Corp. 2017, 2018
+ * Copyright IBM Corp. 2017, 2020
* Author(s): Harald Freudenberger
- *
- * The s390_arch_random_generate() function may be called from random.c
- * in interrupt context. So this implementation does the best to be very
- * fast. There is a buffer of random data which is asynchronously checked
- * and filled by a workqueue thread.
- * If there are enough bytes in the buffer the s390_arch_random_generate()
- * just delivers these bytes. Otherwise false is returned until the
- * worker thread refills the buffer.
- * The worker fills the rng buffer by pulling fresh entropy from the
- * high quality (but slow) true hardware random generator. This entropy
- * is then spread over the buffer with an pseudo random generator PRNG.
- * As the arch_get_random_seed_long() fetches 8 bytes and the calling
- * function add_interrupt_randomness() counts this as 1 bit entropy the
- * distribution needs to make sure there is in fact 1 bit entropy contained
- * in 8 bytes of the buffer. The current values pull 32 byte entropy
- * and scatter this into a 2048 byte buffer. So 8 byte in the buffer
- * will contain 1 bit of entropy.
- * The worker thread is rescheduled based on the charge level of the
- * buffer but at least with 500 ms delay to avoid too much CPU consumption.
- * So the max. amount of rng data delivered via arch_get_random_seed is
- * limited to 4k bytes per second.
*/
#include <linux/kernel.h>
#include <linux/atomic.h>
#include <linux/random.h>
-#include <linux/slab.h>
#include <linux/static_key.h>
-#include <linux/workqueue.h>
+#include <asm/archrandom.h>
#include <asm/cpacf.h>
DEFINE_STATIC_KEY_FALSE(s390_arch_random_available);
atomic64_t s390_arch_random_counter = ATOMIC64_INIT(0);
EXPORT_SYMBOL(s390_arch_random_counter);
-
-#define ARCH_REFILL_TICKS (HZ/2)
-#define ARCH_PRNG_SEED_SIZE 32
-#define ARCH_RNG_BUF_SIZE 2048
-
-static DEFINE_SPINLOCK(arch_rng_lock);
-static u8 *arch_rng_buf;
-static unsigned int arch_rng_buf_idx;
-
-static void arch_rng_refill_buffer(struct work_struct *);
-static DECLARE_DELAYED_WORK(arch_rng_work, arch_rng_refill_buffer);
-
-bool s390_arch_random_generate(u8 *buf, unsigned int nbytes)
-{
- /* lock rng buffer */
- if (!spin_trylock(&arch_rng_lock))
- return false;
-
- /* try to resolve the requested amount of bytes from the buffer */
- arch_rng_buf_idx -= nbytes;
- if (arch_rng_buf_idx < ARCH_RNG_BUF_SIZE) {
- memcpy(buf, arch_rng_buf + arch_rng_buf_idx, nbytes);
- atomic64_add(nbytes, &s390_arch_random_counter);
- spin_unlock(&arch_rng_lock);
- return true;
- }
-
- /* not enough bytes in rng buffer, refill is done asynchronously */
- spin_unlock(&arch_rng_lock);
-
- return false;
-}
-EXPORT_SYMBOL(s390_arch_random_generate);
-
-static void arch_rng_refill_buffer(struct work_struct *unused)
-{
- unsigned int delay = ARCH_REFILL_TICKS;
-
- spin_lock(&arch_rng_lock);
- if (arch_rng_buf_idx > ARCH_RNG_BUF_SIZE) {
- /* buffer is exhausted and needs refill */
- u8 seed[ARCH_PRNG_SEED_SIZE];
- u8 prng_wa[240];
- /* fetch ARCH_PRNG_SEED_SIZE bytes of entropy */
- cpacf_trng(NULL, 0, seed, sizeof(seed));
- /* blow this entropy up to ARCH_RNG_BUF_SIZE with PRNG */
- memset(prng_wa, 0, sizeof(prng_wa));
- cpacf_prno(CPACF_PRNO_SHA512_DRNG_SEED,
- &prng_wa, NULL, 0, seed, sizeof(seed));
- cpacf_prno(CPACF_PRNO_SHA512_DRNG_GEN,
- &prng_wa, arch_rng_buf, ARCH_RNG_BUF_SIZE, NULL, 0);
- arch_rng_buf_idx = ARCH_RNG_BUF_SIZE;
- }
- delay += (ARCH_REFILL_TICKS * arch_rng_buf_idx) / ARCH_RNG_BUF_SIZE;
- spin_unlock(&arch_rng_lock);
-
- /* kick next check */
- queue_delayed_work(system_long_wq, &arch_rng_work, delay);
-}
-
-static int __init s390_arch_random_init(void)
-{
- /* all the needed PRNO subfunctions available ? */
- if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_TRNG) &&
- cpacf_query_func(CPACF_PRNO, CPACF_PRNO_SHA512_DRNG_GEN)) {
-
- /* alloc arch random working buffer */
- arch_rng_buf = kmalloc(ARCH_RNG_BUF_SIZE, GFP_KERNEL);
- if (!arch_rng_buf)
- return -ENOMEM;
-
- /* kick worker queue job to fill the random buffer */
- queue_delayed_work(system_long_wq,
- &arch_rng_work, ARCH_REFILL_TICKS);
-
- /* enable arch random to the outside world */
- static_branch_enable(&s390_arch_random_available);
- }
-
- return 0;
-}
-arch_initcall(s390_arch_random_init);
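The comment block removed above documented the old buffered design; what is left in arch_random.c is only the static key and the delivery counter, with the actual seed draw moving into the header. As a rough sketch (not part of this patch; the helper name and exact signature are assumed from the generic archrandom interface), the header-side implementation can now boil down to:

static inline size_t arch_get_random_seed_longs(unsigned long *v, size_t max_longs)
{
	/* Draw straight from the TRNG once the facility was detected. */
	if (static_branch_likely(&s390_arch_random_available) && in_task()) {
		cpacf_trng(NULL, 0, (u8 *)v, max_longs * sizeof(*v));
		atomic64_add(max_longs * sizeof(*v), &s390_arch_random_counter);
		return max_longs;
	}
	return 0;
}

Compared to the removed worker/buffer scheme, every call goes directly to the hardware TRNG, so there is no 4 KB/s delivery cap any more and no long-lived kmalloc'ed buffer holding entropy.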
diff --git a/arch/s390/crypto/chacha-glue.c b/arch/s390/crypto/chacha-glue.c
new file mode 100644
index 000000000000..ed9959e6f714
--- /dev/null
+++ b/arch/s390/crypto/chacha-glue.c
@@ -0,0 +1,130 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * s390 ChaCha stream cipher.
+ *
+ * Copyright IBM Corp. 2021
+ */
+
+#define KMSG_COMPONENT "chacha_s390"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <crypto/internal/chacha.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/algapi.h>
+#include <linux/cpufeature.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sizes.h>
+#include <asm/fpu/api.h>
+#include "chacha-s390.h"
+
+static void chacha20_crypt_s390(u32 *state, u8 *dst, const u8 *src,
+ unsigned int nbytes, const u32 *key,
+ u32 *counter)
+{
+ struct kernel_fpu vxstate;
+
+ kernel_fpu_begin(&vxstate, KERNEL_VXR);
+ chacha20_vx(dst, src, nbytes, key, counter);
+ kernel_fpu_end(&vxstate, KERNEL_VXR);
+
+ *counter += round_up(nbytes, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
+}
+
+static int chacha20_s390(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+ u32 state[CHACHA_STATE_WORDS] __aligned(16);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int rc;
+
+ rc = skcipher_walk_virt(&walk, req, false);
+ chacha_init_generic(state, ctx->key, req->iv);
+
+ while (walk.nbytes > 0) {
+ nbytes = walk.nbytes;
+ if (nbytes < walk.total)
+ nbytes = round_down(nbytes, walk.stride);
+
+ if (nbytes <= CHACHA_BLOCK_SIZE) {
+ chacha_crypt_generic(state, walk.dst.virt.addr,
+ walk.src.virt.addr, nbytes,
+ ctx->nrounds);
+ } else {
+ chacha20_crypt_s390(state, walk.dst.virt.addr,
+ walk.src.virt.addr, nbytes,
+ &state[4], &state[12]);
+ }
+ rc = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+ }
+ return rc;
+}
+
+void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
+{
+ /* TODO: implement hchacha_block_arch() in assembly */
+ hchacha_block_generic(state, stream, nrounds);
+}
+EXPORT_SYMBOL(hchacha_block_arch);
+
+void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
+{
+ chacha_init_generic(state, key, iv);
+}
+EXPORT_SYMBOL(chacha_init_arch);
+
+void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds)
+{
+ /* The s390 chacha20 implementation has 20 rounds hard-coded;
+ * it cannot handle a block of data or less, but otherwise
+ * it can handle data of arbitrary size.
+ */
+ if (bytes <= CHACHA_BLOCK_SIZE || nrounds != 20 || !cpu_has_vx())
+ chacha_crypt_generic(state, dst, src, bytes, nrounds);
+ else
+ chacha20_crypt_s390(state, dst, src, bytes,
+ &state[4], &state[12]);
+}
+EXPORT_SYMBOL(chacha_crypt_arch);
+
+static struct skcipher_alg chacha_algs[] = {
+ {
+ .base.cra_name = "chacha20",
+ .base.cra_driver_name = "chacha20-s390",
+ .base.cra_priority = 900,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = CHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .setkey = chacha20_setkey,
+ .encrypt = chacha20_s390,
+ .decrypt = chacha20_s390,
+ }
+};
+
+static int __init chacha_mod_init(void)
+{
+ return IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) ?
+ crypto_register_skciphers(chacha_algs, ARRAY_SIZE(chacha_algs)) : 0;
+}
+
+static void __exit chacha_mod_fini(void)
+{
+ if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER))
+ crypto_unregister_skciphers(chacha_algs, ARRAY_SIZE(chacha_algs));
+}
+
+module_cpu_feature_match(S390_CPU_FEATURE_VXRS, chacha_mod_init);
+module_exit(chacha_mod_fini);
+
+MODULE_DESCRIPTION("ChaCha20 stream cipher");
+MODULE_LICENSE("GPL v2");
+
+MODULE_ALIAS_CRYPTO("chacha20");
diff --git a/arch/s390/crypto/chacha-s390.S b/arch/s390/crypto/chacha-s390.S
new file mode 100644
index 000000000000..37cb63f25b17
--- /dev/null
+++ b/arch/s390/crypto/chacha-s390.S
@@ -0,0 +1,908 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Original implementation written by Andy Polyakov, @dot-asm.
+ * This is an adaptation of the original code for kernel use.
+ *
+ * Copyright (C) 2006-2019 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+ */
+
+#include <linux/linkage.h>
+#include <asm/nospec-insn.h>
+#include <asm/vx-insn.h>
+
+#define SP %r15
+#define FRAME (16 * 8 + 4 * 8)
+
+ .data
+ .balign 32
+
+SYM_DATA_START_LOCAL(sigma)
+ .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral
+ .long 1,0,0,0
+ .long 2,0,0,0
+ .long 3,0,0,0
+ .long 0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c # byte swap
+
+ .long 0,1,2,3
+ .long 0x61707865,0x61707865,0x61707865,0x61707865 # smashed sigma
+ .long 0x3320646e,0x3320646e,0x3320646e,0x3320646e
+ .long 0x79622d32,0x79622d32,0x79622d32,0x79622d32
+ .long 0x6b206574,0x6b206574,0x6b206574,0x6b206574
+SYM_DATA_END(sigma)
+
+ .previous
+
+ GEN_BR_THUNK %r14
+
+ .text
+
+#############################################################################
+# void chacha20_vx_4x(u8 *out, const u8 *inp, size_t len,
+# const u32 *key, const u32 *counter)
+
+#define OUT %r2
+#define INP %r3
+#define LEN %r4
+#define KEY %r5
+#define COUNTER %r6
+
+#define BEPERM %v31
+#define CTR %v26
+
+#define K0 %v16
+#define K1 %v17
+#define K2 %v18
+#define K3 %v19
+
+#define XA0 %v0
+#define XA1 %v1
+#define XA2 %v2
+#define XA3 %v3
+
+#define XB0 %v4
+#define XB1 %v5
+#define XB2 %v6
+#define XB3 %v7
+
+#define XC0 %v8
+#define XC1 %v9
+#define XC2 %v10
+#define XC3 %v11
+
+#define XD0 %v12
+#define XD1 %v13
+#define XD2 %v14
+#define XD3 %v15
+
+#define XT0 %v27
+#define XT1 %v28
+#define XT2 %v29
+#define XT3 %v30
+
+SYM_FUNC_START(chacha20_vx_4x)
+ stmg %r6,%r7,6*8(SP)
+
+ larl %r7,sigma
+ lhi %r0,10
+ lhi %r1,0
+
+ VL K0,0,,%r7 # load sigma
+ VL K1,0,,KEY # load key
+ VL K2,16,,KEY
+ VL K3,0,,COUNTER # load counter
+
+ VL BEPERM,0x40,,%r7
+ VL CTR,0x50,,%r7
+
+ VLM XA0,XA3,0x60,%r7,4 # load [smashed] sigma
+
+ VREPF XB0,K1,0 # smash the key
+ VREPF XB1,K1,1
+ VREPF XB2,K1,2
+ VREPF XB3,K1,3
+
+ VREPF XD0,K3,0
+ VREPF XD1,K3,1
+ VREPF XD2,K3,2
+ VREPF XD3,K3,3
+ VAF XD0,XD0,CTR
+
+ VREPF XC0,K2,0
+ VREPF XC1,K2,1
+ VREPF XC2,K2,2
+ VREPF XC3,K2,3
+
+.Loop_4x:
+ VAF XA0,XA0,XB0
+ VX XD0,XD0,XA0
+ VERLLF XD0,XD0,16
+
+ VAF XA1,XA1,XB1
+ VX XD1,XD1,XA1
+ VERLLF XD1,XD1,16
+
+ VAF XA2,XA2,XB2
+ VX XD2,XD2,XA2
+ VERLLF XD2,XD2,16
+
+ VAF XA3,XA3,XB3
+ VX XD3,XD3,XA3
+ VERLLF XD3,XD3,16
+
+ VAF XC0,XC0,XD0
+ VX XB0,XB0,XC0
+ VERLLF XB0,XB0,12
+
+ VAF XC1,XC1,XD1
+ VX XB1,XB1,XC1
+ VERLLF XB1,XB1,12
+
+ VAF XC2,XC2,XD2
+ VX XB2,XB2,XC2
+ VERLLF XB2,XB2,12
+
+ VAF XC3,XC3,XD3
+ VX XB3,XB3,XC3
+ VERLLF XB3,XB3,12
+
+ VAF XA0,XA0,XB0
+ VX XD0,XD0,XA0
+ VERLLF XD0,XD0,8
+
+ VAF XA1,XA1,XB1
+ VX XD1,XD1,XA1
+ VERLLF XD1,XD1,8
+
+ VAF XA2,XA2,XB2
+ VX XD2,XD2,XA2
+ VERLLF XD2,XD2,8
+
+ VAF XA3,XA3,XB3
+ VX XD3,XD3,XA3
+ VERLLF XD3,XD3,8
+
+ VAF XC0,XC0,XD0
+ VX XB0,XB0,XC0
+ VERLLF XB0,XB0,7
+
+ VAF XC1,XC1,XD1
+ VX XB1,XB1,XC1
+ VERLLF XB1,XB1,7
+
+ VAF XC2,XC2,XD2
+ VX XB2,XB2,XC2
+ VERLLF XB2,XB2,7
+
+ VAF XC3,XC3,XD3
+ VX XB3,XB3,XC3
+ VERLLF XB3,XB3,7
+
+ VAF XA0,XA0,XB1
+ VX XD3,XD3,XA0
+ VERLLF XD3,XD3,16
+
+ VAF XA1,XA1,XB2
+ VX XD0,XD0,XA1
+ VERLLF XD0,XD0,16
+
+ VAF XA2,XA2,XB3
+ VX XD1,XD1,XA2
+ VERLLF XD1,XD1,16
+
+ VAF XA3,XA3,XB0
+ VX XD2,XD2,XA3
+ VERLLF XD2,XD2,16
+
+ VAF XC2,XC2,XD3
+ VX XB1,XB1,XC2
+ VERLLF XB1,XB1,12
+
+ VAF XC3,XC3,XD0
+ VX XB2,XB2,XC3
+ VERLLF XB2,XB2,12
+
+ VAF XC0,XC0,XD1
+ VX XB3,XB3,XC0
+ VERLLF XB3,XB3,12
+
+ VAF XC1,XC1,XD2
+ VX XB0,XB0,XC1
+ VERLLF XB0,XB0,12
+
+ VAF XA0,XA0,XB1
+ VX XD3,XD3,XA0
+ VERLLF XD3,XD3,8
+
+ VAF XA1,XA1,XB2
+ VX XD0,XD0,XA1
+ VERLLF XD0,XD0,8
+
+ VAF XA2,XA2,XB3
+ VX XD1,XD1,XA2
+ VERLLF XD1,XD1,8
+
+ VAF XA3,XA3,XB0
+ VX XD2,XD2,XA3
+ VERLLF XD2,XD2,8
+
+ VAF XC2,XC2,XD3
+ VX XB1,XB1,XC2
+ VERLLF XB1,XB1,7
+
+ VAF XC3,XC3,XD0
+ VX XB2,XB2,XC3
+ VERLLF XB2,XB2,7
+
+ VAF XC0,XC0,XD1
+ VX XB3,XB3,XC0
+ VERLLF XB3,XB3,7
+
+ VAF XC1,XC1,XD2
+ VX XB0,XB0,XC1
+ VERLLF XB0,XB0,7
+ brct %r0,.Loop_4x
+
+ VAF XD0,XD0,CTR
+
+ VMRHF XT0,XA0,XA1 # transpose data
+ VMRHF XT1,XA2,XA3
+ VMRLF XT2,XA0,XA1
+ VMRLF XT3,XA2,XA3
+ VPDI XA0,XT0,XT1,0b0000
+ VPDI XA1,XT0,XT1,0b0101
+ VPDI XA2,XT2,XT3,0b0000
+ VPDI XA3,XT2,XT3,0b0101
+
+ VMRHF XT0,XB0,XB1
+ VMRHF XT1,XB2,XB3
+ VMRLF XT2,XB0,XB1
+ VMRLF XT3,XB2,XB3
+ VPDI XB0,XT0,XT1,0b0000
+ VPDI XB1,XT0,XT1,0b0101
+ VPDI XB2,XT2,XT3,0b0000
+ VPDI XB3,XT2,XT3,0b0101
+
+ VMRHF XT0,XC0,XC1
+ VMRHF XT1,XC2,XC3
+ VMRLF XT2,XC0,XC1
+ VMRLF XT3,XC2,XC3
+ VPDI XC0,XT0,XT1,0b0000
+ VPDI XC1,XT0,XT1,0b0101
+ VPDI XC2,XT2,XT3,0b0000
+ VPDI XC3,XT2,XT3,0b0101
+
+ VMRHF XT0,XD0,XD1
+ VMRHF XT1,XD2,XD3
+ VMRLF XT2,XD0,XD1
+ VMRLF XT3,XD2,XD3
+ VPDI XD0,XT0,XT1,0b0000
+ VPDI XD1,XT0,XT1,0b0101
+ VPDI XD2,XT2,XT3,0b0000
+ VPDI XD3,XT2,XT3,0b0101
+
+ VAF XA0,XA0,K0
+ VAF XB0,XB0,K1
+ VAF XC0,XC0,K2
+ VAF XD0,XD0,K3
+
+ VPERM XA0,XA0,XA0,BEPERM
+ VPERM XB0,XB0,XB0,BEPERM
+ VPERM XC0,XC0,XC0,BEPERM
+ VPERM XD0,XD0,XD0,BEPERM
+
+ VLM XT0,XT3,0,INP,0
+
+ VX XT0,XT0,XA0
+ VX XT1,XT1,XB0
+ VX XT2,XT2,XC0
+ VX XT3,XT3,XD0
+
+ VSTM XT0,XT3,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+
+ VAF XA0,XA1,K0
+ VAF XB0,XB1,K1
+ VAF XC0,XC1,K2
+ VAF XD0,XD1,K3
+
+ VPERM XA0,XA0,XA0,BEPERM
+ VPERM XB0,XB0,XB0,BEPERM
+ VPERM XC0,XC0,XC0,BEPERM
+ VPERM XD0,XD0,XD0,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_4x
+
+ VLM XT0,XT3,0,INP,0
+
+ VX XT0,XT0,XA0
+ VX XT1,XT1,XB0
+ VX XT2,XT2,XC0
+ VX XT3,XT3,XD0
+
+ VSTM XT0,XT3,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_4x
+
+ VAF XA0,XA2,K0
+ VAF XB0,XB2,K1
+ VAF XC0,XC2,K2
+ VAF XD0,XD2,K3
+
+ VPERM XA0,XA0,XA0,BEPERM
+ VPERM XB0,XB0,XB0,BEPERM
+ VPERM XC0,XC0,XC0,BEPERM
+ VPERM XD0,XD0,XD0,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_4x
+
+ VLM XT0,XT3,0,INP,0
+
+ VX XT0,XT0,XA0
+ VX XT1,XT1,XB0
+ VX XT2,XT2,XC0
+ VX XT3,XT3,XD0
+
+ VSTM XT0,XT3,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_4x
+
+ VAF XA0,XA3,K0
+ VAF XB0,XB3,K1
+ VAF XC0,XC3,K2
+ VAF XD0,XD3,K3
+
+ VPERM XA0,XA0,XA0,BEPERM
+ VPERM XB0,XB0,XB0,BEPERM
+ VPERM XC0,XC0,XC0,BEPERM
+ VPERM XD0,XD0,XD0,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_4x
+
+ VLM XT0,XT3,0,INP,0
+
+ VX XT0,XT0,XA0
+ VX XT1,XT1,XB0
+ VX XT2,XT2,XC0
+ VX XT3,XT3,XD0
+
+ VSTM XT0,XT3,0,OUT,0
+
+.Ldone_4x:
+ lmg %r6,%r7,6*8(SP)
+ BR_EX %r14
+
+.Ltail_4x:
+ VLR XT0,XC0
+ VLR XT1,XD0
+
+ VST XA0,8*8+0x00,,SP
+ VST XB0,8*8+0x10,,SP
+ VST XT0,8*8+0x20,,SP
+ VST XT1,8*8+0x30,,SP
+
+ lghi %r1,0
+
+.Loop_tail_4x:
+ llgc %r5,0(%r1,INP)
+ llgc %r6,8*8(%r1,SP)
+ xr %r6,%r5
+ stc %r6,0(%r1,OUT)
+ la %r1,1(%r1)
+ brct LEN,.Loop_tail_4x
+
+ lmg %r6,%r7,6*8(SP)
+ BR_EX %r14
+SYM_FUNC_END(chacha20_vx_4x)
+
+#undef OUT
+#undef INP
+#undef LEN
+#undef KEY
+#undef COUNTER
+
+#undef BEPERM
+
+#undef K0
+#undef K1
+#undef K2
+#undef K3
+
+
+#############################################################################
+# void chacha20_vx(u8 *out, const u8 *inp, size_t len,
+# const u32 *key, const u32 *counter)
+
+#define OUT %r2
+#define INP %r3
+#define LEN %r4
+#define KEY %r5
+#define COUNTER %r6
+
+#define BEPERM %v31
+
+#define K0 %v27
+#define K1 %v24
+#define K2 %v25
+#define K3 %v26
+
+#define A0 %v0
+#define B0 %v1
+#define C0 %v2
+#define D0 %v3
+
+#define A1 %v4
+#define B1 %v5
+#define C1 %v6
+#define D1 %v7
+
+#define A2 %v8
+#define B2 %v9
+#define C2 %v10
+#define D2 %v11
+
+#define A3 %v12
+#define B3 %v13
+#define C3 %v14
+#define D3 %v15
+
+#define A4 %v16
+#define B4 %v17
+#define C4 %v18
+#define D4 %v19
+
+#define A5 %v20
+#define B5 %v21
+#define C5 %v22
+#define D5 %v23
+
+#define T0 %v27
+#define T1 %v28
+#define T2 %v29
+#define T3 %v30
+
+SYM_FUNC_START(chacha20_vx)
+ clgfi LEN,256
+ jle chacha20_vx_4x
+ stmg %r6,%r7,6*8(SP)
+
+ lghi %r1,-FRAME
+ lgr %r0,SP
+ la SP,0(%r1,SP)
+ stg %r0,0(SP) # back-chain
+
+ larl %r7,sigma
+ lhi %r0,10
+
+ VLM K1,K2,0,KEY,0 # load key
+ VL K3,0,,COUNTER # load counter
+
+ VLM K0,BEPERM,0,%r7,4 # load sigma, increments, ...
+
+.Loop_outer_vx:
+ VLR A0,K0
+ VLR B0,K1
+ VLR A1,K0
+ VLR B1,K1
+ VLR A2,K0
+ VLR B2,K1
+ VLR A3,K0
+ VLR B3,K1
+ VLR A4,K0
+ VLR B4,K1
+ VLR A5,K0
+ VLR B5,K1
+
+ VLR D0,K3
+ VAF D1,K3,T1 # K[3]+1
+ VAF D2,K3,T2 # K[3]+2
+ VAF D3,K3,T3 # K[3]+3
+ VAF D4,D2,T2 # K[3]+4
+ VAF D5,D2,T3 # K[3]+5
+
+ VLR C0,K2
+ VLR C1,K2
+ VLR C2,K2
+ VLR C3,K2
+ VLR C4,K2
+ VLR C5,K2
+
+ VLR T1,D1
+ VLR T2,D2
+ VLR T3,D3
+
+.Loop_vx:
+ VAF A0,A0,B0
+ VAF A1,A1,B1
+ VAF A2,A2,B2
+ VAF A3,A3,B3
+ VAF A4,A4,B4
+ VAF A5,A5,B5
+ VX D0,D0,A0
+ VX D1,D1,A1
+ VX D2,D2,A2
+ VX D3,D3,A3
+ VX D4,D4,A4
+ VX D5,D5,A5
+ VERLLF D0,D0,16
+ VERLLF D1,D1,16
+ VERLLF D2,D2,16
+ VERLLF D3,D3,16
+ VERLLF D4,D4,16
+ VERLLF D5,D5,16
+
+ VAF C0,C0,D0
+ VAF C1,C1,D1
+ VAF C2,C2,D2
+ VAF C3,C3,D3
+ VAF C4,C4,D4
+ VAF C5,C5,D5
+ VX B0,B0,C0
+ VX B1,B1,C1
+ VX B2,B2,C2
+ VX B3,B3,C3
+ VX B4,B4,C4
+ VX B5,B5,C5
+ VERLLF B0,B0,12
+ VERLLF B1,B1,12
+ VERLLF B2,B2,12
+ VERLLF B3,B3,12
+ VERLLF B4,B4,12
+ VERLLF B5,B5,12
+
+ VAF A0,A0,B0
+ VAF A1,A1,B1
+ VAF A2,A2,B2
+ VAF A3,A3,B3
+ VAF A4,A4,B4
+ VAF A5,A5,B5
+ VX D0,D0,A0
+ VX D1,D1,A1
+ VX D2,D2,A2
+ VX D3,D3,A3
+ VX D4,D4,A4
+ VX D5,D5,A5
+ VERLLF D0,D0,8
+ VERLLF D1,D1,8
+ VERLLF D2,D2,8
+ VERLLF D3,D3,8
+ VERLLF D4,D4,8
+ VERLLF D5,D5,8
+
+ VAF C0,C0,D0
+ VAF C1,C1,D1
+ VAF C2,C2,D2
+ VAF C3,C3,D3
+ VAF C4,C4,D4
+ VAF C5,C5,D5
+ VX B0,B0,C0
+ VX B1,B1,C1
+ VX B2,B2,C2
+ VX B3,B3,C3
+ VX B4,B4,C4
+ VX B5,B5,C5
+ VERLLF B0,B0,7
+ VERLLF B1,B1,7
+ VERLLF B2,B2,7
+ VERLLF B3,B3,7
+ VERLLF B4,B4,7
+ VERLLF B5,B5,7
+
+ VSLDB C0,C0,C0,8
+ VSLDB C1,C1,C1,8
+ VSLDB C2,C2,C2,8
+ VSLDB C3,C3,C3,8
+ VSLDB C4,C4,C4,8
+ VSLDB C5,C5,C5,8
+ VSLDB B0,B0,B0,4
+ VSLDB B1,B1,B1,4
+ VSLDB B2,B2,B2,4
+ VSLDB B3,B3,B3,4
+ VSLDB B4,B4,B4,4
+ VSLDB B5,B5,B5,4
+ VSLDB D0,D0,D0,12
+ VSLDB D1,D1,D1,12
+ VSLDB D2,D2,D2,12
+ VSLDB D3,D3,D3,12
+ VSLDB D4,D4,D4,12
+ VSLDB D5,D5,D5,12
+
+ VAF A0,A0,B0
+ VAF A1,A1,B1
+ VAF A2,A2,B2
+ VAF A3,A3,B3
+ VAF A4,A4,B4
+ VAF A5,A5,B5
+ VX D0,D0,A0
+ VX D1,D1,A1
+ VX D2,D2,A2
+ VX D3,D3,A3
+ VX D4,D4,A4
+ VX D5,D5,A5
+ VERLLF D0,D0,16
+ VERLLF D1,D1,16
+ VERLLF D2,D2,16
+ VERLLF D3,D3,16
+ VERLLF D4,D4,16
+ VERLLF D5,D5,16
+
+ VAF C0,C0,D0
+ VAF C1,C1,D1
+ VAF C2,C2,D2
+ VAF C3,C3,D3
+ VAF C4,C4,D4
+ VAF C5,C5,D5
+ VX B0,B0,C0
+ VX B1,B1,C1
+ VX B2,B2,C2
+ VX B3,B3,C3
+ VX B4,B4,C4
+ VX B5,B5,C5
+ VERLLF B0,B0,12
+ VERLLF B1,B1,12
+ VERLLF B2,B2,12
+ VERLLF B3,B3,12
+ VERLLF B4,B4,12
+ VERLLF B5,B5,12
+
+ VAF A0,A0,B0
+ VAF A1,A1,B1
+ VAF A2,A2,B2
+ VAF A3,A3,B3
+ VAF A4,A4,B4
+ VAF A5,A5,B5
+ VX D0,D0,A0
+ VX D1,D1,A1
+ VX D2,D2,A2
+ VX D3,D3,A3
+ VX D4,D4,A4
+ VX D5,D5,A5
+ VERLLF D0,D0,8
+ VERLLF D1,D1,8
+ VERLLF D2,D2,8
+ VERLLF D3,D3,8
+ VERLLF D4,D4,8
+ VERLLF D5,D5,8
+
+ VAF C0,C0,D0
+ VAF C1,C1,D1
+ VAF C2,C2,D2
+ VAF C3,C3,D3
+ VAF C4,C4,D4
+ VAF C5,C5,D5
+ VX B0,B0,C0
+ VX B1,B1,C1
+ VX B2,B2,C2
+ VX B3,B3,C3
+ VX B4,B4,C4
+ VX B5,B5,C5
+ VERLLF B0,B0,7
+ VERLLF B1,B1,7
+ VERLLF B2,B2,7
+ VERLLF B3,B3,7
+ VERLLF B4,B4,7
+ VERLLF B5,B5,7
+
+ VSLDB C0,C0,C0,8
+ VSLDB C1,C1,C1,8
+ VSLDB C2,C2,C2,8
+ VSLDB C3,C3,C3,8
+ VSLDB C4,C4,C4,8
+ VSLDB C5,C5,C5,8
+ VSLDB B0,B0,B0,12
+ VSLDB B1,B1,B1,12
+ VSLDB B2,B2,B2,12
+ VSLDB B3,B3,B3,12
+ VSLDB B4,B4,B4,12
+ VSLDB B5,B5,B5,12
+ VSLDB D0,D0,D0,4
+ VSLDB D1,D1,D1,4
+ VSLDB D2,D2,D2,4
+ VSLDB D3,D3,D3,4
+ VSLDB D4,D4,D4,4
+ VSLDB D5,D5,D5,4
+ brct %r0,.Loop_vx
+
+ VAF A0,A0,K0
+ VAF B0,B0,K1
+ VAF C0,C0,K2
+ VAF D0,D0,K3
+ VAF A1,A1,K0
+ VAF D1,D1,T1 # +K[3]+1
+
+ VPERM A0,A0,A0,BEPERM
+ VPERM B0,B0,B0,BEPERM
+ VPERM C0,C0,C0,BEPERM
+ VPERM D0,D0,D0,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_vx
+
+ VAF D2,D2,T2 # +K[3]+2
+ VAF D3,D3,T3 # +K[3]+3
+ VLM T0,T3,0,INP,0
+
+ VX A0,A0,T0
+ VX B0,B0,T1
+ VX C0,C0,T2
+ VX D0,D0,T3
+
+ VLM K0,T3,0,%r7,4 # re-load sigma and increments
+
+ VSTM A0,D0,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_vx
+
+ VAF B1,B1,K1
+ VAF C1,C1,K2
+
+ VPERM A0,A1,A1,BEPERM
+ VPERM B0,B1,B1,BEPERM
+ VPERM C0,C1,C1,BEPERM
+ VPERM D0,D1,D1,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_vx
+
+ VLM A1,D1,0,INP,0
+
+ VX A0,A0,A1
+ VX B0,B0,B1
+ VX C0,C0,C1
+ VX D0,D0,D1
+
+ VSTM A0,D0,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_vx
+
+ VAF A2,A2,K0
+ VAF B2,B2,K1
+ VAF C2,C2,K2
+
+ VPERM A0,A2,A2,BEPERM
+ VPERM B0,B2,B2,BEPERM
+ VPERM C0,C2,C2,BEPERM
+ VPERM D0,D2,D2,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_vx
+
+ VLM A1,D1,0,INP,0
+
+ VX A0,A0,A1
+ VX B0,B0,B1
+ VX C0,C0,C1
+ VX D0,D0,D1
+
+ VSTM A0,D0,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_vx
+
+ VAF A3,A3,K0
+ VAF B3,B3,K1
+ VAF C3,C3,K2
+ VAF D2,K3,T3 # K[3]+3
+
+ VPERM A0,A3,A3,BEPERM
+ VPERM B0,B3,B3,BEPERM
+ VPERM C0,C3,C3,BEPERM
+ VPERM D0,D3,D3,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_vx
+
+ VAF D3,D2,T1 # K[3]+4
+ VLM A1,D1,0,INP,0
+
+ VX A0,A0,A1
+ VX B0,B0,B1
+ VX C0,C0,C1
+ VX D0,D0,D1
+
+ VSTM A0,D0,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_vx
+
+ VAF A4,A4,K0
+ VAF B4,B4,K1
+ VAF C4,C4,K2
+ VAF D4,D4,D3 # +K[3]+4
+ VAF D3,D3,T1 # K[3]+5
+ VAF K3,D2,T3 # K[3]+=6
+
+ VPERM A0,A4,A4,BEPERM
+ VPERM B0,B4,B4,BEPERM
+ VPERM C0,C4,C4,BEPERM
+ VPERM D0,D4,D4,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_vx
+
+ VLM A1,D1,0,INP,0
+
+ VX A0,A0,A1
+ VX B0,B0,B1
+ VX C0,C0,C1
+ VX D0,D0,D1
+
+ VSTM A0,D0,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_vx
+
+ VAF A5,A5,K0
+ VAF B5,B5,K1
+ VAF C5,C5,K2
+ VAF D5,D5,D3 # +K[3]+5
+
+ VPERM A0,A5,A5,BEPERM
+ VPERM B0,B5,B5,BEPERM
+ VPERM C0,C5,C5,BEPERM
+ VPERM D0,D5,D5,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_vx
+
+ VLM A1,D1,0,INP,0
+
+ VX A0,A0,A1
+ VX B0,B0,B1
+ VX C0,C0,C1
+ VX D0,D0,D1
+
+ VSTM A0,D0,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ lhi %r0,10
+ aghi LEN,-0x40
+ jne .Loop_outer_vx
+
+.Ldone_vx:
+ lmg %r6,%r7,FRAME+6*8(SP)
+ la SP,FRAME(SP)
+ BR_EX %r14
+
+.Ltail_vx:
+ VSTM A0,D0,8*8,SP,3
+ lghi %r1,0
+
+.Loop_tail_vx:
+ llgc %r5,0(%r1,INP)
+ llgc %r6,8*8(%r1,SP)
+ xr %r6,%r5
+ stc %r6,0(%r1,OUT)
+ la %r1,1(%r1)
+ brct LEN,.Loop_tail_vx
+
+ lmg %r6,%r7,FRAME+6*8(SP)
+ la SP,FRAME(SP)
+ BR_EX %r14
+SYM_FUNC_END(chacha20_vx)
+
+.previous
diff --git a/arch/s390/crypto/chacha-s390.h b/arch/s390/crypto/chacha-s390.h
new file mode 100644
index 000000000000..733744ce30f5
--- /dev/null
+++ b/arch/s390/crypto/chacha-s390.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * s390 ChaCha stream cipher.
+ *
+ * Copyright IBM Corp. 2021
+ */
+
+#ifndef _CHACHA_S390_H
+#define _CHACHA_S390_H
+
+void chacha20_vx(u8 *out, const u8 *inp, size_t len, const u32 *key,
+ const u32 *counter);
+
+#endif /* _CHACHA_S390_H */
diff --git a/arch/s390/crypto/crc32-vx.c b/arch/s390/crypto/crc32-vx.c
index 423ee05887e6..017143e9cef7 100644
--- a/arch/s390/crypto/crc32-vx.c
+++ b/arch/s390/crypto/crc32-vx.c
@@ -111,10 +111,8 @@ static int crc32_vx_setkey(struct crypto_shash *tfm, const u8 *newkey,
{
struct crc_ctx *mctx = crypto_shash_ctx(tfm);
- if (newkeylen != sizeof(mctx->key)) {
- crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ if (newkeylen != sizeof(mctx->key))
return -EINVAL;
- }
mctx->key = le32_to_cpu(*(__le32 *)newkey);
return 0;
}
@@ -124,10 +122,8 @@ static int crc32be_vx_setkey(struct crypto_shash *tfm, const u8 *newkey,
{
struct crc_ctx *mctx = crypto_shash_ctx(tfm);
- if (newkeylen != sizeof(mctx->key)) {
- crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ if (newkeylen != sizeof(mctx->key))
return -EINVAL;
- }
mctx->key = be32_to_cpu(*(__be32 *)newkey);
return 0;
}
@@ -302,7 +298,7 @@ static void __exit crc_vx_mod_exit(void)
crypto_unregister_shashes(crc32_vx_algs, ARRAY_SIZE(crc32_vx_algs));
}
-module_cpu_feature_match(VXRS, crc_vx_mod_init);
+module_cpu_feature_match(S390_CPU_FEATURE_VXRS, crc_vx_mod_init);
module_exit(crc_vx_mod_exit);
MODULE_AUTHOR("Hendrik Brueckner <brueckner@linux.vnet.ibm.com>");
diff --git a/arch/s390/crypto/crc32be-vx.S b/arch/s390/crypto/crc32be-vx.S
index 0099044e2c86..34ee47926891 100644
--- a/arch/s390/crypto/crc32be-vx.S
+++ b/arch/s390/crypto/crc32be-vx.S
@@ -24,15 +24,15 @@
#define CONST_RU_POLY %v13
#define CONST_CRC_POLY %v14
-.data
-.align 8
+ .data
+ .balign 8
/*
* The CRC-32 constant block contains reduction constants to fold and
* process particular chunks of the input data stream in parallel.
*
* For the CRC-32 variants, the constants are precomputed according to
- * these defintions:
+ * these definitions:
*
* R1 = x4*128+64 mod P(x)
* R2 = x4*128 mod P(x)
@@ -48,7 +48,7 @@
*
* Note that the constant definitions below are extended in order to compute
* intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction.
- * The righmost doubleword can be 0 to prevent contribution to the result or
+ * The rightmost doubleword can be 0 to prevent contribution to the result or
* can be multiplied by 1 to perform an XOR without the need for a separate
* VECTOR EXCLUSIVE OR instruction.
*
@@ -58,19 +58,20 @@
* P'(x) = 0xEDB88320
*/
-.Lconstants_CRC_32_BE:
+SYM_DATA_START_LOCAL(constants_CRC_32_BE)
.quad 0x08833794c, 0x0e6228b11 # R1, R2
.quad 0x0c5b9cd4c, 0x0e8a45605 # R3, R4
.quad 0x0f200aa66, 1 << 32 # R5, x32
.quad 0x0490d678d, 1 # R6, 1
.quad 0x104d101df, 0 # u
.quad 0x104C11DB7, 0 # P(x)
+SYM_DATA_END(constants_CRC_32_BE)
-.previous
+ .previous
GEN_BR_THUNK %r14
-.text
+ .text
/*
* The CRC-32 function(s) use these calling conventions:
*
@@ -90,9 +91,9 @@
*
* V9..V14: CRC-32 constants.
*/
-ENTRY(crc32_be_vgfm_16)
+SYM_FUNC_START(crc32_be_vgfm_16)
/* Load CRC-32 constants */
- larl %r5,.Lconstants_CRC_32_BE
+ larl %r5,constants_CRC_32_BE
VLM CONST_R1R2,CONST_CRC_POLY,0,%r5
/* Load the initial CRC value into the leftmost word of V0. */
@@ -189,7 +190,7 @@ ENTRY(crc32_be_vgfm_16)
* Note: To compensate the division by x^32, use the vector unpack
* instruction to move the leftmost word into the leftmost doubleword
* of the vector register. The rightmost doubleword is multiplied
- * with zero to not contribute to the intermedate results.
+ * with zero to not contribute to the intermediate results.
*/
/* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
@@ -207,6 +208,6 @@ ENTRY(crc32_be_vgfm_16)
.Ldone:
VLGVF %r2,%v2,3
BR_EX %r14
-ENDPROC(crc32_be_vgfm_16)
+SYM_FUNC_END(crc32_be_vgfm_16)
.previous
diff --git a/arch/s390/crypto/crc32le-vx.S b/arch/s390/crypto/crc32le-vx.S
index 71caf0f4ec08..5a819ae09a0b 100644
--- a/arch/s390/crypto/crc32le-vx.S
+++ b/arch/s390/crypto/crc32le-vx.S
@@ -25,8 +25,8 @@
#define CONST_RU_POLY %v13
#define CONST_CRC_POLY %v14
-.data
-.align 8
+ .data
+ .balign 8
/*
* The CRC-32 constant block contains reduction constants to fold and
@@ -59,27 +59,29 @@
* P'(x) = 0x82F63B78
*/
-.Lconstants_CRC_32_LE:
+SYM_DATA_START_LOCAL(constants_CRC_32_LE)
.octa 0x0F0E0D0C0B0A09080706050403020100 # BE->LE mask
.quad 0x1c6e41596, 0x154442bd4 # R2, R1
.quad 0x0ccaa009e, 0x1751997d0 # R4, R3
.octa 0x163cd6124 # R5
.octa 0x1F7011641 # u'
.octa 0x1DB710641 # P'(x) << 1
+SYM_DATA_END(constants_CRC_32_LE)
-.Lconstants_CRC_32C_LE:
+SYM_DATA_START_LOCAL(constants_CRC_32C_LE)
.octa 0x0F0E0D0C0B0A09080706050403020100 # BE->LE mask
.quad 0x09e4addf8, 0x740eef02 # R2, R1
.quad 0x14cd00bd6, 0xf20c0dfe # R4, R3
.octa 0x0dd45aab8 # R5
.octa 0x0dea713f1 # u'
.octa 0x105ec76f0 # P'(x) << 1
+SYM_DATA_END(constants_CRC_32C_LE)
-.previous
+ .previous
GEN_BR_THUNK %r14
-.text
+ .text
/*
* The CRC-32 functions use these calling conventions:
@@ -102,17 +104,17 @@
* V10..V14: CRC-32 constants.
*/
-ENTRY(crc32_le_vgfm_16)
- larl %r5,.Lconstants_CRC_32_LE
+SYM_FUNC_START(crc32_le_vgfm_16)
+ larl %r5,constants_CRC_32_LE
j crc32_le_vgfm_generic
-ENDPROC(crc32_le_vgfm_16)
+SYM_FUNC_END(crc32_le_vgfm_16)
-ENTRY(crc32c_le_vgfm_16)
- larl %r5,.Lconstants_CRC_32C_LE
+SYM_FUNC_START(crc32c_le_vgfm_16)
+ larl %r5,constants_CRC_32C_LE
j crc32_le_vgfm_generic
-ENDPROC(crc32c_le_vgfm_16)
+SYM_FUNC_END(crc32c_le_vgfm_16)
-ENTRY(crc32_le_vgfm_generic)
+SYM_FUNC_START(crc32_le_vgfm_generic)
/* Load CRC-32 constants */
VLM CONST_PERM_LE2BE,CONST_CRC_POLY,0,%r5
@@ -268,6 +270,6 @@ ENTRY(crc32_le_vgfm_generic)
.Ldone:
VLGVF %r2,%v2,2
BR_EX %r14
-ENDPROC(crc32_le_vgfm_generic)
+SYM_FUNC_END(crc32_le_vgfm_generic)
.previous
diff --git a/arch/s390/crypto/des_s390.c b/arch/s390/crypto/des_s390.c
index bfbafd35bcbd..8e75b83a5ddc 100644
--- a/arch/s390/crypto/des_s390.c
+++ b/arch/s390/crypto/des_s390.c
@@ -194,7 +194,7 @@ static struct skcipher_alg cbc_des_alg = {
* same as DES. Implementers MUST reject keys that exhibit this
* property.
*
- * In fips mode additinally check for all 3 keys are unique.
+ * In fips mode, additionally check that all 3 keys are unique.
*
*/
static int des3_setkey(struct crypto_tfm *tfm, const u8 *key,
@@ -492,7 +492,7 @@ out_err:
return ret;
}
-module_cpu_feature_match(MSA, des_s390_init);
+module_cpu_feature_match(S390_CPU_FEATURE_MSA, des_s390_init);
module_exit(des_s390_exit);
MODULE_ALIAS_CRYPTO("des");
diff --git a/arch/s390/crypto/ghash_s390.c b/arch/s390/crypto/ghash_s390.c
index a3e7400e031c..0800a2a5799f 100644
--- a/arch/s390/crypto/ghash_s390.c
+++ b/arch/s390/crypto/ghash_s390.c
@@ -43,10 +43,8 @@ static int ghash_setkey(struct crypto_shash *tfm,
{
struct ghash_ctx *ctx = crypto_shash_ctx(tfm);
- if (keylen != GHASH_BLOCK_SIZE) {
- crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ if (keylen != GHASH_BLOCK_SIZE)
return -EINVAL;
- }
memcpy(ctx->key, key, GHASH_BLOCK_SIZE);
@@ -147,7 +145,7 @@ static void __exit ghash_mod_exit(void)
crypto_unregister_shash(&ghash_alg);
}
-module_cpu_feature_match(MSA, ghash_mod_init);
+module_cpu_feature_match(S390_CPU_FEATURE_MSA, ghash_mod_init);
module_exit(ghash_mod_exit);
MODULE_ALIAS_CRYPTO("ghash");
diff --git a/arch/s390/crypto/paes_s390.c b/arch/s390/crypto/paes_s390.c
index c7119c617b6e..55ee5567a5ea 100644
--- a/arch/s390/crypto/paes_s390.c
+++ b/arch/s390/crypto/paes_s390.c
@@ -5,7 +5,7 @@
* s390 implementation of the AES Cipher Algorithm with protected keys.
*
* s390 Version:
- * Copyright IBM Corp. 2017,2019
+ * Copyright IBM Corp. 2017, 2023
* Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
* Harald Freudenberger <freude@de.ibm.com>
*/
@@ -20,7 +20,9 @@
#include <linux/module.h>
#include <linux/cpufeature.h>
#include <linux/init.h>
+#include <linux/mutex.h>
#include <linux/spinlock.h>
+#include <linux/delay.h>
#include <crypto/internal/skcipher.h>
#include <crypto/xts.h>
#include <asm/cpacf.h>
@@ -32,11 +34,11 @@
* is called. As paes can handle different kinds of key blobs
* and padding is also possible, the limits need to be generous.
*/
-#define PAES_MIN_KEYSIZE 64
-#define PAES_MAX_KEYSIZE 256
+#define PAES_MIN_KEYSIZE 16
+#define PAES_MAX_KEYSIZE MAXEP11AESKEYBLOBSIZE
static u8 *ctrblk;
-static DEFINE_SPINLOCK(ctrblk_lock);
+static DEFINE_MUTEX(ctrblk_lock);
static cpacf_mask_t km_functions, kmc_functions, kmctr_functions;
@@ -53,19 +55,46 @@ struct key_blob {
unsigned int keylen;
};
-static inline int _copy_key_to_kb(struct key_blob *kb,
- const u8 *key,
- unsigned int keylen)
-{
- if (keylen <= sizeof(kb->keybuf))
+static inline int _key_to_kb(struct key_blob *kb,
+ const u8 *key,
+ unsigned int keylen)
+{
+ struct clearkey_header {
+ u8 type;
+ u8 res0[3];
+ u8 version;
+ u8 res1[3];
+ u32 keytype;
+ u32 len;
+ } __packed * h;
+
+ switch (keylen) {
+ case 16:
+ case 24:
+ case 32:
+ /* clear key value, prepare pkey clear key token in keybuf */
+ memset(kb->keybuf, 0, sizeof(kb->keybuf));
+ h = (struct clearkey_header *) kb->keybuf;
+ h->version = 0x02; /* TOKVER_CLEAR_KEY */
+ h->keytype = (keylen - 8) >> 3;
+ h->len = keylen;
+ memcpy(kb->keybuf + sizeof(*h), key, keylen);
+ kb->keylen = sizeof(*h) + keylen;
kb->key = kb->keybuf;
- else {
- kb->key = kmalloc(keylen, GFP_KERNEL);
- if (!kb->key)
- return -ENOMEM;
+ break;
+ default:
+ /* other key material, let pkey handle this */
+ if (keylen <= sizeof(kb->keybuf))
+ kb->key = kb->keybuf;
+ else {
+ kb->key = kmalloc(keylen, GFP_KERNEL);
+ if (!kb->key)
+ return -ENOMEM;
+ }
+ memcpy(kb->key, key, keylen);
+ kb->keylen = keylen;
+ break;
}
- memcpy(kb->key, key, keylen);
- kb->keylen = keylen;
return 0;
}
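The keytype field written by _key_to_kb() above encodes the AES key size as (keylen - 8) >> 3, which lines up with the PKEY_KEYTYPE_AES_{128,192,256} constants (1, 2 and 3) used for the function-code selection later in this file. A quick stand-alone check of that arithmetic; illustrative user-space snippet, not part of the patch:

#include <stdio.h>

int main(void)
{
	unsigned int keylen;

	/* clear-key token keytype: 16 -> 1, 24 -> 2, 32 -> 3 */
	for (keylen = 16; keylen <= 32; keylen += 8)
		printf("keylen %u -> keytype %u\n", keylen, (keylen - 8) >> 3);
	return 0;
}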
@@ -74,7 +103,7 @@ static inline void _free_kb_keybuf(struct key_blob *kb)
{
if (kb->key && kb->key != kb->keybuf
&& kb->keylen > sizeof(kb->keybuf)) {
- kfree(kb->key);
+ kfree_sensitive(kb->key);
kb->key = NULL;
}
}
@@ -82,23 +111,29 @@ static inline void _free_kb_keybuf(struct key_blob *kb)
struct s390_paes_ctx {
struct key_blob kb;
struct pkey_protkey pk;
+ spinlock_t pk_lock;
unsigned long fc;
};
struct s390_pxts_ctx {
struct key_blob kb[2];
struct pkey_protkey pk[2];
+ spinlock_t pk_lock;
unsigned long fc;
};
-static inline int __paes_convert_key(struct key_blob *kb,
+static inline int __paes_keyblob2pkey(struct key_blob *kb,
struct pkey_protkey *pk)
{
int i, ret;
/* try three times in case of failure */
for (i = 0; i < 3; i++) {
- ret = pkey_keyblob2pkey(kb->key, kb->keylen, pk);
+ if (i > 0 && ret == -EAGAIN && in_task())
+ if (msleep_interruptible(1000))
+ return -EINTR;
+ ret = pkey_keyblob2pkey(kb->key, kb->keylen,
+ pk->protkey, &pk->len, &pk->type);
if (ret == 0)
break;
}
@@ -106,22 +141,21 @@ static inline int __paes_convert_key(struct key_blob *kb,
return ret;
}
-static int __paes_set_key(struct s390_paes_ctx *ctx)
+static inline int __paes_convert_key(struct s390_paes_ctx *ctx)
{
- unsigned long fc;
-
- if (__paes_convert_key(&ctx->kb, &ctx->pk))
- return -EINVAL;
+ int ret;
+ struct pkey_protkey pkey;
- /* Pick the correct function code based on the protected key type */
- fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? CPACF_KM_PAES_128 :
- (ctx->pk.type == PKEY_KEYTYPE_AES_192) ? CPACF_KM_PAES_192 :
- (ctx->pk.type == PKEY_KEYTYPE_AES_256) ? CPACF_KM_PAES_256 : 0;
+ pkey.len = sizeof(pkey.protkey);
+ ret = __paes_keyblob2pkey(&ctx->kb, &pkey);
+ if (ret)
+ return ret;
- /* Check if the function code is available */
- ctx->fc = (fc && cpacf_test_func(&km_functions, fc)) ? fc : 0;
+ spin_lock_bh(&ctx->pk_lock);
+ memcpy(&ctx->pk, &pkey, sizeof(pkey));
+ spin_unlock_bh(&ctx->pk_lock);
- return ctx->fc ? 0 : -EINVAL;
+ return 0;
}
static int ecb_paes_init(struct crypto_skcipher *tfm)
@@ -129,6 +163,7 @@ static int ecb_paes_init(struct crypto_skcipher *tfm)
struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm);
ctx->kb.key = NULL;
+ spin_lock_init(&ctx->pk_lock);
return 0;
}
@@ -140,6 +175,26 @@ static void ecb_paes_exit(struct crypto_skcipher *tfm)
_free_kb_keybuf(&ctx->kb);
}
+static inline int __ecb_paes_set_key(struct s390_paes_ctx *ctx)
+{
+ int rc;
+ unsigned long fc;
+
+ rc = __paes_convert_key(ctx);
+ if (rc)
+ return rc;
+
+ /* Pick the correct function code based on the protected key type */
+ fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? CPACF_KM_PAES_128 :
+ (ctx->pk.type == PKEY_KEYTYPE_AES_192) ? CPACF_KM_PAES_192 :
+ (ctx->pk.type == PKEY_KEYTYPE_AES_256) ? CPACF_KM_PAES_256 : 0;
+
+ /* Check if the function code is available */
+ ctx->fc = (fc && cpacf_test_func(&km_functions, fc)) ? fc : 0;
+
+ return ctx->fc ? 0 : -EINVAL;
+}
+
static int ecb_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key,
unsigned int key_len)
{
@@ -147,15 +202,11 @@ static int ecb_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key,
struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm);
_free_kb_keybuf(&ctx->kb);
- rc = _copy_key_to_kb(&ctx->kb, in_key, key_len);
+ rc = _key_to_kb(&ctx->kb, in_key, key_len);
if (rc)
return rc;
- if (__paes_set_key(ctx)) {
- crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
- return -EINVAL;
- }
- return 0;
+ return __ecb_paes_set_key(ctx);
}
static int ecb_paes_crypt(struct skcipher_request *req, unsigned long modifier)
@@ -165,18 +216,31 @@ static int ecb_paes_crypt(struct skcipher_request *req, unsigned long modifier)
struct skcipher_walk walk;
unsigned int nbytes, n, k;
int ret;
+ struct {
+ u8 key[MAXPROTKEYSIZE];
+ } param;
ret = skcipher_walk_virt(&walk, req, false);
+ if (ret)
+ return ret;
+
+ spin_lock_bh(&ctx->pk_lock);
+ memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE);
+ spin_unlock_bh(&ctx->pk_lock);
+
while ((nbytes = walk.nbytes) != 0) {
/* only use complete blocks */
n = nbytes & ~(AES_BLOCK_SIZE - 1);
- k = cpacf_km(ctx->fc | modifier, ctx->pk.protkey,
+ k = cpacf_km(ctx->fc | modifier, &param,
walk.dst.virt.addr, walk.src.virt.addr, n);
if (k)
ret = skcipher_walk_done(&walk, nbytes - k);
if (k < n) {
- if (__paes_set_key(ctx) != 0)
+ if (__paes_convert_key(ctx))
return skcipher_walk_done(&walk, -EIO);
+ spin_lock_bh(&ctx->pk_lock);
+ memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE);
+ spin_unlock_bh(&ctx->pk_lock);
}
}
return ret;
@@ -214,6 +278,7 @@ static int cbc_paes_init(struct crypto_skcipher *tfm)
struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm);
ctx->kb.key = NULL;
+ spin_lock_init(&ctx->pk_lock);
return 0;
}
@@ -225,12 +290,14 @@ static void cbc_paes_exit(struct crypto_skcipher *tfm)
_free_kb_keybuf(&ctx->kb);
}
-static int __cbc_paes_set_key(struct s390_paes_ctx *ctx)
+static inline int __cbc_paes_set_key(struct s390_paes_ctx *ctx)
{
+ int rc;
unsigned long fc;
- if (__paes_convert_key(&ctx->kb, &ctx->pk))
- return -EINVAL;
+ rc = __paes_convert_key(ctx);
+ if (rc)
+ return rc;
/* Pick the correct function code based on the protected key type */
fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? CPACF_KMC_PAES_128 :
@@ -250,15 +317,11 @@ static int cbc_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key,
struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm);
_free_kb_keybuf(&ctx->kb);
- rc = _copy_key_to_kb(&ctx->kb, in_key, key_len);
+ rc = _key_to_kb(&ctx->kb, in_key, key_len);
if (rc)
return rc;
- if (__cbc_paes_set_key(ctx)) {
- crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
- return -EINVAL;
- }
- return 0;
+ return __cbc_paes_set_key(ctx);
}
static int cbc_paes_crypt(struct skcipher_request *req, unsigned long modifier)
@@ -276,8 +339,12 @@ static int cbc_paes_crypt(struct skcipher_request *req, unsigned long modifier)
ret = skcipher_walk_virt(&walk, req, false);
if (ret)
return ret;
+
memcpy(param.iv, walk.iv, AES_BLOCK_SIZE);
+ spin_lock_bh(&ctx->pk_lock);
memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE);
+ spin_unlock_bh(&ctx->pk_lock);
+
while ((nbytes = walk.nbytes) != 0) {
/* only use complete blocks */
n = nbytes & ~(AES_BLOCK_SIZE - 1);
@@ -288,9 +355,11 @@ static int cbc_paes_crypt(struct skcipher_request *req, unsigned long modifier)
ret = skcipher_walk_done(&walk, nbytes - k);
}
if (k < n) {
- if (__cbc_paes_set_key(ctx) != 0)
+ if (__paes_convert_key(ctx))
return skcipher_walk_done(&walk, -EIO);
+ spin_lock_bh(&ctx->pk_lock);
memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE);
+ spin_unlock_bh(&ctx->pk_lock);
}
}
return ret;
@@ -330,6 +399,7 @@ static int xts_paes_init(struct crypto_skcipher *tfm)
ctx->kb[0].key = NULL;
ctx->kb[1].key = NULL;
+ spin_lock_init(&ctx->pk_lock);
return 0;
}
@@ -342,12 +412,30 @@ static void xts_paes_exit(struct crypto_skcipher *tfm)
_free_kb_keybuf(&ctx->kb[1]);
}
-static int __xts_paes_set_key(struct s390_pxts_ctx *ctx)
+static inline int __xts_paes_convert_key(struct s390_pxts_ctx *ctx)
+{
+ struct pkey_protkey pkey0, pkey1;
+
+ pkey0.len = sizeof(pkey0.protkey);
+ pkey1.len = sizeof(pkey1.protkey);
+
+ if (__paes_keyblob2pkey(&ctx->kb[0], &pkey0) ||
+ __paes_keyblob2pkey(&ctx->kb[1], &pkey1))
+ return -EINVAL;
+
+ spin_lock_bh(&ctx->pk_lock);
+ memcpy(&ctx->pk[0], &pkey0, sizeof(pkey0));
+ memcpy(&ctx->pk[1], &pkey1, sizeof(pkey1));
+ spin_unlock_bh(&ctx->pk_lock);
+
+ return 0;
+}
+
+static inline int __xts_paes_set_key(struct s390_pxts_ctx *ctx)
{
unsigned long fc;
- if (__paes_convert_key(&ctx->kb[0], &ctx->pk[0]) ||
- __paes_convert_key(&ctx->kb[1], &ctx->pk[1]))
+ if (__xts_paes_convert_key(ctx))
return -EINVAL;
if (ctx->pk[0].type != ctx->pk[1].type)
@@ -379,20 +467,19 @@ static int xts_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key,
_free_kb_keybuf(&ctx->kb[0]);
_free_kb_keybuf(&ctx->kb[1]);
- rc = _copy_key_to_kb(&ctx->kb[0], in_key, key_len);
+ rc = _key_to_kb(&ctx->kb[0], in_key, key_len);
if (rc)
return rc;
- rc = _copy_key_to_kb(&ctx->kb[1], in_key + key_len, key_len);
+ rc = _key_to_kb(&ctx->kb[1], in_key + key_len, key_len);
if (rc)
return rc;
- if (__xts_paes_set_key(ctx)) {
- crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
- return -EINVAL;
- }
+ rc = __xts_paes_set_key(ctx);
+ if (rc)
+ return rc;
/*
- * xts_check_key verifies the key length is not odd and makes
+ * xts_verify_key verifies the key length is not odd and makes
* sure that the two keys are not the same. This can be done
* on the two protected keys as well
*/
@@ -425,15 +512,17 @@ static int xts_paes_crypt(struct skcipher_request *req, unsigned long modifier)
ret = skcipher_walk_virt(&walk, req, false);
if (ret)
return ret;
+
keylen = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? 48 : 64;
offset = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? 16 : 0;
-retry:
+
memset(&pcc_param, 0, sizeof(pcc_param));
memcpy(pcc_param.tweak, walk.iv, sizeof(pcc_param.tweak));
+ spin_lock_bh(&ctx->pk_lock);
memcpy(pcc_param.key + offset, ctx->pk[1].protkey, keylen);
- cpacf_pcc(ctx->fc, pcc_param.key + offset);
-
memcpy(xts_param.key + offset, ctx->pk[0].protkey, keylen);
+ spin_unlock_bh(&ctx->pk_lock);
+ cpacf_pcc(ctx->fc, pcc_param.key + offset);
memcpy(xts_param.init, pcc_param.xts, 16);
while ((nbytes = walk.nbytes) != 0) {
@@ -444,11 +533,15 @@ retry:
if (k)
ret = skcipher_walk_done(&walk, nbytes - k);
if (k < n) {
- if (__xts_paes_set_key(ctx) != 0)
+ if (__xts_paes_convert_key(ctx))
return skcipher_walk_done(&walk, -EIO);
- goto retry;
+ spin_lock_bh(&ctx->pk_lock);
+ memcpy(xts_param.key + offset,
+ ctx->pk[0].protkey, keylen);
+ spin_unlock_bh(&ctx->pk_lock);
}
}
+
return ret;
}
@@ -485,6 +578,7 @@ static int ctr_paes_init(struct crypto_skcipher *tfm)
struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm);
ctx->kb.key = NULL;
+ spin_lock_init(&ctx->pk_lock);
return 0;
}
@@ -496,12 +590,14 @@ static void ctr_paes_exit(struct crypto_skcipher *tfm)
_free_kb_keybuf(&ctx->kb);
}
-static int __ctr_paes_set_key(struct s390_paes_ctx *ctx)
+static inline int __ctr_paes_set_key(struct s390_paes_ctx *ctx)
{
+ int rc;
unsigned long fc;
- if (__paes_convert_key(&ctx->kb, &ctx->pk))
- return -EINVAL;
+ rc = __paes_convert_key(ctx);
+ if (rc)
+ return rc;
/* Pick the correct function code based on the protected key type */
fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? CPACF_KMCTR_PAES_128 :
@@ -522,15 +618,11 @@ static int ctr_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key,
struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm);
_free_kb_keybuf(&ctx->kb);
- rc = _copy_key_to_kb(&ctx->kb, in_key, key_len);
+ rc = _key_to_kb(&ctx->kb, in_key, key_len);
if (rc)
return rc;
- if (__ctr_paes_set_key(ctx)) {
- crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
- return -EINVAL;
- }
- return 0;
+ return __ctr_paes_set_key(ctx);
}
static unsigned int __ctrblk_init(u8 *ctrptr, u8 *iv, unsigned int nbytes)
@@ -556,49 +648,67 @@ static int ctr_paes_crypt(struct skcipher_request *req)
struct skcipher_walk walk;
unsigned int nbytes, n, k;
int ret, locked;
-
- locked = spin_trylock(&ctrblk_lock);
+ struct {
+ u8 key[MAXPROTKEYSIZE];
+ } param;
ret = skcipher_walk_virt(&walk, req, false);
+ if (ret)
+ return ret;
+
+ spin_lock_bh(&ctx->pk_lock);
+ memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE);
+ spin_unlock_bh(&ctx->pk_lock);
+
+ locked = mutex_trylock(&ctrblk_lock);
+
while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
n = AES_BLOCK_SIZE;
if (nbytes >= 2*AES_BLOCK_SIZE && locked)
n = __ctrblk_init(ctrblk, walk.iv, nbytes);
ctrptr = (n > AES_BLOCK_SIZE) ? ctrblk : walk.iv;
- k = cpacf_kmctr(ctx->fc, ctx->pk.protkey, walk.dst.virt.addr,
+ k = cpacf_kmctr(ctx->fc, &param, walk.dst.virt.addr,
walk.src.virt.addr, n, ctrptr);
if (k) {
if (ctrptr == ctrblk)
memcpy(walk.iv, ctrptr + k - AES_BLOCK_SIZE,
AES_BLOCK_SIZE);
crypto_inc(walk.iv, AES_BLOCK_SIZE);
- ret = skcipher_walk_done(&walk, nbytes - n);
+ ret = skcipher_walk_done(&walk, nbytes - k);
}
if (k < n) {
- if (__ctr_paes_set_key(ctx) != 0) {
+ if (__paes_convert_key(ctx)) {
if (locked)
- spin_unlock(&ctrblk_lock);
+ mutex_unlock(&ctrblk_lock);
return skcipher_walk_done(&walk, -EIO);
}
+ spin_lock_bh(&ctx->pk_lock);
+ memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE);
+ spin_unlock_bh(&ctx->pk_lock);
}
}
if (locked)
- spin_unlock(&ctrblk_lock);
+ mutex_unlock(&ctrblk_lock);
/*
* final block may be < AES_BLOCK_SIZE, copy only nbytes
*/
if (nbytes) {
+ memset(buf, 0, AES_BLOCK_SIZE);
+ memcpy(buf, walk.src.virt.addr, nbytes);
while (1) {
- if (cpacf_kmctr(ctx->fc, ctx->pk.protkey, buf,
- walk.src.virt.addr, AES_BLOCK_SIZE,
+ if (cpacf_kmctr(ctx->fc, &param, buf,
+ buf, AES_BLOCK_SIZE,
walk.iv) == AES_BLOCK_SIZE)
break;
- if (__ctr_paes_set_key(ctx) != 0)
+ if (__paes_convert_key(ctx))
return skcipher_walk_done(&walk, -EIO);
+ spin_lock_bh(&ctx->pk_lock);
+ memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE);
+ spin_unlock_bh(&ctx->pk_lock);
}
memcpy(walk.dst.virt.addr, buf, nbytes);
crypto_inc(walk.iv, AES_BLOCK_SIZE);
- ret = skcipher_walk_done(&walk, 0);
+ ret = skcipher_walk_done(&walk, nbytes);
}
return ret;
@@ -631,12 +741,12 @@ static inline void __crypto_unregister_skcipher(struct skcipher_alg *alg)
static void paes_s390_fini(void)
{
- if (ctrblk)
- free_page((unsigned long) ctrblk);
__crypto_unregister_skcipher(&ctr_paes_alg);
__crypto_unregister_skcipher(&xts_paes_alg);
__crypto_unregister_skcipher(&cbc_paes_alg);
__crypto_unregister_skcipher(&ecb_paes_alg);
+ if (ctrblk)
+ free_page((unsigned long) ctrblk);
}
static int __init paes_s390_init(void)
@@ -674,14 +784,14 @@ static int __init paes_s390_init(void)
if (cpacf_test_func(&kmctr_functions, CPACF_KMCTR_PAES_128) ||
cpacf_test_func(&kmctr_functions, CPACF_KMCTR_PAES_192) ||
cpacf_test_func(&kmctr_functions, CPACF_KMCTR_PAES_256)) {
- ret = crypto_register_skcipher(&ctr_paes_alg);
- if (ret)
- goto out_err;
ctrblk = (u8 *) __get_free_page(GFP_KERNEL);
if (!ctrblk) {
ret = -ENOMEM;
goto out_err;
}
+ ret = crypto_register_skcipher(&ctr_paes_alg);
+ if (ret)
+ goto out_err;
}
return 0;
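All of the reworked crypt paths above follow the same pattern: copy the current protected key into an on-stack parameter block under pk_lock, issue the CPACF instruction with that copy, and treat a short completion as a stale protected key, which triggers a re-conversion of the key blob and a refresh of the copy. A condensed, hedged sketch of that pattern (illustrative only; the real functions additionally drive the skcipher_walk state and pick per-mode function codes):

struct paes_km_param {
	u8 key[MAXPROTKEYSIZE];
};

static int paes_km_retry_sketch(struct s390_paes_ctx *ctx, unsigned long fc,
				u8 *dst, const u8 *src, unsigned int n)
{
	struct paes_km_param param;
	unsigned int k;

	spin_lock_bh(&ctx->pk_lock);
	memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE);
	spin_unlock_bh(&ctx->pk_lock);

	for (;;) {
		k = cpacf_km(fc, &param, dst, src, n);
		if (k == n)
			return 0;
		/* Short completion: the protected key became invalid, e.g.
		 * after guest relocation. Re-derive it from the key blob
		 * and retry the remaining bytes with a fresh copy. */
		if (__paes_convert_key(ctx))
			return -EIO;
		spin_lock_bh(&ctx->pk_lock);
		memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE);
		spin_unlock_bh(&ctx->pk_lock);
		dst += k;
		src += k;
		n -= k;
	}
}

Working on a local copy keeps pk_lock held only for the memcpy, never across a CPACF call, and keeps the sleeping conversion path (pkey_keyblob2pkey() plus the msleep_interruptible() retry) entirely outside of any spinlocked region.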
diff --git a/arch/s390/crypto/prng.c b/arch/s390/crypto/prng.c
index d977643fa627..a077087bc6cc 100644
--- a/arch/s390/crypto/prng.c
+++ b/arch/s390/crypto/prng.c
@@ -249,7 +249,7 @@ static void prng_tdes_deinstantiate(void)
{
pr_debug("The prng module stopped "
"after running in triple DES mode\n");
- kzfree(prng_data);
+ kfree_sensitive(prng_data);
}
@@ -414,7 +414,7 @@ static int __init prng_sha512_instantiate(void)
}
/* append the seed by 16 bytes of unique nonce */
- get_tod_clock_ext(seed + seedlen);
+ store_tod_clock_ext((union tod_clock *)(seed + seedlen));
seedlen += 16;
/* now initial seed of the prno drng */
@@ -442,7 +442,7 @@ outfree:
static void prng_sha512_deinstantiate(void)
{
pr_debug("The prng module stopped after running in SHA-512 mode\n");
- kzfree(prng_data);
+ kfree_sensitive(prng_data);
}
@@ -528,7 +528,7 @@ static ssize_t prng_tdes_read(struct file *file, char __user *ubuf,
/* give mutex free before calling schedule() */
mutex_unlock(&prng_data->mutex);
schedule();
- /* occopy mutex again */
+ /* occupy mutex again */
if (mutex_lock_interruptible(&prng_data->mutex)) {
if (ret == 0)
ret = -ERESTARTSYS;
@@ -674,26 +674,12 @@ static const struct file_operations prng_tdes_fops = {
.llseek = noop_llseek,
};
-static struct miscdevice prng_sha512_dev = {
- .name = "prandom",
- .minor = MISC_DYNAMIC_MINOR,
- .mode = 0644,
- .fops = &prng_sha512_fops,
-};
-static struct miscdevice prng_tdes_dev = {
- .name = "prandom",
- .minor = MISC_DYNAMIC_MINOR,
- .mode = 0644,
- .fops = &prng_tdes_fops,
-};
-
-
/* chunksize attribute (ro) */
static ssize_t prng_chunksize_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%u\n", prng_chunk_size);
+ return scnprintf(buf, PAGE_SIZE, "%u\n", prng_chunk_size);
}
static DEVICE_ATTR(chunksize, 0444, prng_chunksize_show, NULL);
@@ -712,7 +698,7 @@ static ssize_t prng_counter_show(struct device *dev,
counter = prng_data->prngws.byte_counter;
mutex_unlock(&prng_data->mutex);
- return snprintf(buf, PAGE_SIZE, "%llu\n", counter);
+ return scnprintf(buf, PAGE_SIZE, "%llu\n", counter);
}
static DEVICE_ATTR(byte_counter, 0444, prng_counter_show, NULL);
@@ -721,7 +707,7 @@ static ssize_t prng_errorflag_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%d\n", prng_errorflag);
+ return scnprintf(buf, PAGE_SIZE, "%d\n", prng_errorflag);
}
static DEVICE_ATTR(errorflag, 0444, prng_errorflag_show, NULL);
@@ -731,9 +717,9 @@ static ssize_t prng_mode_show(struct device *dev,
char *buf)
{
if (prng_mode == PRNG_MODE_TDES)
- return snprintf(buf, PAGE_SIZE, "TDES\n");
+ return scnprintf(buf, PAGE_SIZE, "TDES\n");
else
- return snprintf(buf, PAGE_SIZE, "SHA512\n");
+ return scnprintf(buf, PAGE_SIZE, "SHA512\n");
}
static DEVICE_ATTR(mode, 0444, prng_mode_show, NULL);
@@ -756,7 +742,7 @@ static ssize_t prng_reseed_limit_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%u\n", prng_reseed_limit);
+ return scnprintf(buf, PAGE_SIZE, "%u\n", prng_reseed_limit);
}
static ssize_t prng_reseed_limit_store(struct device *dev,
struct device_attribute *attr,
@@ -787,7 +773,7 @@ static ssize_t prng_strength_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "256\n");
+ return scnprintf(buf, PAGE_SIZE, "256\n");
}
static DEVICE_ATTR(strength, 0444, prng_strength_show, NULL);
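
The snprintf() to scnprintf() conversions above matter for sysfs show() callbacks: scnprintf() returns the number of bytes actually written to the buffer (excluding the terminating NUL), while snprintf() returns the length that would have been written, which can exceed PAGE_SIZE. A one-line illustration (example_show() is not part of the patch):

	static ssize_t example_show(struct device *dev,
				    struct device_attribute *attr, char *buf)
	{
		/* never returns more than PAGE_SIZE - 1 */
		return scnprintf(buf, PAGE_SIZE, "%u\n", 256U);
	}
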
@@ -801,18 +787,30 @@ static struct attribute *prng_sha512_dev_attrs[] = {
&dev_attr_strength.attr,
NULL
};
+ATTRIBUTE_GROUPS(prng_sha512_dev);
+
static struct attribute *prng_tdes_dev_attrs[] = {
&dev_attr_chunksize.attr,
&dev_attr_byte_counter.attr,
&dev_attr_mode.attr,
NULL
};
+ATTRIBUTE_GROUPS(prng_tdes_dev);
-static struct attribute_group prng_sha512_dev_attr_group = {
- .attrs = prng_sha512_dev_attrs
+static struct miscdevice prng_sha512_dev = {
+ .name = "prandom",
+ .minor = MISC_DYNAMIC_MINOR,
+ .mode = 0644,
+ .fops = &prng_sha512_fops,
+ .groups = prng_sha512_dev_groups,
};
-static struct attribute_group prng_tdes_dev_attr_group = {
- .attrs = prng_tdes_dev_attrs
+
+static struct miscdevice prng_tdes_dev = {
+ .name = "prandom",
+ .minor = MISC_DYNAMIC_MINOR,
+ .mode = 0644,
+ .fops = &prng_tdes_fops,
+ .groups = prng_tdes_dev_groups,
};
@@ -867,13 +865,6 @@ static int __init prng_init(void)
prng_sha512_deinstantiate();
goto out;
}
- ret = sysfs_create_group(&prng_sha512_dev.this_device->kobj,
- &prng_sha512_dev_attr_group);
- if (ret) {
- misc_deregister(&prng_sha512_dev);
- prng_sha512_deinstantiate();
- goto out;
- }
} else {
@@ -898,14 +889,6 @@ static int __init prng_init(void)
prng_tdes_deinstantiate();
goto out;
}
- ret = sysfs_create_group(&prng_tdes_dev.this_device->kobj,
- &prng_tdes_dev_attr_group);
- if (ret) {
- misc_deregister(&prng_tdes_dev);
- prng_tdes_deinstantiate();
- goto out;
- }
-
}
out:
@@ -916,17 +899,13 @@ out:
static void __exit prng_exit(void)
{
if (prng_mode == PRNG_MODE_SHA512) {
- sysfs_remove_group(&prng_sha512_dev.this_device->kobj,
- &prng_sha512_dev_attr_group);
misc_deregister(&prng_sha512_dev);
prng_sha512_deinstantiate();
} else {
- sysfs_remove_group(&prng_tdes_dev.this_device->kobj,
- &prng_tdes_dev_attr_group);
misc_deregister(&prng_tdes_dev);
prng_tdes_deinstantiate();
}
}
-module_cpu_feature_match(MSA, prng_init);
+module_cpu_feature_match(S390_CPU_FEATURE_MSA, prng_init);
module_exit(prng_exit);
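
The restructuring above lets the misc core create and remove the sysfs attributes itself: ATTRIBUTE_GROUPS(<name>) generates a NULL-terminated <name>_groups array from <name>_attrs, and misc_register() wires it up through the .groups member, so the manual sysfs_create_group()/sysfs_remove_group() calls and their error paths disappear. A minimal sketch with assumed names (example_fops and the "example" device are hypothetical):

	static struct attribute *example_dev_attrs[] = {
		&dev_attr_chunksize.attr,
		NULL
	};
	ATTRIBUTE_GROUPS(example_dev);	/* defines example_dev_groups[] */

	static struct miscdevice example_dev = {
		.name	= "example",
		.minor	= MISC_DYNAMIC_MINOR,
		.fops	= &example_fops,
		.groups	= example_dev_groups,	/* registered by misc_register() */
	};
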
diff --git a/arch/s390/crypto/sha.h b/arch/s390/crypto/sha.h
index ada2f98c27b7..65ea12fc87a1 100644
--- a/arch/s390/crypto/sha.h
+++ b/arch/s390/crypto/sha.h
@@ -11,7 +11,8 @@
#define _CRYPTO_ARCH_S390_SHA_H
#include <linux/crypto.h>
-#include <crypto/sha.h>
+#include <crypto/sha1.h>
+#include <crypto/sha2.h>
#include <crypto/sha3.h>
/* must be big enough for the largest SHA variant */
diff --git a/arch/s390/crypto/sha1_s390.c b/arch/s390/crypto/sha1_s390.c
index 7c15542d3685..bc3a22704e09 100644
--- a/arch/s390/crypto/sha1_s390.c
+++ b/arch/s390/crypto/sha1_s390.c
@@ -22,12 +22,12 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/cpufeature.h>
-#include <crypto/sha.h>
+#include <crypto/sha1.h>
#include <asm/cpacf.h>
#include "sha.h"
-static int sha1_init(struct shash_desc *desc)
+static int s390_sha1_init(struct shash_desc *desc)
{
struct s390_sha_ctx *sctx = shash_desc_ctx(desc);
@@ -42,7 +42,7 @@ static int sha1_init(struct shash_desc *desc)
return 0;
}
-static int sha1_export(struct shash_desc *desc, void *out)
+static int s390_sha1_export(struct shash_desc *desc, void *out)
{
struct s390_sha_ctx *sctx = shash_desc_ctx(desc);
struct sha1_state *octx = out;
@@ -53,7 +53,7 @@ static int sha1_export(struct shash_desc *desc, void *out)
return 0;
}
-static int sha1_import(struct shash_desc *desc, const void *in)
+static int s390_sha1_import(struct shash_desc *desc, const void *in)
{
struct s390_sha_ctx *sctx = shash_desc_ctx(desc);
const struct sha1_state *ictx = in;
@@ -67,11 +67,11 @@ static int sha1_import(struct shash_desc *desc, const void *in)
static struct shash_alg alg = {
.digestsize = SHA1_DIGEST_SIZE,
- .init = sha1_init,
+ .init = s390_sha1_init,
.update = s390_sha_update,
.final = s390_sha_final,
- .export = sha1_export,
- .import = sha1_import,
+ .export = s390_sha1_export,
+ .import = s390_sha1_import,
.descsize = sizeof(struct s390_sha_ctx),
.statesize = sizeof(struct sha1_state),
.base = {
@@ -95,7 +95,7 @@ static void __exit sha1_s390_fini(void)
crypto_unregister_shash(&alg);
}
-module_cpu_feature_match(MSA, sha1_s390_init);
+module_cpu_feature_match(S390_CPU_FEATURE_MSA, sha1_s390_init);
module_exit(sha1_s390_fini);
MODULE_ALIAS_CRYPTO("sha1");
diff --git a/arch/s390/crypto/sha256_s390.c b/arch/s390/crypto/sha256_s390.c
index b52c87e44939..6f1ccdf93d3e 100644
--- a/arch/s390/crypto/sha256_s390.c
+++ b/arch/s390/crypto/sha256_s390.c
@@ -12,7 +12,7 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/cpufeature.h>
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
#include <asm/cpacf.h>
#include "sha.h"
@@ -134,7 +134,7 @@ static void __exit sha256_s390_fini(void)
crypto_unregister_shash(&sha256_alg);
}
-module_cpu_feature_match(MSA, sha256_s390_init);
+module_cpu_feature_match(S390_CPU_FEATURE_MSA, sha256_s390_init);
module_exit(sha256_s390_fini);
MODULE_ALIAS_CRYPTO("sha256");
diff --git a/arch/s390/crypto/sha3_256_s390.c b/arch/s390/crypto/sha3_256_s390.c
index 460cbbbaa44a..e1350e033a32 100644
--- a/arch/s390/crypto/sha3_256_s390.c
+++ b/arch/s390/crypto/sha3_256_s390.c
@@ -12,7 +12,6 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/cpufeature.h>
-#include <crypto/sha.h>
#include <crypto/sha3.h>
#include <asm/cpacf.h>
@@ -138,7 +137,7 @@ static void __exit sha3_256_s390_fini(void)
crypto_unregister_shash(&sha3_256_alg);
}
-module_cpu_feature_match(MSA, sha3_256_s390_init);
+module_cpu_feature_match(S390_CPU_FEATURE_MSA, sha3_256_s390_init);
module_exit(sha3_256_s390_fini);
MODULE_ALIAS_CRYPTO("sha3-256");
diff --git a/arch/s390/crypto/sha3_512_s390.c b/arch/s390/crypto/sha3_512_s390.c
index 72cf460a53e5..06c142ed9bb1 100644
--- a/arch/s390/crypto/sha3_512_s390.c
+++ b/arch/s390/crypto/sha3_512_s390.c
@@ -11,7 +11,6 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/cpufeature.h>
-#include <crypto/sha.h>
#include <crypto/sha3.h>
#include <asm/cpacf.h>
@@ -148,7 +147,7 @@ static void __exit fini(void)
crypto_unregister_shash(&sha3_384_alg);
}
-module_cpu_feature_match(MSA, init);
+module_cpu_feature_match(S390_CPU_FEATURE_MSA, init);
module_exit(fini);
MODULE_LICENSE("GPL");
diff --git a/arch/s390/crypto/sha512_s390.c b/arch/s390/crypto/sha512_s390.c
index ad29db085a18..04f11c407763 100644
--- a/arch/s390/crypto/sha512_s390.c
+++ b/arch/s390/crypto/sha512_s390.c
@@ -8,7 +8,7 @@
* Author(s): Jan Glauber (jang@de.ibm.com)
*/
#include <crypto/internal/hash.h>
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/kernel.h>
@@ -22,14 +22,14 @@ static int sha512_init(struct shash_desc *desc)
{
struct s390_sha_ctx *ctx = shash_desc_ctx(desc);
- *(__u64 *)&ctx->state[0] = 0x6a09e667f3bcc908ULL;
- *(__u64 *)&ctx->state[2] = 0xbb67ae8584caa73bULL;
- *(__u64 *)&ctx->state[4] = 0x3c6ef372fe94f82bULL;
- *(__u64 *)&ctx->state[6] = 0xa54ff53a5f1d36f1ULL;
- *(__u64 *)&ctx->state[8] = 0x510e527fade682d1ULL;
- *(__u64 *)&ctx->state[10] = 0x9b05688c2b3e6c1fULL;
- *(__u64 *)&ctx->state[12] = 0x1f83d9abfb41bd6bULL;
- *(__u64 *)&ctx->state[14] = 0x5be0cd19137e2179ULL;
+ *(__u64 *)&ctx->state[0] = SHA512_H0;
+ *(__u64 *)&ctx->state[2] = SHA512_H1;
+ *(__u64 *)&ctx->state[4] = SHA512_H2;
+ *(__u64 *)&ctx->state[6] = SHA512_H3;
+ *(__u64 *)&ctx->state[8] = SHA512_H4;
+ *(__u64 *)&ctx->state[10] = SHA512_H5;
+ *(__u64 *)&ctx->state[12] = SHA512_H6;
+ *(__u64 *)&ctx->state[14] = SHA512_H7;
ctx->count = 0;
ctx->func = CPACF_KIMD_SHA_512;
@@ -87,14 +87,14 @@ static int sha384_init(struct shash_desc *desc)
{
struct s390_sha_ctx *ctx = shash_desc_ctx(desc);
- *(__u64 *)&ctx->state[0] = 0xcbbb9d5dc1059ed8ULL;
- *(__u64 *)&ctx->state[2] = 0x629a292a367cd507ULL;
- *(__u64 *)&ctx->state[4] = 0x9159015a3070dd17ULL;
- *(__u64 *)&ctx->state[6] = 0x152fecd8f70e5939ULL;
- *(__u64 *)&ctx->state[8] = 0x67332667ffc00b31ULL;
- *(__u64 *)&ctx->state[10] = 0x8eb44a8768581511ULL;
- *(__u64 *)&ctx->state[12] = 0xdb0c2e0d64f98fa7ULL;
- *(__u64 *)&ctx->state[14] = 0x47b5481dbefa4fa4ULL;
+ *(__u64 *)&ctx->state[0] = SHA384_H0;
+ *(__u64 *)&ctx->state[2] = SHA384_H1;
+ *(__u64 *)&ctx->state[4] = SHA384_H2;
+ *(__u64 *)&ctx->state[6] = SHA384_H3;
+ *(__u64 *)&ctx->state[8] = SHA384_H4;
+ *(__u64 *)&ctx->state[10] = SHA384_H5;
+ *(__u64 *)&ctx->state[12] = SHA384_H6;
+ *(__u64 *)&ctx->state[14] = SHA384_H7;
ctx->count = 0;
ctx->func = CPACF_KIMD_SHA_512;
@@ -142,7 +142,7 @@ static void __exit fini(void)
crypto_unregister_shash(&sha384_alg);
}
-module_cpu_feature_match(MSA, init);
+module_cpu_feature_match(S390_CPU_FEATURE_MSA, init);
module_exit(fini);
MODULE_LICENSE("GPL");
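
The SHA384_H*/SHA512_H* macros come from <crypto/sha2.h> and are the FIPS 180 initial hash values, so the substitution above is purely cosmetic. A compile-time sanity check illustrating the equivalence (not part of the patch):

	#include <linux/build_bug.h>
	#include <crypto/sha2.h>

	static inline void sha512_iv_sanity_check(void)
	{
		BUILD_BUG_ON(SHA512_H0 != 0x6a09e667f3bcc908ULL);
		BUILD_BUG_ON(SHA384_H0 != 0xcbbb9d5dc1059ed8ULL);
	}
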
diff --git a/arch/s390/hypfs/Makefile b/arch/s390/hypfs/Makefile
index 06f601509ce9..c34854d298f8 100644
--- a/arch/s390/hypfs/Makefile
+++ b/arch/s390/hypfs/Makefile
@@ -3,7 +3,12 @@
# Makefile for the linux hypfs filesystem routines.
#
-obj-$(CONFIG_S390_HYPFS_FS) += s390_hypfs.o
+obj-$(CONFIG_S390_HYPFS) += hypfs_dbfs.o
+obj-$(CONFIG_S390_HYPFS) += hypfs_diag.o
+obj-$(CONFIG_S390_HYPFS) += hypfs_diag0c.o
+obj-$(CONFIG_S390_HYPFS) += hypfs_sprp.o
+obj-$(CONFIG_S390_HYPFS) += hypfs_vm.o
-s390_hypfs-objs := inode.o hypfs_diag.o hypfs_vm.o hypfs_dbfs.o hypfs_sprp.o
-s390_hypfs-objs += hypfs_diag0c.o
+obj-$(CONFIG_S390_HYPFS_FS) += hypfs_diag_fs.o
+obj-$(CONFIG_S390_HYPFS_FS) += hypfs_vm_fs.o
+obj-$(CONFIG_S390_HYPFS_FS) += inode.o
diff --git a/arch/s390/hypfs/hypfs.h b/arch/s390/hypfs/hypfs.h
index 05f3f9aee5fc..65f4036fd541 100644
--- a/arch/s390/hypfs/hypfs.h
+++ b/arch/s390/hypfs/hypfs.h
@@ -46,6 +46,15 @@ void hypfs_diag0c_exit(void);
void hypfs_sprp_init(void);
void hypfs_sprp_exit(void);
+int __hypfs_fs_init(void);
+
+static inline int hypfs_fs_init(void)
+{
+ if (IS_ENABLED(CONFIG_S390_HYPFS_FS))
+ return __hypfs_fs_init();
+ return 0;
+}
+
/* debugfs interface */
struct hypfs_dbfs_file;
@@ -69,7 +78,6 @@ struct hypfs_dbfs_file {
struct dentry *dentry;
};
-extern void hypfs_dbfs_init(void);
extern void hypfs_dbfs_exit(void);
extern void hypfs_dbfs_create_file(struct hypfs_dbfs_file *df);
extern void hypfs_dbfs_remove_file(struct hypfs_dbfs_file *df);
diff --git a/arch/s390/hypfs/hypfs_dbfs.c b/arch/s390/hypfs/hypfs_dbfs.c
index f4c7dbfaf8ee..4024599eb448 100644
--- a/arch/s390/hypfs/hypfs_dbfs.c
+++ b/arch/s390/hypfs/hypfs_dbfs.c
@@ -90,12 +90,33 @@ void hypfs_dbfs_remove_file(struct hypfs_dbfs_file *df)
debugfs_remove(df->dentry);
}
-void hypfs_dbfs_init(void)
+static int __init hypfs_dbfs_init(void)
{
- dbfs_dir = debugfs_create_dir("s390_hypfs", NULL);
-}
+ int rc = -ENODATA;
-void hypfs_dbfs_exit(void)
-{
+ dbfs_dir = debugfs_create_dir("s390_hypfs", NULL);
+ if (hypfs_diag_init())
+ goto fail_dbfs_exit;
+ if (hypfs_vm_init())
+ goto fail_hypfs_diag_exit;
+ hypfs_sprp_init();
+ if (hypfs_diag0c_init())
+ goto fail_hypfs_sprp_exit;
+ rc = hypfs_fs_init();
+ if (rc)
+ goto fail_hypfs_diag0c_exit;
+ return 0;
+
+fail_hypfs_diag0c_exit:
+ hypfs_diag0c_exit();
+fail_hypfs_sprp_exit:
+ hypfs_sprp_exit();
+ hypfs_vm_exit();
+fail_hypfs_diag_exit:
+ hypfs_diag_exit();
+ pr_err("Initialization of hypfs failed with rc=%i\n", rc);
+fail_dbfs_exit:
debugfs_remove(dbfs_dir);
+ return rc;
}
+device_initcall(hypfs_dbfs_init)
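
The consolidated initcall above follows the usual reverse-order unwinding convention: every sub-module that initialized successfully is torn down when a later step fails, so a partial init never leaks resources. Generic shape of the pattern (step_a/step_b are placeholder helpers, not hypfs functions):

	static int __init example_init(void)
	{
		int rc;

		rc = step_a_init();
		if (rc)
			return rc;
		rc = step_b_init();
		if (rc)
			goto out_a;	/* undo in reverse order */
		return 0;
	out_a:
		step_a_exit();
		return rc;
	}
	device_initcall(example_init);
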
diff --git a/arch/s390/hypfs/hypfs_diag.c b/arch/s390/hypfs/hypfs_diag.c
index f0bc4dc3e9bf..279b7bba4d43 100644
--- a/arch/s390/hypfs/hypfs_diag.c
+++ b/arch/s390/hypfs/hypfs_diag.c
@@ -18,196 +18,27 @@
#include <linux/mm.h>
#include <asm/diag.h>
#include <asm/ebcdic.h>
+#include "hypfs_diag.h"
#include "hypfs.h"
-#define TMP_SIZE 64 /* size of temporary buffers */
-
#define DBFS_D204_HDR_VERSION 0
-static char *diag224_cpu_names; /* diag 224 name table */
static enum diag204_sc diag204_store_sc; /* used subcode for store */
static enum diag204_format diag204_info_type; /* used diag 204 data format */
static void *diag204_buf; /* 4K aligned buffer for diag204 data */
-static void *diag204_buf_vmalloc; /* vmalloc pointer for diag204 data */
static int diag204_buf_pages; /* number of pages for diag204 data */
static struct dentry *dbfs_d204_file;
-/*
- * DIAG 204 member access functions.
- *
- * Since we have two different diag 204 data formats for old and new s390
- * machines, we do not access the structs directly, but use getter functions for
- * each struct member instead. This should make the code more readable.
- */
-
-/* Time information block */
-
-static inline int info_blk_hdr__size(enum diag204_format type)
-{
- if (type == DIAG204_INFO_SIMPLE)
- return sizeof(struct diag204_info_blk_hdr);
- else /* DIAG204_INFO_EXT */
- return sizeof(struct diag204_x_info_blk_hdr);
-}
-
-static inline __u8 info_blk_hdr__npar(enum diag204_format type, void *hdr)
-{
- if (type == DIAG204_INFO_SIMPLE)
- return ((struct diag204_info_blk_hdr *)hdr)->npar;
- else /* DIAG204_INFO_EXT */
- return ((struct diag204_x_info_blk_hdr *)hdr)->npar;
-}
-
-static inline __u8 info_blk_hdr__flags(enum diag204_format type, void *hdr)
-{
- if (type == DIAG204_INFO_SIMPLE)
- return ((struct diag204_info_blk_hdr *)hdr)->flags;
- else /* DIAG204_INFO_EXT */
- return ((struct diag204_x_info_blk_hdr *)hdr)->flags;
-}
-
-static inline __u16 info_blk_hdr__pcpus(enum diag204_format type, void *hdr)
-{
- if (type == DIAG204_INFO_SIMPLE)
- return ((struct diag204_info_blk_hdr *)hdr)->phys_cpus;
- else /* DIAG204_INFO_EXT */
- return ((struct diag204_x_info_blk_hdr *)hdr)->phys_cpus;
-}
-
-/* Partition header */
-
-static inline int part_hdr__size(enum diag204_format type)
-{
- if (type == DIAG204_INFO_SIMPLE)
- return sizeof(struct diag204_part_hdr);
- else /* DIAG204_INFO_EXT */
- return sizeof(struct diag204_x_part_hdr);
-}
-
-static inline __u8 part_hdr__rcpus(enum diag204_format type, void *hdr)
-{
- if (type == DIAG204_INFO_SIMPLE)
- return ((struct diag204_part_hdr *)hdr)->cpus;
- else /* DIAG204_INFO_EXT */
- return ((struct diag204_x_part_hdr *)hdr)->rcpus;
-}
-
-static inline void part_hdr__part_name(enum diag204_format type, void *hdr,
- char *name)
-{
- if (type == DIAG204_INFO_SIMPLE)
- memcpy(name, ((struct diag204_part_hdr *)hdr)->part_name,
- DIAG204_LPAR_NAME_LEN);
- else /* DIAG204_INFO_EXT */
- memcpy(name, ((struct diag204_x_part_hdr *)hdr)->part_name,
- DIAG204_LPAR_NAME_LEN);
- EBCASC(name, DIAG204_LPAR_NAME_LEN);
- name[DIAG204_LPAR_NAME_LEN] = 0;
- strim(name);
-}
-
-/* CPU info block */
-
-static inline int cpu_info__size(enum diag204_format type)
-{
- if (type == DIAG204_INFO_SIMPLE)
- return sizeof(struct diag204_cpu_info);
- else /* DIAG204_INFO_EXT */
- return sizeof(struct diag204_x_cpu_info);
-}
-
-static inline __u8 cpu_info__ctidx(enum diag204_format type, void *hdr)
-{
- if (type == DIAG204_INFO_SIMPLE)
- return ((struct diag204_cpu_info *)hdr)->ctidx;
- else /* DIAG204_INFO_EXT */
- return ((struct diag204_x_cpu_info *)hdr)->ctidx;
-}
-
-static inline __u16 cpu_info__cpu_addr(enum diag204_format type, void *hdr)
-{
- if (type == DIAG204_INFO_SIMPLE)
- return ((struct diag204_cpu_info *)hdr)->cpu_addr;
- else /* DIAG204_INFO_EXT */
- return ((struct diag204_x_cpu_info *)hdr)->cpu_addr;
-}
-
-static inline __u64 cpu_info__acc_time(enum diag204_format type, void *hdr)
-{
- if (type == DIAG204_INFO_SIMPLE)
- return ((struct diag204_cpu_info *)hdr)->acc_time;
- else /* DIAG204_INFO_EXT */
- return ((struct diag204_x_cpu_info *)hdr)->acc_time;
-}
-
-static inline __u64 cpu_info__lp_time(enum diag204_format type, void *hdr)
-{
- if (type == DIAG204_INFO_SIMPLE)
- return ((struct diag204_cpu_info *)hdr)->lp_time;
- else /* DIAG204_INFO_EXT */
- return ((struct diag204_x_cpu_info *)hdr)->lp_time;
-}
-
-static inline __u64 cpu_info__online_time(enum diag204_format type, void *hdr)
-{
- if (type == DIAG204_INFO_SIMPLE)
- return 0; /* online_time not available in simple info */
- else /* DIAG204_INFO_EXT */
- return ((struct diag204_x_cpu_info *)hdr)->online_time;
-}
-
-/* Physical header */
-
-static inline int phys_hdr__size(enum diag204_format type)
-{
- if (type == DIAG204_INFO_SIMPLE)
- return sizeof(struct diag204_phys_hdr);
- else /* DIAG204_INFO_EXT */
- return sizeof(struct diag204_x_phys_hdr);
-}
-
-static inline __u8 phys_hdr__cpus(enum diag204_format type, void *hdr)
-{
- if (type == DIAG204_INFO_SIMPLE)
- return ((struct diag204_phys_hdr *)hdr)->cpus;
- else /* DIAG204_INFO_EXT */
- return ((struct diag204_x_phys_hdr *)hdr)->cpus;
-}
-
-/* Physical CPU info block */
-
-static inline int phys_cpu__size(enum diag204_format type)
+enum diag204_format diag204_get_info_type(void)
{
- if (type == DIAG204_INFO_SIMPLE)
- return sizeof(struct diag204_phys_cpu);
- else /* DIAG204_INFO_EXT */
- return sizeof(struct diag204_x_phys_cpu);
+ return diag204_info_type;
}
-static inline __u16 phys_cpu__cpu_addr(enum diag204_format type, void *hdr)
+static void diag204_set_info_type(enum diag204_format type)
{
- if (type == DIAG204_INFO_SIMPLE)
- return ((struct diag204_phys_cpu *)hdr)->cpu_addr;
- else /* DIAG204_INFO_EXT */
- return ((struct diag204_x_phys_cpu *)hdr)->cpu_addr;
-}
-
-static inline __u64 phys_cpu__mgm_time(enum diag204_format type, void *hdr)
-{
- if (type == DIAG204_INFO_SIMPLE)
- return ((struct diag204_phys_cpu *)hdr)->mgm_time;
- else /* DIAG204_INFO_EXT */
- return ((struct diag204_x_phys_cpu *)hdr)->mgm_time;
-}
-
-static inline __u64 phys_cpu__ctidx(enum diag204_format type, void *hdr)
-{
- if (type == DIAG204_INFO_SIMPLE)
- return ((struct diag204_phys_cpu *)hdr)->ctidx;
- else /* DIAG204_INFO_EXT */
- return ((struct diag204_x_phys_cpu *)hdr)->ctidx;
+ diag204_info_type = type;
}
/* Diagnose 204 functions */
@@ -220,43 +51,11 @@ static inline __u64 phys_cpu__ctidx(enum diag204_format type, void *hdr)
static void diag204_free_buffer(void)
{
- if (!diag204_buf)
- return;
- if (diag204_buf_vmalloc) {
- vfree(diag204_buf_vmalloc);
- diag204_buf_vmalloc = NULL;
- } else {
- free_pages((unsigned long) diag204_buf, 0);
- }
+ vfree(diag204_buf);
diag204_buf = NULL;
}
-static void *page_align_ptr(void *ptr)
-{
- return (void *) PAGE_ALIGN((unsigned long) ptr);
-}
-
-static void *diag204_alloc_vbuf(int pages)
-{
- /* The buffer has to be page aligned! */
- diag204_buf_vmalloc = vmalloc(array_size(PAGE_SIZE, (pages + 1)));
- if (!diag204_buf_vmalloc)
- return ERR_PTR(-ENOMEM);
- diag204_buf = page_align_ptr(diag204_buf_vmalloc);
- diag204_buf_pages = pages;
- return diag204_buf;
-}
-
-static void *diag204_alloc_rbuf(void)
-{
- diag204_buf = (void*)__get_free_pages(GFP_KERNEL,0);
- if (!diag204_buf)
- return ERR_PTR(-ENOMEM);
- diag204_buf_pages = 1;
- return diag204_buf;
-}
-
-static void *diag204_get_buffer(enum diag204_format fmt, int *pages)
+void *diag204_get_buffer(enum diag204_format fmt, int *pages)
{
if (diag204_buf) {
*pages = diag204_buf_pages;
@@ -264,15 +63,19 @@ static void *diag204_get_buffer(enum diag204_format fmt, int *pages)
}
if (fmt == DIAG204_INFO_SIMPLE) {
*pages = 1;
- return diag204_alloc_rbuf();
} else {/* DIAG204_INFO_EXT */
*pages = diag204((unsigned long)DIAG204_SUBC_RSI |
(unsigned long)DIAG204_INFO_EXT, 0, NULL);
if (*pages <= 0)
- return ERR_PTR(-ENOSYS);
- else
- return diag204_alloc_vbuf(*pages);
+ return ERR_PTR(-EOPNOTSUPP);
}
+ diag204_buf = __vmalloc_node(array_size(*pages, PAGE_SIZE),
+ PAGE_SIZE, GFP_KERNEL, NUMA_NO_NODE,
+ __builtin_return_address(0));
+ if (!diag204_buf)
+ return ERR_PTR(-ENOMEM);
+ diag204_buf_pages = *pages;
+ return diag204_buf;
}
/*
@@ -299,13 +102,13 @@ static int diag204_probe(void)
if (diag204((unsigned long)DIAG204_SUBC_STIB7 |
(unsigned long)DIAG204_INFO_EXT, pages, buf) >= 0) {
diag204_store_sc = DIAG204_SUBC_STIB7;
- diag204_info_type = DIAG204_INFO_EXT;
+ diag204_set_info_type(DIAG204_INFO_EXT);
goto out;
}
if (diag204((unsigned long)DIAG204_SUBC_STIB6 |
(unsigned long)DIAG204_INFO_EXT, pages, buf) >= 0) {
diag204_store_sc = DIAG204_SUBC_STIB6;
- diag204_info_type = DIAG204_INFO_EXT;
+ diag204_set_info_type(DIAG204_INFO_EXT);
goto out;
}
diag204_free_buffer();
@@ -321,10 +124,10 @@ static int diag204_probe(void)
if (diag204((unsigned long)DIAG204_SUBC_STIB4 |
(unsigned long)DIAG204_INFO_SIMPLE, pages, buf) >= 0) {
diag204_store_sc = DIAG204_SUBC_STIB4;
- diag204_info_type = DIAG204_INFO_SIMPLE;
+ diag204_set_info_type(DIAG204_INFO_SIMPLE);
goto out;
} else {
- rc = -ENOSYS;
+ rc = -EOPNOTSUPP;
goto fail_store;
}
out:
@@ -335,58 +138,13 @@ fail_alloc:
return rc;
}
-static int diag204_do_store(void *buf, int pages)
+int diag204_store(void *buf, int pages)
{
int rc;
- rc = diag204((unsigned long) diag204_store_sc |
- (unsigned long) diag204_info_type, pages, buf);
- return rc < 0 ? -ENOSYS : 0;
-}
-
-static void *diag204_store(void)
-{
- void *buf;
- int pages, rc;
-
- buf = diag204_get_buffer(diag204_info_type, &pages);
- if (IS_ERR(buf))
- goto out;
- rc = diag204_do_store(buf, pages);
- if (rc)
- return ERR_PTR(rc);
-out:
- return buf;
-}
-
-/* Diagnose 224 functions */
-
-static int diag224_get_name_table(void)
-{
- /* memory must be below 2GB */
- diag224_cpu_names = (char *) __get_free_page(GFP_KERNEL | GFP_DMA);
- if (!diag224_cpu_names)
- return -ENOMEM;
- if (diag224(diag224_cpu_names)) {
- free_page((unsigned long) diag224_cpu_names);
- return -EOPNOTSUPP;
- }
- EBCASC(diag224_cpu_names + 16, (*diag224_cpu_names + 1) * 16);
- return 0;
-}
-
-static void diag224_delete_name_table(void)
-{
- free_page((unsigned long) diag224_cpu_names);
-}
-
-static int diag224_idx2name(int index, char *name)
-{
- memcpy(name, diag224_cpu_names + ((index + 1) * DIAG204_CPU_NAME_LEN),
- DIAG204_CPU_NAME_LEN);
- name[DIAG204_CPU_NAME_LEN] = 0;
- strim(name);
- return 0;
+ rc = diag204((unsigned long)diag204_store_sc |
+ (unsigned long)diag204_get_info_type(), pages, buf);
+ return rc < 0 ? -EOPNOTSUPP : 0;
}
struct dbfs_d204_hdr {
@@ -411,8 +169,8 @@ static int dbfs_d204_create(void **data, void **data_free_ptr, size_t *size)
base = vzalloc(buf_size);
if (!base)
return -ENOMEM;
- d204 = page_align_ptr(base + sizeof(d204->hdr)) - sizeof(d204->hdr);
- rc = diag204_do_store(d204->buf, diag204_buf_pages);
+ d204 = PTR_ALIGN(base + sizeof(d204->hdr), PAGE_SIZE) - sizeof(d204->hdr);
+ rc = diag204_store(d204->buf, diag204_buf_pages);
if (rc) {
vfree(base);
return rc;
@@ -437,180 +195,25 @@ __init int hypfs_diag_init(void)
int rc;
if (diag204_probe()) {
- pr_err("The hardware system does not support hypfs\n");
+ pr_info("The hardware system does not support hypfs\n");
return -ENODATA;
}
- if (diag204_info_type == DIAG204_INFO_EXT)
+ if (diag204_get_info_type() == DIAG204_INFO_EXT)
hypfs_dbfs_create_file(&dbfs_file_d204);
- if (MACHINE_IS_LPAR) {
- rc = diag224_get_name_table();
- if (rc) {
- pr_err("The hardware system does not provide all "
- "functions required by hypfs\n");
- debugfs_remove(dbfs_d204_file);
- return rc;
- }
+ rc = hypfs_diag_fs_init();
+ if (rc) {
+ pr_err("The hardware system does not provide all functions required by hypfs\n");
+ debugfs_remove(dbfs_d204_file);
}
- return 0;
+ return rc;
}
void hypfs_diag_exit(void)
{
debugfs_remove(dbfs_d204_file);
- diag224_delete_name_table();
+ hypfs_diag_fs_exit();
diag204_free_buffer();
hypfs_dbfs_remove_file(&dbfs_file_d204);
}
-
-/*
- * Functions to create the directory structure
- * *******************************************
- */
-
-static int hypfs_create_cpu_files(struct dentry *cpus_dir, void *cpu_info)
-{
- struct dentry *cpu_dir;
- char buffer[TMP_SIZE];
- void *rc;
-
- snprintf(buffer, TMP_SIZE, "%d", cpu_info__cpu_addr(diag204_info_type,
- cpu_info));
- cpu_dir = hypfs_mkdir(cpus_dir, buffer);
- rc = hypfs_create_u64(cpu_dir, "mgmtime",
- cpu_info__acc_time(diag204_info_type, cpu_info) -
- cpu_info__lp_time(diag204_info_type, cpu_info));
- if (IS_ERR(rc))
- return PTR_ERR(rc);
- rc = hypfs_create_u64(cpu_dir, "cputime",
- cpu_info__lp_time(diag204_info_type, cpu_info));
- if (IS_ERR(rc))
- return PTR_ERR(rc);
- if (diag204_info_type == DIAG204_INFO_EXT) {
- rc = hypfs_create_u64(cpu_dir, "onlinetime",
- cpu_info__online_time(diag204_info_type,
- cpu_info));
- if (IS_ERR(rc))
- return PTR_ERR(rc);
- }
- diag224_idx2name(cpu_info__ctidx(diag204_info_type, cpu_info), buffer);
- rc = hypfs_create_str(cpu_dir, "type", buffer);
- return PTR_ERR_OR_ZERO(rc);
-}
-
-static void *hypfs_create_lpar_files(struct dentry *systems_dir, void *part_hdr)
-{
- struct dentry *cpus_dir;
- struct dentry *lpar_dir;
- char lpar_name[DIAG204_LPAR_NAME_LEN + 1];
- void *cpu_info;
- int i;
-
- part_hdr__part_name(diag204_info_type, part_hdr, lpar_name);
- lpar_name[DIAG204_LPAR_NAME_LEN] = 0;
- lpar_dir = hypfs_mkdir(systems_dir, lpar_name);
- if (IS_ERR(lpar_dir))
- return lpar_dir;
- cpus_dir = hypfs_mkdir(lpar_dir, "cpus");
- if (IS_ERR(cpus_dir))
- return cpus_dir;
- cpu_info = part_hdr + part_hdr__size(diag204_info_type);
- for (i = 0; i < part_hdr__rcpus(diag204_info_type, part_hdr); i++) {
- int rc;
- rc = hypfs_create_cpu_files(cpus_dir, cpu_info);
- if (rc)
- return ERR_PTR(rc);
- cpu_info += cpu_info__size(diag204_info_type);
- }
- return cpu_info;
-}
-
-static int hypfs_create_phys_cpu_files(struct dentry *cpus_dir, void *cpu_info)
-{
- struct dentry *cpu_dir;
- char buffer[TMP_SIZE];
- void *rc;
-
- snprintf(buffer, TMP_SIZE, "%i", phys_cpu__cpu_addr(diag204_info_type,
- cpu_info));
- cpu_dir = hypfs_mkdir(cpus_dir, buffer);
- if (IS_ERR(cpu_dir))
- return PTR_ERR(cpu_dir);
- rc = hypfs_create_u64(cpu_dir, "mgmtime",
- phys_cpu__mgm_time(diag204_info_type, cpu_info));
- if (IS_ERR(rc))
- return PTR_ERR(rc);
- diag224_idx2name(phys_cpu__ctidx(diag204_info_type, cpu_info), buffer);
- rc = hypfs_create_str(cpu_dir, "type", buffer);
- return PTR_ERR_OR_ZERO(rc);
-}
-
-static void *hypfs_create_phys_files(struct dentry *parent_dir, void *phys_hdr)
-{
- int i;
- void *cpu_info;
- struct dentry *cpus_dir;
-
- cpus_dir = hypfs_mkdir(parent_dir, "cpus");
- if (IS_ERR(cpus_dir))
- return cpus_dir;
- cpu_info = phys_hdr + phys_hdr__size(diag204_info_type);
- for (i = 0; i < phys_hdr__cpus(diag204_info_type, phys_hdr); i++) {
- int rc;
- rc = hypfs_create_phys_cpu_files(cpus_dir, cpu_info);
- if (rc)
- return ERR_PTR(rc);
- cpu_info += phys_cpu__size(diag204_info_type);
- }
- return cpu_info;
-}
-
-int hypfs_diag_create_files(struct dentry *root)
-{
- struct dentry *systems_dir, *hyp_dir;
- void *time_hdr, *part_hdr;
- int i, rc;
- void *buffer, *ptr;
-
- buffer = diag204_store();
- if (IS_ERR(buffer))
- return PTR_ERR(buffer);
-
- systems_dir = hypfs_mkdir(root, "systems");
- if (IS_ERR(systems_dir)) {
- rc = PTR_ERR(systems_dir);
- goto err_out;
- }
- time_hdr = (struct x_info_blk_hdr *)buffer;
- part_hdr = time_hdr + info_blk_hdr__size(diag204_info_type);
- for (i = 0; i < info_blk_hdr__npar(diag204_info_type, time_hdr); i++) {
- part_hdr = hypfs_create_lpar_files(systems_dir, part_hdr);
- if (IS_ERR(part_hdr)) {
- rc = PTR_ERR(part_hdr);
- goto err_out;
- }
- }
- if (info_blk_hdr__flags(diag204_info_type, time_hdr) &
- DIAG204_LPAR_PHYS_FLG) {
- ptr = hypfs_create_phys_files(root, part_hdr);
- if (IS_ERR(ptr)) {
- rc = PTR_ERR(ptr);
- goto err_out;
- }
- }
- hyp_dir = hypfs_mkdir(root, "hyp");
- if (IS_ERR(hyp_dir)) {
- rc = PTR_ERR(hyp_dir);
- goto err_out;
- }
- ptr = hypfs_create_str(hyp_dir, "type", "LPAR Hypervisor");
- if (IS_ERR(ptr)) {
- rc = PTR_ERR(ptr);
- goto err_out;
- }
- rc = 0;
-
-err_out:
- return rc;
-}
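
The allocation change above relies on __vmalloc_node() accepting an explicit alignment, so requesting PAGE_SIZE alignment directly replaces the old scheme of vmalloc()ing one extra page and rounding the pointer up. A minimal sketch of the call (alloc_diag_pages() is an illustrative wrapper, not part of the patch):

	static void *alloc_diag_pages(int pages)
	{
		/* page-aligned, virtually contiguous buffer for diag 204 data */
		return __vmalloc_node(array_size(pages, PAGE_SIZE), PAGE_SIZE,
				      GFP_KERNEL, NUMA_NO_NODE,
				      __builtin_return_address(0));
	}
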
diff --git a/arch/s390/hypfs/hypfs_diag.h b/arch/s390/hypfs/hypfs_diag.h
new file mode 100644
index 000000000000..7090eff27fef
--- /dev/null
+++ b/arch/s390/hypfs/hypfs_diag.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Hypervisor filesystem for Linux on s390. Diag 204 and 224
+ * implementation.
+ *
+ * Copyright IBM Corp. 2006, 2008
+ * Author(s): Michael Holzheu <holzheu@de.ibm.com>
+ */
+
+#ifndef _S390_HYPFS_DIAG_H_
+#define _S390_HYPFS_DIAG_H_
+
+#include <asm/diag.h>
+
+enum diag204_format diag204_get_info_type(void);
+void *diag204_get_buffer(enum diag204_format fmt, int *pages);
+int diag204_store(void *buf, int pages);
+
+int __hypfs_diag_fs_init(void);
+void __hypfs_diag_fs_exit(void);
+
+static inline int hypfs_diag_fs_init(void)
+{
+ if (IS_ENABLED(CONFIG_S390_HYPFS_FS))
+ return __hypfs_diag_fs_init();
+ return 0;
+}
+
+static inline void hypfs_diag_fs_exit(void)
+{
+ if (IS_ENABLED(CONFIG_S390_HYPFS_FS))
+ __hypfs_diag_fs_exit();
+}
+
+#endif /* _S390_HYPFS_DIAG_H_ */
diff --git a/arch/s390/hypfs/hypfs_diag0c.c b/arch/s390/hypfs/hypfs_diag0c.c
index 3235e4d82f2d..9a2786079e3a 100644
--- a/arch/s390/hypfs/hypfs_diag0c.c
+++ b/arch/s390/hypfs/hypfs_diag0c.c
@@ -21,7 +21,7 @@
static void diag0c_fn(void *data)
{
diag_stat_inc(DIAG_STAT_X00C);
- diag_dma_ops.diag0c(((void **) data)[smp_processor_id()]);
+ diag_amode31_ops.diag0c(((void **)data)[smp_processor_id()]);
}
/*
@@ -33,12 +33,12 @@ static void *diag0c_store(unsigned int *count)
unsigned int cpu_count, cpu, i;
void **cpu_vec;
- get_online_cpus();
+ cpus_read_lock();
cpu_count = num_online_cpus();
cpu_vec = kmalloc_array(num_possible_cpus(), sizeof(*cpu_vec),
GFP_KERNEL);
if (!cpu_vec)
- goto fail_put_online_cpus;
+ goto fail_unlock_cpus;
/* Note: Diag 0c needs 8 byte alignment and real storage */
diag0c_data = kzalloc(struct_size(diag0c_data, entry, cpu_count),
GFP_KERNEL | GFP_DMA);
@@ -54,13 +54,13 @@ static void *diag0c_store(unsigned int *count)
on_each_cpu(diag0c_fn, cpu_vec, 1);
*count = cpu_count;
kfree(cpu_vec);
- put_online_cpus();
+ cpus_read_unlock();
return diag0c_data;
fail_kfree_cpu_vec:
kfree(cpu_vec);
-fail_put_online_cpus:
- put_online_cpus();
+fail_unlock_cpus:
+ cpus_read_unlock();
return ERR_PTR(-ENOMEM);
}
@@ -84,7 +84,7 @@ static int dbfs_diag0c_create(void **data, void **data_free_ptr, size_t *size)
if (IS_ERR(diag0c_data))
return PTR_ERR(diag0c_data);
memset(&diag0c_data->hdr, 0, sizeof(diag0c_data->hdr));
- get_tod_clock_ext(diag0c_data->hdr.tod_ext);
+ store_tod_clock_ext((union tod_clock *)diag0c_data->hdr.tod_ext);
diag0c_data->hdr.len = count * sizeof(struct hypfs_diag0c_entry);
diag0c_data->hdr.version = DBFS_D0C_HDR_VERSION;
diag0c_data->hdr.count = count;
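
get_online_cpus()/put_online_cpus() were renamed to cpus_read_lock()/cpus_read_unlock(); both take the CPU-hotplug lock for reading so the set of online CPUs cannot change while diag 0c runs on each of them. A sketch of the pattern (run_on_all_cpus() is illustrative only):

	static void run_on_all_cpus(smp_call_func_t fn, void *info)
	{
		cpus_read_lock();
		on_each_cpu(fn, info, 1);	/* wait == 1: block until fn ran on all CPUs */
		cpus_read_unlock();
	}
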
diff --git a/arch/s390/hypfs/hypfs_diag_fs.c b/arch/s390/hypfs/hypfs_diag_fs.c
new file mode 100644
index 000000000000..00a6d370a280
--- /dev/null
+++ b/arch/s390/hypfs/hypfs_diag_fs.c
@@ -0,0 +1,393 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Hypervisor filesystem for Linux on s390. Diag 204 and 224
+ * implementation.
+ *
+ * Copyright IBM Corp. 2006, 2008
+ * Author(s): Michael Holzheu <holzheu@de.ibm.com>
+ */
+
+#define KMSG_COMPONENT "hypfs"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <asm/diag.h>
+#include <asm/ebcdic.h>
+#include "hypfs_diag.h"
+#include "hypfs.h"
+
+#define TMP_SIZE 64 /* size of temporary buffers */
+
+static char *diag224_cpu_names; /* diag 224 name table */
+static int diag224_idx2name(int index, char *name);
+
+/*
+ * DIAG 204 member access functions.
+ *
+ * Since we have two different diag 204 data formats for old and new s390
+ * machines, we do not access the structs directly, but use getter functions for
+ * each struct member instead. This should make the code more readable.
+ */
+
+/* Time information block */
+
+static inline int info_blk_hdr__size(enum diag204_format type)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ return sizeof(struct diag204_info_blk_hdr);
+ else /* DIAG204_INFO_EXT */
+ return sizeof(struct diag204_x_info_blk_hdr);
+}
+
+static inline __u8 info_blk_hdr__npar(enum diag204_format type, void *hdr)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_info_blk_hdr *)hdr)->npar;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_info_blk_hdr *)hdr)->npar;
+}
+
+static inline __u8 info_blk_hdr__flags(enum diag204_format type, void *hdr)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_info_blk_hdr *)hdr)->flags;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_info_blk_hdr *)hdr)->flags;
+}
+
+/* Partition header */
+
+static inline int part_hdr__size(enum diag204_format type)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ return sizeof(struct diag204_part_hdr);
+ else /* DIAG204_INFO_EXT */
+ return sizeof(struct diag204_x_part_hdr);
+}
+
+static inline __u8 part_hdr__rcpus(enum diag204_format type, void *hdr)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_part_hdr *)hdr)->cpus;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_part_hdr *)hdr)->rcpus;
+}
+
+static inline void part_hdr__part_name(enum diag204_format type, void *hdr,
+ char *name)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ memcpy(name, ((struct diag204_part_hdr *)hdr)->part_name,
+ DIAG204_LPAR_NAME_LEN);
+ else /* DIAG204_INFO_EXT */
+ memcpy(name, ((struct diag204_x_part_hdr *)hdr)->part_name,
+ DIAG204_LPAR_NAME_LEN);
+ EBCASC(name, DIAG204_LPAR_NAME_LEN);
+ name[DIAG204_LPAR_NAME_LEN] = 0;
+ strim(name);
+}
+
+/* CPU info block */
+
+static inline int cpu_info__size(enum diag204_format type)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ return sizeof(struct diag204_cpu_info);
+ else /* DIAG204_INFO_EXT */
+ return sizeof(struct diag204_x_cpu_info);
+}
+
+static inline __u8 cpu_info__ctidx(enum diag204_format type, void *hdr)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_cpu_info *)hdr)->ctidx;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_cpu_info *)hdr)->ctidx;
+}
+
+static inline __u16 cpu_info__cpu_addr(enum diag204_format type, void *hdr)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_cpu_info *)hdr)->cpu_addr;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_cpu_info *)hdr)->cpu_addr;
+}
+
+static inline __u64 cpu_info__acc_time(enum diag204_format type, void *hdr)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_cpu_info *)hdr)->acc_time;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_cpu_info *)hdr)->acc_time;
+}
+
+static inline __u64 cpu_info__lp_time(enum diag204_format type, void *hdr)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_cpu_info *)hdr)->lp_time;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_cpu_info *)hdr)->lp_time;
+}
+
+static inline __u64 cpu_info__online_time(enum diag204_format type, void *hdr)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ return 0; /* online_time not available in simple info */
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_cpu_info *)hdr)->online_time;
+}
+
+/* Physical header */
+
+static inline int phys_hdr__size(enum diag204_format type)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ return sizeof(struct diag204_phys_hdr);
+ else /* DIAG204_INFO_EXT */
+ return sizeof(struct diag204_x_phys_hdr);
+}
+
+static inline __u8 phys_hdr__cpus(enum diag204_format type, void *hdr)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_phys_hdr *)hdr)->cpus;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_phys_hdr *)hdr)->cpus;
+}
+
+/* Physical CPU info block */
+
+static inline int phys_cpu__size(enum diag204_format type)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ return sizeof(struct diag204_phys_cpu);
+ else /* DIAG204_INFO_EXT */
+ return sizeof(struct diag204_x_phys_cpu);
+}
+
+static inline __u16 phys_cpu__cpu_addr(enum diag204_format type, void *hdr)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_phys_cpu *)hdr)->cpu_addr;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_phys_cpu *)hdr)->cpu_addr;
+}
+
+static inline __u64 phys_cpu__mgm_time(enum diag204_format type, void *hdr)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_phys_cpu *)hdr)->mgm_time;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_phys_cpu *)hdr)->mgm_time;
+}
+
+static inline __u64 phys_cpu__ctidx(enum diag204_format type, void *hdr)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_phys_cpu *)hdr)->ctidx;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_phys_cpu *)hdr)->ctidx;
+}
+
+/*
+ * Functions to create the directory structure
+ * *******************************************
+ */
+
+static int hypfs_create_cpu_files(struct dentry *cpus_dir, void *cpu_info)
+{
+ struct dentry *cpu_dir;
+ char buffer[TMP_SIZE];
+ void *rc;
+
+ snprintf(buffer, TMP_SIZE, "%d", cpu_info__cpu_addr(diag204_get_info_type(),
+ cpu_info));
+ cpu_dir = hypfs_mkdir(cpus_dir, buffer);
+ rc = hypfs_create_u64(cpu_dir, "mgmtime",
+ cpu_info__acc_time(diag204_get_info_type(), cpu_info) -
+ cpu_info__lp_time(diag204_get_info_type(), cpu_info));
+ if (IS_ERR(rc))
+ return PTR_ERR(rc);
+ rc = hypfs_create_u64(cpu_dir, "cputime",
+ cpu_info__lp_time(diag204_get_info_type(), cpu_info));
+ if (IS_ERR(rc))
+ return PTR_ERR(rc);
+ if (diag204_get_info_type() == DIAG204_INFO_EXT) {
+ rc = hypfs_create_u64(cpu_dir, "onlinetime",
+ cpu_info__online_time(diag204_get_info_type(),
+ cpu_info));
+ if (IS_ERR(rc))
+ return PTR_ERR(rc);
+ }
+ diag224_idx2name(cpu_info__ctidx(diag204_get_info_type(), cpu_info), buffer);
+ rc = hypfs_create_str(cpu_dir, "type", buffer);
+ return PTR_ERR_OR_ZERO(rc);
+}
+
+static void *hypfs_create_lpar_files(struct dentry *systems_dir, void *part_hdr)
+{
+ struct dentry *cpus_dir;
+ struct dentry *lpar_dir;
+ char lpar_name[DIAG204_LPAR_NAME_LEN + 1];
+ void *cpu_info;
+ int i;
+
+ part_hdr__part_name(diag204_get_info_type(), part_hdr, lpar_name);
+ lpar_name[DIAG204_LPAR_NAME_LEN] = 0;
+ lpar_dir = hypfs_mkdir(systems_dir, lpar_name);
+ if (IS_ERR(lpar_dir))
+ return lpar_dir;
+ cpus_dir = hypfs_mkdir(lpar_dir, "cpus");
+ if (IS_ERR(cpus_dir))
+ return cpus_dir;
+ cpu_info = part_hdr + part_hdr__size(diag204_get_info_type());
+ for (i = 0; i < part_hdr__rcpus(diag204_get_info_type(), part_hdr); i++) {
+ int rc;
+
+ rc = hypfs_create_cpu_files(cpus_dir, cpu_info);
+ if (rc)
+ return ERR_PTR(rc);
+ cpu_info += cpu_info__size(diag204_get_info_type());
+ }
+ return cpu_info;
+}
+
+static int hypfs_create_phys_cpu_files(struct dentry *cpus_dir, void *cpu_info)
+{
+ struct dentry *cpu_dir;
+ char buffer[TMP_SIZE];
+ void *rc;
+
+ snprintf(buffer, TMP_SIZE, "%i", phys_cpu__cpu_addr(diag204_get_info_type(),
+ cpu_info));
+ cpu_dir = hypfs_mkdir(cpus_dir, buffer);
+ if (IS_ERR(cpu_dir))
+ return PTR_ERR(cpu_dir);
+ rc = hypfs_create_u64(cpu_dir, "mgmtime",
+ phys_cpu__mgm_time(diag204_get_info_type(), cpu_info));
+ if (IS_ERR(rc))
+ return PTR_ERR(rc);
+ diag224_idx2name(phys_cpu__ctidx(diag204_get_info_type(), cpu_info), buffer);
+ rc = hypfs_create_str(cpu_dir, "type", buffer);
+ return PTR_ERR_OR_ZERO(rc);
+}
+
+static void *hypfs_create_phys_files(struct dentry *parent_dir, void *phys_hdr)
+{
+ int i;
+ void *cpu_info;
+ struct dentry *cpus_dir;
+
+ cpus_dir = hypfs_mkdir(parent_dir, "cpus");
+ if (IS_ERR(cpus_dir))
+ return cpus_dir;
+ cpu_info = phys_hdr + phys_hdr__size(diag204_get_info_type());
+ for (i = 0; i < phys_hdr__cpus(diag204_get_info_type(), phys_hdr); i++) {
+ int rc;
+
+ rc = hypfs_create_phys_cpu_files(cpus_dir, cpu_info);
+ if (rc)
+ return ERR_PTR(rc);
+ cpu_info += phys_cpu__size(diag204_get_info_type());
+ }
+ return cpu_info;
+}
+
+int hypfs_diag_create_files(struct dentry *root)
+{
+ struct dentry *systems_dir, *hyp_dir;
+ void *time_hdr, *part_hdr;
+ void *buffer, *ptr;
+ int i, rc, pages;
+
+ buffer = diag204_get_buffer(diag204_get_info_type(), &pages);
+ if (IS_ERR(buffer))
+ return PTR_ERR(buffer);
+ rc = diag204_store(buffer, pages);
+ if (rc)
+ return rc;
+
+ systems_dir = hypfs_mkdir(root, "systems");
+ if (IS_ERR(systems_dir)) {
+ rc = PTR_ERR(systems_dir);
+ goto err_out;
+ }
+ time_hdr = (struct x_info_blk_hdr *)buffer;
+ part_hdr = time_hdr + info_blk_hdr__size(diag204_get_info_type());
+ for (i = 0; i < info_blk_hdr__npar(diag204_get_info_type(), time_hdr); i++) {
+ part_hdr = hypfs_create_lpar_files(systems_dir, part_hdr);
+ if (IS_ERR(part_hdr)) {
+ rc = PTR_ERR(part_hdr);
+ goto err_out;
+ }
+ }
+ if (info_blk_hdr__flags(diag204_get_info_type(), time_hdr) &
+ DIAG204_LPAR_PHYS_FLG) {
+ ptr = hypfs_create_phys_files(root, part_hdr);
+ if (IS_ERR(ptr)) {
+ rc = PTR_ERR(ptr);
+ goto err_out;
+ }
+ }
+ hyp_dir = hypfs_mkdir(root, "hyp");
+ if (IS_ERR(hyp_dir)) {
+ rc = PTR_ERR(hyp_dir);
+ goto err_out;
+ }
+ ptr = hypfs_create_str(hyp_dir, "type", "LPAR Hypervisor");
+ if (IS_ERR(ptr)) {
+ rc = PTR_ERR(ptr);
+ goto err_out;
+ }
+ rc = 0;
+
+err_out:
+ return rc;
+}
+
+/* Diagnose 224 functions */
+
+static int diag224_idx2name(int index, char *name)
+{
+ memcpy(name, diag224_cpu_names + ((index + 1) * DIAG204_CPU_NAME_LEN),
+ DIAG204_CPU_NAME_LEN);
+ name[DIAG204_CPU_NAME_LEN] = 0;
+ strim(name);
+ return 0;
+}
+
+static int diag224_get_name_table(void)
+{
+ /* memory must be below 2GB */
+ diag224_cpu_names = (char *)__get_free_page(GFP_KERNEL | GFP_DMA);
+ if (!diag224_cpu_names)
+ return -ENOMEM;
+ if (diag224(diag224_cpu_names)) {
+ free_page((unsigned long)diag224_cpu_names);
+ return -EOPNOTSUPP;
+ }
+ EBCASC(diag224_cpu_names + 16, (*diag224_cpu_names + 1) * 16);
+ return 0;
+}
+
+static void diag224_delete_name_table(void)
+{
+ free_page((unsigned long)diag224_cpu_names);
+}
+
+int __init __hypfs_diag_fs_init(void)
+{
+ if (MACHINE_IS_LPAR)
+ return diag224_get_name_table();
+ return 0;
+}
+
+void __hypfs_diag_fs_exit(void)
+{
+ diag224_delete_name_table();
+}
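
As implied by the accessors kept above, the diag 224 name table is a single page below 2 GB: the first 16-byte slot acts as a header whose first byte bounds the number of name entries, and each following 16-byte slot holds one EBCDIC CPU-type name looked up by ctidx + 1. A small lookup sketch under that assumption (diag224_name_slot() is not part of the patch):

	static inline const char *diag224_name_slot(const char *table, int ctidx)
	{
		/* slot 0 is the header, names start at slot 1 */
		return table + (ctidx + 1) * DIAG204_CPU_NAME_LEN;
	}
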
diff --git a/arch/s390/hypfs/hypfs_sprp.c b/arch/s390/hypfs/hypfs_sprp.c
index 7d9fb496d155..f5f7e78ddc0c 100644
--- a/arch/s390/hypfs/hypfs_sprp.c
+++ b/arch/s390/hypfs/hypfs_sprp.c
@@ -25,14 +25,13 @@
static inline unsigned long __hypfs_sprp_diag304(void *data, unsigned long cmd)
{
- register unsigned long _data asm("2") = (unsigned long) data;
- register unsigned long _rc asm("3");
- register unsigned long _cmd asm("4") = cmd;
+ union register_pair r1 = { .even = (unsigned long)data, };
- asm volatile("diag %1,%2,0x304\n"
- : "=d" (_rc) : "d" (_data), "d" (_cmd) : "memory");
-
- return _rc;
+ asm volatile("diag %[r1],%[r3],0x304\n"
+ : [r1] "+&d" (r1.pair)
+ : [r3] "d" (cmd)
+ : "memory");
+ return r1.odd;
}
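
union register_pair (from the s390 asm headers) overlays an even/odd general-purpose register pair with a single 128-bit member; constraining .pair in the inline assembly lets the compiler pick any valid even/odd pair instead of the fixed register-asm bindings that were removed. Layout sketch (illustrative only, no instruction executed):

	static unsigned long pair_layout_example(void *data)
	{
		union register_pair r1 = { .even = (unsigned long)data, };

		/* the diag above writes its return code into the odd half */
		return r1.odd;
	}
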
static unsigned long hypfs_sprp_diag304(void *data, unsigned long cmd)
diff --git a/arch/s390/hypfs/hypfs_vm.c b/arch/s390/hypfs/hypfs_vm.c
index e1fcc03159ef..3db40ad853e0 100644
--- a/arch/s390/hypfs/hypfs_vm.c
+++ b/arch/s390/hypfs/hypfs_vm.c
@@ -10,49 +10,19 @@
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/vmalloc.h>
+#include <asm/extable.h>
#include <asm/diag.h>
#include <asm/ebcdic.h>
#include <asm/timex.h>
+#include "hypfs_vm.h"
#include "hypfs.h"
-#define NAME_LEN 8
#define DBFS_D2FC_HDR_VERSION 0
static char local_guest[] = " ";
static char all_guests[] = "* ";
-static char *guest_query;
-
-struct diag2fc_data {
- __u32 version;
- __u32 flags;
- __u64 used_cpu;
- __u64 el_time;
- __u64 mem_min_kb;
- __u64 mem_max_kb;
- __u64 mem_share_kb;
- __u64 mem_used_kb;
- __u32 pcpus;
- __u32 lcpus;
- __u32 vcpus;
- __u32 ocpus;
- __u32 cpu_max;
- __u32 cpu_shares;
- __u32 cpu_use_samp;
- __u32 cpu_delay_samp;
- __u32 page_wait_samp;
- __u32 idle_samp;
- __u32 other_samp;
- __u32 total_samp;
- char guest_name[NAME_LEN];
-};
-
-struct diag2fc_parm_list {
- char userid[NAME_LEN];
- char aci_grp[NAME_LEN];
- __u64 addr;
- __u32 size;
- __u32 fmt;
-};
+static char *all_groups = all_guests;
+char *diag2fc_guest_query;
static int diag2fc(int size, char* query, void *addr)
{
@@ -60,12 +30,13 @@ static int diag2fc(int size, char* query, void *addr)
unsigned long rc;
struct diag2fc_parm_list parm_list;
- memcpy(parm_list.userid, query, NAME_LEN);
- ASCEBC(parm_list.userid, NAME_LEN);
- parm_list.addr = (unsigned long) addr ;
+ memcpy(parm_list.userid, query, DIAG2FC_NAME_LEN);
+ ASCEBC(parm_list.userid, DIAG2FC_NAME_LEN);
+ memcpy(parm_list.aci_grp, all_groups, DIAG2FC_NAME_LEN);
+ ASCEBC(parm_list.aci_grp, DIAG2FC_NAME_LEN);
+ parm_list.addr = (unsigned long)addr;
parm_list.size = size;
parm_list.fmt = 0x02;
- memset(parm_list.aci_grp, 0x40, NAME_LEN);
rc = -1;
diag_stat_inc(DIAG_STAT_X2FC);
@@ -84,7 +55,7 @@ static int diag2fc(int size, char* query, void *addr)
/*
* Allocate buffer for "query" and store diag 2fc at "offset"
*/
-static void *diag2fc_store(char *query, unsigned int *count, int offset)
+void *diag2fc_store(char *query, unsigned int *count, int offset)
{
void *data;
int size;
@@ -105,136 +76,15 @@ static void *diag2fc_store(char *query, unsigned int *count, int offset)
return data;
}
-static void diag2fc_free(const void *data)
+void diag2fc_free(const void *data)
{
vfree(data);
}
-#define ATTRIBUTE(dir, name, member) \
-do { \
- void *rc; \
- rc = hypfs_create_u64(dir, name, member); \
- if (IS_ERR(rc)) \
- return PTR_ERR(rc); \
-} while(0)
-
-static int hypfs_vm_create_guest(struct dentry *systems_dir,
- struct diag2fc_data *data)
-{
- char guest_name[NAME_LEN + 1] = {};
- struct dentry *guest_dir, *cpus_dir, *samples_dir, *mem_dir;
- int dedicated_flag, capped_value;
-
- capped_value = (data->flags & 0x00000006) >> 1;
- dedicated_flag = (data->flags & 0x00000008) >> 3;
-
- /* guest dir */
- memcpy(guest_name, data->guest_name, NAME_LEN);
- EBCASC(guest_name, NAME_LEN);
- strim(guest_name);
- guest_dir = hypfs_mkdir(systems_dir, guest_name);
- if (IS_ERR(guest_dir))
- return PTR_ERR(guest_dir);
- ATTRIBUTE(guest_dir, "onlinetime_us", data->el_time);
-
- /* logical cpu information */
- cpus_dir = hypfs_mkdir(guest_dir, "cpus");
- if (IS_ERR(cpus_dir))
- return PTR_ERR(cpus_dir);
- ATTRIBUTE(cpus_dir, "cputime_us", data->used_cpu);
- ATTRIBUTE(cpus_dir, "capped", capped_value);
- ATTRIBUTE(cpus_dir, "dedicated", dedicated_flag);
- ATTRIBUTE(cpus_dir, "count", data->vcpus);
- /*
- * Note: The "weight_min" attribute got the wrong name.
- * The value represents the number of non-stopped (operating)
- * CPUS.
- */
- ATTRIBUTE(cpus_dir, "weight_min", data->ocpus);
- ATTRIBUTE(cpus_dir, "weight_max", data->cpu_max);
- ATTRIBUTE(cpus_dir, "weight_cur", data->cpu_shares);
-
- /* memory information */
- mem_dir = hypfs_mkdir(guest_dir, "mem");
- if (IS_ERR(mem_dir))
- return PTR_ERR(mem_dir);
- ATTRIBUTE(mem_dir, "min_KiB", data->mem_min_kb);
- ATTRIBUTE(mem_dir, "max_KiB", data->mem_max_kb);
- ATTRIBUTE(mem_dir, "used_KiB", data->mem_used_kb);
- ATTRIBUTE(mem_dir, "share_KiB", data->mem_share_kb);
-
- /* samples */
- samples_dir = hypfs_mkdir(guest_dir, "samples");
- if (IS_ERR(samples_dir))
- return PTR_ERR(samples_dir);
- ATTRIBUTE(samples_dir, "cpu_using", data->cpu_use_samp);
- ATTRIBUTE(samples_dir, "cpu_delay", data->cpu_delay_samp);
- ATTRIBUTE(samples_dir, "mem_delay", data->page_wait_samp);
- ATTRIBUTE(samples_dir, "idle", data->idle_samp);
- ATTRIBUTE(samples_dir, "other", data->other_samp);
- ATTRIBUTE(samples_dir, "total", data->total_samp);
- return 0;
-}
-
-int hypfs_vm_create_files(struct dentry *root)
-{
- struct dentry *dir, *file;
- struct diag2fc_data *data;
- unsigned int count = 0;
- int rc, i;
-
- data = diag2fc_store(guest_query, &count, 0);
- if (IS_ERR(data))
- return PTR_ERR(data);
-
- /* Hpervisor Info */
- dir = hypfs_mkdir(root, "hyp");
- if (IS_ERR(dir)) {
- rc = PTR_ERR(dir);
- goto failed;
- }
- file = hypfs_create_str(dir, "type", "z/VM Hypervisor");
- if (IS_ERR(file)) {
- rc = PTR_ERR(file);
- goto failed;
- }
-
- /* physical cpus */
- dir = hypfs_mkdir(root, "cpus");
- if (IS_ERR(dir)) {
- rc = PTR_ERR(dir);
- goto failed;
- }
- file = hypfs_create_u64(dir, "count", data->lcpus);
- if (IS_ERR(file)) {
- rc = PTR_ERR(file);
- goto failed;
- }
-
- /* guests */
- dir = hypfs_mkdir(root, "systems");
- if (IS_ERR(dir)) {
- rc = PTR_ERR(dir);
- goto failed;
- }
-
- for (i = 0; i < count; i++) {
- rc = hypfs_vm_create_guest(dir, &(data[i]));
- if (rc)
- goto failed;
- }
- diag2fc_free(data);
- return 0;
-
-failed:
- diag2fc_free(data);
- return rc;
-}
-
struct dbfs_d2fc_hdr {
u64 len; /* Length of d2fc buffer without header */
u16 version; /* Version of header */
- char tod_ext[STORE_CLOCK_EXT_SIZE]; /* TOD clock for d2fc */
+ union tod_clock tod_ext; /* TOD clock for d2fc */
u64 count; /* Number of VM guests in d2fc buffer */
char reserved[30];
} __attribute__ ((packed));
@@ -249,10 +99,10 @@ static int dbfs_diag2fc_create(void **data, void **data_free_ptr, size_t *size)
struct dbfs_d2fc *d2fc;
unsigned int count;
- d2fc = diag2fc_store(guest_query, &count, sizeof(d2fc->hdr));
+ d2fc = diag2fc_store(diag2fc_guest_query, &count, sizeof(d2fc->hdr));
if (IS_ERR(d2fc))
return PTR_ERR(d2fc);
- get_tod_clock_ext(d2fc->hdr.tod_ext);
+ store_tod_clock_ext(&d2fc->hdr.tod_ext);
d2fc->hdr.len = count * sizeof(struct diag2fc_data);
d2fc->hdr.version = DBFS_D2FC_HDR_VERSION;
d2fc->hdr.count = count;
@@ -274,9 +124,9 @@ int hypfs_vm_init(void)
if (!MACHINE_IS_VM)
return 0;
if (diag2fc(0, all_guests, NULL) > 0)
- guest_query = all_guests;
+ diag2fc_guest_query = all_guests;
else if (diag2fc(0, local_guest, NULL) > 0)
- guest_query = local_guest;
+ diag2fc_guest_query = local_guest;
else
return -EACCES;
hypfs_dbfs_create_file(&dbfs_file_2fc);
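
get_tod_clock_ext() taking a char array was replaced by store_tod_clock_ext() taking a typed union tod_clock, which is why the tod_ext field in the d2fc header changed type above. Minimal usage sketch (stamp_header() is illustrative):

	static void stamp_header(union tod_clock *stamp)
	{
		store_tod_clock_ext(stamp);	/* STCKE: 16-byte extended TOD clock */
	}
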
diff --git a/arch/s390/hypfs/hypfs_vm.h b/arch/s390/hypfs/hypfs_vm.h
new file mode 100644
index 000000000000..fe2e5851addd
--- /dev/null
+++ b/arch/s390/hypfs/hypfs_vm.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Hypervisor filesystem for Linux on s390. z/VM implementation.
+ *
+ * Copyright IBM Corp. 2006
+ * Author(s): Michael Holzheu <holzheu@de.ibm.com>
+ */
+
+#ifndef _S390_HYPFS_VM_H_
+#define _S390_HYPFS_VM_H_
+
+#define DIAG2FC_NAME_LEN 8
+
+struct diag2fc_data {
+ __u32 version;
+ __u32 flags;
+ __u64 used_cpu;
+ __u64 el_time;
+ __u64 mem_min_kb;
+ __u64 mem_max_kb;
+ __u64 mem_share_kb;
+ __u64 mem_used_kb;
+ __u32 pcpus;
+ __u32 lcpus;
+ __u32 vcpus;
+ __u32 ocpus;
+ __u32 cpu_max;
+ __u32 cpu_shares;
+ __u32 cpu_use_samp;
+ __u32 cpu_delay_samp;
+ __u32 page_wait_samp;
+ __u32 idle_samp;
+ __u32 other_samp;
+ __u32 total_samp;
+ char guest_name[DIAG2FC_NAME_LEN];
+};
+
+struct diag2fc_parm_list {
+ char userid[DIAG2FC_NAME_LEN];
+ char aci_grp[DIAG2FC_NAME_LEN];
+ __u64 addr;
+ __u32 size;
+ __u32 fmt;
+};
+
+void *diag2fc_store(char *query, unsigned int *count, int offset);
+void diag2fc_free(const void *data);
+extern char *diag2fc_guest_query;
+
+#endif /* _S390_HYPFS_VM_H_ */
diff --git a/arch/s390/hypfs/hypfs_vm_fs.c b/arch/s390/hypfs/hypfs_vm_fs.c
new file mode 100644
index 000000000000..6011289afa8c
--- /dev/null
+++ b/arch/s390/hypfs/hypfs_vm_fs.c
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Hypervisor filesystem for Linux on s390. z/VM implementation.
+ *
+ * Copyright IBM Corp. 2006
+ * Author(s): Michael Holzheu <holzheu@de.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/vmalloc.h>
+#include <asm/extable.h>
+#include <asm/diag.h>
+#include <asm/ebcdic.h>
+#include <asm/timex.h>
+#include "hypfs_vm.h"
+#include "hypfs.h"
+
+#define ATTRIBUTE(dir, name, member) \
+do { \
+ void *rc; \
+ rc = hypfs_create_u64(dir, name, member); \
+ if (IS_ERR(rc)) \
+ return PTR_ERR(rc); \
+} while (0)
+
+static int hypfs_vm_create_guest(struct dentry *systems_dir,
+ struct diag2fc_data *data)
+{
+ char guest_name[DIAG2FC_NAME_LEN + 1] = {};
+ struct dentry *guest_dir, *cpus_dir, *samples_dir, *mem_dir;
+ int dedicated_flag, capped_value;
+
+ capped_value = (data->flags & 0x00000006) >> 1;
+ dedicated_flag = (data->flags & 0x00000008) >> 3;
+
+ /* guest dir */
+ memcpy(guest_name, data->guest_name, DIAG2FC_NAME_LEN);
+ EBCASC(guest_name, DIAG2FC_NAME_LEN);
+ strim(guest_name);
+ guest_dir = hypfs_mkdir(systems_dir, guest_name);
+ if (IS_ERR(guest_dir))
+ return PTR_ERR(guest_dir);
+ ATTRIBUTE(guest_dir, "onlinetime_us", data->el_time);
+
+ /* logical cpu information */
+ cpus_dir = hypfs_mkdir(guest_dir, "cpus");
+ if (IS_ERR(cpus_dir))
+ return PTR_ERR(cpus_dir);
+ ATTRIBUTE(cpus_dir, "cputime_us", data->used_cpu);
+ ATTRIBUTE(cpus_dir, "capped", capped_value);
+ ATTRIBUTE(cpus_dir, "dedicated", dedicated_flag);
+ ATTRIBUTE(cpus_dir, "count", data->vcpus);
+ /*
+ * Note: The "weight_min" attribute got the wrong name.
+ * The value represents the number of non-stopped (operating)
+	 * CPUs.
+ */
+ ATTRIBUTE(cpus_dir, "weight_min", data->ocpus);
+ ATTRIBUTE(cpus_dir, "weight_max", data->cpu_max);
+ ATTRIBUTE(cpus_dir, "weight_cur", data->cpu_shares);
+
+ /* memory information */
+ mem_dir = hypfs_mkdir(guest_dir, "mem");
+ if (IS_ERR(mem_dir))
+ return PTR_ERR(mem_dir);
+ ATTRIBUTE(mem_dir, "min_KiB", data->mem_min_kb);
+ ATTRIBUTE(mem_dir, "max_KiB", data->mem_max_kb);
+ ATTRIBUTE(mem_dir, "used_KiB", data->mem_used_kb);
+ ATTRIBUTE(mem_dir, "share_KiB", data->mem_share_kb);
+
+ /* samples */
+ samples_dir = hypfs_mkdir(guest_dir, "samples");
+ if (IS_ERR(samples_dir))
+ return PTR_ERR(samples_dir);
+ ATTRIBUTE(samples_dir, "cpu_using", data->cpu_use_samp);
+ ATTRIBUTE(samples_dir, "cpu_delay", data->cpu_delay_samp);
+ ATTRIBUTE(samples_dir, "mem_delay", data->page_wait_samp);
+ ATTRIBUTE(samples_dir, "idle", data->idle_samp);
+ ATTRIBUTE(samples_dir, "other", data->other_samp);
+ ATTRIBUTE(samples_dir, "total", data->total_samp);
+ return 0;
+}
+
+int hypfs_vm_create_files(struct dentry *root)
+{
+ struct dentry *dir, *file;
+ struct diag2fc_data *data;
+ unsigned int count = 0;
+ int rc, i;
+
+ data = diag2fc_store(diag2fc_guest_query, &count, 0);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
+
+ /* Hypervisor Info */
+ dir = hypfs_mkdir(root, "hyp");
+ if (IS_ERR(dir)) {
+ rc = PTR_ERR(dir);
+ goto failed;
+ }
+ file = hypfs_create_str(dir, "type", "z/VM Hypervisor");
+ if (IS_ERR(file)) {
+ rc = PTR_ERR(file);
+ goto failed;
+ }
+
+ /* physical cpus */
+ dir = hypfs_mkdir(root, "cpus");
+ if (IS_ERR(dir)) {
+ rc = PTR_ERR(dir);
+ goto failed;
+ }
+ file = hypfs_create_u64(dir, "count", data->lcpus);
+ if (IS_ERR(file)) {
+ rc = PTR_ERR(file);
+ goto failed;
+ }
+
+ /* guests */
+ dir = hypfs_mkdir(root, "systems");
+ if (IS_ERR(dir)) {
+ rc = PTR_ERR(dir);
+ goto failed;
+ }
+
+ for (i = 0; i < count; i++) {
+ rc = hypfs_vm_create_guest(dir, &data[i]);
+ if (rc)
+ goto failed;
+ }
+ diag2fc_free(data);
+ return 0;
+
+failed:
+ diag2fc_free(data);
+ return rc;
+}
diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c
index 70139d0791b6..858beaf4a8cb 100644
--- a/arch/s390/hypfs/inode.c
+++ b/arch/s390/hypfs/inode.c
@@ -53,7 +53,7 @@ static void hypfs_update_update(struct super_block *sb)
struct inode *inode = d_inode(sb_info->update_file);
sb_info->last_update = ktime_get_seconds();
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ simple_inode_init_ts(inode);
}
/* directory tree removal functions */
@@ -101,7 +101,7 @@ static struct inode *hypfs_make_inode(struct super_block *sb, umode_t mode)
ret->i_mode = mode;
ret->i_uid = hypfs_info->uid;
ret->i_gid = hypfs_info->gid;
- ret->i_atime = ret->i_mtime = ret->i_ctime = current_time(ret);
+ simple_inode_init_ts(ret);
if (S_ISDIR(mode))
set_nlink(ret, 2);
}
@@ -209,17 +209,12 @@ static int hypfs_release(struct inode *inode, struct file *filp)
enum { Opt_uid, Opt_gid, };
-static const struct fs_parameter_spec hypfs_param_specs[] = {
+static const struct fs_parameter_spec hypfs_fs_parameters[] = {
fsparam_u32("gid", Opt_gid),
fsparam_u32("uid", Opt_uid),
{}
};
-static const struct fs_parameter_description hypfs_fs_parameters = {
- .name = "hypfs",
- .specs = hypfs_param_specs,
-};
-
static int hypfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct hypfs_sb_info *hypfs_info = fc->s_fs_info;
@@ -228,7 +223,7 @@ static int hypfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
kgid_t gid;
int opt;
- opt = fs_parse(fc, &hypfs_fs_parameters, param, &result);
+ opt = fs_parse(fc, hypfs_fs_parameters, param, &result);
if (opt < 0)
return opt;
@@ -455,7 +450,7 @@ static struct file_system_type hypfs_type = {
.owner = THIS_MODULE,
.name = "s390_hypfs",
.init_fs_context = hypfs_init_fs_context,
- .parameters = &hypfs_fs_parameters,
+ .parameters = hypfs_fs_parameters,
.kill_sb = hypfs_kill_super
};
@@ -465,45 +460,18 @@ static const struct super_operations hypfs_s_ops = {
.show_options = hypfs_show_options,
};
-static int __init hypfs_init(void)
+int __init __hypfs_fs_init(void)
{
int rc;
- hypfs_dbfs_init();
-
- if (hypfs_diag_init()) {
- rc = -ENODATA;
- goto fail_dbfs_exit;
- }
- if (hypfs_vm_init()) {
- rc = -ENODATA;
- goto fail_hypfs_diag_exit;
- }
- hypfs_sprp_init();
- if (hypfs_diag0c_init()) {
- rc = -ENODATA;
- goto fail_hypfs_sprp_exit;
- }
rc = sysfs_create_mount_point(hypervisor_kobj, "s390");
if (rc)
- goto fail_hypfs_diag0c_exit;
+ return rc;
rc = register_filesystem(&hypfs_type);
if (rc)
- goto fail_filesystem;
+ goto fail;
return 0;
-
-fail_filesystem:
+fail:
sysfs_remove_mount_point(hypervisor_kobj, "s390");
-fail_hypfs_diag0c_exit:
- hypfs_diag0c_exit();
-fail_hypfs_sprp_exit:
- hypfs_sprp_exit();
- hypfs_vm_exit();
-fail_hypfs_diag_exit:
- hypfs_diag_exit();
-fail_dbfs_exit:
- hypfs_dbfs_exit();
- pr_err("Initialization of hypfs failed with rc=%i\n", rc);
return rc;
}
-device_initcall(hypfs_init)
diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild
index 2531f673f099..4b904110d27c 100644
--- a/arch/s390/include/asm/Kbuild
+++ b/arch/s390/include/asm/Kbuild
@@ -5,22 +5,5 @@ generated-y += syscall_table.h
generated-y += unistd_nr.h
generic-y += asm-offsets.h
-generic-y += cacheflush.h
-generic-y += device.h
-generic-y += dma-contiguous.h
-generic-y += dma-mapping.h
-generic-y += div64.h
-generic-y += emergency-restart.h
-generic-y += export.h
-generic-y += fb.h
-generic-y += irq_regs.h
-generic-y += irq_work.h
-generic-y += kmap_types.h
-generic-y += local.h
-generic-y += local64.h
+generic-y += kvm_types.h
generic-y += mcs_spinlock.h
-generic-y += mm-arch-hooks.h
-generic-y += mmiowb.h
-generic-y += trace_clock.h
-generic-y += unaligned.h
-generic-y += word-at-a-time.h
diff --git a/arch/s390/include/asm/abs_lowcore.h b/arch/s390/include/asm/abs_lowcore.h
new file mode 100644
index 000000000000..6f264b79e377
--- /dev/null
+++ b/arch/s390/include/asm/abs_lowcore.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_S390_ABS_LOWCORE_H
+#define _ASM_S390_ABS_LOWCORE_H
+
+#include <asm/lowcore.h>
+
+#define ABS_LOWCORE_MAP_SIZE (NR_CPUS * sizeof(struct lowcore))
+
+extern unsigned long __abs_lowcore;
+
+int abs_lowcore_map(int cpu, struct lowcore *lc, bool alloc);
+void abs_lowcore_unmap(int cpu);
+
+static inline struct lowcore *get_abs_lowcore(void)
+{
+ int cpu;
+
+ cpu = get_cpu();
+ return ((struct lowcore *)__abs_lowcore) + cpu;
+}
+
+static inline void put_abs_lowcore(struct lowcore *lc)
+{
+ put_cpu();
+}
+
+#endif /* _ASM_S390_ABS_LOWCORE_H */
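
The get_abs_lowcore()/put_abs_lowcore() pair above follows the usual get/put pattern: get_cpu() disables preemption, so the returned per-CPU slice of the absolute lowcore mapping stays valid until it is released. A minimal usage sketch, assuming a caller that updates a single lowcore field (the field name is illustrative):

static void example_set_restart_stack(unsigned long stack)
{
	struct lowcore *abs_lc;

	abs_lc = get_abs_lowcore();	/* get_cpu(): preemption disabled */
	abs_lc->restart_stack = stack;	/* field chosen for illustration */
	put_abs_lowcore(abs_lc);	/* put_cpu(): preemption enabled again */
}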
diff --git a/arch/s390/include/asm/airq.h b/arch/s390/include/asm/airq.h
index 01936fdfaddb..c4c28c2609a5 100644
--- a/arch/s390/include/asm/airq.h
+++ b/arch/s390/include/asm/airq.h
@@ -12,12 +12,12 @@
#include <linux/bit_spinlock.h>
#include <linux/dma-mapping.h>
+#include <asm/tpi.h>
struct airq_struct {
struct hlist_node list; /* Handler queueing. */
- void (*handler)(struct airq_struct *airq, bool floating);
+ void (*handler)(struct airq_struct *airq, struct tpi_info *tpi_info);
u8 *lsi_ptr; /* Local-Summary-Indicator pointer */
- u8 lsi_mask; /* Local-Summary-Indicator mask */
u8 isc; /* Interrupt-subclass */
u8 flags;
};
@@ -46,8 +46,10 @@ struct airq_iv {
#define AIRQ_IV_PTR 4 /* Allocate the ptr array */
#define AIRQ_IV_DATA 8 /* Allocate the data array */
#define AIRQ_IV_CACHELINE 16 /* Cacheline alignment for the vector */
+#define AIRQ_IV_GUESTVEC 32 /* Vector is a pinned guest page */
-struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags);
+struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags,
+ unsigned long *vec);
void airq_iv_release(struct airq_iv *iv);
unsigned long airq_iv_alloc(struct airq_iv *iv, unsigned long num);
void airq_iv_free(struct airq_iv *iv, unsigned long bit, unsigned long num);
diff --git a/arch/s390/include/asm/alternative-asm.h b/arch/s390/include/asm/alternative-asm.h
index 955d620db23e..7db046596b93 100644
--- a/arch/s390/include/asm/alternative-asm.h
+++ b/arch/s390/include/asm/alternative-asm.h
@@ -5,19 +5,6 @@
#ifdef __ASSEMBLY__
/*
- * Check the length of an instruction sequence. The length may not be larger
- * than 254 bytes and it has to be divisible by 2.
- */
-.macro alt_len_check start,end
- .if ( \end - \start ) > 254
- .error "cpu alternatives does not support instructions blocks > 254 bytes\n"
- .endif
- .if ( \end - \start ) % 2
- .error "cpu alternatives instructions length is odd\n"
- .endif
-.endm
-
-/*
* Issue one struct alt_instr descriptor entry (need to put it into
* the section .altinstructions, see below). This entry contains
* enough information for the alternatives patching code to patch an
@@ -28,60 +15,29 @@
.long \alt_start - .
.word \feature
.byte \orig_end - \orig_start
- .byte \alt_end - \alt_start
-.endm
-
-/*
- * Fill up @bytes with nops. The macro emits 6-byte nop instructions
- * for the bulk of the area, possibly followed by a 4-byte and/or
- * a 2-byte nop if the size of the area is not divisible by 6.
- */
-.macro alt_pad_fill bytes
- .fill ( \bytes ) / 6, 6, 0xc0040000
- .fill ( \bytes ) % 6 / 4, 4, 0x47000000
- .fill ( \bytes ) % 6 % 4 / 2, 2, 0x0700
-.endm
-
-/*
- * Fill up @bytes with nops. If the number of bytes is larger
- * than 6, emit a jg instruction to branch over all nops, then
- * fill an area of size (@bytes - 6) with nop instructions.
- */
-.macro alt_pad bytes
- .if ( \bytes > 0 )
- .if ( \bytes > 6 )
- jg . + \bytes
- alt_pad_fill \bytes - 6
- .else
- alt_pad_fill \bytes
- .endif
- .endif
+ .org . - ( \orig_end - \orig_start ) + ( \alt_end - \alt_start )
+ .org . - ( \alt_end - \alt_start ) + ( \orig_end - \orig_start )
.endm
/*
* Define an alternative between two instructions. If @feature is
* present, early code in apply_alternatives() replaces @oldinstr with
- * @newinstr. ".skip" directive takes care of proper instruction padding
- * in case @newinstr is longer than @oldinstr.
+ * @newinstr.
*/
.macro ALTERNATIVE oldinstr, newinstr, feature
.pushsection .altinstr_replacement,"ax"
770: \newinstr
771: .popsection
772: \oldinstr
-773: alt_len_check 770b, 771b
- alt_len_check 772b, 773b
- alt_pad ( ( 771b - 770b ) - ( 773b - 772b ) )
-774: .pushsection .altinstructions,"a"
- alt_entry 772b, 774b, 770b, 771b, \feature
+773: .pushsection .altinstructions,"a"
+ alt_entry 772b, 773b, 770b, 771b, \feature
.popsection
.endm
/*
* Define an alternative between two instructions. If @feature is
* present, early code in apply_alternatives() replaces @oldinstr with
- * @newinstr. ".skip" directive takes care of proper instruction padding
- * in case @newinstr is longer than @oldinstr.
+ * @newinstr.
*/
.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2
.pushsection .altinstr_replacement,"ax"
@@ -89,17 +45,9 @@
771: \newinstr2
772: .popsection
773: \oldinstr
-774: alt_len_check 770b, 771b
- alt_len_check 771b, 772b
- alt_len_check 773b, 774b
- .if ( 771b - 770b > 772b - 771b )
- alt_pad ( ( 771b - 770b ) - ( 774b - 773b ) )
- .else
- alt_pad ( ( 772b - 771b ) - ( 774b - 773b ) )
- .endif
-775: .pushsection .altinstructions,"a"
- alt_entry 773b, 775b, 770b, 771b,\feature1
- alt_entry 773b, 775b, 771b, 772b,\feature2
+774: .pushsection .altinstructions,"a"
+ alt_entry 773b, 774b, 770b, 771b,\feature1
+ alt_entry 773b, 774b, 771b, 772b,\feature2
.popsection
.endm
diff --git a/arch/s390/include/asm/alternative.h b/arch/s390/include/asm/alternative.h
index 1c8a38f762a3..904dd049f954 100644
--- a/arch/s390/include/asm/alternative.h
+++ b/arch/s390/include/asm/alternative.h
@@ -13,32 +13,25 @@ struct alt_instr {
s32 repl_offset; /* offset to replacement instruction */
u16 facility; /* facility bit set for replacement */
u8 instrlen; /* length of original instruction */
- u8 replacementlen; /* length of new instruction */
} __packed;
void apply_alternative_instructions(void);
void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
/*
- * |661: |662: |6620 |663:
- * +-----------+---------------------+
- * | oldinstr | oldinstr_padding |
- * | +----------+----------+
- * | | | |
- * | | >6 bytes |6/4/2 nops|
- * | |6 bytes jg----------->
- * +-----------+---------------------+
- * ^^ static padding ^^
+ * +---------------------------------+
+ * |661: |662:
+ * | oldinstr |
+ * +---------------------------------+
*
* .altinstr_replacement section
- * +---------------------+-----------+
+ * +---------------------------------+
* |6641: |6651:
* | alternative instr 1 |
- * +-----------+---------+- - - - - -+
- * |6642: |6652: |
- * | alternative instr 2 | padding
- * +---------------------+- - - - - -+
- * ^ runtime ^
+ * +---------------------------------+
+ * |6642: |6652:
+ * | alternative instr 2 |
+ * +---------------------------------+
*
* .altinstructions section
* +---------------------------------+
@@ -47,70 +40,31 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
* +---------------------------------+
*/
-#define b_altinstr(num) "664"#num
-#define e_altinstr(num) "665"#num
-
-#define e_oldinstr_pad_end "663"
+#define b_altinstr(num) "664"#num
+#define e_altinstr(num) "665"#num
#define oldinstr_len "662b-661b"
-#define oldinstr_total_len e_oldinstr_pad_end"b-661b"
#define altinstr_len(num) e_altinstr(num)"b-"b_altinstr(num)"b"
-#define oldinstr_pad_len(num) \
- "-(((" altinstr_len(num) ")-(" oldinstr_len ")) > 0) * " \
- "((" altinstr_len(num) ")-(" oldinstr_len "))"
-
-#define INSTR_LEN_SANITY_CHECK(len) \
- ".if " len " > 254\n" \
- "\t.error \"cpu alternatives does not support instructions " \
- "blocks > 254 bytes\"\n" \
- ".endif\n" \
- ".if (" len ") %% 2\n" \
- "\t.error \"cpu alternatives instructions length is odd\"\n" \
- ".endif\n"
-
-#define OLDINSTR_PADDING(oldinstr, num) \
- ".if " oldinstr_pad_len(num) " > 6\n" \
- "\tjg " e_oldinstr_pad_end "f\n" \
- "6620:\n" \
- "\t.fill (" oldinstr_pad_len(num) " - (6620b-662b)) / 2, 2, 0x0700\n" \
- ".else\n" \
- "\t.fill " oldinstr_pad_len(num) " / 6, 6, 0xc0040000\n" \
- "\t.fill " oldinstr_pad_len(num) " %% 6 / 4, 4, 0x47000000\n" \
- "\t.fill " oldinstr_pad_len(num) " %% 6 %% 4 / 2, 2, 0x0700\n" \
- ".endif\n"
-
-#define OLDINSTR(oldinstr, num) \
- "661:\n\t" oldinstr "\n662:\n" \
- OLDINSTR_PADDING(oldinstr, num) \
- e_oldinstr_pad_end ":\n" \
- INSTR_LEN_SANITY_CHECK(oldinstr_len)
-
-#define OLDINSTR_2(oldinstr, num1, num2) \
- "661:\n\t" oldinstr "\n662:\n" \
- ".if " altinstr_len(num1) " < " altinstr_len(num2) "\n" \
- OLDINSTR_PADDING(oldinstr, num2) \
- ".else\n" \
- OLDINSTR_PADDING(oldinstr, num1) \
- ".endif\n" \
- e_oldinstr_pad_end ":\n" \
- INSTR_LEN_SANITY_CHECK(oldinstr_len)
+
+#define OLDINSTR(oldinstr) \
+ "661:\n\t" oldinstr "\n662:\n"
#define ALTINSTR_ENTRY(facility, num) \
"\t.long 661b - .\n" /* old instruction */ \
"\t.long " b_altinstr(num)"b - .\n" /* alt instruction */ \
"\t.word " __stringify(facility) "\n" /* facility bit */ \
- "\t.byte " oldinstr_total_len "\n" /* source len */ \
- "\t.byte " altinstr_len(num) "\n" /* alt instruction len */
+ "\t.byte " oldinstr_len "\n" /* instruction len */ \
+ "\t.org . - (" oldinstr_len ") + (" altinstr_len(num) ")\n" \
+ "\t.org . - (" altinstr_len(num) ") + (" oldinstr_len ")\n"
#define ALTINSTR_REPLACEMENT(altinstr, num) /* replacement */ \
- b_altinstr(num)":\n\t" altinstr "\n" e_altinstr(num) ":\n" \
- INSTR_LEN_SANITY_CHECK(altinstr_len(num))
+ b_altinstr(num)":\n\t" altinstr "\n" e_altinstr(num) ":\n"
/* alternative assembly primitive: */
#define ALTERNATIVE(oldinstr, altinstr, facility) \
".pushsection .altinstr_replacement, \"ax\"\n" \
ALTINSTR_REPLACEMENT(altinstr, 1) \
".popsection\n" \
- OLDINSTR(oldinstr, 1) \
+ OLDINSTR(oldinstr) \
".pushsection .altinstructions,\"a\"\n" \
ALTINSTR_ENTRY(facility, 1) \
".popsection\n"
@@ -120,7 +74,7 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
ALTINSTR_REPLACEMENT(altinstr1, 1) \
ALTINSTR_REPLACEMENT(altinstr2, 2) \
".popsection\n" \
- OLDINSTR_2(oldinstr, 1, 2) \
+ OLDINSTR(oldinstr) \
".pushsection .altinstructions,\"a\"\n" \
ALTINSTR_ENTRY(facility1, 1) \
ALTINSTR_ENTRY(facility2, 2) \
@@ -145,6 +99,22 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
asm_inline volatile(ALTERNATIVE_2(oldinstr, altinstr1, facility1, \
altinstr2, facility2) ::: "memory")
+/* Alternative inline assembly with input. */
+#define alternative_input(oldinstr, newinstr, feature, input...) \
+ asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
+ : : input)
+
+/* Like alternative_input, but with a single output argument */
+#define alternative_io(oldinstr, altinstr, facility, output, input...) \
+ asm_inline volatile(ALTERNATIVE(oldinstr, altinstr, facility) \
+ : output : input)
+
+/* Use this macro if more than one output parameter is needed. */
+#define ASM_OUTPUT2(a...) a
+
+/* Use this macro if clobbers are needed without inputs. */
+#define ASM_NO_INPUT_CLOBBER(clobber...) : clobber
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_S390_ALTERNATIVE_H */
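
With the padding logic removed, the paired .org directives in ALTINSTR_ENTRY (and in the asm alt_entry macro) now serve as a build-time check that @oldinstr and @newinstr have the same length. A hedged usage sketch of the C-level macro under that constraint; the instructions and the facility bit are chosen for illustration only (both BCR forms are 2 bytes, and facility bit 45 is assumed here to cover fast-BCR-serialization):

static inline void example_serialize(void)
{
	/* old and new instruction have equal length, as the .org checks
	 * above require; patched at boot if facility 45 is installed */
	asm_inline volatile(ALTERNATIVE("bcr 15,0", "bcr 14,0", 45)
			    ::: "memory");
}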
diff --git a/arch/s390/include/asm/ap.h b/arch/s390/include/asm/ap.h
index aea32dda3d14..43ac4a64f49b 100644
--- a/arch/s390/include/asm/ap.h
+++ b/arch/s390/include/asm/ap.h
@@ -12,6 +12,9 @@
#ifndef _ASM_S390_AP_H_
#define _ASM_S390_AP_H_
+#include <linux/io.h>
+#include <asm/asm-extable.h>
+
/**
* The ap_qid_t identifier of an ap queue.
* If the AP facilities test (APFT) facility is available,
@@ -40,10 +43,24 @@ struct ap_queue_status {
unsigned int queue_empty : 1;
unsigned int replies_waiting : 1;
unsigned int queue_full : 1;
- unsigned int _pad1 : 4;
+ unsigned int : 3;
+ unsigned int async : 1;
unsigned int irq_enabled : 1;
unsigned int response_code : 8;
- unsigned int _pad2 : 16;
+ unsigned int : 16;
+};
+
+/*
+ * AP queue status reg union to access the reg1
+ * register with the lower 32 bits comprising the
+ * ap queue status.
+ */
+union ap_queue_status_reg {
+ unsigned long value;
+ struct {
+ u32 _pad;
+ struct ap_queue_status status;
+ };
};
/**
@@ -53,54 +70,98 @@ struct ap_queue_status {
*/
static inline bool ap_instructions_available(void)
{
- register unsigned long reg0 asm ("0") = AP_MKQID(0, 0);
- register unsigned long reg1 asm ("1") = 0;
- register unsigned long reg2 asm ("2") = 0;
+ unsigned long reg0 = AP_MKQID(0, 0);
+ unsigned long reg1 = 0;
asm volatile(
- " .long 0xb2af0000\n" /* PQAP(TAPQ) */
- "0: la %0,1\n"
+ " lgr 0,%[reg0]\n" /* qid into gr0 */
+ " lghi 1,0\n" /* 0 into gr1 */
+ " lghi 2,0\n" /* 0 into gr2 */
+ " .insn rre,0xb2af0000,0,0\n" /* PQAP(TAPQ) */
+ "0: la %[reg1],1\n" /* 1 into reg1 */
"1:\n"
EX_TABLE(0b, 1b)
- : "+d" (reg1), "+d" (reg2)
- : "d" (reg0)
- : "cc");
+ : [reg1] "+&d" (reg1)
+ : [reg0] "d" (reg0)
+ : "cc", "0", "1", "2");
return reg1 != 0;
}
+/* TAPQ register GR2 response struct */
+struct ap_tapq_hwinfo {
+ union {
+ unsigned long value;
+ struct {
+ unsigned int fac : 32; /* facility bits */
+ unsigned int apinfo : 32; /* ap type, ... */
+ };
+ struct {
+ unsigned int apsc : 1; /* APSC */
+ unsigned int mex4k : 1; /* AP4KM */
+ unsigned int crt4k : 1; /* AP4KC */
+ unsigned int cca : 1; /* D */
+ unsigned int accel : 1; /* A */
+ unsigned int ep11 : 1; /* X */
+ unsigned int apxa : 1; /* APXA */
+ unsigned int : 1;
+ unsigned int class : 8;
+ unsigned int bs : 2; /* SE bind/assoc */
+ unsigned int : 14;
+ unsigned int at : 8; /* ap type */
+ unsigned int nd : 8; /* nr of domains */
+ unsigned int : 4;
+ unsigned int ml : 4; /* apxl ml */
+ unsigned int : 4;
+ unsigned int qd : 4; /* queue depth */
+ };
+ };
+};
+
+/*
+ * Convenience defines to be used with the bs field from struct ap_tapq_hwinfo
+ */
+#define AP_BS_Q_USABLE 0
+#define AP_BS_Q_USABLE_NO_SECURE_KEY 1
+#define AP_BS_Q_AVAIL_FOR_BINDING 2
+#define AP_BS_Q_UNUSABLE 3
+
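A hedged sketch of how the new hwinfo struct and the AP_BS_* values might be consumed; error handling is reduced to a response_code check and the -EIO return value is illustrative:

static int example_queue_bindable(ap_qid_t qid)
{
	struct ap_tapq_hwinfo hwinfo;
	struct ap_queue_status status;

	status = ap_test_queue(qid, 1, &hwinfo);
	if (status.response_code)
		return -EIO;
	return hwinfo.bs == AP_BS_Q_USABLE ||
	       hwinfo.bs == AP_BS_Q_AVAIL_FOR_BINDING;
}
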
/**
* ap_tapq(): Test adjunct processor queue.
* @qid: The AP queue number
- * @info: Pointer to queue descriptor
+ * @info: Pointer to tapq hwinfo struct
*
* Returns AP queue status structure.
*/
-static inline struct ap_queue_status ap_tapq(ap_qid_t qid, unsigned long *info)
+static inline struct ap_queue_status ap_tapq(ap_qid_t qid,
+ struct ap_tapq_hwinfo *info)
{
- register unsigned long reg0 asm ("0") = qid;
- register struct ap_queue_status reg1 asm ("1");
- register unsigned long reg2 asm ("2");
-
- asm volatile(".long 0xb2af0000" /* PQAP(TAPQ) */
- : "=d" (reg1), "=d" (reg2)
- : "d" (reg0)
- : "cc");
+ union ap_queue_status_reg reg1;
+ unsigned long reg2;
+
+ asm volatile(
+ " lgr 0,%[qid]\n" /* qid into gr0 */
+ " lghi 2,0\n" /* 0 into gr2 */
+ " .insn rre,0xb2af0000,0,0\n" /* PQAP(TAPQ) */
+ " lgr %[reg1],1\n" /* gr1 (status) into reg1 */
+ " lgr %[reg2],2\n" /* gr2 into reg2 */
+ : [reg1] "=&d" (reg1.value), [reg2] "=&d" (reg2)
+ : [qid] "d" (qid)
+ : "cc", "0", "1", "2");
if (info)
- *info = reg2;
- return reg1;
+ info->value = reg2;
+ return reg1.status;
}
/**
* ap_test_queue(): Test adjunct processor queue.
* @qid: The AP queue number
* @tbit: Test facilities bit
- * @info: Pointer to queue descriptor
+ * @info: Ptr to tapq gr2 struct
*
* Returns AP queue status structure.
*/
-static inline struct ap_queue_status ap_test_queue(ap_qid_t qid,
- int tbit,
- unsigned long *info)
+static inline struct ap_queue_status ap_test_queue(ap_qid_t qid, int tbit,
+ struct ap_tapq_hwinfo *info)
{
if (tbit)
qid |= 1UL << 23; /* set T bit*/
@@ -110,39 +171,51 @@ static inline struct ap_queue_status ap_test_queue(ap_qid_t qid,
/**
* ap_pqap_rapq(): Reset adjunct processor queue.
* @qid: The AP queue number
+ * @fbit: if != 0 set F bit
*
* Returns AP queue status structure.
*/
-static inline struct ap_queue_status ap_rapq(ap_qid_t qid)
+static inline struct ap_queue_status ap_rapq(ap_qid_t qid, int fbit)
{
- register unsigned long reg0 asm ("0") = qid | (1UL << 24);
- register struct ap_queue_status reg1 asm ("1");
+ unsigned long reg0 = qid | (1UL << 24); /* fc 1UL is RAPQ */
+ union ap_queue_status_reg reg1;
+
+ if (fbit)
+ reg0 |= 1UL << 22;
asm volatile(
- ".long 0xb2af0000" /* PQAP(RAPQ) */
- : "=d" (reg1)
- : "d" (reg0)
- : "cc");
- return reg1;
+ " lgr 0,%[reg0]\n" /* qid arg into gr0 */
+ " .insn rre,0xb2af0000,0,0\n" /* PQAP(RAPQ) */
+ " lgr %[reg1],1\n" /* gr1 (status) into reg1 */
+ : [reg1] "=&d" (reg1.value)
+ : [reg0] "d" (reg0)
+ : "cc", "0", "1");
+ return reg1.status;
}
/**
* ap_pqap_zapq(): Reset and zeroize adjunct processor queue.
* @qid: The AP queue number
+ * @fbit: if != 0 set F bit
*
* Returns AP queue status structure.
*/
-static inline struct ap_queue_status ap_zapq(ap_qid_t qid)
+static inline struct ap_queue_status ap_zapq(ap_qid_t qid, int fbit)
{
- register unsigned long reg0 asm ("0") = qid | (2UL << 24);
- register struct ap_queue_status reg1 asm ("1");
+ unsigned long reg0 = qid | (2UL << 24); /* fc 2UL is ZAPQ */
+ union ap_queue_status_reg reg1;
+
+ if (fbit)
+ reg0 |= 1UL << 22;
asm volatile(
- ".long 0xb2af0000" /* PQAP(ZAPQ) */
- : "=d" (reg1)
- : "d" (reg0)
- : "cc");
- return reg1;
+ " lgr 0,%[reg0]\n" /* qid arg into gr0 */
+ " .insn rre,0xb2af0000,0,0\n" /* PQAP(ZAPQ) */
+ " lgr %[reg1],1\n" /* gr1 (status) into reg1 */
+ : [reg1] "=&d" (reg1.value)
+ : [reg0] "d" (reg0)
+ : "cc", "0", "1");
+ return reg1.status;
}
/**
@@ -154,15 +227,16 @@ struct ap_config_info {
unsigned int apxa : 1; /* N bit */
unsigned int qact : 1; /* C bit */
unsigned int rc8a : 1; /* R bit */
- unsigned char _reserved1 : 4;
- unsigned char _reserved2[3];
- unsigned char Na; /* max # of APs - 1 */
- unsigned char Nd; /* max # of Domains - 1 */
- unsigned char _reserved3[10];
+ unsigned int : 4;
+ unsigned int apsb : 1; /* B bit */
+ unsigned int : 23;
+ unsigned char na; /* max # of APs - 1 */
+ unsigned char nd; /* max # of Domains - 1 */
+ unsigned char _reserved0[10];
unsigned int apm[8]; /* AP ID mask */
unsigned int aqm[8]; /* AP (usage) queue mask */
unsigned int adm[8]; /* AP (control) domain mask */
- unsigned char _reserved4[16];
+ unsigned char _reserved1[16];
} __aligned(8);
/**
@@ -172,18 +246,20 @@ struct ap_config_info {
*/
static inline int ap_qci(struct ap_config_info *config)
{
- register unsigned long reg0 asm ("0") = 4UL << 24;
- register unsigned long reg1 asm ("1") = -EOPNOTSUPP;
- register struct ap_config_info *reg2 asm ("2") = config;
+ unsigned long reg0 = 4UL << 24; /* fc 4UL is QCI */
+ unsigned long reg1 = -EOPNOTSUPP;
+ struct ap_config_info *reg2 = config;
asm volatile(
- ".long 0xb2af0000\n" /* PQAP(QCI) */
- "0: la %0,0\n"
+ " lgr 0,%[reg0]\n" /* QCI fc into gr0 */
+ " lgr 2,%[reg2]\n" /* ptr to config into gr2 */
+ " .insn rre,0xb2af0000,0,0\n" /* PQAP(QCI) */
+ "0: la %[reg1],0\n" /* good case, QCI fc available */
"1:\n"
EX_TABLE(0b, 1b)
- : "+d" (reg1)
- : "d" (reg0), "d" (reg2)
- : "cc", "memory");
+ : [reg1] "+&d" (reg1)
+ : [reg0] "d" (reg0), [reg2] "d" (reg2)
+ : "cc", "memory", "0", "2");
return reg1;
}
@@ -194,47 +270,50 @@ static inline int ap_qci(struct ap_config_info *config)
* parameter to the PQAP(AQIC) instruction. For details please
* see the AR documentation.
*/
-struct ap_qirq_ctrl {
- unsigned int _res1 : 8;
- unsigned int zone : 8; /* zone info */
- unsigned int ir : 1; /* ir flag: enable (1) or disable (0) irq */
- unsigned int _res2 : 4;
- unsigned int gisc : 3; /* guest isc field */
- unsigned int _res3 : 6;
- unsigned int gf : 2; /* gisa format */
- unsigned int _res4 : 1;
- unsigned int gisa : 27; /* gisa origin */
- unsigned int _res5 : 1;
- unsigned int isc : 3; /* irq sub class */
+union ap_qirq_ctrl {
+ unsigned long value;
+ struct {
+ unsigned int : 8;
+ unsigned int zone : 8; /* zone info */
+ unsigned int ir : 1; /* ir flag: enable (1) or disable (0) irq */
+ unsigned int : 4;
+ unsigned int gisc : 3; /* guest isc field */
+ unsigned int : 6;
+ unsigned int gf : 2; /* gisa format */
+ unsigned int : 1;
+ unsigned int gisa : 27; /* gisa origin */
+ unsigned int : 1;
+ unsigned int isc : 3; /* irq sub class */
+ };
};
/**
* ap_aqic(): Control interruption for a specific AP.
* @qid: The AP queue number
* @qirqctrl: struct ap_qirq_ctrl (64 bit value)
- * @ind: The notification indicator byte
+ * @pa_ind: Physical address of the notification indicator byte
*
* Returns AP queue status.
*/
static inline struct ap_queue_status ap_aqic(ap_qid_t qid,
- struct ap_qirq_ctrl qirqctrl,
- void *ind)
+ union ap_qirq_ctrl qirqctrl,
+ phys_addr_t pa_ind)
{
- register unsigned long reg0 asm ("0") = qid | (3UL << 24);
- register union {
- unsigned long value;
- struct ap_qirq_ctrl qirqctrl;
- struct ap_queue_status status;
- } reg1 asm ("1");
- register void *reg2 asm ("2") = ind;
+ unsigned long reg0 = qid | (3UL << 24); /* fc 3UL is AQIC */
+ union ap_queue_status_reg reg1;
+ unsigned long reg2 = pa_ind;
- reg1.qirqctrl = qirqctrl;
+ reg1.value = qirqctrl.value;
asm volatile(
- ".long 0xb2af0000" /* PQAP(AQIC) */
- : "+d" (reg1)
- : "d" (reg0), "d" (reg2)
- : "cc");
+ " lgr 0,%[reg0]\n" /* qid param into gr0 */
+ " lgr 1,%[reg1]\n" /* irq ctrl into gr1 */
+ " lgr 2,%[reg2]\n" /* ni addr into gr2 */
+ " .insn rre,0xb2af0000,0,0\n" /* PQAP(AQIC) */
+ " lgr %[reg1],1\n" /* gr1 (status) into reg1 */
+ : [reg1] "+&d" (reg1.value)
+ : [reg0] "d" (reg0), [reg2] "d" (reg2)
+ : "cc", "memory", "0", "1", "2");
return reg1.status;
}
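
A minimal sketch of enabling interruptions for a queue with the reworked interface, which now takes the union ap_qirq_ctrl and a physical indicator address; the caller is assumed to have allocated the indicator page, and the ISC value and error code are illustrative:

static int example_enable_ap_irq(ap_qid_t qid, void *ind_page)
{
	union ap_qirq_ctrl qirqctrl = { .value = 0 };
	struct ap_queue_status status;

	qirqctrl.ir = 1;	/* enable interruptions */
	qirqctrl.isc = 6;	/* interruption subclass, example value */
	status = ap_aqic(qid, qirqctrl, virt_to_phys(ind_page));
	return status.response_code ? -EIO : 0;
}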
@@ -257,7 +336,7 @@ union ap_qact_ap_info {
};
/**
- * ap_qact(): Query AP combatibility type.
+ * ap_qact(): Query AP compatibility type.
* @qid: The AP queue number
* @apinfo: On input the info about the AP queue. On output the
* alternate AP queue info provided by the qact function
@@ -268,25 +347,78 @@ union ap_qact_ap_info {
static inline struct ap_queue_status ap_qact(ap_qid_t qid, int ifbit,
union ap_qact_ap_info *apinfo)
{
- register unsigned long reg0 asm ("0") = qid | (5UL << 24)
- | ((ifbit & 0x01) << 22);
- register union {
- unsigned long value;
- struct ap_queue_status status;
- } reg1 asm ("1");
- register unsigned long reg2 asm ("2");
+ unsigned long reg0 = qid | (5UL << 24) | ((ifbit & 0x01) << 22);
+ union ap_queue_status_reg reg1;
+ unsigned long reg2;
reg1.value = apinfo->val;
asm volatile(
- ".long 0xb2af0000" /* PQAP(QACT) */
- : "+d" (reg1), "=d" (reg2)
- : "d" (reg0)
- : "cc");
+ " lgr 0,%[reg0]\n" /* qid param into gr0 */
+ " lgr 1,%[reg1]\n" /* qact in info into gr1 */
+ " .insn rre,0xb2af0000,0,0\n" /* PQAP(QACT) */
+ " lgr %[reg1],1\n" /* gr1 (status) into reg1 */
+ " lgr %[reg2],2\n" /* qact out info into reg2 */
+ : [reg1] "+&d" (reg1.value), [reg2] "=&d" (reg2)
+ : [reg0] "d" (reg0)
+ : "cc", "0", "1", "2");
apinfo->val = reg2;
return reg1.status;
}
+/*
+ * ap_bapq(): SE bind AP queue.
+ * @qid: The AP queue number
+ *
+ * Returns AP queue status structure.
+ *
+ * Invoking this function in a non-SE environment
+ * may cause a specification exception.
+ */
+static inline struct ap_queue_status ap_bapq(ap_qid_t qid)
+{
+ unsigned long reg0 = qid | (7UL << 24); /* fc 7 is BAPQ */
+ union ap_queue_status_reg reg1;
+
+ asm volatile(
+ " lgr 0,%[reg0]\n" /* qid arg into gr0 */
+ " .insn rre,0xb2af0000,0,0\n" /* PQAP(BAPQ) */
+ " lgr %[reg1],1\n" /* gr1 (status) into reg1 */
+ : [reg1] "=&d" (reg1.value)
+ : [reg0] "d" (reg0)
+ : "cc", "0", "1");
+
+ return reg1.status;
+}
+
+/*
+ * ap_aapq(): SE associate AP queue.
+ * @qid: The AP queue number
+ * @sec_idx: The secret index
+ *
+ * Returns AP queue status structure.
+ *
+ * Invoking this function in a non-SE environment
+ * may cause a specification exception.
+ */
+static inline struct ap_queue_status ap_aapq(ap_qid_t qid, unsigned int sec_idx)
+{
+ unsigned long reg0 = qid | (8UL << 24); /* fc 8 is AAPQ */
+ unsigned long reg2 = sec_idx;
+ union ap_queue_status_reg reg1;
+
+ asm volatile(
+ " lgr 0,%[reg0]\n" /* qid arg into gr0 */
+ " lgr 2,%[reg2]\n" /* secret index into gr2 */
+ " .insn rre,0xb2af0000,0,0\n" /* PQAP(AAPQ) */
+ " lgr %[reg1],1\n" /* gr1 (status) into reg1 */
+ : [reg1] "=&d" (reg1.value)
+ : [reg0] "d" (reg0), [reg2] "d" (reg2)
+ : "cc", "0", "1", "2");
+
+ return reg1.status;
+}
+
/**
* ap_nqap(): Send message to adjunct processor queue.
* @qid: The AP queue number
@@ -303,28 +435,36 @@ static inline struct ap_queue_status ap_nqap(ap_qid_t qid,
unsigned long long psmid,
void *msg, size_t length)
{
- register unsigned long reg0 asm ("0") = qid | 0x40000000UL;
- register struct ap_queue_status reg1 asm ("1");
- register unsigned long reg2 asm ("2") = (unsigned long) msg;
- register unsigned long reg3 asm ("3") = (unsigned long) length;
- register unsigned long reg4 asm ("4") = (unsigned int) (psmid >> 32);
- register unsigned long reg5 asm ("5") = psmid & 0xffffffff;
+ unsigned long reg0 = qid | 0x40000000UL; /* 0x4... is last msg part */
+ union register_pair nqap_r1, nqap_r2;
+ union ap_queue_status_reg reg1;
+
+ nqap_r1.even = (unsigned int)(psmid >> 32);
+ nqap_r1.odd = psmid & 0xffffffff;
+ nqap_r2.even = (unsigned long)msg;
+ nqap_r2.odd = (unsigned long)length;
asm volatile (
- "0: .long 0xb2ad0042\n" /* NQAP */
- " brc 2,0b"
- : "+d" (reg0), "=d" (reg1), "+d" (reg2), "+d" (reg3)
- : "d" (reg4), "d" (reg5)
- : "cc", "memory");
- return reg1;
+ " lgr 0,%[reg0]\n" /* qid param in gr0 */
+ "0: .insn rre,0xb2ad0000,%[nqap_r1],%[nqap_r2]\n"
+ " brc 2,0b\n" /* handle partial completion */
+ " lgr %[reg1],1\n" /* gr1 (status) into reg1 */
+ : [reg0] "+&d" (reg0), [reg1] "=&d" (reg1.value),
+ [nqap_r2] "+&d" (nqap_r2.pair)
+ : [nqap_r1] "d" (nqap_r1.pair)
+ : "cc", "memory", "0", "1");
+ return reg1.status;
}
/**
* ap_dqap(): Receive message from adjunct processor queue.
* @qid: The AP queue number
* @psmid: Pointer to program supplied message identifier
- * @msg: The message text
- * @length: The message length
+ * @msg: Pointer to message buffer
+ * @msglen: Message buffer size
+ * @length: Pointer to length of actually written bytes
+ * @reslength: Residual length on return
+ * @resgr0: input: gr0 value (only used if != 0), output: residual gr0 content
*
* Returns AP queue status structure.
* Condition code 1 on DQAP means the receive has taken place
@@ -336,28 +476,72 @@ static inline struct ap_queue_status ap_nqap(ap_qid_t qid,
* Note that gpr2 is used by the DQAP instruction to keep track of
* any 'residual' length, in case the instruction gets interrupted.
* Hence it gets zeroed before the instruction.
+ * If the message does not fit into the buffer, this function will
+ * return with a truncated message and the reply in the firmware queue
+ * is not removed. This is indicated to the caller with an
+ * ap_queue_status response_code value of all bits on (0xFF) and (if
+ * the reslength ptr is given) the remaining length is stored in
+ * *reslength and (if the resgr0 ptr is given) the updated gr0 value
+ * for further processing of this msg entry is stored in *resgr0. The
+ * caller needs to detect this situation and should invoke ap_dqap
+ * with a valid resgr0 ptr and a value in there != 0 to indicate that
+ * *resgr0 is to be used instead of qid to further process this entry.
*/
static inline struct ap_queue_status ap_dqap(ap_qid_t qid,
- unsigned long long *psmid,
- void *msg, size_t length)
+ unsigned long *psmid,
+ void *msg, size_t msglen,
+ size_t *length,
+ size_t *reslength,
+ unsigned long *resgr0)
{
- register unsigned long reg0 asm("0") = qid | 0x80000000UL;
- register struct ap_queue_status reg1 asm ("1");
- register unsigned long reg2 asm("2") = 0UL;
- register unsigned long reg4 asm("4") = (unsigned long) msg;
- register unsigned long reg5 asm("5") = (unsigned long) length;
- register unsigned long reg6 asm("6") = 0UL;
- register unsigned long reg7 asm("7") = 0UL;
+ unsigned long reg0 = resgr0 && *resgr0 ? *resgr0 : qid | 0x80000000UL;
+ union ap_queue_status_reg reg1;
+ unsigned long reg2;
+ union register_pair rp1, rp2;
+ rp1.even = 0UL;
+ rp1.odd = 0UL;
+ rp2.even = (unsigned long)msg;
+ rp2.odd = (unsigned long)msglen;
asm volatile(
- "0: .long 0xb2ae0064\n" /* DQAP */
- " brc 6,0b\n"
- : "+d" (reg0), "=d" (reg1), "+d" (reg2),
- "+d" (reg4), "+d" (reg5), "+d" (reg6), "+d" (reg7)
- : : "cc", "memory");
- *psmid = (((unsigned long long) reg6) << 32) + reg7;
- return reg1;
+ " lgr 0,%[reg0]\n" /* qid param into gr0 */
+ " lghi 2,0\n" /* 0 into gr2 (res length) */
+ "0: ltgr %N[rp2],%N[rp2]\n" /* check buf len */
+ " jz 2f\n" /* go out if buf len is 0 */
+ "1: .insn rre,0xb2ae0000,%[rp1],%[rp2]\n"
+ " brc 6,0b\n" /* handle partial complete */
+ "2: lgr %[reg0],0\n" /* gr0 (qid + info) into reg0 */
+ " lgr %[reg1],1\n" /* gr1 (status) into reg1 */
+ " lgr %[reg2],2\n" /* gr2 (res length) into reg2 */
+ : [reg0] "+&d" (reg0), [reg1] "=&d" (reg1.value),
+ [reg2] "=&d" (reg2), [rp1] "+&d" (rp1.pair),
+ [rp2] "+&d" (rp2.pair)
+ :
+ : "cc", "memory", "0", "1", "2");
+
+ if (reslength)
+ *reslength = reg2;
+ if (reg2 != 0 && rp2.odd == 0) {
+ /*
+ * Partially complete, status in gr1 is not set.
+ * Signal the caller that this dqap is only partially received
+ * with a special status response code 0xFF and *resgr0 updated
+ */
+ reg1.status.response_code = 0xFF;
+ if (resgr0)
+ *resgr0 = reg0;
+ } else {
+ *psmid = (rp1.even << 32) + rp1.odd;
+ if (resgr0)
+ *resgr0 = 0;
+ }
+
+ /* update *length with the nr of bytes stored into the msg buffer */
+ if (length)
+ *length = msglen - rp2.odd;
+
+ return reg1.status;
}
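
A hedged sketch of a caller checking for the truncation case described in the comment above; the error codes are illustrative, and a real caller would re-drive the entry with the saved gr0 value instead of giving up:

static long example_receive(ap_qid_t qid, void *buf, size_t buflen)
{
	struct ap_queue_status status;
	unsigned long psmid, resgr0 = 0;
	size_t len = 0, reslen = 0;

	status = ap_dqap(qid, &psmid, buf, buflen, &len, &reslen, &resgr0);
	if (status.response_code == 0xFF && resgr0)
		/* reply truncated: reslen bytes remain, resgr0 holds the
		 * gr0 value to pass to a follow-up ap_dqap() call */
		return -EMSGSIZE;
	return status.response_code ? -EIO : (long)len;
}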
/*
@@ -368,7 +552,7 @@ static inline struct ap_queue_status ap_dqap(ap_qid_t qid,
#if IS_ENABLED(CONFIG_ZCRYPT)
void ap_bus_cfg_chg(void);
#else
-static inline void ap_bus_cfg_chg(void){};
+static inline void ap_bus_cfg_chg(void){}
#endif
#endif /* _ASM_S390_AP_H_ */
diff --git a/arch/s390/include/asm/appldata.h b/arch/s390/include/asm/appldata.h
index c5bd9f4437e5..f2240392c708 100644
--- a/arch/s390/include/asm/appldata.h
+++ b/arch/s390/include/asm/appldata.h
@@ -8,8 +8,8 @@
#ifndef _ASM_S390_APPLDATA_H
#define _ASM_S390_APPLDATA_H
+#include <linux/io.h>
#include <asm/diag.h>
-#include <asm/io.h>
#define APPLDATA_START_INTERVAL_REC 0x80
#define APPLDATA_STOP_REC 0x81
diff --git a/arch/s390/include/asm/archrandom.h b/arch/s390/include/asm/archrandom.h
index c67b82dfa558..1594049893e0 100644
--- a/arch/s390/include/asm/archrandom.h
+++ b/arch/s390/include/asm/archrandom.h
@@ -2,7 +2,7 @@
/*
* Kernel interface for the s390 arch_random_* functions
*
- * Copyright IBM Corp. 2017
+ * Copyright IBM Corp. 2017, 2022
*
* Author: Harald Freudenberger <freude@de.ibm.com>
*
@@ -11,53 +11,28 @@
#ifndef _ASM_S390_ARCHRANDOM_H
#define _ASM_S390_ARCHRANDOM_H
-#ifdef CONFIG_ARCH_RANDOM
-
#include <linux/static_key.h>
+#include <linux/preempt.h>
#include <linux/atomic.h>
+#include <asm/cpacf.h>
DECLARE_STATIC_KEY_FALSE(s390_arch_random_available);
extern atomic64_t s390_arch_random_counter;
-bool s390_arch_random_generate(u8 *buf, unsigned int nbytes);
-
-static inline bool arch_has_random(void)
-{
- return false;
-}
-
-static inline bool arch_has_random_seed(void)
-{
- if (static_branch_likely(&s390_arch_random_available))
- return true;
- return false;
-}
-
-static inline bool arch_get_random_long(unsigned long *v)
+static inline size_t __must_check arch_get_random_longs(unsigned long *v, size_t max_longs)
{
- return false;
-}
-
-static inline bool arch_get_random_int(unsigned int *v)
-{
- return false;
-}
-
-static inline bool arch_get_random_seed_long(unsigned long *v)
-{
- if (static_branch_likely(&s390_arch_random_available)) {
- return s390_arch_random_generate((u8 *)v, sizeof(*v));
- }
- return false;
+ return 0;
}
-static inline bool arch_get_random_seed_int(unsigned int *v)
+static inline size_t __must_check arch_get_random_seed_longs(unsigned long *v, size_t max_longs)
{
- if (static_branch_likely(&s390_arch_random_available)) {
- return s390_arch_random_generate((u8 *)v, sizeof(*v));
+ if (static_branch_likely(&s390_arch_random_available) &&
+ in_task()) {
+ cpacf_trng(NULL, 0, (u8 *)v, max_longs * sizeof(*v));
+ atomic64_add(max_longs * sizeof(*v), &s390_arch_random_counter);
+ return max_longs;
}
- return false;
+ return 0;
}
-#endif /* CONFIG_ARCH_RANDOM */
#endif /* _ASM_S390_ARCHRANDOM_H */
diff --git a/arch/s390/include/asm/asm-const.h b/arch/s390/include/asm/asm-const.h
new file mode 100644
index 000000000000..11f615eb0066
--- /dev/null
+++ b/arch/s390/include/asm/asm-const.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_S390_ASM_CONST_H
+#define _ASM_S390_ASM_CONST_H
+
+#ifdef __ASSEMBLY__
+# define stringify_in_c(...) __VA_ARGS__
+#else
+/* This version of stringify will deal with commas... */
+# define __stringify_in_c(...) #__VA_ARGS__
+# define stringify_in_c(...) __stringify_in_c(__VA_ARGS__) " "
+#endif
+#endif /* _ASM_S390_ASM_CONST_H */
diff --git a/arch/s390/include/asm/asm-extable.h b/arch/s390/include/asm/asm-extable.h
new file mode 100644
index 000000000000..4a6b0a8b6412
--- /dev/null
+++ b/arch/s390/include/asm/asm-extable.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_EXTABLE_H
+#define __ASM_EXTABLE_H
+
+#include <linux/stringify.h>
+#include <linux/bits.h>
+#include <asm/asm-const.h>
+
+#define EX_TYPE_NONE 0
+#define EX_TYPE_FIXUP 1
+#define EX_TYPE_BPF 2
+#define EX_TYPE_UA_STORE 3
+#define EX_TYPE_UA_LOAD_MEM 4
+#define EX_TYPE_UA_LOAD_REG 5
+#define EX_TYPE_UA_LOAD_REGPAIR 6
+#define EX_TYPE_ZEROPAD 7
+
+#define EX_DATA_REG_ERR_SHIFT 0
+#define EX_DATA_REG_ERR GENMASK(3, 0)
+
+#define EX_DATA_REG_ADDR_SHIFT 4
+#define EX_DATA_REG_ADDR GENMASK(7, 4)
+
+#define EX_DATA_LEN_SHIFT 8
+#define EX_DATA_LEN GENMASK(11, 8)
+
+#define __EX_TABLE(_section, _fault, _target, _type, _regerr, _regaddr, _len) \
+ stringify_in_c(.section _section,"a";) \
+ stringify_in_c(.balign 4;) \
+ stringify_in_c(.long (_fault) - .;) \
+ stringify_in_c(.long (_target) - .;) \
+ stringify_in_c(.short (_type);) \
+ stringify_in_c(.macro extable_reg regerr, regaddr;) \
+ stringify_in_c(.set .Lfound, 0;) \
+ stringify_in_c(.set .Lcurr, 0;) \
+ stringify_in_c(.irp rs,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15;) \
+ stringify_in_c( .ifc "\regerr", "%%r\rs";) \
+ stringify_in_c( .set .Lfound, 1;) \
+ stringify_in_c( .set .Lregerr, .Lcurr;) \
+ stringify_in_c( .endif;) \
+ stringify_in_c( .set .Lcurr, .Lcurr+1;) \
+ stringify_in_c(.endr;) \
+ stringify_in_c(.ifne (.Lfound != 1);) \
+ stringify_in_c( .error "extable_reg: bad register argument1";) \
+ stringify_in_c(.endif;) \
+ stringify_in_c(.set .Lfound, 0;) \
+ stringify_in_c(.set .Lcurr, 0;) \
+ stringify_in_c(.irp rs,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15;) \
+ stringify_in_c( .ifc "\regaddr", "%%r\rs";) \
+ stringify_in_c( .set .Lfound, 1;) \
+ stringify_in_c( .set .Lregaddr, .Lcurr;) \
+ stringify_in_c( .endif;) \
+ stringify_in_c( .set .Lcurr, .Lcurr+1;) \
+ stringify_in_c(.endr;) \
+ stringify_in_c(.ifne (.Lfound != 1);) \
+ stringify_in_c( .error "extable_reg: bad register argument2";) \
+ stringify_in_c(.endif;) \
+ stringify_in_c(.short .Lregerr << EX_DATA_REG_ERR_SHIFT | \
+ .Lregaddr << EX_DATA_REG_ADDR_SHIFT | \
+ _len << EX_DATA_LEN_SHIFT;) \
+ stringify_in_c(.endm;) \
+ stringify_in_c(extable_reg _regerr,_regaddr;) \
+ stringify_in_c(.purgem extable_reg;) \
+ stringify_in_c(.previous)
+
+#define EX_TABLE(_fault, _target) \
+ __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_FIXUP, __stringify(%%r0), __stringify(%%r0), 0)
+
+#define EX_TABLE_AMODE31(_fault, _target) \
+ __EX_TABLE(.amode31.ex_table, _fault, _target, EX_TYPE_FIXUP, __stringify(%%r0), __stringify(%%r0), 0)
+
+#define EX_TABLE_UA_STORE(_fault, _target, _regerr) \
+ __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_UA_STORE, _regerr, _regerr, 0)
+
+#define EX_TABLE_UA_LOAD_MEM(_fault, _target, _regerr, _regmem, _len) \
+ __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_UA_LOAD_MEM, _regerr, _regmem, _len)
+
+#define EX_TABLE_UA_LOAD_REG(_fault, _target, _regerr, _regzero) \
+ __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_UA_LOAD_REG, _regerr, _regzero, 0)
+
+#define EX_TABLE_UA_LOAD_REGPAIR(_fault, _target, _regerr, _regzero) \
+ __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_UA_LOAD_REGPAIR, _regerr, _regzero, 0)
+
+#define EX_TABLE_ZEROPAD(_fault, _target, _regdata, _regaddr) \
+ __EX_TABLE(__ex_table, _fault, _target, EX_TYPE_ZEROPAD, _regdata, _regaddr, 0)
+
+#endif /* __ASM_EXTABLE_H */
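
A hedged sketch of how the basic EX_TABLE() form is wired into an inline-asm sequence (the same pattern the PQAP wrappers above use): on a fault at label 0, execution resumes at label 1 and the preset error value is returned. The instruction and constraints are illustrative:

static inline int example_probe_read(unsigned long *dst, unsigned long *src)
{
	int rc = -EFAULT;

	asm volatile(
		"0:	lg	%[val],%[addr]\n"
		"	lhi	%[rc],0\n"
		"1:\n"
		EX_TABLE(0b, 1b)
		: [rc] "+d" (rc), [val] "=d" (*dst)
		: [addr] "Q" (*src)
		: "cc");
	return rc;
}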
diff --git a/arch/s390/include/asm/asm-prototypes.h b/arch/s390/include/asm/asm-prototypes.h
index c37eb921bfbf..a873e873e1ee 100644
--- a/arch/s390/include/asm/asm-prototypes.h
+++ b/arch/s390/include/asm/asm-prototypes.h
@@ -6,4 +6,8 @@
#include <asm/fpu/api.h>
#include <asm-generic/asm-prototypes.h>
+__int128_t __ashlti3(__int128_t a, int b);
+__int128_t __ashrti3(__int128_t a, int b);
+__int128_t __lshrti3(__int128_t a, int b);
+
#endif /* _ASM_S390_PROTOTYPES_H */
diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h
index 491ad53a0d4e..7138d189cc42 100644
--- a/arch/s390/include/asm/atomic.h
+++ b/arch/s390/include/asm/atomic.h
@@ -15,56 +15,46 @@
#include <asm/barrier.h>
#include <asm/cmpxchg.h>
-#define ATOMIC_INIT(i) { (i) }
-
-static inline int atomic_read(const atomic_t *v)
+static inline int arch_atomic_read(const atomic_t *v)
{
- int c;
-
- asm volatile(
- " l %0,%1\n"
- : "=d" (c) : "Q" (v->counter));
- return c;
+ return __atomic_read(v);
}
+#define arch_atomic_read arch_atomic_read
-static inline void atomic_set(atomic_t *v, int i)
+static inline void arch_atomic_set(atomic_t *v, int i)
{
- asm volatile(
- " st %1,%0\n"
- : "=Q" (v->counter) : "d" (i));
+ __atomic_set(v, i);
}
+#define arch_atomic_set arch_atomic_set
-static inline int atomic_add_return(int i, atomic_t *v)
+static inline int arch_atomic_add_return(int i, atomic_t *v)
{
return __atomic_add_barrier(i, &v->counter) + i;
}
+#define arch_atomic_add_return arch_atomic_add_return
-static inline int atomic_fetch_add(int i, atomic_t *v)
+static inline int arch_atomic_fetch_add(int i, atomic_t *v)
{
return __atomic_add_barrier(i, &v->counter);
}
+#define arch_atomic_fetch_add arch_atomic_fetch_add
-static inline void atomic_add(int i, atomic_t *v)
+static inline void arch_atomic_add(int i, atomic_t *v)
{
-#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
- if (__builtin_constant_p(i) && (i > -129) && (i < 128)) {
- __atomic_add_const(i, &v->counter);
- return;
- }
-#endif
__atomic_add(i, &v->counter);
}
+#define arch_atomic_add arch_atomic_add
-#define atomic_sub(_i, _v) atomic_add(-(int)(_i), _v)
-#define atomic_sub_return(_i, _v) atomic_add_return(-(int)(_i), _v)
-#define atomic_fetch_sub(_i, _v) atomic_fetch_add(-(int)(_i), _v)
+#define arch_atomic_sub(_i, _v) arch_atomic_add(-(int)(_i), _v)
+#define arch_atomic_sub_return(_i, _v) arch_atomic_add_return(-(int)(_i), _v)
+#define arch_atomic_fetch_sub(_i, _v) arch_atomic_fetch_add(-(int)(_i), _v)
#define ATOMIC_OPS(op) \
-static inline void atomic_##op(int i, atomic_t *v) \
+static inline void arch_atomic_##op(int i, atomic_t *v) \
{ \
__atomic_##op(i, &v->counter); \
} \
-static inline int atomic_fetch_##op(int i, atomic_t *v) \
+static inline int arch_atomic_fetch_##op(int i, atomic_t *v) \
{ \
return __atomic_##op##_barrier(i, &v->counter); \
}
@@ -75,66 +65,67 @@ ATOMIC_OPS(xor)
#undef ATOMIC_OPS
-#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
+#define arch_atomic_and arch_atomic_and
+#define arch_atomic_or arch_atomic_or
+#define arch_atomic_xor arch_atomic_xor
+#define arch_atomic_fetch_and arch_atomic_fetch_and
+#define arch_atomic_fetch_or arch_atomic_fetch_or
+#define arch_atomic_fetch_xor arch_atomic_fetch_xor
+
+#define arch_atomic_xchg(v, new) (arch_xchg(&((v)->counter), new))
-static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
+static inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new)
{
return __atomic_cmpxchg(&v->counter, old, new);
}
+#define arch_atomic_cmpxchg arch_atomic_cmpxchg
#define ATOMIC64_INIT(i) { (i) }
-static inline s64 atomic64_read(const atomic64_t *v)
+static inline s64 arch_atomic64_read(const atomic64_t *v)
{
- s64 c;
-
- asm volatile(
- " lg %0,%1\n"
- : "=d" (c) : "Q" (v->counter));
- return c;
+ return __atomic64_read(v);
}
+#define arch_atomic64_read arch_atomic64_read
-static inline void atomic64_set(atomic64_t *v, s64 i)
+static inline void arch_atomic64_set(atomic64_t *v, s64 i)
{
- asm volatile(
- " stg %1,%0\n"
- : "=Q" (v->counter) : "d" (i));
+ __atomic64_set(v, i);
}
+#define arch_atomic64_set arch_atomic64_set
-static inline s64 atomic64_add_return(s64 i, atomic64_t *v)
+static inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v)
{
return __atomic64_add_barrier(i, (long *)&v->counter) + i;
}
+#define arch_atomic64_add_return arch_atomic64_add_return
-static inline s64 atomic64_fetch_add(s64 i, atomic64_t *v)
+static inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
{
return __atomic64_add_barrier(i, (long *)&v->counter);
}
+#define arch_atomic64_fetch_add arch_atomic64_fetch_add
-static inline void atomic64_add(s64 i, atomic64_t *v)
+static inline void arch_atomic64_add(s64 i, atomic64_t *v)
{
-#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
- if (__builtin_constant_p(i) && (i > -129) && (i < 128)) {
- __atomic64_add_const(i, (long *)&v->counter);
- return;
- }
-#endif
__atomic64_add(i, (long *)&v->counter);
}
+#define arch_atomic64_add arch_atomic64_add
-#define atomic64_xchg(v, new) (xchg(&((v)->counter), new))
+#define arch_atomic64_xchg(v, new) (arch_xchg(&((v)->counter), new))
-static inline s64 atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
+static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
{
return __atomic64_cmpxchg((long *)&v->counter, old, new);
}
+#define arch_atomic64_cmpxchg arch_atomic64_cmpxchg
#define ATOMIC64_OPS(op) \
-static inline void atomic64_##op(s64 i, atomic64_t *v) \
+static inline void arch_atomic64_##op(s64 i, atomic64_t *v) \
{ \
__atomic64_##op(i, (long *)&v->counter); \
} \
-static inline long atomic64_fetch_##op(s64 i, atomic64_t *v) \
+static inline long arch_atomic64_fetch_##op(s64 i, atomic64_t *v) \
{ \
return __atomic64_##op##_barrier(i, (long *)&v->counter); \
}
@@ -145,8 +136,15 @@ ATOMIC64_OPS(xor)
#undef ATOMIC64_OPS
-#define atomic64_sub_return(_i, _v) atomic64_add_return(-(s64)(_i), _v)
-#define atomic64_fetch_sub(_i, _v) atomic64_fetch_add(-(s64)(_i), _v)
-#define atomic64_sub(_i, _v) atomic64_add(-(s64)(_i), _v)
+#define arch_atomic64_and arch_atomic64_and
+#define arch_atomic64_or arch_atomic64_or
+#define arch_atomic64_xor arch_atomic64_xor
+#define arch_atomic64_fetch_and arch_atomic64_fetch_and
+#define arch_atomic64_fetch_or arch_atomic64_fetch_or
+#define arch_atomic64_fetch_xor arch_atomic64_fetch_xor
+
+#define arch_atomic64_sub_return(_i, _v) arch_atomic64_add_return(-(s64)(_i), _v)
+#define arch_atomic64_fetch_sub(_i, _v) arch_atomic64_fetch_add(-(s64)(_i), _v)
+#define arch_atomic64_sub(_i, _v) arch_atomic64_add(-(s64)(_i), _v)
#endif /* __ARCH_S390_ATOMIC__ */
diff --git a/arch/s390/include/asm/atomic_ops.h b/arch/s390/include/asm/atomic_ops.h
index 61467b9eecc7..50510e08b893 100644
--- a/arch/s390/include/asm/atomic_ops.h
+++ b/arch/s390/include/asm/atomic_ops.h
@@ -8,6 +8,40 @@
#ifndef __ARCH_S390_ATOMIC_OPS__
#define __ARCH_S390_ATOMIC_OPS__
+static inline int __atomic_read(const atomic_t *v)
+{
+ int c;
+
+ asm volatile(
+ " l %0,%1\n"
+ : "=d" (c) : "R" (v->counter));
+ return c;
+}
+
+static inline void __atomic_set(atomic_t *v, int i)
+{
+ asm volatile(
+ " st %1,%0\n"
+ : "=R" (v->counter) : "d" (i));
+}
+
+static inline s64 __atomic64_read(const atomic64_t *v)
+{
+ s64 c;
+
+ asm volatile(
+ " lg %0,%1\n"
+ : "=d" (c) : "RT" (v->counter));
+ return c;
+}
+
+static inline void __atomic64_set(atomic64_t *v, s64 i)
+{
+ asm volatile(
+ " stg %1,%0\n"
+ : "=RT" (v->counter) : "d" (i));
+}
+
#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
#define __ATOMIC_OP(op_name, op_type, op_string, op_barrier) \
@@ -18,7 +52,7 @@ static inline op_type op_name(op_type val, op_type *ptr) \
asm volatile( \
op_string " %[old],%[val],%[ptr]\n" \
op_barrier \
- : [old] "=d" (old), [ptr] "+Q" (*ptr) \
+ : [old] "=d" (old), [ptr] "+QS" (*ptr) \
: [val] "d" (val) : "cc", "memory"); \
return old; \
} \
@@ -46,7 +80,7 @@ static __always_inline void op_name(op_type val, op_type *ptr) \
asm volatile( \
op_string " %[ptr],%[val]\n" \
op_barrier \
- : [ptr] "+Q" (*ptr) : [val] "i" (val) : "cc", "memory");\
+ : [ptr] "+QS" (*ptr) : [val] "i" (val) : "cc", "memory");\
}
#define __ATOMIC_CONST_OPS(op_name, op_type, op_string) \
@@ -97,7 +131,7 @@ static inline long op_name(long val, long *ptr) \
op_string " %[new],%[val]\n" \
" csg %[old],%[new],%[ptr]\n" \
" jl 0b" \
- : [old] "=d" (old), [new] "=&d" (new), [ptr] "+Q" (*ptr)\
+ : [old] "=d" (old), [new] "=&d" (new), [ptr] "+QS" (*ptr)\
: [val] "d" (val), "0" (*ptr) : "cc", "memory"); \
return old; \
}
@@ -122,22 +156,46 @@ __ATOMIC64_OPS(__atomic64_xor, "xgr")
static inline int __atomic_cmpxchg(int *ptr, int old, int new)
{
- return __sync_val_compare_and_swap(ptr, old, new);
+ asm volatile(
+ " cs %[old],%[new],%[ptr]"
+ : [old] "+d" (old), [ptr] "+Q" (*ptr)
+ : [new] "d" (new)
+ : "cc", "memory");
+ return old;
}
-static inline int __atomic_cmpxchg_bool(int *ptr, int old, int new)
+static inline bool __atomic_cmpxchg_bool(int *ptr, int old, int new)
{
- return __sync_bool_compare_and_swap(ptr, old, new);
+ int old_expected = old;
+
+ asm volatile(
+ " cs %[old],%[new],%[ptr]"
+ : [old] "+d" (old), [ptr] "+Q" (*ptr)
+ : [new] "d" (new)
+ : "cc", "memory");
+ return old == old_expected;
}
static inline long __atomic64_cmpxchg(long *ptr, long old, long new)
{
- return __sync_val_compare_and_swap(ptr, old, new);
+ asm volatile(
+ " csg %[old],%[new],%[ptr]"
+ : [old] "+d" (old), [ptr] "+QS" (*ptr)
+ : [new] "d" (new)
+ : "cc", "memory");
+ return old;
}
-static inline long __atomic64_cmpxchg_bool(long *ptr, long old, long new)
+static inline bool __atomic64_cmpxchg_bool(long *ptr, long old, long new)
{
- return __sync_bool_compare_and_swap(ptr, old, new);
+ long old_expected = old;
+
+ asm volatile(
+ " csg %[old],%[new],%[ptr]"
+ : [old] "+d" (old), [ptr] "+QS" (*ptr)
+ : [new] "d" (new)
+ : "cc", "memory");
+ return old == old_expected;
}
#endif /* __ARCH_S390_ATOMIC_OPS__ */
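
The open-coded cs/csg versions keep the same calling convention as the removed __sync builtins. A minimal retry-loop sketch on top of the boolean variant, assuming READ_ONCE() from the common headers:

static inline void example_add_clamped(int *counter, int limit)
{
	int old, new;

	do {
		old = READ_ONCE(*counter);
		new = old < limit ? old + 1 : old;
	} while (!__atomic_cmpxchg_bool(counter, old, new));
}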
diff --git a/arch/s390/include/asm/barrier.h b/arch/s390/include/asm/barrier.h
index f9eddbca79d2..82de2a7c4160 100644
--- a/arch/s390/include/asm/barrier.h
+++ b/arch/s390/include/asm/barrier.h
@@ -16,20 +16,24 @@
#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
/* Fast-BCR without checkpoint synchronization */
-#define __ASM_BARRIER "bcr 14,0\n"
+#define __ASM_BCR_SERIALIZE "bcr 14,0\n"
#else
-#define __ASM_BARRIER "bcr 15,0\n"
+#define __ASM_BCR_SERIALIZE "bcr 15,0\n"
#endif
-#define mb() do { asm volatile(__ASM_BARRIER : : : "memory"); } while (0)
+static __always_inline void bcr_serialize(void)
+{
+ asm volatile(__ASM_BCR_SERIALIZE : : : "memory");
+}
-#define rmb() barrier()
-#define wmb() barrier()
-#define dma_rmb() mb()
-#define dma_wmb() mb()
-#define __smp_mb() mb()
-#define __smp_rmb() rmb()
-#define __smp_wmb() wmb()
+#define __mb() bcr_serialize()
+#define __rmb() barrier()
+#define __wmb() barrier()
+#define __dma_rmb() __mb()
+#define __dma_wmb() __mb()
+#define __smp_mb() __mb()
+#define __smp_rmb() __rmb()
+#define __smp_wmb() __wmb()
#define __smp_store_release(p, v) \
do { \
diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h
index 431e208a5ea4..c467dffa8c12 100644
--- a/arch/s390/include/asm/bitops.h
+++ b/arch/s390/include/asm/bitops.h
@@ -42,7 +42,7 @@
#define __BITOPS_WORDS(bits) (((bits) + BITS_PER_LONG - 1) / BITS_PER_LONG)
static inline unsigned long *
-__bitops_word(unsigned long nr, volatile unsigned long *ptr)
+__bitops_word(unsigned long nr, const volatile unsigned long *ptr)
{
unsigned long addr;
@@ -50,73 +50,33 @@ __bitops_word(unsigned long nr, volatile unsigned long *ptr)
return (unsigned long *)addr;
}
-static inline unsigned char *
-__bitops_byte(unsigned long nr, volatile unsigned long *ptr)
+static inline unsigned long __bitops_mask(unsigned long nr)
{
- return ((unsigned char *)ptr) + ((nr ^ (BITS_PER_LONG - 8)) >> 3);
+ return 1UL << (nr & (BITS_PER_LONG - 1));
}
static __always_inline void arch_set_bit(unsigned long nr, volatile unsigned long *ptr)
{
unsigned long *addr = __bitops_word(nr, ptr);
- unsigned long mask;
+ unsigned long mask = __bitops_mask(nr);
-#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES
- if (__builtin_constant_p(nr)) {
- unsigned char *caddr = __bitops_byte(nr, ptr);
-
- asm volatile(
- "oi %0,%b1\n"
- : "+Q" (*caddr)
- : "i" (1 << (nr & 7))
- : "cc", "memory");
- return;
- }
-#endif
- mask = 1UL << (nr & (BITS_PER_LONG - 1));
__atomic64_or(mask, (long *)addr);
}
static __always_inline void arch_clear_bit(unsigned long nr, volatile unsigned long *ptr)
{
unsigned long *addr = __bitops_word(nr, ptr);
- unsigned long mask;
+ unsigned long mask = __bitops_mask(nr);
-#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES
- if (__builtin_constant_p(nr)) {
- unsigned char *caddr = __bitops_byte(nr, ptr);
-
- asm volatile(
- "ni %0,%b1\n"
- : "+Q" (*caddr)
- : "i" (~(1 << (nr & 7)))
- : "cc", "memory");
- return;
- }
-#endif
- mask = ~(1UL << (nr & (BITS_PER_LONG - 1)));
- __atomic64_and(mask, (long *)addr);
+ __atomic64_and(~mask, (long *)addr);
}
static __always_inline void arch_change_bit(unsigned long nr,
volatile unsigned long *ptr)
{
unsigned long *addr = __bitops_word(nr, ptr);
- unsigned long mask;
-
-#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES
- if (__builtin_constant_p(nr)) {
- unsigned char *caddr = __bitops_byte(nr, ptr);
+ unsigned long mask = __bitops_mask(nr);
- asm volatile(
- "xi %0,%b1\n"
- : "+Q" (*caddr)
- : "i" (1 << (nr & 7))
- : "cc", "memory");
- return;
- }
-#endif
- mask = 1UL << (nr & (BITS_PER_LONG - 1));
__atomic64_xor(mask, (long *)addr);
}
@@ -124,106 +84,106 @@ static inline bool arch_test_and_set_bit(unsigned long nr,
volatile unsigned long *ptr)
{
unsigned long *addr = __bitops_word(nr, ptr);
- unsigned long old, mask;
+ unsigned long mask = __bitops_mask(nr);
+ unsigned long old;
- mask = 1UL << (nr & (BITS_PER_LONG - 1));
old = __atomic64_or_barrier(mask, (long *)addr);
- return (old & mask) != 0;
+ return old & mask;
}
static inline bool arch_test_and_clear_bit(unsigned long nr,
volatile unsigned long *ptr)
{
unsigned long *addr = __bitops_word(nr, ptr);
- unsigned long old, mask;
+ unsigned long mask = __bitops_mask(nr);
+ unsigned long old;
- mask = ~(1UL << (nr & (BITS_PER_LONG - 1)));
- old = __atomic64_and_barrier(mask, (long *)addr);
- return (old & ~mask) != 0;
+ old = __atomic64_and_barrier(~mask, (long *)addr);
+ return old & mask;
}
static inline bool arch_test_and_change_bit(unsigned long nr,
volatile unsigned long *ptr)
{
unsigned long *addr = __bitops_word(nr, ptr);
- unsigned long old, mask;
+ unsigned long mask = __bitops_mask(nr);
+ unsigned long old;
- mask = 1UL << (nr & (BITS_PER_LONG - 1));
old = __atomic64_xor_barrier(mask, (long *)addr);
- return (old & mask) != 0;
+ return old & mask;
}
-static inline void arch___set_bit(unsigned long nr, volatile unsigned long *ptr)
+static __always_inline void
+arch___set_bit(unsigned long nr, volatile unsigned long *addr)
{
- unsigned char *addr = __bitops_byte(nr, ptr);
+ unsigned long *p = __bitops_word(nr, addr);
+ unsigned long mask = __bitops_mask(nr);
- *addr |= 1 << (nr & 7);
+ *p |= mask;
}
-static inline void arch___clear_bit(unsigned long nr,
- volatile unsigned long *ptr)
+static __always_inline void
+arch___clear_bit(unsigned long nr, volatile unsigned long *addr)
{
- unsigned char *addr = __bitops_byte(nr, ptr);
+ unsigned long *p = __bitops_word(nr, addr);
+ unsigned long mask = __bitops_mask(nr);
- *addr &= ~(1 << (nr & 7));
+ *p &= ~mask;
}
-static inline void arch___change_bit(unsigned long nr,
- volatile unsigned long *ptr)
+static __always_inline void
+arch___change_bit(unsigned long nr, volatile unsigned long *addr)
{
- unsigned char *addr = __bitops_byte(nr, ptr);
+ unsigned long *p = __bitops_word(nr, addr);
+ unsigned long mask = __bitops_mask(nr);
- *addr ^= 1 << (nr & 7);
+ *p ^= mask;
}
-static inline bool arch___test_and_set_bit(unsigned long nr,
- volatile unsigned long *ptr)
+static __always_inline bool
+arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
{
- unsigned char *addr = __bitops_byte(nr, ptr);
- unsigned char ch;
+ unsigned long *p = __bitops_word(nr, addr);
+ unsigned long mask = __bitops_mask(nr);
+ unsigned long old;
- ch = *addr;
- *addr |= 1 << (nr & 7);
- return (ch >> (nr & 7)) & 1;
+ old = *p;
+ *p |= mask;
+ return old & mask;
}
-static inline bool arch___test_and_clear_bit(unsigned long nr,
- volatile unsigned long *ptr)
+static __always_inline bool
+arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
{
- unsigned char *addr = __bitops_byte(nr, ptr);
- unsigned char ch;
+ unsigned long *p = __bitops_word(nr, addr);
+ unsigned long mask = __bitops_mask(nr);
+ unsigned long old;
- ch = *addr;
- *addr &= ~(1 << (nr & 7));
- return (ch >> (nr & 7)) & 1;
+ old = *p;
+ *p &= ~mask;
+ return old & mask;
}
-static inline bool arch___test_and_change_bit(unsigned long nr,
- volatile unsigned long *ptr)
+static __always_inline bool
+arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
{
- unsigned char *addr = __bitops_byte(nr, ptr);
- unsigned char ch;
+ unsigned long *p = __bitops_word(nr, addr);
+ unsigned long mask = __bitops_mask(nr);
+ unsigned long old;
- ch = *addr;
- *addr ^= 1 << (nr & 7);
- return (ch >> (nr & 7)) & 1;
+ old = *p;
+ *p ^= mask;
+ return old & mask;
}
-static inline bool arch_test_bit(unsigned long nr,
- const volatile unsigned long *ptr)
-{
- const volatile unsigned char *addr;
-
- addr = ((const volatile unsigned char *)ptr);
- addr += (nr ^ (BITS_PER_LONG - 8)) >> 3;
- return (*addr >> (nr & 7)) & 1;
-}
+#define arch_test_bit generic_test_bit
+#define arch_test_bit_acquire generic_test_bit_acquire
static inline bool arch_test_and_set_bit_lock(unsigned long nr,
volatile unsigned long *ptr)
{
if (arch_test_bit(nr, ptr))
- return 1;
+ return true;
return arch_test_and_set_bit(nr, ptr);
}
@@ -241,6 +201,16 @@ static inline void arch___clear_bit_unlock(unsigned long nr,
arch___clear_bit(nr, ptr);
}
+static inline bool arch_xor_unlock_is_negative_byte(unsigned long mask,
+ volatile unsigned long *ptr)
+{
+ unsigned long old;
+
+ old = __atomic64_xor_barrier(mask, (long *)ptr);
+ return old & BIT(7);
+}
+#define arch_xor_unlock_is_negative_byte arch_xor_unlock_is_negative_byte
+
#include <asm-generic/bitops/instrumented-atomic.h>
#include <asm-generic/bitops/instrumented-non-atomic.h>
#include <asm-generic/bitops/instrumented-lock.h>
@@ -291,8 +261,6 @@ static inline bool test_bit_inv(unsigned long nr,
return test_bit(nr ^ (BITS_PER_LONG - 1), ptr);
}
-#ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES
-
/**
* __flogr - find leftmost one
* @word - The word to search
@@ -334,13 +302,13 @@ static inline unsigned char __flogr(unsigned long word)
}
return bit;
} else {
- register unsigned long bit asm("4") = word;
- register unsigned long out asm("5");
+ union register_pair rp;
+ rp.even = word;
asm volatile(
- " flogr %[bit],%[bit]\n"
- : [bit] "+d" (bit), [out] "=d" (out) : : "cc");
- return bit;
+ " flogr %[rp],%[rp]\n"
+ : [rp] "+d" (rp.pair) : : "cc");
+ return rp.even;
}
}
@@ -411,18 +379,7 @@ static inline int fls(unsigned int word)
return fls64(word);
}
-#else /* CONFIG_HAVE_MARCH_Z9_109_FEATURES */
-
-#include <asm-generic/bitops/__ffs.h>
-#include <asm-generic/bitops/ffs.h>
-#include <asm-generic/bitops/__fls.h>
-#include <asm-generic/bitops/fls.h>
-#include <asm-generic/bitops/fls64.h>
-
-#endif /* CONFIG_HAVE_MARCH_Z9_109_FEATURES */
-
#include <asm-generic/bitops/ffz.h>
-#include <asm-generic/bitops/find.h>
#include <asm-generic/bitops/hweight.h>
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/le.h>
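The non-atomic bitops above are converted from byte-wise addressing to the same word-plus-mask scheme the atomic variants already use. A minimal user-space C sketch of that scheme (not the kernel's exact helpers; all names here are illustrative):

#include <stdbool.h>

#define SKETCH_BITS_PER_LONG	64

static inline unsigned long *sketch_bitops_word(unsigned long nr, unsigned long *ptr)
{
	return ptr + nr / SKETCH_BITS_PER_LONG;			/* word containing bit nr */
}

static inline unsigned long sketch_bitops_mask(unsigned long nr)
{
	return 1UL << (nr & (SKETCH_BITS_PER_LONG - 1));	/* bit inside that word */
}

static inline bool sketch_test_and_set_bit(unsigned long nr, unsigned long *ptr)
{
	unsigned long *p = sketch_bitops_word(nr, ptr);
	unsigned long mask = sketch_bitops_mask(nr);
	unsigned long old = *p;

	*p |= mask;
	return old & mask;	/* nonzero iff the bit was already set */
}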
diff --git a/arch/s390/include/asm/bug.h b/arch/s390/include/asm/bug.h
index a2b11ac00f60..aebe1e22c7be 100644
--- a/arch/s390/include/asm/bug.h
+++ b/arch/s390/include/asm/bug.h
@@ -2,7 +2,7 @@
#ifndef _ASM_S390_BUG_H
#define _ASM_S390_BUG_H
-#include <linux/kernel.h>
+#include <linux/compiler.h>
#ifdef CONFIG_BUG
@@ -10,15 +10,15 @@
#define __EMIT_BUG(x) do { \
asm_inline volatile( \
- "0: j 0b+2\n" \
- "1:\n" \
+ "0: mc 0,0\n" \
".section .rodata.str,\"aMS\",@progbits,1\n" \
- "2: .asciz \""__FILE__"\"\n" \
+ "1: .asciz \""__FILE__"\"\n" \
".previous\n" \
".section __bug_table,\"awM\",@progbits,%2\n" \
- "3: .long 1b-3b,2b-3b\n" \
+ "2: .long 0b-.\n" \
+ " .long 1b-.\n" \
" .short %0,%1\n" \
- " .org 3b+%2\n" \
+ " .org 2b+%2\n" \
".previous\n" \
: : "i" (__LINE__), \
"i" (x), \
@@ -29,12 +29,11 @@
#define __EMIT_BUG(x) do { \
asm_inline volatile( \
- "0: j 0b+2\n" \
- "1:\n" \
+ "0: mc 0,0\n" \
".section __bug_table,\"awM\",@progbits,%1\n" \
- "2: .long 1b-2b\n" \
+ "1: .long 0b-.\n" \
" .short %0\n" \
- " .org 2b+%1\n" \
+ " .org 1b+%1\n" \
".previous\n" \
: : "i" (x), \
"i" (sizeof(struct bug_entry))); \
diff --git a/arch/s390/include/asm/bugs.h b/arch/s390/include/asm/bugs.h
deleted file mode 100644
index aa42a179be33..000000000000
--- a/arch/s390/include/asm/bugs.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * S390 version
- * Copyright IBM Corp. 1999
- * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com)
- *
- * Derived from "include/asm-i386/bugs.h"
- * Copyright (C) 1994 Linus Torvalds
- */
-
-/*
- * This is included by init/main.c to check for architecture-dependent bugs.
- *
- * Needs:
- * void check_bugs(void);
- */
-
-static inline void check_bugs(void)
-{
- /* s390 has no bugs ... */
-}
diff --git a/arch/s390/include/asm/cache.h b/arch/s390/include/asm/cache.h
index d5e22e837416..00128174c025 100644
--- a/arch/s390/include/asm/cache.h
+++ b/arch/s390/include/asm/cache.h
@@ -14,6 +14,6 @@
#define L1_CACHE_SHIFT 8
#define NET_SKB_PAD 32
-#define __read_mostly __section(.data..read_mostly)
+#define __read_mostly __section(".data..read_mostly")
#endif
diff --git a/arch/s390/include/asm/ccwdev.h b/arch/s390/include/asm/ccwdev.h
index 865ce1cb86d5..91d261751d25 100644
--- a/arch/s390/include/asm/ccwdev.h
+++ b/arch/s390/include/asm/ccwdev.h
@@ -11,9 +11,11 @@
#include <linux/device.h>
#include <linux/mod_devicetable.h>
+#include <asm/chsc.h>
#include <asm/fcx.h>
#include <asm/irq.h>
#include <asm/schid.h>
+#include <linux/mutex.h>
/* structs from asm/cio.h */
struct irb;
@@ -86,6 +88,7 @@ struct ccw_device {
spinlock_t *ccwlock;
/* private: */
struct ccw_device_private *private; /* cio private information */
+ struct mutex reg_mutex;
/* public: */
struct ccw_device_id id;
struct ccw_driver *drv;
@@ -103,6 +106,8 @@ struct ccw_device {
was successfully verified. */
#define PE_PATHGROUP_ESTABLISHED 0x4 /* A pathgroup was reset and had
to be established again. */
+#define PE_PATH_FCES_EVENT 0x8 /* The FCES Status of a path has
+ * changed. */
/*
* Possible CIO actions triggered by the unit check handler.
@@ -114,7 +119,7 @@ enum uc_todo {
};
/**
- * struct ccw driver - device driver for channel attached devices
+ * struct ccw_driver - device driver for channel attached devices
* @ids: ids supported by this driver
* @probe: function called on probe
* @remove: function called on remove
@@ -123,11 +128,6 @@ enum uc_todo {
* @notify: notify driver of device state changes
* @path_event: notify driver of channel path events
* @shutdown: called at device shutdown
- * @prepare: prepare for pm state transition
- * @complete: undo work done in @prepare
- * @freeze: callback for freezing during hibernation snapshotting
- * @thaw: undo work done in @freeze
- * @restore: callback for restoring after hibernation
* @uc_handler: callback for unit check handler
* @driver: embedded device driver structure
* @int_class: interruption class to use for accounting interrupts
@@ -141,11 +141,6 @@ struct ccw_driver {
int (*notify) (struct ccw_device *, int);
void (*path_event) (struct ccw_device *, int *);
void (*shutdown) (struct ccw_device *);
- int (*prepare) (struct ccw_device *);
- void (*complete) (struct ccw_device *);
- int (*freeze)(struct ccw_device *);
- int (*thaw) (struct ccw_device *);
- int (*restore)(struct ccw_device *);
enum uc_todo (*uc_handler) (struct ccw_device *, struct irb *);
struct device_driver driver;
enum interruption_class int_class;
@@ -159,9 +154,6 @@ extern struct ccw_device *get_ccwdev_by_busid(struct ccw_driver *cdrv,
* when new devices for its type pop up */
extern int ccw_driver_register (struct ccw_driver *driver);
extern void ccw_driver_unregister (struct ccw_driver *driver);
-
-struct ccw1;
-
extern int ccw_device_set_options_mask(struct ccw_device *, unsigned long);
extern int ccw_device_set_options(struct ccw_device *, unsigned long);
extern void ccw_device_clear_options(struct ccw_device *, unsigned long);
@@ -224,7 +216,6 @@ extern struct ccw_device *ccw_device_create_console(struct ccw_driver *);
extern void ccw_device_destroy_console(struct ccw_device *);
extern int ccw_device_enable_console(struct ccw_device *);
extern void ccw_device_wait_idle(struct ccw_device *);
-extern int ccw_device_force_console(struct ccw_device *);
extern void *ccw_device_dma_zalloc(struct ccw_device *cdev, size_t size);
extern void ccw_device_dma_free(struct ccw_device *cdev,
@@ -236,4 +227,11 @@ extern void ccw_device_get_schid(struct ccw_device *, struct subchannel_id *);
struct channel_path_desc_fmt0 *ccw_device_get_chp_desc(struct ccw_device *, int);
u8 *ccw_device_get_util_str(struct ccw_device *cdev, int chp_idx);
+int ccw_device_pnso(struct ccw_device *cdev,
+ struct chsc_pnso_area *pnso_area, u8 oc,
+ struct chsc_pnso_resume_token resume_token, int cnc);
+int ccw_device_get_cssid(struct ccw_device *cdev, u8 *cssid);
+int ccw_device_get_iid(struct ccw_device *cdev, u8 *iid);
+int ccw_device_get_chpid(struct ccw_device *cdev, int chp_idx, u8 *chpid);
+int ccw_device_get_chid(struct ccw_device *cdev, int chp_idx, u16 *chid);
#endif /* _S390_CCWDEV_H_ */
diff --git a/arch/s390/include/asm/ccwgroup.h b/arch/s390/include/asm/ccwgroup.h
index 7293c139dd79..11d2fb3de4f5 100644
--- a/arch/s390/include/asm/ccwgroup.h
+++ b/arch/s390/include/asm/ccwgroup.h
@@ -11,8 +11,7 @@ struct ccw_driver;
* @count: number of attached slave devices
* @dev: embedded device structure
* @cdev: variable number of slave devices, allocated as needed
- * @ungroup_work: work to be done when a ccwgroup notifier has action
- * type %BUS_NOTIFY_UNBIND_DRIVER
+ * @ungroup_work: used to ungroup the ccwgroup device
*/
struct ccwgroup_device {
enum {
@@ -26,7 +25,7 @@ struct ccwgroup_device {
unsigned int count;
struct device dev;
struct work_struct ungroup_work;
- struct ccw_device *cdev[0];
+ struct ccw_device *cdev[];
};
/**
@@ -36,11 +35,6 @@ struct ccwgroup_device {
* @set_online: function called when device is set online
* @set_offline: function called when device is set offline
* @shutdown: function called when device is shut down
- * @prepare: prepare for pm state transition
- * @complete: undo work done in @prepare
- * @freeze: callback for freezing during hibernation snapshotting
- * @thaw: undo work done in @freeze
- * @restore: callback for restoring after hibernation
* @driver: embedded driver structure
* @ccw_driver: supported ccw_driver (optional)
*/
@@ -50,11 +44,6 @@ struct ccwgroup_driver {
int (*set_online) (struct ccwgroup_device *);
int (*set_offline) (struct ccwgroup_device *);
void (*shutdown)(struct ccwgroup_device *);
- int (*prepare) (struct ccwgroup_device *);
- void (*complete) (struct ccwgroup_device *);
- int (*freeze)(struct ccwgroup_device *);
- int (*thaw) (struct ccwgroup_device *);
- int (*restore)(struct ccwgroup_device *);
struct device_driver driver;
struct ccw_driver *ccw_driver;
@@ -64,11 +53,9 @@ extern int ccwgroup_driver_register (struct ccwgroup_driver *cdriver);
extern void ccwgroup_driver_unregister (struct ccwgroup_driver *cdriver);
int ccwgroup_create_dev(struct device *root, struct ccwgroup_driver *gdrv,
int num_devices, const char *buf);
-struct ccwgroup_device *get_ccwgroupdev_by_busid(struct ccwgroup_driver *gdrv,
- char *bus_id);
extern int ccwgroup_set_online(struct ccwgroup_device *gdev);
-extern int ccwgroup_set_offline(struct ccwgroup_device *gdev);
+int ccwgroup_set_offline(struct ccwgroup_device *gdev, bool call_gdrv);
extern int ccwgroup_probe_ccwdev(struct ccw_device *cdev);
extern void ccwgroup_remove_ccwdev(struct ccw_device *cdev);
diff --git a/arch/s390/include/asm/checksum.h b/arch/s390/include/asm/checksum.h
index 91e376b0d28c..69837eec2ff5 100644
--- a/arch/s390/include/asm/checksum.h
+++ b/arch/s390/include/asm/checksum.h
@@ -12,128 +12,122 @@
#ifndef _S390_CHECKSUM_H
#define _S390_CHECKSUM_H
-#include <linux/uaccess.h>
+#include <linux/kasan-checks.h>
+#include <linux/in6.h>
/*
- * computes the checksum of a memory block at buff, length len,
- * and adds in "sum" (32-bit)
+ * Computes the checksum of a memory block at buff, length len,
+ * and adds in "sum" (32-bit).
*
- * returns a 32-bit number suitable for feeding into itself
- * or csum_tcpudp_magic
+ * Returns a 32-bit number suitable for feeding into itself
+ * or csum_tcpudp_magic.
*
- * this function must be called with even lengths, except
- * for the last fragment, which may be odd
+ * This function must be called with even lengths, except
+ * for the last fragment, which may be odd.
*
- * it's best to have buff aligned on a 32-bit boundary
+ * It's best to have buff aligned on a 32-bit boundary.
*/
-static inline __wsum
-csum_partial(const void *buff, int len, __wsum sum)
+static inline __wsum csum_partial(const void *buff, int len, __wsum sum)
{
- register unsigned long reg2 asm("2") = (unsigned long) buff;
- register unsigned long reg3 asm("3") = (unsigned long) len;
+ union register_pair rp = {
+ .even = (unsigned long) buff,
+ .odd = (unsigned long) len,
+ };
+ kasan_check_read(buff, len);
asm volatile(
- "0: cksm %0,%1\n" /* do checksum on longs */
+ "0: cksm %[sum],%[rp]\n"
" jo 0b\n"
- : "+d" (sum), "+d" (reg2), "+d" (reg3) : : "cc", "memory");
+ : [sum] "+&d" (sum), [rp] "+&d" (rp.pair) : : "cc", "memory");
return sum;
}
/*
- * the same as csum_partial_copy, but copies from user space.
- *
- * here even more important to align src and dst on a 32-bit (or even
- * better 64-bit) boundary
- *
- * Copy from userspace and compute checksum.
- */
-static inline __wsum
-csum_partial_copy_from_user(const void __user *src, void *dst,
- int len, __wsum sum,
- int *err_ptr)
-{
- if (unlikely(copy_from_user(dst, src, len)))
- *err_ptr = -EFAULT;
- return csum_partial(dst, len, sum);
-}
-
-
-static inline __wsum
-csum_partial_copy_nocheck (const void *src, void *dst, int len, __wsum sum)
-{
- memcpy(dst,src,len);
- return csum_partial(dst, len, sum);
-}
-
-/*
- * Fold a partial checksum without adding pseudo headers
+ * Fold a partial checksum without adding pseudo headers.
*/
static inline __sum16 csum_fold(__wsum sum)
{
u32 csum = (__force u32) sum;
- csum += (csum >> 16) + (csum << 16);
+ csum += (csum >> 16) | (csum << 16);
csum >>= 16;
return (__force __sum16) ~csum;
}
/*
- * This is a version of ip_compute_csum() optimized for IP headers,
- * which always checksum on 4 octet boundaries.
- *
+ * This is a version of ip_compute_csum() optimized for IP headers,
+ * which always checksums on 4 octet boundaries.
*/
static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
{
- return csum_fold(csum_partial(iph, ihl*4, 0));
+ __u64 csum = 0;
+ __u32 *ptr = (u32 *)iph;
+
+ csum += *ptr++;
+ csum += *ptr++;
+ csum += *ptr++;
+ csum += *ptr++;
+ ihl -= 4;
+ while (ihl--)
+ csum += *ptr++;
+ csum += (csum >> 32) | (csum << 32);
+ return csum_fold((__force __wsum)(csum >> 32));
}
/*
- * computes the checksum of the TCP/UDP pseudo-header
- * returns a 32-bit checksum
+ * Computes the checksum of the TCP/UDP pseudo-header.
+ * Returns a 32-bit checksum.
*/
-static inline __wsum
-csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto,
- __wsum sum)
+static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
+ __u8 proto, __wsum sum)
{
- __u32 csum = (__force __u32)sum;
+ __u64 csum = (__force __u64)sum;
csum += (__force __u32)saddr;
- if (csum < (__force __u32)saddr)
- csum++;
-
csum += (__force __u32)daddr;
- if (csum < (__force __u32)daddr)
- csum++;
-
- csum += len + proto;
- if (csum < len + proto)
- csum++;
-
- return (__force __wsum)csum;
+ csum += len;
+ csum += proto;
+ csum += (csum >> 32) | (csum << 32);
+ return (__force __wsum)(csum >> 32);
}
/*
- * computes the checksum of the TCP/UDP pseudo-header
- * returns a 16-bit checksum, already complemented
+ * Computes the checksum of the TCP/UDP pseudo-header.
+ * Returns a 16-bit checksum, already complemented.
*/
-
-static inline __sum16
-csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, __u8 proto,
- __wsum sum)
+static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len,
+ __u8 proto, __wsum sum)
{
- return csum_fold(csum_tcpudp_nofold(saddr,daddr,len,proto,sum));
+ return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
}
/*
- * this routine is used for miscellaneous IP-like checksums, mainly
- * in icmp.c
+ * Used for miscellaneous IP-like checksums, mainly icmp.
*/
-
static inline __sum16 ip_compute_csum(const void *buff, int len)
{
return csum_fold(csum_partial(buff, len, 0));
}
-#endif /* _S390_CHECKSUM_H */
-
+#define _HAVE_ARCH_IPV6_CSUM
+static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+ const struct in6_addr *daddr,
+ __u32 len, __u8 proto, __wsum csum)
+{
+ __u64 sum = (__force __u64)csum;
+
+ sum += (__force __u32)saddr->s6_addr32[0];
+ sum += (__force __u32)saddr->s6_addr32[1];
+ sum += (__force __u32)saddr->s6_addr32[2];
+ sum += (__force __u32)saddr->s6_addr32[3];
+ sum += (__force __u32)daddr->s6_addr32[0];
+ sum += (__force __u32)daddr->s6_addr32[1];
+ sum += (__force __u32)daddr->s6_addr32[2];
+ sum += (__force __u32)daddr->s6_addr32[3];
+ sum += len;
+ sum += proto;
+ sum += (sum >> 32) | (sum << 32);
+ return csum_fold((__force __wsum)(sum >> 32));
+}
+#endif /* _S390_CHECKSUM_H */
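The rewritten ip_fast_csum(), csum_tcpudp_nofold() and csum_ipv6_magic() all rely on the same arithmetic: accumulate 32-bit words into a 64-bit sum, rotate-add once so every carry lands in the upper half, then fold 32 bits down to 16. A stand-alone user-space sketch of that folding (names are illustrative):

#include <stdint.h>

static uint16_t sketch_csum_fold(uint32_t csum)
{
	csum += (csum >> 16) | (csum << 16);	/* add the rotated halves, carry included */
	return (uint16_t)~(csum >> 16);
}

static uint16_t sketch_csum32(const uint32_t *words, int n)
{
	uint64_t sum = 0;

	while (n--)
		sum += *words++;		/* carries accumulate in bits 32-63 */
	sum += (sum >> 32) | (sum << 32);	/* fold them back into the upper half */
	return sketch_csum_fold((uint32_t)(sum >> 32));
}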
diff --git a/arch/s390/include/asm/chsc.h b/arch/s390/include/asm/chsc.h
new file mode 100644
index 000000000000..bb48ea380c0d
--- /dev/null
+++ b/arch/s390/include/asm/chsc.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright IBM Corp. 2020
+ *
+ * Author(s): Alexandra Winter <wintera@linux.ibm.com>
+ *
+ * Interface for Channel Subsystem Call
+ */
+#ifndef _ASM_S390_CHSC_H
+#define _ASM_S390_CHSC_H
+
+#include <uapi/asm/chsc.h>
+
+/**
+ * Operation codes for CHSC PNSO:
+ * PNSO_OC_NET_BRIDGE_INFO - only addresses that are visible to a bridgeport
+ * PNSO_OC_NET_ADDR_INFO - all addresses
+ */
+#define PNSO_OC_NET_BRIDGE_INFO 0
+#define PNSO_OC_NET_ADDR_INFO 3
+/**
+ * struct chsc_pnso_naid_l2 - network address information descriptor
+ * @nit: Network interface token
+ * @addr_lnid: network address and logical network id (VLAN ID)
+ */
+struct chsc_pnso_naid_l2 {
+ u64 nit;
+ struct { u8 mac[6]; u16 lnid; } addr_lnid;
+} __packed;
+
+struct chsc_pnso_resume_token {
+ u64 t1;
+ u64 t2;
+} __packed;
+
+struct chsc_pnso_naihdr {
+ struct chsc_pnso_resume_token resume_token;
+ u32:32;
+ u32 instance;
+ u32:24;
+ u8 naids;
+ u32 reserved[3];
+} __packed;
+
+struct chsc_pnso_area {
+ struct chsc_header request;
+ u8:2;
+ u8 m:1;
+ u8:5;
+ u8:2;
+ u8 ssid:2;
+ u8 fmt:4;
+ u16 sch;
+ u8:8;
+ u8 cssid;
+ u16:16;
+ u8 oc;
+ u32:24;
+ struct chsc_pnso_resume_token resume_token;
+ u32 n:1;
+ u32:31;
+ u32 reserved[3];
+ struct chsc_header response;
+ u32:32;
+ struct chsc_pnso_naihdr naihdr;
+ struct chsc_pnso_naid_l2 entries[];
+} __packed __aligned(PAGE_SIZE);
+
+#endif /* _ASM_S390_CHSC_H */
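A hedged usage sketch tying the new ccw_device_pnso() declaration (ccwdev.h above) to these chsc_pnso_* structures: issue a single bridgeport query and walk the returned layer-2 address entries. The continuation loop via the resume token and real error handling are omitted, and the function name is illustrative:

static int pnso_list_bridge_addrs(struct ccw_device *cdev)
{
	struct chsc_pnso_resume_token token = {};
	struct chsc_pnso_area *area;
	int rc, i;

	area = (struct chsc_pnso_area *)get_zeroed_page(GFP_KERNEL);
	if (!area)
		return -ENOMEM;
	rc = ccw_device_pnso(cdev, area, PNSO_OC_NET_BRIDGE_INFO, token, 0);
	if (!rc) {
		for (i = 0; i < area->naihdr.naids; i++)
			pr_info("mac %pM lnid %u\n",
				area->entries[i].addr_lnid.mac,
				area->entries[i].addr_lnid.lnid);
	}
	free_page((unsigned long)area);
	return rc;
}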
diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h
index b5bfb3123cb1..1c4f585dd39b 100644
--- a/arch/s390/include/asm/cio.h
+++ b/arch/s390/include/asm/cio.h
@@ -5,10 +5,10 @@
#ifndef _ASM_S390_CIO_H_
#define _ASM_S390_CIO_H_
-#include <linux/spinlock.h>
#include <linux/bitops.h>
#include <linux/genalloc.h>
#include <asm/types.h>
+#include <asm/tpi.h>
#define LPM_ANYPATH 0xff
#define __MAX_CSSID 0
@@ -329,7 +329,7 @@ struct ccw_dev_id {
};
/**
- * ccw_device_id_is_equal() - compare two ccw_dev_ids
+ * ccw_dev_id_is_equal() - compare two ccw_dev_ids
* @dev_id1: a ccw_dev_id
* @dev_id2: another ccw_dev_id
* Returns:
@@ -356,7 +356,6 @@ static inline u8 pathmask_to_pos(u8 mask)
return 8 - ffs(mask);
}
-void channel_subsystem_reinit(void);
extern void css_schedule_reprobe(void);
extern void *cio_dma_zalloc(size_t size);
@@ -370,8 +369,10 @@ void cio_gp_dma_destroy(struct gen_pool *gp_dma, struct device *dma_dev);
struct gen_pool *cio_gp_dma_create(struct device *dma_dev, int nr_pages);
/* Function from drivers/s390/cio/chsc.c */
-int chsc_sstpc(void *page, unsigned int op, u16 ctrl, u64 *clock_delta);
+int chsc_sstpc(void *page, unsigned int op, u16 ctrl, long *clock_delta);
int chsc_sstpi(void *page, void *result, size_t size);
+int chsc_stzi(void *page, void *result, size_t size);
int chsc_sgib(u32 origin);
+int chsc_scud(u16 cu, u64 *esm, u8 *esm_valid);
#endif
diff --git a/arch/s390/include/asm/clocksource.h b/arch/s390/include/asm/clocksource.h
new file mode 100644
index 000000000000..03434369fce4
--- /dev/null
+++ b/arch/s390/include/asm/clocksource.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* s390-specific clocksource additions */
+
+#ifndef _ASM_S390_CLOCKSOURCE_H
+#define _ASM_S390_CLOCKSOURCE_H
+
+#endif /* _ASM_S390_CLOCKSOURCE_H */
diff --git a/arch/s390/include/asm/clp.h b/arch/s390/include/asm/clp.h
index 3925b0f085b7..10919eeb7533 100644
--- a/arch/s390/include/asm/clp.h
+++ b/arch/s390/include/asm/clp.h
@@ -5,6 +5,9 @@
/* CLP common request & response block size */
#define CLP_BLK_SIZE PAGE_SIZE
+/* Call Logical Processor - Command Code */
+#define CLP_SLPC 0x0001
+
#define CLP_LPS_BASE 0
#define CLP_LPS_PCI 2
diff --git a/arch/s390/include/asm/cmpxchg.h b/arch/s390/include/asm/cmpxchg.h
index af99c1f66f12..aae0315374de 100644
--- a/arch/s390/include/asm/cmpxchg.h
+++ b/arch/s390/include/asm/cmpxchg.h
@@ -12,55 +12,196 @@
#include <linux/types.h>
#include <linux/bug.h>
-#define cmpxchg(ptr, o, n) \
-({ \
- __typeof__(*(ptr)) __o = (o); \
- __typeof__(*(ptr)) __n = (n); \
- (__typeof__(*(ptr))) __sync_val_compare_and_swap((ptr),__o,__n);\
-})
+void __xchg_called_with_bad_pointer(void);
-#define cmpxchg64 cmpxchg
-#define cmpxchg_local cmpxchg
-#define cmpxchg64_local cmpxchg
+static __always_inline unsigned long
+__arch_xchg(unsigned long x, unsigned long address, int size)
+{
+ unsigned long old;
+ int shift;
-#define xchg(ptr, x) \
-({ \
- __typeof__(ptr) __ptr = (ptr); \
- __typeof__(*(ptr)) __old; \
- do { \
- __old = *__ptr; \
- } while (!__sync_bool_compare_and_swap(__ptr, __old, x)); \
- __old; \
-})
+ switch (size) {
+ case 1:
+ shift = (3 ^ (address & 3)) << 3;
+ address ^= address & 3;
+ asm volatile(
+ " l %0,%1\n"
+ "0: lr 0,%0\n"
+ " nr 0,%3\n"
+ " or 0,%2\n"
+ " cs %0,0,%1\n"
+ " jl 0b\n"
+ : "=&d" (old), "+Q" (*(int *) address)
+ : "d" ((x & 0xff) << shift), "d" (~(0xff << shift))
+ : "memory", "cc", "0");
+ return old >> shift;
+ case 2:
+ shift = (2 ^ (address & 2)) << 3;
+ address ^= address & 2;
+ asm volatile(
+ " l %0,%1\n"
+ "0: lr 0,%0\n"
+ " nr 0,%3\n"
+ " or 0,%2\n"
+ " cs %0,0,%1\n"
+ " jl 0b\n"
+ : "=&d" (old), "+Q" (*(int *) address)
+ : "d" ((x & 0xffff) << shift), "d" (~(0xffff << shift))
+ : "memory", "cc", "0");
+ return old >> shift;
+ case 4:
+ asm volatile(
+ " l %0,%1\n"
+ "0: cs %0,%2,%1\n"
+ " jl 0b\n"
+ : "=&d" (old), "+Q" (*(int *) address)
+ : "d" (x)
+ : "memory", "cc");
+ return old;
+ case 8:
+ asm volatile(
+ " lg %0,%1\n"
+ "0: csg %0,%2,%1\n"
+ " jl 0b\n"
+ : "=&d" (old), "+QS" (*(long *) address)
+ : "d" (x)
+ : "memory", "cc");
+ return old;
+ }
+ __xchg_called_with_bad_pointer();
+ return x;
+}
-#define __cmpxchg_double(p1, p2, o1, o2, n1, n2) \
+#define arch_xchg(ptr, x) \
({ \
- register __typeof__(*(p1)) __old1 asm("2") = (o1); \
- register __typeof__(*(p2)) __old2 asm("3") = (o2); \
- register __typeof__(*(p1)) __new1 asm("4") = (n1); \
- register __typeof__(*(p2)) __new2 asm("5") = (n2); \
- int cc; \
- asm volatile( \
- " cdsg %[old],%[new],%[ptr]\n" \
- " ipm %[cc]\n" \
- " srl %[cc],28" \
- : [cc] "=d" (cc), [old] "+d" (__old1), "+d" (__old2) \
- : [new] "d" (__new1), "d" (__new2), \
- [ptr] "Q" (*(p1)), "Q" (*(p2)) \
- : "memory", "cc"); \
- !cc; \
+ __typeof__(*(ptr)) __ret; \
+ \
+ __ret = (__typeof__(*(ptr))) \
+ __arch_xchg((unsigned long)(x), (unsigned long)(ptr), \
+ sizeof(*(ptr))); \
+ __ret; \
})
-#define cmpxchg_double(p1, p2, o1, o2, n1, n2) \
+void __cmpxchg_called_with_bad_pointer(void);
+
+static __always_inline unsigned long __cmpxchg(unsigned long address,
+ unsigned long old,
+ unsigned long new, int size)
+{
+ switch (size) {
+ case 1: {
+ unsigned int prev, shift, mask;
+
+ shift = (3 ^ (address & 3)) << 3;
+ address ^= address & 3;
+ old = (old & 0xff) << shift;
+ new = (new & 0xff) << shift;
+ mask = ~(0xff << shift);
+ asm volatile(
+ " l %[prev],%[address]\n"
+ " nr %[prev],%[mask]\n"
+ " xilf %[mask],0xffffffff\n"
+ " or %[new],%[prev]\n"
+ " or %[prev],%[tmp]\n"
+ "0: lr %[tmp],%[prev]\n"
+ " cs %[prev],%[new],%[address]\n"
+ " jnl 1f\n"
+ " xr %[tmp],%[prev]\n"
+ " xr %[new],%[tmp]\n"
+ " nr %[tmp],%[mask]\n"
+ " jz 0b\n"
+ "1:"
+ : [prev] "=&d" (prev),
+ [address] "+Q" (*(int *)address),
+ [tmp] "+&d" (old),
+ [new] "+&d" (new),
+ [mask] "+&d" (mask)
+ :: "memory", "cc");
+ return prev >> shift;
+ }
+ case 2: {
+ unsigned int prev, shift, mask;
+
+ shift = (2 ^ (address & 2)) << 3;
+ address ^= address & 2;
+ old = (old & 0xffff) << shift;
+ new = (new & 0xffff) << shift;
+ mask = ~(0xffff << shift);
+ asm volatile(
+ " l %[prev],%[address]\n"
+ " nr %[prev],%[mask]\n"
+ " xilf %[mask],0xffffffff\n"
+ " or %[new],%[prev]\n"
+ " or %[prev],%[tmp]\n"
+ "0: lr %[tmp],%[prev]\n"
+ " cs %[prev],%[new],%[address]\n"
+ " jnl 1f\n"
+ " xr %[tmp],%[prev]\n"
+ " xr %[new],%[tmp]\n"
+ " nr %[tmp],%[mask]\n"
+ " jz 0b\n"
+ "1:"
+ : [prev] "=&d" (prev),
+ [address] "+Q" (*(int *)address),
+ [tmp] "+&d" (old),
+ [new] "+&d" (new),
+ [mask] "+&d" (mask)
+ :: "memory", "cc");
+ return prev >> shift;
+ }
+ case 4: {
+ unsigned int prev = old;
+
+ asm volatile(
+ " cs %[prev],%[new],%[address]\n"
+ : [prev] "+&d" (prev),
+ [address] "+Q" (*(int *)address)
+ : [new] "d" (new)
+ : "memory", "cc");
+ return prev;
+ }
+ case 8: {
+ unsigned long prev = old;
+
+ asm volatile(
+ " csg %[prev],%[new],%[address]\n"
+ : [prev] "+&d" (prev),
+ [address] "+QS" (*(long *)address)
+ : [new] "d" (new)
+ : "memory", "cc");
+ return prev;
+ }
+ }
+ __cmpxchg_called_with_bad_pointer();
+ return old;
+}
+
+#define arch_cmpxchg(ptr, o, n) \
({ \
- __typeof__(p1) __p1 = (p1); \
- __typeof__(p2) __p2 = (p2); \
- BUILD_BUG_ON(sizeof(*(p1)) != sizeof(long)); \
- BUILD_BUG_ON(sizeof(*(p2)) != sizeof(long)); \
- VM_BUG_ON((unsigned long)((__p1) + 1) != (unsigned long)(__p2));\
- __cmpxchg_double(__p1, __p2, o1, o2, n1, n2); \
+ __typeof__(*(ptr)) __ret; \
+ \
+ __ret = (__typeof__(*(ptr))) \
+ __cmpxchg((unsigned long)(ptr), (unsigned long)(o), \
+ (unsigned long)(n), sizeof(*(ptr))); \
+ __ret; \
})
-#define system_has_cmpxchg_double() 1
+#define arch_cmpxchg64 arch_cmpxchg
+#define arch_cmpxchg_local arch_cmpxchg
+#define arch_cmpxchg64_local arch_cmpxchg
+
+#define system_has_cmpxchg128() 1
+
+static __always_inline u128 arch_cmpxchg128(volatile u128 *ptr, u128 old, u128 new)
+{
+ asm volatile(
+ " cdsg %[old],%[new],%[ptr]\n"
+ : [old] "+d" (old), [ptr] "+QS" (*ptr)
+ : [new] "d" (new)
+ : "memory", "cc");
+ return old;
+}
+
+#define arch_cmpxchg128 arch_cmpxchg128
#endif /* __ASM_CMPXCHG_H */
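The 1- and 2-byte cases of __cmpxchg() above emulate a narrow compare-and-swap with CS on the containing aligned 32-bit word, retrying only when a neighbouring byte changed underneath. A portable C11 sketch of the same idea, assuming big-endian byte placement as on s390 (names are illustrative):

#include <stdatomic.h>
#include <stdint.h>

static uint8_t sketch_cmpxchg_u8(uint8_t *ptr, uint8_t old, uint8_t want)
{
	uintptr_t addr = (uintptr_t)ptr;
	_Atomic uint32_t *word = (_Atomic uint32_t *)(addr & ~(uintptr_t)3);
	unsigned int shift = (3 ^ (addr & 3)) << 3;	/* big-endian byte position */
	uint32_t mask = 0xffu << shift;
	uint32_t cur = atomic_load(word);

	for (;;) {
		uint32_t expected = (cur & ~mask) | ((uint32_t)old << shift);
		uint32_t desired = (cur & ~mask) | ((uint32_t)want << shift);

		if (atomic_compare_exchange_strong(word, &expected, desired))
			return old;			/* whole word matched, byte was swapped */
		cur = expected;				/* word as actually seen in memory */
		if (((cur & mask) >> shift) != old)
			return (cur & mask) >> shift;	/* our byte really differs: give up */
		/* only the surrounding bytes changed: rebuild and retry */
	}
}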
diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h
index 63b46e30b2c3..3cb9d813f022 100644
--- a/arch/s390/include/asm/compat.h
+++ b/arch/s390/include/asm/compat.h
@@ -8,6 +8,22 @@
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/thread_info.h>
+#include <asm/ptrace.h>
+
+#define compat_mode_t compat_mode_t
+typedef u16 compat_mode_t;
+
+#define __compat_uid_t __compat_uid_t
+typedef u16 __compat_uid_t;
+typedef u16 __compat_gid_t;
+
+#define compat_dev_t compat_dev_t
+typedef u16 compat_dev_t;
+
+#define compat_ipc_pid_t compat_ipc_pid_t
+typedef u16 compat_ipc_pid_t;
+
+#define compat_statfs compat_statfs
#include <asm-generic/compat.h>
@@ -19,52 +35,16 @@
(__force t)(__TYPE_IS_PTR(t) ? ((v) & 0x7fffffff) : (v)); \
})
-#define PSW32_MASK_PER 0x40000000UL
-#define PSW32_MASK_DAT 0x04000000UL
-#define PSW32_MASK_IO 0x02000000UL
-#define PSW32_MASK_EXT 0x01000000UL
-#define PSW32_MASK_KEY 0x00F00000UL
-#define PSW32_MASK_BASE 0x00080000UL /* Always one */
-#define PSW32_MASK_MCHECK 0x00040000UL
-#define PSW32_MASK_WAIT 0x00020000UL
-#define PSW32_MASK_PSTATE 0x00010000UL
-#define PSW32_MASK_ASC 0x0000C000UL
-#define PSW32_MASK_CC 0x00003000UL
-#define PSW32_MASK_PM 0x00000f00UL
-#define PSW32_MASK_RI 0x00000080UL
-
#define PSW32_MASK_USER 0x0000FF00UL
-#define PSW32_ADDR_AMODE 0x80000000UL
-#define PSW32_ADDR_INSN 0x7FFFFFFFUL
-
-#define PSW32_DEFAULT_KEY (((u32) PAGE_DEFAULT_ACC) << 20)
-
-#define PSW32_ASC_PRIMARY 0x00000000UL
-#define PSW32_ASC_ACCREG 0x00004000UL
-#define PSW32_ASC_SECONDARY 0x00008000UL
-#define PSW32_ASC_HOME 0x0000C000UL
-
#define PSW32_USER_BITS (PSW32_MASK_DAT | PSW32_MASK_IO | PSW32_MASK_EXT | \
PSW32_DEFAULT_KEY | PSW32_MASK_BASE | \
PSW32_MASK_MCHECK | PSW32_MASK_PSTATE | \
PSW32_ASC_PRIMARY)
-#define COMPAT_USER_HZ 100
#define COMPAT_UTS_MACHINE "s390\0\0\0\0"
-typedef u16 __compat_uid_t;
-typedef u16 __compat_gid_t;
-typedef u32 __compat_uid32_t;
-typedef u32 __compat_gid32_t;
-typedef u16 compat_mode_t;
-typedef u16 compat_dev_t;
typedef u16 compat_nlink_t;
-typedef u16 compat_ipc_pid_t;
-typedef u32 compat_caddr_t;
-typedef __kernel_fsid_t compat_fsid_t;
-typedef s64 compat_s64;
-typedef u64 compat_u64;
typedef struct {
u32 mask;
@@ -105,26 +85,6 @@ struct compat_stat {
u32 __unused5;
};
-struct compat_flock {
- short l_type;
- short l_whence;
- compat_off_t l_start;
- compat_off_t l_len;
- compat_pid_t l_pid;
-};
-
-#define F_GETLK64 12
-#define F_SETLK64 13
-#define F_SETLKW64 14
-
-struct compat_flock64 {
- short l_type;
- short l_whence;
- compat_loff_t l_start;
- compat_loff_t l_len;
- compat_pid_t l_pid;
-};
-
struct compat_statfs {
u32 f_type;
u32 f_bsize;
@@ -152,20 +112,9 @@ struct compat_statfs64 {
u32 f_namelen;
u32 f_frsize;
u32 f_flags;
- u32 f_spare[4];
+ u32 f_spare[5];
};
-#define COMPAT_RLIM_INFINITY 0xffffffff
-
-typedef u32 compat_old_sigset_t; /* at least 32 bits */
-
-#define _COMPAT_NSIG 64
-#define _COMPAT_NSIG_BPW 32
-
-typedef u32 compat_sigset_word;
-
-#define COMPAT_OFF_T_MAX 0x7fffffff
-
/*
* A pointer passed in from user mode. This should not
* be used for syscall parameters, just declare them
@@ -177,11 +126,7 @@ static inline void __user *compat_ptr(compat_uptr_t uptr)
{
return (void __user *)(unsigned long)(uptr & 0x7fffffffUL);
}
-
-static inline compat_uptr_t ptr_to_compat(void __user *uptr)
-{
- return (u32)(unsigned long)uptr;
-}
+#define compat_ptr(uptr) compat_ptr(uptr)
#ifdef CONFIG_COMPAT
@@ -190,73 +135,6 @@ static inline int is_compat_task(void)
return test_thread_flag(TIF_31BIT);
}
-static inline void __user *arch_compat_alloc_user_space(long len)
-{
- unsigned long stack;
-
- stack = KSTK_ESP(current);
- if (is_compat_task())
- stack &= 0x7fffffffUL;
- return (void __user *) (stack - len);
-}
-
#endif
-struct compat_ipc64_perm {
- compat_key_t key;
- __compat_uid32_t uid;
- __compat_gid32_t gid;
- __compat_uid32_t cuid;
- __compat_gid32_t cgid;
- compat_mode_t mode;
- unsigned short __pad1;
- unsigned short seq;
- unsigned short __pad2;
- unsigned int __unused1;
- unsigned int __unused2;
-};
-
-struct compat_semid64_ds {
- struct compat_ipc64_perm sem_perm;
- compat_ulong_t sem_otime;
- compat_ulong_t sem_otime_high;
- compat_ulong_t sem_ctime;
- compat_ulong_t sem_ctime_high;
- compat_ulong_t sem_nsems;
- compat_ulong_t __unused1;
- compat_ulong_t __unused2;
-};
-
-struct compat_msqid64_ds {
- struct compat_ipc64_perm msg_perm;
- compat_ulong_t msg_stime;
- compat_ulong_t msg_stime_high;
- compat_ulong_t msg_rtime;
- compat_ulong_t msg_rtime_high;
- compat_ulong_t msg_ctime;
- compat_ulong_t msg_ctime_high;
- compat_ulong_t msg_cbytes;
- compat_ulong_t msg_qnum;
- compat_ulong_t msg_qbytes;
- compat_pid_t msg_lspid;
- compat_pid_t msg_lrpid;
- compat_ulong_t __unused1;
- compat_ulong_t __unused2;
-};
-
-struct compat_shmid64_ds {
- struct compat_ipc64_perm shm_perm;
- compat_size_t shm_segsz;
- compat_ulong_t shm_atime;
- compat_ulong_t shm_atime_high;
- compat_ulong_t shm_dtime;
- compat_ulong_t shm_dtime_high;
- compat_ulong_t shm_ctime;
- compat_ulong_t shm_ctime_high;
- compat_pid_t shm_cpid;
- compat_pid_t shm_lpid;
- compat_ulong_t shm_nattch;
- compat_ulong_t __unused1;
- compat_ulong_t __unused2;
-};
#endif /* _ASM_S390X_COMPAT_H */
diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h
index c0f3bfeddcbe..b378e2b57ad8 100644
--- a/arch/s390/include/asm/cpacf.h
+++ b/arch/s390/include/asm/cpacf.h
@@ -2,7 +2,7 @@
/*
* CP Assist for Cryptographic Functions (CPACF)
*
- * Copyright IBM Corp. 2003, 2017
+ * Copyright IBM Corp. 2003, 2023
* Author(s): Thomas Spatzier
* Jan Glauber
* Harald Freudenberger (freude@de.ibm.com)
@@ -132,6 +132,11 @@
#define CPACF_PCKMO_ENC_AES_128_KEY 0x12
#define CPACF_PCKMO_ENC_AES_192_KEY 0x13
#define CPACF_PCKMO_ENC_AES_256_KEY 0x14
+#define CPACF_PCKMO_ENC_ECC_P256_KEY 0x20
+#define CPACF_PCKMO_ENC_ECC_P384_KEY 0x21
+#define CPACF_PCKMO_ENC_ECC_P521_KEY 0x22
+#define CPACF_PCKMO_ENC_ECC_ED25519_KEY 0x28
+#define CPACF_PCKMO_ENC_ECC_ED448_KEY 0x29
/*
* Function codes for the PRNO (PERFORM RANDOM NUMBER OPERATION)
@@ -173,17 +178,16 @@ typedef struct { unsigned char bytes[16]; } cpacf_mask_t;
*/
static __always_inline void __cpacf_query(unsigned int opcode, cpacf_mask_t *mask)
{
- register unsigned long r0 asm("0") = 0; /* query function */
- register unsigned long r1 asm("1") = (unsigned long) mask;
-
asm volatile(
- " spm 0\n" /* pckmo doesn't change the cc */
+ " lghi 0,0\n" /* query function */
+ " lgr 1,%[mask]\n"
+ " spm 0\n" /* pckmo doesn't change the cc */
/* Parameter regs are ignored, but must be nonzero and unique */
"0: .insn rrf,%[opc] << 16,2,4,6,0\n"
" brc 1,0b\n" /* handle partial completion */
: "=m" (*mask)
- : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (opcode)
- : "cc");
+ : [mask] "d" ((unsigned long)mask), [opc] "i" (opcode)
+ : "cc", "0", "1");
}
static __always_inline int __cpacf_check_opcode(unsigned int opcode)
@@ -249,20 +253,22 @@ static __always_inline int cpacf_query_func(unsigned int opcode, unsigned int fu
static inline int cpacf_km(unsigned long func, void *param,
u8 *dest, const u8 *src, long src_len)
{
- register unsigned long r0 asm("0") = (unsigned long) func;
- register unsigned long r1 asm("1") = (unsigned long) param;
- register unsigned long r2 asm("2") = (unsigned long) src;
- register unsigned long r3 asm("3") = (unsigned long) src_len;
- register unsigned long r4 asm("4") = (unsigned long) dest;
+ union register_pair d, s;
+ d.even = (unsigned long)dest;
+ s.even = (unsigned long)src;
+ s.odd = (unsigned long)src_len;
asm volatile(
+ " lgr 0,%[fc]\n"
+ " lgr 1,%[pba]\n"
"0: .insn rre,%[opc] << 16,%[dst],%[src]\n"
" brc 1,0b\n" /* handle partial completion */
- : [src] "+a" (r2), [len] "+d" (r3), [dst] "+a" (r4)
- : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KM)
- : "cc", "memory");
+ : [src] "+&d" (s.pair), [dst] "+&d" (d.pair)
+ : [fc] "d" (func), [pba] "d" ((unsigned long)param),
+ [opc] "i" (CPACF_KM)
+ : "cc", "memory", "0", "1");
- return src_len - r3;
+ return src_len - s.odd;
}
/**
@@ -279,20 +285,22 @@ static inline int cpacf_km(unsigned long func, void *param,
static inline int cpacf_kmc(unsigned long func, void *param,
u8 *dest, const u8 *src, long src_len)
{
- register unsigned long r0 asm("0") = (unsigned long) func;
- register unsigned long r1 asm("1") = (unsigned long) param;
- register unsigned long r2 asm("2") = (unsigned long) src;
- register unsigned long r3 asm("3") = (unsigned long) src_len;
- register unsigned long r4 asm("4") = (unsigned long) dest;
+ union register_pair d, s;
+ d.even = (unsigned long)dest;
+ s.even = (unsigned long)src;
+ s.odd = (unsigned long)src_len;
asm volatile(
+ " lgr 0,%[fc]\n"
+ " lgr 1,%[pba]\n"
"0: .insn rre,%[opc] << 16,%[dst],%[src]\n"
" brc 1,0b\n" /* handle partial completion */
- : [src] "+a" (r2), [len] "+d" (r3), [dst] "+a" (r4)
- : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KMC)
- : "cc", "memory");
+ : [src] "+&d" (s.pair), [dst] "+&d" (d.pair)
+ : [fc] "d" (func), [pba] "d" ((unsigned long)param),
+ [opc] "i" (CPACF_KMC)
+ : "cc", "memory", "0", "1");
- return src_len - r3;
+ return src_len - s.odd;
}
/**
@@ -306,17 +314,19 @@ static inline int cpacf_kmc(unsigned long func, void *param,
static inline void cpacf_kimd(unsigned long func, void *param,
const u8 *src, long src_len)
{
- register unsigned long r0 asm("0") = (unsigned long) func;
- register unsigned long r1 asm("1") = (unsigned long) param;
- register unsigned long r2 asm("2") = (unsigned long) src;
- register unsigned long r3 asm("3") = (unsigned long) src_len;
+ union register_pair s;
+ s.even = (unsigned long)src;
+ s.odd = (unsigned long)src_len;
asm volatile(
+ " lgr 0,%[fc]\n"
+ " lgr 1,%[pba]\n"
"0: .insn rre,%[opc] << 16,0,%[src]\n"
" brc 1,0b\n" /* handle partial completion */
- : [src] "+a" (r2), [len] "+d" (r3)
- : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KIMD)
- : "cc", "memory");
+ : [src] "+&d" (s.pair)
+ : [fc] "d" (func), [pba] "d" ((unsigned long)(param)),
+ [opc] "i" (CPACF_KIMD)
+ : "cc", "memory", "0", "1");
}
/**
@@ -329,17 +339,19 @@ static inline void cpacf_kimd(unsigned long func, void *param,
static inline void cpacf_klmd(unsigned long func, void *param,
const u8 *src, long src_len)
{
- register unsigned long r0 asm("0") = (unsigned long) func;
- register unsigned long r1 asm("1") = (unsigned long) param;
- register unsigned long r2 asm("2") = (unsigned long) src;
- register unsigned long r3 asm("3") = (unsigned long) src_len;
+ union register_pair s;
+ s.even = (unsigned long)src;
+ s.odd = (unsigned long)src_len;
asm volatile(
+ " lgr 0,%[fc]\n"
+ " lgr 1,%[pba]\n"
"0: .insn rre,%[opc] << 16,0,%[src]\n"
" brc 1,0b\n" /* handle partial completion */
- : [src] "+a" (r2), [len] "+d" (r3)
- : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KLMD)
- : "cc", "memory");
+ : [src] "+&d" (s.pair)
+ : [fc] "d" (func), [pba] "d" ((unsigned long)param),
+ [opc] "i" (CPACF_KLMD)
+ : "cc", "memory", "0", "1");
}
/**
@@ -355,19 +367,21 @@ static inline void cpacf_klmd(unsigned long func, void *param,
static inline int cpacf_kmac(unsigned long func, void *param,
const u8 *src, long src_len)
{
- register unsigned long r0 asm("0") = (unsigned long) func;
- register unsigned long r1 asm("1") = (unsigned long) param;
- register unsigned long r2 asm("2") = (unsigned long) src;
- register unsigned long r3 asm("3") = (unsigned long) src_len;
+ union register_pair s;
+ s.even = (unsigned long)src;
+ s.odd = (unsigned long)src_len;
asm volatile(
+ " lgr 0,%[fc]\n"
+ " lgr 1,%[pba]\n"
"0: .insn rre,%[opc] << 16,0,%[src]\n"
" brc 1,0b\n" /* handle partial completion */
- : [src] "+a" (r2), [len] "+d" (r3)
- : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KMAC)
- : "cc", "memory");
+ : [src] "+&d" (s.pair)
+ : [fc] "d" (func), [pba] "d" ((unsigned long)param),
+ [opc] "i" (CPACF_KMAC)
+ : "cc", "memory", "0", "1");
- return src_len - r3;
+ return src_len - s.odd;
}
/**
@@ -385,22 +399,24 @@ static inline int cpacf_kmac(unsigned long func, void *param,
static inline int cpacf_kmctr(unsigned long func, void *param, u8 *dest,
const u8 *src, long src_len, u8 *counter)
{
- register unsigned long r0 asm("0") = (unsigned long) func;
- register unsigned long r1 asm("1") = (unsigned long) param;
- register unsigned long r2 asm("2") = (unsigned long) src;
- register unsigned long r3 asm("3") = (unsigned long) src_len;
- register unsigned long r4 asm("4") = (unsigned long) dest;
- register unsigned long r6 asm("6") = (unsigned long) counter;
+ union register_pair d, s, c;
+ d.even = (unsigned long)dest;
+ s.even = (unsigned long)src;
+ s.odd = (unsigned long)src_len;
+ c.even = (unsigned long)counter;
asm volatile(
+ " lgr 0,%[fc]\n"
+ " lgr 1,%[pba]\n"
"0: .insn rrf,%[opc] << 16,%[dst],%[src],%[ctr],0\n"
" brc 1,0b\n" /* handle partial completion */
- : [src] "+a" (r2), [len] "+d" (r3),
- [dst] "+a" (r4), [ctr] "+a" (r6)
- : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KMCTR)
- : "cc", "memory");
+ : [src] "+&d" (s.pair), [dst] "+&d" (d.pair),
+ [ctr] "+&d" (c.pair)
+ : [fc] "d" (func), [pba] "d" ((unsigned long)param),
+ [opc] "i" (CPACF_KMCTR)
+ : "cc", "memory", "0", "1");
- return src_len - r3;
+ return src_len - s.odd;
}
/**
@@ -417,20 +433,21 @@ static inline void cpacf_prno(unsigned long func, void *param,
u8 *dest, unsigned long dest_len,
const u8 *seed, unsigned long seed_len)
{
- register unsigned long r0 asm("0") = (unsigned long) func;
- register unsigned long r1 asm("1") = (unsigned long) param;
- register unsigned long r2 asm("2") = (unsigned long) dest;
- register unsigned long r3 asm("3") = (unsigned long) dest_len;
- register unsigned long r4 asm("4") = (unsigned long) seed;
- register unsigned long r5 asm("5") = (unsigned long) seed_len;
+ union register_pair d, s;
+ d.even = (unsigned long)dest;
+ d.odd = (unsigned long)dest_len;
+ s.even = (unsigned long)seed;
+ s.odd = (unsigned long)seed_len;
asm volatile (
+ " lgr 0,%[fc]\n"
+ " lgr 1,%[pba]\n"
"0: .insn rre,%[opc] << 16,%[dst],%[seed]\n"
" brc 1,0b\n" /* handle partial completion */
- : [dst] "+a" (r2), [dlen] "+d" (r3)
- : [fc] "d" (r0), [pba] "a" (r1),
- [seed] "a" (r4), [slen] "d" (r5), [opc] "i" (CPACF_PRNO)
- : "cc", "memory");
+ : [dst] "+&d" (d.pair)
+ : [fc] "d" (func), [pba] "d" ((unsigned long)param),
+ [seed] "d" (s.pair), [opc] "i" (CPACF_PRNO)
+ : "cc", "memory", "0", "1");
}
/**
@@ -443,19 +460,19 @@ static inline void cpacf_prno(unsigned long func, void *param,
static inline void cpacf_trng(u8 *ucbuf, unsigned long ucbuf_len,
u8 *cbuf, unsigned long cbuf_len)
{
- register unsigned long r0 asm("0") = (unsigned long) CPACF_PRNO_TRNG;
- register unsigned long r2 asm("2") = (unsigned long) ucbuf;
- register unsigned long r3 asm("3") = (unsigned long) ucbuf_len;
- register unsigned long r4 asm("4") = (unsigned long) cbuf;
- register unsigned long r5 asm("5") = (unsigned long) cbuf_len;
+ union register_pair u, c;
+ u.even = (unsigned long)ucbuf;
+ u.odd = (unsigned long)ucbuf_len;
+ c.even = (unsigned long)cbuf;
+ c.odd = (unsigned long)cbuf_len;
asm volatile (
+ " lghi 0,%[fc]\n"
"0: .insn rre,%[opc] << 16,%[ucbuf],%[cbuf]\n"
" brc 1,0b\n" /* handle partial completion */
- : [ucbuf] "+a" (r2), [ucbuflen] "+d" (r3),
- [cbuf] "+a" (r4), [cbuflen] "+d" (r5)
- : [fc] "d" (r0), [opc] "i" (CPACF_PRNO)
- : "cc", "memory");
+ : [ucbuf] "+&d" (u.pair), [cbuf] "+&d" (c.pair)
+ : [fc] "K" (CPACF_PRNO_TRNG), [opc] "i" (CPACF_PRNO)
+ : "cc", "memory", "0");
}
/**
@@ -466,15 +483,15 @@ static inline void cpacf_trng(u8 *ucbuf, unsigned long ucbuf_len,
*/
static inline void cpacf_pcc(unsigned long func, void *param)
{
- register unsigned long r0 asm("0") = (unsigned long) func;
- register unsigned long r1 asm("1") = (unsigned long) param;
-
asm volatile(
+ " lgr 0,%[fc]\n"
+ " lgr 1,%[pba]\n"
"0: .insn rre,%[opc] << 16,0,0\n" /* PCC opcode */
" brc 1,0b\n" /* handle partial completion */
:
- : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_PCC)
- : "cc", "memory");
+ : [fc] "d" (func), [pba] "d" ((unsigned long)param),
+ [opc] "i" (CPACF_PCC)
+ : "cc", "memory", "0", "1");
}
/**
@@ -487,14 +504,14 @@ static inline void cpacf_pcc(unsigned long func, void *param)
*/
static inline void cpacf_pckmo(long func, void *param)
{
- register unsigned long r0 asm("0") = (unsigned long) func;
- register unsigned long r1 asm("1") = (unsigned long) param;
-
asm volatile(
+ " lgr 0,%[fc]\n"
+ " lgr 1,%[pba]\n"
" .insn rre,%[opc] << 16,0,0\n" /* PCKMO opcode */
:
- : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_PCKMO)
- : "cc", "memory");
+ : [fc] "d" (func), [pba] "d" ((unsigned long)param),
+ [opc] "i" (CPACF_PCKMO)
+ : "cc", "memory", "0", "1");
}
/**
@@ -512,21 +529,23 @@ static inline void cpacf_kma(unsigned long func, void *param, u8 *dest,
const u8 *src, unsigned long src_len,
const u8 *aad, unsigned long aad_len)
{
- register unsigned long r0 asm("0") = (unsigned long) func;
- register unsigned long r1 asm("1") = (unsigned long) param;
- register unsigned long r2 asm("2") = (unsigned long) src;
- register unsigned long r3 asm("3") = (unsigned long) src_len;
- register unsigned long r4 asm("4") = (unsigned long) aad;
- register unsigned long r5 asm("5") = (unsigned long) aad_len;
- register unsigned long r6 asm("6") = (unsigned long) dest;
+ union register_pair d, s, a;
+ d.even = (unsigned long)dest;
+ s.even = (unsigned long)src;
+ s.odd = (unsigned long)src_len;
+ a.even = (unsigned long)aad;
+ a.odd = (unsigned long)aad_len;
asm volatile(
+ " lgr 0,%[fc]\n"
+ " lgr 1,%[pba]\n"
"0: .insn rrf,%[opc] << 16,%[dst],%[src],%[aad],0\n"
" brc 1,0b\n" /* handle partial completion */
- : [dst] "+a" (r6), [src] "+a" (r2), [slen] "+d" (r3),
- [aad] "+a" (r4), [alen] "+d" (r5)
- : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KMA)
- : "cc", "memory");
+ : [dst] "+&d" (d.pair), [src] "+&d" (s.pair),
+ [aad] "+&d" (a.pair)
+ : [fc] "d" (func), [pba] "d" ((unsigned long)param),
+ [opc] "i" (CPACF_KMA)
+ : "cc", "memory", "0", "1");
}
#endif /* _ASM_S390_CPACF_H */
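All of the CPACF helpers above drop the hard-coded "register ... asm("n")" variables in favour of union register_pair, which describes an even/odd general register pair as a single 128-bit inline-assembly operand. A rough sketch of the idea; the real definition lives in asm/types.h and the names below are illustrative:

union register_pair_sketch {
	unsigned __int128 pair;		/* passed to asm as one "+&d" operand */
	struct {
		unsigned long even;	/* even register of the pair, e.g. the address */
		unsigned long odd;	/* odd register of the pair, e.g. the length */
	};
};

/* typical pattern, as in cpacf_kimd() above */
static inline unsigned long sketch_consume(const void *buf, unsigned long len)
{
	union register_pair_sketch s;

	s.even = (unsigned long)buf;
	s.odd = (unsigned long)len;
	/* asm volatile("..." : [src] "+&d" (s.pair) : ... : "cc", "memory"); */
	return s.odd;			/* remaining length after the instruction */
}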
diff --git a/arch/s390/include/asm/cpu.h b/arch/s390/include/asm/cpu.h
index 62228a884e06..26c710cd3485 100644
--- a/arch/s390/include/asm/cpu.h
+++ b/arch/s390/include/asm/cpu.h
@@ -12,6 +12,7 @@
#ifndef __ASSEMBLY__
#include <linux/types.h>
+#include <linux/jump_label.h>
struct cpuid
{
@@ -21,5 +22,7 @@ struct cpuid
unsigned int unused : 16;
} __attribute__ ((packed, aligned(8)));
+DECLARE_STATIC_KEY_FALSE(cpu_has_bear);
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_S390_CPU_H */
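A short, hedged sketch of how a static key such as the newly declared cpu_has_bear is typically consumed; the defining DEFINE_STATIC_KEY_FALSE() and the enable call live in the architecture setup code, not in this header:

#include <linux/jump_label.h>
#include <linux/types.h>
#include <asm/cpu.h>

static bool sketch_bear_available(void)
{
	/* patched branch: no memory load on the hot path once the key is set */
	return static_branch_likely(&cpu_has_bear);
}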
diff --git a/arch/s390/include/asm/cpu_mcf.h b/arch/s390/include/asm/cpu_mcf.h
deleted file mode 100644
index 649b9fc60685..000000000000
--- a/arch/s390/include/asm/cpu_mcf.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Counter facility support definitions for the Linux perf
- *
- * Copyright IBM Corp. 2019
- * Author(s): Hendrik Brueckner <brueckner@linux.ibm.com>
- */
-#ifndef _ASM_S390_CPU_MCF_H
-#define _ASM_S390_CPU_MCF_H
-
-#include <linux/perf_event.h>
-#include <asm/cpu_mf.h>
-
-enum cpumf_ctr_set {
- CPUMF_CTR_SET_BASIC = 0, /* Basic Counter Set */
- CPUMF_CTR_SET_USER = 1, /* Problem-State Counter Set */
- CPUMF_CTR_SET_CRYPTO = 2, /* Crypto-Activity Counter Set */
- CPUMF_CTR_SET_EXT = 3, /* Extended Counter Set */
- CPUMF_CTR_SET_MT_DIAG = 4, /* MT-diagnostic Counter Set */
-
- /* Maximum number of counter sets */
- CPUMF_CTR_SET_MAX,
-};
-
-#define CPUMF_LCCTL_ENABLE_SHIFT 16
-#define CPUMF_LCCTL_ACTCTL_SHIFT 0
-static const u64 cpumf_ctr_ctl[CPUMF_CTR_SET_MAX] = {
- [CPUMF_CTR_SET_BASIC] = 0x02,
- [CPUMF_CTR_SET_USER] = 0x04,
- [CPUMF_CTR_SET_CRYPTO] = 0x08,
- [CPUMF_CTR_SET_EXT] = 0x01,
- [CPUMF_CTR_SET_MT_DIAG] = 0x20,
-};
-
-static inline void ctr_set_enable(u64 *state, int ctr_set)
-{
- *state |= cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ENABLE_SHIFT;
-}
-static inline void ctr_set_disable(u64 *state, int ctr_set)
-{
- *state &= ~(cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ENABLE_SHIFT);
-}
-static inline void ctr_set_start(u64 *state, int ctr_set)
-{
- *state |= cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ACTCTL_SHIFT;
-}
-static inline void ctr_set_stop(u64 *state, int ctr_set)
-{
- *state &= ~(cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ACTCTL_SHIFT);
-}
-
-static inline void ctr_set_multiple_enable(u64 *state, u64 ctrsets)
-{
- *state |= ctrsets << CPUMF_LCCTL_ENABLE_SHIFT;
-}
-
-static inline void ctr_set_multiple_disable(u64 *state, u64 ctrsets)
-{
- *state &= ~(ctrsets << CPUMF_LCCTL_ENABLE_SHIFT);
-}
-
-static inline void ctr_set_multiple_start(u64 *state, u64 ctrsets)
-{
- *state |= ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT;
-}
-
-static inline void ctr_set_multiple_stop(u64 *state, u64 ctrsets)
-{
- *state &= ~(ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT);
-}
-
-static inline int ctr_stcctm(enum cpumf_ctr_set set, u64 range, u64 *dest)
-{
- switch (set) {
- case CPUMF_CTR_SET_BASIC:
- return stcctm(BASIC, range, dest);
- case CPUMF_CTR_SET_USER:
- return stcctm(PROBLEM_STATE, range, dest);
- case CPUMF_CTR_SET_CRYPTO:
- return stcctm(CRYPTO_ACTIVITY, range, dest);
- case CPUMF_CTR_SET_EXT:
- return stcctm(EXTENDED, range, dest);
- case CPUMF_CTR_SET_MT_DIAG:
- return stcctm(MT_DIAG_CLEARING, range, dest);
- case CPUMF_CTR_SET_MAX:
- return 3;
- }
- return 3;
-}
-
-struct cpu_cf_events {
- struct cpumf_ctr_info info;
- atomic_t ctr_set[CPUMF_CTR_SET_MAX];
- atomic64_t alert;
- u64 state, tx_state;
- unsigned int flags;
- unsigned int txn_flags;
-};
-DECLARE_PER_CPU(struct cpu_cf_events, cpu_cf_events);
-
-bool kernel_cpumcf_avail(void);
-int __kernel_cpumcf_begin(void);
-unsigned long kernel_cpumcf_alert(int clear);
-void __kernel_cpumcf_end(void);
-
-static inline int kernel_cpumcf_begin(void)
-{
- if (!cpum_cf_avail())
- return -ENODEV;
-
- preempt_disable();
- return __kernel_cpumcf_begin();
-}
-static inline void kernel_cpumcf_end(void)
-{
- __kernel_cpumcf_end();
- preempt_enable();
-}
-
-/* Return true if store counter set multiple instruction is available */
-static inline int stccm_avail(void)
-{
- return test_facility(142);
-}
-
-#endif /* _ASM_S390_CPU_MCF_H */
diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h
index 0d90cbeb89b4..a0de5b9b02ea 100644
--- a/arch/s390/include/asm/cpu_mf.h
+++ b/arch/s390/include/asm/cpu_mf.h
@@ -10,6 +10,7 @@
#define _ASM_S390_CPU_MF_H
#include <linux/errno.h>
+#include <asm/asm-extable.h>
#include <asm/facility.h>
asm(".include \"asm/cpu_mf-insn.h\"\n");
@@ -41,7 +42,6 @@ static inline int cpum_sf_avail(void)
return test_facility(40) && test_facility(68);
}
-
struct cpumf_ctr_info {
u16 cfvn;
u16 auth_ctl;
@@ -109,7 +109,9 @@ struct hws_basic_entry {
unsigned int AS:2; /* 29-30 PSW address-space control */
unsigned int I:1; /* 31 entry valid or invalid */
unsigned int CL:2; /* 32-33 Configuration Level */
- unsigned int:14;
+ unsigned int H:1; /* 34 Host Indicator */
+ unsigned int LS:1; /* 35 Limited Sampling */
+ unsigned int:12;
unsigned int prim_asn:16; /* primary ASN */
unsigned long long ia; /* Instruction Address */
unsigned long long gpp; /* Guest Program Parameter */
@@ -128,19 +130,21 @@ struct hws_combined_entry {
struct hws_diag_entry diag; /* Diagnostic-sampling data entry */
} __packed;
-struct hws_trailer_entry {
- union {
- struct {
- unsigned int f:1; /* 0 - Block Full Indicator */
- unsigned int a:1; /* 1 - Alert request control */
- unsigned int t:1; /* 2 - Timestamp format */
- unsigned int :29; /* 3 - 31: Reserved */
- unsigned int bsdes:16; /* 32-47: size of basic SDE */
- unsigned int dsdes:16; /* 48-63: size of diagnostic SDE */
- };
- unsigned long long flags; /* 0 - 63: All indicators */
+union hws_trailer_header {
+ struct {
+ unsigned int f:1; /* 0 - Block Full Indicator */
+ unsigned int a:1; /* 1 - Alert request control */
+ unsigned int t:1; /* 2 - Timestamp format */
+ unsigned int :29; /* 3 - 31: Reserved */
+ unsigned int bsdes:16; /* 32-47: size of basic SDE */
+ unsigned int dsdes:16; /* 48-63: size of diagnostic SDE */
+ unsigned long long overflow; /* 64 - Overflow Count */
};
- unsigned long long overflow; /* 64 - sample Overflow count */
+ u128 val;
+};
+
+struct hws_trailer_entry {
+ union hws_trailer_header header; /* 0 - 15 Flags + Overflow Count */
unsigned char timestamp[16]; /* 16 - 31 timestamp */
unsigned long long reserved1; /* 32 -Reserved */
unsigned long long reserved2; /* */
@@ -157,7 +161,7 @@ struct hws_trailer_entry {
/* Load program parameter */
static inline void lpp(void *pp)
{
- asm volatile(".insn s,0xb2800000,0(%0)\n":: "a" (pp) : "memory");
+ asm volatile("lpp 0(%0)\n" :: "a" (pp) : "memory");
}
/* Query counter information */
@@ -166,7 +170,7 @@ static inline int qctri(struct cpumf_ctr_info *info)
int rc = -EINVAL;
asm volatile (
- "0: .insn s,0xb28e0000,%1\n"
+ "0: qctri %1\n"
"1: lhi %0,0\n"
"2:\n"
EX_TABLE(1b, 2b)
@@ -180,7 +184,7 @@ static inline int lcctl(u64 ctl)
int cc;
asm volatile (
- " .insn s,0xb2840000,%1\n"
+ " lcctl %1\n"
" ipm %0\n"
" srl %0,28\n"
: "=d" (cc) : "Q" (ctl) : "cc");
@@ -194,7 +198,7 @@ static inline int __ecctr(u64 ctr, u64 *content)
int cc;
asm volatile (
- " .insn rre,0xb2e40000,%0,%2\n"
+ " ecctr %0,%2\n"
" ipm %1\n"
" srl %1,28\n"
: "=d" (_content), "=d" (cc) : "d" (ctr) : "cc");
@@ -244,7 +248,7 @@ static inline int qsi(struct hws_qsi_info_block *info)
int cc = 1;
asm volatile(
- "0: .insn s,0xb2860000,%1\n"
+ "0: qsi %1\n"
"1: lhi %0,0\n"
"2:\n"
EX_TABLE(0b, 2b) EX_TABLE(1b, 2b)
@@ -259,7 +263,7 @@ static inline int lsctl(struct hws_lsctl_request_block *req)
cc = 1;
asm volatile(
- "0: .insn s,0xb2870000,0(%1)\n"
+ "0: lsctl 0(%1)\n"
"1: ipm %0\n"
" srl %0,28\n"
"2:\n"
@@ -270,59 +274,4 @@ static inline int lsctl(struct hws_lsctl_request_block *req)
return cc ? -EINVAL : 0;
}
-
-/* Sampling control helper functions */
-
-#include <linux/time.h>
-
-static inline unsigned long freq_to_sample_rate(struct hws_qsi_info_block *qsi,
- unsigned long freq)
-{
- return (USEC_PER_SEC / freq) * qsi->cpu_speed;
-}
-
-static inline unsigned long sample_rate_to_freq(struct hws_qsi_info_block *qsi,
- unsigned long rate)
-{
- return USEC_PER_SEC * qsi->cpu_speed / rate;
-}
-
-#define SDB_TE_ALERT_REQ_MASK 0x4000000000000000UL
-#define SDB_TE_BUFFER_FULL_MASK 0x8000000000000000UL
-
-/* Return TOD timestamp contained in an trailer entry */
-static inline unsigned long long trailer_timestamp(struct hws_trailer_entry *te)
-{
- /* TOD in STCKE format */
- if (te->t)
- return *((unsigned long long *) &te->timestamp[1]);
-
- /* TOD in STCK format */
- return *((unsigned long long *) &te->timestamp[0]);
-}
-
-/* Return pointer to trailer entry of an sample data block */
-static inline unsigned long *trailer_entry_ptr(unsigned long v)
-{
- void *ret;
-
- ret = (void *) v;
- ret += PAGE_SIZE;
- ret -= sizeof(struct hws_trailer_entry);
-
- return (unsigned long *) ret;
-}
-
-/* Return true if the entry in the sample data block table (sdbt)
- * is a link to the next sdbt */
-static inline int is_link_entry(unsigned long *s)
-{
- return *s & 0x1ul ? 1 : 0;
-}
-
-/* Return pointer to the linked sdbt */
-static inline unsigned long *get_next_sdbt(unsigned long *s)
-{
- return (unsigned long *) (*s & ~0x1ul);
-}
#endif /* _ASM_S390_CPU_MF_H */
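The reworked hws_trailer_entry exposes the flag bits and the overflow count through one 128-bit header value, so both can be updated together with the 16-byte compare-and-swap introduced earlier in this series. A hedged sketch of such an update loop; the real one lives in the sampling facility driver and the helper name here is illustrative:

static inline void sketch_request_alert(union hws_trailer_header *hdr)
{
	union hws_trailer_header old, new;

	old.val = READ_ONCE(hdr->val);
	do {
		new.val = old.val;
		new.a = 1;		/* request an alert interrupt */
		new.overflow = 0;	/* and clear the overflow counter */
	} while (!try_cmpxchg128(&hdr->val, &old.val, new.val));
}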
diff --git a/arch/s390/include/asm/cpufeature.h b/arch/s390/include/asm/cpufeature.h
index 1d007c6ede95..931204613753 100644
--- a/arch/s390/include/asm/cpufeature.h
+++ b/arch/s390/include/asm/cpufeature.h
@@ -2,28 +2,21 @@
/*
* Module interface for CPU features
*
- * Copyright IBM Corp. 2015
+ * Copyright IBM Corp. 2015, 2022
* Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
*/
#ifndef __ASM_S390_CPUFEATURE_H
#define __ASM_S390_CPUFEATURE_H
-#include <asm/elf.h>
+enum {
+ S390_CPU_FEATURE_MSA,
+ S390_CPU_FEATURE_VXRS,
+ S390_CPU_FEATURE_UV,
+ MAX_CPU_FEATURES
+};
-/* Hardware features on Linux on z Systems are indicated by facility bits that
- * are mapped to the so-called machine flags. Particular machine flags are
- * then used to define ELF hardware capabilities; most notably hardware flags
- * that are essential for user space / glibc.
- *
- * Restrict the set of exposed CPU features to ELF hardware capabilities for
- * now. Additional machine flags can be indicated by values larger than
- * MAX_ELF_HWCAP_FEATURES.
- */
-#define MAX_ELF_HWCAP_FEATURES (8 * sizeof(elf_hwcap))
-#define MAX_CPU_FEATURES MAX_ELF_HWCAP_FEATURES
-
-#define cpu_feature(feat) ilog2(HWCAP_S390_ ## feat)
+#define cpu_feature(feature) (feature)
int cpu_have_feature(unsigned int nr);
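A hedged sketch of how the new feature constants are meant to be consumed, both for a runtime check and for module autoloading via the generic linux/cpufeature.h helpers (the init function is illustrative):

#include <linux/cpufeature.h>
#include <linux/module.h>

static int __init sketch_init(void)
{
	if (!cpu_have_feature(S390_CPU_FEATURE_MSA))	/* runtime check */
		return -ENODEV;
	return 0;
}
module_cpu_feature_match(S390_CPU_FEATURE_MSA, sketch_init);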
diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h
index cb729d111e20..30bb3ec4e5fc 100644
--- a/arch/s390/include/asm/cputime.h
+++ b/arch/s390/include/asm/cputime.h
@@ -11,28 +11,11 @@
#include <linux/types.h>
#include <asm/timex.h>
-#define CPUTIME_PER_USEC 4096ULL
-#define CPUTIME_PER_SEC (CPUTIME_PER_USEC * USEC_PER_SEC)
-
-/* We want to use full resolution of the CPU timer: 2**-12 micro-seconds. */
-
-#define cmpxchg_cputime(ptr, old, new) cmpxchg64(ptr, old, new)
-
-/*
- * Convert cputime to microseconds.
- */
-static inline u64 cputime_to_usecs(const u64 cputime)
-{
- return cputime >> 12;
-}
-
/*
* Convert cputime to nanoseconds.
*/
#define cputime_to_nsecs(cputime) tod_to_ns(cputime)
-u64 arch_cpu_idle_time(int cpu);
-
-#define arch_idle_time(cpu) arch_cpu_idle_time(cpu)
+void account_idle_time_irq(void);
#endif /* _S390_CPUTIME_H */
diff --git a/arch/s390/include/asm/crw.h b/arch/s390/include/asm/crw.h
index c6ebfd31f1db..97456d98fe76 100644
--- a/arch/s390/include/asm/crw.h
+++ b/arch/s390/include/asm/crw.h
@@ -5,7 +5,6 @@
* Author(s): Ingo Adlung <adlung@de.ibm.com>,
* Martin Schwidefsky <schwidefsky@de.ibm.com>,
* Cornelia Huck <cornelia.huck@de.ibm.com>,
- * Heiko Carstens <heiko.carstens@de.ibm.com>,
*/
#ifndef _ASM_S390_CRW_H
diff --git a/arch/s390/include/asm/css_chars.h b/arch/s390/include/asm/css_chars.h
index 480bb02ccacd..638137d46c85 100644
--- a/arch/s390/include/asm/css_chars.h
+++ b/arch/s390/include/asm/css_chars.h
@@ -36,7 +36,9 @@ struct css_general_char {
u64 alt_ssi : 1; /* bit 108 */
u64 : 1;
u64 narf : 1; /* bit 110 */
- u64 : 12;
+ u64 : 5;
+ u64 enarf: 1; /* bit 116 */
+ u64 : 6;
u64 util_str : 1;/* bit 123 */
} __packed;
diff --git a/arch/s390/include/asm/ctl_reg.h b/arch/s390/include/asm/ctl_reg.h
deleted file mode 100644
index ed5efbb531c4..000000000000
--- a/arch/s390/include/asm/ctl_reg.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright IBM Corp. 1999, 2009
- *
- * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
- */
-
-#ifndef __ASM_CTL_REG_H
-#define __ASM_CTL_REG_H
-
-#include <linux/bits.h>
-
-#define CR0_CLOCK_COMPARATOR_SIGN BIT(63 - 10)
-#define CR0_LOW_ADDRESS_PROTECTION BIT(63 - 35)
-#define CR0_EMERGENCY_SIGNAL_SUBMASK BIT(63 - 49)
-#define CR0_EXTERNAL_CALL_SUBMASK BIT(63 - 50)
-#define CR0_CLOCK_COMPARATOR_SUBMASK BIT(63 - 52)
-#define CR0_CPU_TIMER_SUBMASK BIT(63 - 53)
-#define CR0_SERVICE_SIGNAL_SUBMASK BIT(63 - 54)
-#define CR0_UNUSED_56 BIT(63 - 56)
-#define CR0_INTERRUPT_KEY_SUBMASK BIT(63 - 57)
-#define CR0_MEASUREMENT_ALERT_SUBMASK BIT(63 - 58)
-
-#define CR2_GUARDED_STORAGE BIT(63 - 59)
-
-#define CR14_UNUSED_32 BIT(63 - 32)
-#define CR14_UNUSED_33 BIT(63 - 33)
-#define CR14_CHANNEL_REPORT_SUBMASK BIT(63 - 35)
-#define CR14_RECOVERY_SUBMASK BIT(63 - 36)
-#define CR14_DEGRADATION_SUBMASK BIT(63 - 37)
-#define CR14_EXTERNAL_DAMAGE_SUBMASK BIT(63 - 38)
-#define CR14_WARNING_SUBMASK BIT(63 - 39)
-
-#ifndef __ASSEMBLY__
-
-#include <linux/bug.h>
-
-#define __ctl_load(array, low, high) do { \
- typedef struct { char _[sizeof(array)]; } addrtype; \
- \
- BUILD_BUG_ON(sizeof(addrtype) != (high - low + 1) * sizeof(long));\
- asm volatile( \
- " lctlg %1,%2,%0\n" \
- : \
- : "Q" (*(addrtype *)(&array)), "i" (low), "i" (high) \
- : "memory"); \
-} while (0)
-
-#define __ctl_store(array, low, high) do { \
- typedef struct { char _[sizeof(array)]; } addrtype; \
- \
- BUILD_BUG_ON(sizeof(addrtype) != (high - low + 1) * sizeof(long));\
- asm volatile( \
- " stctg %1,%2,%0\n" \
- : "=Q" (*(addrtype *)(&array)) \
- : "i" (low), "i" (high)); \
-} while (0)
-
-static __always_inline void __ctl_set_bit(unsigned int cr, unsigned int bit)
-{
- unsigned long reg;
-
- __ctl_store(reg, cr, cr);
- reg |= 1UL << bit;
- __ctl_load(reg, cr, cr);
-}
-
-static __always_inline void __ctl_clear_bit(unsigned int cr, unsigned int bit)
-{
- unsigned long reg;
-
- __ctl_store(reg, cr, cr);
- reg &= ~(1UL << bit);
- __ctl_load(reg, cr, cr);
-}
-
-void smp_ctl_set_bit(int cr, int bit);
-void smp_ctl_clear_bit(int cr, int bit);
-
-union ctlreg0 {
- unsigned long val;
- struct {
- unsigned long : 8;
- unsigned long tcx : 1; /* Transactional-Execution control */
- unsigned long pifo : 1; /* Transactional-Execution Program-
- Interruption-Filtering Override */
- unsigned long : 22;
- unsigned long : 3;
- unsigned long lap : 1; /* Low-address-protection control */
- unsigned long : 4;
- unsigned long edat : 1; /* Enhanced-DAT-enablement control */
- unsigned long : 2;
- unsigned long iep : 1; /* Instruction-Execution-Protection */
- unsigned long : 1;
- unsigned long afp : 1; /* AFP-register control */
- unsigned long vx : 1; /* Vector enablement control */
- unsigned long : 7;
- unsigned long sssm : 1; /* Service signal subclass mask */
- unsigned long : 9;
- };
-};
-
-union ctlreg2 {
- unsigned long val;
- struct {
- unsigned long : 33;
- unsigned long ducto : 25;
- unsigned long : 1;
- unsigned long gse : 1;
- unsigned long : 1;
- unsigned long tds : 1;
- unsigned long tdc : 2;
- };
-};
-
-#define ctl_set_bit(cr, bit) smp_ctl_set_bit(cr, bit)
-#define ctl_clear_bit(cr, bit) smp_ctl_clear_bit(cr, bit)
-
-#endif /* __ASSEMBLY__ */
-#endif /* __ASM_CTL_REG_H */
diff --git a/arch/s390/include/asm/ctlreg.h b/arch/s390/include/asm/ctlreg.h
new file mode 100644
index 000000000000..72a9556d04f3
--- /dev/null
+++ b/arch/s390/include/asm/ctlreg.h
@@ -0,0 +1,255 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright IBM Corp. 1999, 2009
+ *
+ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ */
+
+#ifndef __ASM_S390_CTLREG_H
+#define __ASM_S390_CTLREG_H
+
+#include <linux/bits.h>
+
+#define CR0_TRANSACTIONAL_EXECUTION_BIT (63 - 8)
+#define CR0_CLOCK_COMPARATOR_SIGN_BIT (63 - 10)
+#define CR0_CRYPTOGRAPHY_COUNTER_BIT (63 - 13)
+#define CR0_PAI_EXTENSION_BIT (63 - 14)
+#define CR0_CPUMF_EXTRACTION_AUTH_BIT (63 - 15)
+#define CR0_WARNING_TRACK_BIT (63 - 30)
+#define CR0_LOW_ADDRESS_PROTECTION_BIT (63 - 35)
+#define CR0_FETCH_PROTECTION_OVERRIDE_BIT (63 - 38)
+#define CR0_STORAGE_PROTECTION_OVERRIDE_BIT (63 - 39)
+#define CR0_EDAT_BIT (63 - 40)
+#define CR0_INSTRUCTION_EXEC_PROTECTION_BIT (63 - 43)
+#define CR0_VECTOR_BIT (63 - 46)
+#define CR0_MALFUNCTION_ALERT_SUBMASK_BIT (63 - 48)
+#define CR0_EMERGENCY_SIGNAL_SUBMASK_BIT (63 - 49)
+#define CR0_EXTERNAL_CALL_SUBMASK_BIT (63 - 50)
+#define CR0_CLOCK_COMPARATOR_SUBMASK_BIT (63 - 52)
+#define CR0_CPU_TIMER_SUBMASK_BIT (63 - 53)
+#define CR0_SERVICE_SIGNAL_SUBMASK_BIT (63 - 54)
+#define CR0_UNUSED_56_BIT (63 - 56)
+#define CR0_INTERRUPT_KEY_SUBMASK_BIT (63 - 57)
+#define CR0_MEASUREMENT_ALERT_SUBMASK_BIT (63 - 58)
+#define CR0_ETR_SUBMASK_BIT (63 - 59)
+#define CR0_IUCV_BIT (63 - 62)
+
+#define CR0_TRANSACTIONAL_EXECUTION BIT(CR0_TRANSACTIONAL_EXECUTION_BIT)
+#define CR0_CLOCK_COMPARATOR_SIGN BIT(CR0_CLOCK_COMPARATOR_SIGN_BIT)
+#define CR0_CRYPTOGRAPHY_COUNTER BIT(CR0_CRYPTOGRAPHY_COUNTER_BIT)
+#define CR0_PAI_EXTENSION BIT(CR0_PAI_EXTENSION_BIT)
+#define CR0_CPUMF_EXTRACTION_AUTH BIT(CR0_CPUMF_EXTRACTION_AUTH_BIT)
+#define CR0_WARNING_TRACK BIT(CR0_WARNING_TRACK_BIT)
+#define CR0_LOW_ADDRESS_PROTECTION BIT(CR0_LOW_ADDRESS_PROTECTION_BIT)
+#define CR0_FETCH_PROTECTION_OVERRIDE BIT(CR0_FETCH_PROTECTION_OVERRIDE_BIT)
+#define CR0_STORAGE_PROTECTION_OVERRIDE BIT(CR0_STORAGE_PROTECTION_OVERRIDE_BIT)
+#define CR0_EDAT BIT(CR0_EDAT_BIT)
+#define CR0_INSTRUCTION_EXEC_PROTECTION BIT(CR0_INSTRUCTION_EXEC_PROTECTION_BIT)
+#define CR0_VECTOR BIT(CR0_VECTOR_BIT)
+#define CR0_MALFUNCTION_ALERT_SUBMASK BIT(CR0_MALFUNCTION_ALERT_SUBMASK_BIT)
+#define CR0_EMERGENCY_SIGNAL_SUBMASK BIT(CR0_EMERGENCY_SIGNAL_SUBMASK_BIT)
+#define CR0_EXTERNAL_CALL_SUBMASK BIT(CR0_EXTERNAL_CALL_SUBMASK_BIT)
+#define CR0_CLOCK_COMPARATOR_SUBMASK BIT(CR0_CLOCK_COMPARATOR_SUBMASK_BIT)
+#define CR0_CPU_TIMER_SUBMASK BIT(CR0_CPU_TIMER_SUBMASK_BIT)
+#define CR0_SERVICE_SIGNAL_SUBMASK BIT(CR0_SERVICE_SIGNAL_SUBMASK_BIT)
+#define CR0_UNUSED_56 BIT(CR0_UNUSED_56_BIT)
+#define CR0_INTERRUPT_KEY_SUBMASK BIT(CR0_INTERRUPT_KEY_SUBMASK_BIT)
+#define CR0_MEASUREMENT_ALERT_SUBMASK BIT(CR0_MEASUREMENT_ALERT_SUBMASK_BIT)
+#define CR0_ETR_SUBMASK BIT(CR0_ETR_SUBMASK_BIT)
+#define CR0_IUCV BIT(CR0_IUCV_BIT)
+
+#define CR2_MIO_ADDRESSING_BIT (63 - 58)
+#define CR2_GUARDED_STORAGE_BIT (63 - 59)
+
+#define CR2_MIO_ADDRESSING BIT(CR2_MIO_ADDRESSING_BIT)
+#define CR2_GUARDED_STORAGE BIT(CR2_GUARDED_STORAGE_BIT)
+
+#define CR14_UNUSED_32_BIT (63 - 32)
+#define CR14_UNUSED_33_BIT (63 - 33)
+#define CR14_CHANNEL_REPORT_SUBMASK_BIT (63 - 35)
+#define CR14_RECOVERY_SUBMASK_BIT (63 - 36)
+#define CR14_DEGRADATION_SUBMASK_BIT (63 - 37)
+#define CR14_EXTERNAL_DAMAGE_SUBMASK_BIT (63 - 38)
+#define CR14_WARNING_SUBMASK_BIT (63 - 39)
+
+#define CR14_UNUSED_32 BIT(CR14_UNUSED_32_BIT)
+#define CR14_UNUSED_33 BIT(CR14_UNUSED_33_BIT)
+#define CR14_CHANNEL_REPORT_SUBMASK BIT(CR14_CHANNEL_REPORT_SUBMASK_BIT)
+#define CR14_RECOVERY_SUBMASK BIT(CR14_RECOVERY_SUBMASK_BIT)
+#define CR14_DEGRADATION_SUBMASK BIT(CR14_DEGRADATION_SUBMASK_BIT)
+#define CR14_EXTERNAL_DAMAGE_SUBMASK BIT(CR14_EXTERNAL_DAMAGE_SUBMASK_BIT)
+#define CR14_WARNING_SUBMASK BIT(CR14_WARNING_SUBMASK_BIT)
+
+#ifndef __ASSEMBLY__
+
+#include <linux/bug.h>
+
+struct ctlreg {
+ unsigned long val;
+};
+
+#define __local_ctl_load(low, high, array) do { \
+ struct addrtype { \
+ char _[sizeof(array)]; \
+ }; \
+ int _high = high; \
+ int _low = low; \
+ int _esize; \
+ \
+ _esize = (_high - _low + 1) * sizeof(struct ctlreg); \
+ BUILD_BUG_ON(sizeof(struct addrtype) != _esize); \
+ typecheck(struct ctlreg, array[0]); \
+ asm volatile( \
+ " lctlg %[_low],%[_high],%[_arr]\n" \
+ : \
+ : [_arr] "Q" (*(struct addrtype *)(&array)), \
+ [_low] "i" (low), [_high] "i" (high) \
+ : "memory"); \
+} while (0)
+
+#define __local_ctl_store(low, high, array) do { \
+ struct addrtype { \
+ char _[sizeof(array)]; \
+ }; \
+ int _high = high; \
+ int _low = low; \
+ int _esize; \
+ \
+ _esize = (_high - _low + 1) * sizeof(struct ctlreg); \
+ BUILD_BUG_ON(sizeof(struct addrtype) != _esize); \
+ typecheck(struct ctlreg, array[0]); \
+ asm volatile( \
+ " stctg %[_low],%[_high],%[_arr]\n" \
+ : [_arr] "=Q" (*(struct addrtype *)(&array)) \
+ : [_low] "i" (low), [_high] "i" (high)); \
+} while (0)
+
+static __always_inline void local_ctl_load(unsigned int cr, struct ctlreg *reg)
+{
+ asm volatile(
+ " lctlg %[cr],%[cr],%[reg]\n"
+ :
+ : [reg] "Q" (*reg), [cr] "i" (cr)
+ : "memory");
+}
+
+static __always_inline void local_ctl_store(unsigned int cr, struct ctlreg *reg)
+{
+ asm volatile(
+ " stctg %[cr],%[cr],%[reg]\n"
+ : [reg] "=Q" (*reg)
+ : [cr] "i" (cr));
+}
+
+static __always_inline struct ctlreg local_ctl_set_bit(unsigned int cr, unsigned int bit)
+{
+ struct ctlreg new, old;
+
+ local_ctl_store(cr, &old);
+ new = old;
+ new.val |= 1UL << bit;
+ local_ctl_load(cr, &new);
+ return old;
+}
+
+static __always_inline struct ctlreg local_ctl_clear_bit(unsigned int cr, unsigned int bit)
+{
+ struct ctlreg new, old;
+
+ local_ctl_store(cr, &old);
+ new = old;
+ new.val &= ~(1UL << bit);
+ local_ctl_load(cr, &new);
+ return old;
+}
+
+struct lowcore;
+
+void system_ctlreg_lock(void);
+void system_ctlreg_unlock(void);
+void system_ctlreg_init_save_area(struct lowcore *lc);
+void system_ctlreg_modify(unsigned int cr, unsigned long data, int request);
+
+enum {
+ CTLREG_SET_BIT,
+ CTLREG_CLEAR_BIT,
+ CTLREG_LOAD,
+};
+
+static inline void system_ctl_set_bit(unsigned int cr, unsigned int bit)
+{
+ system_ctlreg_modify(cr, bit, CTLREG_SET_BIT);
+}
+
+static inline void system_ctl_clear_bit(unsigned int cr, unsigned int bit)
+{
+ system_ctlreg_modify(cr, bit, CTLREG_CLEAR_BIT);
+}
+
+static inline void system_ctl_load(unsigned int cr, struct ctlreg *reg)
+{
+ system_ctlreg_modify(cr, reg->val, CTLREG_LOAD);
+}
+
+union ctlreg0 {
+ unsigned long val;
+ struct ctlreg reg;
+ struct {
+ unsigned long : 8;
+ unsigned long tcx : 1; /* Transactional-Execution control */
+ unsigned long pifo : 1; /* Transactional-Execution Program-
+ Interruption-Filtering Override */
+ unsigned long : 3;
+ unsigned long ccc : 1; /* Cryptography counter control */
+ unsigned long pec : 1; /* PAI extension control */
+ unsigned long : 17;
+ unsigned long : 3;
+ unsigned long lap : 1; /* Low-address-protection control */
+ unsigned long : 4;
+ unsigned long edat : 1; /* Enhanced-DAT-enablement control */
+ unsigned long : 2;
+ unsigned long iep : 1; /* Instruction-Execution-Protection */
+ unsigned long : 1;
+ unsigned long afp : 1; /* AFP-register control */
+ unsigned long vx : 1; /* Vector enablement control */
+ unsigned long : 7;
+ unsigned long sssm : 1; /* Service signal subclass mask */
+ unsigned long : 9;
+ };
+};
+
+union ctlreg2 {
+ unsigned long val;
+ struct ctlreg reg;
+ struct {
+ unsigned long : 33;
+ unsigned long ducto : 25;
+ unsigned long : 1;
+ unsigned long gse : 1;
+ unsigned long : 1;
+ unsigned long tds : 1;
+ unsigned long tdc : 2;
+ };
+};
+
+union ctlreg5 {
+ unsigned long val;
+ struct ctlreg reg;
+ struct {
+ unsigned long : 33;
+ unsigned long pasteo: 25;
+ unsigned long : 6;
+ };
+};
+
+union ctlreg15 {
+ unsigned long val;
+ struct ctlreg reg;
+ struct {
+ unsigned long lsea : 61;
+ unsigned long : 3;
+ };
+};
+
+#endif /* __ASSEMBLY__ */
+#endif /* __ASM_S390_CTLREG_H */
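Illustrative sketch, not part of the diff: the typed struct ctlreg interface replaces the old __ctl_set_bit()/smp_ctl_set_bit() pair with local_ctl_set_bit() for the current CPU and system_ctl_set_bit() for all CPUs plus the save area. The helper functions below are hypothetical.

        #include <asm/ctlreg.h>

        /* Hypothetical: enable low-address protection on this CPU only. */
        static void example_enable_lap_local(void)
        {
                /* The previous CR0 contents are returned, should the caller want to restore them. */
                struct ctlreg old = local_ctl_set_bit(0, CR0_LOW_ADDRESS_PROTECTION_BIT);

                (void)old;
        }

        /* Hypothetical: enable low-address protection machine-wide. */
        static void example_enable_lap_system(void)
        {
                system_ctl_set_bit(0, CR0_LOW_ADDRESS_PROTECTION_BIT);
        }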
diff --git a/arch/s390/include/asm/debug.h b/arch/s390/include/asm/debug.h
index 310134015541..ccd4e148b5ed 100644
--- a/arch/s390/include/asm/debug.h
+++ b/arch/s390/include/asm/debug.h
@@ -2,17 +2,18 @@
/*
* S/390 debug facility
*
- * Copyright IBM Corp. 1999, 2000
+ * Copyright IBM Corp. 1999, 2020
*/
-#ifndef DEBUG_H
-#define DEBUG_H
+#ifndef _ASM_S390_DEBUG_H
+#define _ASM_S390_DEBUG_H
#include <linux/string.h>
#include <linux/spinlock.h>
#include <linux/kernel.h>
#include <linux/time.h>
#include <linux/refcount.h>
-#include <uapi/asm/debug.h>
+#include <linux/fs.h>
+#include <linux/init.h>
#define DEBUG_MAX_LEVEL 6 /* debug levels range from 0 to 6 */
#define DEBUG_OFF_LEVEL -1 /* level where debug is switched off */
@@ -26,6 +27,16 @@
#define DEBUG_DATA(entry) (char *)(entry + 1) /* data is stored behind */
/* the entry information */
+#define __DEBUG_FEATURE_VERSION 3 /* version of debug feature */
+
+struct __debug_entry {
+ unsigned long clock : 60;
+ unsigned long exception : 1;
+ unsigned long level : 3;
+ void *caller;
+ unsigned short cpu;
+} __packed;
+
typedef struct __debug_entry debug_entry_t;
struct debug_view;
@@ -82,7 +93,6 @@ struct debug_view {
};
extern struct debug_view debug_hex_ascii_view;
-extern struct debug_view debug_raw_view;
extern struct debug_view debug_sprintf_view;
/* do NOT use the _common functions */
@@ -212,7 +222,7 @@ static inline debug_entry_t *debug_text_event(debug_info_t *id, int level,
/*
* IMPORTANT: Use "%s" in sprintf format strings with care! Only pointers are
- * stored in the s390dbf. See Documentation/s390/s390dbf.rst for more details!
+ * stored in the s390dbf. See Documentation/arch/s390/s390dbf.rst for more details!
*/
extern debug_entry_t *
__debug_sprintf_event(debug_info_t *id, int level, char *string, ...)
@@ -340,7 +350,7 @@ static inline debug_entry_t *debug_text_exception(debug_info_t *id, int level,
/*
* IMPORTANT: Use "%s" in sprintf format strings with care! Only pointers are
- * stored in the s390dbf. See Documentation/s390/s390dbf.rst for more details!
+ * stored in the s390dbf. See Documentation/arch/s390/s390dbf.rst for more details!
*/
extern debug_entry_t *
__debug_sprintf_exception(debug_info_t *id, int level, char *string, ...)
@@ -382,38 +392,99 @@ int debug_register_view(debug_info_t *id, struct debug_view *view);
int debug_unregister_view(debug_info_t *id, struct debug_view *view);
+#ifndef MODULE
+
/*
- define the debug levels:
- - 0 No debugging output to console or syslog
- - 1 Log internal errors to syslog, ignore check conditions
- - 2 Log internal errors and check conditions to syslog
- - 3 Log internal errors to console, log check conditions to syslog
- - 4 Log internal errors and check conditions to console
- - 5 panic on internal errors, log check conditions to console
- - 6 panic on both, internal errors and check conditions
+ * Note: Initial page and area numbers must be fixed to allow static
+ * initialization. This enables very early tracing. Changes to these values
+ * must be reflected in __DEFINE_STATIC_AREA.
+ */
+#define EARLY_PAGES 8
+#define EARLY_AREAS 1
+
+#define VNAME(var, suffix) __##var##_##suffix
+
+/*
+ * Define static areas for early trace data. During boot debug_register_static()
+ * will replace these with dynamically allocated areas to allow custom page and
+ * area sizes, and dynamic resizing.
+ */
+#define __DEFINE_STATIC_AREA(var) \
+static char VNAME(var, data)[EARLY_PAGES][PAGE_SIZE] __initdata; \
+static debug_entry_t *VNAME(var, pages)[EARLY_PAGES] __initdata = { \
+ (debug_entry_t *)VNAME(var, data)[0], \
+ (debug_entry_t *)VNAME(var, data)[1], \
+ (debug_entry_t *)VNAME(var, data)[2], \
+ (debug_entry_t *)VNAME(var, data)[3], \
+ (debug_entry_t *)VNAME(var, data)[4], \
+ (debug_entry_t *)VNAME(var, data)[5], \
+ (debug_entry_t *)VNAME(var, data)[6], \
+ (debug_entry_t *)VNAME(var, data)[7], \
+}; \
+static debug_entry_t **VNAME(var, areas)[EARLY_AREAS] __initdata = { \
+ (debug_entry_t **)VNAME(var, pages), \
+}; \
+static int VNAME(var, active_pages)[EARLY_AREAS] __initdata; \
+static int VNAME(var, active_entries)[EARLY_AREAS] __initdata
+
+#define __DEBUG_INFO_INIT(var, _name, _buf_size) { \
+ .next = NULL, \
+ .prev = NULL, \
+ .ref_count = REFCOUNT_INIT(1), \
+ .lock = __SPIN_LOCK_UNLOCKED(var.lock), \
+ .level = DEBUG_DEFAULT_LEVEL, \
+ .nr_areas = EARLY_AREAS, \
+ .pages_per_area = EARLY_PAGES, \
+ .buf_size = (_buf_size), \
+ .entry_size = sizeof(debug_entry_t) + (_buf_size), \
+ .areas = VNAME(var, areas), \
+ .active_area = 0, \
+ .active_pages = VNAME(var, active_pages), \
+ .active_entries = VNAME(var, active_entries), \
+ .debugfs_root_entry = NULL, \
+ .debugfs_entries = { NULL }, \
+ .views = { NULL }, \
+ .name = (_name), \
+ .mode = 0600, \
+}
+
+#define __REGISTER_STATIC_DEBUG_INFO(var, name, pages, areas, view) \
+static int __init VNAME(var, reg)(void) \
+{ \
+ debug_register_static(&var, (pages), (areas)); \
+ debug_register_view(&var, (view)); \
+ return 0; \
+} \
+arch_initcall(VNAME(var, reg))
+
+/**
+ * DEFINE_STATIC_DEBUG_INFO - Define static debug_info_t
+ *
+ * @var: Name of debug_info_t variable
+ * @name: Name of debug log (e.g. used for debugfs entry)
+ * @pages: Number of pages per area
+ * @nr_areas: Number of debug areas
+ * @buf_size: Size of data area in each debug entry
+ * @view: Pointer to debug view struct
+ *
+ * Define a static debug_info_t for early tracing. The associated debugfs log
+ * is automatically registered with the specified debug view.
+ *
+ * Important: Users of this macro must not call any of the
+ * debug_register/_unregister() functions for this debug_info_t!
+ *
+ * Note: Tracing will start with a fixed number of initial pages and areas.
+ * The debug area will be changed to use the specified numbers during
+ * arch_initcall.
*/
+#define DEFINE_STATIC_DEBUG_INFO(var, name, pages, nr_areas, buf_size, view) \
+__DEFINE_STATIC_AREA(var); \
+static debug_info_t __refdata var = \
+ __DEBUG_INFO_INIT(var, (name), (buf_size)); \
+__REGISTER_STATIC_DEBUG_INFO(var, name, pages, nr_areas, view)
+
+void debug_register_static(debug_info_t *id, int pages_per_area, int nr_areas);
+
+#endif /* MODULE */
-#ifndef DEBUG_LEVEL
-#define DEBUG_LEVEL 4
-#endif
-
-#define INTERNAL_ERRMSG(x,y...) "E" __FILE__ "%d: " x, __LINE__, y
-#define INTERNAL_WRNMSG(x,y...) "W" __FILE__ "%d: " x, __LINE__, y
-#define INTERNAL_INFMSG(x,y...) "I" __FILE__ "%d: " x, __LINE__, y
-#define INTERNAL_DEBMSG(x,y...) "D" __FILE__ "%d: " x, __LINE__, y
-
-#if DEBUG_LEVEL > 0
-#define PRINT_DEBUG(x...) printk(KERN_DEBUG PRINTK_HEADER x)
-#define PRINT_INFO(x...) printk(KERN_INFO PRINTK_HEADER x)
-#define PRINT_WARN(x...) printk(KERN_WARNING PRINTK_HEADER x)
-#define PRINT_ERR(x...) printk(KERN_ERR PRINTK_HEADER x)
-#define PRINT_FATAL(x...) panic(PRINTK_HEADER x)
-#else
-#define PRINT_DEBUG(x...) printk(KERN_DEBUG PRINTK_HEADER x)
-#define PRINT_INFO(x...) printk(KERN_DEBUG PRINTK_HEADER x)
-#define PRINT_WARN(x...) printk(KERN_DEBUG PRINTK_HEADER x)
-#define PRINT_ERR(x...) printk(KERN_DEBUG PRINTK_HEADER x)
-#define PRINT_FATAL(x...) printk(KERN_DEBUG PRINTK_HEADER x)
-#endif /* DASD_DEBUG */
-
-#endif /* DEBUG_H */
+#endif /* _ASM_S390_DEBUG_H */
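A minimal sketch, not part of the patch, of how built-in code might use the new static definition for early tracing; the log name, sizes and event are made up for the example. The macro is unavailable to modules, matching the #ifndef MODULE guard above.

        #include <asm/debug.h>

        /* Hypothetical early trace log: 4 pages per area, 1 area, 16 bytes per entry. */
        DEFINE_STATIC_DEBUG_INFO(example_dbf, "example", 4, 1, 16, &debug_sprintf_view);

        static void example_trace(int rc)
        {
                /* Usable even before debug_register_static() has run during arch_initcall. */
                debug_sprintf_event(&example_dbf, DEBUG_DEFAULT_LEVEL, "rc=%d\n", rc);
        }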
diff --git a/arch/s390/include/asm/delay.h b/arch/s390/include/asm/delay.h
index 898323fd93d2..21a8fe18fe66 100644
--- a/arch/s390/include/asm/delay.h
+++ b/arch/s390/include/asm/delay.h
@@ -13,13 +13,12 @@
#ifndef _S390_DELAY_H
#define _S390_DELAY_H
-void __ndelay(unsigned long long nsecs);
-void __udelay(unsigned long long usecs);
-void udelay_simple(unsigned long long usecs);
+void __ndelay(unsigned long nsecs);
+void __udelay(unsigned long usecs);
void __delay(unsigned long loops);
-#define ndelay(n) __ndelay((unsigned long long) (n))
-#define udelay(n) __udelay((unsigned long long) (n))
-#define mdelay(n) __udelay((unsigned long long) (n) * 1000)
+#define ndelay(n) __ndelay((unsigned long)(n))
+#define udelay(n) __udelay((unsigned long)(n))
+#define mdelay(n) __udelay((unsigned long)(n) * 1000)
#endif /* defined(_S390_DELAY_H) */
diff --git a/arch/s390/include/asm/diag.h b/arch/s390/include/asm/diag.h
index 0036eab14391..bed804137537 100644
--- a/arch/s390/include/asm/diag.h
+++ b/arch/s390/include/asm/diag.h
@@ -11,6 +11,8 @@
#include <linux/if_ether.h>
#include <linux/percpu.h>
+#include <asm/asm-extable.h>
+#include <asm/cio.h>
enum diag_stat_enum {
DIAG_STAT_X008,
@@ -19,6 +21,7 @@ enum diag_stat_enum {
DIAG_STAT_X014,
DIAG_STAT_X044,
DIAG_STAT_X064,
+ DIAG_STAT_X08C,
DIAG_STAT_X09C,
DIAG_STAT_X0DC,
DIAG_STAT_X204,
@@ -33,6 +36,7 @@ enum diag_stat_enum {
DIAG_STAT_X304,
DIAG_STAT_X308,
DIAG_STAT_X318,
+ DIAG_STAT_X320,
DIAG_STAT_X500,
NR_DIAG_STAT
};
@@ -47,8 +51,8 @@ static inline void diag10_range(unsigned long start_pfn, unsigned long num_pfn)
{
unsigned long start_addr, end_addr;
- start_addr = start_pfn << PAGE_SHIFT;
- end_addr = (start_pfn + num_pfn - 1) << PAGE_SHIFT;
+ start_addr = pfn_to_phys(start_pfn);
+ end_addr = pfn_to_phys(start_pfn + num_pfn - 1);
diag_stat_inc(DIAG_STAT_X010);
asm volatile(
@@ -78,10 +82,20 @@ struct diag210 {
u8 vrdccrty; /* real device type (output) */
u8 vrdccrmd; /* real device model (output) */
u8 vrdccrft; /* real device feature (output) */
-} __attribute__((packed, aligned(4)));
+} __packed __aligned(4);
extern int diag210(struct diag210 *addr);
+struct diag8c {
+ u8 flags;
+ u8 num_partitions;
+ u16 width;
+ u16 height;
+ u8 data[];
+} __packed __aligned(4);
+
+extern int diag8c(struct diag8c *out, struct ccw_dev_id *devno);
+
/* bit is set in flags, when physical cpu info is included in diag 204 data */
#define DIAG204_LPAR_PHYS_FLG 0x80
#define DIAG204_LPAR_NAME_LEN 8 /* lpar name len in diag 204 data */
@@ -95,6 +109,8 @@ enum diag204_sc {
DIAG204_SUBC_STIB7 = 7
};
+#define DIAG204_SUBCODE_MASK 0xffff
+
/* The two available diag 204 data formats */
enum diag204_format {
DIAG204_INFO_SIMPLE = 0,
@@ -298,10 +314,8 @@ struct diag26c_mac_resp {
union diag318_info {
unsigned long val;
struct {
- unsigned int cpnc : 8;
- unsigned int cpvc_linux : 24;
- unsigned char cpvc_distro[3];
- unsigned char zero;
+ unsigned long cpnc : 8;
+ unsigned long cpvc : 56;
};
};
@@ -311,14 +325,27 @@ int diag26c(void *req, void *resp, enum diag26c_sc subcode);
struct hypfs_diag0c_entry;
+/*
+ * This structure must contain only pointers/references into
+ * the AMODE31 text section.
+ */
struct diag_ops {
int (*diag210)(struct diag210 *addr);
int (*diag26c)(void *req, void *resp, enum diag26c_sc subcode);
int (*diag14)(unsigned long rx, unsigned long ry1, unsigned long subcode);
+ int (*diag8c)(struct diag8c *addr, struct ccw_dev_id *devno, size_t len);
void (*diag0c)(struct hypfs_diag0c_entry *entry);
void (*diag308_reset)(void);
};
-extern struct diag_ops diag_dma_ops;
-extern struct diag210 *__diag210_tmp_dma;
+extern struct diag_ops diag_amode31_ops;
+extern struct diag210 *__diag210_tmp_amode31;
+
+int _diag210_amode31(struct diag210 *addr);
+int _diag26c_amode31(void *req, void *resp, enum diag26c_sc subcode);
+int _diag14_amode31(unsigned long rx, unsigned long ry1, unsigned long subcode);
+void _diag0c_amode31(struct hypfs_diag0c_entry *entry);
+void _diag308_reset_amode31(void);
+int _diag8c_amode31(struct diag8c *addr, struct ccw_dev_id *devno, size_t len);
+
#endif /* _ASM_S390_DIAG_H */
diff --git a/arch/s390/include/asm/dma.h b/arch/s390/include/asm/dma.h
index 6f26f35d4a71..7fe3e31956d7 100644
--- a/arch/s390/include/asm/dma.h
+++ b/arch/s390/include/asm/dma.h
@@ -2,19 +2,13 @@
#ifndef _ASM_S390_DMA_H
#define _ASM_S390_DMA_H
-#include <asm/io.h>
+#include <linux/io.h>
/*
 * MAX_DMA_ADDRESS is ambiguous because on s390 it's completely unrelated
* to DMA. It _is_ used for the s390 memory zone split at 2GB caused
* by the 31 bit heritage.
*/
-#define MAX_DMA_ADDRESS 0x80000000
-
-#ifdef CONFIG_PCI
-extern int isa_dma_bridge_buggy;
-#else
-#define isa_dma_bridge_buggy (0)
-#endif
+#define MAX_DMA_ADDRESS __va(0x80000000)
#endif /* _ASM_S390_DMA_H */
diff --git a/arch/s390/include/asm/eadm.h b/arch/s390/include/asm/eadm.h
index bb63b2afdf6f..06f795855af7 100644
--- a/arch/s390/include/asm/eadm.h
+++ b/arch/s390/include/asm/eadm.h
@@ -78,7 +78,7 @@ struct aob {
struct aob_rq_header {
struct scm_device *scmdev;
- char data[0];
+ char data[];
};
struct scm_device {
@@ -105,7 +105,7 @@ enum scm_event {SCM_CHANGE, SCM_AVAIL};
struct scm_driver {
struct device_driver drv;
int (*probe) (struct scm_device *scmdev);
- int (*remove) (struct scm_device *scmdev);
+ void (*remove) (struct scm_device *scmdev);
void (*notify) (struct scm_device *scmdev, enum scm_event event);
void (*handler) (struct scm_device *scmdev, void *data,
blk_status_t error);
diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h
index 5775fc22f410..70a30ae258b7 100644
--- a/arch/s390/include/asm/elf.h
+++ b/arch/s390/include/asm/elf.h
@@ -91,29 +91,57 @@
/* Keep this the last entry. */
#define R_390_NUM 61
-/* Bits present in AT_HWCAP. */
-#define HWCAP_S390_ESAN3 1
-#define HWCAP_S390_ZARCH 2
-#define HWCAP_S390_STFLE 4
-#define HWCAP_S390_MSA 8
-#define HWCAP_S390_LDISP 16
-#define HWCAP_S390_EIMM 32
-#define HWCAP_S390_DFP 64
-#define HWCAP_S390_HPAGE 128
-#define HWCAP_S390_ETF3EH 256
-#define HWCAP_S390_HIGH_GPRS 512
-#define HWCAP_S390_TE 1024
-#define HWCAP_S390_VXRS 2048
-#define HWCAP_S390_VXRS_BCD 4096
-#define HWCAP_S390_VXRS_EXT 8192
-#define HWCAP_S390_GS 16384
-#define HWCAP_S390_VXRS_EXT2 32768
-#define HWCAP_S390_VXRS_PDE 65536
-#define HWCAP_S390_SORT 131072
-#define HWCAP_S390_DFLT 262144
+enum {
+ HWCAP_NR_ESAN3 = 0,
+ HWCAP_NR_ZARCH = 1,
+ HWCAP_NR_STFLE = 2,
+ HWCAP_NR_MSA = 3,
+ HWCAP_NR_LDISP = 4,
+ HWCAP_NR_EIMM = 5,
+ HWCAP_NR_DFP = 6,
+ HWCAP_NR_HPAGE = 7,
+ HWCAP_NR_ETF3EH = 8,
+ HWCAP_NR_HIGH_GPRS = 9,
+ HWCAP_NR_TE = 10,
+ HWCAP_NR_VXRS = 11,
+ HWCAP_NR_VXRS_BCD = 12,
+ HWCAP_NR_VXRS_EXT = 13,
+ HWCAP_NR_GS = 14,
+ HWCAP_NR_VXRS_EXT2 = 15,
+ HWCAP_NR_VXRS_PDE = 16,
+ HWCAP_NR_SORT = 17,
+ HWCAP_NR_DFLT = 18,
+ HWCAP_NR_VXRS_PDE2 = 19,
+ HWCAP_NR_NNPA = 20,
+ HWCAP_NR_PCI_MIO = 21,
+ HWCAP_NR_SIE = 22,
+ HWCAP_NR_MAX
+};
-/* Internal bits, not exposed via elf */
-#define HWCAP_INT_SIE 1UL
+/* Bits present in AT_HWCAP. */
+#define HWCAP_ESAN3 BIT(HWCAP_NR_ESAN3)
+#define HWCAP_ZARCH BIT(HWCAP_NR_ZARCH)
+#define HWCAP_STFLE BIT(HWCAP_NR_STFLE)
+#define HWCAP_MSA BIT(HWCAP_NR_MSA)
+#define HWCAP_LDISP BIT(HWCAP_NR_LDISP)
+#define HWCAP_EIMM BIT(HWCAP_NR_EIMM)
+#define HWCAP_DFP BIT(HWCAP_NR_DFP)
+#define HWCAP_HPAGE BIT(HWCAP_NR_HPAGE)
+#define HWCAP_ETF3EH BIT(HWCAP_NR_ETF3EH)
+#define HWCAP_HIGH_GPRS BIT(HWCAP_NR_HIGH_GPRS)
+#define HWCAP_TE BIT(HWCAP_NR_TE)
+#define HWCAP_VXRS BIT(HWCAP_NR_VXRS)
+#define HWCAP_VXRS_BCD BIT(HWCAP_NR_VXRS_BCD)
+#define HWCAP_VXRS_EXT BIT(HWCAP_NR_VXRS_EXT)
+#define HWCAP_GS BIT(HWCAP_NR_GS)
+#define HWCAP_VXRS_EXT2 BIT(HWCAP_NR_VXRS_EXT2)
+#define HWCAP_VXRS_PDE BIT(HWCAP_NR_VXRS_PDE)
+#define HWCAP_SORT BIT(HWCAP_NR_SORT)
+#define HWCAP_DFLT BIT(HWCAP_NR_DFLT)
+#define HWCAP_VXRS_PDE2 BIT(HWCAP_NR_VXRS_PDE2)
+#define HWCAP_NNPA BIT(HWCAP_NR_NNPA)
+#define HWCAP_PCI_MIO BIT(HWCAP_NR_PCI_MIO)
+#define HWCAP_SIE BIT(HWCAP_NR_SIE)
/*
* These are used to set parameters in the core dumps.
@@ -144,10 +172,6 @@ typedef s390_compat_regs compat_elf_gregset_t;
#include <linux/sched/mm.h> /* for task_struct */
#include <asm/mmu_context.h>
-#include <asm/vdso.h>
-
-extern unsigned int vdso_enabled;
-
/*
* This is used to ensure we don't load something for the wrong architecture.
*/
@@ -176,7 +200,7 @@ struct arch_elf_state {
!current->mm->context.alloc_pgste) { \
set_thread_flag(TIF_PGSTE); \
set_pt_regs_flag(task_pt_regs(current), \
- PIF_SYSCALL_RESTART); \
+ PIF_EXECVE_PGSTE_RESTART); \
_state->rc = -EAGAIN; \
} \
_state->rc; \
@@ -213,10 +237,6 @@ struct arch_elf_state {
extern unsigned long elf_hwcap;
#define ELF_HWCAP (elf_hwcap)
-/* Internal hardware capabilities, not exposed via elf */
-
-extern unsigned long int_hwcap;
-
/* This yields a string that ld.so will use to load implementation
specific libraries for optimization. This is more specific in
intent than poking at uname or /proc/cpuinfo.
@@ -233,8 +253,7 @@ extern char elf_platform[];
do { \
set_personality(PER_LINUX | \
(current->personality & (~PER_MASK))); \
- current->thread.sys_call_table = \
- (unsigned long) &sys_call_table; \
+ current->thread.sys_call_table = sys_call_table; \
} while (0)
#else /* CONFIG_COMPAT */
#define SET_PERSONALITY(ex) \
@@ -245,11 +264,11 @@ do { \
if ((ex).e_ident[EI_CLASS] == ELFCLASS32) { \
set_thread_flag(TIF_31BIT); \
current->thread.sys_call_table = \
- (unsigned long) &sys_call_table_emu; \
+ sys_call_table_emu; \
} else { \
clear_thread_flag(TIF_31BIT); \
current->thread.sys_call_table = \
- (unsigned long) &sys_call_table; \
+ sys_call_table; \
} \
} while (0)
#endif /* CONFIG_COMPAT */
@@ -269,11 +288,10 @@ do { \
#define STACK_RND_MASK MMAP_RND_MASK
/* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */
-#define ARCH_DLINFO \
-do { \
- if (vdso_enabled) \
- NEW_AUX_ENT(AT_SYSINFO_EHDR, \
- (unsigned long)current->mm->context.vdso_base); \
+#define ARCH_DLINFO \
+do { \
+ NEW_AUX_ENT(AT_SYSINFO_EHDR, \
+ (unsigned long)current->mm->context.vdso_base); \
} while (0)
struct linux_binprm;
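Not part of the patch: a small sketch of the relationship between the new enum and the mask constants; elf_hwcap is the capability word declared earlier in this header. The helper is hypothetical.

        #include <linux/bits.h>
        #include <linux/types.h>
        #include <asm/elf.h>

        /* Hypothetical check: HWCAP_VXRS is BIT(HWCAP_NR_VXRS), i.e. bit 11 of elf_hwcap. */
        static bool example_has_vxrs(void)
        {
                return (elf_hwcap & HWCAP_VXRS) != 0;
        }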
diff --git a/arch/s390/include/asm/entry-common.h b/arch/s390/include/asm/entry-common.h
new file mode 100644
index 000000000000..fdd319a622b0
--- /dev/null
+++ b/arch/s390/include/asm/entry-common.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_S390_ENTRY_COMMON_H
+#define ARCH_S390_ENTRY_COMMON_H
+
+#include <linux/sched.h>
+#include <linux/audit.h>
+#include <linux/randomize_kstack.h>
+#include <linux/processor.h>
+#include <linux/uaccess.h>
+#include <asm/timex.h>
+#include <asm/fpu/api.h>
+#include <asm/pai.h>
+
+#define ARCH_EXIT_TO_USER_MODE_WORK (_TIF_GUARDED_STORAGE | _TIF_PER_TRAP)
+
+void do_per_trap(struct pt_regs *regs);
+
+static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs)
+{
+ if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
+ debug_user_asce(0);
+
+ pai_kernel_enter(regs);
+}
+
+#define arch_enter_from_user_mode arch_enter_from_user_mode
+
+static __always_inline void arch_exit_to_user_mode_work(struct pt_regs *regs,
+ unsigned long ti_work)
+{
+ if (ti_work & _TIF_PER_TRAP) {
+ clear_thread_flag(TIF_PER_TRAP);
+ do_per_trap(regs);
+ }
+
+ if (ti_work & _TIF_GUARDED_STORAGE)
+ gs_load_bc_cb(regs);
+}
+
+#define arch_exit_to_user_mode_work arch_exit_to_user_mode_work
+
+static __always_inline void arch_exit_to_user_mode(void)
+{
+ if (test_cpu_flag(CIF_FPU))
+ __load_fpu_regs();
+
+ if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
+ debug_user_asce(1);
+
+ pai_kernel_exit(current_pt_regs());
+}
+
+#define arch_exit_to_user_mode arch_exit_to_user_mode
+
+static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
+ unsigned long ti_work)
+{
+ choose_random_kstack_offset(get_tod_clock_fast() & 0xff);
+}
+
+#define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare
+
+#endif
diff --git a/arch/s390/include/asm/extable.h b/arch/s390/include/asm/extable.h
index ae27f756b409..af6ba52743e9 100644
--- a/arch/s390/include/asm/extable.h
+++ b/arch/s390/include/asm/extable.h
@@ -1,12 +1,20 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __S390_EXTABLE_H
#define __S390_EXTABLE_H
+
+#include <asm/ptrace.h>
+#include <linux/compiler.h>
+
/*
- * The exception table consists of pairs of addresses: the first is the
- * address of an instruction that is allowed to fault, and the second is
- * the address at which the program should continue. No registers are
- * modified, so it is entirely up to the continuation code to figure out
- * what to do.
+ * The exception table consists of three addresses:
+ *
+ * - Address of an instruction that is allowed to fault.
+ * - Address at which the program should continue.
+ * - Optional address of handler that takes pt_regs * argument and runs in
+ * interrupt context.
+ *
+ * No registers are modified, so it is entirely up to the continuation code
+ * to figure out what to do.
*
* All the routines below use bits of fixup code that are out of line
* with the main instruction path. This means when everything is well,
@@ -17,10 +25,11 @@
struct exception_table_entry
{
int insn, fixup;
+ short type, data;
};
-extern struct exception_table_entry *__start_dma_ex_table;
-extern struct exception_table_entry *__stop_dma_ex_table;
+extern struct exception_table_entry *__start_amode31_ex_table;
+extern struct exception_table_entry *__stop_amode31_ex_table;
const struct exception_table_entry *s390_search_extables(unsigned long addr);
@@ -31,4 +40,33 @@ static inline unsigned long extable_fixup(const struct exception_table_entry *x)
#define ARCH_HAS_RELATIVE_EXTABLE
+static inline void swap_ex_entry_fixup(struct exception_table_entry *a,
+ struct exception_table_entry *b,
+ struct exception_table_entry tmp,
+ int delta)
+{
+ a->fixup = b->fixup + delta;
+ b->fixup = tmp.fixup - delta;
+ a->type = b->type;
+ b->type = tmp.type;
+ a->data = b->data;
+ b->data = tmp.data;
+}
+#define swap_ex_entry_fixup swap_ex_entry_fixup
+
+#ifdef CONFIG_BPF_JIT
+
+bool ex_handler_bpf(const struct exception_table_entry *ex, struct pt_regs *regs);
+
+#else /* !CONFIG_BPF_JIT */
+
+static inline bool ex_handler_bpf(const struct exception_table_entry *ex, struct pt_regs *regs)
+{
+ return false;
+}
+
+#endif /* CONFIG_BPF_JIT */
+
+bool fixup_exception(struct pt_regs *regs);
+
#endif
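For illustration, not part of the diff: the plain two-address form of an extable entry, in the same style as the futex and sfpc_safe snippets elsewhere in this patch. The instruction at 0: may fault; on a fault execution resumes at 1:, leaving the preset -EFAULT in the result register. The probe function itself is hypothetical.

        #include <linux/errno.h>
        #include <asm/asm-extable.h>

        /* Hypothetical probe: load a long from a possibly invalid kernel address. */
        static inline int example_probe_long(unsigned long *val, unsigned long *addr)
        {
                int rc;

                asm volatile(
                        "0:     lg      %[val],%[addr]\n"
                        "       lhi     %[rc],0\n"
                        "1:\n"
                        EX_TABLE(0b, 1b)
                        : [rc] "=d" (rc), [val] "=d" (*val)
                        : [addr] "Q" (*addr), "0" (-EFAULT)
                        : "memory");
                return rc;
        }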
diff --git a/arch/s390/include/asm/facility.h b/arch/s390/include/asm/facility.h
index 68c476b20b57..796007125dff 100644
--- a/arch/s390/include/asm/facility.h
+++ b/arch/s390/include/asm/facility.h
@@ -9,11 +9,18 @@
#define __ASM_FACILITY_H
#include <asm/facility-defs.h>
+
+#include <linux/minmax.h>
#include <linux/string.h>
+#include <linux/types.h>
#include <linux/preempt.h>
+
#include <asm/lowcore.h>
-#define MAX_FACILITY_BIT (sizeof(((struct lowcore *)0)->stfle_fac_list) * 8)
+#define MAX_FACILITY_BIT (sizeof(stfle_fac_list) * 8)
+
+extern u64 stfle_fac_list[16];
+extern u64 alt_stfle_fac_list[16];
static inline void __set_facility(unsigned long nr, void *facilities)
{
@@ -44,7 +51,7 @@ static inline int __test_facility(unsigned long nr, void *facilities)
}
/*
- * The test_facility function uses the bit odering where the MSB is bit 0.
+ * The test_facility function uses the bit ordering where the MSB is bit 0.
* That makes it easier to query facility bits with the bit number as
* documented in the Principles of Operation.
*/
@@ -56,18 +63,20 @@ static inline int test_facility(unsigned long nr)
if (__test_facility(nr, &facilities_als))
return 1;
}
- return __test_facility(nr, &S390_lowcore.stfle_fac_list);
+ return __test_facility(nr, &stfle_fac_list);
}
static inline unsigned long __stfle_asm(u64 *stfle_fac_list, int size)
{
- register unsigned long reg0 asm("0") = size - 1;
+ unsigned long reg0 = size - 1;
asm volatile(
- ".insn s,0xb2b00000,0(%1)" /* stfle */
- : "+d" (reg0)
- : "a" (stfle_fac_list)
- : "memory", "cc");
+ " lgr 0,%[reg0]\n"
+ " .insn s,0xb2b00000,%[list]\n" /* stfle */
+ " lgr %[reg0],0\n"
+ : [reg0] "+&d" (reg0), [list] "+Q" (*stfle_fac_list)
+ :
+ : "memory", "cc", "0");
return reg0;
}
@@ -79,13 +88,15 @@ static inline unsigned long __stfle_asm(u64 *stfle_fac_list, int size)
static inline void __stfle(u64 *stfle_fac_list, int size)
{
unsigned long nr;
+ u32 stfl_fac_list;
asm volatile(
" stfl 0(0)\n"
: "=m" (S390_lowcore.stfl_fac_list));
+ stfl_fac_list = S390_lowcore.stfl_fac_list;
+ memcpy(stfle_fac_list, &stfl_fac_list, 4);
nr = 4; /* bytes stored by stfl */
- memcpy(stfle_fac_list, &S390_lowcore.stfl_fac_list, 4);
- if (S390_lowcore.stfl_fac_list & 0x01000000) {
+ if (stfl_fac_list & 0x01000000) {
/* More facility bits available with stfle */
nr = __stfle_asm(stfle_fac_list, size);
nr = min_t(unsigned long, (nr + 1) * 8, size * 8);
@@ -100,4 +111,10 @@ static inline void stfle(u64 *stfle_fac_list, int size)
preempt_enable();
}
+/**
+ * stfle_size - Actual size of the facility list as specified by stfle
+ * (number of double words)
+ */
+unsigned int stfle_size(void);
+
#endif /* __ASM_FACILITY_H */
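Illustrative only, not part of the patch: test_facility() takes the bit number as documented in the Principles of Operation (MSB = bit 0), and after this change reads the list from the new stfle_fac_list array rather than from the lowcore.

        #include <linux/types.h>
        #include <asm/facility.h>

        /* Hypothetical check: facility bit 129 indicates the vector facility. */
        static bool example_has_vector_facility(void)
        {
                return test_facility(129);
        }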
diff --git a/arch/s390/include/asm/fault.h b/arch/s390/include/asm/fault.h
new file mode 100644
index 000000000000..d326f56603d6
--- /dev/null
+++ b/arch/s390/include/asm/fault.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright IBM Corp. 1999, 2023
+ */
+#ifndef _ASM_S390_FAULT_H
+#define _ASM_S390_FAULT_H
+
+union teid {
+ unsigned long val;
+ struct {
+ unsigned long addr : 52; /* Translation-exception Address */
+ unsigned long fsi : 2; /* Access Exception Fetch/Store Indication */
+ unsigned long : 2;
+ unsigned long b56 : 1;
+ unsigned long : 3;
+ unsigned long b60 : 1;
+ unsigned long b61 : 1;
+ unsigned long as : 2; /* ASCE Identifier */
+ };
+};
+
+enum {
+ TEID_FSI_UNKNOWN = 0, /* Unknown whether fetch or store */
+ TEID_FSI_STORE = 1, /* Exception was due to store operation */
+ TEID_FSI_FETCH = 2 /* Exception was due to fetch operation */
+};
+
+#endif /* _ASM_S390_FAULT_H */
diff --git a/arch/s390/include/asm/fcx.h b/arch/s390/include/asm/fcx.h
index cff0749e9657..29784b4b44f6 100644
--- a/arch/s390/include/asm/fcx.h
+++ b/arch/s390/include/asm/fcx.h
@@ -214,7 +214,7 @@ struct dcw_intrg_data {
u32 :32;
u64 time;
u64 prog_id;
- u8 prog_data[0];
+ u8 prog_data[];
} __attribute__ ((packed));
#define DCW_FLAGS_CC (1 << (7 - 1))
@@ -241,7 +241,7 @@ struct dcw {
u32 :8;
u32 cd_count:8;
u32 count;
- u8 cd[0];
+ u8 cd[];
} __attribute__ ((packed));
#define TCCB_FORMAT_DEFAULT 0x7f
@@ -286,7 +286,7 @@ struct tccb_tcat {
*/
struct tccb {
struct tccb_tcah tcah;
- u8 tca[0];
+ u8 tca[];
} __attribute__ ((packed, aligned(8)));
struct tcw *tcw_get_intrg(struct tcw *tcw);
diff --git a/arch/s390/include/asm/fpu/api.h b/arch/s390/include/asm/fpu/api.h
index 34a7ae68485c..d6ca8bc6ca68 100644
--- a/arch/s390/include/asm/fpu/api.h
+++ b/arch/s390/include/asm/fpu/api.h
@@ -45,24 +45,34 @@
#define _ASM_S390_FPU_API_H
#include <linux/preempt.h>
+#include <asm/asm-extable.h>
+#include <asm/fpu/internal.h>
void save_fpu_regs(void);
+void load_fpu_regs(void);
+void __load_fpu_regs(void);
-static inline int test_fp_ctl(u32 fpc)
+/**
+ * sfpc_safe - Set floating point control register safely.
+ * @fpc: new value for floating point control register
+ *
+ * Set floating point control register. This may lead to an exception,
+ * since a saved value may have been modified by user space (ptrace,
+ * signal return, kvm registers) to an invalid value. In such a case
+ * set the floating point control register to zero.
+ */
+static inline void sfpc_safe(u32 fpc)
{
- u32 orig_fpc;
- int rc;
-
- asm volatile(
- " efpc %1\n"
- " sfpc %2\n"
- "0: sfpc %1\n"
- " la %0,0\n"
- "1:\n"
- EX_TABLE(0b,1b)
- : "=d" (rc), "=&d" (orig_fpc)
- : "d" (fpc), "0" (-EINVAL));
- return rc;
+ asm volatile("\n"
+ "0: sfpc %[fpc]\n"
+ "1: nopr %%r7\n"
+ ".pushsection .fixup, \"ax\"\n"
+ "2: lghi %[fpc],0\n"
+ " jg 0b\n"
+ ".popsection\n"
+ EX_TABLE(1b, 2b)
+ : [fpc] "+d" (fpc)
+ : : "memory");
}
#define KERNEL_FPC 1
@@ -76,7 +86,7 @@ static inline int test_fp_ctl(u32 fpc)
#define KERNEL_VXR_HIGH (KERNEL_VXR_V16V23|KERNEL_VXR_V24V31)
#define KERNEL_VXR (KERNEL_VXR_LOW|KERNEL_VXR_HIGH)
-#define KERNEL_FPR (KERNEL_FPC|KERNEL_VXR_V0V7)
+#define KERNEL_FPR (KERNEL_FPC|KERNEL_VXR_LOW)
struct kernel_fpu;
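Not part of the patch: a minimal sketch of the new contract. The old test_fp_ctl()/-EINVAL dance is gone; a caller simply loads the (possibly user-supplied) value and sfpc_safe() substitutes zero if the load faults. The function below is hypothetical.

        #include <linux/types.h>
        #include <asm/fpu/api.h>

        /* Hypothetical sigreturn-style path: fpc comes from user space and may be invalid. */
        static void example_load_user_fpc(u32 user_fpc)
        {
                sfpc_safe(user_fpc);
        }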
diff --git a/arch/s390/include/asm/fpu/internal.h b/arch/s390/include/asm/fpu/internal.h
index 4a71dbbf76fb..d511c4cf5afb 100644
--- a/arch/s390/include/asm/fpu/internal.h
+++ b/arch/s390/include/asm/fpu/internal.h
@@ -10,9 +10,14 @@
#define _ASM_S390_FPU_INTERNAL_H
#include <linux/string.h>
-#include <asm/ctl_reg.h>
+#include <asm/facility.h>
#include <asm/fpu/types.h>
+static inline bool cpu_has_vx(void)
+{
+ return likely(test_facility(129));
+}
+
static inline void save_vx_regs(__vector128 *vxrs)
{
asm volatile(
@@ -27,7 +32,7 @@ static inline void convert_vx_to_fp(freg_t *fprs, __vector128 *vxrs)
int i;
for (i = 0; i < __NUM_FPRS; i++)
- fprs[i] = *(freg_t *)(vxrs + i);
+ fprs[i].ui = vxrs[i].high;
}
static inline void convert_fp_to_vx(__vector128 *vxrs, freg_t *fprs)
@@ -35,14 +40,14 @@ static inline void convert_fp_to_vx(__vector128 *vxrs, freg_t *fprs)
int i;
for (i = 0; i < __NUM_FPRS; i++)
- *(freg_t *)(vxrs + i) = fprs[i];
+ vxrs[i].high = fprs[i].ui;
}
static inline void fpregs_store(_s390_fp_regs *fpregs, struct fpu *fpu)
{
fpregs->pad = 0;
fpregs->fpc = fpu->fpc;
- if (MACHINE_HAS_VX)
+ if (cpu_has_vx())
convert_vx_to_fp((freg_t *)&fpregs->fprs, fpu->vxrs);
else
memcpy((freg_t *)&fpregs->fprs, fpu->fprs,
@@ -52,7 +57,7 @@ static inline void fpregs_store(_s390_fp_regs *fpregs, struct fpu *fpu)
static inline void fpregs_load(_s390_fp_regs *fpregs, struct fpu *fpu)
{
fpu->fpc = fpregs->fpc;
- if (MACHINE_HAS_VX)
+ if (cpu_has_vx())
convert_fp_to_vx(fpu->vxrs, (freg_t *)&fpregs->fprs);
else
memcpy(fpu->fprs, (freg_t *)&fpregs->fprs,
diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h
index 68d362f8d6c1..5a82b08f03cd 100644
--- a/arch/s390/include/asm/ftrace.h
+++ b/arch/s390/include/asm/ftrace.h
@@ -2,16 +2,9 @@
#ifndef _ASM_S390_FTRACE_H
#define _ASM_S390_FTRACE_H
+#define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
#define ARCH_SUPPORTS_FTRACE_OPS 1
-
-#if defined(CC_USING_HOTPATCH) || defined(CC_USING_NOP_MCOUNT)
#define MCOUNT_INSN_SIZE 6
-#else
-#define MCOUNT_INSN_SIZE 24
-#define MCOUNT_RETURN_FIXUP 18
-#endif
-
-#define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
#ifndef __ASSEMBLY__
@@ -22,72 +15,103 @@
#define ftrace_return_address(n) __builtin_return_address(n)
#endif
-void _mcount(void);
void ftrace_caller(void);
-extern char ftrace_graph_caller_end;
-extern unsigned long ftrace_plt;
+extern void *ftrace_func;
struct dyn_arch_ftrace { };
-#define MCOUNT_ADDR ((unsigned long)_mcount)
+#define MCOUNT_ADDR 0
#define FTRACE_ADDR ((unsigned long)ftrace_caller)
#define KPROBE_ON_FTRACE_NOP 0
#define KPROBE_ON_FTRACE_CALL 1
+struct module;
+struct dyn_ftrace;
+
+bool ftrace_need_init_nop(void);
+#define ftrace_need_init_nop ftrace_need_init_nop
+
+int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec);
+#define ftrace_init_nop ftrace_init_nop
+
static inline unsigned long ftrace_call_adjust(unsigned long addr)
{
return addr;
}
-struct ftrace_insn {
- u16 opc;
- s32 disp;
-} __packed;
+struct ftrace_regs {
+ struct pt_regs regs;
+};
-static inline void ftrace_generate_nop_insn(struct ftrace_insn *insn)
+static __always_inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs *fregs)
{
-#ifdef CONFIG_FUNCTION_TRACER
-#if defined(CC_USING_HOTPATCH) || defined(CC_USING_NOP_MCOUNT)
- /* brcl 0,0 */
- insn->opc = 0xc004;
- insn->disp = 0;
-#else
- /* jg .+24 */
- insn->opc = 0xc0f4;
- insn->disp = MCOUNT_INSN_SIZE / 2;
-#endif
-#endif
+ struct pt_regs *regs = &fregs->regs;
+
+ if (test_pt_regs_flag(regs, PIF_FTRACE_FULL_REGS))
+ return regs;
+ return NULL;
}
-static inline int is_ftrace_nop(struct ftrace_insn *insn)
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+struct fgraph_ret_regs {
+ unsigned long gpr2;
+ unsigned long fp;
+};
+
+static __always_inline unsigned long fgraph_ret_regs_return_value(struct fgraph_ret_regs *ret_regs)
{
-#ifdef CONFIG_FUNCTION_TRACER
-#if defined(CC_USING_HOTPATCH) || defined(CC_USING_NOP_MCOUNT)
- if (insn->disp == 0)
- return 1;
-#else
- if (insn->disp == MCOUNT_INSN_SIZE / 2)
- return 1;
-#endif
-#endif
- return 0;
+ return ret_regs->gpr2;
}
-static inline void ftrace_generate_call_insn(struct ftrace_insn *insn,
- unsigned long ip)
+static __always_inline unsigned long fgraph_ret_regs_frame_pointer(struct fgraph_ret_regs *ret_regs)
{
-#ifdef CONFIG_FUNCTION_TRACER
- unsigned long target;
+ return ret_regs->fp;
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
- /* brasl r0,ftrace_caller */
- target = is_module_addr((void *) ip) ? ftrace_plt : FTRACE_ADDR;
- insn->opc = 0xc005;
- insn->disp = (target - ip) / 2;
-#endif
+static __always_inline unsigned long
+ftrace_regs_get_instruction_pointer(const struct ftrace_regs *fregs)
+{
+ return fregs->regs.psw.addr;
+}
+
+static __always_inline void
+ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs,
+ unsigned long ip)
+{
+ fregs->regs.psw.addr = ip;
}
+#define ftrace_regs_get_argument(fregs, n) \
+ regs_get_kernel_argument(&(fregs)->regs, n)
+#define ftrace_regs_get_stack_pointer(fregs) \
+ kernel_stack_pointer(&(fregs)->regs)
+#define ftrace_regs_return_value(fregs) \
+ regs_return_value(&(fregs)->regs)
+#define ftrace_regs_set_return_value(fregs, ret) \
+ regs_set_return_value(&(fregs)->regs, ret)
+#define ftrace_override_function_with_return(fregs) \
+ override_function_with_return(&(fregs)->regs)
+#define ftrace_regs_query_register_offset(name) \
+ regs_query_register_offset(name)
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+/*
+ * When an ftrace registered caller is tracing a function that is
+ * also set by a register_ftrace_direct() call, it needs to be
+ * differentiated in the ftrace_caller trampoline. To do this,
+ * place the direct caller in the ORIG_GPR2 part of pt_regs. This
+ * tells the ftrace_caller that there's a direct caller.
+ */
+static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs, unsigned long addr)
+{
+ struct pt_regs *regs = &fregs->regs;
+ regs->orig_gpr2 = addr;
+}
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
+
/*
* Even though the system call numbers are identical for s390/s390x a
* different system call table is used for compat tasks. This may lead
@@ -114,4 +138,32 @@ static inline bool arch_syscall_match_sym_name(const char *sym,
}
#endif /* __ASSEMBLY__ */
+
+#ifdef CONFIG_FUNCTION_TRACER
+
+#define FTRACE_NOP_INSN .word 0xc004, 0x0000, 0x0000 /* brcl 0,0 */
+
+#ifndef CC_USING_HOTPATCH
+
+#define FTRACE_GEN_MCOUNT_RECORD(name) \
+ .section __mcount_loc, "a", @progbits; \
+ .quad name; \
+ .previous;
+
+#else /* !CC_USING_HOTPATCH */
+
+#define FTRACE_GEN_MCOUNT_RECORD(name)
+
+#endif /* !CC_USING_HOTPATCH */
+
+#define FTRACE_GEN_NOP_ASM(name) \
+ FTRACE_GEN_MCOUNT_RECORD(name) \
+ FTRACE_NOP_INSN
+
+#else /* CONFIG_FUNCTION_TRACER */
+
+#define FTRACE_GEN_NOP_ASM(name)
+
+#endif /* CONFIG_FUNCTION_TRACER */
+
#endif /* _ASM_S390_FTRACE_H */
diff --git a/arch/s390/include/asm/ftrace.lds.h b/arch/s390/include/asm/ftrace.lds.h
new file mode 100644
index 000000000000..968adfd41240
--- /dev/null
+++ b/arch/s390/include/asm/ftrace.lds.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef DIV_ROUND_UP
+#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
+#endif
+
+#define SIZEOF_MCOUNT_LOC_ENTRY 8
+#define SIZEOF_FTRACE_HOTPATCH_TRAMPOLINE 24
+#define FTRACE_HOTPATCH_TRAMPOLINES_SIZE(n) \
+ DIV_ROUND_UP(SIZEOF_FTRACE_HOTPATCH_TRAMPOLINE * (n), \
+ SIZEOF_MCOUNT_LOC_ENTRY)
+
+#ifdef CONFIG_FUNCTION_TRACER
+#define FTRACE_HOTPATCH_TRAMPOLINES_TEXT \
+ . = ALIGN(8); \
+ __ftrace_hotpatch_trampolines_start = .; \
+ . = . + FTRACE_HOTPATCH_TRAMPOLINES_SIZE(__stop_mcount_loc - \
+ __start_mcount_loc); \
+ __ftrace_hotpatch_trampolines_end = .;
+#else
+#define FTRACE_HOTPATCH_TRAMPOLINES_TEXT
+#endif
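A worked example of the reservation arithmetic, for illustration only: with SIZEOF_MCOUNT_LOC_ENTRY = 8 and SIZEOF_FTRACE_HOTPATCH_TRAMPOLINE = 24, a kernel with 40,000 traced functions has a 320,000 byte __mcount_loc section, so FTRACE_HOTPATCH_TRAMPOLINES_SIZE reserves DIV_ROUND_UP(24 * 320000, 8) = 960,000 bytes of trampoline text, i.e. 24 bytes per traced function.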
diff --git a/arch/s390/include/asm/futex.h b/arch/s390/include/asm/futex.h
index 5e97a4353147..eaeaeb3ff0be 100644
--- a/arch/s390/include/asm/futex.h
+++ b/arch/s390/include/asm/futex.h
@@ -4,6 +4,7 @@
#include <linux/uaccess.h>
#include <linux/futex.h>
+#include <asm/asm-extable.h>
#include <asm/mmu_context.h>
#include <asm/errno.h>
@@ -16,7 +17,8 @@
"3: jl 1b\n" \
" lhi %0,0\n" \
"4: sacf 768\n" \
- EX_TABLE(0b,4b) EX_TABLE(2b,4b) EX_TABLE(3b,4b) \
+ EX_TABLE(0b,4b) EX_TABLE(1b,4b) \
+ EX_TABLE(2b,4b) EX_TABLE(3b,4b) \
: "=d" (ret), "=&d" (oldval), "=&d" (newval), \
"=m" (*uaddr) \
: "0" (-EFAULT), "d" (oparg), "a" (uaddr), \
@@ -26,10 +28,7 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
u32 __user *uaddr)
{
int oldval = 0, newval, ret;
- mm_segment_t old_fs;
- old_fs = enable_sacf_uaccess();
- pagefault_disable();
switch (op) {
case FUTEX_OP_SET:
__futex_atomic_op("lr %2,%5\n",
@@ -54,8 +53,6 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
default:
ret = -ENOSYS;
}
- pagefault_enable();
- disable_sacf_uaccess(old_fs);
if (!ret)
*oval = oldval;
@@ -66,10 +63,8 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
u32 oldval, u32 newval)
{
- mm_segment_t old_fs;
int ret;
- old_fs = enable_sacf_uaccess();
asm volatile(
" sacf 256\n"
"0: cs %1,%4,0(%5)\n"
@@ -79,7 +74,6 @@ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
: "=d" (ret), "+d" (oldval), "=m" (*uaddr)
: "0" (-EFAULT), "d" (newval), "a" (uaddr), "m" (*uaddr)
: "cc", "memory");
- disable_sacf_uaccess(old_fs);
*uval = oldval;
return ret;
}
diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
index 37f96b6f0e61..5cc46e0dde62 100644
--- a/arch/s390/include/asm/gmap.h
+++ b/arch/s390/include/asm/gmap.h
@@ -9,6 +9,7 @@
#ifndef _ASM_S390_GMAP_H
#define _ASM_S390_GMAP_H
+#include <linux/radix-tree.h>
#include <linux/refcount.h>
/* Generic bits for GMAP notification on DAT table entry changes. */
@@ -31,6 +32,7 @@
* @table: pointer to the page directory
* @asce: address space control element for gmap page table
* @pfault_enabled: defines if pfaults are applicable for the guest
+ * @guest_handle: protected virtual machine handle for the ultravisor
* @host_to_rmap: radix tree with gmap_rmap lists
* @children: list of shadow gmap structures
* @pt_list: list of all page tables used in the shadow guest address space
@@ -54,6 +56,8 @@ struct gmap {
unsigned long asce_end;
void *private;
bool pfault_enabled;
+ /* only set for protected virtual machines */
+ unsigned long guest_handle;
/* Additional data for shadow guest address spaces */
struct radix_tree_root host_to_rmap;
struct list_head children;
@@ -136,12 +140,49 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte);
void gmap_register_pte_notifier(struct gmap_notifier *);
void gmap_unregister_pte_notifier(struct gmap_notifier *);
-void gmap_pte_notify(struct mm_struct *, unsigned long addr, pte_t *,
- unsigned long bits);
int gmap_mprotect_notify(struct gmap *, unsigned long start,
unsigned long len, int prot);
void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4],
unsigned long gaddr, unsigned long vmaddr);
+int gmap_mark_unmergeable(void);
+void s390_unlist_old_asce(struct gmap *gmap);
+int s390_replace_asce(struct gmap *gmap);
+void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns);
+int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, bool interruptible);
+
+/**
+ * s390_uv_destroy_range - Destroy a range of pages in the given mm.
+ * @mm: the mm to operate on
+ * @start: the start of the range
+ * @end: the end of the range
+ *
+ * This function will call cond_resched(), so it should not generate stalls, but
+ * it will otherwise only return when it has completed.
+ */
+static inline void s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end)
+{
+ (void)__s390_uv_destroy_range(mm, start, end, false);
+}
+
+/**
+ * s390_uv_destroy_range_interruptible - Destroy a range of pages in the
+ * given mm, but stop when a fatal signal is received.
+ * @mm: the mm to operate on
+ * @start: the start of the range
+ * @end: the end of the range
+ *
+ * This function will call cond_resched(), so it should not generate stalls. If
+ * a fatal signal is received, it will return with -EINTR immediately,
+ * without finishing destroying the whole range. Upon successful
+ * completion, 0 is returned.
+ */
+static inline int s390_uv_destroy_range_interruptible(struct mm_struct *mm, unsigned long start,
+ unsigned long end)
+{
+ return __s390_uv_destroy_range(mm, start, end, true);
+}
#endif /* _ASM_S390_GMAP_H */
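Not part of the patch: a small sketch showing where each destroy variant fits, per the kernel-doc above; the wrapper and its policy flag are hypothetical.

        #include <linux/types.h>
        #include <asm/gmap.h>

        /* Hypothetical teardown helper: use the interruptible variant where the task may be killed. */
        static int example_uv_teardown(struct mm_struct *mm, unsigned long start,
                                       unsigned long end, bool may_be_killed)
        {
                if (may_be_killed)
                        return s390_uv_destroy_range_interruptible(mm, start, end);
                s390_uv_destroy_range(mm, start, end);
                return 0;
        }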
diff --git a/arch/s390/include/asm/hardirq.h b/arch/s390/include/asm/hardirq.h
index dfbc3c6c0674..58668ffb5488 100644
--- a/arch/s390/include/asm/hardirq.h
+++ b/arch/s390/include/asm/hardirq.h
@@ -18,7 +18,6 @@
#define or_softirq_pending(x) (S390_lowcore.softirq_pending |= (x))
#define __ARCH_IRQ_STAT
-#define __ARCH_HAS_DO_SOFTIRQ
#define __ARCH_IRQ_EXIT_IRQS_DISABLED
static inline void ack_bad_irq(unsigned int irq)
diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h
index de8f0bf5f238..deb198a61039 100644
--- a/arch/s390/include/asm/hugetlb.h
+++ b/arch/s390/include/asm/hugetlb.h
@@ -9,25 +9,20 @@
#ifndef _ASM_S390_HUGETLB_H
#define _ASM_S390_HUGETLB_H
+#include <linux/pgtable.h>
#include <asm/page.h>
-#include <asm/pgtable.h>
#define hugetlb_free_pgd_range free_pgd_range
#define hugepages_supported() (MACHINE_HAS_EDAT1)
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte, unsigned long sz);
+void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte);
pte_t huge_ptep_get(pte_t *ptep);
pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep);
-static inline bool is_hugepage_only_range(struct mm_struct *mm,
- unsigned long addr,
- unsigned long len)
-{
- return false;
-}
-
/*
* If the arch doesn't supply something else, assume that hugepage
* size aligned regions are ok without further preparation.
@@ -35,9 +30,11 @@ static inline bool is_hugepage_only_range(struct mm_struct *mm,
static inline int prepare_hugepage_range(struct file *file,
unsigned long addr, unsigned long len)
{
- if (len & ~HPAGE_MASK)
+ struct hstate *h = hstate_file(file);
+
+ if (len & ~huge_page_mask(h))
return -EINVAL;
- if (addr & ~HPAGE_MASK)
+ if (addr & ~huge_page_mask(h))
return -EINVAL;
return 0;
}
@@ -46,20 +43,21 @@ static inline void arch_clear_hugepage_flags(struct page *page)
{
clear_bit(PG_arch_1, &page->flags);
}
+#define arch_clear_hugepage_flags arch_clear_hugepage_flags
static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned long sz)
{
if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
- pte_val(*ptep) = _REGION3_ENTRY_EMPTY;
+ set_pte(ptep, __pte(_REGION3_ENTRY_EMPTY));
else
- pte_val(*ptep) = _SEGMENT_ENTRY_EMPTY;
+ set_pte(ptep, __pte(_SEGMENT_ENTRY_EMPTY));
}
-static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
- unsigned long address, pte_t *ptep)
+static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
{
- huge_ptep_get_and_clear(vma->vm_mm, address, ptep);
+ return huge_ptep_get_and_clear(vma->vm_mm, address, ptep);
}
static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
@@ -69,7 +67,7 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
int changed = !pte_same(huge_ptep_get(ptep), pte);
if (changed) {
huge_ptep_get_and_clear(vma->vm_mm, addr, ptep);
- set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
+ __set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
}
return changed;
}
@@ -78,7 +76,7 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
pte_t pte = huge_ptep_get_and_clear(mm, addr, ptep);
- set_huge_pte_at(mm, addr, ptep, pte_wrprotect(pte));
+ __set_huge_pte_at(mm, addr, ptep, pte_wrprotect(pte));
}
static inline pte_t mk_huge_pte(struct page *page, pgprot_t pgprot)
@@ -91,6 +89,11 @@ static inline int huge_pte_none(pte_t pte)
return pte_none(pte);
}
+static inline int huge_pte_none_mostly(pte_t pte)
+{
+ return huge_pte_none(pte);
+}
+
static inline int huge_pte_write(pte_t pte)
{
return pte_write(pte);
@@ -103,7 +106,7 @@ static inline int huge_pte_dirty(pte_t pte)
static inline pte_t huge_pte_mkwrite(pte_t pte)
{
- return pte_mkwrite(pte);
+ return pte_mkwrite_novma(pte);
}
static inline pte_t huge_pte_mkdirty(pte_t pte)
@@ -121,6 +124,21 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot)
return pte_modify(pte, newprot);
}
+static inline pte_t huge_pte_mkuffd_wp(pte_t pte)
+{
+ return pte;
+}
+
+static inline pte_t huge_pte_clear_uffd_wp(pte_t pte)
+{
+ return pte;
+}
+
+static inline int huge_pte_uffd_wp(pte_t pte)
+{
+ return 0;
+}
+
static inline bool gigantic_page_runtime_supported(void)
{
return true;
diff --git a/arch/s390/include/asm/hw_irq.h b/arch/s390/include/asm/hw_irq.h
index adae176757ae..9078b5b6b837 100644
--- a/arch/s390/include/asm/hw_irq.h
+++ b/arch/s390/include/asm/hw_irq.h
@@ -7,6 +7,5 @@
void __init init_airq_interrupts(void);
void __init init_cio_interrupts(void);
-void __init init_ext_interrupts(void);
#endif
diff --git a/arch/s390/include/asm/idals.h b/arch/s390/include/asm/idals.h
index 6fb7aced104a..59fcc3c72edf 100644
--- a/arch/s390/include/asm/idals.h
+++ b/arch/s390/include/asm/idals.h
@@ -23,6 +23,9 @@
#define IDA_SIZE_LOG 12 /* 11 for 2k , 12 for 4k */
#define IDA_BLOCK_SIZE (1L<<IDA_SIZE_LOG)
+#define IDA_2K_SIZE_LOG 11
+#define IDA_2K_BLOCK_SIZE (1L << IDA_2K_SIZE_LOG)
+
/*
* Test if an address/length pair needs an idal list.
*/
@@ -43,6 +46,15 @@ static inline unsigned int idal_nr_words(void *vaddr, unsigned int length)
}
/*
+ * Return the number of 2K IDA words needed for an address/length pair.
+ */
+static inline unsigned int idal_2k_nr_words(void *vaddr, unsigned int length)
+{
+ return ((__pa(vaddr) & (IDA_2K_BLOCK_SIZE - 1)) + length +
+ (IDA_2K_BLOCK_SIZE - 1)) >> IDA_2K_SIZE_LOG;
+}
+
+/*
* Create the list of idal words for an address/length pair.
*/
static inline unsigned long *idal_create_words(unsigned long *idaws,
@@ -108,7 +120,7 @@ clear_normalized_cda(struct ccw1 * ccw)
struct idal_buffer {
size_t size;
size_t page_order;
- void *data[0];
+ void *data[];
};
/*
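For illustration, not part of the diff: the 2K variant mirrors the existing 4K idal_nr_words() arithmetic. A buffer of 0x1000 bytes whose physical address is 0x300 bytes into a 2 KiB block touches three 2 KiB blocks: ((0x300 + 0x1000) + 0x7ff) >> 11 = 3. The helper below is hypothetical.

        #include <asm/idals.h>

        /* Hypothetical: number of 2K IDA words needed for a 4 KiB buffer. */
        static unsigned int example_idaw_2k_count(void *buf)
        {
                return idal_2k_nr_words(buf, 0x1000);
        }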
diff --git a/arch/s390/include/asm/idle.h b/arch/s390/include/asm/idle.h
index 6d4226dcf42a..09f763b9eb40 100644
--- a/arch/s390/include/asm/idle.h
+++ b/arch/s390/include/asm/idle.h
@@ -10,21 +10,18 @@
#include <linux/types.h>
#include <linux/device.h>
-#include <linux/seqlock.h>
struct s390_idle_data {
- seqcount_t seqcount;
- unsigned long long idle_count;
- unsigned long long idle_time;
- unsigned long long clock_idle_enter;
- unsigned long long clock_idle_exit;
- unsigned long long timer_idle_enter;
- unsigned long long timer_idle_exit;
+ unsigned long idle_count;
+ unsigned long idle_time;
+ unsigned long clock_idle_enter;
+ unsigned long timer_idle_enter;
+ unsigned long mt_cycles_enter[8];
};
extern struct device_attribute dev_attr_idle_count;
extern struct device_attribute dev_attr_idle_time_us;
-void psw_idle(struct s390_idle_data *, unsigned long);
+void psw_idle(struct s390_idle_data *data, unsigned long psw_mask);
#endif /* _S390_IDLE_H */
diff --git a/arch/s390/include/asm/io.h b/arch/s390/include/asm/io.h
index 5a16f500515a..4453ad7c11ac 100644
--- a/arch/s390/include/asm/io.h
+++ b/arch/s390/include/asm/io.h
@@ -12,6 +12,7 @@
#include <linux/kernel.h>
#include <asm/page.h>
+#include <asm/pgtable.h>
#include <asm/pci_io.h>
#define xlate_dev_mem_ptr xlate_dev_mem_ptr
@@ -19,15 +20,20 @@ void *xlate_dev_mem_ptr(phys_addr_t phys);
#define unxlate_dev_mem_ptr unxlate_dev_mem_ptr
void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr);
+#define IO_SPACE_LIMIT 0
+
/*
- * Convert a virtual cached pointer to an uncached pointer
+ * I/O memory mapping functions.
*/
-#define xlate_dev_kmem_ptr(p) p
+#define ioremap_prot ioremap_prot
+#define iounmap iounmap
-#define IO_SPACE_LIMIT 0
+#define _PAGE_IOREMAP pgprot_val(PAGE_KERNEL)
-void __iomem *ioremap(unsigned long offset, unsigned long size);
-void iounmap(volatile void __iomem *addr);
+#define ioremap_wc(addr, size) \
+ ioremap_prot((addr), (size), pgprot_val(pgprot_writecombine(PAGE_KERNEL)))
+#define ioremap_wt(addr, size) \
+ ioremap_prot((addr), (size), pgprot_val(pgprot_writethrough(PAGE_KERNEL)))
static inline void __iomem *ioport_map(unsigned long port, unsigned int nr)
{
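
With this rework, ioremap_wc() and ioremap_wt() become thin wrappers around ioremap_prot() with a write-combining or write-through pgprot. A hedged sketch of typical driver usage of these wrappers follows; the PCI device and BAR 0 are illustrative placeholders, not taken from this patch.

/* Hedged sketch of a caller of the wrappers above; "pdev" and BAR 0
 * are placeholders for illustration only. */
#include <linux/pci.h>
#include <linux/io.h>

static void __iomem *map_bar0_wc(struct pci_dev *pdev)
{
	resource_size_t start = pci_resource_start(pdev, 0);
	resource_size_t len = pci_resource_len(pdev, 0);

	/* Write-combining mapping, routed through ioremap_prot(). */
	return ioremap_wc(start, len);
}

static void unmap_bar0(void __iomem *regs)
{
	iounmap(regs);
}
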
diff --git a/arch/s390/include/asm/ipl.h b/arch/s390/include/asm/ipl.h
index 084e71b7272a..b0d00032479d 100644
--- a/arch/s390/include/asm/ipl.h
+++ b/arch/s390/include/asm/ipl.h
@@ -12,6 +12,7 @@
#include <asm/types.h>
#include <asm/cio.h>
#include <asm/setup.h>
+#include <asm/page.h>
#include <uapi/asm/ipl.h>
struct ipl_parameter_block {
@@ -21,6 +22,8 @@ struct ipl_parameter_block {
struct ipl_pb0_common common;
struct ipl_pb0_fcp fcp;
struct ipl_pb0_ccw ccw;
+ struct ipl_pb0_eckd eckd;
+ struct ipl_pb0_nvme nvme;
char raw[PAGE_SIZE - sizeof(struct ipl_pl_hdr)];
};
} __packed __aligned(PAGE_SIZE);
@@ -30,10 +33,19 @@ struct ipl_parameter_block {
#define IPL_BP_FCP_LEN (sizeof(struct ipl_pl_hdr) + \
sizeof(struct ipl_pb0_fcp))
#define IPL_BP0_FCP_LEN (sizeof(struct ipl_pb0_fcp))
+
+#define IPL_BP_NVME_LEN (sizeof(struct ipl_pl_hdr) + \
+ sizeof(struct ipl_pb0_nvme))
+#define IPL_BP0_NVME_LEN (sizeof(struct ipl_pb0_nvme))
+
#define IPL_BP_CCW_LEN (sizeof(struct ipl_pl_hdr) + \
sizeof(struct ipl_pb0_ccw))
#define IPL_BP0_CCW_LEN (sizeof(struct ipl_pb0_ccw))
+#define IPL_BP_ECKD_LEN (sizeof(struct ipl_pl_hdr) + \
+ sizeof(struct ipl_pb0_eckd))
+#define IPL_BP0_ECKD_LEN (sizeof(struct ipl_pb0_eckd))
+
#define IPL_MAX_SUPPORTED_VERSION (0)
#define IPL_RB_CERT_UNKNOWN ((unsigned short)-1)
@@ -59,6 +71,10 @@ enum ipl_type {
IPL_TYPE_FCP = 4,
IPL_TYPE_FCP_DUMP = 8,
IPL_TYPE_NSS = 16,
+ IPL_TYPE_NVME = 32,
+ IPL_TYPE_NVME_DUMP = 64,
+ IPL_TYPE_ECKD = 128,
+ IPL_TYPE_ECKD_DUMP = 256,
};
struct ipl_info
@@ -70,10 +86,17 @@ struct ipl_info
} ccw;
struct {
struct ccw_dev_id dev_id;
+ } eckd;
+ struct {
+ struct ccw_dev_id dev_id;
u64 wwpn;
u64 lun;
} fcp;
struct {
+ u32 fid;
+ u32 nsid;
+ } nvme;
+ struct {
char name[NSS_NAME_SIZE + 1];
} nss;
} data;
@@ -83,6 +106,13 @@ extern struct ipl_info ipl_info;
extern void setup_ipl(void);
extern void set_os_info_reipl_block(void);
+static inline bool is_ipl_type_dump(void)
+{
+ return (ipl_info.type == IPL_TYPE_FCP_DUMP) ||
+ (ipl_info.type == IPL_TYPE_ECKD_DUMP) ||
+ (ipl_info.type == IPL_TYPE_NVME_DUMP);
+}
+
struct ipl_report {
struct ipl_parameter_block *ipib;
struct list_head components;
@@ -114,11 +144,18 @@ int ipl_report_add_certificate(struct ipl_report *report, void *key,
* DIAG 308 support
*/
enum diag308_subcode {
+ DIAG308_CLEAR_RESET = 0,
+ DIAG308_LOAD_NORMAL_RESET = 1,
DIAG308_REL_HSA = 2,
DIAG308_LOAD_CLEAR = 3,
DIAG308_LOAD_NORMAL_DUMP = 4,
DIAG308_SET = 5,
DIAG308_STORE = 6,
+ DIAG308_LOAD_NORMAL = 7,
+};
+
+enum diag308_subcode_flags {
+ DIAG308_FLAG_EI = 1UL << 16,
};
enum diag308_rc {
diff --git a/arch/s390/include/asm/irq.h b/arch/s390/include/asm/irq.h
index 9f75d67b8c20..54b42817f70a 100644
--- a/arch/s390/include/asm/irq.h
+++ b/arch/s390/include/asm/irq.h
@@ -31,6 +31,7 @@
#include <linux/percpu.h>
#include <linux/cache.h>
#include <linux/types.h>
+#include <asm/ctlreg.h>
enum interruption_class {
IRQEXT_CLK,
@@ -81,8 +82,13 @@ static __always_inline void inc_irq_stat(enum interruption_class irq)
}
struct ext_code {
- unsigned short subcode;
- unsigned short code;
+ union {
+ struct {
+ unsigned short subcode;
+ unsigned short code;
+ };
+ unsigned int int_code;
+ };
};
typedef void (*ext_int_handler_t)(struct ext_code, unsigned int, unsigned long);
@@ -96,17 +102,17 @@ enum irq_subclass {
};
#define CR0_IRQ_SUBCLASS_MASK \
- ((1UL << (63 - 30)) /* Warning Track */ | \
- (1UL << (63 - 48)) /* Malfunction Alert */ | \
- (1UL << (63 - 49)) /* Emergency Signal */ | \
- (1UL << (63 - 50)) /* External Call */ | \
- (1UL << (63 - 52)) /* Clock Comparator */ | \
- (1UL << (63 - 53)) /* CPU Timer */ | \
- (1UL << (63 - 54)) /* Service Signal */ | \
- (1UL << (63 - 57)) /* Interrupt Key */ | \
- (1UL << (63 - 58)) /* Measurement Alert */ | \
- (1UL << (63 - 59)) /* Timing Alert */ | \
- (1UL << (63 - 62))) /* IUCV */
+ (CR0_WARNING_TRACK | \
+ CR0_MALFUNCTION_ALERT_SUBMASK | \
+ CR0_EMERGENCY_SIGNAL_SUBMASK | \
+ CR0_EXTERNAL_CALL_SUBMASK | \
+ CR0_CLOCK_COMPARATOR_SUBMASK | \
+ CR0_CPU_TIMER_SUBMASK | \
+ CR0_SERVICE_SIGNAL_SUBMASK | \
+ CR0_INTERRUPT_KEY_SUBMASK | \
+ CR0_MEASUREMENT_ALERT_SUBMASK | \
+ CR0_ETR_SUBMASK | \
+ CR0_IUCV)
void irq_subclass_register(enum irq_subclass subclass);
void irq_subclass_unregister(enum irq_subclass subclass);
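
The rewritten CR0_IRQ_SUBCLASS_MASK replaces the open-coded shifts with named CR0_* submask constants (now provided by ctlreg.h, not shown in this hunk). The underlying convention is the one visible in the removed lines: architectural bit n of a 64-bit control register corresponds to the host value 1UL << (63 - n). A stand-alone sketch that re-derives the mask from those bit positions:

/* Illustration of the s390 control-register bit numbering used above:
 * architectural bit n maps to 1UL << (63 - n).  Bit positions are taken
 * from the removed open-coded mask; the CR0_* names live in ctlreg.h. */
#include <stdio.h>

#define CR0_BIT(n) (1UL << (63 - (n)))

int main(void)
{
	unsigned long mask = CR0_BIT(30) |			/* warning track */
			     CR0_BIT(48) | CR0_BIT(49) |	/* malfunction alert, emergency signal */
			     CR0_BIT(50) | CR0_BIT(52) |	/* external call, clock comparator */
			     CR0_BIT(53) | CR0_BIT(54) |	/* CPU timer, service signal */
			     CR0_BIT(57) | CR0_BIT(58) |	/* interrupt key, measurement alert */
			     CR0_BIT(59) | CR0_BIT(62);		/* timing alert, IUCV */

	printf("CR0 IRQ subclass mask: %#lx\n", mask);
	return 0;
}
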
diff --git a/arch/s390/include/asm/irq_work.h b/arch/s390/include/asm/irq_work.h
new file mode 100644
index 000000000000..f00c9f610d5a
--- /dev/null
+++ b/arch/s390/include/asm/irq_work.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_S390_IRQ_WORK_H
+#define _ASM_S390_IRQ_WORK_H
+
+static inline bool arch_irq_work_has_interrupt(void)
+{
+ return true;
+}
+
+#endif /* _ASM_S390_IRQ_WORK_H */
diff --git a/arch/s390/include/asm/irqflags.h b/arch/s390/include/asm/irqflags.h
index 586df4c9e2f2..02427b205c11 100644
--- a/arch/s390/include/asm/irqflags.h
+++ b/arch/s390/include/asm/irqflags.h
@@ -32,45 +32,45 @@
})
/* set system mask. */
-static inline notrace void __arch_local_irq_ssm(unsigned long flags)
+static __always_inline void __arch_local_irq_ssm(unsigned long flags)
{
asm volatile("ssm %0" : : "Q" (flags) : "memory");
}
-static inline notrace unsigned long arch_local_save_flags(void)
+static __always_inline unsigned long arch_local_save_flags(void)
{
return __arch_local_irq_stnsm(0xff);
}
-static inline notrace unsigned long arch_local_irq_save(void)
+static __always_inline unsigned long arch_local_irq_save(void)
{
return __arch_local_irq_stnsm(0xfc);
}
-static inline notrace void arch_local_irq_disable(void)
+static __always_inline void arch_local_irq_disable(void)
{
arch_local_irq_save();
}
-static inline notrace void arch_local_irq_enable(void)
+static __always_inline void arch_local_irq_enable(void)
{
__arch_local_irq_stosm(0x03);
}
/* This only restores external and I/O interrupt state */
-static inline notrace void arch_local_irq_restore(unsigned long flags)
+static __always_inline void arch_local_irq_restore(unsigned long flags)
{
/* only disabled->disabled and disabled->enabled is valid */
if (flags & ARCH_IRQ_ENABLED)
arch_local_irq_enable();
}
-static inline notrace bool arch_irqs_disabled_flags(unsigned long flags)
+static __always_inline bool arch_irqs_disabled_flags(unsigned long flags)
{
return !(flags & ARCH_IRQ_ENABLED);
}
-static inline notrace bool arch_irqs_disabled(void)
+static __always_inline bool arch_irqs_disabled(void)
{
return arch_irqs_disabled_flags(arch_local_save_flags());
}
diff --git a/arch/s390/include/asm/jump_label.h b/arch/s390/include/asm/jump_label.h
index 39f747d63758..895f774bbcc5 100644
--- a/arch/s390/include/asm/jump_label.h
+++ b/arch/s390/include/asm/jump_label.h
@@ -2,27 +2,30 @@
#ifndef _ASM_S390_JUMP_LABEL_H
#define _ASM_S390_JUMP_LABEL_H
+#define HAVE_JUMP_LABEL_BATCH
+
#ifndef __ASSEMBLY__
#include <linux/types.h>
#include <linux/stringify.h>
#define JUMP_LABEL_NOP_SIZE 6
-#define JUMP_LABEL_NOP_OFFSET 2
-#if __GNUC__ < 9
+#ifdef CONFIG_CC_IS_CLANG
+#define JUMP_LABEL_STATIC_KEY_CONSTRAINT "i"
+#elif __GNUC__ < 9
#define JUMP_LABEL_STATIC_KEY_CONSTRAINT "X"
#else
#define JUMP_LABEL_STATIC_KEY_CONSTRAINT "jdd"
#endif
/*
- * We use a brcl 0,2 instruction for jump labels at compile time so it
+ * We use a brcl 0,<offset> instruction for jump labels so it
* can be easily distinguished from a hotpatch generated instruction.
*/
static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
{
- asm_volatile_goto("0: brcl 0,"__stringify(JUMP_LABEL_NOP_OFFSET)"\n"
+ asm_volatile_goto("0: brcl 0,%l[label]\n"
".pushsection __jump_table,\"aw\"\n"
".balign 8\n"
".long 0b-.,%l[label]-.\n"
diff --git a/arch/s390/include/asm/kasan.h b/arch/s390/include/asm/kasan.h
index 70930fe5c496..0cffead0f2f2 100644
--- a/arch/s390/include/asm/kasan.h
+++ b/arch/s390/include/asm/kasan.h
@@ -2,29 +2,17 @@
#ifndef __ASM_KASAN_H
#define __ASM_KASAN_H
-#include <asm/pgtable.h>
+#include <linux/const.h>
#ifdef CONFIG_KASAN
#define KASAN_SHADOW_SCALE_SHIFT 3
-#ifdef CONFIG_KASAN_S390_4_LEVEL_PAGING
#define KASAN_SHADOW_SIZE \
(_AC(1, UL) << (_REGION1_SHIFT - KASAN_SHADOW_SCALE_SHIFT))
-#else
-#define KASAN_SHADOW_SIZE \
- (_AC(1, UL) << (_REGION2_SHIFT - KASAN_SHADOW_SCALE_SHIFT))
-#endif
#define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
#define KASAN_SHADOW_START KASAN_SHADOW_OFFSET
#define KASAN_SHADOW_END (KASAN_SHADOW_START + KASAN_SHADOW_SIZE)
-extern void kasan_early_init(void);
-extern void kasan_copy_shadow(pgd_t *dst);
-extern void kasan_free_early_identity(void);
-#else
-static inline void kasan_early_init(void) { }
-static inline void kasan_copy_shadow(pgd_t *dst) { }
-static inline void kasan_free_early_identity(void) { }
#endif
#endif
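
KASAN_SHADOW_SCALE_SHIFT of 3 means one shadow byte tracks 8 bytes of memory, and the shadow region spans 1/8 of the covered address range starting at KASAN_SHADOW_OFFSET. A stand-alone sketch of the generic address-to-shadow translation this implies; the offset and address below are made-up placeholders, the real offset comes from CONFIG_KASAN_SHADOW_OFFSET.

/* Sketch of the generic KASAN address->shadow translation implied by
 * KASAN_SHADOW_SCALE_SHIFT above; all numeric values are placeholders. */
#include <stdio.h>

#define KASAN_SHADOW_SCALE_SHIFT 3

static unsigned long kasan_mem_to_shadow(unsigned long addr,
					 unsigned long shadow_offset)
{
	return (addr >> KASAN_SHADOW_SCALE_SHIFT) + shadow_offset;
}

int main(void)
{
	unsigned long shadow_offset = 0x1c000000000000UL;	/* placeholder */
	unsigned long addr = 0x3d008000000UL;			/* placeholder */

	printf("shadow byte for %#lx is at %#lx\n",
	       addr, kasan_mem_to_shadow(addr, shadow_offset));
	return 0;
}
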
diff --git a/arch/s390/include/asm/kdebug.h b/arch/s390/include/asm/kdebug.h
index d5327f064799..4377238e4752 100644
--- a/arch/s390/include/asm/kdebug.h
+++ b/arch/s390/include/asm/kdebug.h
@@ -23,6 +23,6 @@ enum die_val {
DIE_NMI_IPI,
};
-extern void die(struct pt_regs *, const char *);
+extern void __noreturn die(struct pt_regs *, const char *);
#endif
diff --git a/arch/s390/include/asm/kexec.h b/arch/s390/include/asm/kexec.h
index ea398a05f643..1bd08eb56d5f 100644
--- a/arch/s390/include/asm/kexec.h
+++ b/arch/s390/include/asm/kexec.h
@@ -9,6 +9,8 @@
#ifndef _S390_KEXEC_H
#define _S390_KEXEC_H
+#include <linux/module.h>
+
#include <asm/processor.h>
#include <asm/page.h>
#include <asm/setup.h>
@@ -29,7 +31,7 @@
#define KEXEC_CONTROL_MEMORY_LIMIT (1UL<<31)
/* Allocate control page with GFP_DMA */
-#define KEXEC_CONTROL_MEMORY_GFP GFP_DMA
+#define KEXEC_CONTROL_MEMORY_GFP (GFP_DMA | __GFP_NORETRY)
/* Maximum address we can use for the crash control pages */
#define KEXEC_CRASH_CONTROL_MEMORY_LIMIT (-1UL)
@@ -74,7 +76,35 @@ void *kexec_file_add_components(struct kimage *image,
int arch_kexec_do_relocs(int r_type, void *loc, unsigned long val,
unsigned long addr);
+#define ARCH_HAS_KIMAGE_ARCH
+
+struct kimage_arch {
+ void *ipl_buf;
+};
+
extern const struct kexec_file_ops s390_kexec_image_ops;
extern const struct kexec_file_ops s390_kexec_elf_ops;
+#ifdef CONFIG_CRASH_DUMP
+void crash_free_reserved_phys_range(unsigned long begin, unsigned long end);
+#define crash_free_reserved_phys_range crash_free_reserved_phys_range
+
+void arch_kexec_protect_crashkres(void);
+#define arch_kexec_protect_crashkres arch_kexec_protect_crashkres
+
+void arch_kexec_unprotect_crashkres(void);
+#define arch_kexec_unprotect_crashkres arch_kexec_unprotect_crashkres
+#endif
+
+#ifdef CONFIG_KEXEC_FILE
+struct purgatory_info;
+int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
+ Elf_Shdr *section,
+ const Elf_Shdr *relsec,
+ const Elf_Shdr *symtab);
+#define arch_kexec_apply_relocations_add arch_kexec_apply_relocations_add
+
+int arch_kimage_file_post_load_cleanup(struct kimage *image);
+#define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup
+#endif
#endif /*_S390_KEXEC_H */
diff --git a/arch/s390/include/asm/kfence.h b/arch/s390/include/asm/kfence.h
new file mode 100644
index 000000000000..e47fd8cbe701
--- /dev/null
+++ b/arch/s390/include/asm/kfence.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _ASM_S390_KFENCE_H
+#define _ASM_S390_KFENCE_H
+
+#include <linux/mm.h>
+#include <linux/kfence.h>
+#include <asm/set_memory.h>
+#include <asm/page.h>
+
+void __kernel_map_pages(struct page *page, int numpages, int enable);
+
+static __always_inline bool arch_kfence_init_pool(void)
+{
+ return true;
+}
+
+#define arch_kfence_test_address(addr) ((addr) & PAGE_MASK)
+
+/*
+ * Do not split kfence pool to 4k mapping with arch_kfence_init_pool(),
+ * but earlier where page table allocations still happen with memblock.
+ * Reason is that arch_kfence_init_pool() gets called when the system
+ * is still in a limbo state - disabling and enabling bottom halves is
+ * not yet allowed, but that is what our page_table_alloc() would do.
+ */
+static __always_inline void kfence_split_mapping(void)
+{
+#ifdef CONFIG_KFENCE
+ unsigned long pool_pages = KFENCE_POOL_SIZE >> PAGE_SHIFT;
+
+ set_memory_4k((unsigned long)__kfence_pool, pool_pages);
+#endif
+}
+
+static inline bool kfence_protect_page(unsigned long addr, bool protect)
+{
+ __kernel_map_pages(virt_to_page((void *)addr), 1, !protect);
+ return true;
+}
+
+#endif /* _ASM_S390_KFENCE_H */
diff --git a/arch/s390/include/asm/kprobes.h b/arch/s390/include/asm/kprobes.h
index b106aa29bf55..01f1682a73b7 100644
--- a/arch/s390/include/asm/kprobes.h
+++ b/arch/s390/include/asm/kprobes.h
@@ -15,6 +15,7 @@
* <grundym@us.ibm.com>
*/
#include <linux/types.h>
+#include <asm/ctlreg.h>
#include <asm-generic/kprobes.h>
#define BREAKPOINT_INSTRUCTION 0x0002
@@ -54,7 +55,6 @@ typedef u16 kprobe_opcode_t;
struct arch_specific_insn {
/* copy of original instruction */
kprobe_opcode_t *insn;
- unsigned int is_ftrace_insn : 1;
};
struct prev_kprobe {
@@ -66,16 +66,13 @@ struct prev_kprobe {
struct kprobe_ctlblk {
unsigned long kprobe_status;
unsigned long kprobe_saved_imask;
- unsigned long kprobe_saved_ctl[3];
+ struct ctlreg kprobe_saved_ctl[3];
struct prev_kprobe prev_kprobe;
};
void arch_remove_kprobe(struct kprobe *p);
-void kretprobe_trampoline(void);
int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
-int kprobe_exceptions_notify(struct notifier_block *self,
- unsigned long val, void *data);
#define flush_insn_slot(p) do { } while (0)
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 02f4c21c57f6..52664105a473 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -19,6 +19,8 @@
#include <linux/kvm.h>
#include <linux/seqlock.h>
#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/mmu_notifier.h>
#include <asm/debug.h>
#include <asm/cpu.h>
#include <asm/fpu/api.h>
@@ -28,15 +30,14 @@
#define KVM_S390_BSCA_CPU_SLOTS 64
#define KVM_S390_ESCA_CPU_SLOTS 248
#define KVM_MAX_VCPUS 255
-#define KVM_USER_MEM_SLOTS 32
/*
- * These seem to be used for allocating ->chip in the routing table,
- * which we don't use. 4096 is an out-of-thin-air value. If we need
- * to look at ->chip later on, we'll need to revisit this.
+ * These seem to be used for allocating ->chip in the routing table, which we
+ * don't use. 1 is as small as we can get to reduce the needed memory. If we
+ * need to look at ->chip later on, we'll need to revisit this.
*/
#define KVM_NR_IRQCHIPS 1
-#define KVM_IRQCHIP_NUM_PINS 4096
+#define KVM_IRQCHIP_NUM_PINS 1
#define KVM_HALT_POLL_NS_DEFAULT 50000
/* s390-specific vcpu->requests bit members */
@@ -46,6 +47,8 @@
#define KVM_REQ_START_MIGRATION KVM_ARCH_REQ(3)
#define KVM_REQ_STOP_MIGRATION KVM_ARCH_REQ(4)
#define KVM_REQ_VSIE_RESTART KVM_ARCH_REQ(5)
+#define KVM_REQ_REFRESH_GUEST_PREFIX \
+ KVM_ARCH_REQ_FLAGS(6, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define SIGP_CTRL_C 0x80
#define SIGP_CTRL_SCN_MASK 0x3f
@@ -92,19 +95,30 @@ union ipte_control {
};
};
+union sca_utility {
+ __u16 val;
+ struct {
+ __u16 mtcr : 1;
+ __u16 reserved : 15;
+ };
+};
+
struct bsca_block {
union ipte_control ipte_control;
__u64 reserved[5];
__u64 mcn;
- __u64 reserved2;
+ union sca_utility utility;
+ __u8 reserved2[6];
struct bsca_entry cpu[KVM_S390_BSCA_CPU_SLOTS];
};
struct esca_block {
union ipte_control ipte_control;
- __u64 reserved1[7];
+ __u64 reserved1[6];
+ union sca_utility utility;
+ __u8 reserved2[6];
__u64 mcn[4];
- __u64 reserved2[20];
+ __u64 reserved3[20];
struct esca_entry cpu[KVM_S390_ESCA_CPU_SLOTS];
};
@@ -122,6 +136,16 @@ struct mcck_volatile_info {
__u32 reserved;
};
+#define CR0_INITIAL_MASK (CR0_UNUSED_56 | CR0_INTERRUPT_KEY_SUBMASK | \
+ CR0_MEASUREMENT_ALERT_SUBMASK)
+#define CR14_INITIAL_MASK (CR14_UNUSED_32 | CR14_UNUSED_33 | \
+ CR14_EXTERNAL_DAMAGE_SUBMASK)
+
+#define SIDAD_SIZE_MASK 0xff
+#define sida_addr(sie_block) phys_to_virt((sie_block)->sidad & PAGE_MASK)
+#define sida_size(sie_block) \
+ ((((sie_block)->sidad & SIDAD_SIZE_MASK) + 1) * PAGE_SIZE)
+
#define CPUSTAT_STOPPED 0x80000000
#define CPUSTAT_WAIT 0x10000000
#define CPUSTAT_ECALL_PEND 0x08000000
@@ -155,7 +179,13 @@ struct kvm_s390_sie_block {
__u8 reserved08[4]; /* 0x0008 */
#define PROG_IN_SIE (1<<0)
__u32 prog0c; /* 0x000c */
- __u8 reserved10[16]; /* 0x0010 */
+ union {
+ __u8 reserved10[16]; /* 0x0010 */
+ struct {
+ __u64 pv_handle_cpu;
+ __u64 pv_handle_config;
+ };
+ };
#define PROG_BLOCK_SIE (1<<0)
#define PROG_REQUEST (1<<1)
atomic_t prog20; /* 0x0020 */
@@ -204,10 +234,23 @@ struct kvm_s390_sie_block {
#define ICPT_PARTEXEC 0x38
#define ICPT_IOINST 0x40
#define ICPT_KSS 0x5c
+#define ICPT_MCHKREQ 0x60
+#define ICPT_INT_ENABLE 0x64
+#define ICPT_PV_INSTR 0x68
+#define ICPT_PV_NOTIFY 0x6c
+#define ICPT_PV_PREF 0x70
__u8 icptcode; /* 0x0050 */
__u8 icptstatus; /* 0x0051 */
__u16 ihcpu; /* 0x0052 */
- __u8 reserved54[2]; /* 0x0054 */
+ __u8 reserved54; /* 0x0054 */
+#define IICTL_CODE_NONE 0x00
+#define IICTL_CODE_MCHK 0x01
+#define IICTL_CODE_EXT 0x02
+#define IICTL_CODE_IO 0x03
+#define IICTL_CODE_RESTART 0x04
+#define IICTL_CODE_SPECIFICATION 0x10
+#define IICTL_CODE_OPERAND 0x11
+ __u8 iictl; /* 0x0055 */
__u16 ipa; /* 0x0056 */
__u32 ipb; /* 0x0058 */
__u32 scaoh; /* 0x005c */
@@ -215,22 +258,29 @@ struct kvm_s390_sie_block {
__u8 fpf; /* 0x0060 */
#define ECB_GS 0x40
#define ECB_TE 0x10
+#define ECB_SPECI 0x08
#define ECB_SRSI 0x04
#define ECB_HOSTPROTINT 0x02
+#define ECB_PTF 0x01
__u8 ecb; /* 0x0061 */
#define ECB2_CMMA 0x80
#define ECB2_IEP 0x20
#define ECB2_PFMFI 0x08
#define ECB2_ESCA 0x04
+#define ECB2_ZPCI_LSI 0x02
__u8 ecb2; /* 0x0062 */
+#define ECB3_AISI 0x20
+#define ECB3_AISII 0x10
#define ECB3_DEA 0x08
#define ECB3_AES 0x04
#define ECB3_RI 0x01
__u8 ecb3; /* 0x0063 */
+#define ESCA_SCAOL_MASK ~0x3fU
__u32 scaol; /* 0x0064 */
- __u8 reserved68; /* 0x0068 */
+ __u8 sdf; /* 0x0068 */
__u8 epdx; /* 0x0069 */
- __u8 reserved6a[2]; /* 0x006a */
+ __u8 cpnc; /* 0x006a */
+ __u8 reserved6b; /* 0x006b */
__u32 todpr; /* 0x006c */
#define GISA_FORMAT1 0x00000001
__u32 gd; /* 0x0070 */
@@ -244,31 +294,58 @@ struct kvm_s390_sie_block {
#define HPID_KVM 0x4
#define HPID_VSIE 0x5
__u8 hpid; /* 0x00b8 */
- __u8 reservedb9[11]; /* 0x00b9 */
- __u16 extcpuaddr; /* 0x00c4 */
- __u16 eic; /* 0x00c6 */
+ __u8 reservedb9[7]; /* 0x00b9 */
+ union {
+ struct {
+ __u32 eiparams; /* 0x00c0 */
+ __u16 extcpuaddr; /* 0x00c4 */
+ __u16 eic; /* 0x00c6 */
+ };
+ __u64 mcic; /* 0x00c0 */
+ } __packed;
__u32 reservedc8; /* 0x00c8 */
- __u16 pgmilc; /* 0x00cc */
- __u16 iprcc; /* 0x00ce */
- __u32 dxc; /* 0x00d0 */
- __u16 mcn; /* 0x00d4 */
- __u8 perc; /* 0x00d6 */
- __u8 peratmid; /* 0x00d7 */
+ union {
+ struct {
+ __u16 pgmilc; /* 0x00cc */
+ __u16 iprcc; /* 0x00ce */
+ };
+ __u32 edc; /* 0x00cc */
+ } __packed;
+ union {
+ struct {
+ __u32 dxc; /* 0x00d0 */
+ __u16 mcn; /* 0x00d4 */
+ __u8 perc; /* 0x00d6 */
+ __u8 peratmid; /* 0x00d7 */
+ };
+ __u64 faddr; /* 0x00d0 */
+ } __packed;
__u64 peraddr; /* 0x00d8 */
__u8 eai; /* 0x00e0 */
__u8 peraid; /* 0x00e1 */
__u8 oai; /* 0x00e2 */
__u8 armid; /* 0x00e3 */
__u8 reservede4[4]; /* 0x00e4 */
- __u64 tecmc; /* 0x00e8 */
- __u8 reservedf0[12]; /* 0x00f0 */
+ union {
+ __u64 tecmc; /* 0x00e8 */
+ struct {
+ __u16 subchannel_id; /* 0x00e8 */
+ __u16 subchannel_nr; /* 0x00ea */
+ __u32 io_int_parm; /* 0x00ec */
+ __u32 io_int_word; /* 0x00f0 */
+ };
+ } __packed;
+ __u8 reservedf4[8]; /* 0x00f4 */
#define CRYCB_FORMAT_MASK 0x00000003
#define CRYCB_FORMAT0 0x00000000
#define CRYCB_FORMAT1 0x00000001
#define CRYCB_FORMAT2 0x00000003
__u32 crycbd; /* 0x00fc */
__u64 gcr[16]; /* 0x0100 */
- __u64 gbea; /* 0x0180 */
+ union {
+ __u64 gbea; /* 0x0180 */
+ __u64 sidad;
+ };
__u8 reserved188[8]; /* 0x0188 */
__u64 sdnxo; /* 0x0190 */
__u8 reserved198[8]; /* 0x0198 */
@@ -287,7 +364,7 @@ struct kvm_s390_sie_block {
__u64 itdba; /* 0x01e8 */
__u64 riccbd; /* 0x01f0 */
__u64 gvrd; /* 0x01f8 */
-} __attribute__((packed));
+} __packed __aligned(512);
struct kvm_s390_itdb {
__u8 data[256];
@@ -296,12 +373,15 @@ struct kvm_s390_itdb {
struct sie_page {
struct kvm_s390_sie_block sie_block;
struct mcck_volatile_info mcck_info; /* 0x0200 */
- __u8 reserved218[1000]; /* 0x0218 */
+ __u8 reserved218[360]; /* 0x0218 */
+ __u64 pv_grregs[16]; /* 0x0380 */
+ __u8 reserved400[512]; /* 0x0400 */
struct kvm_s390_itdb itdb; /* 0x0600 */
__u8 reserved700[2304]; /* 0x0700 */
};
struct kvm_vcpu_stat {
+ struct kvm_vcpu_stat_generic generic;
u64 exit_userspace;
u64 exit_null;
u64 exit_external_request;
@@ -311,11 +391,7 @@ struct kvm_vcpu_stat {
u64 exit_validity;
u64 exit_instruction;
u64 exit_pei;
- u64 halt_successful_poll;
- u64 halt_attempted_poll;
- u64 halt_poll_invalid;
u64 halt_no_poll_steal;
- u64 halt_wakeup;
u64 instruction_lctl;
u64 instruction_lctlg;
u64 instruction_stctl;
@@ -389,14 +465,16 @@ struct kvm_vcpu_stat {
u64 instruction_sigp_init_cpu_reset;
u64 instruction_sigp_cpu_reset;
u64 instruction_sigp_unknown;
- u64 diagnose_10;
- u64 diagnose_44;
- u64 diagnose_9c;
- u64 diagnose_9c_ignored;
- u64 diagnose_258;
- u64 diagnose_308;
- u64 diagnose_500;
- u64 diagnose_other;
+ u64 instruction_diagnose_10;
+ u64 instruction_diagnose_44;
+ u64 instruction_diagnose_9c;
+ u64 diag_9c_ignored;
+ u64 diag_9c_forward;
+ u64 instruction_diagnose_258;
+ u64 instruction_diagnose_308;
+ u64 instruction_diagnose_500;
+ u64 instruction_diagnose_other;
+ u64 pfault_sync;
};
#define PGM_OPERATION 0x01
@@ -471,6 +549,7 @@ enum irq_types {
IRQ_PEND_PFAULT_INIT,
IRQ_PEND_EXT_HOST,
IRQ_PEND_EXT_SERVICE,
+ IRQ_PEND_EXT_SERVICE_EV,
IRQ_PEND_EXT_TIMING,
IRQ_PEND_EXT_CPU_TIMER,
IRQ_PEND_EXT_CLOCK_COMP,
@@ -515,6 +594,7 @@ enum irq_types {
(1UL << IRQ_PEND_EXT_TIMING) | \
(1UL << IRQ_PEND_EXT_HOST) | \
(1UL << IRQ_PEND_EXT_SERVICE) | \
+ (1UL << IRQ_PEND_EXT_SERVICE_EV) | \
(1UL << IRQ_PEND_VIRTIO) | \
(1UL << IRQ_PEND_PFAULT_INIT) | \
(1UL << IRQ_PEND_PFAULT_DONE))
@@ -531,6 +611,13 @@ enum irq_types {
#define IRQ_PEND_MCHK_MASK ((1UL << IRQ_PEND_MCHK_REP) | \
(1UL << IRQ_PEND_MCHK_EX))
+#define IRQ_PEND_EXT_II_MASK ((1UL << IRQ_PEND_EXT_CPU_TIMER) | \
+ (1UL << IRQ_PEND_EXT_CLOCK_COMP) | \
+ (1UL << IRQ_PEND_EXT_EMERGENCY) | \
+ (1UL << IRQ_PEND_EXT_EXTERNAL) | \
+ (1UL << IRQ_PEND_EXT_SERVICE) | \
+ (1UL << IRQ_PEND_EXT_SERVICE_EV))
+
struct kvm_s390_interrupt_info {
struct list_head list;
u64 type;
@@ -589,6 +676,7 @@ struct kvm_s390_local_interrupt {
struct kvm_s390_float_interrupt {
unsigned long pending_irqs;
+ unsigned long masked_irqs;
spinlock_t lock;
struct list_head lists[FIRQ_LIST_COUNT];
int counters[FIRQ_MAX_COUNT];
@@ -628,6 +716,10 @@ struct kvm_hw_bp_info_arch {
#define guestdbg_exit_pending(vcpu) (guestdbg_enabled(vcpu) && \
(vcpu->guest_debug & KVM_GUESTDBG_EXIT_PENDING))
+#define KVM_GUESTDBG_VALID_MASK \
+ (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP |\
+ KVM_GUESTDBG_USE_HW_BP | KVM_GUESTDBG_EXIT_PENDING)
+
struct kvm_guestdbg_info_arch {
unsigned long cr0;
unsigned long cr9;
@@ -640,6 +732,11 @@ struct kvm_guestdbg_info_arch {
unsigned long last_bp;
};
+struct kvm_s390_pv_vcpu {
+ u64 handle;
+ unsigned long stor_base;
+};
+
struct kvm_vcpu_arch {
struct kvm_s390_sie_block *sie_block;
/* if vsie is active, currently executed shadow sie control block */
@@ -668,15 +765,25 @@ struct kvm_vcpu_arch {
__u64 cputm_start;
bool gs_enabled;
bool skey_enabled;
+ struct kvm_s390_pv_vcpu pv;
+ union diag318_info diag318_info;
};
struct kvm_vm_stat {
+ struct kvm_vm_stat_generic generic;
u64 inject_io;
u64 inject_float_mchk;
u64 inject_pfault_done;
u64 inject_service_signal;
u64 inject_virtio;
- u64 remote_tlb_flush;
+ u64 aen_forward;
+ u64 gmap_shadow_create;
+ u64 gmap_shadow_reuse;
+ u64 gmap_shadow_r1_entry;
+ u64 gmap_shadow_r2_entry;
+ u64 gmap_shadow_r3_entry;
+ u64 gmap_shadow_sg_entry;
+ u64 gmap_shadow_pg_entry;
};
struct kvm_arch_memory_slot {
@@ -696,9 +803,6 @@ struct s390_io_adapter {
bool masked;
bool swap;
bool suppressible;
- struct rw_semaphore maps_lock;
- struct list_head maps;
- atomic_t nr_maps;
};
#define MAX_S390_IO_ADAPTERS ((MAX_ISC + 1) * 8)
@@ -714,22 +818,22 @@ struct s390_io_adapter {
struct kvm_s390_cpu_model {
/* facility mask supported by kvm & hosting machine */
- __u64 fac_mask[S390_ARCH_FAC_LIST_SIZE_U64];
+ __u64 fac_mask[S390_ARCH_FAC_MASK_SIZE_U64];
struct kvm_s390_vm_cpu_subfunc subfuncs;
/* facility list requested by guest (in dma page) */
__u64 *fac_list;
u64 cpuid;
unsigned short ibc;
+ /* subset of available UV-features for pv-guests enabled by user space */
+ struct kvm_s390_vm_cpu_uv_feat uv_feat_guest;
};
-struct kvm_s390_module_hook {
- int (*hook)(struct kvm_vcpu *vcpu);
- struct module *owner;
-};
+typedef int (*crypto_hook)(struct kvm_vcpu *vcpu);
struct kvm_s390_crypto {
struct kvm_s390_crypto_cb *crycb;
- struct kvm_s390_module_hook *pqap_hook;
+ struct rw_semaphore pqap_hook_rwsem;
+ crypto_hook *pqap_hook;
__u32 crycbd;
__u8 aes_kw;
__u8 dea_kw;
@@ -841,6 +945,17 @@ struct kvm_s390_gisa_interrupt {
DECLARE_BITMAP(kicked_mask, KVM_MAX_VCPUS);
};
+struct kvm_s390_pv {
+ u64 handle;
+ u64 guest_len;
+ unsigned long stor_base;
+ void *stor_var;
+ bool dumping;
+ void *set_aside;
+ struct list_head need_cleanup;
+ struct mmu_notifier mmu_notifier;
+};
+
struct kvm_arch{
void *sca;
int use_esca;
@@ -855,6 +970,7 @@ struct kvm_arch{
int use_cmma;
int use_pfmfi;
int use_skf;
+ int use_zpci_interp;
int user_cpu_state_ctrl;
int user_sigp;
int user_stsi;
@@ -874,8 +990,12 @@ struct kvm_arch{
atomic64_t cmma_dirty_pages;
/* subset of available cpu features enabled by user space */
DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
+ /* indexed by vcpu_idx */
DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS);
struct kvm_s390_gisa_interrupt gisa_int;
+ struct kvm_s390_pv pv;
+ struct list_head kzdev_list;
+ spinlock_t kzdev_list_lock;
};
#define KVM_HVA_ERR_BAD (-1UL)
@@ -891,33 +1011,42 @@ struct kvm_arch_async_pf {
unsigned long pfault_token;
};
-bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu);
+bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu);
void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work);
-void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
+bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work);
void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work);
+static inline void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu) {}
+
void kvm_arch_crypto_clear_masks(struct kvm *kvm);
void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm,
unsigned long *aqm, unsigned long *adm);
-extern int sie64a(struct kvm_s390_sie_block *, u64 *);
+int __sie64a(phys_addr_t sie_block_phys, struct kvm_s390_sie_block *sie_block, u64 *rsa);
+
+static inline int sie64a(struct kvm_s390_sie_block *sie_block, u64 *rsa)
+{
+ return __sie64a(virt_to_phys(sie_block), sie_block, rsa);
+}
+
extern char sie_exit;
+bool kvm_s390_pv_is_protected(struct kvm *kvm);
+bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu);
+
extern int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc);
extern int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc);
-static inline void kvm_arch_hardware_disable(void) {}
static inline void kvm_arch_sync_events(struct kvm *kvm) {}
-static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
static inline void kvm_arch_free_memslot(struct kvm *kvm,
- struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {}
+ struct kvm_memory_slot *slot) {}
static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {}
static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
@@ -925,6 +1054,14 @@ static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
-void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu);
+#define __KVM_HAVE_ARCH_VM_FREE
+void kvm_arch_free_vm(struct kvm *kvm);
+
+struct zpci_kvm_hook {
+ int (*kvm_register)(void *opaque, struct kvm *kvm);
+ void (*kvm_unregister)(void *opaque);
+};
+
+extern struct zpci_kvm_hook zpci_kvm_hook;
#endif
diff --git a/arch/s390/include/asm/kvm_para.h b/arch/s390/include/asm/kvm_para.h
index cbc7c3a68e4d..df73a052760c 100644
--- a/arch/s390/include/asm/kvm_para.h
+++ b/arch/s390/include/asm/kvm_para.h
@@ -24,162 +24,79 @@
#include <uapi/asm/kvm_para.h>
#include <asm/diag.h>
-static inline long __kvm_hypercall0(unsigned long nr)
-{
- register unsigned long __nr asm("1") = nr;
- register long __rc asm("2");
-
- asm volatile ("diag 2,4,0x500\n"
- : "=d" (__rc) : "d" (__nr): "memory", "cc");
- return __rc;
-}
-
-static inline long kvm_hypercall0(unsigned long nr)
-{
- diag_stat_inc(DIAG_STAT_X500);
- return __kvm_hypercall0(nr);
-}
-
-static inline long __kvm_hypercall1(unsigned long nr, unsigned long p1)
-{
- register unsigned long __nr asm("1") = nr;
- register unsigned long __p1 asm("2") = p1;
- register long __rc asm("2");
-
- asm volatile ("diag 2,4,0x500\n"
- : "=d" (__rc) : "d" (__nr), "0" (__p1) : "memory", "cc");
- return __rc;
-}
-
-static inline long kvm_hypercall1(unsigned long nr, unsigned long p1)
-{
- diag_stat_inc(DIAG_STAT_X500);
- return __kvm_hypercall1(nr, p1);
-}
-
-static inline long __kvm_hypercall2(unsigned long nr, unsigned long p1,
- unsigned long p2)
-{
- register unsigned long __nr asm("1") = nr;
- register unsigned long __p1 asm("2") = p1;
- register unsigned long __p2 asm("3") = p2;
- register long __rc asm("2");
-
- asm volatile ("diag 2,4,0x500\n"
- : "=d" (__rc) : "d" (__nr), "0" (__p1), "d" (__p2)
- : "memory", "cc");
- return __rc;
-}
-
-static inline long kvm_hypercall2(unsigned long nr, unsigned long p1,
- unsigned long p2)
-{
- diag_stat_inc(DIAG_STAT_X500);
- return __kvm_hypercall2(nr, p1, p2);
-}
-
-static inline long __kvm_hypercall3(unsigned long nr, unsigned long p1,
- unsigned long p2, unsigned long p3)
-{
- register unsigned long __nr asm("1") = nr;
- register unsigned long __p1 asm("2") = p1;
- register unsigned long __p2 asm("3") = p2;
- register unsigned long __p3 asm("4") = p3;
- register long __rc asm("2");
-
- asm volatile ("diag 2,4,0x500\n"
- : "=d" (__rc) : "d" (__nr), "0" (__p1), "d" (__p2),
- "d" (__p3) : "memory", "cc");
- return __rc;
-}
-
-static inline long kvm_hypercall3(unsigned long nr, unsigned long p1,
- unsigned long p2, unsigned long p3)
-{
- diag_stat_inc(DIAG_STAT_X500);
- return __kvm_hypercall3(nr, p1, p2, p3);
-}
-
-static inline long __kvm_hypercall4(unsigned long nr, unsigned long p1,
- unsigned long p2, unsigned long p3,
- unsigned long p4)
-{
- register unsigned long __nr asm("1") = nr;
- register unsigned long __p1 asm("2") = p1;
- register unsigned long __p2 asm("3") = p2;
- register unsigned long __p3 asm("4") = p3;
- register unsigned long __p4 asm("5") = p4;
- register long __rc asm("2");
-
- asm volatile ("diag 2,4,0x500\n"
- : "=d" (__rc) : "d" (__nr), "0" (__p1), "d" (__p2),
- "d" (__p3), "d" (__p4) : "memory", "cc");
- return __rc;
-}
-
-static inline long kvm_hypercall4(unsigned long nr, unsigned long p1,
- unsigned long p2, unsigned long p3,
- unsigned long p4)
-{
- diag_stat_inc(DIAG_STAT_X500);
- return __kvm_hypercall4(nr, p1, p2, p3, p4);
-}
-
-static inline long __kvm_hypercall5(unsigned long nr, unsigned long p1,
- unsigned long p2, unsigned long p3,
- unsigned long p4, unsigned long p5)
-{
- register unsigned long __nr asm("1") = nr;
- register unsigned long __p1 asm("2") = p1;
- register unsigned long __p2 asm("3") = p2;
- register unsigned long __p3 asm("4") = p3;
- register unsigned long __p4 asm("5") = p4;
- register unsigned long __p5 asm("6") = p5;
- register long __rc asm("2");
-
- asm volatile ("diag 2,4,0x500\n"
- : "=d" (__rc) : "d" (__nr), "0" (__p1), "d" (__p2),
- "d" (__p3), "d" (__p4), "d" (__p5) : "memory", "cc");
- return __rc;
-}
-
-static inline long kvm_hypercall5(unsigned long nr, unsigned long p1,
- unsigned long p2, unsigned long p3,
- unsigned long p4, unsigned long p5)
-{
- diag_stat_inc(DIAG_STAT_X500);
- return __kvm_hypercall5(nr, p1, p2, p3, p4, p5);
-}
-
-static inline long __kvm_hypercall6(unsigned long nr, unsigned long p1,
- unsigned long p2, unsigned long p3,
- unsigned long p4, unsigned long p5,
- unsigned long p6)
-{
- register unsigned long __nr asm("1") = nr;
- register unsigned long __p1 asm("2") = p1;
- register unsigned long __p2 asm("3") = p2;
- register unsigned long __p3 asm("4") = p3;
- register unsigned long __p4 asm("5") = p4;
- register unsigned long __p5 asm("6") = p5;
- register unsigned long __p6 asm("7") = p6;
- register long __rc asm("2");
-
- asm volatile ("diag 2,4,0x500\n"
- : "=d" (__rc) : "d" (__nr), "0" (__p1), "d" (__p2),
- "d" (__p3), "d" (__p4), "d" (__p5), "d" (__p6)
- : "memory", "cc");
- return __rc;
-}
-
-static inline long kvm_hypercall6(unsigned long nr, unsigned long p1,
- unsigned long p2, unsigned long p3,
- unsigned long p4, unsigned long p5,
- unsigned long p6)
-{
- diag_stat_inc(DIAG_STAT_X500);
- return __kvm_hypercall6(nr, p1, p2, p3, p4, p5, p6);
-}
+#define HYPERCALL_FMT_0
+#define HYPERCALL_FMT_1 , "0" (r2)
+#define HYPERCALL_FMT_2 , "d" (r3) HYPERCALL_FMT_1
+#define HYPERCALL_FMT_3 , "d" (r4) HYPERCALL_FMT_2
+#define HYPERCALL_FMT_4 , "d" (r5) HYPERCALL_FMT_3
+#define HYPERCALL_FMT_5 , "d" (r6) HYPERCALL_FMT_4
+#define HYPERCALL_FMT_6 , "d" (r7) HYPERCALL_FMT_5
+
+#define HYPERCALL_PARM_0
+#define HYPERCALL_PARM_1 , unsigned long arg1
+#define HYPERCALL_PARM_2 HYPERCALL_PARM_1, unsigned long arg2
+#define HYPERCALL_PARM_3 HYPERCALL_PARM_2, unsigned long arg3
+#define HYPERCALL_PARM_4 HYPERCALL_PARM_3, unsigned long arg4
+#define HYPERCALL_PARM_5 HYPERCALL_PARM_4, unsigned long arg5
+#define HYPERCALL_PARM_6 HYPERCALL_PARM_5, unsigned long arg6
+
+#define HYPERCALL_REGS_0
+#define HYPERCALL_REGS_1 \
+ register unsigned long r2 asm("2") = arg1
+#define HYPERCALL_REGS_2 \
+ HYPERCALL_REGS_1; \
+ register unsigned long r3 asm("3") = arg2
+#define HYPERCALL_REGS_3 \
+ HYPERCALL_REGS_2; \
+ register unsigned long r4 asm("4") = arg3
+#define HYPERCALL_REGS_4 \
+ HYPERCALL_REGS_3; \
+ register unsigned long r5 asm("5") = arg4
+#define HYPERCALL_REGS_5 \
+ HYPERCALL_REGS_4; \
+ register unsigned long r6 asm("6") = arg5
+#define HYPERCALL_REGS_6 \
+ HYPERCALL_REGS_5; \
+ register unsigned long r7 asm("7") = arg6
+
+#define HYPERCALL_ARGS_0
+#define HYPERCALL_ARGS_1 , arg1
+#define HYPERCALL_ARGS_2 HYPERCALL_ARGS_1, arg2
+#define HYPERCALL_ARGS_3 HYPERCALL_ARGS_2, arg3
+#define HYPERCALL_ARGS_4 HYPERCALL_ARGS_3, arg4
+#define HYPERCALL_ARGS_5 HYPERCALL_ARGS_4, arg5
+#define HYPERCALL_ARGS_6 HYPERCALL_ARGS_5, arg6
+
+#define GENERATE_KVM_HYPERCALL_FUNC(args) \
+static inline \
+long __kvm_hypercall##args(unsigned long nr HYPERCALL_PARM_##args) \
+{ \
+ register unsigned long __nr asm("1") = nr; \
+ register long __rc asm("2"); \
+ HYPERCALL_REGS_##args; \
+ \
+ asm volatile ( \
+ " diag 2,4,0x500\n" \
+ : "=d" (__rc) \
+ : "d" (__nr) HYPERCALL_FMT_##args \
+ : "memory", "cc"); \
+ return __rc; \
+} \
+ \
+static inline \
+long kvm_hypercall##args(unsigned long nr HYPERCALL_PARM_##args) \
+{ \
+ diag_stat_inc(DIAG_STAT_X500); \
+ return __kvm_hypercall##args(nr HYPERCALL_ARGS_##args); \
+}
+
+GENERATE_KVM_HYPERCALL_FUNC(0)
+GENERATE_KVM_HYPERCALL_FUNC(1)
+GENERATE_KVM_HYPERCALL_FUNC(2)
+GENERATE_KVM_HYPERCALL_FUNC(3)
+GENERATE_KVM_HYPERCALL_FUNC(4)
+GENERATE_KVM_HYPERCALL_FUNC(5)
+GENERATE_KVM_HYPERCALL_FUNC(6)
/* kvm on s390 is always paravirtualization enabled */
static inline int kvm_para_available(void)
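
The GENERATE_KVM_HYPERCALL_FUNC() invocations replace the hand-written per-arity helpers with macro-generated ones. For reference, a sketch of roughly what the two-argument case expands to, reconstructed from the HYPERCALL_PARM/REGS/FMT/ARGS_2 definitions above (whitespace normalized):

/* Approximate expansion of GENERATE_KVM_HYPERCALL_FUNC(2), derived from
 * the HYPERCALL_* helper macros in this patch. */
static inline long __kvm_hypercall2(unsigned long nr, unsigned long arg1,
				    unsigned long arg2)
{
	register unsigned long __nr asm("1") = nr;
	register long __rc asm("2");
	register unsigned long r2 asm("2") = arg1;
	register unsigned long r3 asm("3") = arg2;

	asm volatile (
		"	diag	2,4,0x500\n"
		: "=d" (__rc)
		: "d" (__nr), "d" (r3), "0" (r2)
		: "memory", "cc");
	return __rc;
}

static inline long kvm_hypercall2(unsigned long nr, unsigned long arg1,
				  unsigned long arg2)
{
	diag_stat_inc(DIAG_STAT_X500);
	return __kvm_hypercall2(nr, arg1, arg2);
}
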
diff --git a/arch/s390/include/asm/linkage.h b/arch/s390/include/asm/linkage.h
index 7f22262b0e46..df3fb7d8227b 100644
--- a/arch/s390/include/asm/linkage.h
+++ b/arch/s390/include/asm/linkage.h
@@ -4,36 +4,7 @@
#include <linux/stringify.h>
-#define __ALIGN .align 4, 0x07
+#define __ALIGN .balign CONFIG_FUNCTION_ALIGNMENT, 0x07
#define __ALIGN_STR __stringify(__ALIGN)
-#ifndef __ASSEMBLY__
-
-/*
- * Helper macro for exception table entries
- */
-#define EX_TABLE(_fault, _target) \
- ".section __ex_table,\"a\"\n" \
- ".align 4\n" \
- ".long (" #_fault ") - .\n" \
- ".long (" #_target ") - .\n" \
- ".previous\n"
-
-#else /* __ASSEMBLY__ */
-
-#define EX_TABLE(_fault, _target) \
- .section __ex_table,"a" ; \
- .align 4 ; \
- .long (_fault) - . ; \
- .long (_target) - . ; \
- .previous
-
-#define EX_TABLE_DMA(_fault, _target) \
- .section .dma.ex_table, "a" ; \
- .align 4 ; \
- .long (_fault) - . ; \
- .long (_target) - . ; \
- .previous
-
-#endif /* __ASSEMBLY__ */
#endif
diff --git a/arch/s390/include/asm/livepatch.h b/arch/s390/include/asm/livepatch.h
deleted file mode 100644
index 818612b784cd..000000000000
--- a/arch/s390/include/asm/livepatch.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0+ */
-/*
- * livepatch.h - s390-specific Kernel Live Patching Core
- *
- * Copyright (c) 2013-2015 SUSE
- * Authors: Jiri Kosina
- * Vojtech Pavlik
- * Jiri Slaby
- */
-
-#ifndef ASM_LIVEPATCH_H
-#define ASM_LIVEPATCH_H
-
-#include <asm/ptrace.h>
-
-static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip)
-{
- regs->psw.addr = ip;
-}
-
-#endif
diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h
index 237ee0c4169f..5dc1b6345006 100644
--- a/arch/s390/include/asm/lowcore.h
+++ b/arch/s390/include/asm/lowcore.h
@@ -11,27 +11,46 @@
#include <linux/types.h>
#include <asm/ptrace.h>
+#include <asm/ctlreg.h>
#include <asm/cpu.h>
#include <asm/types.h>
#define LC_ORDER 1
#define LC_PAGES 2
+struct pgm_tdb {
+ u64 data[32];
+};
+
struct lowcore {
__u8 pad_0x0000[0x0014-0x0000]; /* 0x0000 */
__u32 ipl_parmblock_ptr; /* 0x0014 */
__u8 pad_0x0018[0x0080-0x0018]; /* 0x0018 */
__u32 ext_params; /* 0x0080 */
- __u16 ext_cpu_addr; /* 0x0084 */
- __u16 ext_int_code; /* 0x0086 */
- __u16 svc_ilc; /* 0x0088 */
- __u16 svc_code; /* 0x008a */
- __u16 pgm_ilc; /* 0x008c */
- __u16 pgm_code; /* 0x008e */
+ union {
+ struct {
+ __u16 ext_cpu_addr; /* 0x0084 */
+ __u16 ext_int_code; /* 0x0086 */
+ };
+ __u32 ext_int_code_addr;
+ };
+ __u32 svc_int_code; /* 0x0088 */
+ union {
+ struct {
+ __u16 pgm_ilc; /* 0x008c */
+ __u16 pgm_code; /* 0x008e */
+ };
+ __u32 pgm_int_code;
+ };
__u32 data_exc_code; /* 0x0090 */
__u16 mon_class_num; /* 0x0094 */
- __u8 per_code; /* 0x0096 */
- __u8 per_atmid; /* 0x0097 */
+ union {
+ struct {
+ __u8 per_code; /* 0x0096 */
+ __u8 per_atmid; /* 0x0097 */
+ };
+ __u16 per_code_combined;
+ };
__u64 per_address; /* 0x0098 */
__u8 exc_access_id; /* 0x00a0 */
__u8 per_access_id; /* 0x00a1 */
@@ -40,10 +59,15 @@ struct lowcore {
__u8 pad_0x00a4[0x00a8-0x00a4]; /* 0x00a4 */
__u64 trans_exc_code; /* 0x00a8 */
__u64 monitor_code; /* 0x00b0 */
- __u16 subchannel_id; /* 0x00b8 */
- __u16 subchannel_nr; /* 0x00ba */
- __u32 io_int_parm; /* 0x00bc */
- __u32 io_int_word; /* 0x00c0 */
+ union {
+ struct {
+ __u16 subchannel_id; /* 0x00b8 */
+ __u16 subchannel_nr; /* 0x00ba */
+ __u32 io_int_parm; /* 0x00bc */
+ __u32 io_int_word; /* 0x00c0 */
+ };
+ struct tpi_info tpi_info; /* 0x00b8 */
+ };
__u8 pad_0x00c4[0x00c8-0x00c4]; /* 0x00c4 */
__u32 stfl_fac_list; /* 0x00c8 */
__u8 pad_0x00cc[0x00e8-0x00cc]; /* 0x00cc */
@@ -52,7 +76,7 @@ struct lowcore {
__u32 external_damage_code; /* 0x00f4 */
__u64 failing_storage_address; /* 0x00f8 */
__u8 pad_0x0100[0x0110-0x0100]; /* 0x0100 */
- __u64 breaking_event_addr; /* 0x0110 */
+ __u64 pgm_last_break; /* 0x0110 */
__u8 pad_0x0118[0x0120-0x0118]; /* 0x0118 */
psw_t restart_old_psw; /* 0x0120 */
psw_t external_old_psw; /* 0x0130 */
@@ -80,9 +104,10 @@ struct lowcore {
psw_t return_psw; /* 0x0290 */
psw_t return_mcck_psw; /* 0x02a0 */
+ __u64 last_break; /* 0x02b0 */
+
/* CPU accounting and timing values. */
- __u64 sync_enter_timer; /* 0x02b0 */
- __u64 async_enter_timer; /* 0x02b8 */
+ __u64 sys_enter_timer; /* 0x02b8 */
__u64 mcck_enter_timer; /* 0x02c0 */
__u64 exit_timer; /* 0x02c8 */
__u64 user_timer; /* 0x02d0 */
@@ -94,8 +119,8 @@ struct lowcore {
__u64 avg_steal_timer; /* 0x0300 */
__u64 last_update_timer; /* 0x0308 */
__u64 last_update_clock; /* 0x0310 */
- __u64 int_clock; /* 0x0318*/
- __u64 mcck_clock; /* 0x0320 */
+ __u64 int_clock; /* 0x0318 */
+ __u8 pad_0x0320[0x0328-0x0320]; /* 0x0320 */
__u64 clock_comparator; /* 0x0328 */
__u64 boot_clock[2]; /* 0x0330 */
@@ -107,16 +132,16 @@ struct lowcore {
__u64 async_stack; /* 0x0350 */
__u64 nodat_stack; /* 0x0358 */
__u64 restart_stack; /* 0x0360 */
-
+ __u64 mcck_stack; /* 0x0368 */
/* Restart function and parameter. */
- __u64 restart_fn; /* 0x0368 */
- __u64 restart_data; /* 0x0370 */
- __u64 restart_source; /* 0x0378 */
+ __u64 restart_fn; /* 0x0370 */
+ __u64 restart_data; /* 0x0378 */
+ __u32 restart_source; /* 0x0380 */
+ __u32 restart_flags; /* 0x0384 */
/* Address space pointer. */
- __u64 kernel_asce; /* 0x0380 */
- __u64 user_asce; /* 0x0388 */
- __u64 vdso_asce; /* 0x0390 */
+ struct ctlreg kernel_asce; /* 0x0388 */
+ struct ctlreg user_asce; /* 0x0390 */
/*
* The lpp and current_pid fields form a
@@ -134,14 +159,14 @@ struct lowcore {
__u32 spinlock_index; /* 0x03b0 */
__u32 fpu_flags; /* 0x03b4 */
__u64 percpu_offset; /* 0x03b8 */
- __u64 vdso_per_cpu_data; /* 0x03c0 */
+ __u8 pad_0x03c0[0x03c8-0x03c0]; /* 0x03c0 */
__u64 machine_flags; /* 0x03c8 */
__u64 gmap; /* 0x03d0 */
__u8 pad_0x03d8[0x0400-0x03d8]; /* 0x03d8 */
- /* br %r1 trampoline */
- __u16 br_r1_trampoline; /* 0x0400 */
- __u8 pad_0x0402[0x0e00-0x0402]; /* 0x0402 */
+ __u32 return_lpswe; /* 0x0400 */
+ __u32 return_mcck_lpswe; /* 0x0404 */
+ __u8 pad_0x040a[0x0e00-0x0408]; /* 0x0408 */
/*
* 0xe00 contains the address of the IPL Parameter Information
@@ -153,12 +178,7 @@ struct lowcore {
__u64 vmcore_info; /* 0x0e0c */
__u8 pad_0x0e14[0x0e18-0x0e14]; /* 0x0e14 */
__u64 os_info; /* 0x0e18 */
- __u8 pad_0x0e20[0x0f00-0x0e20]; /* 0x0e20 */
-
- /* Extended facility list */
- __u64 stfle_fac_list[16]; /* 0x0f00 */
- __u64 alt_stfle_fac_list[16]; /* 0x0f80 */
- __u8 pad_0x1000[0x11b0-0x1000]; /* 0x1000 */
+ __u8 pad_0x0e20[0x11b0-0x0e20]; /* 0x0e20 */
/* Pointer to the machine check extended save area */
__u64 mcesad; /* 0x11b0 */
@@ -178,13 +198,18 @@ struct lowcore {
__u32 tod_progreg_save_area; /* 0x1324 */
__u32 cpu_timer_save_area[2]; /* 0x1328 */
__u32 clock_comp_save_area[2]; /* 0x1330 */
- __u8 pad_0x1338[0x1340-0x1338]; /* 0x1338 */
+ __u64 last_break_save_area; /* 0x1338 */
__u32 access_regs_save_area[16]; /* 0x1340 */
- __u64 cregs_save_area[16]; /* 0x1380 */
- __u8 pad_0x1400[0x1800-0x1400]; /* 0x1400 */
+ struct ctlreg cregs_save_area[16]; /* 0x1380 */
+ __u8 pad_0x1400[0x1500-0x1400]; /* 0x1400 */
+ /* Cryptography-counter designation */
+ __u64 ccd; /* 0x1500 */
+ /* AI-extension counter designation */
+ __u64 aicd; /* 0x1508 */
+ __u8 pad_0x1510[0x1800-0x1510]; /* 0x1510 */
/* Transaction abort diagnostic block */
- __u8 pgm_tdb[256]; /* 0x1800 */
+ struct pgm_tdb pgm_tdb; /* 0x1800 */
__u8 pad_0x1900[0x2000-0x1900]; /* 0x1900 */
} __packed __aligned(8192);
@@ -197,12 +222,4 @@ static inline void set_prefix(__u32 address)
asm volatile("spx %0" : : "Q" (address) : "memory");
}
-static inline __u32 store_prefix(void)
-{
- __u32 address;
-
- asm volatile("stpx %0" : "=Q" (address));
- return address;
-}
-
#endif /* _ASM_S390_LOWCORE_H */
diff --git a/arch/s390/include/asm/maccess.h b/arch/s390/include/asm/maccess.h
new file mode 100644
index 000000000000..50225940d971
--- /dev/null
+++ b/arch/s390/include/asm/maccess.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_S390_MACCESS_H
+#define __ASM_S390_MACCESS_H
+
+#include <linux/types.h>
+
+#define MEMCPY_REAL_SIZE PAGE_SIZE
+#define MEMCPY_REAL_MASK PAGE_MASK
+
+struct iov_iter;
+
+extern unsigned long __memcpy_real_area;
+extern pte_t *memcpy_real_ptep;
+size_t memcpy_real_iter(struct iov_iter *iter, unsigned long src, size_t count);
+int memcpy_real(void *dest, unsigned long src, size_t count);
+#ifdef CONFIG_CRASH_DUMP
+int copy_oldmem_kernel(void *dst, unsigned long src, size_t count);
+#endif
+
+#endif /* __ASM_S390_MACCESS_H */
diff --git a/arch/s390/include/asm/mem_detect.h b/arch/s390/include/asm/mem_detect.h
deleted file mode 100644
index a7c922a69050..000000000000
--- a/arch/s390/include/asm/mem_detect.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_S390_MEM_DETECT_H
-#define _ASM_S390_MEM_DETECT_H
-
-#include <linux/types.h>
-
-enum mem_info_source {
- MEM_DETECT_NONE = 0,
- MEM_DETECT_SCLP_STOR_INFO,
- MEM_DETECT_DIAG260,
- MEM_DETECT_SCLP_READ_INFO,
- MEM_DETECT_BIN_SEARCH
-};
-
-struct mem_detect_block {
- u64 start;
- u64 end;
-};
-
-/*
- * Storage element id is defined as 1 byte (up to 256 storage elements).
- * In practise only storage element id 0 and 1 are used).
- * According to architecture one storage element could have as much as
- * 1020 subincrements. 255 mem_detect_blocks are embedded in mem_detect_info.
- * If more mem_detect_blocks are required, a block of memory from already
- * known mem_detect_block is taken (entries_extended points to it).
- */
-#define MEM_INLINED_ENTRIES 255 /* (PAGE_SIZE - 16) / 16 */
-
-struct mem_detect_info {
- u32 count;
- u8 info_source;
- struct mem_detect_block entries[MEM_INLINED_ENTRIES];
- struct mem_detect_block *entries_extended;
-};
-extern struct mem_detect_info mem_detect;
-
-void add_mem_detect_block(u64 start, u64 end);
-
-static inline int __get_mem_detect_block(u32 n, unsigned long *start,
- unsigned long *end)
-{
- if (n >= mem_detect.count) {
- *start = 0;
- *end = 0;
- return -1;
- }
-
- if (n < MEM_INLINED_ENTRIES) {
- *start = (unsigned long)mem_detect.entries[n].start;
- *end = (unsigned long)mem_detect.entries[n].end;
- } else {
- *start = (unsigned long)mem_detect.entries_extended[n - MEM_INLINED_ENTRIES].start;
- *end = (unsigned long)mem_detect.entries_extended[n - MEM_INLINED_ENTRIES].end;
- }
- return 0;
-}
-
-/**
- * for_each_mem_detect_block - early online memory range iterator
- * @i: an integer used as loop variable
- * @p_start: ptr to unsigned long for start address of the range
- * @p_end: ptr to unsigned long for end address of the range
- *
- * Walks over detected online memory ranges.
- */
-#define for_each_mem_detect_block(i, p_start, p_end) \
- for (i = 0, __get_mem_detect_block(i, p_start, p_end); \
- i < mem_detect.count; \
- i++, __get_mem_detect_block(i, p_start, p_end))
-
-static inline void get_mem_detect_reserved(unsigned long *start,
- unsigned long *size)
-{
- *start = (unsigned long)mem_detect.entries_extended;
- if (mem_detect.count > MEM_INLINED_ENTRIES)
- *size = (mem_detect.count - MEM_INLINED_ENTRIES) * sizeof(struct mem_detect_block);
- else
- *size = 0;
-}
-
-static inline unsigned long get_mem_detect_end(void)
-{
- unsigned long start;
- unsigned long end;
-
- if (mem_detect.count) {
- __get_mem_detect_block(mem_detect.count - 1, &start, &end);
- return end;
- }
- return 0;
-}
-
-#endif
diff --git a/arch/s390/include/asm/mem_encrypt.h b/arch/s390/include/asm/mem_encrypt.h
index 2542cbf7e2d1..b85e13505a0f 100644
--- a/arch/s390/include/asm/mem_encrypt.h
+++ b/arch/s390/include/asm/mem_encrypt.h
@@ -4,10 +4,8 @@
#ifndef __ASSEMBLY__
-static inline bool mem_encrypt_active(void) { return false; }
-
-int set_memory_encrypted(unsigned long addr, int numpages);
-int set_memory_decrypted(unsigned long addr, int numpages);
+int set_memory_encrypted(unsigned long vaddr, int numpages);
+int set_memory_decrypted(unsigned long vaddr, int numpages);
#endif /* __ASSEMBLY__ */
diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h
index bcfb6371086f..bb1b4bef1878 100644
--- a/arch/s390/include/asm/mmu.h
+++ b/arch/s390/include/asm/mmu.h
@@ -4,18 +4,20 @@
#include <linux/cpumask.h>
#include <linux/errno.h>
+#include <asm/asm-extable.h>
typedef struct {
spinlock_t lock;
cpumask_t cpu_attach_mask;
atomic_t flush_count;
unsigned int flush_mm;
- struct list_head pgtable_list;
struct list_head gmap_list;
unsigned long gmap_asce;
unsigned long asce;
unsigned long asce_limit;
unsigned long vdso_base;
+ /* The mmu context belongs to a secure guest. */
+ atomic_t protected_count;
/*
* The following bitfields need a down_write on the mm
* semaphore when they are written to. As they are only
@@ -32,27 +34,10 @@ typedef struct {
unsigned int uses_cmm:1;
/* The gmaps associated with this context are allowed to use huge pages. */
unsigned int allow_gmap_hpage_1m:1;
- /* The mmu context is for compat task */
- unsigned int compat_mm:1;
} mm_context_t;
#define INIT_MM_CONTEXT(name) \
.context.lock = __SPIN_LOCK_UNLOCKED(name.context.lock), \
- .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \
.context.gmap_list = LIST_HEAD_INIT(name.context.gmap_list),
-static inline int tprot(unsigned long addr)
-{
- int rc = -EFAULT;
-
- asm volatile(
- " tprot 0(%1),0\n"
- "0: ipm %0\n"
- " srl %0,28\n"
- "1:\n"
- EX_TABLE(0b,1b)
- : "+d" (rc) : "a" (addr) : "cc");
- return rc;
-}
-
#endif
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index 8d04e6f3f796..929af18b0908 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -12,20 +12,22 @@
#include <linux/uaccess.h>
#include <linux/mm_types.h>
#include <asm/tlbflush.h>
-#include <asm/ctl_reg.h>
+#include <asm/ctlreg.h>
#include <asm-generic/mm_hooks.h>
+#define init_new_context init_new_context
static inline int init_new_context(struct task_struct *tsk,
struct mm_struct *mm)
{
+ unsigned long asce_type, init_entry;
+
spin_lock_init(&mm->context.lock);
- INIT_LIST_HEAD(&mm->context.pgtable_list);
INIT_LIST_HEAD(&mm->context.gmap_list);
cpumask_clear(&mm->context.cpu_attach_mask);
atomic_set(&mm->context.flush_count, 0);
+ atomic_set(&mm->context.protected_count, 0);
mm->context.gmap_asce = 0;
mm->context.flush_mm = 0;
- mm->context.compat_mm = test_thread_flag(TIF_31BIT);
#ifdef CONFIG_PGSTE
mm->context.alloc_pgste = page_table_allocate_pgste ||
test_thread_flag(TIF_PGSTE) ||
@@ -36,73 +38,62 @@ static inline int init_new_context(struct task_struct *tsk,
mm->context.allow_gmap_hpage_1m = 0;
#endif
switch (mm->context.asce_limit) {
- case _REGION2_SIZE:
+ default:
/*
- * forked 3-level task, fall through to set new asce with new
- * mm->pgd
+ * context created by exec, the value of asce_limit can
+ * only be zero in this case
*/
- case 0:
- /* context created by exec, set asce limit to 4TB */
- mm->context.asce_limit = STACK_TOP_MAX;
- mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
- _ASCE_USER_BITS | _ASCE_TYPE_REGION3;
+ VM_BUG_ON(mm->context.asce_limit);
+ /* continue as 3-level task */
+ mm->context.asce_limit = _REGION2_SIZE;
+ fallthrough;
+ case _REGION2_SIZE:
+ /* forked 3-level task */
+ init_entry = _REGION3_ENTRY_EMPTY;
+ asce_type = _ASCE_TYPE_REGION3;
break;
- case -PAGE_SIZE:
- /* forked 5-level task, set new asce with new_mm->pgd */
- mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
- _ASCE_USER_BITS | _ASCE_TYPE_REGION1;
+ case TASK_SIZE_MAX:
+ /* forked 5-level task */
+ init_entry = _REGION1_ENTRY_EMPTY;
+ asce_type = _ASCE_TYPE_REGION1;
break;
case _REGION1_SIZE:
- /* forked 4-level task, set new asce with new mm->pgd */
- mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
- _ASCE_USER_BITS | _ASCE_TYPE_REGION2;
+ /* forked 4-level task */
+ init_entry = _REGION2_ENTRY_EMPTY;
+ asce_type = _ASCE_TYPE_REGION2;
break;
- case _REGION3_SIZE:
- /* forked 2-level compat task, set new asce with new mm->pgd */
- mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
- _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT;
}
- crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
+ mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
+ _ASCE_USER_BITS | asce_type;
+ crst_table_init((unsigned long *) mm->pgd, init_entry);
return 0;
}
-#define destroy_context(mm) do { } while (0)
-
-static inline void set_user_asce(struct mm_struct *mm)
+static inline void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
{
- S390_lowcore.user_asce = mm->context.asce;
- __ctl_load(S390_lowcore.user_asce, 1, 1);
- clear_cpu_flag(CIF_ASCE_PRIMARY);
-}
+ int cpu = smp_processor_id();
-static inline void clear_user_asce(void)
-{
- S390_lowcore.user_asce = S390_lowcore.kernel_asce;
- __ctl_load(S390_lowcore.kernel_asce, 1, 1);
- set_cpu_flag(CIF_ASCE_PRIMARY);
+ if (next == &init_mm)
+ S390_lowcore.user_asce = s390_invalid_asce;
+ else
+ S390_lowcore.user_asce.val = next->context.asce;
+ cpumask_set_cpu(cpu, &next->context.cpu_attach_mask);
+ /* Clear previous user-ASCE from CR7 */
+ local_ctl_load(7, &s390_invalid_asce);
+ if (prev != next)
+ cpumask_clear_cpu(cpu, &prev->context.cpu_attach_mask);
}
-
-mm_segment_t enable_sacf_uaccess(void);
-void disable_sacf_uaccess(mm_segment_t old_fs);
+#define switch_mm_irqs_off switch_mm_irqs_off
static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
- int cpu = smp_processor_id();
+ unsigned long flags;
- S390_lowcore.user_asce = next->context.asce;
- cpumask_set_cpu(cpu, &next->context.cpu_attach_mask);
- /* Clear previous user-ASCE from CR1 and CR7 */
- if (!test_cpu_flag(CIF_ASCE_PRIMARY)) {
- __ctl_load(S390_lowcore.kernel_asce, 1, 1);
- set_cpu_flag(CIF_ASCE_PRIMARY);
- }
- if (test_cpu_flag(CIF_ASCE_SECONDARY)) {
- __ctl_load(S390_lowcore.vdso_asce, 7, 7);
- clear_cpu_flag(CIF_ASCE_SECONDARY);
- }
- if (prev != next)
- cpumask_clear_cpu(cpu, &prev->context.cpu_attach_mask);
+ local_irq_save(flags);
+ switch_mm_irqs_off(prev, next, tsk);
+ local_irq_restore(flags);
}
#define finish_arch_post_lock_switch finish_arch_post_lock_switch
@@ -119,18 +110,18 @@ static inline void finish_arch_post_lock_switch(void)
__tlb_flush_mm_lazy(mm);
preempt_enable();
}
- set_fs(current->thread.mm_segment);
+ local_ctl_load(7, &S390_lowcore.user_asce);
}
-#define enter_lazy_tlb(mm,tsk) do { } while (0)
-#define deactivate_mm(tsk,mm) do { } while (0)
-
+#define activate_mm activate_mm
static inline void activate_mm(struct mm_struct *prev,
struct mm_struct *next)
{
switch_mm(prev, next, current);
cpumask_set_cpu(smp_processor_id(), mm_cpumask(next));
- set_user_asce(next);
+ local_ctl_load(7, &S390_lowcore.user_asce);
}
+#include <asm-generic/mmu_context.h>
+
#endif /* __S390_MMU_CONTEXT_H */
diff --git a/arch/s390/include/asm/module.h b/arch/s390/include/asm/module.h
index e0a6d29846e2..9f1eea15872c 100644
--- a/arch/s390/include/asm/module.h
+++ b/arch/s390/include/asm/module.h
@@ -8,16 +8,14 @@
* This file contains the s390 architecture specific module code.
*/
-struct mod_arch_syminfo
-{
+struct mod_arch_syminfo {
unsigned long got_offset;
unsigned long plt_offset;
int got_initialized;
int plt_initialized;
};
-struct mod_arch_specific
-{
+struct mod_arch_specific {
/* Starting offset of got in the module core memory. */
unsigned long got_offset;
/* Starting offset of plt in the module core memory. */
@@ -30,6 +28,14 @@ struct mod_arch_specific
int nsyms;
/* Additional symbol information (got and plt offsets). */
struct mod_arch_syminfo *syminfo;
+#ifdef CONFIG_FUNCTION_TRACER
+ /* Start of memory reserved for ftrace hotpatch trampolines. */
+ struct ftrace_hotpatch_trampoline *trampolines_start;
+ /* End of memory reserved for ftrace hotpatch trampolines. */
+ struct ftrace_hotpatch_trampoline *trampolines_end;
+ /* Next unused ftrace hotpatch trampoline slot. */
+ struct ftrace_hotpatch_trampoline *next_trampoline;
+#endif /* CONFIG_FUNCTION_TRACER */
};
#endif /* _ASM_S390_MODULE_H */
diff --git a/arch/s390/include/asm/msi.h b/arch/s390/include/asm/msi.h
new file mode 100644
index 000000000000..399343ed9ffb
--- /dev/null
+++ b/arch/s390/include/asm/msi.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_S390_MSI_H
+#define _ASM_S390_MSI_H
+#include <asm-generic/msi.h>
+
+/*
+ * Work around S390 not using irq_domain at all so we can't set
+ * IRQ_DOMAIN_FLAG_ISOLATED_MSI. See for an explanation how it works:
+ *
+ * https://lore.kernel.org/r/31af8174-35e9-ebeb-b9ef-74c90d4bfd93@linux.ibm.com/
+ *
+ * Note this is less isolated than the ARM/x86 versions as userspace can trigger
+ * MSI belonging to kernel devices within the same gisa.
+ */
+#define arch_is_isolated_msi() true
+
+#endif
diff --git a/arch/s390/include/asm/nmi.h b/arch/s390/include/asm/nmi.h
index b160da8fa14b..227466ce9e41 100644
--- a/arch/s390/include/asm/nmi.h
+++ b/arch/s390/include/asm/nmi.h
@@ -6,7 +6,6 @@
* Author(s): Ingo Adlung <adlung@de.ibm.com>,
* Martin Schwidefsky <schwidefsky@de.ibm.com>,
* Cornelia Huck <cornelia.huck@de.ibm.com>,
- * Heiko Carstens <heiko.carstens@de.ibm.com>,
*/
#ifndef _ASM_S390_NMI_H
@@ -23,12 +22,16 @@
#define MCCK_CODE_SYSTEM_DAMAGE BIT(63)
#define MCCK_CODE_EXT_DAMAGE BIT(63 - 5)
#define MCCK_CODE_CP BIT(63 - 9)
-#define MCCK_CODE_CPU_TIMER_VALID BIT(63 - 46)
+#define MCCK_CODE_STG_ERROR BIT(63 - 16)
+#define MCCK_CODE_STG_KEY_ERROR BIT(63 - 18)
+#define MCCK_CODE_STG_DEGRAD BIT(63 - 19)
#define MCCK_CODE_PSW_MWP_VALID BIT(63 - 20)
#define MCCK_CODE_PSW_IA_VALID BIT(63 - 23)
+#define MCCK_CODE_STG_FAIL_ADDR BIT(63 - 24)
#define MCCK_CODE_CR_VALID BIT(63 - 29)
#define MCCK_CODE_GS_VALID BIT(63 - 36)
#define MCCK_CODE_FC_VALID BIT(63 - 43)
+#define MCCK_CODE_CPU_TIMER_VALID BIT(63 - 46)
#ifndef __ASSEMBLY__
@@ -94,9 +97,9 @@ struct mcesa {
struct pt_regs;
-void nmi_alloc_boot_cpu(struct lowcore *lc);
-int nmi_alloc_per_cpu(struct lowcore *lc);
-void nmi_free_per_cpu(struct lowcore *lc);
+void nmi_alloc_mcesa_early(u64 *mcesad);
+int nmi_alloc_mcesa(u64 *mcesad);
+void nmi_free_mcesa(u64 *mcesad);
void s390_handle_mcck(void);
void s390_do_machine_check(struct pt_regs *regs);
diff --git a/arch/s390/include/asm/nospec-branch.h b/arch/s390/include/asm/nospec-branch.h
index b4bd8c41e9d3..82725cf783c7 100644
--- a/arch/s390/include/asm/nospec-branch.h
+++ b/arch/s390/include/asm/nospec-branch.h
@@ -12,6 +12,11 @@ void nospec_init_branches(void);
void nospec_auto_detect(void);
void nospec_revert(s32 *start, s32 *end);
+static inline bool nospec_uses_trampoline(void)
+{
+ return __is_defined(CC_USING_EXPOLINE) && !nospec_disable;
+}
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_S390_EXPOLINE_H */
diff --git a/arch/s390/include/asm/nospec-insn.h b/arch/s390/include/asm/nospec-insn.h
index 0033dcd663b1..7a946c42ad13 100644
--- a/arch/s390/include/asm/nospec-insn.h
+++ b/arch/s390/include/asm/nospec-insn.h
@@ -2,23 +2,25 @@
#ifndef _ASM_S390_NOSPEC_ASM_H
#define _ASM_S390_NOSPEC_ASM_H
-#include <asm/alternative-asm.h>
-#include <asm/asm-offsets.h>
+#include <linux/linkage.h>
#include <asm/dwarf.h>
#ifdef __ASSEMBLY__
#ifdef CC_USING_EXPOLINE
-_LC_BR_R1 = __LC_BR_R1
-
/*
* The expoline macros are used to create thunks in the same format
* as gcc generates them. The 'comdat' section flag makes sure that
* the various thunks are merged into a single copy.
*/
.macro __THUNK_PROLOG_NAME name
+#ifdef CONFIG_EXPOLINE_EXTERN
+ .pushsection .text,"ax",@progbits
+ __ALIGN
+#else
.pushsection .text.\name,"axG",@progbits,\name,comdat
+#endif
.globl \name
.hidden \name
.type \name,@function
@@ -26,167 +28,101 @@ _LC_BR_R1 = __LC_BR_R1
CFI_STARTPROC
.endm
- .macro __THUNK_EPILOG
+ .macro __THUNK_EPILOG_NAME name
CFI_ENDPROC
+#ifdef CONFIG_EXPOLINE_EXTERN
+ .size \name, .-\name
+#endif
.popsection
.endm
- .macro __THUNK_PROLOG_BR r1,r2
- __THUNK_PROLOG_NAME __s390_indirect_jump_r\r2\()use_r\r1
- .endm
-
- .macro __THUNK_PROLOG_BC d0,r1,r2
- __THUNK_PROLOG_NAME __s390_indirect_branch_\d0\()_\r2\()use_\r1
+ .macro __THUNK_PROLOG_BR r1
+ __THUNK_PROLOG_NAME __s390_indirect_jump_r\r1
.endm
- .macro __THUNK_BR r1,r2
- jg __s390_indirect_jump_r\r2\()use_r\r1
+ .macro __THUNK_EPILOG_BR r1
+ __THUNK_EPILOG_NAME __s390_indirect_jump_r\r1
.endm
- .macro __THUNK_BC d0,r1,r2
- jg __s390_indirect_branch_\d0\()_\r2\()use_\r1
+ .macro __THUNK_BR r1
+ jg __s390_indirect_jump_r\r1
.endm
- .macro __THUNK_BRASL r1,r2,r3
- brasl \r1,__s390_indirect_jump_r\r3\()use_r\r2
+ .macro __THUNK_BRASL r1,r2
+ brasl \r1,__s390_indirect_jump_r\r2
.endm
- .macro __DECODE_RR expand,reg,ruse
- .set __decode_fail,1
+ .macro __DECODE_R expand,reg
+ .set .L__decode_fail,1
.irp r1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.ifc \reg,%r\r1
- .irp r2,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
- .ifc \ruse,%r\r2
- \expand \r1,\r2
- .set __decode_fail,0
- .endif
- .endr
+ \expand \r1
+ .set .L__decode_fail,0
.endif
.endr
- .if __decode_fail == 1
- .error "__DECODE_RR failed"
+ .if .L__decode_fail == 1
+ .error "__DECODE_R failed"
.endif
.endm
- .macro __DECODE_RRR expand,rsave,rtarget,ruse
- .set __decode_fail,1
+ .macro __DECODE_RR expand,rsave,rtarget
+ .set .L__decode_fail,1
.irp r1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.ifc \rsave,%r\r1
.irp r2,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.ifc \rtarget,%r\r2
- .irp r3,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
- .ifc \ruse,%r\r3
- \expand \r1,\r2,\r3
- .set __decode_fail,0
- .endif
- .endr
- .endif
- .endr
- .endif
- .endr
- .if __decode_fail == 1
- .error "__DECODE_RRR failed"
- .endif
- .endm
-
- .macro __DECODE_DRR expand,disp,reg,ruse
- .set __decode_fail,1
- .irp r1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
- .ifc \reg,%r\r1
- .irp r2,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
- .ifc \ruse,%r\r2
- \expand \disp,\r1,\r2
- .set __decode_fail,0
+ \expand \r1,\r2
+ .set .L__decode_fail,0
.endif
.endr
.endif
.endr
- .if __decode_fail == 1
- .error "__DECODE_DRR failed"
+ .if .L__decode_fail == 1
+ .error "__DECODE_RR failed"
.endif
.endm
- .macro __THUNK_EX_BR reg,ruse
- # Be very careful when adding instructions to this macro!
- # The ALTERNATIVE replacement code has a .+10 which targets
- # the "br \reg" after the code has been patched.
-#ifdef CONFIG_HAVE_MARCH_Z10_FEATURES
+ .macro __THUNK_EX_BR reg
exrl 0,555f
j .
-#else
- .ifc \reg,%r1
- ALTERNATIVE "ex %r0,_LC_BR_R1", ".insn ril,0xc60000000000,0,.+10", 35
- j .
- .else
- larl \ruse,555f
- ex 0,0(\ruse)
- j .
- .endif
-#endif
555: br \reg
.endm
- .macro __THUNK_EX_BC disp,reg,ruse
-#ifdef CONFIG_HAVE_MARCH_Z10_FEATURES
- exrl 0,556f
- j .
+#ifdef CONFIG_EXPOLINE_EXTERN
+ .macro GEN_BR_THUNK reg
+ .endm
+ .macro GEN_BR_THUNK_EXTERN reg
#else
- larl \ruse,556f
- ex 0,0(\ruse)
- j .
+ .macro GEN_BR_THUNK reg
#endif
-556: b \disp(\reg)
- .endm
-
- .macro GEN_BR_THUNK reg,ruse=%r1
- __DECODE_RR __THUNK_PROLOG_BR,\reg,\ruse
- __THUNK_EX_BR \reg,\ruse
- __THUNK_EPILOG
- .endm
-
- .macro GEN_B_THUNK disp,reg,ruse=%r1
- __DECODE_DRR __THUNK_PROLOG_BC,\disp,\reg,\ruse
- __THUNK_EX_BC \disp,\reg,\ruse
- __THUNK_EPILOG
+ __DECODE_R __THUNK_PROLOG_BR,\reg
+ __THUNK_EX_BR \reg
+ __DECODE_R __THUNK_EPILOG_BR,\reg
.endm
- .macro BR_EX reg,ruse=%r1
-557: __DECODE_RR __THUNK_BR,\reg,\ruse
+ .macro BR_EX reg
+557: __DECODE_R __THUNK_BR,\reg
.pushsection .s390_indirect_branches,"a",@progbits
.long 557b-.
.popsection
.endm
- .macro B_EX disp,reg,ruse=%r1
-558: __DECODE_DRR __THUNK_BC,\disp,\reg,\ruse
- .pushsection .s390_indirect_branches,"a",@progbits
- .long 558b-.
- .popsection
- .endm
-
- .macro BASR_EX rsave,rtarget,ruse=%r1
-559: __DECODE_RRR __THUNK_BRASL,\rsave,\rtarget,\ruse
+ .macro BASR_EX rsave,rtarget
+559: __DECODE_RR __THUNK_BRASL,\rsave,\rtarget
.pushsection .s390_indirect_branches,"a",@progbits
.long 559b-.
.popsection
.endm
#else
- .macro GEN_BR_THUNK reg,ruse=%r1
- .endm
-
- .macro GEN_B_THUNK disp,reg,ruse=%r1
+ .macro GEN_BR_THUNK reg
.endm
- .macro BR_EX reg,ruse=%r1
+ .macro BR_EX reg
br \reg
.endm
- .macro B_EX disp,reg,ruse=%r1
- b \disp(\reg)
- .endm
-
- .macro BASR_EX rsave,rtarget,ruse=%r1
+ .macro BASR_EX rsave,rtarget
basr \rsave,\rtarget
.endm
#endif /* CC_USING_EXPOLINE */
diff --git a/arch/s390/include/asm/numa.h b/arch/s390/include/asm/numa.h
index 35f8cbe7e5bb..23cd5d1b734b 100644
--- a/arch/s390/include/asm/numa.h
+++ b/arch/s390/include/asm/numa.h
@@ -13,24 +13,13 @@
#ifdef CONFIG_NUMA
#include <linux/numa.h>
-#include <linux/cpumask.h>
void numa_setup(void);
-int numa_pfn_to_nid(unsigned long pfn);
-int __node_distance(int a, int b);
-void numa_update_cpu_topology(void);
-
-extern cpumask_t node_to_cpumask_map[MAX_NUMNODES];
-extern int numa_debug_enabled;
#else
static inline void numa_setup(void) { }
-static inline void numa_update_cpu_topology(void) { }
-static inline int numa_pfn_to_nid(unsigned long pfn)
-{
- return 0;
-}
#endif /* CONFIG_NUMA */
+
#endif /* _ASM_S390_NUMA_H */
diff --git a/arch/s390/include/asm/os_info.h b/arch/s390/include/asm/os_info.h
index 3c89279d2a4b..a4d2e103f116 100644
--- a/arch/s390/include/asm/os_info.h
+++ b/arch/s390/include/asm/os_info.h
@@ -8,12 +8,17 @@
#ifndef _ASM_S390_OS_INFO_H
#define _ASM_S390_OS_INFO_H
+#include <linux/uio.h>
+
#define OS_INFO_VERSION_MAJOR 1
#define OS_INFO_VERSION_MINOR 1
#define OS_INFO_MAGIC 0x4f53494e464f535aULL /* OSINFOSZ */
#define OS_INFO_VMCOREINFO 0
#define OS_INFO_REIPL_BLOCK 1
+#define OS_INFO_FLAGS_ENTRY 2
+
+#define OS_INFO_FLAG_REIPL_CLEAR (1UL << 0)
struct os_info_entry {
u64 addr;
@@ -28,8 +33,8 @@ struct os_info {
u16 version_minor;
u64 crashkernel_addr;
u64 crashkernel_size;
- struct os_info_entry entry[2];
- u8 reserved[4024];
+ struct os_info_entry entry[3];
+ u8 reserved[4004];
} __packed;
void os_info_init(void);
@@ -39,7 +44,6 @@ u32 os_info_csum(struct os_info *os_info);
#ifdef CONFIG_CRASH_DUMP
void *os_info_old_entry(int nr, unsigned long *size);
-int copy_oldmem_kernel(void *dst, void *src, size_t count);
#else
static inline void *os_info_old_entry(int nr, unsigned long *size)
{
diff --git a/arch/s390/include/asm/page-states.h b/arch/s390/include/asm/page-states.h
index c33c4deb545f..08fcbd628120 100644
--- a/arch/s390/include/asm/page-states.h
+++ b/arch/s390/include/asm/page-states.h
@@ -7,6 +7,9 @@
#ifndef PAGE_STATES_H
#define PAGE_STATES_H
+#include <asm/sections.h>
+#include <asm/page.h>
+
#define ESSA_GET_STATE 0
#define ESSA_SET_STABLE 1
#define ESSA_SET_UNUSED 2
@@ -18,4 +21,60 @@
#define ESSA_MAX ESSA_SET_STABLE_NODAT
+extern int __bootdata_preserved(cmma_flag);
+
+static __always_inline unsigned long essa(unsigned long paddr, unsigned char cmd)
+{
+ unsigned long rc;
+
+ asm volatile(
+ " .insn rrf,0xb9ab0000,%[rc],%[paddr],%[cmd],0"
+ : [rc] "=d" (rc)
+ : [paddr] "d" (paddr),
+ [cmd] "i" (cmd));
+ return rc;
+}
+
+static __always_inline void __set_page_state(void *addr, unsigned long num_pages, unsigned char cmd)
+{
+ unsigned long paddr = __pa(addr) & PAGE_MASK;
+
+ while (num_pages--) {
+ essa(paddr, cmd);
+ paddr += PAGE_SIZE;
+ }
+}
+
+static inline void __set_page_unused(void *addr, unsigned long num_pages)
+{
+ __set_page_state(addr, num_pages, ESSA_SET_UNUSED);
+}
+
+static inline void __set_page_stable_dat(void *addr, unsigned long num_pages)
+{
+ __set_page_state(addr, num_pages, ESSA_SET_STABLE);
+}
+
+static inline void __set_page_stable_nodat(void *addr, unsigned long num_pages)
+{
+ __set_page_state(addr, num_pages, ESSA_SET_STABLE_NODAT);
+}
+
+static inline void __arch_set_page_nodat(void *addr, unsigned long num_pages)
+{
+ if (!cmma_flag)
+ return;
+ if (cmma_flag < 2)
+ __set_page_stable_dat(addr, num_pages);
+ else
+ __set_page_stable_nodat(addr, num_pages);
+}
+
+static inline void __arch_set_page_dat(void *addr, unsigned long num_pages)
+{
+ if (!cmma_flag)
+ return;
+ __set_page_stable_dat(addr, num_pages);
+}
+
#endif
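/*
 * Editor's note, not part of the patch above: a minimal sketch of how the
 * new inline helpers in page-states.h could be used to mark a freshly
 * allocated kernel page as stable for DAT access. The function name and
 * the single-page count are assumptions made for illustration only.
 */
#include <asm/page-states.h>

static void example_mark_page_stable(void *page_addr)
{
	/* no-op when CMMA is disabled (cmma_flag == 0) */
	__arch_set_page_dat(page_addr, 1);
}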
diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
index a4d38092530a..73b9c3bf377f 100644
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -19,7 +19,9 @@
#define PAGE_SHIFT _PAGE_SHIFT
#define PAGE_SIZE _PAGE_SIZE
#define PAGE_MASK _PAGE_MASK
-#define PAGE_DEFAULT_ACC 0
+#define PAGE_DEFAULT_ACC _AC(0, UL)
+/* storage-protection override */
+#define PAGE_SPO_ACC 9
#define PAGE_DEFAULT_KEY (PAGE_DEFAULT_ACC << 4)
#define HPAGE_SHIFT 20
@@ -33,6 +35,8 @@
#define ARCH_HAS_PREPARE_HUGEPAGE
#define ARCH_HAS_HUGEPAGE_CLEAR_FLUSH
+#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
+
#include <asm/setup.h>
#ifndef __ASSEMBLY__
@@ -40,7 +44,7 @@ void __storage_key_init_range(unsigned long start, unsigned long end);
static inline void storage_key_init_range(unsigned long start, unsigned long end)
{
- if (PAGE_DEFAULT_KEY)
+ if (PAGE_DEFAULT_KEY != 0)
__storage_key_init_range(start, end);
}
@@ -53,22 +57,24 @@ static inline void storage_key_init_range(unsigned long start, unsigned long end
*/
static inline void copy_page(void *to, void *from)
{
- register void *reg2 asm ("2") = to;
- register unsigned long reg3 asm ("3") = 0x1000;
- register void *reg4 asm ("4") = from;
- register unsigned long reg5 asm ("5") = 0xb0001000;
+ union register_pair dst, src;
+
+ dst.even = (unsigned long) to;
+ dst.odd = 0x1000;
+ src.even = (unsigned long) from;
+ src.odd = 0xb0001000;
+
asm volatile(
- " mvcl 2,4"
- : "+d" (reg2), "+d" (reg3), "+d" (reg4), "+d" (reg5)
+ " mvcl %[dst],%[src]"
+ : [dst] "+&d" (dst.pair), [src] "+&d" (src.pair)
: : "memory", "cc");
}
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
-#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
- alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
-#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+#define vma_alloc_zeroed_movable_folio(vma, vaddr) \
+ vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false)
/*
* These are used to make use of C type-checking..
@@ -85,11 +91,31 @@ typedef pte_t *pgtable_t;
#define pgprot_val(x) ((x).pgprot)
#define pgste_val(x) ((x).pgste)
-#define pte_val(x) ((x).pte)
-#define pmd_val(x) ((x).pmd)
-#define pud_val(x) ((x).pud)
-#define p4d_val(x) ((x).p4d)
-#define pgd_val(x) ((x).pgd)
+
+static inline unsigned long pte_val(pte_t pte)
+{
+ return pte.pte;
+}
+
+static inline unsigned long pmd_val(pmd_t pmd)
+{
+ return pmd.pmd;
+}
+
+static inline unsigned long pud_val(pud_t pud)
+{
+ return pud.pud;
+}
+
+static inline unsigned long p4d_val(p4d_t p4d)
+{
+ return p4d.p4d;
+}
+
+static inline unsigned long pgd_val(pgd_t pgd)
+{
+ return pgd.pgd;
+}
#define __pgste(x) ((pgste_t) { (x) } )
#define __pte(x) ((pte_t) { (x) } )
@@ -138,10 +164,6 @@ static inline int page_reset_referenced(unsigned long addr)
struct page;
void arch_free_page(struct page *page, int order);
void arch_alloc_page(struct page *page, int order);
-void arch_set_page_dat(struct page *page, int order);
-void arch_set_page_nodat(struct page *page, int order);
-int arch_test_page_nodat(struct page *page);
-void arch_set_page_states(int make_stable);
static inline int devmem_is_allowed(unsigned long pfn)
{
@@ -151,7 +173,10 @@ static inline int devmem_is_allowed(unsigned long pfn)
#define HAVE_ARCH_FREE_PAGE
#define HAVE_ARCH_ALLOC_PAGE
-#endif /* !__ASSEMBLY__ */
+#if IS_ENABLED(CONFIG_PGSTE)
+int arch_make_page_accessible(struct page *page);
+#define HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
+#endif
#define __PAGE_OFFSET 0x0UL
#define PAGE_OFFSET 0x0UL
@@ -159,23 +184,32 @@ static inline int devmem_is_allowed(unsigned long pfn)
#define __pa(x) ((unsigned long)(x))
#define __va(x) ((void *)(unsigned long)(x))
-#define virt_to_pfn(kaddr) (__pa(kaddr) >> PAGE_SHIFT)
-#define pfn_to_virt(pfn) __va((pfn) << PAGE_SHIFT)
+#define phys_to_pfn(phys) ((phys) >> PAGE_SHIFT)
+#define pfn_to_phys(pfn) ((pfn) << PAGE_SHIFT)
+
+#define phys_to_page(phys) pfn_to_page(phys_to_pfn(phys))
+#define page_to_phys(page) pfn_to_phys(page_to_pfn(page))
+
+static inline void *pfn_to_virt(unsigned long pfn)
+{
+ return __va(pfn_to_phys(pfn));
+}
+
+static inline unsigned long virt_to_pfn(const void *kaddr)
+{
+ return phys_to_pfn(__pa(kaddr));
+}
+
#define pfn_to_kaddr(pfn) pfn_to_virt(pfn)
#define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr))
#define page_to_virt(page) pfn_to_virt(page_to_pfn(page))
-#define phys_to_pfn(kaddr) ((kaddr) >> PAGE_SHIFT)
-#define pfn_to_phys(pfn) ((pfn) << PAGE_SHIFT)
-
-#define phys_to_page(kaddr) pfn_to_page(phys_to_pfn(kaddr))
-#define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT)
+#define virt_addr_valid(kaddr) pfn_valid(virt_to_pfn(kaddr))
-#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
+#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC
-#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | \
- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+#endif /* !__ASSEMBLY__ */
#include <asm-generic/memory_model.h>
#include <asm-generic/getorder.h>
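/*
 * Editor's note, not part of the patch above: the former virt_to_pfn()/
 * pfn_to_virt() macros are now type-checked inline functions. A minimal
 * sketch of the round trip they provide; "buf" and the helper name are
 * assumptions made for the example.
 */
#include <asm/page.h>

static bool example_pfn_roundtrip(void *buf)
{
	unsigned long pfn = virt_to_pfn(buf);

	/* converting back yields the page-aligned virtual address */
	return pfn_to_virt(pfn) == (void *)((unsigned long)buf & PAGE_MASK);
}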
diff --git a/arch/s390/include/asm/pai.h b/arch/s390/include/asm/pai.h
new file mode 100644
index 000000000000..7d1888e3dee6
--- /dev/null
+++ b/arch/s390/include/asm/pai.h
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Processor Activity Instrumentation support for cryptography counters
+ *
+ * Copyright IBM Corp. 2022
+ * Author(s): Thomas Richter <tmricht@linux.ibm.com>
+ */
+#ifndef _ASM_S390_PAI_H
+#define _ASM_S390_PAI_H
+
+#include <linux/jump_label.h>
+#include <asm/lowcore.h>
+#include <asm/ptrace.h>
+
+struct qpaci_info_block {
+ u64 header;
+ struct {
+ u64 : 8;
+ u64 num_cc : 8; /* # of supported crypto counters */
+ u64 : 9;
+ u64 num_nnpa : 7; /* # of supported NNPA counters */
+ u64 : 32;
+ };
+};
+
+static inline int qpaci(struct qpaci_info_block *info)
+{
+ /* Size of info (in double words minus one) */
+ size_t size = sizeof(*info) / sizeof(u64) - 1;
+ int cc;
+
+ asm volatile(
+ " lgr 0,%[size]\n"
+ " .insn s,0xb28f0000,%[info]\n"
+ " lgr %[size],0\n"
+ " ipm %[cc]\n"
+ " srl %[cc],28\n"
+ : [cc] "=d" (cc), [info] "=Q" (*info), [size] "+&d" (size)
+ :
+ : "0", "cc", "memory");
+ return cc ? (size + 1) * sizeof(u64) : 0;
+}
+
+#define PAI_CRYPTO_BASE 0x1000 /* First event number */
+#define PAI_CRYPTO_MAXCTR 256 /* Max # of event counters */
+#define PAI_CRYPTO_KERNEL_OFFSET 2048
+#define PAI_NNPA_BASE 0x1800 /* First event number */
+#define PAI_NNPA_MAXCTR 128 /* Max # of event counters */
+
+DECLARE_STATIC_KEY_FALSE(pai_key);
+
+static __always_inline void pai_kernel_enter(struct pt_regs *regs)
+{
+ if (!IS_ENABLED(CONFIG_PERF_EVENTS))
+ return;
+ if (!static_branch_unlikely(&pai_key))
+ return;
+ if (!S390_lowcore.ccd)
+ return;
+ if (!user_mode(regs))
+ return;
+ WRITE_ONCE(S390_lowcore.ccd, S390_lowcore.ccd | PAI_CRYPTO_KERNEL_OFFSET);
+}
+
+static __always_inline void pai_kernel_exit(struct pt_regs *regs)
+{
+ if (!IS_ENABLED(CONFIG_PERF_EVENTS))
+ return;
+ if (!static_branch_unlikely(&pai_key))
+ return;
+ if (!S390_lowcore.ccd)
+ return;
+ if (!user_mode(regs))
+ return;
+ WRITE_ONCE(S390_lowcore.ccd, S390_lowcore.ccd & ~PAI_CRYPTO_KERNEL_OFFSET);
+}
+
+enum paievt_mode {
+ PAI_MODE_NONE,
+ PAI_MODE_SAMPLING,
+ PAI_MODE_COUNTING,
+};
+
+#endif
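/*
 * Editor's note, not part of the patch above: a hedged sketch of querying
 * the number of PAI crypto counters with the new qpaci() wrapper. Facility
 * and error checks are omitted; the function name is made up for the example.
 */
#include <asm/pai.h>

static unsigned int example_pai_crypto_counters(void)
{
	struct qpaci_info_block ib = { };

	qpaci(&ib);
	return ib.num_cc;	/* # of supported crypto counters */
}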
diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h
index 3a06c264ea53..e91cd6bbc330 100644
--- a/arch/s390/include/asm/pci.h
+++ b/arch/s390/include/asm/pci.h
@@ -5,9 +5,10 @@
#include <linux/pci.h>
#include <linux/mutex.h>
#include <linux/iommu.h>
-#include <asm-generic/pci.h>
+#include <linux/pci_hotplug.h>
#include <asm/pci_clp.h>
#include <asm/pci_debug.h>
+#include <asm/pci_insn.h>
#include <asm/sclp.h>
#define PCIBIOS_MIN_IO 0x1000
@@ -21,10 +22,16 @@ int pci_domain_nr(struct pci_bus *);
int pci_proc_domain(struct pci_bus *);
#define ZPCI_BUS_NR 0 /* default bus number */
-#define ZPCI_DEVFN 0 /* default device number */
#define ZPCI_NR_DMA_SPACES 1
#define ZPCI_NR_DEVICES CONFIG_PCI_NR_FUNCTIONS
+#define ZPCI_DOMAIN_BITMAP_SIZE (1 << 16)
+
+#ifdef PCI
+#if (ZPCI_NR_DEVICES > ZPCI_DOMAIN_BITMAP_SIZE)
+# error ZPCI_NR_DEVICES can not be bigger than ZPCI_DOMAIN_BITMAP_SIZE
+#endif
+#endif /* PCI */
/* PCI Function Controls */
#define ZPCI_FC_FN_ENABLED 0x80
@@ -78,7 +85,6 @@ enum zpci_state {
ZPCI_FN_STATE_STANDBY = 0,
ZPCI_FN_STATE_CONFIGURED = 1,
ZPCI_FN_STATE_RESERVED = 2,
- ZPCI_FN_STATE_ONLINE = 3,
};
struct zpci_bar_struct {
@@ -91,20 +97,50 @@ struct zpci_bar_struct {
};
struct s390_domain;
+struct kvm_zdev;
+
+#define ZPCI_FUNCTIONS_PER_BUS 256
+struct zpci_bus {
+ struct kref kref;
+ struct pci_bus *bus;
+ struct zpci_dev *function[ZPCI_FUNCTIONS_PER_BUS];
+ struct list_head resources;
+ struct list_head bus_next;
+ struct resource bus_resource;
+ int pchid;
+ int domain_nr;
+ bool multifunction;
+ enum pci_bus_speed max_bus_speed;
+};
/* Private data per function */
struct zpci_dev {
- struct pci_bus *bus;
+ struct zpci_bus *zbus;
struct list_head entry; /* list of all zpci_devices, needed for hotplug, etc. */
+ struct list_head iommu_list;
+ struct kref kref;
+ struct rcu_head rcu;
+ struct hotplug_slot hotplug_slot;
enum zpci_state state;
u32 fid; /* function ID, used by sclp */
u32 fh; /* function handle, used by insn's */
+ u32 gisa; /* GISA designation for passthrough */
u16 vfn; /* virtual function number */
u16 pchid; /* physical channel ID */
+ u16 maxstbl; /* Maximum store block size */
u8 pfgid; /* function group ID */
u8 pft; /* pci function type */
- u16 domain;
+ u8 port;
+ u8 dtsm; /* Supported DT mask */
+ u8 rid_available : 1;
+ u8 has_hp_slot : 1;
+ u8 has_resources : 1;
+ u8 is_physfn : 1;
+ u8 util_str_avail : 1;
+ u8 irqs_registered : 1;
+ u8 reserved : 2;
+ unsigned int devfn; /* DEVFN part of the RID */
struct mutex lock;
u8 pfip[CLP_PFIP_NR_SEGMENTS]; /* pci function internal path */
@@ -121,16 +157,8 @@ struct zpci_dev {
/* DMA stuff */
unsigned long *dma_table;
- spinlock_t dma_table_lock;
int tlb_refresh;
- spinlock_t iommu_bitmap_lock;
- unsigned long *iommu_bitmap;
- unsigned long *lazy_bitmap;
- unsigned long iommu_size;
- unsigned long iommu_pages;
- unsigned int next_bit;
-
struct iommu_device iommu_dev; /* IOMMU core handle */
char res_name[16];
@@ -145,16 +173,16 @@ struct zpci_dev {
struct zpci_fmb *fmb;
u16 fmb_update; /* update interval */
u16 fmb_length;
- /* software counters */
- atomic64_t allocated_pages;
- atomic64_t mapped_pages;
- atomic64_t unmapped_pages;
+ u8 version;
enum pci_bus_speed max_bus_speed;
struct dentry *debugfs_dev;
+ /* IOMMU and passthrough */
struct s390_domain *s390_domain; /* s390 IOMMU domain data */
+ struct kvm_zdev *kzdev;
+ struct mutex kzdev_lock;
};
static inline bool zdev_enabled(struct zpci_dev *zdev)
@@ -164,27 +192,40 @@ static inline bool zdev_enabled(struct zpci_dev *zdev)
extern const struct attribute_group *zpci_attr_groups[];
extern unsigned int s390_pci_force_floating __initdata;
+extern unsigned int s390_pci_no_rid;
+
+extern union zpci_sic_iib *zpci_aipb;
+extern struct airq_iv *zpci_aif_sbv;
/* -----------------------------------------------------------------------------
Prototypes
----------------------------------------------------------------------------- */
/* Base stuff */
-int zpci_create_device(struct zpci_dev *);
-void zpci_remove_device(struct zpci_dev *zdev);
+struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state);
int zpci_enable_device(struct zpci_dev *);
int zpci_disable_device(struct zpci_dev *);
-int zpci_register_ioat(struct zpci_dev *, u8, u64, u64, u64);
+int zpci_scan_configured_device(struct zpci_dev *zdev, u32 fh);
+int zpci_deconfigure_device(struct zpci_dev *zdev);
+void zpci_device_reserved(struct zpci_dev *zdev);
+bool zpci_is_device_configured(struct zpci_dev *zdev);
+
+int zpci_hot_reset_device(struct zpci_dev *zdev);
+int zpci_register_ioat(struct zpci_dev *, u8, u64, u64, u64, u8 *);
int zpci_unregister_ioat(struct zpci_dev *, u8);
void zpci_remove_reserved_devices(void);
+void zpci_update_fh(struct zpci_dev *zdev, u32 fh);
/* CLP */
+int clp_setup_writeback_mio(void);
int clp_scan_pci_devices(void);
-int clp_rescan_pci_devices(void);
-int clp_rescan_pci_devices_simple(void);
-int clp_add_pci_device(u32, u32, int);
-int clp_enable_fh(struct zpci_dev *, u8);
-int clp_disable_fh(struct zpci_dev *);
+int clp_query_pci_fn(struct zpci_dev *zdev);
+int clp_enable_fh(struct zpci_dev *zdev, u32 *fh, u8 nr_dma_as);
+int clp_disable_fh(struct zpci_dev *zdev, u32 *fh);
int clp_get_state(u32 fid, enum zpci_state *state);
+int clp_refresh_fh(u32 fid, u32 *fh);
+
+/* UID */
+void update_uid_checking(bool new);
/* IOMMU Interface */
int zpci_init_iommu(struct zpci_dev *zdev);
@@ -199,12 +240,10 @@ static inline bool zpci_use_mio(struct zpci_dev *zdev)
/* Error handling and recovery */
void zpci_event_error(void *);
void zpci_event_availability(void *);
-void zpci_rescan(void);
bool zpci_is_enabled(void);
#else /* CONFIG_PCI */
static inline void zpci_event_error(void *e) {}
static inline void zpci_event_availability(void *e) {}
-static inline void zpci_rescan(void) {}
#endif /* CONFIG_PCI */
#ifdef CONFIG_HOTPLUG_PCI_S390
@@ -221,7 +260,14 @@ static inline void zpci_exit_slot(struct zpci_dev *zdev) {}
/* Helpers */
static inline struct zpci_dev *to_zpci(struct pci_dev *pdev)
{
- return pdev->sysdata;
+ struct zpci_bus *zbus = pdev->sysdata;
+
+ return zbus->function[pdev->devfn];
+}
+
+static inline struct zpci_dev *to_zpci_dev(struct device *dev)
+{
+ return to_zpci(to_pci_dev(dev));
}
struct zpci_dev *get_zdev_by_fid(u32);
@@ -229,7 +275,10 @@ struct zpci_dev *get_zdev_by_fid(u32);
/* DMA */
int zpci_dma_init(void);
void zpci_dma_exit(void);
+int zpci_dma_init_device(struct zpci_dev *zdev);
+int zpci_dma_exit_device(struct zpci_dev *zdev);
+/* IRQ */
int __init zpci_irq_init(void);
void __init zpci_irq_exit(void);
@@ -242,10 +291,11 @@ int zpci_debug_init(void);
void zpci_debug_exit(void);
void zpci_debug_init_device(struct zpci_dev *, const char *);
void zpci_debug_exit_device(struct zpci_dev *);
-void zpci_debug_info(struct zpci_dev *, struct seq_file *);
-/* Error reporting */
+/* Error handling */
int zpci_report_error(struct pci_dev *, struct zpci_report_error_header *);
+int zpci_clear_error_state(struct zpci_dev *zdev);
+int zpci_reset_load_store_blocked(struct zpci_dev *zdev);
#ifdef CONFIG_NUMA
diff --git a/arch/s390/include/asm/pci_clp.h b/arch/s390/include/asm/pci_clp.h
index bd2cb4ea7d93..f0c677ddd270 100644
--- a/arch/s390/include/asm/pci_clp.h
+++ b/arch/s390/include/asm/pci_clp.h
@@ -7,6 +7,7 @@
/*
* Call Logical Processor - Command Codes
*/
+#define CLP_SLPC 0x0001
#define CLP_LIST_PCI 0x0002
#define CLP_QUERY_PCI_FN 0x0003
#define CLP_QUERY_PCI_FNGRP 0x0004
@@ -49,8 +50,24 @@ struct clp_fh_list_entry {
#define CLP_UTIL_STR_LEN 64
#define CLP_PFIP_NR_SEGMENTS 4
+/* PCI function type numbers */
+#define PCI_FUNC_TYPE_ISM 0x5 /* ISM device */
+
extern bool zpci_unique_uid;
+struct clp_rsp_slpc_pci {
+ struct clp_rsp_hdr hdr;
+ u32 reserved2[4];
+ u32 lpif[8];
+ u32 reserved3[4];
+ u32 vwb : 1;
+ u32 : 1;
+ u32 mio_wb : 6;
+ u32 : 24;
+ u32 reserved5[3];
+ u32 lpic[8];
+} __packed;
+
/* List PCI functions request */
struct clp_req_list_pci {
struct clp_req_hdr hdr;
@@ -93,7 +110,10 @@ struct clp_req_query_pci {
struct clp_rsp_query_pci {
struct clp_rsp_hdr hdr;
u16 vfn; /* virtual fn number */
- u16 : 6;
+ u16 : 3;
+ u16 rid_avail : 1;
+ u16 is_physfn : 1;
+ u16 reserved1 : 1;
u16 mio_addr_avail : 1;
u16 util_str_avail : 1; /* utility string available? */
u16 pfgid : 8; /* pci function group id */
@@ -102,12 +122,16 @@ struct clp_rsp_query_pci {
u16 pchid;
__le32 bar[PCI_STD_NUM_BARS];
u8 pfip[CLP_PFIP_NR_SEGMENTS]; /* pci function internal path */
- u32 : 16;
+ u16 : 12;
+ u16 port : 4;
u8 fmb_len;
u8 pft; /* pci function type */
u64 sdma; /* start dma as */
u64 edma; /* end dma as */
- u32 reserved[11];
+#define ZPCI_RID_MASK_DEVFN 0x00ff
+ u16 rid; /* BUS/DEVFN PCI address */
+ u16 reserved0;
+ u32 reserved[10];
u32 uid; /* user defined id */
u8 util_str[CLP_UTIL_STR_LEN]; /* utility string */
u32 reserved2[16];
@@ -132,9 +156,11 @@ struct clp_rsp_query_pci_grp {
u8 : 6;
u8 frame : 1;
u8 refresh : 1; /* TLB refresh mode */
- u16 reserved2;
+ u16 : 3;
+ u16 maxstbl : 13; /* Maximum store block size */
u16 mui;
- u16 : 16;
+ u8 dtsm; /* Supported DT mask */
+ u8 reserved3;
u16 maxfaal;
u16 : 4;
u16 dnoi : 12;
@@ -152,7 +178,8 @@ struct clp_req_set_pci {
u16 reserved2;
u8 oc; /* operation controls */
u8 ndas; /* number of dma spaces */
- u64 reserved3;
+ u32 reserved3;
+ u32 gisa; /* GISA designation */
} __packed;
/* Set PCI function response */
@@ -165,6 +192,11 @@ struct clp_rsp_set_pci {
} __packed;
/* Combined request/response block structures used by clp insn */
+struct clp_req_rsp_slpc_pci {
+ struct clp_req_slpc request;
+ struct clp_rsp_slpc_pci response;
+} __packed;
+
struct clp_req_rsp_list_pci {
struct clp_req_list_pci request;
struct clp_rsp_list_pci response;
diff --git a/arch/s390/include/asm/pci_debug.h b/arch/s390/include/asm/pci_debug.h
index 5dfe47588277..3bb4e7e33a0e 100644
--- a/arch/s390/include/asm/pci_debug.h
+++ b/arch/s390/include/asm/pci_debug.h
@@ -17,9 +17,14 @@ extern debug_info_t *pci_debug_err_id;
debug_text_event(pci_debug_err_id, 0, debug_buffer); \
} while (0)
+static inline void zpci_err_hex_level(int level, void *addr, int len)
+{
+ debug_event(pci_debug_err_id, level, addr, len);
+}
+
static inline void zpci_err_hex(void *addr, int len)
{
- debug_event(pci_debug_err_id, 0, addr, len);
+ zpci_err_hex_level(0, addr, len);
}
#endif
diff --git a/arch/s390/include/asm/pci_dma.h b/arch/s390/include/asm/pci_dma.h
index 419fac7a62c0..42d7cc4262ca 100644
--- a/arch/s390/include/asm/pci_dma.h
+++ b/arch/s390/include/asm/pci_dma.h
@@ -82,126 +82,16 @@ enum zpci_ioat_dtype {
#define ZPCI_TABLE_VALID_MASK 0x20
#define ZPCI_TABLE_PROT_MASK 0x200
-static inline unsigned int calc_rtx(dma_addr_t ptr)
-{
- return ((unsigned long) ptr >> ZPCI_RT_SHIFT) & ZPCI_INDEX_MASK;
-}
-
-static inline unsigned int calc_sx(dma_addr_t ptr)
-{
- return ((unsigned long) ptr >> ZPCI_ST_SHIFT) & ZPCI_INDEX_MASK;
-}
-
-static inline unsigned int calc_px(dma_addr_t ptr)
-{
- return ((unsigned long) ptr >> PAGE_SHIFT) & ZPCI_PT_MASK;
-}
-
-static inline void set_pt_pfaa(unsigned long *entry, void *pfaa)
-{
- *entry &= ZPCI_PTE_FLAG_MASK;
- *entry |= ((unsigned long) pfaa & ZPCI_PTE_ADDR_MASK);
-}
-
-static inline void set_rt_sto(unsigned long *entry, void *sto)
-{
- *entry &= ZPCI_RTE_FLAG_MASK;
- *entry |= ((unsigned long) sto & ZPCI_RTE_ADDR_MASK);
- *entry |= ZPCI_TABLE_TYPE_RTX;
-}
-
-static inline void set_st_pto(unsigned long *entry, void *pto)
-{
- *entry &= ZPCI_STE_FLAG_MASK;
- *entry |= ((unsigned long) pto & ZPCI_STE_ADDR_MASK);
- *entry |= ZPCI_TABLE_TYPE_SX;
-}
-
-static inline void validate_rt_entry(unsigned long *entry)
-{
- *entry &= ~ZPCI_TABLE_VALID_MASK;
- *entry &= ~ZPCI_TABLE_OFFSET_MASK;
- *entry |= ZPCI_TABLE_VALID;
- *entry |= ZPCI_TABLE_LEN_RTX;
-}
-
-static inline void validate_st_entry(unsigned long *entry)
-{
- *entry &= ~ZPCI_TABLE_VALID_MASK;
- *entry |= ZPCI_TABLE_VALID;
-}
-
-static inline void invalidate_table_entry(unsigned long *entry)
-{
- *entry &= ~ZPCI_TABLE_VALID_MASK;
- *entry |= ZPCI_TABLE_INVALID;
-}
-
-static inline void invalidate_pt_entry(unsigned long *entry)
-{
- WARN_ON_ONCE((*entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_INVALID);
- *entry &= ~ZPCI_PTE_VALID_MASK;
- *entry |= ZPCI_PTE_INVALID;
-}
-
-static inline void validate_pt_entry(unsigned long *entry)
-{
- WARN_ON_ONCE((*entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID);
- *entry &= ~ZPCI_PTE_VALID_MASK;
- *entry |= ZPCI_PTE_VALID;
-}
-
-static inline void entry_set_protected(unsigned long *entry)
-{
- *entry &= ~ZPCI_TABLE_PROT_MASK;
- *entry |= ZPCI_TABLE_PROTECTED;
-}
-
-static inline void entry_clr_protected(unsigned long *entry)
-{
- *entry &= ~ZPCI_TABLE_PROT_MASK;
- *entry |= ZPCI_TABLE_UNPROTECTED;
-}
-
-static inline int reg_entry_isvalid(unsigned long entry)
-{
- return (entry & ZPCI_TABLE_VALID_MASK) == ZPCI_TABLE_VALID;
-}
-
-static inline int pt_entry_isvalid(unsigned long entry)
-{
- return (entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID;
-}
-
-static inline int entry_isprotected(unsigned long entry)
-{
- return (entry & ZPCI_TABLE_PROT_MASK) == ZPCI_TABLE_PROTECTED;
-}
-
-static inline unsigned long *get_rt_sto(unsigned long entry)
-{
- return ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_RTX)
- ? (unsigned long *) (entry & ZPCI_RTE_ADDR_MASK)
- : NULL;
-}
-
-static inline unsigned long *get_st_pto(unsigned long entry)
-{
- return ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_SX)
- ? (unsigned long *) (entry & ZPCI_STE_ADDR_MASK)
- : NULL;
-}
-
-/* Prototypes */
-int zpci_dma_init_device(struct zpci_dev *);
-void zpci_dma_exit_device(struct zpci_dev *);
-void dma_free_seg_table(unsigned long);
-unsigned long *dma_alloc_cpu_table(void);
-void dma_cleanup_tables(unsigned long *);
-unsigned long *dma_walk_cpu_trans(unsigned long *rto, dma_addr_t dma_addr);
-void dma_update_cpu_trans(unsigned long *entry, void *page_addr, int flags);
-
-extern const struct dma_map_ops s390_pci_dma_ops;
+struct zpci_iommu_ctrs {
+ atomic64_t mapped_pages;
+ atomic64_t unmapped_pages;
+ atomic64_t global_rpcits;
+ atomic64_t sync_map_rpcits;
+ atomic64_t sync_rpcits;
+};
+
+struct zpci_dev;
+struct zpci_iommu_ctrs *zpci_get_iommu_ctrs(struct zpci_dev *zdev);
#endif
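/*
 * Editor's note, not part of the patch above: the per-device DMA statistics
 * move into struct zpci_iommu_ctrs. A minimal sketch of reading one counter
 * through the new accessor; the NULL check and function name are assumptions.
 */
#include <linux/atomic.h>
#include <asm/pci_dma.h>

static u64 example_mapped_pages(struct zpci_dev *zdev)
{
	struct zpci_iommu_ctrs *ctrs = zpci_get_iommu_ctrs(zdev);

	return ctrs ? atomic64_read(&ctrs->mapped_pages) : 0;
}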
diff --git a/arch/s390/include/asm/pci_insn.h b/arch/s390/include/asm/pci_insn.h
index 61cf9531f68f..e5f57cfe1d45 100644
--- a/arch/s390/include/asm/pci_insn.h
+++ b/arch/s390/include/asm/pci_insn.h
@@ -98,6 +98,15 @@ struct zpci_fib {
u32 gd;
} __packed __aligned(8);
+/* Set Interruption Controls Operation Controls */
+#define SIC_IRQ_MODE_ALL 0
+#define SIC_IRQ_MODE_SINGLE 1
+#define SIC_SET_AENI_CONTROLS 2
+#define SIC_IRQ_MODE_DIRECT 4
+#define SIC_IRQ_MODE_D_ALL 16
+#define SIC_IRQ_MODE_D_SINGLE 17
+#define SIC_IRQ_MODE_SET_CPU 18
+
/* directed interruption information block */
struct zpci_diib {
u32 : 1;
@@ -119,9 +128,20 @@ struct zpci_cdiib {
u64 : 64;
} __packed __aligned(8);
+/* adapter interruption parameters block */
+struct zpci_aipb {
+ u64 faisb;
+ u64 gait;
+ u16 : 13;
+ u16 afi : 3;
+ u32 : 32;
+ u16 faal;
+} __packed __aligned(8);
+
union zpci_sic_iib {
struct zpci_diib diib;
struct zpci_cdiib cdiib;
+ struct zpci_aipb aipb;
};
DECLARE_STATIC_KEY_FALSE(have_mio);
@@ -134,13 +154,6 @@ int __zpci_store(u64 data, u64 req, u64 offset);
int zpci_store(const volatile void __iomem *addr, u64 data, unsigned long len);
int __zpci_store_block(const u64 *data, u64 req, u64 offset);
void zpci_barrier(void);
-int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib);
-
-static inline int zpci_set_irq_ctrl(u16 ctl, u8 isc)
-{
- union zpci_sic_iib iib = {{0}};
-
- return __zpci_set_irq_ctrl(ctl, isc, &iib);
-}
+int zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib);
#endif
diff --git a/arch/s390/include/asm/pci_io.h b/arch/s390/include/asm/pci_io.h
index cd060b5dd8fd..2686bee800e3 100644
--- a/arch/s390/include/asm/pci_io.h
+++ b/arch/s390/include/asm/pci_io.h
@@ -8,14 +8,21 @@
#include <linux/slab.h>
#include <asm/pci_insn.h>
+/* I/O size constraints */
+#define ZPCI_MAX_READ_SIZE 8
+#define ZPCI_MAX_WRITE_SIZE 128
+#define ZPCI_BOUNDARY_SIZE (1 << 12)
+#define ZPCI_BOUNDARY_MASK (ZPCI_BOUNDARY_SIZE - 1)
+
/* I/O Map */
#define ZPCI_IOMAP_SHIFT 48
-#define ZPCI_IOMAP_ADDR_BASE 0x8000000000000000UL
+#define ZPCI_IOMAP_ADDR_SHIFT 62
+#define ZPCI_IOMAP_ADDR_BASE (1UL << ZPCI_IOMAP_ADDR_SHIFT)
#define ZPCI_IOMAP_ADDR_OFF_MASK ((1UL << ZPCI_IOMAP_SHIFT) - 1)
#define ZPCI_IOMAP_MAX_ENTRIES \
- ((ULONG_MAX - ZPCI_IOMAP_ADDR_BASE + 1) / (1UL << ZPCI_IOMAP_SHIFT))
+ (1UL << (ZPCI_IOMAP_ADDR_SHIFT - ZPCI_IOMAP_SHIFT))
#define ZPCI_IOMAP_ADDR_IDX_MASK \
- (~ZPCI_IOMAP_ADDR_OFF_MASK - ZPCI_IOMAP_ADDR_BASE)
+ ((ZPCI_IOMAP_ADDR_BASE - 1) & ~ZPCI_IOMAP_ADDR_OFF_MASK)
struct zpci_iomap_entry {
u32 fh;
@@ -120,16 +127,18 @@ out:
int zpci_write_block(volatile void __iomem *dst, const void *src,
unsigned long len);
-static inline u8 zpci_get_max_write_size(u64 src, u64 dst, int len, int max)
+static inline int zpci_get_max_io_size(u64 src, u64 dst, int len, int max)
{
- int count = len > max ? max : len, size = 1;
+ int offset = dst & ZPCI_BOUNDARY_MASK;
+ int size;
- while (!(src & 0x1) && !(dst & 0x1) && ((size << 1) <= count)) {
- dst = dst >> 1;
- src = src >> 1;
- size = size << 1;
- }
- return size;
+ size = min3(len, ZPCI_BOUNDARY_SIZE - offset, max);
+ if (IS_ALIGNED(src, 8) && IS_ALIGNED(dst, 8) && IS_ALIGNED(size, 8))
+ return size;
+
+ if (size >= 8)
+ return 8;
+ return rounddown_pow_of_two(size);
}
static inline int zpci_memcpy_fromio(void *dst,
@@ -139,8 +148,9 @@ static inline int zpci_memcpy_fromio(void *dst,
int size, rc = 0;
while (n > 0) {
- size = zpci_get_max_write_size((u64 __force) src,
- (u64) dst, n, 8);
+ size = zpci_get_max_io_size((u64 __force) src,
+ (u64) dst, n,
+ ZPCI_MAX_READ_SIZE);
rc = zpci_read_single(dst, src, size);
if (rc)
break;
@@ -160,8 +170,9 @@ static inline int zpci_memcpy_toio(volatile void __iomem *dst,
return -EINVAL;
while (n > 0) {
- size = zpci_get_max_write_size((u64 __force) dst,
- (u64) src, n, 128);
+ size = zpci_get_max_io_size((u64 __force) dst,
+ (u64) src, n,
+ ZPCI_MAX_WRITE_SIZE);
if (size > 8) /* main path */
rc = zpci_write_block(dst, src, size);
else
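/*
 * Editor's note, not part of the patch above: a worked example of the new
 * chunk size selection in zpci_get_max_io_size(), assuming a write whose
 * target sits 6 bytes before a 4K boundary:
 *
 *   dst offset = dst & ZPCI_BOUNDARY_MASK = 0xffa
 *   size       = min3(len = 64, 4096 - 0xffa = 6, ZPCI_MAX_WRITE_SIZE = 128) = 6
 *   6 is neither 8-byte aligned nor >= 8, so this chunk is
 *   rounddown_pow_of_two(6) = 4 bytes; the remaining bytes are handled by
 *   later iterations of zpci_memcpy_toio().
 */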
diff --git a/arch/s390/include/asm/percpu.h b/arch/s390/include/asm/percpu.h
index 50b4ce8cddfd..264095dd84bc 100644
--- a/arch/s390/include/asm/percpu.h
+++ b/arch/s390/include/asm/percpu.h
@@ -29,15 +29,15 @@
typedef typeof(pcp) pcp_op_T__; \
pcp_op_T__ old__, new__, prev__; \
pcp_op_T__ *ptr__; \
- preempt_disable(); \
+ preempt_disable_notrace(); \
ptr__ = raw_cpu_ptr(&(pcp)); \
- prev__ = *ptr__; \
+ prev__ = READ_ONCE(*ptr__); \
do { \
old__ = prev__; \
new__ = old__ op (val); \
prev__ = cmpxchg(ptr__, old__, new__); \
} while (prev__ != old__); \
- preempt_enable(); \
+ preempt_enable_notrace(); \
new__; \
})
@@ -68,7 +68,7 @@
typedef typeof(pcp) pcp_op_T__; \
pcp_op_T__ val__ = (val); \
pcp_op_T__ old__, *ptr__; \
- preempt_disable(); \
+ preempt_disable_notrace(); \
ptr__ = raw_cpu_ptr(&(pcp)); \
if (__builtin_constant_p(val__) && \
((szcast)val__ > -129) && ((szcast)val__ < 128)) { \
@@ -84,7 +84,7 @@
: [val__] "d" (val__) \
: "cc"); \
} \
- preempt_enable(); \
+ preempt_enable_notrace(); \
}
#define this_cpu_add_4(pcp, val) arch_this_cpu_add(pcp, val, "laa", "asi", int)
@@ -95,14 +95,14 @@
typedef typeof(pcp) pcp_op_T__; \
pcp_op_T__ val__ = (val); \
pcp_op_T__ old__, *ptr__; \
- preempt_disable(); \
+ preempt_disable_notrace(); \
ptr__ = raw_cpu_ptr(&(pcp)); \
asm volatile( \
op " %[old__],%[val__],%[ptr__]\n" \
: [old__] "=d" (old__), [ptr__] "+Q" (*ptr__) \
: [val__] "d" (val__) \
: "cc"); \
- preempt_enable(); \
+ preempt_enable_notrace(); \
old__ + val__; \
})
@@ -114,14 +114,14 @@
typedef typeof(pcp) pcp_op_T__; \
pcp_op_T__ val__ = (val); \
pcp_op_T__ old__, *ptr__; \
- preempt_disable(); \
+ preempt_disable_notrace(); \
ptr__ = raw_cpu_ptr(&(pcp)); \
asm volatile( \
op " %[old__],%[val__],%[ptr__]\n" \
: [old__] "=d" (old__), [ptr__] "+Q" (*ptr__) \
: [val__] "d" (val__) \
: "cc"); \
- preempt_enable(); \
+ preempt_enable_notrace(); \
}
#define this_cpu_and_4(pcp, val) arch_this_cpu_to_op(pcp, val, "lan")
@@ -136,10 +136,10 @@
typedef typeof(pcp) pcp_op_T__; \
pcp_op_T__ ret__; \
pcp_op_T__ *ptr__; \
- preempt_disable(); \
+ preempt_disable_notrace(); \
ptr__ = raw_cpu_ptr(&(pcp)); \
ret__ = cmpxchg(ptr__, oval, nval); \
- preempt_enable(); \
+ preempt_enable_notrace(); \
ret__; \
})
@@ -148,14 +148,30 @@
#define this_cpu_cmpxchg_4(pcp, oval, nval) arch_this_cpu_cmpxchg(pcp, oval, nval)
#define this_cpu_cmpxchg_8(pcp, oval, nval) arch_this_cpu_cmpxchg(pcp, oval, nval)
+#define this_cpu_cmpxchg64(pcp, o, n) this_cpu_cmpxchg_8(pcp, o, n)
+
+#define this_cpu_cmpxchg128(pcp, oval, nval) \
+({ \
+ typedef typeof(pcp) pcp_op_T__; \
+ u128 old__, new__, ret__; \
+ pcp_op_T__ *ptr__; \
+ old__ = oval; \
+ new__ = nval; \
+ preempt_disable_notrace(); \
+ ptr__ = raw_cpu_ptr(&(pcp)); \
+ ret__ = cmpxchg128((void *)ptr__, old__, new__); \
+ preempt_enable_notrace(); \
+ ret__; \
+})
+
#define arch_this_cpu_xchg(pcp, nval) \
({ \
typeof(pcp) *ptr__; \
typeof(pcp) ret__; \
- preempt_disable(); \
+ preempt_disable_notrace(); \
ptr__ = raw_cpu_ptr(&(pcp)); \
ret__ = xchg(ptr__, nval); \
- preempt_enable(); \
+ preempt_enable_notrace(); \
ret__; \
})
@@ -164,23 +180,6 @@
#define this_cpu_xchg_4(pcp, nval) arch_this_cpu_xchg(pcp, nval)
#define this_cpu_xchg_8(pcp, nval) arch_this_cpu_xchg(pcp, nval)
-#define arch_this_cpu_cmpxchg_double(pcp1, pcp2, o1, o2, n1, n2) \
-({ \
- typeof(pcp1) o1__ = (o1), n1__ = (n1); \
- typeof(pcp2) o2__ = (o2), n2__ = (n2); \
- typeof(pcp1) *p1__; \
- typeof(pcp2) *p2__; \
- int ret__; \
- preempt_disable(); \
- p1__ = raw_cpu_ptr(&(pcp1)); \
- p2__ = raw_cpu_ptr(&(pcp2)); \
- ret__ = __cmpxchg_double(p1__, p2__, o1__, o2__, n1__, n2__); \
- preempt_enable(); \
- ret__; \
-})
-
-#define this_cpu_cmpxchg_double_8 arch_this_cpu_cmpxchg_double
-
#include <asm-generic/percpu.h>
#endif /* __ARCH_S390_PERCPU__ */
diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h
index b9da71632827..9917e2717b2b 100644
--- a/arch/s390/include/asm/perf_event.h
+++ b/arch/s390/include/asm/perf_event.h
@@ -60,7 +60,6 @@ struct perf_sf_sde_regs {
#define PERF_CPUM_SF_DIAG_MODE 0x0002 /* Diagnostic-sampling flag */
#define PERF_CPUM_SF_MODE_MASK (PERF_CPUM_SF_BASIC_MODE| \
PERF_CPUM_SF_DIAG_MODE)
-#define PERF_CPUM_SF_FULL_BLOCKS 0x0004 /* Process full SDBs only */
#define PERF_CPUM_SF_FREQ_MODE 0x0008 /* Sampling with frequency */
#define REG_NONE 0
@@ -71,7 +70,6 @@ struct perf_sf_sde_regs {
#define SAMPL_RATE(hwc) ((hwc)->event_base)
#define SAMPL_FLAGS(hwc) ((hwc)->config_base)
#define SAMPL_DIAG_MODE(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_DIAG_MODE)
-#define SDB_FULL_BLOCKS(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_FULL_BLOCKS)
#define SAMPLE_FREQ_MODE(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_FREQ_MODE)
#define perf_arch_fetch_caller_regs(regs, __ip) do { \
diff --git a/arch/s390/include/asm/pfault.h b/arch/s390/include/asm/pfault.h
new file mode 100644
index 000000000000..a1bee4a1e470
--- /dev/null
+++ b/arch/s390/include/asm/pfault.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright IBM Corp. 1999, 2023
+ */
+#ifndef _ASM_S390_PFAULT_H
+#define _ASM_S390_PFAULT_H
+
+#include <linux/errno.h>
+
+int __pfault_init(void);
+void __pfault_fini(void);
+
+static inline int pfault_init(void)
+{
+ if (IS_ENABLED(CONFIG_PFAULT))
+ return __pfault_init();
+ return -EOPNOTSUPP;
+}
+
+static inline void pfault_fini(void)
+{
+ if (IS_ENABLED(CONFIG_PFAULT))
+ __pfault_fini();
+}
+
+#endif /* _ASM_S390_PFAULT_H */
diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h
index 77606c4acd58..502d655fe6ae 100644
--- a/arch/s390/include/asm/pgalloc.h
+++ b/arch/s390/include/asm/pgalloc.h
@@ -25,7 +25,6 @@ void crst_table_free(struct mm_struct *, unsigned long *);
unsigned long *page_table_alloc(struct mm_struct *);
struct page *page_table_alloc_pgste(struct mm_struct *mm);
void page_table_free(struct mm_struct *, unsigned long *);
-void page_table_free_rcu(struct mmu_gather *, unsigned long *, unsigned long);
void page_table_free_pgste(struct page *page);
extern int page_table_allocate_pgste;
@@ -34,19 +33,21 @@ static inline void crst_table_init(unsigned long *crst, unsigned long entry)
memset64((u64 *)crst, entry, _CRST_ENTRIES);
}
-static inline unsigned long pgd_entry_type(struct mm_struct *mm)
+int crst_table_upgrade(struct mm_struct *mm, unsigned long limit);
+
+static inline unsigned long check_asce_limit(struct mm_struct *mm, unsigned long addr,
+ unsigned long len)
{
- if (mm_pmd_folded(mm))
- return _SEGMENT_ENTRY_EMPTY;
- if (mm_pud_folded(mm))
- return _REGION3_ENTRY_EMPTY;
- if (mm_p4d_folded(mm))
- return _REGION2_ENTRY_EMPTY;
- return _REGION1_ENTRY_EMPTY;
-}
+ int rc;
-int crst_table_upgrade(struct mm_struct *mm, unsigned long limit);
-void crst_table_downgrade(struct mm_struct *);
+ if (addr + len > mm->context.asce_limit &&
+ addr + len <= TASK_SIZE) {
+ rc = crst_table_upgrade(mm, addr + len);
+ if (rc)
+ return (unsigned long) rc;
+ }
+ return addr;
+}
static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long address)
{
@@ -84,7 +85,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long vmaddr)
if (!table)
return NULL;
crst_table_init(table, _SEGMENT_ENTRY_EMPTY);
- if (!pgtable_pmd_page_ctor(virt_to_page(table))) {
+ if (!pagetable_pmd_ctor(virt_to_ptdesc(table))) {
crst_table_free(mm, table);
return NULL;
}
@@ -95,59 +96,43 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
{
if (mm_pmd_folded(mm))
return;
- pgtable_pmd_page_dtor(virt_to_page(pmd));
+ pagetable_pmd_dtor(virt_to_ptdesc(pmd));
crst_table_free(mm, (unsigned long *) pmd);
}
static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d)
{
- pgd_val(*pgd) = _REGION1_ENTRY | __pa(p4d);
+ set_pgd(pgd, __pgd(_REGION1_ENTRY | __pa(p4d)));
}
static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
{
- p4d_val(*p4d) = _REGION2_ENTRY | __pa(pud);
+ set_p4d(p4d, __p4d(_REGION2_ENTRY | __pa(pud)));
}
static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
{
- pud_val(*pud) = _REGION3_ENTRY | __pa(pmd);
+ set_pud(pud, __pud(_REGION3_ENTRY | __pa(pmd)));
}
static inline pgd_t *pgd_alloc(struct mm_struct *mm)
{
- unsigned long *table = crst_table_alloc(mm);
-
- if (!table)
- return NULL;
- if (mm->context.asce_limit == _REGION3_SIZE) {
- /* Forking a compat process with 2 page table levels */
- if (!pgtable_pmd_page_ctor(virt_to_page(table))) {
- crst_table_free(mm, table);
- return NULL;
- }
- }
- return (pgd_t *) table;
+ return (pgd_t *) crst_table_alloc(mm);
}
static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
- if (mm->context.asce_limit == _REGION3_SIZE)
- pgtable_pmd_page_dtor(virt_to_page(pgd));
crst_table_free(mm, (unsigned long *) pgd);
}
static inline void pmd_populate(struct mm_struct *mm,
pmd_t *pmd, pgtable_t pte)
{
- pmd_val(*pmd) = _SEGMENT_ENTRY + __pa(pte);
+ set_pmd(pmd, __pmd(_SEGMENT_ENTRY | __pa(pte)));
}
#define pmd_populate_kernel(mm, pmd, pte) pmd_populate(mm, pmd, pte)
-#define pmd_pgtable(pmd) \
- (pgtable_t)(pmd_val(pmd) & -sizeof(pte_t)*PTRS_PER_PTE)
-
/*
* page table entry allocation/free routines.
*/
@@ -157,7 +142,9 @@ static inline void pmd_populate(struct mm_struct *mm,
#define pte_free_kernel(mm, pte) page_table_free(mm, (unsigned long *) pte)
#define pte_free(mm, pte) page_table_free(mm, (unsigned long *) pte)
-extern void rcu_table_freelist_finish(void);
+/* arch use pte_free_defer() implementation in arch/s390/mm/pgalloc.c */
+#define pte_free_defer pte_free_defer
+void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable);
void vmem_map_init(void);
void *vmem_crst_alloc(unsigned long val);
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 7b03037a8475..1299b56e43f6 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -17,11 +17,16 @@
#include <linux/page-flags.h>
#include <linux/radix-tree.h>
#include <linux/atomic.h>
+#include <asm/sections.h>
+#include <asm/ctlreg.h>
#include <asm/bug.h>
#include <asm/page.h>
+#include <asm/uv.h>
extern pgd_t swapper_pg_dir[];
+extern pgd_t invalid_pg_dir[];
extern void paging_init(void);
+extern struct ctlreg s390_invalid_asce;
enum {
PG_DIRECT_MAP_4K = 0,
@@ -30,7 +35,7 @@ enum {
PG_DIRECT_MAP_MAX
};
-extern atomic_long_t direct_pages_count[PG_DIRECT_MAP_MAX];
+extern atomic_long_t __bootdata_preserved(direct_pages_count[PG_DIRECT_MAP_MAX]);
static inline void update_page_count(int level, long count)
{
@@ -38,14 +43,12 @@ static inline void update_page_count(int level, long count)
atomic_long_add(count, &direct_pages_count[level]);
}
-struct seq_file;
-void arch_report_meminfo(struct seq_file *m);
-
/*
* The S390 doesn't have any external MMU info: the kernel page
* tables contain all the necessary information.
*/
#define update_mmu_cache(vma, address, ptep) do { } while (0)
+#define update_mmu_cache_range(vmf, vma, addr, ptep, nr) do { } while (0)
#define update_mmu_cache_pmd(vma, address, ptep) do { } while (0)
/*
@@ -63,36 +66,33 @@ extern unsigned long zero_page_mask;
/* TODO: s390 cannot support io_remap_pfn_range... */
-#define FIRST_USER_ADDRESS 0UL
-
#define pte_ERROR(e) \
- printk("%s:%d: bad pte %p.\n", __FILE__, __LINE__, (void *) pte_val(e))
+ pr_err("%s:%d: bad pte %016lx.\n", __FILE__, __LINE__, pte_val(e))
#define pmd_ERROR(e) \
- printk("%s:%d: bad pmd %p.\n", __FILE__, __LINE__, (void *) pmd_val(e))
+ pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e))
#define pud_ERROR(e) \
- printk("%s:%d: bad pud %p.\n", __FILE__, __LINE__, (void *) pud_val(e))
+ pr_err("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e))
#define p4d_ERROR(e) \
- printk("%s:%d: bad p4d %p.\n", __FILE__, __LINE__, (void *) p4d_val(e))
+ pr_err("%s:%d: bad p4d %016lx.\n", __FILE__, __LINE__, p4d_val(e))
#define pgd_ERROR(e) \
- printk("%s:%d: bad pgd %p.\n", __FILE__, __LINE__, (void *) pgd_val(e))
+ pr_err("%s:%d: bad pgd %016lx.\n", __FILE__, __LINE__, pgd_val(e))
/*
* The vmalloc and module area will always be on the topmost area of the
- * kernel mapping. We reserve 128GB (64bit) for vmalloc and modules.
- * On 64 bit kernels we have a 2GB area at the top of the vmalloc area where
- * modules will reside. That makes sure that inter module branches always
- * happen without trampolines and in addition the placement within a 2GB frame
- * is branch prediction unit friendly.
+ * kernel mapping. 512GB are reserved for vmalloc by default.
+ * At the top of the vmalloc area a 2GB area is reserved where modules
+ * will reside. That makes sure that inter module branches always
+ * happen without trampolines and in addition the placement within a
+ * 2GB frame is branch prediction unit friendly.
*/
-extern unsigned long VMALLOC_START;
-extern unsigned long VMALLOC_END;
-#define VMALLOC_DEFAULT_SIZE ((128UL << 30) - MODULES_LEN)
-extern struct page *vmemmap;
-
-#define VMEM_MAX_PHYS ((unsigned long) vmemmap)
-
-extern unsigned long MODULES_VADDR;
-extern unsigned long MODULES_END;
+extern unsigned long __bootdata_preserved(VMALLOC_START);
+extern unsigned long __bootdata_preserved(VMALLOC_END);
+#define VMALLOC_DEFAULT_SIZE ((512UL << 30) - MODULES_LEN)
+extern struct page *__bootdata_preserved(vmemmap);
+extern unsigned long __bootdata_preserved(vmemmap_size);
+
+extern unsigned long __bootdata_preserved(MODULES_VADDR);
+extern unsigned long __bootdata_preserved(MODULES_END);
#define MODULES_VADDR MODULES_VADDR
#define MODULES_END MODULES_END
#define MODULES_LEN (1UL << 31)
@@ -179,11 +179,21 @@ static inline int is_module_addr(void *addr)
#define _PAGE_SOFT_DIRTY 0x000
#endif
+#define _PAGE_SW_BITS 0xffUL /* All SW bits */
+
+#define _PAGE_SWP_EXCLUSIVE _PAGE_LARGE /* SW pte exclusive swap bit */
+
/* Set of bits not changed in pte_modify */
#define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_SPECIAL | _PAGE_DIRTY | \
_PAGE_YOUNG | _PAGE_SOFT_DIRTY)
/*
+ * Mask of bits that must not be changed with RDP. Allow only _PAGE_PROTECT
+ * HW bit and all SW bits.
+ */
+#define _PAGE_RDP_MASK ~(_PAGE_PROTECT | _PAGE_SW_BITS)
+
+/*
* handle_pte_fault uses pte_present and pte_none to find out the pte type
* WITHOUT holding the page table lock. The _PAGE_PRESENT bit is used to
* distinguish present from not-present ptes. It is changed only with the page
@@ -341,8 +351,6 @@ static inline int is_module_addr(void *addr)
#define PTRS_PER_P4D _CRST_ENTRIES
#define PTRS_PER_PGD _CRST_ENTRIES
-#define MAX_PTRS_PER_P4D PTRS_PER_P4D
-
/*
* Segment table and region3 table entry encoding
* (R = read-only, I = invalid, y = young bit):
@@ -422,23 +430,6 @@ static inline int is_module_addr(void *addr)
* implies read permission.
*/
/*xwr*/
-#define __P000 PAGE_NONE
-#define __P001 PAGE_RO
-#define __P010 PAGE_RO
-#define __P011 PAGE_RO
-#define __P100 PAGE_RX
-#define __P101 PAGE_RX
-#define __P110 PAGE_RX
-#define __P111 PAGE_RX
-
-#define __S000 PAGE_NONE
-#define __S001 PAGE_RO
-#define __S010 PAGE_RW
-#define __S011 PAGE_RW
-#define __S100 PAGE_RX
-#define __S101 PAGE_RX
-#define __S110 PAGE_RWX
-#define __S111 PAGE_RWX
/*
* Segment entry (large page) protection definitions.
@@ -492,6 +483,12 @@ static inline int is_module_addr(void *addr)
_REGION3_ENTRY_YOUNG | \
_REGION_ENTRY_PROTECT | \
_REGION_ENTRY_NOEXEC)
+#define REGION3_KERNEL_EXEC __pgprot(_REGION_ENTRY_TYPE_R3 | \
+ _REGION3_ENTRY_LARGE | \
+ _REGION3_ENTRY_READ | \
+ _REGION3_ENTRY_WRITE | \
+ _REGION3_ENTRY_YOUNG | \
+ _REGION3_ENTRY_DIRTY)
static inline bool mm_p4d_folded(struct mm_struct *mm)
{
@@ -520,6 +517,15 @@ static inline int mm_has_pgste(struct mm_struct *mm)
return 0;
}
+static inline int mm_is_protected(struct mm_struct *mm)
+{
+#ifdef CONFIG_PGSTE
+ if (unlikely(atomic_read(&mm->context.protected_count)))
+ return 1;
+#endif
+ return 0;
+}
+
static inline int mm_alloc_pgste(struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE
@@ -529,6 +535,36 @@ static inline int mm_alloc_pgste(struct mm_struct *mm)
return 0;
}
+static inline pte_t clear_pte_bit(pte_t pte, pgprot_t prot)
+{
+ return __pte(pte_val(pte) & ~pgprot_val(prot));
+}
+
+static inline pte_t set_pte_bit(pte_t pte, pgprot_t prot)
+{
+ return __pte(pte_val(pte) | pgprot_val(prot));
+}
+
+static inline pmd_t clear_pmd_bit(pmd_t pmd, pgprot_t prot)
+{
+ return __pmd(pmd_val(pmd) & ~pgprot_val(prot));
+}
+
+static inline pmd_t set_pmd_bit(pmd_t pmd, pgprot_t prot)
+{
+ return __pmd(pmd_val(pmd) | pgprot_val(prot));
+}
+
+static inline pud_t clear_pud_bit(pud_t pud, pgprot_t prot)
+{
+ return __pud(pud_val(pud) & ~pgprot_val(prot));
+}
+
+static inline pud_t set_pud_bit(pud_t pud, pgprot_t prot)
+{
+ return __pud(pud_val(pud) | pgprot_val(prot));
+}
+
/*
* In the case that a guest uses storage keys
* faults should no longer be backed by zero pages
@@ -545,27 +581,25 @@ static inline int mm_uses_skeys(struct mm_struct *mm)
static inline void csp(unsigned int *ptr, unsigned int old, unsigned int new)
{
- register unsigned long reg2 asm("2") = old;
- register unsigned long reg3 asm("3") = new;
+ union register_pair r1 = { .even = old, .odd = new, };
unsigned long address = (unsigned long)ptr | 1;
asm volatile(
- " csp %0,%3"
- : "+d" (reg2), "+m" (*ptr)
- : "d" (reg3), "d" (address)
+ " csp %[r1],%[address]"
+ : [r1] "+&d" (r1.pair), "+m" (*ptr)
+ : [address] "d" (address)
: "cc");
}
static inline void cspg(unsigned long *ptr, unsigned long old, unsigned long new)
{
- register unsigned long reg2 asm("2") = old;
- register unsigned long reg3 asm("3") = new;
+ union register_pair r1 = { .even = old, .odd = new, };
unsigned long address = (unsigned long)ptr | 1;
asm volatile(
- " .insn rre,0xb98a0000,%0,%3"
- : "+d" (reg2), "+m" (*ptr)
- : "d" (reg3), "d" (address)
+ " cspg %[r1],%[address]"
+ : [r1] "+&d" (r1.pair), "+m" (*ptr)
+ : [address] "d" (address)
: "cc");
}
@@ -576,17 +610,15 @@ static inline void cspg(unsigned long *ptr, unsigned long old, unsigned long new
#define CRDTE_DTT_REGION1 0x1cUL
static inline void crdte(unsigned long old, unsigned long new,
- unsigned long table, unsigned long dtt,
+ unsigned long *table, unsigned long dtt,
unsigned long address, unsigned long asce)
{
- register unsigned long reg2 asm("2") = old;
- register unsigned long reg3 asm("3") = new;
- register unsigned long reg4 asm("4") = table | dtt;
- register unsigned long reg5 asm("5") = address;
+ union register_pair r1 = { .even = old, .odd = new, };
+ union register_pair r2 = { .even = __pa(table) | dtt, .odd = address, };
- asm volatile(".insn rrf,0xb98f0000,%0,%2,%4,0"
- : "+d" (reg2)
- : "d" (reg3), "d" (reg4), "d" (reg5), "a" (asce)
+ asm volatile(".insn rrf,0xb98f0000,%[r1],%[r2],%[asce],0"
+ : [r1] "+&d" (r1.pair)
+ : [r2] "d" (r2.pair), [asce] "a" (asce)
: "memory", "cc");
}
@@ -673,6 +705,7 @@ static inline int pud_none(pud_t pud)
return pud_val(pud) == _REGION3_ENTRY_EMPTY;
}
+#define pud_leaf pud_large
static inline int pud_large(pud_t pud)
{
if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) != _REGION_ENTRY_TYPE_R3)
@@ -680,16 +713,7 @@ static inline int pud_large(pud_t pud)
return !!(pud_val(pud) & _REGION3_ENTRY_LARGE);
}
-static inline unsigned long pud_pfn(pud_t pud)
-{
- unsigned long origin_mask;
-
- origin_mask = _REGION_ENTRY_ORIGIN;
- if (pud_large(pud))
- origin_mask = _REGION3_ENTRY_ORIGIN_LARGE;
- return (pud_val(pud) & origin_mask) >> PAGE_SHIFT;
-}
-
+#define pmd_leaf pmd_large
static inline int pmd_large(pmd_t pmd)
{
return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) != 0;
@@ -734,27 +758,25 @@ static inline int pmd_none(pmd_t pmd)
return pmd_val(pmd) == _SEGMENT_ENTRY_EMPTY;
}
-static inline unsigned long pmd_pfn(pmd_t pmd)
-{
- unsigned long origin_mask;
-
- origin_mask = _SEGMENT_ENTRY_ORIGIN;
- if (pmd_large(pmd))
- origin_mask = _SEGMENT_ENTRY_ORIGIN_LARGE;
- return (pmd_val(pmd) & origin_mask) >> PAGE_SHIFT;
-}
-
#define pmd_write pmd_write
static inline int pmd_write(pmd_t pmd)
{
return (pmd_val(pmd) & _SEGMENT_ENTRY_WRITE) != 0;
}
+#define pud_write pud_write
+static inline int pud_write(pud_t pud)
+{
+ return (pud_val(pud) & _REGION3_ENTRY_WRITE) != 0;
+}
+
+#define pmd_dirty pmd_dirty
static inline int pmd_dirty(pmd_t pmd)
{
return (pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY) != 0;
}
+#define pmd_young pmd_young
static inline int pmd_young(pmd_t pmd)
{
return (pmd_val(pmd) & _SEGMENT_ENTRY_YOUNG) != 0;
@@ -803,6 +825,21 @@ static inline int pmd_protnone(pmd_t pmd)
}
#endif
+static inline int pte_swp_exclusive(pte_t pte)
+{
+ return pte_val(pte) & _PAGE_SWP_EXCLUSIVE;
+}
+
+static inline pte_t pte_swp_mkexclusive(pte_t pte)
+{
+ return set_pte_bit(pte, __pgprot(_PAGE_SWP_EXCLUSIVE));
+}
+
+static inline pte_t pte_swp_clear_exclusive(pte_t pte)
+{
+ return clear_pte_bit(pte, __pgprot(_PAGE_SWP_EXCLUSIVE));
+}
+
static inline int pte_soft_dirty(pte_t pte)
{
return pte_val(pte) & _PAGE_SOFT_DIRTY;
@@ -811,15 +848,13 @@ static inline int pte_soft_dirty(pte_t pte)
static inline pte_t pte_mksoft_dirty(pte_t pte)
{
- pte_val(pte) |= _PAGE_SOFT_DIRTY;
- return pte;
+ return set_pte_bit(pte, __pgprot(_PAGE_SOFT_DIRTY));
}
#define pte_swp_mksoft_dirty pte_mksoft_dirty
static inline pte_t pte_clear_soft_dirty(pte_t pte)
{
- pte_val(pte) &= ~_PAGE_SOFT_DIRTY;
- return pte;
+ return clear_pte_bit(pte, __pgprot(_PAGE_SOFT_DIRTY));
}
#define pte_swp_clear_soft_dirty pte_clear_soft_dirty
@@ -830,14 +865,12 @@ static inline int pmd_soft_dirty(pmd_t pmd)
static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
{
- pmd_val(pmd) |= _SEGMENT_ENTRY_SOFT_DIRTY;
- return pmd;
+ return set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_SOFT_DIRTY));
}
static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
{
- pmd_val(pmd) &= ~_SEGMENT_ENTRY_SOFT_DIRTY;
- return pmd;
+ return clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_SOFT_DIRTY));
}
/*
@@ -866,35 +899,79 @@ static inline int pte_unused(pte_t pte)
}
/*
+ * Extract the pgprot value from the given pte while at the same time making it
+ * usable for kernel address space mappings where fault-driven dirty and
+ * young/old accounting is not supported, i.e. _PAGE_PROTECT and _PAGE_INVALID
+ * must not be set.
+ */
+static inline pgprot_t pte_pgprot(pte_t pte)
+{
+ unsigned long pte_flags = pte_val(pte) & _PAGE_CHG_MASK;
+
+ if (pte_write(pte))
+ pte_flags |= pgprot_val(PAGE_KERNEL);
+ else
+ pte_flags |= pgprot_val(PAGE_KERNEL_RO);
+ pte_flags |= pte_val(pte) & mio_wb_bit_mask;
+
+ return __pgprot(pte_flags);
+}
+
+/*
* pgd/pmd/pte modification functions
*/
+static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ WRITE_ONCE(*pgdp, pgd);
+}
+
+static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
+{
+ WRITE_ONCE(*p4dp, p4d);
+}
+
+static inline void set_pud(pud_t *pudp, pud_t pud)
+{
+ WRITE_ONCE(*pudp, pud);
+}
+
+static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
+{
+ WRITE_ONCE(*pmdp, pmd);
+}
+
+static inline void set_pte(pte_t *ptep, pte_t pte)
+{
+ WRITE_ONCE(*ptep, pte);
+}
+
static inline void pgd_clear(pgd_t *pgd)
{
if ((pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R1)
- pgd_val(*pgd) = _REGION1_ENTRY_EMPTY;
+ set_pgd(pgd, __pgd(_REGION1_ENTRY_EMPTY));
}
static inline void p4d_clear(p4d_t *p4d)
{
if ((p4d_val(*p4d) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R2)
- p4d_val(*p4d) = _REGION2_ENTRY_EMPTY;
+ set_p4d(p4d, __p4d(_REGION2_ENTRY_EMPTY));
}
static inline void pud_clear(pud_t *pud)
{
if ((pud_val(*pud) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
- pud_val(*pud) = _REGION3_ENTRY_EMPTY;
+ set_pud(pud, __pud(_REGION3_ENTRY_EMPTY));
}
static inline void pmd_clear(pmd_t *pmdp)
{
- pmd_val(*pmdp) = _SEGMENT_ENTRY_EMPTY;
+ set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
}
static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
- pte_val(*ptep) = _PAGE_INVALID;
+ set_pte(ptep, __pte(_PAGE_INVALID));
}
/*
@@ -903,79 +980,74 @@ static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *pt
*/
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
- pte_val(pte) &= _PAGE_CHG_MASK;
- pte_val(pte) |= pgprot_val(newprot);
+ pte = clear_pte_bit(pte, __pgprot(~_PAGE_CHG_MASK));
+ pte = set_pte_bit(pte, newprot);
/*
* newprot for PAGE_NONE, PAGE_RO, PAGE_RX, PAGE_RW and PAGE_RWX
* has the invalid bit set, clear it again for readable, young pages
*/
if ((pte_val(pte) & _PAGE_YOUNG) && (pte_val(pte) & _PAGE_READ))
- pte_val(pte) &= ~_PAGE_INVALID;
+ pte = clear_pte_bit(pte, __pgprot(_PAGE_INVALID));
/*
* newprot for PAGE_RO, PAGE_RX, PAGE_RW and PAGE_RWX has the page
* protection bit set, clear it again for writable, dirty pages
*/
if ((pte_val(pte) & _PAGE_DIRTY) && (pte_val(pte) & _PAGE_WRITE))
- pte_val(pte) &= ~_PAGE_PROTECT;
+ pte = clear_pte_bit(pte, __pgprot(_PAGE_PROTECT));
return pte;
}
static inline pte_t pte_wrprotect(pte_t pte)
{
- pte_val(pte) &= ~_PAGE_WRITE;
- pte_val(pte) |= _PAGE_PROTECT;
- return pte;
+ pte = clear_pte_bit(pte, __pgprot(_PAGE_WRITE));
+ return set_pte_bit(pte, __pgprot(_PAGE_PROTECT));
}
-static inline pte_t pte_mkwrite(pte_t pte)
+static inline pte_t pte_mkwrite_novma(pte_t pte)
{
- pte_val(pte) |= _PAGE_WRITE;
+ pte = set_pte_bit(pte, __pgprot(_PAGE_WRITE));
if (pte_val(pte) & _PAGE_DIRTY)
- pte_val(pte) &= ~_PAGE_PROTECT;
+ pte = clear_pte_bit(pte, __pgprot(_PAGE_PROTECT));
return pte;
}
static inline pte_t pte_mkclean(pte_t pte)
{
- pte_val(pte) &= ~_PAGE_DIRTY;
- pte_val(pte) |= _PAGE_PROTECT;
- return pte;
+ pte = clear_pte_bit(pte, __pgprot(_PAGE_DIRTY));
+ return set_pte_bit(pte, __pgprot(_PAGE_PROTECT));
}
static inline pte_t pte_mkdirty(pte_t pte)
{
- pte_val(pte) |= _PAGE_DIRTY | _PAGE_SOFT_DIRTY;
+ pte = set_pte_bit(pte, __pgprot(_PAGE_DIRTY | _PAGE_SOFT_DIRTY));
if (pte_val(pte) & _PAGE_WRITE)
- pte_val(pte) &= ~_PAGE_PROTECT;
+ pte = clear_pte_bit(pte, __pgprot(_PAGE_PROTECT));
return pte;
}
static inline pte_t pte_mkold(pte_t pte)
{
- pte_val(pte) &= ~_PAGE_YOUNG;
- pte_val(pte) |= _PAGE_INVALID;
- return pte;
+ pte = clear_pte_bit(pte, __pgprot(_PAGE_YOUNG));
+ return set_pte_bit(pte, __pgprot(_PAGE_INVALID));
}
static inline pte_t pte_mkyoung(pte_t pte)
{
- pte_val(pte) |= _PAGE_YOUNG;
+ pte = set_pte_bit(pte, __pgprot(_PAGE_YOUNG));
if (pte_val(pte) & _PAGE_READ)
- pte_val(pte) &= ~_PAGE_INVALID;
+ pte = clear_pte_bit(pte, __pgprot(_PAGE_INVALID));
return pte;
}
static inline pte_t pte_mkspecial(pte_t pte)
{
- pte_val(pte) |= _PAGE_SPECIAL;
- return pte;
+ return set_pte_bit(pte, __pgprot(_PAGE_SPECIAL));
}
#ifdef CONFIG_HUGETLB_PAGE
static inline pte_t pte_mkhuge(pte_t pte)
{
- pte_val(pte) |= _PAGE_LARGE;
- return pte;
+ return set_pte_bit(pte, __pgprot(_PAGE_LARGE));
}
#endif
@@ -985,16 +1057,29 @@ static inline pte_t pte_mkhuge(pte_t pte)
#define IPTE_NODAT 0x400
#define IPTE_GUEST_ASCE 0x800
+static __always_inline void __ptep_rdp(unsigned long addr, pte_t *ptep,
+ unsigned long opt, unsigned long asce,
+ int local)
+{
+ unsigned long pto;
+
+ pto = __pa(ptep) & ~(PTRS_PER_PTE * sizeof(pte_t) - 1);
+ asm volatile(".insn rrf,0xb98b0000,%[r1],%[r2],%[asce],%[m4]"
+ : "+m" (*ptep)
+ : [r1] "a" (pto), [r2] "a" ((addr & PAGE_MASK) | opt),
+ [asce] "a" (asce), [m4] "i" (local));
+}
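__ptep_rdp() (and __ptep_ipte() below) must pass the page-table origin rather than the pte address itself, so the physical pte address is rounded down to the start of its 2 KB page table. A small standalone illustration of that masking, assuming PTRS_PER_PTE == 256 and 8-byte entries as on s390:

#include <stdint.h>
#include <stdio.h>

#define PTRS_PER_PTE	256UL	/* assumption: 256 entries per s390 page table */
#define PTE_SIZE	8UL	/* assumption: 8 bytes per entry -> 2 KB table */

int main(void)
{
	uint64_t pte_phys = 0x12345678ULL;	/* hypothetical __pa(ptep) value */
	uint64_t pto = pte_phys & ~(PTRS_PER_PTE * PTE_SIZE - 1);

	/* prints: pte at 0x12345678 -> page-table origin 0x12345000 */
	printf("pte at %#llx -> page-table origin %#llx\n",
	       (unsigned long long)pte_phys, (unsigned long long)pto);
	return 0;
}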
+
static __always_inline void __ptep_ipte(unsigned long address, pte_t *ptep,
unsigned long opt, unsigned long asce,
int local)
{
- unsigned long pto = (unsigned long) ptep;
+ unsigned long pto = __pa(ptep);
if (__builtin_constant_p(opt) && opt == 0) {
/* Invalidation + TLB flush for the pte */
asm volatile(
- " .insn rrf,0xb2210000,%[r1],%[r2],0,%[m4]"
+ " ipte %[r1],%[r2],0,%[m4]"
: "+m" (*ptep) : [r1] "a" (pto), [r2] "a" (address),
[m4] "i" (local));
return;
@@ -1003,7 +1088,7 @@ static __always_inline void __ptep_ipte(unsigned long address, pte_t *ptep,
/* Invalidate ptes with options + TLB flush of the ptes */
opt = opt | (asce & _ASCE_ORIGIN);
asm volatile(
- " .insn rrf,0xb2210000,%[r1],%[r2],%[r3],%[m4]"
+ " ipte %[r1],%[r2],%[r3],%[m4]"
: [r2] "+a" (address), [r3] "+a" (opt)
: [r1] "a" (pto), [m4] "i" (local) : "memory");
}
@@ -1011,12 +1096,12 @@ static __always_inline void __ptep_ipte(unsigned long address, pte_t *ptep,
static __always_inline void __ptep_ipte_range(unsigned long address, int nr,
pte_t *ptep, int local)
{
- unsigned long pto = (unsigned long) ptep;
+ unsigned long pto = __pa(ptep);
/* Invalidate a range of ptes + TLB flush of the ptes */
do {
asm volatile(
- " .insn rrf,0xb2210000,%[r1],%[r2],%[r3],%[m4]"
+ " ipte %[r1],%[r2],%[r3],%[m4]"
: [r2] "+a" (address), [r3] "+a" (nr)
: [r1] "a" (pto), [m4] "i" (local) : "memory");
} while (nr != 255);
@@ -1059,7 +1144,13 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
- return ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID));
+ pte_t res;
+
+ res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID));
+ /* At this point the reference through the mapping is still present */
+ if (mm_is_protected(mm) && pte_present(res))
+ uv_convert_owned_from_secure(pte_val(res) & PAGE_MASK);
+ return res;
}
#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
@@ -1071,7 +1162,13 @@ void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long,
static inline pte_t ptep_clear_flush(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
- return ptep_xchg_direct(vma->vm_mm, addr, ptep, __pte(_PAGE_INVALID));
+ pte_t res;
+
+ res = ptep_xchg_direct(vma->vm_mm, addr, ptep, __pte(_PAGE_INVALID));
+ /* At this point the reference through the mapping is still present */
+ if (mm_is_protected(vma->vm_mm) && pte_present(res))
+ uv_convert_owned_from_secure(pte_val(res) & PAGE_MASK);
+ return res;
}
/*
@@ -1086,12 +1183,31 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
unsigned long addr,
pte_t *ptep, int full)
{
+ pte_t res;
+
if (full) {
- pte_t pte = *ptep;
- *ptep = __pte(_PAGE_INVALID);
- return pte;
+ res = *ptep;
+ set_pte(ptep, __pte(_PAGE_INVALID));
+ } else {
+ res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID));
}
- return ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID));
+ /* Nothing to do */
+ if (!mm_is_protected(mm) || !pte_present(res))
+ return res;
+ /*
+ * At this point the reference through the mapping is still present.
+ * The notifier should have destroyed all protected vCPUs at this
+ * point, so the destroy should be successful.
+ */
+ if (full && !uv_destroy_owned_page(pte_val(res) & PAGE_MASK))
+ return res;
+ /*
+ * If something went wrong and the page could not be destroyed, or
+ * if this is not a mm teardown, the slower export is used as a
+ * fallback instead.
+ */
+ uv_convert_owned_from_secure(pte_val(res) & PAGE_MASK);
+ return res;
}
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
@@ -1104,6 +1220,44 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
ptep_xchg_lazy(mm, addr, ptep, pte_wrprotect(pte));
}
+/*
+ * Check if PTEs differ only in the _PAGE_PROTECT HW bit; SW PTE bits are
+ * allowed to differ as well, since those might change e.g. because of dirty
+ * and young tracking.
+ */
+static inline int pte_allow_rdp(pte_t old, pte_t new)
+{
+ /*
+ * Only allow changes from RO to RW
+ */
+ if (!(pte_val(old) & _PAGE_PROTECT) || pte_val(new) & _PAGE_PROTECT)
+ return 0;
+
+ return (pte_val(old) & _PAGE_RDP_MASK) == (pte_val(new) & _PAGE_RDP_MASK);
+}
+
+static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma,
+ unsigned long address,
+ pte_t *ptep)
+{
+ /*
+ * RDP might not have propagated the PTE protection reset to all CPUs,
+ * so there could be spurious TLB protection faults.
+ * NOTE: This will also be called when a racing pagetable update on
+ * another thread already installed the correct PTE. Both cases cannot
+ * really be distinguished.
+ * Therefore, only do the local TLB flush when RDP can be used, and the
+ * PTE does not have _PAGE_PROTECT set, to avoid unnecessary overhead.
+ * A local RDP can be used to do the flush.
+ */
+ if (MACHINE_HAS_RDP && !(pte_val(*ptep) & _PAGE_PROTECT))
+ __ptep_rdp(address, ptep, 0, 0, 1);
+}
+#define flush_tlb_fix_spurious_fault flush_tlb_fix_spurious_fault
+
+void ptep_reset_dat_prot(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
+ pte_t new);
+
#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
static inline int ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep,
@@ -1111,7 +1265,10 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
{
if (pte_same(*ptep, entry))
return 0;
- ptep_xchg_direct(vma->vm_mm, addr, ptep, entry);
+ if (MACHINE_HAS_RDP && !mm_has_pgste(vma->vm_mm) && pte_allow_rdp(*ptep, entry))
+ ptep_reset_dat_prot(vma->vm_mm, addr, ptep, entry);
+ else
+ ptep_xchg_direct(vma->vm_mm, addr, ptep, entry);
return 1;
}
@@ -1153,21 +1310,41 @@ void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr);
void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr);
void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr);
+#define pgprot_writecombine pgprot_writecombine
+pgprot_t pgprot_writecombine(pgprot_t prot);
+
+#define pgprot_writethrough pgprot_writethrough
+pgprot_t pgprot_writethrough(pgprot_t prot);
+
/*
- * Certain architectures need to do special things when PTEs
- * within a page table are directly modified. Thus, the following
- * hook is made available.
+ * Set multiple PTEs to consecutive pages with a single call. All PTEs
+ * are within the same folio, PMD and VMA.
*/
-static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t entry)
+static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t entry, unsigned int nr)
{
if (pte_present(entry))
- pte_val(entry) &= ~_PAGE_UNUSED;
- if (mm_has_pgste(mm))
- ptep_set_pte_at(mm, addr, ptep, entry);
- else
- *ptep = entry;
+ entry = clear_pte_bit(entry, __pgprot(_PAGE_UNUSED));
+ if (mm_has_pgste(mm)) {
+ for (;;) {
+ ptep_set_pte_at(mm, addr, ptep, entry);
+ if (--nr == 0)
+ break;
+ ptep++;
+ entry = __pte(pte_val(entry) + PAGE_SIZE);
+ addr += PAGE_SIZE;
+ }
+ } else {
+ for (;;) {
+ set_pte(ptep, entry);
+ if (--nr == 0)
+ break;
+ ptep++;
+ entry = __pte(pte_val(entry) + PAGE_SIZE);
+ }
+ }
}
+#define set_ptes set_ptes
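set_ptes() installs nr consecutive entries from a single template pte, advancing the encoded physical address by PAGE_SIZE per entry (the address argument only needs to advance in the PGSTE branch, since plain set_pte() does not use it). A hedged caller sketch, with names invented for illustration:

/* Illustrative only: map nr consecutive pages starting at physical "phys". */
static void example_set_range(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep, unsigned long phys, unsigned int nr)
{
	pte_t pte = mk_pte_phys(phys, PAGE_KERNEL);	/* template for the first page */

	/* Entry i ends up mapping phys + i * PAGE_SIZE. */
	set_ptes(mm, addr, ptep, pte, nr);
}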
/*
* Conversion functions: convert a page and protection to a page entry,
@@ -1176,9 +1353,10 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
static inline pte_t mk_pte_phys(unsigned long physpage, pgprot_t pgprot)
{
pte_t __pte;
- pte_val(__pte) = physpage + pgprot_val(pgprot);
+
+ __pte = __pte(physpage | pgprot_val(pgprot));
if (!MACHINE_HAS_NX)
- pte_val(__pte) &= ~_PAGE_NOEXEC;
+ __pte = clear_pte_bit(__pte, __pgprot(_PAGE_NOEXEC));
return pte_mkyoung(__pte);
}
@@ -1196,12 +1374,39 @@ static inline pte_t mk_pte(struct page *page, pgprot_t pgprot)
#define p4d_index(address) (((address) >> P4D_SHIFT) & (PTRS_PER_P4D-1))
#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
-#define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE-1))
-#define pmd_deref(pmd) (pmd_val(pmd) & _SEGMENT_ENTRY_ORIGIN)
-#define pud_deref(pud) (pud_val(pud) & _REGION_ENTRY_ORIGIN)
-#define p4d_deref(pud) (p4d_val(pud) & _REGION_ENTRY_ORIGIN)
-#define pgd_deref(pgd) (pgd_val(pgd) & _REGION_ENTRY_ORIGIN)
+#define p4d_deref(pud) ((unsigned long)__va(p4d_val(pud) & _REGION_ENTRY_ORIGIN))
+#define pgd_deref(pgd) ((unsigned long)__va(pgd_val(pgd) & _REGION_ENTRY_ORIGIN))
+
+static inline unsigned long pmd_deref(pmd_t pmd)
+{
+ unsigned long origin_mask;
+
+ origin_mask = _SEGMENT_ENTRY_ORIGIN;
+ if (pmd_large(pmd))
+ origin_mask = _SEGMENT_ENTRY_ORIGIN_LARGE;
+ return (unsigned long)__va(pmd_val(pmd) & origin_mask);
+}
+
+static inline unsigned long pmd_pfn(pmd_t pmd)
+{
+ return __pa(pmd_deref(pmd)) >> PAGE_SHIFT;
+}
+
+static inline unsigned long pud_deref(pud_t pud)
+{
+ unsigned long origin_mask;
+
+ origin_mask = _REGION_ENTRY_ORIGIN;
+ if (pud_large(pud))
+ origin_mask = _REGION3_ENTRY_ORIGIN_LARGE;
+ return (unsigned long)__va(pud_val(pud) & origin_mask);
+}
+
+static inline unsigned long pud_pfn(pud_t pud)
+{
+ return __pa(pud_deref(pud)) >> PAGE_SHIFT;
+}
/*
* The pgd_offset function *always* adds the index for the top-level
@@ -1227,38 +1432,52 @@ static inline pgd_t *pgd_offset_raw(pgd_t *pgd, unsigned long address)
}
#define pgd_offset(mm, address) pgd_offset_raw(READ_ONCE((mm)->pgd), address)
-#define pgd_offset_k(address) pgd_offset(&init_mm, address)
-static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
+static inline p4d_t *p4d_offset_lockless(pgd_t *pgdp, pgd_t pgd, unsigned long address)
{
- if ((pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R1)
- return (p4d_t *) pgd_deref(*pgd) + p4d_index(address);
- return (p4d_t *) pgd;
+ if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R1)
+ return (p4d_t *) pgd_deref(pgd) + p4d_index(address);
+ return (p4d_t *) pgdp;
}
+#define p4d_offset_lockless p4d_offset_lockless
-static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
+static inline p4d_t *p4d_offset(pgd_t *pgdp, unsigned long address)
{
- if ((p4d_val(*p4d) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R2)
- return (pud_t *) p4d_deref(*p4d) + pud_index(address);
- return (pud_t *) p4d;
+ return p4d_offset_lockless(pgdp, *pgdp, address);
}
-static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
+static inline pud_t *pud_offset_lockless(p4d_t *p4dp, p4d_t p4d, unsigned long address)
{
- if ((pud_val(*pud) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R3)
- return (pmd_t *) pud_deref(*pud) + pmd_index(address);
- return (pmd_t *) pud;
+ if ((p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R2)
+ return (pud_t *) p4d_deref(p4d) + pud_index(address);
+ return (pud_t *) p4dp;
}
+#define pud_offset_lockless pud_offset_lockless
-static inline pte_t *pte_offset(pmd_t *pmd, unsigned long address)
+static inline pud_t *pud_offset(p4d_t *p4dp, unsigned long address)
{
- return (pte_t *) pmd_deref(*pmd) + pte_index(address);
+ return pud_offset_lockless(p4dp, *p4dp, address);
}
+#define pud_offset pud_offset
-#define pte_offset_kernel(pmd, address) pte_offset(pmd, address)
-#define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address)
+static inline pmd_t *pmd_offset_lockless(pud_t *pudp, pud_t pud, unsigned long address)
+{
+ if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R3)
+ return (pmd_t *) pud_deref(pud) + pmd_index(address);
+ return (pmd_t *) pudp;
+}
+#define pmd_offset_lockless pmd_offset_lockless
-static inline void pte_unmap(pte_t *pte) { }
+static inline pmd_t *pmd_offset(pud_t *pudp, unsigned long address)
+{
+ return pmd_offset_lockless(pudp, *pudp, address);
+}
+#define pmd_offset pmd_offset
+
+static inline unsigned long pmd_page_vaddr(pmd_t pmd)
+{
+ return (unsigned long) pmd_deref(pmd);
+}
static inline bool gup_fast_permitted(unsigned long start, unsigned long end)
{
@@ -1266,7 +1485,7 @@ static inline bool gup_fast_permitted(unsigned long start, unsigned long end)
}
#define gup_fast_permitted gup_fast_permitted
-#define pfn_pte(pfn,pgprot) mk_pte_phys(__pa((pfn) << PAGE_SHIFT),(pgprot))
+#define pfn_pte(pfn, pgprot) mk_pte_phys(((pfn) << PAGE_SHIFT), (pgprot))
#define pte_pfn(x) (pte_val(x) >> PAGE_SHIFT)
#define pte_page(x) pfn_to_page(pte_pfn(x))
@@ -1277,61 +1496,57 @@ static inline bool gup_fast_permitted(unsigned long start, unsigned long end)
static inline pmd_t pmd_wrprotect(pmd_t pmd)
{
- pmd_val(pmd) &= ~_SEGMENT_ENTRY_WRITE;
- pmd_val(pmd) |= _SEGMENT_ENTRY_PROTECT;
- return pmd;
+ pmd = clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_WRITE));
+ return set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_PROTECT));
}
-static inline pmd_t pmd_mkwrite(pmd_t pmd)
+static inline pmd_t pmd_mkwrite_novma(pmd_t pmd)
{
- pmd_val(pmd) |= _SEGMENT_ENTRY_WRITE;
+ pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_WRITE));
if (pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY)
- pmd_val(pmd) &= ~_SEGMENT_ENTRY_PROTECT;
+ pmd = clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_PROTECT));
return pmd;
}
static inline pmd_t pmd_mkclean(pmd_t pmd)
{
- pmd_val(pmd) &= ~_SEGMENT_ENTRY_DIRTY;
- pmd_val(pmd) |= _SEGMENT_ENTRY_PROTECT;
- return pmd;
+ pmd = clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_DIRTY));
+ return set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_PROTECT));
}
static inline pmd_t pmd_mkdirty(pmd_t pmd)
{
- pmd_val(pmd) |= _SEGMENT_ENTRY_DIRTY | _SEGMENT_ENTRY_SOFT_DIRTY;
+ pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_DIRTY | _SEGMENT_ENTRY_SOFT_DIRTY));
if (pmd_val(pmd) & _SEGMENT_ENTRY_WRITE)
- pmd_val(pmd) &= ~_SEGMENT_ENTRY_PROTECT;
+ pmd = clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_PROTECT));
return pmd;
}
static inline pud_t pud_wrprotect(pud_t pud)
{
- pud_val(pud) &= ~_REGION3_ENTRY_WRITE;
- pud_val(pud) |= _REGION_ENTRY_PROTECT;
- return pud;
+ pud = clear_pud_bit(pud, __pgprot(_REGION3_ENTRY_WRITE));
+ return set_pud_bit(pud, __pgprot(_REGION_ENTRY_PROTECT));
}
static inline pud_t pud_mkwrite(pud_t pud)
{
- pud_val(pud) |= _REGION3_ENTRY_WRITE;
+ pud = set_pud_bit(pud, __pgprot(_REGION3_ENTRY_WRITE));
if (pud_val(pud) & _REGION3_ENTRY_DIRTY)
- pud_val(pud) &= ~_REGION_ENTRY_PROTECT;
+ pud = clear_pud_bit(pud, __pgprot(_REGION_ENTRY_PROTECT));
return pud;
}
static inline pud_t pud_mkclean(pud_t pud)
{
- pud_val(pud) &= ~_REGION3_ENTRY_DIRTY;
- pud_val(pud) |= _REGION_ENTRY_PROTECT;
- return pud;
+ pud = clear_pud_bit(pud, __pgprot(_REGION3_ENTRY_DIRTY));
+ return set_pud_bit(pud, __pgprot(_REGION_ENTRY_PROTECT));
}
static inline pud_t pud_mkdirty(pud_t pud)
{
- pud_val(pud) |= _REGION3_ENTRY_DIRTY | _REGION3_ENTRY_SOFT_DIRTY;
+ pud = set_pud_bit(pud, __pgprot(_REGION3_ENTRY_DIRTY | _REGION3_ENTRY_SOFT_DIRTY));
if (pud_val(pud) & _REGION3_ENTRY_WRITE)
- pud_val(pud) &= ~_REGION_ENTRY_PROTECT;
+ pud = clear_pud_bit(pud, __pgprot(_REGION_ENTRY_PROTECT));
return pud;
}
@@ -1355,37 +1570,39 @@ static inline unsigned long massage_pgprot_pmd(pgprot_t pgprot)
static inline pmd_t pmd_mkyoung(pmd_t pmd)
{
- pmd_val(pmd) |= _SEGMENT_ENTRY_YOUNG;
+ pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_YOUNG));
if (pmd_val(pmd) & _SEGMENT_ENTRY_READ)
- pmd_val(pmd) &= ~_SEGMENT_ENTRY_INVALID;
+ pmd = clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_INVALID));
return pmd;
}
static inline pmd_t pmd_mkold(pmd_t pmd)
{
- pmd_val(pmd) &= ~_SEGMENT_ENTRY_YOUNG;
- pmd_val(pmd) |= _SEGMENT_ENTRY_INVALID;
- return pmd;
+ pmd = clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_YOUNG));
+ return set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_INVALID));
}
static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
{
- pmd_val(pmd) &= _SEGMENT_ENTRY_ORIGIN_LARGE |
- _SEGMENT_ENTRY_DIRTY | _SEGMENT_ENTRY_YOUNG |
- _SEGMENT_ENTRY_LARGE | _SEGMENT_ENTRY_SOFT_DIRTY;
- pmd_val(pmd) |= massage_pgprot_pmd(newprot);
+ unsigned long mask;
+
+ mask = _SEGMENT_ENTRY_ORIGIN_LARGE;
+ mask |= _SEGMENT_ENTRY_DIRTY;
+ mask |= _SEGMENT_ENTRY_YOUNG;
+ mask |= _SEGMENT_ENTRY_LARGE;
+ mask |= _SEGMENT_ENTRY_SOFT_DIRTY;
+ pmd = __pmd(pmd_val(pmd) & mask);
+ pmd = set_pmd_bit(pmd, __pgprot(massage_pgprot_pmd(newprot)));
if (!(pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY))
- pmd_val(pmd) |= _SEGMENT_ENTRY_PROTECT;
+ pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_PROTECT));
if (!(pmd_val(pmd) & _SEGMENT_ENTRY_YOUNG))
- pmd_val(pmd) |= _SEGMENT_ENTRY_INVALID;
+ pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_INVALID));
return pmd;
}
static inline pmd_t mk_pmd_phys(unsigned long physpage, pgprot_t pgprot)
{
- pmd_t __pmd;
- pmd_val(__pmd) = physpage + massage_pgprot_pmd(pgprot);
- return __pmd;
+ return __pmd(physpage + massage_pgprot_pmd(pgprot));
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLB_PAGE */
@@ -1409,11 +1626,11 @@ static __always_inline void __pmdp_idte(unsigned long addr, pmd_t *pmdp,
{
unsigned long sto;
- sto = (unsigned long) pmdp - pmd_index(addr) * sizeof(pmd_t);
+ sto = __pa(pmdp) - pmd_index(addr) * sizeof(pmd_t);
if (__builtin_constant_p(opt) && opt == 0) {
/* flush without guest asce */
asm volatile(
- " .insn rrf,0xb98e0000,%[r1],%[r2],0,%[m4]"
+ " idte %[r1],0,%[r2],%[m4]"
: "+m" (*pmdp)
: [r1] "a" (sto), [r2] "a" ((addr & HPAGE_MASK)),
[m4] "i" (local)
@@ -1421,7 +1638,7 @@ static __always_inline void __pmdp_idte(unsigned long addr, pmd_t *pmdp,
} else {
/* flush with guest asce */
asm volatile(
- " .insn rrf,0xb98e0000,%[r1],%[r2],%[r3],%[m4]"
+ " idte %[r1],%[r3],%[r2],%[m4]"
: "+m" (*pmdp)
: [r1] "a" (sto), [r2] "a" ((addr & HPAGE_MASK) | opt),
[r3] "a" (asce), [m4] "i" (local)
@@ -1435,12 +1652,12 @@ static __always_inline void __pudp_idte(unsigned long addr, pud_t *pudp,
{
unsigned long r3o;
- r3o = (unsigned long) pudp - pud_index(addr) * sizeof(pud_t);
+ r3o = __pa(pudp) - pud_index(addr) * sizeof(pud_t);
r3o |= _ASCE_TYPE_REGION3;
if (__builtin_constant_p(opt) && opt == 0) {
/* flush without guest asce */
asm volatile(
- " .insn rrf,0xb98e0000,%[r1],%[r2],0,%[m4]"
+ " idte %[r1],0,%[r2],%[m4]"
: "+m" (*pudp)
: [r1] "a" (r3o), [r2] "a" ((addr & PUD_MASK)),
[m4] "i" (local)
@@ -1448,7 +1665,7 @@ static __always_inline void __pudp_idte(unsigned long addr, pud_t *pudp,
} else {
/* flush with guest asce */
asm volatile(
- " .insn rrf,0xb98e0000,%[r1],%[r2],%[r3],%[m4]"
+ " idte %[r1],%[r3],%[r2],%[m4]"
: "+m" (*pudp)
: [r1] "a" (r3o), [r2] "a" ((addr & PUD_MASK) | opt),
[r3] "a" (asce), [m4] "i" (local)
@@ -1507,16 +1724,15 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t entry)
{
if (!MACHINE_HAS_NX)
- pmd_val(entry) &= ~_SEGMENT_ENTRY_NOEXEC;
- *pmdp = entry;
+ entry = clear_pmd_bit(entry, __pgprot(_SEGMENT_ENTRY_NOEXEC));
+ set_pmd(pmdp, entry);
}
static inline pmd_t pmd_mkhuge(pmd_t pmd)
{
- pmd_val(pmd) |= _SEGMENT_ENTRY_LARGE;
- pmd_val(pmd) |= _SEGMENT_ENTRY_YOUNG;
- pmd_val(pmd) |= _SEGMENT_ENTRY_PROTECT;
- return pmd;
+ pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_LARGE));
+ pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_YOUNG));
+ return set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_PROTECT));
}
#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
@@ -1527,16 +1743,16 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
}
#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
-static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm,
+static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
unsigned long addr,
pmd_t *pmdp, int full)
{
if (full) {
pmd_t pmd = *pmdp;
- *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
+ set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
return pmd;
}
- return pmdp_xchg_lazy(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
+ return pmdp_xchg_lazy(vma->vm_mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
}
#define __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
@@ -1573,7 +1789,7 @@ static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
}
#define pmdp_collapse_flush pmdp_collapse_flush
-#define pfn_pmd(pfn, pgprot) mk_pmd_phys(__pa((pfn) << PAGE_SHIFT), (pgprot))
+#define pfn_pmd(pfn, pgprot) mk_pmd_phys(((pfn) << PAGE_SHIFT), (pgprot))
#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
static inline int pmd_trans_huge(pmd_t pmd)
@@ -1591,18 +1807,18 @@ static inline int has_transparent_hugepage(void)
/*
* 64 bit swap entry format:
* A page-table entry has some bits we have to treat in a special way.
- * Bits 52 and bit 55 have to be zero, otherwise a specification
- * exception will occur instead of a page translation exception. The
- * specification exception has the bad habit not to store necessary
- * information in the lowcore.
- * Bits 54 and 63 are used to indicate the page type.
+ * Bits 54 and 63 are used to indicate the page type. Bit 53 marks the pte
+ * as invalid.
* A swap pte is indicated by bit pattern (pte & 0x201) == 0x200
- * This leaves the bits 0-51 and bits 56-62 to store type and offset.
- * We use the 5 bits from 57-61 for the type and the 52 bits from 0-51
- * for the offset.
- * | offset |01100|type |00|
+ * | offset |E11XX|type |S0|
* |0000000000111111111122222222223333333333444444444455|55555|55566|66|
* |0123456789012345678901234567890123456789012345678901|23456|78901|23|
+ *
+ * Bits 0-51 store the offset.
+ * Bit 52 (E) is used to remember PG_anon_exclusive.
+ * Bits 57-61 store the type.
+ * Bit 62 (S) is used for softdirty tracking.
+ * Bits 55 and 56 (X) are unused.
*/
#define __SWP_OFFSET_MASK ((1UL << 52) - 1)
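The packing can be checked by hand. The shift and type macros next to __SWP_OFFSET_MASK are not visible in this hunk, so the values below (offset shift 12, 5-bit type at shift 2) are assumptions matching the layout drawn above; the standalone demo mirrors what mk_swap_pte() does:

#include <stdio.h>

/* Assumed values mirroring the header this hunk belongs to. */
#define _PAGE_INVALID		0x400UL
#define _PAGE_PROTECT		0x200UL
#define SWP_OFFSET_MASK		((1UL << 52) - 1)
#define SWP_OFFSET_SHIFT	12
#define SWP_TYPE_MASK		((1UL << 5) - 1)
#define SWP_TYPE_SHIFT		2

int main(void)
{
	unsigned long type = 3, offset = 0x1234;
	unsigned long pteval = _PAGE_INVALID | _PAGE_PROTECT;

	pteval |= (offset & SWP_OFFSET_MASK) << SWP_OFFSET_SHIFT;
	pteval |= (type & SWP_TYPE_MASK) << SWP_TYPE_SHIFT;

	/* Swap pattern from the comment above: (pte & 0x201) == 0x200. */
	printf("pte=%#lx is_swap=%d\n", pteval, (pteval & 0x201UL) == 0x200UL);
	return 0;
}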
@@ -1612,12 +1828,12 @@ static inline int has_transparent_hugepage(void)
static inline pte_t mk_swap_pte(unsigned long type, unsigned long offset)
{
- pte_t pte;
+ unsigned long pteval;
- pte_val(pte) = _PAGE_INVALID | _PAGE_PROTECT;
- pte_val(pte) |= (offset & __SWP_OFFSET_MASK) << __SWP_OFFSET_SHIFT;
- pte_val(pte) |= (type & __SWP_TYPE_MASK) << __SWP_TYPE_SHIFT;
- return pte;
+ pteval = _PAGE_INVALID | _PAGE_PROTECT;
+ pteval |= (offset & __SWP_OFFSET_MASK) << __SWP_OFFSET_SHIFT;
+ pteval |= (type & __SWP_TYPE_MASK) << __SWP_TYPE_SHIFT;
+ return __pte(pteval);
}
static inline unsigned long __swp_type(swp_entry_t entry)
@@ -1638,10 +1854,12 @@ static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset)
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
-#define kern_addr_valid(addr) (1)
-
extern int vmem_add_mapping(unsigned long start, unsigned long size);
-extern int vmem_remove_mapping(unsigned long start, unsigned long size);
+extern void vmem_remove_mapping(unsigned long start, unsigned long size);
+extern int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc);
+extern int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot);
+extern void vmem_unmap_4k_page(unsigned long addr);
+extern pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc);
extern int s390_enable_sie(void);
extern int s390_enable_skey(void);
extern void s390_reset_cmma(struct mm_struct *mm);
@@ -1650,6 +1868,7 @@ extern void s390_reset_cmma(struct mm_struct *mm);
#define HAVE_ARCH_UNMAPPED_AREA
#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
-#include <asm-generic/pgtable.h>
+#define pmd_pgtable(pmd) \
+ ((pgtable_t)__va(pmd_val(pmd) & -sizeof(pte_t)*PTRS_PER_PTE))
#endif /* _S390_PAGE_H */
diff --git a/arch/s390/include/asm/physmem_info.h b/arch/s390/include/asm/physmem_info.h
new file mode 100644
index 000000000000..9e41a74fce9a
--- /dev/null
+++ b/arch/s390/include/asm/physmem_info.h
@@ -0,0 +1,172 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_S390_MEM_DETECT_H
+#define _ASM_S390_MEM_DETECT_H
+
+#include <linux/types.h>
+#include <asm/page.h>
+
+enum physmem_info_source {
+ MEM_DETECT_NONE = 0,
+ MEM_DETECT_SCLP_STOR_INFO,
+ MEM_DETECT_DIAG260,
+ MEM_DETECT_SCLP_READ_INFO,
+ MEM_DETECT_BIN_SEARCH
+};
+
+struct physmem_range {
+ u64 start;
+ u64 end;
+};
+
+enum reserved_range_type {
+ RR_DECOMPRESSOR,
+ RR_INITRD,
+ RR_VMLINUX,
+ RR_AMODE31,
+ RR_IPLREPORT,
+ RR_CERT_COMP_LIST,
+ RR_MEM_DETECT_EXTENDED,
+ RR_VMEM,
+ RR_MAX
+};
+
+struct reserved_range {
+ unsigned long start;
+ unsigned long end;
+ struct reserved_range *chain;
+};
+
+/*
+ * Storage element id is defined as 1 byte (up to 256 storage elements).
+ * In practice only storage element ids 0 and 1 are used.
+ * According to the architecture, one storage element could have as many as
+ * 1020 subincrements. 255 physmem_ranges are embedded in physmem_info.
+ * If more physmem_ranges are required, a block of memory from an already
+ * known physmem_range is taken (online_extended points to it).
+ */
+#define MEM_INLINED_ENTRIES 255 /* (PAGE_SIZE - 16) / 16 */
+
+struct physmem_info {
+ u32 range_count;
+ u8 info_source;
+ unsigned long usable;
+ struct reserved_range reserved[RR_MAX];
+ struct physmem_range online[MEM_INLINED_ENTRIES];
+ struct physmem_range *online_extended;
+};
+
+extern struct physmem_info physmem_info;
+
+void add_physmem_online_range(u64 start, u64 end);
+
+static inline int __get_physmem_range(u32 n, unsigned long *start,
+ unsigned long *end, bool respect_usable_limit)
+{
+ if (n >= physmem_info.range_count) {
+ *start = 0;
+ *end = 0;
+ return -1;
+ }
+
+ if (n < MEM_INLINED_ENTRIES) {
+ *start = (unsigned long)physmem_info.online[n].start;
+ *end = (unsigned long)physmem_info.online[n].end;
+ } else {
+ *start = (unsigned long)physmem_info.online_extended[n - MEM_INLINED_ENTRIES].start;
+ *end = (unsigned long)physmem_info.online_extended[n - MEM_INLINED_ENTRIES].end;
+ }
+
+ if (respect_usable_limit && physmem_info.usable) {
+ if (*start >= physmem_info.usable)
+ return -1;
+ if (*end > physmem_info.usable)
+ *end = physmem_info.usable;
+ }
+ return 0;
+}
+
+/**
+ * for_each_physmem_usable_range - early online memory range iterator
+ * @i: an integer used as loop variable
+ * @p_start: ptr to unsigned long for start address of the range
+ * @p_end: ptr to unsigned long for end address of the range
+ *
+ * Walks over detected online memory ranges below usable limit.
+ */
+#define for_each_physmem_usable_range(i, p_start, p_end) \
+ for (i = 0; !__get_physmem_range(i, p_start, p_end, true); i++)
+
+/* Walks over all detected online memory ranges disregarding usable limit. */
+#define for_each_physmem_online_range(i, p_start, p_end) \
+ for (i = 0; !__get_physmem_range(i, p_start, p_end, false); i++)
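A short usage sketch of the iterators above (function name invented for illustration); the loop variable receives the range index and the two pointers are filled on every iteration:

/* Illustrative only: sum up all usable online memory detected so far. */
static unsigned long example_total_usable_mem(void)
{
	unsigned long start, end, total = 0;
	int i;

	for_each_physmem_usable_range(i, &start, &end)
		total += end - start;
	return total;
}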
+
+static inline const char *get_physmem_info_source(void)
+{
+ switch (physmem_info.info_source) {
+ case MEM_DETECT_SCLP_STOR_INFO:
+ return "sclp storage info";
+ case MEM_DETECT_DIAG260:
+ return "diag260";
+ case MEM_DETECT_SCLP_READ_INFO:
+ return "sclp read info";
+ case MEM_DETECT_BIN_SEARCH:
+ return "binary search";
+ }
+ return "none";
+}
+
+#define RR_TYPE_NAME(t) case RR_ ## t: return #t
+static inline const char *get_rr_type_name(enum reserved_range_type t)
+{
+ switch (t) {
+ RR_TYPE_NAME(DECOMPRESSOR);
+ RR_TYPE_NAME(INITRD);
+ RR_TYPE_NAME(VMLINUX);
+ RR_TYPE_NAME(AMODE31);
+ RR_TYPE_NAME(IPLREPORT);
+ RR_TYPE_NAME(CERT_COMP_LIST);
+ RR_TYPE_NAME(MEM_DETECT_EXTENDED);
+ RR_TYPE_NAME(VMEM);
+ default:
+ return "UNKNOWN";
+ }
+}
+
+#define for_each_physmem_reserved_type_range(t, range, p_start, p_end) \
+ for (range = &physmem_info.reserved[t], *p_start = range->start, *p_end = range->end; \
+ range && range->end; range = range->chain ? __va(range->chain) : NULL, \
+ *p_start = range ? range->start : 0, *p_end = range ? range->end : 0)
+
+static inline struct reserved_range *__physmem_reserved_next(enum reserved_range_type *t,
+ struct reserved_range *range)
+{
+ if (!range) {
+ range = &physmem_info.reserved[*t];
+ if (range->end)
+ return range;
+ }
+ if (range->chain)
+ return __va(range->chain);
+ while (++*t < RR_MAX) {
+ range = &physmem_info.reserved[*t];
+ if (range->end)
+ return range;
+ }
+ return NULL;
+}
+
+#define for_each_physmem_reserved_range(t, range, p_start, p_end) \
+ for (t = 0, range = __physmem_reserved_next(&t, NULL), \
+ *p_start = range ? range->start : 0, *p_end = range ? range->end : 0; \
+ range; range = __physmem_reserved_next(&t, range), \
+ *p_start = range ? range->start : 0, *p_end = range ? range->end : 0)
+
+static inline unsigned long get_physmem_reserved(enum reserved_range_type type,
+ unsigned long *addr, unsigned long *size)
+{
+ *addr = physmem_info.reserved[type].start;
+ *size = physmem_info.reserved[type].end - physmem_info.reserved[type].start;
+ return *size;
+}
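Since get_physmem_reserved() returns the size it reports, a caller can treat a zero return as "no such reservation recorded". A hedged sketch (caller name invented):

/* Illustrative only: bail out early when no initrd range was reserved. */
static void example_handle_initrd(void)
{
	unsigned long addr, size;

	if (!get_physmem_reserved(RR_INITRD, &addr, &size))
		return;		/* nothing reserved for an initrd */
	/* ... otherwise operate on [addr, addr + size) ... */
}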
+
+#endif
diff --git a/arch/s390/include/asm/pkey.h b/arch/s390/include/asm/pkey.h
index dd3d20c332ac..47d80a7451a6 100644
--- a/arch/s390/include/asm/pkey.h
+++ b/arch/s390/include/asm/pkey.h
@@ -2,7 +2,7 @@
/*
* Kernelspace interface to the pkey device driver
*
- * Copyright IBM Corp. 2016,2019
+ * Copyright IBM Corp. 2016, 2023
*
* Author: Harald Freudenberger <freude@de.ibm.com>
*
@@ -23,6 +23,6 @@
* @return 0 on success, negative errno value on failure
*/
int pkey_keyblob2pkey(const u8 *key, u32 keylen,
- struct pkey_protkey *protkey);
+ u8 *protkey, u32 *protkeylen, u32 *protkeytype);
#endif /* _KAPI_PKEY_H */
diff --git a/arch/s390/include/asm/preempt.h b/arch/s390/include/asm/preempt.h
index b5ea9e14c017..bf15da0fedbc 100644
--- a/arch/s390/include/asm/preempt.h
+++ b/arch/s390/include/asm/preempt.h
@@ -29,12 +29,6 @@ static inline void preempt_count_set(int pc)
old, new) != old);
}
-#define init_task_preempt_count(p) do { } while (0)
-
-#define init_idle_preempt_count(p, cpu) do { \
- S390_lowcore.preempt_count = PREEMPT_ENABLED; \
-} while (0)
-
static inline void set_preempt_need_resched(void)
{
__atomic_and(~PREEMPT_NEED_RESCHED, &S390_lowcore.preempt_count);
@@ -52,10 +46,17 @@ static inline bool test_preempt_need_resched(void)
static inline void __preempt_count_add(int val)
{
- if (__builtin_constant_p(val) && (val >= -128) && (val <= 127))
- __atomic_add_const(val, &S390_lowcore.preempt_count);
- else
- __atomic_add(val, &S390_lowcore.preempt_count);
+ /*
+ * With some obscure config options and CONFIG_PROFILE_ALL_BRANCHES
+ * enabled, gcc 12 fails to handle __builtin_constant_p().
+ */
+ if (!IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES)) {
+ if (__builtin_constant_p(val) && (val >= -128) && (val <= 127)) {
+ __atomic_add_const(val, &S390_lowcore.preempt_count);
+ return;
+ }
+ }
+ __atomic_add(val, &S390_lowcore.preempt_count);
}
static inline void __preempt_count_sub(int val)
@@ -88,12 +89,6 @@ static inline void preempt_count_set(int pc)
S390_lowcore.preempt_count = pc;
}
-#define init_task_preempt_count(p) do { } while (0)
-
-#define init_idle_preempt_count(p, cpu) do { \
- S390_lowcore.preempt_count = PREEMPT_ENABLED; \
-} while (0)
-
static inline void set_preempt_need_resched(void)
{
}
@@ -130,11 +125,15 @@ static inline bool should_resched(int preempt_offset)
#endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */
-#ifdef CONFIG_PREEMPT
-extern asmlinkage void preempt_schedule(void);
+#define init_task_preempt_count(p) do { } while (0)
+/* Deferred to CPU bringup time */
+#define init_idle_preempt_count(p, cpu) do { } while (0)
+
+#ifdef CONFIG_PREEMPTION
+extern void preempt_schedule(void);
#define __preempt_schedule() preempt_schedule()
-extern asmlinkage void preempt_schedule_notrace(void);
+extern void preempt_schedule_notrace(void);
#define __preempt_schedule_notrace() preempt_schedule_notrace()
-#endif /* CONFIG_PREEMPT */
+#endif /* CONFIG_PREEMPTION */
#endif /* __ASM_PREEMPT_H */
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index 361ef5eda468..c0b6e74d899a 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -14,26 +14,20 @@
#include <linux/bits.h>
-#define CIF_MCCK_PENDING 0 /* machine check handling is pending */
-#define CIF_ASCE_PRIMARY 1 /* primary asce needs fixup / uaccess */
-#define CIF_ASCE_SECONDARY 2 /* secondary asce needs fixup / uaccess */
-#define CIF_NOHZ_DELAY 3 /* delay HZ disable for a tick */
-#define CIF_FPU 4 /* restore FPU registers */
-#define CIF_IGNORE_IRQ 5 /* ignore interrupt (for udelay) */
-#define CIF_ENABLED_WAIT 6 /* in enabled wait state */
-#define CIF_MCCK_GUEST 7 /* machine check happening in guest */
-#define CIF_DEDICATED_CPU 8 /* this CPU is dedicated */
-
-#define _CIF_MCCK_PENDING BIT(CIF_MCCK_PENDING)
-#define _CIF_ASCE_PRIMARY BIT(CIF_ASCE_PRIMARY)
-#define _CIF_ASCE_SECONDARY BIT(CIF_ASCE_SECONDARY)
+#define CIF_NOHZ_DELAY 2 /* delay HZ disable for a tick */
+#define CIF_FPU 3 /* restore FPU registers */
+#define CIF_ENABLED_WAIT 5 /* in enabled wait state */
+#define CIF_MCCK_GUEST 6 /* machine check happening in guest */
+#define CIF_DEDICATED_CPU 7 /* this CPU is dedicated */
+
#define _CIF_NOHZ_DELAY BIT(CIF_NOHZ_DELAY)
#define _CIF_FPU BIT(CIF_FPU)
-#define _CIF_IGNORE_IRQ BIT(CIF_IGNORE_IRQ)
#define _CIF_ENABLED_WAIT BIT(CIF_ENABLED_WAIT)
#define _CIF_MCCK_GUEST BIT(CIF_MCCK_GUEST)
#define _CIF_DEDICATED_CPU BIT(CIF_DEDICATED_CPU)
+#define RESTART_FLAG_CTLREGS _AC(1 << 0, U)
+
#ifndef __ASSEMBLY__
#include <linux/cpumask.h>
@@ -46,30 +40,50 @@
#include <asm/runtime_instr.h>
#include <asm/fpu/types.h>
#include <asm/fpu/internal.h>
+#include <asm/irqflags.h>
+
+typedef long (*sys_call_ptr_t)(struct pt_regs *regs);
-static inline void set_cpu_flag(int flag)
+static __always_inline void set_cpu_flag(int flag)
{
S390_lowcore.cpu_flags |= (1UL << flag);
}
-static inline void clear_cpu_flag(int flag)
+static __always_inline void clear_cpu_flag(int flag)
{
S390_lowcore.cpu_flags &= ~(1UL << flag);
}
-static inline int test_cpu_flag(int flag)
+static __always_inline bool test_cpu_flag(int flag)
+{
+ return S390_lowcore.cpu_flags & (1UL << flag);
+}
+
+static __always_inline bool test_and_set_cpu_flag(int flag)
{
- return !!(S390_lowcore.cpu_flags & (1UL << flag));
+ if (test_cpu_flag(flag))
+ return true;
+ set_cpu_flag(flag);
+ return false;
+}
+
+static __always_inline bool test_and_clear_cpu_flag(int flag)
+{
+ if (!test_cpu_flag(flag))
+ return false;
+ clear_cpu_flag(flag);
+ return true;
}
/*
* Test CIF flag of another CPU. The caller needs to ensure that
* CPU hotplug can not happen, e.g. by disabling preemption.
*/
-static inline int test_cpu_flag_of(int flag, int cpu)
+static __always_inline bool test_cpu_flag_of(int flag, int cpu)
{
struct lowcore *lc = lowcore_ptr[cpu];
- return !!(lc->cpu_flags & (1UL << flag));
+
+ return lc->cpu_flags & (1UL << flag);
}
#define arch_needs_cpu() test_cpu_flag(CIF_NOHZ_DELAY)
@@ -84,65 +98,93 @@ void s390_update_cpu_mhz(void);
void cpu_detect_mhz_feature(void);
extern const struct seq_operations cpuinfo_op;
-extern int sysctl_ieee_emulation_warnings;
extern void execve_tail(void);
-extern void __bpon(void);
+unsigned long vdso_size(void);
/*
 * User space process size: 2GB for 31 bit, 4TB or 8PB for 64 bit.
*/
-#define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_31BIT) ? \
- (1UL << 31) : -PAGE_SIZE)
+#define TASK_SIZE (test_thread_flag(TIF_31BIT) ? \
+ _REGION3_SIZE : TASK_SIZE_MAX)
#define TASK_UNMAPPED_BASE (test_thread_flag(TIF_31BIT) ? \
- (1UL << 30) : (1UL << 41))
-#define TASK_SIZE TASK_SIZE_OF(current)
+ (_REGION3_SIZE >> 1) : (_REGION2_SIZE >> 1))
#define TASK_SIZE_MAX (-PAGE_SIZE)
-#define STACK_TOP (test_thread_flag(TIF_31BIT) ? \
- (1UL << 31) : (1UL << 42))
-#define STACK_TOP_MAX (1UL << 42)
+#define VDSO_BASE (STACK_TOP + PAGE_SIZE)
+#define VDSO_LIMIT (test_thread_flag(TIF_31BIT) ? _REGION3_SIZE : _REGION2_SIZE)
+#define STACK_TOP (VDSO_LIMIT - vdso_size() - PAGE_SIZE)
+#define STACK_TOP_MAX (_REGION2_SIZE - vdso_size() - PAGE_SIZE)
#define HAVE_ARCH_PICK_MMAP_LAYOUT
-typedef unsigned int mm_segment_t;
+#define __stackleak_poison __stackleak_poison
+static __always_inline void __stackleak_poison(unsigned long erase_low,
+ unsigned long erase_high,
+ unsigned long poison)
+{
+ unsigned long tmp, count;
+
+ count = erase_high - erase_low;
+ if (!count)
+ return;
+ asm volatile(
+ " cghi %[count],8\n"
+ " je 2f\n"
+ " aghi %[count],-(8+1)\n"
+ " srlg %[tmp],%[count],8\n"
+ " ltgr %[tmp],%[tmp]\n"
+ " jz 1f\n"
+ "0: stg %[poison],0(%[addr])\n"
+ " mvc 8(256-8,%[addr]),0(%[addr])\n"
+ " la %[addr],256(%[addr])\n"
+ " brctg %[tmp],0b\n"
+ "1: stg %[poison],0(%[addr])\n"
+ " larl %[tmp],3f\n"
+ " ex %[count],0(%[tmp])\n"
+ " j 4f\n"
+ "2: stg %[poison],0(%[addr])\n"
+ " j 4f\n"
+ "3: mvc 8(1,%[addr]),0(%[addr])\n"
+ "4:\n"
+ : [addr] "+&a" (erase_low), [count] "+&d" (count), [tmp] "=&a" (tmp)
+ : [poison] "d" (poison)
+ : "memory", "cc"
+ );
+}
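The net effect of the inline assembly above is to fill [erase_low, erase_high) with the poison value, eight bytes at a time; the STG plus overlapping MVC pairs simply propagate the first stored quadword through each 256-byte chunk instead of looping per word. A rough C model of the behaviour (assuming, as stackleak guarantees, that the length is a multiple of 8):

/* Rough behavioural model only; the real code relies on STG + MVC propagation. */
static void stackleak_poison_model(unsigned long erase_low,
				   unsigned long erase_high,
				   unsigned long poison)
{
	unsigned long *p = (unsigned long *)erase_low;

	while ((unsigned long)p < erase_high)
		*p++ = poison;
}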
/*
* Thread structure
*/
struct thread_struct {
unsigned int acrs[NUM_ACRS];
- unsigned long ksp; /* kernel stack pointer */
- unsigned long user_timer; /* task cputime in user space */
- unsigned long guest_timer; /* task cputime in kvm guest */
- unsigned long system_timer; /* task cputime in kernel space */
- unsigned long hardirq_timer; /* task cputime in hardirq context */
- unsigned long softirq_timer; /* task cputime in softirq context */
- unsigned long sys_call_table; /* system call table address */
- mm_segment_t mm_segment;
- unsigned long gmap_addr; /* address of last gmap fault. */
- unsigned int gmap_write_flag; /* gmap fault write indication */
- unsigned int gmap_int_code; /* int code of last gmap fault */
- unsigned int gmap_pfault; /* signal of a pending guest pfault */
+ unsigned long ksp; /* kernel stack pointer */
+ unsigned long user_timer; /* task cputime in user space */
+ unsigned long guest_timer; /* task cputime in kvm guest */
+ unsigned long system_timer; /* task cputime in kernel space */
+ unsigned long hardirq_timer; /* task cputime in hardirq context */
+ unsigned long softirq_timer; /* task cputime in softirq context */
+ const sys_call_ptr_t *sys_call_table; /* system call table address */
+ unsigned long gmap_addr; /* address of last gmap fault. */
+ unsigned int gmap_write_flag; /* gmap fault write indication */
+ unsigned int gmap_int_code; /* int code of last gmap fault */
+ unsigned int gmap_pfault; /* signal of a pending guest pfault */
+
/* Per-thread information related to debugging */
- struct per_regs per_user; /* User specified PER registers */
- struct per_event per_event; /* Cause of the last PER trap */
- unsigned long per_flags; /* Flags to control debug behavior */
- unsigned int system_call; /* system call number in signal */
- unsigned long last_break; /* last breaking-event-address. */
- /* pfault_wait is used to block the process on a pfault event */
+ struct per_regs per_user; /* User specified PER registers */
+ struct per_event per_event; /* Cause of the last PER trap */
+ unsigned long per_flags; /* Flags to control debug behavior */
+ unsigned int system_call; /* system call number in signal */
+ unsigned long last_break; /* last breaking-event-address. */
+ /* pfault_wait is used to block the process on a pfault event */
unsigned long pfault_wait;
struct list_head list;
/* cpu runtime instrumentation */
struct runtime_instr_cb *ri_cb;
- struct gs_cb *gs_cb; /* Current guarded storage cb */
- struct gs_cb *gs_bc_cb; /* Broadcast guarded storage cb */
- unsigned char trap_tdb[256]; /* Transaction abort diagnose block */
- /*
- * Warning: 'fpu' is dynamically-sized. It *MUST* be at
- * the end.
- */
- struct fpu fpu; /* FP and VX register save area */
+ struct gs_cb *gs_cb; /* Current guarded storage cb */
+ struct gs_cb *gs_bc_cb; /* Broadcast guarded storage cb */
+ struct pgm_tdb trap_tdb; /* Transaction abort diagnose block */
+ struct fpu fpu; /* FP and VX register save area */
};
/* Flag to disable transactions. */
@@ -162,6 +204,7 @@ typedef struct thread_struct thread_struct;
#define INIT_THREAD { \
.ksp = sizeof(init_stack) + (unsigned long) &init_stack, \
.fpu.regs = (void *) init_task.thread.fpu.fprs, \
+ .last_break = 1, \
}
/*
@@ -178,11 +221,9 @@ typedef struct thread_struct thread_struct;
regs->psw.mask = PSW_USER_BITS | PSW_MASK_BA; \
regs->psw.addr = new_psw; \
regs->gprs[15] = new_stackp; \
- crst_table_downgrade(current->mm); \
execve_tail(); \
} while (0)
-/* Forward declaration, a strange C thing */
struct task_struct;
struct mm_struct;
struct seq_file;
@@ -191,13 +232,11 @@ struct pt_regs;
void show_registers(struct pt_regs *regs);
void show_cacheinfo(struct seq_file *m);
-/* Free all resources held by a thread. */
-static inline void release_thread(struct task_struct *tsk) { }
-
/* Free guarded storage control block */
void guarded_storage_release(struct task_struct *tsk);
+void gs_load_bc_cb(struct pt_regs *regs);
-unsigned long get_wchan(struct task_struct *p);
+unsigned long __get_wchan(struct task_struct *p);
#define task_pt_regs(tsk) ((struct pt_regs *) \
(task_stack_page(tsk) + THREAD_SIZE) - 1)
#define KSTK_EIP(tsk) (task_pt_regs(tsk)->psw.addr)
@@ -206,15 +245,25 @@ unsigned long get_wchan(struct task_struct *p);
/* Has task runtime instrumentation enabled ? */
#define is_ri_task(tsk) (!!(tsk)->thread.ri_cb)
-static __always_inline unsigned long current_stack_pointer(void)
+/* avoid using a global register variable due to a gcc bug in versions < 8.4 */
+#define current_stack_pointer (__current_stack_pointer())
+
+static __always_inline unsigned long __current_stack_pointer(void)
{
unsigned long sp;
- asm volatile("la %0,0(15)" : "=a" (sp));
+ asm volatile("lgr %0,15" : "=d" (sp));
return sp;
}
-static __no_kasan_or_inline unsigned short stap(void)
+static __always_inline bool on_thread_stack(void)
+{
+ unsigned long ksp = S390_lowcore.kernel_stack;
+
+ return !((ksp ^ current_stack_pointer) & ~(THREAD_SIZE - 1));
+}
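on_thread_stack() works because both pointers fall into the same THREAD_SIZE-aligned block exactly when XORing them leaves only in-block offset bits. A tiny standalone check of that identity; the THREAD_SIZE value here is only for illustration, not the real s390 configuration:

#include <stdbool.h>
#include <stdio.h>

#define THREAD_SIZE (1UL << 14)	/* illustrative 16 KB stacks; real value may differ */

static bool same_stack(unsigned long ksp, unsigned long sp)
{
	return !((ksp ^ sp) & ~(THREAD_SIZE - 1));
}

int main(void)
{
	printf("%d\n", same_stack(0x3ffe0000f00UL, 0x3ffe0003000UL));	/* 1: same block */
	printf("%d\n", same_stack(0x3ffe0000f00UL, 0x3ffe0005000UL));	/* 0: different */
	return 0;
}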
+
+static __always_inline unsigned short stap(void)
{
unsigned short cpu_address;
@@ -231,8 +280,7 @@ static inline unsigned long __ecag(unsigned int asi, unsigned char parm)
{
unsigned long val;
- asm volatile(".insn rsy,0xeb000000004c,%0,0,0(%1)" /* ecag */
- : "=d" (val) : "a" (asi << 8 | parm));
+ asm volatile("ecag %0,0,0(%1)" : "=d" (val) : "a" (asi << 8 | parm));
return val;
}
@@ -253,7 +301,7 @@ static inline void __load_psw(psw_t psw)
* Set PSW mask to specified value, while leaving the
* PSW addr pointing to the next instruction.
*/
-static __no_kasan_or_inline void __load_psw_mask(unsigned long mask)
+static __always_inline void __load_psw_mask(unsigned long mask)
{
unsigned long addr;
psw_t psw;
@@ -279,14 +327,36 @@ static inline unsigned long __extract_psw(void)
return (((unsigned long) reg1) << 32) | ((unsigned long) reg2);
}
-static inline void local_mcck_enable(void)
+static inline unsigned long __local_mcck_save(void)
{
- __load_psw_mask(__extract_psw() | PSW_MASK_MCHECK);
+ unsigned long mask = __extract_psw();
+
+ __load_psw_mask(mask & ~PSW_MASK_MCHECK);
+ return mask & PSW_MASK_MCHECK;
+}
+
+#define local_mcck_save(mflags) \
+do { \
+ typecheck(unsigned long, mflags); \
+ mflags = __local_mcck_save(); \
+} while (0)
+
+static inline void local_mcck_restore(unsigned long mflags)
+{
+ unsigned long mask = __extract_psw();
+
+ mask &= ~PSW_MASK_MCHECK;
+ __load_psw_mask(mask | mflags);
}
static inline void local_mcck_disable(void)
{
- __load_psw_mask(__extract_psw() & ~PSW_MASK_MCHECK);
+ __local_mcck_save();
+}
+
+static inline void local_mcck_enable(void)
+{
+ __load_psw_mask(__extract_psw() | PSW_MASK_MCHECK);
}
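local_mcck_save()/local_mcck_restore() mirror the local_irq_save()/local_irq_restore() pattern for the machine-check mask bit, so the previous state is preserved across a critical section. A hedged usage sketch:

/* Illustrative only: run a short section with machine checks masked off. */
static void example_mcck_critical_section(void)
{
	unsigned long mflags;

	local_mcck_save(mflags);	/* mask MCCKs, remember the old mask bit */
	/* ... work that must not be interrupted by a machine check ... */
	local_mcck_restore(mflags);	/* re-establish the previous MCCK state */
}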
/*
@@ -303,11 +373,6 @@ static inline unsigned long __rewind_psw(psw_t psw, unsigned long ilc)
}
/*
- * Function to stop a processor until the next interrupt occurs
- */
-void enabled_wait(void);
-
-/*
* Function to drop a processor into disabled wait state
*/
static __always_inline void __noreturn disabled_wait(void)
@@ -320,30 +385,12 @@ static __always_inline void __noreturn disabled_wait(void)
while (1);
}
-/*
- * Basic Machine Check/Program Check Handler.
- */
-
-extern void s390_base_pgm_handler(void);
-extern void s390_base_ext_handler(void);
-
-extern void (*s390_base_pgm_handler_fn)(void);
-extern void (*s390_base_ext_handler_fn)(void);
-
#define ARCH_LOW_ADDRESS_LIMIT 0x7fffffffUL
-extern int memcpy_real(void *, void *, size_t);
-extern void memcpy_absolute(void *, void *, size_t);
-
-#define mem_assign_absolute(dest, val) do { \
- __typeof__(dest) __tmp = (val); \
- \
- BUILD_BUG_ON(sizeof(__tmp) != sizeof(val)); \
- memcpy_absolute(&(dest), &__tmp, sizeof(__tmp)); \
-} while (0)
-
-extern int s390_isolate_bp(void);
-extern int s390_isolate_bp_guest(void);
+static __always_inline bool regs_irqs_disabled(struct pt_regs *regs)
+{
+ return arch_irqs_disabled_flags(regs->psw.mask);
+}
#endif /* __ASSEMBLY__ */
diff --git a/arch/s390/include/asm/ptdump.h b/arch/s390/include/asm/ptdump.h
new file mode 100644
index 000000000000..f960b2896606
--- /dev/null
+++ b/arch/s390/include/asm/ptdump.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _ASM_S390_PTDUMP_H
+#define _ASM_S390_PTDUMP_H
+
+void ptdump_check_wx(void);
+
+static inline void debug_checkwx(void)
+{
+ if (IS_ENABLED(CONFIG_DEBUG_WX))
+ ptdump_check_wx();
+}
+
+#endif /* _ASM_S390_PTDUMP_H */
diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h
index f009a13afe71..d28bf8fb2799 100644
--- a/arch/s390/include/asm/ptrace.h
+++ b/arch/s390/include/asm/ptrace.h
@@ -9,25 +9,54 @@
#include <linux/bits.h>
#include <uapi/asm/ptrace.h>
+#include <asm/tpi.h>
-#define PIF_SYSCALL 0 /* inside a system call */
-#define PIF_PER_TRAP 1 /* deliver sigtrap on return to user */
-#define PIF_SYSCALL_RESTART 2 /* restart the current system call */
-#define PIF_GUEST_FAULT 3 /* indicates program check in sie64a */
+#define PIF_SYSCALL 0 /* inside a system call */
+#define PIF_EXECVE_PGSTE_RESTART 1 /* restart execve for PGSTE binaries */
+#define PIF_SYSCALL_RET_SET 2 /* return value was set via ptrace */
+#define PIF_GUEST_FAULT 3 /* indicates program check in sie64a */
+#define PIF_FTRACE_FULL_REGS 4 /* all register contents valid (ftrace) */
-#define _PIF_SYSCALL BIT(PIF_SYSCALL)
-#define _PIF_PER_TRAP BIT(PIF_PER_TRAP)
-#define _PIF_SYSCALL_RESTART BIT(PIF_SYSCALL_RESTART)
-#define _PIF_GUEST_FAULT BIT(PIF_GUEST_FAULT)
+#define _PIF_SYSCALL BIT(PIF_SYSCALL)
+#define _PIF_EXECVE_PGSTE_RESTART BIT(PIF_EXECVE_PGSTE_RESTART)
+#define _PIF_SYSCALL_RET_SET BIT(PIF_SYSCALL_RET_SET)
+#define _PIF_GUEST_FAULT BIT(PIF_GUEST_FAULT)
+#define _PIF_FTRACE_FULL_REGS BIT(PIF_FTRACE_FULL_REGS)
-#ifndef __ASSEMBLY__
+#define PSW32_MASK_PER _AC(0x40000000, UL)
+#define PSW32_MASK_DAT _AC(0x04000000, UL)
+#define PSW32_MASK_IO _AC(0x02000000, UL)
+#define PSW32_MASK_EXT _AC(0x01000000, UL)
+#define PSW32_MASK_KEY _AC(0x00F00000, UL)
+#define PSW32_MASK_BASE _AC(0x00080000, UL) /* Always one */
+#define PSW32_MASK_MCHECK _AC(0x00040000, UL)
+#define PSW32_MASK_WAIT _AC(0x00020000, UL)
+#define PSW32_MASK_PSTATE _AC(0x00010000, UL)
+#define PSW32_MASK_ASC _AC(0x0000C000, UL)
+#define PSW32_MASK_CC _AC(0x00003000, UL)
+#define PSW32_MASK_PM _AC(0x00000f00, UL)
+#define PSW32_MASK_RI _AC(0x00000080, UL)
+
+#define PSW32_ADDR_AMODE _AC(0x80000000, UL)
+#define PSW32_ADDR_INSN _AC(0x7FFFFFFF, UL)
+
+#define PSW32_DEFAULT_KEY ((PAGE_DEFAULT_ACC) << 20)
+
+#define PSW32_ASC_PRIMARY _AC(0x00000000, UL)
+#define PSW32_ASC_ACCREG _AC(0x00004000, UL)
+#define PSW32_ASC_SECONDARY _AC(0x00008000, UL)
+#define PSW32_ASC_HOME _AC(0x0000C000, UL)
+
+#define PSW_DEFAULT_KEY ((PAGE_DEFAULT_ACC) << 52)
#define PSW_KERNEL_BITS (PSW_DEFAULT_KEY | PSW_MASK_BASE | PSW_ASC_HOME | \
- PSW_MASK_EA | PSW_MASK_BA)
+ PSW_MASK_EA | PSW_MASK_BA | PSW_MASK_DAT)
#define PSW_USER_BITS (PSW_MASK_DAT | PSW_MASK_IO | PSW_MASK_EXT | \
PSW_DEFAULT_KEY | PSW_MASK_BASE | PSW_MASK_MCHECK | \
PSW_MASK_PSTATE | PSW_ASC_PRIMARY)
+#ifndef __ASSEMBLY__
+
struct psw_bits {
unsigned long : 1;
unsigned long per : 1; /* PER-Mask */
@@ -68,12 +97,19 @@ enum {
&(*(struct psw_bits *)(&(__psw))); \
}))
+typedef struct {
+ unsigned int mask;
+ unsigned int addr;
+} psw_t32 __aligned(8);
+
+#define PGM_INT_CODE_MASK 0x7f
+#define PGM_INT_CODE_PER 0x80
+
/*
* The pt_regs struct defines the way the registers are stored on
* the stack during a system call.
*/
-struct pt_regs
-{
+struct pt_regs {
union {
user_pt_regs user_regs;
struct {
@@ -83,10 +119,17 @@ struct pt_regs
};
};
unsigned long orig_gpr2;
- unsigned int int_code;
- unsigned int int_parm;
- unsigned long int_parm_long;
+ union {
+ struct {
+ unsigned int int_code;
+ unsigned int int_parm;
+ unsigned long int_parm_long;
+ };
+ struct tpi_info tpi_info;
+ };
unsigned long flags;
+ unsigned long cr1;
+ unsigned long last_break;
};
/*
@@ -152,6 +195,14 @@ static inline int test_pt_regs_flag(struct pt_regs *regs, int flag)
return !!(regs->flags & (1UL << flag));
}
+static inline int test_and_clear_pt_regs_flag(struct pt_regs *regs, int flag)
+{
+ int ret = test_pt_regs_flag(regs, flag);
+
+ clear_pt_regs_flag(regs, flag);
+ return ret;
+}
+
/*
* These are defined as per linux/ptrace.h, which see.
*/
@@ -179,10 +230,34 @@ const char *regs_query_register_name(unsigned int offset);
unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset);
unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n);
+/**
+ * regs_get_kernel_argument() - get Nth function argument in kernel
+ * @regs: pt_regs of that context
+ * @n: function argument number (start from 0)
+ *
+ * regs_get_kernel_argument() returns @n th argument of the function call.
+ */
+static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs,
+ unsigned int n)
+{
+ unsigned int argoffset = STACK_FRAME_OVERHEAD / sizeof(long);
+
+#define NR_REG_ARGUMENTS 5
+ if (n < NR_REG_ARGUMENTS)
+ return regs_get_register(regs, 2 + n);
+ n -= NR_REG_ARGUMENTS;
+ return regs_get_kernel_stack_nth(regs, argoffset + n);
+}
+
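As a hedged illustration of the helper above, the following sketch shows how a kprobes pre-handler might read the first few arguments of a probed function; the handler name and the number of arguments are purely illustrative and not part of this patch.

/* Illustrative only: dump the first three arguments of a probed function.
 * Assumes CONFIG_KPROBES; not taken from this patch. */
#include <linux/kprobes.h>
#include <linux/printk.h>
#include <linux/ptrace.h>

static int example_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
	pr_info("arg0=%lx arg1=%lx arg2=%lx\n",
		regs_get_kernel_argument(regs, 0),
		regs_get_kernel_argument(regs, 1),
		regs_get_kernel_argument(regs, 2));
	return 0;
}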
static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
{
return regs->gprs[15];
}
+static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
+{
+ regs->gprs[2] = rc;
+}
+
#endif /* __ASSEMBLY__ */
#endif /* _S390_PTRACE_H */
diff --git a/arch/s390/include/asm/qdio.h b/arch/s390/include/asm/qdio.h
index 71e3f0146cda..2f983e0b95e0 100644
--- a/arch/s390/include/asm/qdio.h
+++ b/arch/s390/include/asm/qdio.h
@@ -18,7 +18,6 @@
#define QDIO_MAX_BUFFERS_MASK (QDIO_MAX_BUFFERS_PER_Q - 1)
#define QDIO_BUFNR(num) ((num) & QDIO_MAX_BUFFERS_MASK)
#define QDIO_MAX_ELEMENTS_PER_BUFFER 16
-#define QDIO_SBAL_SIZE 256
#define QDIO_QETH_QFMT 0
#define QDIO_ZFCP_QFMT 1
@@ -26,9 +25,9 @@
/**
* struct qdesfmt0 - queue descriptor, format 0
- * @sliba: storage list information block address
- * @sla: storage list address
- * @slsba: storage list state block address
+ * @sliba: absolute address of storage list information block
+ * @sla: absolute address of storage list
+ * @slsba: absolute address of storage list state block
* @akey: access key for SLIB
* @bkey: access key for SL
* @ckey: access key for SBALs
@@ -56,7 +55,7 @@ struct qdesfmt0 {
* @oqdcnt: output queue descriptor count
* @iqdsz: input queue descriptor size
* @oqdsz: output queue descriptor size
- * @qiba: queue information block address
+ * @qiba: absolute address of queue information block
* @qkey: queue information block key
* @qdf0: queue descriptions
*/
@@ -92,8 +91,8 @@ struct qdr {
* @pfmt: implementation dependent parameter format
* @rflags: QEBSM
* @ac: adapter characteristics
- * @isliba: absolute address of first input SLIB
- * @osliba: absolute address of first output SLIB
+ * @isliba: logical address of first input SLIB
+ * @osliba: logical address of first output SLIB
* @ebcnam: adapter identifier in EBCDIC
* @parm: implementation dependent parameters
*/
@@ -134,10 +133,9 @@ struct slibe {
* @sb_count: number of storage blocks
* @sba: storage block element addresses
* @dcount: size of storage block elements
- * @user0: user defineable value
- * @res4: reserved paramater
- * @user1: user defineable value
- * @user2: user defineable value
+ * @user0: user definable value
+ * @res4: reserved parameter
+ * @user1: user definable value
*/
struct qaob {
u64 res0[6];
@@ -152,8 +150,7 @@ struct qaob {
u16 dcount[QDIO_MAX_ELEMENTS_PER_BUFFER];
u64 user0;
u64 res4[2];
- u64 user1;
- u64 user2;
+ u8 user1[16];
} __attribute__ ((packed, aligned(256)));
/**
@@ -201,7 +198,7 @@ struct slib {
* @scount: SBAL count
* @sflags: whole SBAL flags
* @length: length
- * @addr: address
+ * @addr: absolute data address
*/
struct qdio_buffer_element {
u8 eflags;
@@ -211,7 +208,7 @@ struct qdio_buffer_element {
u8 scount;
u8 sflags;
u32 length;
- void *addr;
+ u64 addr;
} __attribute__ ((packed, aligned(16)));
/**
@@ -227,7 +224,7 @@ struct qdio_buffer {
* @sbal: absolute SBAL address
*/
struct sl_element {
- unsigned long sbal;
+ u64 sbal;
} __attribute__ ((packed));
/**
@@ -246,25 +243,8 @@ struct slsb {
u8 val[QDIO_MAX_BUFFERS_PER_Q];
} __attribute__ ((packed, aligned(256)));
-/**
- * struct qdio_outbuf_state - SBAL related asynchronous operation information
- * (for communication with upper layer programs)
- * (only required for use with completion queues)
- * @flags: flags indicating state of buffer
- * @user: pointer to upper layer program's state information related to SBAL
- * (stored in user1 data of QAOB)
- */
-struct qdio_outbuf_state {
- u8 flags;
- void *user;
-};
-
-#define QDIO_OUTBUF_STATE_FLAG_PENDING 0x01
-
-#define CHSC_AC1_INITIATE_INPUTQ 0x80
-
-
/* qdio adapter-characteristics-1 flag */
+#define CHSC_AC1_INITIATE_INPUTQ 0x80
#define AC1_SIGA_INPUT_NEEDED 0x40 /* process input queues */
#define AC1_SIGA_OUTPUT_NEEDED 0x20 /* process output queues */
#define AC1_SIGA_SYNC_NEEDED 0x10 /* ask hypervisor to sync */
@@ -310,14 +290,14 @@ struct qdio_ssqd_desc {
typedef void qdio_handler_t(struct ccw_device *, unsigned int, int,
int, int, unsigned long);
-/* qdio errors reported to the upper-layer program */
+/* qdio errors reported through the queue handlers: */
#define QDIO_ERROR_ACTIVATE 0x0001
#define QDIO_ERROR_GET_BUF_STATE 0x0002
#define QDIO_ERROR_SET_BUF_STATE 0x0004
-#define QDIO_ERROR_SLSB_STATE 0x0100
-#define QDIO_ERROR_FATAL 0x00ff
-#define QDIO_ERROR_TEMPORARY 0xff00
+/* extra info for completed SBALs: */
+#define QDIO_ERROR_SLSB_STATE 0x0100
+#define QDIO_ERROR_SLSB_PENDING 0x0200
/* for qdio_cleanup */
#define QDIO_FLAG_CLEANUP_USING_CLEAR 0x01
@@ -325,109 +305,60 @@ typedef void qdio_handler_t(struct ccw_device *, unsigned int, int,
/**
* struct qdio_initialize - qdio initialization data
- * @cdev: associated ccw device
* @q_format: queue format
* @qdr_ac: feature flags to set
- * @adapter_name: name for the adapter
 * @qib_param_field_format: format for qib_param_field
* @qib_param_field: pointer to 128 bytes or NULL, if no param field
* @qib_rflags: rflags to set
- * @input_slib_elements: pointer to no_input_qs * 128 words of data or NULL
- * @output_slib_elements: pointer to no_output_qs * 128 words of data or NULL
* @no_input_qs: number of input queues
* @no_output_qs: number of output queues
- * @input_handler: handler to be called for input queues
+ * @input_handler: handler to be called for input queues, and device-wide errors
* @output_handler: handler to be called for output queues
- * @queue_start_poll_array: polling handlers (one per input queue or NULL)
+ * @irq_poll: Data IRQ polling handler
* @scan_threshold: # of in-use buffers that triggers scan on output queue
* @int_parm: interruption parameter
- * @input_sbal_addr_array: address of no_input_qs * 128 pointers
- * @output_sbal_addr_array: address of no_output_qs * 128 pointers
- * @output_sbal_state_array: no_output_qs * 128 state info (for CQ or NULL)
+ * @input_sbal_addr_array: per-queue array, each element points to 128 SBALs
+ * @output_sbal_addr_array: per-queue array, each element points to 128 SBALs
*/
struct qdio_initialize {
- struct ccw_device *cdev;
unsigned char q_format;
unsigned char qdr_ac;
- unsigned char adapter_name[8];
unsigned int qib_param_field_format;
unsigned char *qib_param_field;
unsigned char qib_rflags;
- unsigned long *input_slib_elements;
- unsigned long *output_slib_elements;
unsigned int no_input_qs;
unsigned int no_output_qs;
qdio_handler_t *input_handler;
qdio_handler_t *output_handler;
- void (**queue_start_poll_array) (struct ccw_device *, int,
- unsigned long);
- unsigned int scan_threshold;
+ void (*irq_poll)(struct ccw_device *cdev, unsigned long data);
unsigned long int_parm;
- struct qdio_buffer **input_sbal_addr_array;
- struct qdio_buffer **output_sbal_addr_array;
- struct qdio_outbuf_state *output_sbal_state_array;
+ struct qdio_buffer ***input_sbal_addr_array;
+ struct qdio_buffer ***output_sbal_addr_array;
};
-/**
- * enum qdio_brinfo_entry_type - type of address entry for qdio_brinfo_desc()
- * @l3_ipv6_addr: entry contains IPv6 address
- * @l3_ipv4_addr: entry contains IPv4 address
- * @l2_addr_lnid: entry contains MAC address and VLAN ID
- */
-enum qdio_brinfo_entry_type {l3_ipv6_addr, l3_ipv4_addr, l2_addr_lnid};
-
-/**
- * struct qdio_brinfo_entry_XXX - Address entry for qdio_brinfo_desc()
- * @nit: Network interface token
- * @addr: Address of one of the three types
- *
- * The struct is passed to the callback function by qdio_brinfo_desc()
- */
-struct qdio_brinfo_entry_l3_ipv6 {
- u64 nit;
- struct { unsigned char _s6_addr[16]; } addr;
-} __packed;
-struct qdio_brinfo_entry_l3_ipv4 {
- u64 nit;
- struct { uint32_t _s_addr; } addr;
-} __packed;
-struct qdio_brinfo_entry_l2 {
- u64 nit;
- struct { u8 mac[6]; u16 lnid; } addr_lnid;
-} __packed;
-
-#define QDIO_STATE_INACTIVE 0x00000002 /* after qdio_cleanup */
-#define QDIO_STATE_ESTABLISHED 0x00000004 /* after qdio_establish */
-#define QDIO_STATE_ACTIVE 0x00000008 /* after qdio_activate */
-#define QDIO_STATE_STOPPED 0x00000010 /* after queues went down */
-
-#define QDIO_FLAG_SYNC_INPUT 0x01
-#define QDIO_FLAG_SYNC_OUTPUT 0x02
-#define QDIO_FLAG_PCI_OUT 0x10
-
int qdio_alloc_buffers(struct qdio_buffer **buf, unsigned int count);
void qdio_free_buffers(struct qdio_buffer **buf, unsigned int count);
void qdio_reset_buffers(struct qdio_buffer **buf, unsigned int count);
-extern int qdio_allocate(struct qdio_initialize *);
-extern int qdio_establish(struct qdio_initialize *);
+extern int qdio_allocate(struct ccw_device *cdev, unsigned int no_input_qs,
+ unsigned int no_output_qs);
+extern int qdio_establish(struct ccw_device *cdev,
+ struct qdio_initialize *init_data);
extern int qdio_activate(struct ccw_device *);
-extern void qdio_release_aob(struct qaob *);
-extern int do_QDIO(struct ccw_device *, unsigned int, int, unsigned int,
- unsigned int);
-extern int qdio_start_irq(struct ccw_device *, int);
-extern int qdio_stop_irq(struct ccw_device *, int);
-extern int qdio_get_next_buffers(struct ccw_device *, int, int *, int *);
-extern int qdio_inspect_queue(struct ccw_device *cdev, unsigned int nr,
- bool is_input, unsigned int *bufnr,
- unsigned int *error);
+extern int qdio_start_irq(struct ccw_device *cdev);
+extern int qdio_stop_irq(struct ccw_device *cdev);
+extern int qdio_inspect_input_queue(struct ccw_device *cdev, unsigned int nr,
+ unsigned int *bufnr, unsigned int *error);
+extern int qdio_inspect_output_queue(struct ccw_device *cdev, unsigned int nr,
+ unsigned int *bufnr, unsigned int *error);
+extern int qdio_add_bufs_to_input_queue(struct ccw_device *cdev,
+ unsigned int q_nr, unsigned int bufnr,
+ unsigned int count);
+extern int qdio_add_bufs_to_output_queue(struct ccw_device *cdev,
+ unsigned int q_nr, unsigned int bufnr,
+ unsigned int count, struct qaob *aob);
extern int qdio_shutdown(struct ccw_device *, int);
extern int qdio_free(struct ccw_device *);
extern int qdio_get_ssqd_desc(struct ccw_device *, struct qdio_ssqd_desc *);
-extern int qdio_pnso_brinfo(struct subchannel_id schid,
- int cnc, u16 *response,
- void (*cb)(void *priv, enum qdio_brinfo_entry_type type,
- void *entry),
- void *priv);
#endif /* __QDIO_H__ */
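A rough sketch of how a driver might use the reworked setup API declared above (allocate, establish with a filled-in qdio_initialize, then activate); the single-queue configuration, field values and error handling are illustrative only, and the SBAL arrays are assumed to have been allocated beforehand with qdio_alloc_buffers().

/* Illustrative only: one input and one output queue. */
static int example_qdio_setup(struct ccw_device *cdev,
			      struct qdio_buffer **in_sbals,
			      struct qdio_buffer **out_sbals,
			      qdio_handler_t *in_handler,
			      qdio_handler_t *out_handler)
{
	struct qdio_initialize init = {
		.q_format		= QDIO_QETH_QFMT,
		.no_input_qs		= 1,
		.no_output_qs		= 1,
		.input_handler		= in_handler,
		.output_handler		= out_handler,
		/* per-queue arrays; one queue each, so pass &array directly */
		.input_sbal_addr_array	= &in_sbals,
		.output_sbal_addr_array	= &out_sbals,
	};
	int rc;

	rc = qdio_allocate(cdev, init.no_input_qs, init.no_output_qs);
	if (rc)
		return rc;
	rc = qdio_establish(cdev, &init);
	if (rc) {
		qdio_free(cdev);
		return rc;
	}
	return qdio_activate(cdev);
}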
diff --git a/arch/s390/include/asm/rwonce.h b/arch/s390/include/asm/rwonce.h
new file mode 100644
index 000000000000..91fc24520e82
--- /dev/null
+++ b/arch/s390/include/asm/rwonce.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __ASM_S390_RWONCE_H
+#define __ASM_S390_RWONCE_H
+
+#include <linux/compiler_types.h>
+
+/*
+ * Use READ_ONCE_ALIGNED_128() for 128-bit block concurrent (atomic) read
+ * accesses. Note that x must be 128-bit aligned, otherwise a specification
+ * exception is generated.
+ */
+#define READ_ONCE_ALIGNED_128(x) \
+({ \
+ union { \
+ typeof(x) __x; \
+ __uint128_t val; \
+ } __u; \
+ \
+ BUILD_BUG_ON(sizeof(x) != 16); \
+ asm volatile( \
+ " lpq %[val],%[_x]\n" \
+ : [val] "=d" (__u.val) \
+ : [_x] "QS" (x) \
+ : "memory"); \
+ __u.__x; \
+})
+
+#include <asm-generic/rwonce.h>
+
+#endif /* __ASM_S390_RWONCE_H */
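A minimal usage sketch for READ_ONCE_ALIGNED_128() above; the pair-of-pointers type and variable are invented for illustration, and the usual kernel __aligned() attribute is assumed to provide the required 16-byte alignment.

/* Illustrative only: read a 16-byte, 16-byte-aligned pair atomically. */
struct example_pair {
	void *first;
	void *second;
} __aligned(16);

static struct example_pair example_pair __aligned(16);

static inline struct example_pair example_read_pair(void)
{
	/* single LPQ, so both pointers are observed consistently */
	return READ_ONCE_ALIGNED_128(example_pair);
}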
diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h
index c563f8368b19..5742d23bba13 100644
--- a/arch/s390/include/asm/sclp.h
+++ b/arch/s390/include/asm/sclp.h
@@ -1,18 +1,25 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright IBM Corp. 2007
- * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>
*/
#ifndef _ASM_S390_SCLP_H
#define _ASM_S390_SCLP_H
#include <linux/types.h>
-#include <asm/chpid.h>
-#include <asm/cpu.h>
#define SCLP_CHP_INFO_MASK_SIZE 32
-#define SCLP_MAX_CORES 256
+#define EARLY_SCCB_SIZE PAGE_SIZE
+#define SCLP_MAX_CORES 512
+/* 144 + 16 * SCLP_MAX_CORES + 2 * (SCLP_MAX_CORES - 1) */
+#define EXT_SCCB_READ_SCP (3 * PAGE_SIZE)
+/* 24 + 16 * SCLP_MAX_CORES */
+#define EXT_SCCB_READ_CPU (3 * PAGE_SIZE)
+
+#ifndef __ASSEMBLY__
+#include <linux/uio.h>
+#include <asm/chpid.h>
+#include <asm/cpu.h>
struct sclp_chp_info {
u8 recognized[SCLP_CHP_INFO_MASK_SIZE];
@@ -79,8 +86,15 @@ struct sclp_info {
unsigned char has_kss : 1;
unsigned char has_gisaf : 1;
unsigned char has_diag318 : 1;
+ unsigned char has_diag320 : 1;
unsigned char has_sipl : 1;
+ unsigned char has_sipl_eckd : 1;
unsigned char has_dirq : 1;
+ unsigned char has_iplcc : 1;
+ unsigned char has_zpci_lsi : 1;
+ unsigned char has_aisii : 1;
+ unsigned char has_aeni : 1;
+ unsigned char has_aisi : 1;
unsigned int ibc;
unsigned int mtid;
unsigned int mtid_cp;
@@ -105,17 +119,21 @@ struct zpci_report_error_header {
* (OpenCrypto Successful Diagnostics Execution)
*/
	u16 length;		/* Length of Subsequent Data (up to 4K – SCLP header) */
- u8 data[0]; /* Subsequent Data passed verbatim to SCLP ET 24 */
+ u8 data[]; /* Subsequent Data passed verbatim to SCLP ET 24 */
} __packed;
+extern char *sclp_early_sccb;
+
+void sclp_early_adjust_va(void);
+void sclp_early_set_buffer(void *sccb);
int sclp_early_read_info(void);
int sclp_early_read_storage_info(void);
int sclp_early_get_core_info(struct sclp_core_info *info);
void sclp_early_get_ipl_info(struct sclp_ipl_info *info);
void sclp_early_detect(void);
void sclp_early_printk(const char *s);
-void sclp_early_printk_force(const char *s);
-void __sclp_early_printk(const char *s, unsigned int len, unsigned int force);
+void __sclp_early_printk(const char *s, unsigned int len);
+void sclp_emergency_printk(const char *s);
int sclp_early_get_memsize(unsigned long *mem);
int sclp_early_get_hsa_size(unsigned long *hsa_size);
@@ -129,9 +147,10 @@ int sclp_chp_deconfigure(struct chp_id chpid);
int sclp_chp_read_info(struct sclp_chp_info *info);
int sclp_pci_configure(u32 fid);
int sclp_pci_deconfigure(u32 fid);
+int sclp_ap_configure(u32 apid);
+int sclp_ap_deconfigure(u32 apid);
int sclp_pci_report(struct zpci_report_error_header *report, u32 fh, u32 fid);
-int memcpy_hsa_kernel(void *dest, unsigned long src, size_t count);
-int memcpy_hsa_user(void __user *dest, unsigned long src, size_t count);
+size_t memcpy_hsa_iter(struct iov_iter *iter, unsigned long src, size_t count);
void sclp_ocf_cpc_name_copy(char *dst);
static inline int sclp_get_core_info(struct sclp_core_info *info, int early)
@@ -141,4 +160,5 @@ static inline int sclp_get_core_info(struct sclp_core_info *info, int early)
return _sclp_get_core_info(info);
}
+#endif /* __ASSEMBLY__ */
#endif /* _ASM_S390_SCLP_H */
diff --git a/arch/s390/include/asm/scsw.h b/arch/s390/include/asm/scsw.h
index c00f7b031628..322bdcd4b616 100644
--- a/arch/s390/include/asm/scsw.h
+++ b/arch/s390/include/asm/scsw.h
@@ -215,6 +215,11 @@ union scsw {
#define SNS2_ENV_DATA_PRESENT 0x10
#define SNS2_INPRECISE_END 0x04
+/*
+ * architectured values for PPRC errors
+ */
+#define SNS7_INVALID_ON_SEC 0x0e
+
/**
* scsw_is_tm - check for transport mode scsw
* @scsw: pointer to scsw
@@ -508,9 +513,21 @@ static inline int scsw_cmd_is_valid_zcc(union scsw *scsw)
*/
static inline int scsw_cmd_is_valid_ectl(union scsw *scsw)
{
- return (scsw->cmd.stctl & SCSW_STCTL_STATUS_PEND) &&
- !(scsw->cmd.stctl & SCSW_STCTL_INTER_STATUS) &&
- (scsw->cmd.stctl & SCSW_STCTL_ALERT_STATUS);
+ /* Must be status pending. */
+ if (!(scsw->cmd.stctl & SCSW_STCTL_STATUS_PEND))
+ return 0;
+
+ /* Must have alert status. */
+ if (!(scsw->cmd.stctl & SCSW_STCTL_ALERT_STATUS))
+ return 0;
+
+ /* Must be alone or together with primary, secondary or both,
+ * => no intermediate status.
+ */
+ if (scsw->cmd.stctl & SCSW_STCTL_INTER_STATUS)
+ return 0;
+
+ return 1;
}
/**
@@ -522,11 +539,25 @@ static inline int scsw_cmd_is_valid_ectl(union scsw *scsw)
*/
static inline int scsw_cmd_is_valid_pno(union scsw *scsw)
{
- return (scsw->cmd.fctl != 0) &&
- (scsw->cmd.stctl & SCSW_STCTL_STATUS_PEND) &&
- (!(scsw->cmd.stctl & SCSW_STCTL_INTER_STATUS) ||
- ((scsw->cmd.stctl & SCSW_STCTL_INTER_STATUS) &&
- (scsw->cmd.actl & SCSW_ACTL_SUSPENDED)));
+ /* Must indicate at least one I/O function. */
+ if (!scsw->cmd.fctl)
+ return 0;
+
+ /* Must be status pending. */
+ if (!(scsw->cmd.stctl & SCSW_STCTL_STATUS_PEND))
+ return 0;
+
+ /* Can be status pending alone, or with any combination of primary,
+ * secondary and alert => no intermediate status.
+ */
+ if (!(scsw->cmd.stctl & SCSW_STCTL_INTER_STATUS))
+ return 1;
+
+ /* If intermediate, must be suspended. */
+ if (scsw->cmd.actl & SCSW_ACTL_SUSPENDED)
+ return 1;
+
+ return 0;
}
/**
@@ -676,9 +707,21 @@ static inline int scsw_tm_is_valid_q(union scsw *scsw)
*/
static inline int scsw_tm_is_valid_ectl(union scsw *scsw)
{
- return (scsw->tm.stctl & SCSW_STCTL_STATUS_PEND) &&
- !(scsw->tm.stctl & SCSW_STCTL_INTER_STATUS) &&
- (scsw->tm.stctl & SCSW_STCTL_ALERT_STATUS);
+ /* Must be status pending. */
+ if (!(scsw->tm.stctl & SCSW_STCTL_STATUS_PEND))
+ return 0;
+
+ /* Must have alert status. */
+ if (!(scsw->tm.stctl & SCSW_STCTL_ALERT_STATUS))
+ return 0;
+
+ /* Must be alone or together with primary, secondary or both,
+ * => no intermediate status.
+ */
+ if (scsw->tm.stctl & SCSW_STCTL_INTER_STATUS)
+ return 0;
+
+ return 1;
}
/**
@@ -690,11 +733,25 @@ static inline int scsw_tm_is_valid_ectl(union scsw *scsw)
*/
static inline int scsw_tm_is_valid_pno(union scsw *scsw)
{
- return (scsw->tm.fctl != 0) &&
- (scsw->tm.stctl & SCSW_STCTL_STATUS_PEND) &&
- (!(scsw->tm.stctl & SCSW_STCTL_INTER_STATUS) ||
- ((scsw->tm.stctl & SCSW_STCTL_INTER_STATUS) &&
- (scsw->tm.actl & SCSW_ACTL_SUSPENDED)));
+ /* Must indicate at least one I/O function. */
+ if (!scsw->tm.fctl)
+ return 0;
+
+ /* Must be status pending. */
+ if (!(scsw->tm.stctl & SCSW_STCTL_STATUS_PEND))
+ return 0;
+
+ /* Can be status pending alone, or with any combination of primary,
+ * secondary and alert => no intermediate status.
+ */
+ if (!(scsw->tm.stctl & SCSW_STCTL_INTER_STATUS))
+ return 1;
+
+ /* If intermediate, must be suspended. */
+ if (scsw->tm.actl & SCSW_ACTL_SUSPENDED)
+ return 1;
+
+ return 0;
}
/**
diff --git a/arch/s390/include/asm/seccomp.h b/arch/s390/include/asm/seccomp.h
index 795bbe0d7ca6..71d46f0ba97b 100644
--- a/arch/s390/include/asm/seccomp.h
+++ b/arch/s390/include/asm/seccomp.h
@@ -16,4 +16,13 @@
#include <asm-generic/seccomp.h>
+#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_S390X
+#define SECCOMP_ARCH_NATIVE_NR NR_syscalls
+#define SECCOMP_ARCH_NATIVE_NAME "s390x"
+#ifdef CONFIG_COMPAT
+# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_S390
+# define SECCOMP_ARCH_COMPAT_NR NR_syscalls
+# define SECCOMP_ARCH_COMPAT_NAME "s390"
+#endif
+
#endif /* _ASM_S390_SECCOMP_H */
diff --git a/arch/s390/include/asm/sections.h b/arch/s390/include/asm/sections.h
index 42de04ad9c07..0486e6ef62bf 100644
--- a/arch/s390/include/asm/sections.h
+++ b/arch/s390/include/asm/sections.h
@@ -2,20 +2,8 @@
#ifndef _S390_SECTIONS_H
#define _S390_SECTIONS_H
-#define arch_is_kernel_initmem_freed arch_is_kernel_initmem_freed
-
#include <asm-generic/sections.h>
-extern bool initmem_freed;
-
-static inline int arch_is_kernel_initmem_freed(unsigned long addr)
-{
- if (!initmem_freed)
- return 0;
- return addr >= (unsigned long)__init_begin &&
- addr < (unsigned long)__init_end;
-}
-
/*
* .boot.data section contains variables "shared" between the decompressor and
* the decompressed kernel. The decompressor will store values in them, and
@@ -26,16 +14,16 @@ static inline int arch_is_kernel_initmem_freed(unsigned long addr)
* final .boot.data section, which should be identical in the decompressor and
* the decompressed kernel (that is checked during the build).
*/
-#define __bootdata(var) __section(.boot.data.var) var
+#define __bootdata(var) __section(".boot.data." #var) var
/*
* .boot.preserved.data is similar to .boot.data, but it is not part of the
* .init section and thus will be preserved for later use in the decompressed
* kernel.
*/
-#define __bootdata_preserved(var) __section(.boot.preserved.data.var) var
+#define __bootdata_preserved(var) __section(".boot.preserved.data." #var) var
-extern unsigned long __sdma, __edma;
-extern unsigned long __stext_dma, __etext_dma;
+extern char *__samode31, *__eamode31;
+extern char *__stext_amode31, *__etext_amode31;
#endif
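A brief usage sketch for the __bootdata()/__bootdata_preserved() markers explained above; the variable names are illustrative and not taken from this patch.

/* Illustrative only: variables the decompressor fills in and the
 * decompressed kernel reads. */
unsigned long __bootdata(example_initrd_size);		/* part of .init, freed later */
unsigned long __bootdata_preserved(example_kaslr_off);	/* kept after initmem is freed */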
diff --git a/arch/s390/include/asm/serial.h b/arch/s390/include/asm/serial.h
deleted file mode 100644
index aaf85a69061c..000000000000
--- a/arch/s390/include/asm/serial.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_S390_SERIAL_H
-#define _ASM_S390_SERIAL_H
-
-#define BASE_BAUD 0
-
-#endif /* _ASM_S390_SERIAL_H */
diff --git a/arch/s390/include/asm/set_memory.h b/arch/s390/include/asm/set_memory.h
index c59a83536c70..06fbabe2f66c 100644
--- a/arch/s390/include/asm/set_memory.h
+++ b/arch/s390/include/asm/set_memory.h
@@ -2,31 +2,65 @@
#ifndef _ASMS390_SET_MEMORY_H
#define _ASMS390_SET_MEMORY_H
-#define SET_MEMORY_RO 1UL
-#define SET_MEMORY_RW 2UL
-#define SET_MEMORY_NX 4UL
-#define SET_MEMORY_X 8UL
+#include <linux/mutex.h>
-int __set_memory(unsigned long addr, int numpages, unsigned long flags);
+extern struct mutex cpa_mutex;
-static inline int set_memory_ro(unsigned long addr, int numpages)
-{
- return __set_memory(addr, numpages, SET_MEMORY_RO);
-}
+enum {
+ _SET_MEMORY_RO_BIT,
+ _SET_MEMORY_RW_BIT,
+ _SET_MEMORY_NX_BIT,
+ _SET_MEMORY_X_BIT,
+ _SET_MEMORY_4K_BIT,
+ _SET_MEMORY_INV_BIT,
+ _SET_MEMORY_DEF_BIT,
+};
-static inline int set_memory_rw(unsigned long addr, int numpages)
-{
- return __set_memory(addr, numpages, SET_MEMORY_RW);
-}
+#define SET_MEMORY_RO BIT(_SET_MEMORY_RO_BIT)
+#define SET_MEMORY_RW BIT(_SET_MEMORY_RW_BIT)
+#define SET_MEMORY_NX BIT(_SET_MEMORY_NX_BIT)
+#define SET_MEMORY_X BIT(_SET_MEMORY_X_BIT)
+#define SET_MEMORY_4K BIT(_SET_MEMORY_4K_BIT)
+#define SET_MEMORY_INV BIT(_SET_MEMORY_INV_BIT)
+#define SET_MEMORY_DEF BIT(_SET_MEMORY_DEF_BIT)
-static inline int set_memory_nx(unsigned long addr, int numpages)
-{
- return __set_memory(addr, numpages, SET_MEMORY_NX);
-}
+int __set_memory(unsigned long addr, unsigned long numpages, unsigned long flags);
-static inline int set_memory_x(unsigned long addr, int numpages)
-{
- return __set_memory(addr, numpages, SET_MEMORY_X);
+#define set_memory_rox set_memory_rox
+
+/*
+ * Generate two variants of each set_memory() function:
+ *
+ * set_memory_yy(unsigned long addr, int numpages);
+ * __set_memory_yy(void *start, void *end);
+ *
+ * The second variant exists both for convenience, to avoid the usual
+ * (unsigned long) casts, and because, unlike the first variant, it can
+ * also be used for areas larger than 8TB, which may occur during memory
+ * initialization.
+ */
+#define __SET_MEMORY_FUNC(fname, flags) \
+static inline int fname(unsigned long addr, int numpages) \
+{ \
+ return __set_memory(addr, numpages, (flags)); \
+} \
+ \
+static inline int __##fname(void *start, void *end) \
+{ \
+ unsigned long numpages; \
+ \
+ numpages = (end - start) >> PAGE_SHIFT; \
+ return __set_memory((unsigned long)start, numpages, (flags)); \
}
+__SET_MEMORY_FUNC(set_memory_ro, SET_MEMORY_RO)
+__SET_MEMORY_FUNC(set_memory_rw, SET_MEMORY_RW)
+__SET_MEMORY_FUNC(set_memory_nx, SET_MEMORY_NX)
+__SET_MEMORY_FUNC(set_memory_x, SET_MEMORY_X)
+__SET_MEMORY_FUNC(set_memory_rox, SET_MEMORY_RO | SET_MEMORY_X)
+__SET_MEMORY_FUNC(set_memory_rwnx, SET_MEMORY_RW | SET_MEMORY_NX)
+__SET_MEMORY_FUNC(set_memory_4k, SET_MEMORY_4K)
+
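A small sketch using the generated helpers above; the region is hypothetical and assumed to be page aligned, and the combination of calls only demonstrates the two variants side by side.

/* Illustrative only: make a hypothetical region read-only and executable,
 * then split its first page to 4K mappings. */
static int example_protect_region(void *start, void *end)
{
	int rc;

	/* pointer-based variant, convenient for start/end pairs */
	rc = __set_memory_rox(start, end);
	if (rc)
		return rc;
	/* address/numpages variant, here for a single page */
	return set_memory_4k((unsigned long)start, 1);
}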
+int set_direct_map_invalid_noflush(struct page *page);
+int set_direct_map_default_noflush(struct page *page);
+
#endif
diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h
index 6dc6c4fbc8e2..03bcaa8effb2 100644
--- a/arch/s390/include/asm/setup.h
+++ b/arch/s390/include/asm/setup.h
@@ -8,15 +8,11 @@
#include <linux/bits.h>
#include <uapi/asm/setup.h>
+#include <linux/build_bug.h>
-#define EP_OFFSET 0x10008
-#define EP_STRING "S390EP"
#define PARMAREA 0x10400
-#define EARLY_SCCB_OFFSET 0x11000
-#define HEAD_END 0x12000
-
-#define EARLY_SCCB_SIZE PAGE_SIZE
+#define COMMAND_LINE_SIZE CONFIG_COMMAND_LINE_SIZE
/*
* Machine features detected in early.c
*/
@@ -27,17 +23,17 @@
#define MACHINE_FLAG_DIAG9C BIT(3)
#define MACHINE_FLAG_ESOP BIT(4)
#define MACHINE_FLAG_IDTE BIT(5)
-#define MACHINE_FLAG_DIAG44 BIT(6)
#define MACHINE_FLAG_EDAT1 BIT(7)
#define MACHINE_FLAG_EDAT2 BIT(8)
#define MACHINE_FLAG_TOPOLOGY BIT(10)
#define MACHINE_FLAG_TE BIT(11)
#define MACHINE_FLAG_TLB_LC BIT(12)
-#define MACHINE_FLAG_VX BIT(13)
#define MACHINE_FLAG_TLB_GUEST BIT(14)
#define MACHINE_FLAG_NX BIT(15)
#define MACHINE_FLAG_GS BIT(16)
#define MACHINE_FLAG_SCC BIT(17)
+#define MACHINE_FLAG_PCI_MIO BIT(18)
+#define MACHINE_FLAG_RDP BIT(19)
#define LPP_MAGIC BIT(31)
#define LPP_PID_MASK _AC(0xffffffff, UL)
@@ -47,28 +43,13 @@
#define STARTUP_NORMAL_OFFSET 0x10000
#define STARTUP_KDUMP_OFFSET 0x10010
-/* Offsets to parameters in kernel/head.S */
-
-#define IPL_DEVICE_OFFSET 0x10400
-#define INITRD_START_OFFSET 0x10408
-#define INITRD_SIZE_OFFSET 0x10410
-#define OLDMEM_BASE_OFFSET 0x10418
-#define OLDMEM_SIZE_OFFSET 0x10420
-#define KERNEL_VERSION_OFFSET 0x10428
-#define COMMAND_LINE_OFFSET 0x10480
+#define LEGACY_COMMAND_LINE_SIZE 896
#ifndef __ASSEMBLY__
#include <asm/lowcore.h>
#include <asm/types.h>
-#define IPL_DEVICE (*(unsigned long *) (IPL_DEVICE_OFFSET))
-#define INITRD_START (*(unsigned long *) (INITRD_START_OFFSET))
-#define INITRD_SIZE (*(unsigned long *) (INITRD_SIZE_OFFSET))
-#define OLDMEM_BASE (*(unsigned long *) (OLDMEM_BASE_OFFSET))
-#define OLDMEM_SIZE (*(unsigned long *) (OLDMEM_SIZE_OFFSET))
-#define COMMAND_LINE ((char *) (COMMAND_LINE_OFFSET))
-
struct parmarea {
unsigned long ipl_device; /* 0x10400 */
unsigned long initrd_start; /* 0x10408 */
@@ -76,16 +57,25 @@ struct parmarea {
unsigned long oldmem_base; /* 0x10418 */
unsigned long oldmem_size; /* 0x10420 */
unsigned long kernel_version; /* 0x10428 */
- char pad1[0x10480 - 0x10430]; /* 0x10430 - 0x10480 */
- char command_line[ARCH_COMMAND_LINE_SIZE]; /* 0x10480 */
+ unsigned long max_command_line_size; /* 0x10430 */
+ char pad1[0x10480-0x10438]; /* 0x10438 - 0x10480 */
+ char command_line[COMMAND_LINE_SIZE]; /* 0x10480 */
};
-extern int noexec_disabled;
-extern int memory_end_set;
-extern unsigned long memory_end;
-extern unsigned long vmalloc_size;
-extern unsigned long max_physmem_end;
-extern unsigned long __swsusp_reset_dma;
+extern struct parmarea parmarea;
+
+extern unsigned int zlib_dfltcc_support;
+#define ZLIB_DFLTCC_DISABLED 0
+#define ZLIB_DFLTCC_FULL 1
+#define ZLIB_DFLTCC_DEFLATE_ONLY 2
+#define ZLIB_DFLTCC_INFLATE_ONLY 3
+#define ZLIB_DFLTCC_FULL_DEBUG 4
+
+extern unsigned long ident_map_size;
+extern unsigned long max_mappable;
+
+/* The Write Back bit position in the physaddr is given by the SLPC PCI */
+extern unsigned long mio_wb_bit_mask;
#define MACHINE_IS_VM (S390_lowcore.machine_flags & MACHINE_FLAG_VM)
#define MACHINE_IS_KVM (S390_lowcore.machine_flags & MACHINE_FLAG_KVM)
@@ -94,17 +84,17 @@ extern unsigned long __swsusp_reset_dma;
#define MACHINE_HAS_DIAG9C (S390_lowcore.machine_flags & MACHINE_FLAG_DIAG9C)
#define MACHINE_HAS_ESOP (S390_lowcore.machine_flags & MACHINE_FLAG_ESOP)
#define MACHINE_HAS_IDTE (S390_lowcore.machine_flags & MACHINE_FLAG_IDTE)
-#define MACHINE_HAS_DIAG44 (S390_lowcore.machine_flags & MACHINE_FLAG_DIAG44)
#define MACHINE_HAS_EDAT1 (S390_lowcore.machine_flags & MACHINE_FLAG_EDAT1)
#define MACHINE_HAS_EDAT2 (S390_lowcore.machine_flags & MACHINE_FLAG_EDAT2)
#define MACHINE_HAS_TOPOLOGY (S390_lowcore.machine_flags & MACHINE_FLAG_TOPOLOGY)
#define MACHINE_HAS_TE (S390_lowcore.machine_flags & MACHINE_FLAG_TE)
#define MACHINE_HAS_TLB_LC (S390_lowcore.machine_flags & MACHINE_FLAG_TLB_LC)
-#define MACHINE_HAS_VX (S390_lowcore.machine_flags & MACHINE_FLAG_VX)
#define MACHINE_HAS_TLB_GUEST (S390_lowcore.machine_flags & MACHINE_FLAG_TLB_GUEST)
#define MACHINE_HAS_NX (S390_lowcore.machine_flags & MACHINE_FLAG_NX)
#define MACHINE_HAS_GS (S390_lowcore.machine_flags & MACHINE_FLAG_GS)
#define MACHINE_HAS_SCC (S390_lowcore.machine_flags & MACHINE_FLAG_SCC)
+#define MACHINE_HAS_PCI_MIO (S390_lowcore.machine_flags & MACHINE_FLAG_PCI_MIO)
+#define MACHINE_HAS_RDP (S390_lowcore.machine_flags & MACHINE_FLAG_RDP)
/*
* Console mode. Override with conmode=
@@ -113,9 +103,6 @@ extern unsigned int console_mode;
extern unsigned int console_devno;
extern unsigned int console_irq;
-extern char vmhalt_cmd[];
-extern char vmpoff_cmd[];
-
#define CONSOLE_IS_UNDEFINED (console_mode == 0)
#define CONSOLE_IS_SCLP (console_mode == 1)
#define CONSOLE_IS_3215 (console_mode == 2)
@@ -128,14 +115,6 @@ extern char vmpoff_cmd[];
#define SET_CONSOLE_VT220 do { console_mode = 4; } while (0)
#define SET_CONSOLE_HVC do { console_mode = 5; } while (0)
-#ifdef CONFIG_PFAULT
-extern int pfault_init(void);
-extern void pfault_fini(void);
-#else /* CONFIG_PFAULT */
-#define pfault_init() ({-1;})
-#define pfault_fini() do { } while (0)
-#endif /* CONFIG_PFAULT */
-
#ifdef CONFIG_VMCP
void vmcp_cma_reserve(void);
#else
@@ -144,9 +123,6 @@ static inline void vmcp_cma_reserve(void) { }
void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault);
-void cmma_init(void);
-void cmma_init_nodat(void);
-
extern void (*_machine_restart)(char *command);
extern void (*_machine_halt)(void);
extern void (*_machine_power_off)(void);
@@ -157,14 +133,24 @@ static inline unsigned long kaslr_offset(void)
return __kaslr_offset;
}
-#else /* __ASSEMBLY__ */
+extern int __kaslr_enabled;
+static inline int kaslr_enabled(void)
+{
+ if (IS_ENABLED(CONFIG_RANDOMIZE_BASE))
+ return __kaslr_enabled;
+ return 0;
+}
-#define IPL_DEVICE (IPL_DEVICE_OFFSET)
-#define INITRD_START (INITRD_START_OFFSET)
-#define INITRD_SIZE (INITRD_SIZE_OFFSET)
-#define OLDMEM_BASE (OLDMEM_BASE_OFFSET)
-#define OLDMEM_SIZE (OLDMEM_SIZE_OFFSET)
-#define COMMAND_LINE (COMMAND_LINE_OFFSET)
+struct oldmem_data {
+ unsigned long start;
+ unsigned long size;
+};
+extern struct oldmem_data oldmem_data;
+static __always_inline u32 gen_lpswe(unsigned long addr)
+{
+ BUILD_BUG_ON(addr > 0xfff);
+ return 0xb2b20000 | addr;
+}
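For illustration, the helper above simply folds a 12-bit lowcore displacement into the LPSWE opcode; the offset used below is a placeholder, not a value taken from this patch.

/* Illustrative only: 0x200 stands in for a lowcore PSW offset below 0x1000.
 * gen_lpswe(0x200) yields 0xb2b20200, i.e. LPSWE with displacement 0x200
 * and base register 0. */
static inline u32 example_lpswe_insn(void)
{
	return gen_lpswe(0x200);
}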
#endif /* __ASSEMBLY__ */
#endif /* _ASM_S390_SETUP_H */
diff --git a/arch/s390/include/asm/shmparam.h b/arch/s390/include/asm/shmparam.h
deleted file mode 100644
index e75d45649c54..000000000000
--- a/arch/s390/include/asm/shmparam.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * S390 version
- *
- * Derived from "include/asm-i386/shmparam.h"
- */
-#ifndef _ASM_S390_SHMPARAM_H
-#define _ASM_S390_SHMPARAM_H
-
-#define SHMLBA PAGE_SIZE /* attach addr a multiple of this */
-
-#endif /* _ASM_S390_SHMPARAM_H */
diff --git a/arch/s390/include/asm/sigp.h b/arch/s390/include/asm/sigp.h
index 53ee795cd3d3..edee63da08e7 100644
--- a/arch/s390/include/asm/sigp.h
+++ b/arch/s390/include/asm/sigp.h
@@ -41,15 +41,17 @@
static inline int ____pcpu_sigp(u16 addr, u8 order, unsigned long parm,
u32 *status)
{
- register unsigned long reg1 asm ("1") = parm;
+ union register_pair r1 = { .odd = parm, };
int cc;
asm volatile(
- " sigp %1,%2,0(%3)\n"
- " ipm %0\n"
- " srl %0,28\n"
- : "=d" (cc), "+d" (reg1) : "d" (addr), "a" (order) : "cc");
- *status = reg1;
+ " sigp %[r1],%[addr],0(%[order])\n"
+ " ipm %[cc]\n"
+ " srl %[cc],28\n"
+ : [cc] "=&d" (cc), [r1] "+&d" (r1.pair)
+ : [addr] "d" (addr), [order] "a" (order)
+ : "cc");
+ *status = r1.even;
return cc;
}
diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h
index b157a81fb977..6e5b1b4b19a9 100644
--- a/arch/s390/include/asm/smp.h
+++ b/arch/s390/include/asm/smp.h
@@ -3,13 +3,13 @@
* Copyright IBM Corp. 1999, 2012
* Author(s): Denis Joseph Barrow,
* Martin Schwidefsky <schwidefsky@de.ibm.com>,
- * Heiko Carstens <heiko.carstens@de.ibm.com>,
*/
#ifndef __ASM_SMP_H
#define __ASM_SMP_H
#include <asm/sigp.h>
#include <asm/lowcore.h>
+#include <asm/processor.h>
#define raw_smp_processor_id() (S390_lowcore.cpu_nr)
@@ -17,6 +17,7 @@ extern struct mutex smp_cpu_state_mutex;
extern unsigned int smp_cpu_mt_shift;
extern unsigned int smp_cpu_mtid;
extern __vector128 __initdata boot_cpu_vector_save_area[__NUM_VXRS];
+extern cpumask_t cpu_setup_mask;
extern int __cpu_up(unsigned int cpu, struct task_struct *tidle);
@@ -29,11 +30,12 @@ extern void smp_emergency_stop(void);
extern int smp_find_processor_id(u16 address);
extern int smp_store_status(int cpu);
-extern void smp_save_dump_cpus(void);
-extern int smp_vcpu_scheduled(int cpu);
+extern void smp_save_dump_ipl_cpu(void);
+extern void smp_save_dump_secondary_cpus(void);
extern void smp_yield_cpu(int cpu);
extern void smp_cpu_set_polarization(int cpu, int val);
extern int smp_cpu_get_polarization(int cpu);
+extern int smp_cpu_get_cpu_address(int cpu);
extern void smp_fill_possible_mask(void);
extern void smp_detect_cpus(void);
@@ -53,9 +55,15 @@ static inline int smp_get_base_cpu(int cpu)
return cpu - (cpu % (smp_cpu_mtid + 1));
}
+static inline void smp_cpus_done(unsigned int max_cpus)
+{
+}
+
extern int smp_rescan_cpus(void);
extern void __noreturn cpu_die(void);
extern void __cpu_die(unsigned int cpu);
extern int __cpu_disable(void);
+extern void schedule_mcck_handler(void);
+void notrace smp_yield_cpu(int cpu);
#endif /* __ASM_SMP_H */
diff --git a/arch/s390/include/asm/softirq_stack.h b/arch/s390/include/asm/softirq_stack.h
new file mode 100644
index 000000000000..1ac5115d3115
--- /dev/null
+++ b/arch/s390/include/asm/softirq_stack.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef __ASM_S390_SOFTIRQ_STACK_H
+#define __ASM_S390_SOFTIRQ_STACK_H
+
+#include <asm/lowcore.h>
+#include <asm/stacktrace.h>
+
+#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK
+static inline void do_softirq_own_stack(void)
+{
+ call_on_stack(0, S390_lowcore.async_stack, void, __do_softirq);
+}
+#endif
+#endif /* __ASM_S390_SOFTIRQ_STACK_H */
diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h
index 3a37172d5398..37127cd7749e 100644
--- a/arch/s390/include/asm/spinlock.h
+++ b/arch/s390/include/asm/spinlock.h
@@ -67,14 +67,6 @@ static inline void arch_spin_lock(arch_spinlock_t *lp)
arch_spin_lock_wait(lp);
}
-static inline void arch_spin_lock_flags(arch_spinlock_t *lp,
- unsigned long flags)
-{
- if (!arch_spin_trylock_once(lp))
- arch_spin_lock_wait(lp);
-}
-#define arch_spin_lock_flags arch_spin_lock_flags
-
static inline int arch_spin_trylock(arch_spinlock_t *lp)
{
if (!arch_spin_trylock_once(lp))
@@ -85,10 +77,11 @@ static inline int arch_spin_trylock(arch_spinlock_t *lp)
static inline void arch_spin_unlock(arch_spinlock_t *lp)
{
typecheck(int, lp->lock);
+ kcsan_release();
asm_inline volatile(
- ALTERNATIVE("", ".long 0xb2fa0070", 49) /* NIAI 7 */
+ ALTERNATIVE("nop", ".insn rre,0xb2fa0000,7,0", 49) /* NIAI 7 */
" sth %1,%0\n"
- : "=Q" (((unsigned short *) &lp->lock)[1])
+ : "=R" (((unsigned short *) &lp->lock)[1])
: "d" (0) : "cc", "memory");
}
diff --git a/arch/s390/include/asm/spinlock_types.h b/arch/s390/include/asm/spinlock_types.h
index cfed272e4fd5..b69695e39957 100644
--- a/arch/s390/include/asm/spinlock_types.h
+++ b/arch/s390/include/asm/spinlock_types.h
@@ -2,13 +2,13 @@
#ifndef __ASM_SPINLOCK_TYPES_H
#define __ASM_SPINLOCK_TYPES_H
-#ifndef __LINUX_SPINLOCK_TYPES_H
+#ifndef __LINUX_SPINLOCK_TYPES_RAW_H
# error "please don't include this file directly"
#endif
typedef struct {
int lock;
-} __attribute__ ((aligned (4))) arch_spinlock_t;
+} arch_spinlock_t;
#define __ARCH_SPIN_LOCK_UNLOCKED { .lock = 0, }
diff --git a/arch/s390/include/asm/stacktrace.h b/arch/s390/include/asm/stacktrace.h
index ee056f4a4fa3..31ec4f545e03 100644
--- a/arch/s390/include/asm/stacktrace.h
+++ b/arch/s390/include/asm/stacktrace.h
@@ -6,12 +6,20 @@
#include <linux/ptrace.h>
#include <asm/switch_to.h>
+struct stack_frame_user {
+ unsigned long back_chain;
+ unsigned long empty1[5];
+ unsigned long gprs[10];
+ unsigned long empty2[4];
+};
+
enum stack_type {
STACK_TYPE_UNKNOWN,
STACK_TYPE_TASK,
STACK_TYPE_IRQ,
STACK_TYPE_NODAT,
STACK_TYPE_RESTART,
+ STACK_TYPE_MCCK,
};
struct stack_info {
@@ -33,37 +41,27 @@ static inline bool on_stack(struct stack_info *info,
return addr >= info->begin && addr + len <= info->end;
}
-static __always_inline unsigned long get_stack_pointer(struct task_struct *task,
- struct pt_regs *regs)
-{
- if (regs)
- return (unsigned long) kernel_stack_pointer(regs);
- if (task == current)
- return current_stack_pointer();
- return (unsigned long) task->thread.ksp;
-}
-
/*
* Stack layout of a C stack frame.
+ * Kernel uses the packed stack layout (-mpacked-stack).
*/
-#ifndef __PACK_STACK
struct stack_frame {
- unsigned long back_chain;
- unsigned long empty1[5];
- unsigned long gprs[10];
- unsigned int empty2[8];
-};
-#else
-struct stack_frame {
- unsigned long empty1[5];
- unsigned int empty2[8];
+ union {
+ unsigned long empty[9];
+ struct {
+ unsigned long sie_control_block;
+ unsigned long sie_savearea;
+ unsigned long sie_reason;
+ unsigned long sie_flags;
+ unsigned long sie_control_block_phys;
+ };
+ };
unsigned long gprs[10];
unsigned long back_chain;
};
-#endif
/*
- * Unlike current_stack_pointer() which simply returns current value of %r15
+ * Unlike current_stack_pointer which simply contains the current value of %r15
* current_frame_address() returns function stack frame address, which matches
* %r15 upon function invocation. It may differ from %r15 later if function
* allocates stack for local variables or new stack frame to call other
@@ -73,29 +71,26 @@ struct stack_frame {
((unsigned long)__builtin_frame_address(0) - \
offsetof(struct stack_frame, back_chain))
-#define CALL_ARGS_0() \
- register unsigned long r2 asm("2")
-#define CALL_ARGS_1(arg1) \
- register unsigned long r2 asm("2") = (unsigned long)(arg1)
-#define CALL_ARGS_2(arg1, arg2) \
- CALL_ARGS_1(arg1); \
- register unsigned long r3 asm("3") = (unsigned long)(arg2)
-#define CALL_ARGS_3(arg1, arg2, arg3) \
- CALL_ARGS_2(arg1, arg2); \
- register unsigned long r4 asm("4") = (unsigned long)(arg3)
-#define CALL_ARGS_4(arg1, arg2, arg3, arg4) \
- CALL_ARGS_3(arg1, arg2, arg3); \
- register unsigned long r4 asm("5") = (unsigned long)(arg4)
-#define CALL_ARGS_5(arg1, arg2, arg3, arg4, arg5) \
- CALL_ARGS_4(arg1, arg2, arg3, arg4); \
- register unsigned long r4 asm("6") = (unsigned long)(arg5)
-
-#define CALL_FMT_0 "=&d" (r2) :
-#define CALL_FMT_1 "+&d" (r2) :
-#define CALL_FMT_2 CALL_FMT_1 "d" (r3),
-#define CALL_FMT_3 CALL_FMT_2 "d" (r4),
-#define CALL_FMT_4 CALL_FMT_3 "d" (r5),
-#define CALL_FMT_5 CALL_FMT_4 "d" (r6),
+static __always_inline unsigned long get_stack_pointer(struct task_struct *task,
+ struct pt_regs *regs)
+{
+ if (regs)
+ return (unsigned long)kernel_stack_pointer(regs);
+ if (task == current)
+ return current_frame_address();
+ return (unsigned long)task->thread.ksp;
+}
+
+/*
+ * To keep this simple, mark registers 2-6 as being changed (volatile)
+ * by the called function, even though register 6 is saved/nonvolatile.
+ */
+#define CALL_FMT_0 "=&d" (r2)
+#define CALL_FMT_1 "+&d" (r2)
+#define CALL_FMT_2 CALL_FMT_1, "+&d" (r3)
+#define CALL_FMT_3 CALL_FMT_2, "+&d" (r4)
+#define CALL_FMT_4 CALL_FMT_3, "+&d" (r5)
+#define CALL_FMT_5 CALL_FMT_4, "+&d" (r6)
#define CALL_CLOBBER_5 "0", "1", "14", "cc", "memory"
#define CALL_CLOBBER_4 CALL_CLOBBER_5
@@ -104,35 +99,150 @@ struct stack_frame {
#define CALL_CLOBBER_1 CALL_CLOBBER_2, "3"
#define CALL_CLOBBER_0 CALL_CLOBBER_1
-#define CALL_ON_STACK(fn, stack, nr, args...) \
+#define CALL_LARGS_0(...) \
+ long dummy = 0
+#define CALL_LARGS_1(t1, a1) \
+ long arg1 = (long)(t1)(a1)
+#define CALL_LARGS_2(t1, a1, t2, a2) \
+ CALL_LARGS_1(t1, a1); \
+ long arg2 = (long)(t2)(a2)
+#define CALL_LARGS_3(t1, a1, t2, a2, t3, a3) \
+ CALL_LARGS_2(t1, a1, t2, a2); \
+ long arg3 = (long)(t3)(a3)
+#define CALL_LARGS_4(t1, a1, t2, a2, t3, a3, t4, a4) \
+ CALL_LARGS_3(t1, a1, t2, a2, t3, a3); \
+ long arg4 = (long)(t4)(a4)
+#define CALL_LARGS_5(t1, a1, t2, a2, t3, a3, t4, a4, t5, a5) \
+ CALL_LARGS_4(t1, a1, t2, a2, t3, a3, t4, a4); \
+ long arg5 = (long)(t5)(a5)
+
+#define CALL_REGS_0 \
+ register long r2 asm("2") = dummy
+#define CALL_REGS_1 \
+ register long r2 asm("2") = arg1
+#define CALL_REGS_2 \
+ CALL_REGS_1; \
+ register long r3 asm("3") = arg2
+#define CALL_REGS_3 \
+ CALL_REGS_2; \
+ register long r4 asm("4") = arg3
+#define CALL_REGS_4 \
+ CALL_REGS_3; \
+ register long r5 asm("5") = arg4
+#define CALL_REGS_5 \
+ CALL_REGS_4; \
+ register long r6 asm("6") = arg5
+
+#define CALL_TYPECHECK_0(...)
+#define CALL_TYPECHECK_1(t, a, ...) \
+ typecheck(t, a)
+#define CALL_TYPECHECK_2(t, a, ...) \
+ CALL_TYPECHECK_1(__VA_ARGS__); \
+ typecheck(t, a)
+#define CALL_TYPECHECK_3(t, a, ...) \
+ CALL_TYPECHECK_2(__VA_ARGS__); \
+ typecheck(t, a)
+#define CALL_TYPECHECK_4(t, a, ...) \
+ CALL_TYPECHECK_3(__VA_ARGS__); \
+ typecheck(t, a)
+#define CALL_TYPECHECK_5(t, a, ...) \
+ CALL_TYPECHECK_4(__VA_ARGS__); \
+ typecheck(t, a)
+
+#define CALL_PARM_0(...) void
+#define CALL_PARM_1(t, a, ...) t
+#define CALL_PARM_2(t, a, ...) t, CALL_PARM_1(__VA_ARGS__)
+#define CALL_PARM_3(t, a, ...) t, CALL_PARM_2(__VA_ARGS__)
+#define CALL_PARM_4(t, a, ...) t, CALL_PARM_3(__VA_ARGS__)
+#define CALL_PARM_5(t, a, ...) t, CALL_PARM_4(__VA_ARGS__)
+#define CALL_PARM_6(t, a, ...) t, CALL_PARM_5(__VA_ARGS__)
+
+/*
+ * Use call_on_stack() to call a function switching to a specified
+ * stack. Proper sign and zero extension of function arguments is
+ * done. Usage:
+ *
+ * rc = call_on_stack(nr, stack, rettype, fn, t1, a1, t2, a2, ...)
+ *
+ * - nr specifies the number of function arguments of fn.
+ * - stack specifies the stack to be used.
+ * - fn is the function to be called.
+ * - rettype is the return type of fn.
+ * - t1, a1, ... are pairs, where t1 must match the type of the first
+ * argument of fn, t2 the second, etc. a1 is the corresponding
+ * first function argument (not name), etc.
+ */
+#define call_on_stack(nr, stack, rettype, fn, ...) \
({ \
+ rettype (*__fn)(CALL_PARM_##nr(__VA_ARGS__)) = fn; \
unsigned long frame = current_frame_address(); \
- CALL_ARGS_##nr(args); \
+ unsigned long __stack = stack; \
unsigned long prev; \
+ CALL_LARGS_##nr(__VA_ARGS__); \
+ CALL_REGS_##nr; \
\
+ CALL_TYPECHECK_##nr(__VA_ARGS__); \
asm volatile( \
- " la %[_prev],0(15)\n" \
+ " lgr %[_prev],15\n" \
" lg 15,%[_stack]\n" \
" stg %[_frame],%[_bc](15)\n" \
" brasl 14,%[_fn]\n" \
- " la 15,0(%[_prev])\n" \
- : [_prev] "=&a" (prev), CALL_FMT_##nr \
- [_stack] "R" (stack), \
+ " lgr 15,%[_prev]\n" \
+ : [_prev] "=&d" (prev), CALL_FMT_##nr \
+ : [_stack] "R" (__stack), \
[_bc] "i" (offsetof(struct stack_frame, back_chain)), \
[_frame] "d" (frame), \
- [_fn] "X" (fn) : CALL_CLOBBER_##nr); \
- r2; \
+ [_fn] "X" (__fn) : CALL_CLOBBER_##nr); \
+ (rettype)r2; \
})
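A hedged sketch of the two-argument form of call_on_stack() documented above; the called function, the argument values and the stack address are placeholders chosen only to show the type/argument pairing.

/* Illustrative only: run example_sum() on the given stack. */
static long example_sum(int a, long b)
{
	return a + b;
}

static long example_run_on_stack(unsigned long stack)
{
	/* nr = 2 arguments, return type long, (type, value) pairs follow */
	return call_on_stack(2, stack, long, example_sum, int, 3, long, 4L);
}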
-#define CALL_ON_STACK_NORETURN(fn, stack) \
+/*
+ * Use call_nodat() to call a function with DAT disabled.
+ * Proper sign and zero extension of function arguments is done.
+ * Usage:
+ *
+ * rc = call_nodat(nr, rettype, fn, t1, a1, t2, a2, ...)
+ *
+ * - nr specifies the number of function arguments of fn.
+ * - fn is the function to be called, where fn is a physical address.
+ * - rettype is the return type of fn.
+ * - t1, a1, ... are pairs, where t1 must match the type of the first
+ * argument of fn, t2 the second, etc. a1 is the corresponding
+ * first function argument (not name), etc.
+ *
+ * fn() is called with standard C function call ABI, with the exception
+ * that no useful stackframe or stackpointer is passed via register 15.
+ * Therefore the called function must not use r15 to access the stack.
+ */
+#define call_nodat(nr, rettype, fn, ...) \
({ \
+ rettype (*__fn)(CALL_PARM_##nr(__VA_ARGS__)) = (fn); \
+ /* aligned since psw_leave must not cross page boundary */ \
+ psw_t __aligned(16) psw_leave; \
+ psw_t psw_enter; \
+ CALL_LARGS_##nr(__VA_ARGS__); \
+ CALL_REGS_##nr; \
+ \
+ CALL_TYPECHECK_##nr(__VA_ARGS__); \
+ psw_enter.mask = PSW_KERNEL_BITS & ~PSW_MASK_DAT; \
+ psw_enter.addr = (unsigned long)__fn; \
asm volatile( \
- " la 15,0(%[_stack])\n" \
- " xc %[_bc](8,15),%[_bc](15)\n" \
- " brasl 14,%[_fn]\n" \
- ::[_bc] "i" (offsetof(struct stack_frame, back_chain)), \
- [_stack] "a" (stack), [_fn] "X" (fn)); \
- BUG(); \
+ " epsw 0,1\n" \
+ " risbg 1,0,0,31,32\n" \
+ " larl 7,1f\n" \
+ " stg 1,%[psw_leave]\n" \
+ " stg 7,8+%[psw_leave]\n" \
+ " la 7,%[psw_leave]\n" \
+ " lra 7,0(7)\n" \
+ " larl 1,0f\n" \
+ " lra 14,0(1)\n" \
+ " lpswe %[psw_enter]\n" \
+ "0: lpswe 0(7)\n" \
+ "1:\n" \
+ : CALL_FMT_##nr, [psw_leave] "=Q" (psw_leave) \
+ : [psw_enter] "Q" (psw_enter) \
+ : "7", CALL_CLOBBER_##nr); \
+ (rettype)r2; \
})
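An illustrative call through the DAT-off helper above; the entry point is hypothetical and, as the comment requires, must be a physical address of a routine that does not use r15.

/* Illustrative only: invoke a physical-address routine with DAT disabled,
 * passing one unsigned long argument. */
static void example_nodat_call(unsigned long phys_entry, unsigned long subcode)
{
	void (*entry)(unsigned long) = (void (*)(unsigned long))phys_entry;

	call_nodat(1, void, entry, unsigned long, subcode);
}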
#endif /* _ASM_S390_STACKTRACE_H */
diff --git a/arch/s390/include/asm/stp.h b/arch/s390/include/asm/stp.h
index f0ddefb06ec8..4d74d7e33340 100644
--- a/arch/s390/include/asm/stp.h
+++ b/arch/s390/include/asm/stp.h
@@ -6,43 +6,89 @@
#ifndef __S390_STP_H
#define __S390_STP_H
+#include <linux/compiler.h>
+
/* notifier for syncs */
extern struct atomic_notifier_head s390_epoch_delta_notifier;
/* STP interruption parameter */
struct stp_irq_parm {
- unsigned int _pad0 : 14;
- unsigned int tsc : 1; /* Timing status change */
- unsigned int lac : 1; /* Link availability change */
- unsigned int tcpc : 1; /* Time control parameter change */
- unsigned int _pad2 : 15;
-} __attribute__ ((packed));
+ u32 : 14;
+ u32 tsc : 1; /* Timing status change */
+ u32 lac : 1; /* Link availability change */
+ u32 tcpc : 1; /* Time control parameter change */
+ u32 : 15;
+} __packed;
#define STP_OP_SYNC 1
#define STP_OP_CTRL 3
struct stp_sstpi {
- unsigned int rsvd0;
- unsigned int rsvd1 : 8;
- unsigned int stratum : 8;
- unsigned int vbits : 16;
- unsigned int leaps : 16;
- unsigned int tmd : 4;
- unsigned int ctn : 4;
- unsigned int rsvd2 : 3;
- unsigned int c : 1;
- unsigned int tst : 4;
- unsigned int tzo : 16;
- unsigned int dsto : 16;
- unsigned int ctrl : 16;
- unsigned int rsvd3 : 16;
- unsigned int tto;
- unsigned int rsvd4;
- unsigned int ctnid[3];
- unsigned int rsvd5;
- unsigned int todoff[4];
- unsigned int rsvd6[48];
-} __attribute__ ((packed));
+ u32 : 32;
+ u32 tu : 1;
+ u32 lu : 1;
+ u32 : 6;
+ u32 stratum : 8;
+ u32 vbits : 16;
+ u32 leaps : 16;
+ u32 tmd : 4;
+ u32 ctn : 4;
+ u32 : 3;
+ u32 c : 1;
+ u32 tst : 4;
+ u32 tzo : 16;
+ u32 dsto : 16;
+ u32 ctrl : 16;
+ u32 : 16;
+ u32 tto;
+ u32 : 32;
+ u32 ctnid[3];
+ u32 : 32;
+ u64 todoff;
+ u32 rsvd[50];
+} __packed;
+
+struct stp_tzib {
+ u32 tzan : 16;
+ u32 : 16;
+ u32 tzo : 16;
+ u32 dsto : 16;
+ u32 stn;
+ u32 dstn;
+ u64 dst_on_alg;
+ u64 dst_off_alg;
+} __packed;
+
+struct stp_tcpib {
+ u32 atcode : 4;
+ u32 ntcode : 4;
+ u32 d : 1;
+ u32 : 23;
+ s32 tto;
+ struct stp_tzib atzib;
+ struct stp_tzib ntzib;
+ s32 adst_offset : 16;
+ s32 ndst_offset : 16;
+ u32 rsvd1;
+ u64 ntzib_update;
+ u64 ndsto_update;
+} __packed;
+
+struct stp_lsoib {
+ u32 p : 1;
+ u32 : 31;
+ s32 also : 16;
+ s32 nlso : 16;
+ u64 nlsout;
+} __packed;
+
+struct stp_stzi {
+ u32 rsvd0[3];
+ u64 data_ts;
+ u32 rsvd1[22];
+ struct stp_tcpib tcpib;
+ struct stp_lsoib lsoib;
+} __packed;
/* Functions needed by the machine check handler */
int stp_sync_check(void);
diff --git a/arch/s390/include/asm/string.h b/arch/s390/include/asm/string.h
index 4c0690fc5167..351685de53d2 100644
--- a/arch/s390/include/asm/string.h
+++ b/arch/s390/include/asm/string.h
@@ -31,22 +31,18 @@ void *memmove(void *dest, const void *src, size_t n);
#define __HAVE_ARCH_STRCMP /* arch function */
#define __HAVE_ARCH_STRCPY /* inline & arch function */
#define __HAVE_ARCH_STRLCAT /* arch function */
-#define __HAVE_ARCH_STRLCPY /* arch function */
#define __HAVE_ARCH_STRLEN /* inline & arch function */
#define __HAVE_ARCH_STRNCAT /* arch function */
#define __HAVE_ARCH_STRNCPY /* arch function */
#define __HAVE_ARCH_STRNLEN /* inline & arch function */
-#define __HAVE_ARCH_STRRCHR /* arch function */
#define __HAVE_ARCH_STRSTR /* arch function */
/* Prototypes for non-inlined arch strings functions. */
int memcmp(const void *s1, const void *s2, size_t n);
int strcmp(const char *s1, const char *s2);
size_t strlcat(char *dest, const char *src, size_t n);
-size_t strlcpy(char *dest, const char *src, size_t size);
char *strncat(char *dest, const char *src, size_t n);
char *strncpy(char *dest, const char *src, size_t n);
-char *strrchr(const char *s, int c);
char *strstr(const char *s1, const char *s2);
#endif /* !CONFIG_KASAN */
@@ -59,18 +55,6 @@ char *strstr(const char *s1, const char *s2);
#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)
-extern void *__memcpy(void *dest, const void *src, size_t n);
-extern void *__memset(void *s, int c, size_t n);
-extern void *__memmove(void *dest, const void *src, size_t n);
-
-/*
- * For files that are not instrumented (e.g. mm/slub.c) we
- * should use not instrumented version of mem* functions.
- */
-
-#define memcpy(dst, src, len) __memcpy(dst, src, len)
-#define memmove(dst, src, len) __memmove(dst, src, len)
-#define memset(s, c, n) __memset(s, c, n)
#define strlen(s) __strlen(s)
#define __no_sanitize_prefix_strfunc(x) __##x
@@ -83,6 +67,9 @@ extern void *__memmove(void *dest, const void *src, size_t n);
#define __no_sanitize_prefix_strfunc(x) x
#endif /* defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) */
+void *__memcpy(void *dest, const void *src, size_t n);
+void *__memset(void *s, int c, size_t n);
+void *__memmove(void *dest, const void *src, size_t n);
void *__memset16(uint16_t *s, uint16_t v, size_t count);
void *__memset32(uint32_t *s, uint32_t v, size_t count);
void *__memset64(uint64_t *s, uint64_t v, size_t count);
@@ -107,16 +94,18 @@ static inline void *memset64(uint64_t *s, uint64_t v, size_t count)
#ifdef __HAVE_ARCH_MEMCHR
static inline void *memchr(const void * s, int c, size_t n)
{
- register int r0 asm("0") = (char) c;
const void *ret = s + n;
asm volatile(
- "0: srst %0,%1\n"
+ " lgr 0,%[c]\n"
+ "0: srst %[ret],%[s]\n"
" jo 0b\n"
" jl 1f\n"
- " la %0,0\n"
+ " la %[ret],0\n"
"1:"
- : "+a" (ret), "+&a" (s) : "d" (r0) : "cc", "memory");
+ : [ret] "+&a" (ret), [s] "+&a" (s)
+ : [c] "d" (c)
+ : "cc", "memory", "0");
return (void *) ret;
}
#endif
@@ -124,13 +113,15 @@ static inline void *memchr(const void * s, int c, size_t n)
#ifdef __HAVE_ARCH_MEMSCAN
static inline void *memscan(void *s, int c, size_t n)
{
- register int r0 asm("0") = (char) c;
const void *ret = s + n;
asm volatile(
- "0: srst %0,%1\n"
+ " lgr 0,%[c]\n"
+ "0: srst %[ret],%[s]\n"
" jo 0b\n"
- : "+a" (ret), "+&a" (s) : "d" (r0) : "cc", "memory");
+ : [ret] "+&a" (ret), [s] "+&a" (s)
+ : [c] "d" (c)
+ : "cc", "memory", "0");
return (void *) ret;
}
#endif
@@ -138,17 +129,18 @@ static inline void *memscan(void *s, int c, size_t n)
#ifdef __HAVE_ARCH_STRCAT
static inline char *strcat(char *dst, const char *src)
{
- register int r0 asm("0") = 0;
- unsigned long dummy;
+ unsigned long dummy = 0;
char *ret = dst;
asm volatile(
- "0: srst %0,%1\n"
+ " lghi 0,0\n"
+ "0: srst %[dummy],%[dst]\n"
" jo 0b\n"
- "1: mvst %0,%2\n"
+ "1: mvst %[dummy],%[src]\n"
" jo 1b"
- : "=&a" (dummy), "+a" (dst), "+a" (src)
- : "d" (r0), "0" (0) : "cc", "memory" );
+ : [dummy] "+&a" (dummy), [dst] "+&a" (dst), [src] "+&a" (src)
+ :
+ : "cc", "memory", "0");
return ret;
}
#endif
@@ -156,14 +148,15 @@ static inline char *strcat(char *dst, const char *src)
#ifdef __HAVE_ARCH_STRCPY
static inline char *strcpy(char *dst, const char *src)
{
- register int r0 asm("0") = 0;
char *ret = dst;
asm volatile(
- "0: mvst %0,%1\n"
+ " lghi 0,0\n"
+ "0: mvst %[dst],%[src]\n"
" jo 0b"
- : "+&a" (dst), "+&a" (src) : "d" (r0)
- : "cc", "memory");
+ : [dst] "+&a" (dst), [src] "+&a" (src)
+ :
+ : "cc", "memory", "0");
return ret;
}
#endif
@@ -171,28 +164,33 @@ static inline char *strcpy(char *dst, const char *src)
#if defined(__HAVE_ARCH_STRLEN) || (defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__))
static inline size_t __no_sanitize_prefix_strfunc(strlen)(const char *s)
{
- register unsigned long r0 asm("0") = 0;
+ unsigned long end = 0;
const char *tmp = s;
asm volatile(
- "0: srst %0,%1\n"
+ " lghi 0,0\n"
+ "0: srst %[end],%[tmp]\n"
" jo 0b"
- : "+d" (r0), "+a" (tmp) : : "cc", "memory");
- return r0 - (unsigned long) s;
+ : [end] "+&a" (end), [tmp] "+&a" (tmp)
+ :
+ : "cc", "memory", "0");
+ return end - (unsigned long)s;
}
#endif
#ifdef __HAVE_ARCH_STRNLEN
static inline size_t strnlen(const char * s, size_t n)
{
- register int r0 asm("0") = 0;
const char *tmp = s;
const char *end = s + n;
asm volatile(
- "0: srst %0,%1\n"
+ " lghi 0,0\n"
+ "0: srst %[end],%[tmp]\n"
" jo 0b"
- : "+a" (end), "+a" (tmp) : "d" (r0) : "cc", "memory");
+ : [end] "+&a" (end), [tmp] "+&a" (tmp)
+ :
+ : "cc", "memory", "0");
return end - s;
}
#endif
diff --git a/arch/s390/include/asm/syscall.h b/arch/s390/include/asm/syscall.h
index f073292e9fdb..27e3d804b311 100644
--- a/arch/s390/include/asm/syscall.h
+++ b/arch/s390/include/asm/syscall.h
@@ -14,8 +14,8 @@
#include <linux/err.h>
#include <asm/ptrace.h>
-extern const unsigned long sys_call_table[];
-extern const unsigned long sys_call_table_emu[];
+extern const sys_call_ptr_t sys_call_table[];
+extern const sys_call_ptr_t sys_call_table_emu[];
static inline long syscall_get_nr(struct task_struct *task,
struct pt_regs *regs)
@@ -33,7 +33,17 @@ static inline void syscall_rollback(struct task_struct *task,
static inline long syscall_get_error(struct task_struct *task,
struct pt_regs *regs)
{
- return IS_ERR_VALUE(regs->gprs[2]) ? regs->gprs[2] : 0;
+ unsigned long error = regs->gprs[2];
+#ifdef CONFIG_COMPAT
+ if (test_tsk_thread_flag(task, TIF_31BIT)) {
+ /*
+ * Sign-extend the value so (int)-EFOO becomes (long)-EFOO
+ * and will match correctly in comparisons.
+ */
+ error = (long)(int)error;
+ }
+#endif
+ return IS_ERR_VALUE(error) ? error : 0;
}
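A worked example of the compat fixup above, using an illustrative errno value rather than anything from this patch.

/* Illustrative only: a 31-bit task returning -EINVAL leaves 0xffffffea
 * (zero-extended) in gpr 2, which IS_ERR_VALUE() would not recognize as an
 * error on a 64-bit value; after (long)(int) sign extension it becomes
 * 0xffffffffffffffea == -22 and matches. */
static inline long example_compat_error_fixup(void)
{
	unsigned long gpr2 = 0xffffffeaUL;	/* -EINVAL from a 31-bit task */

	return (long)(int)gpr2;			/* == -22 */
}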
static inline long syscall_get_return_value(struct task_struct *task,
@@ -46,6 +56,7 @@ static inline void syscall_set_return_value(struct task_struct *task,
struct pt_regs *regs,
int error, long val)
{
+ set_pt_regs_flag(regs, PIF_SYSCALL_RET_SET);
regs->gprs[2] = error ? error : val;
}
@@ -67,18 +78,6 @@ static inline void syscall_get_arguments(struct task_struct *task,
args[0] = regs->orig_gpr2 & mask;
}
-static inline void syscall_set_arguments(struct task_struct *task,
- struct pt_regs *regs,
- const unsigned long *args)
-{
- unsigned int n = 6;
-
- while (n-- > 0)
- if (n > 0)
- regs->gprs[2 + n] = args[n];
- regs->orig_gpr2 = args[0];
-}
-
static inline int syscall_get_arch(struct task_struct *task)
{
#ifdef CONFIG_COMPAT
@@ -87,4 +86,69 @@ static inline int syscall_get_arch(struct task_struct *task)
#endif
return AUDIT_ARCH_S390X;
}
+
+static inline bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs)
+{
+ return false;
+}
+
+#define SYSCALL_FMT_0
+#define SYSCALL_FMT_1 , "0" (r2)
+#define SYSCALL_FMT_2 , "d" (r3) SYSCALL_FMT_1
+#define SYSCALL_FMT_3 , "d" (r4) SYSCALL_FMT_2
+#define SYSCALL_FMT_4 , "d" (r5) SYSCALL_FMT_3
+#define SYSCALL_FMT_5 , "d" (r6) SYSCALL_FMT_4
+#define SYSCALL_FMT_6 , "d" (r7) SYSCALL_FMT_5
+
+#define SYSCALL_PARM_0
+#define SYSCALL_PARM_1 , long arg1
+#define SYSCALL_PARM_2 SYSCALL_PARM_1, long arg2
+#define SYSCALL_PARM_3 SYSCALL_PARM_2, long arg3
+#define SYSCALL_PARM_4 SYSCALL_PARM_3, long arg4
+#define SYSCALL_PARM_5 SYSCALL_PARM_4, long arg5
+#define SYSCALL_PARM_6 SYSCALL_PARM_5, long arg6
+
+#define SYSCALL_REGS_0
+#define SYSCALL_REGS_1 \
+ register long r2 asm("2") = arg1
+#define SYSCALL_REGS_2 \
+ SYSCALL_REGS_1; \
+ register long r3 asm("3") = arg2
+#define SYSCALL_REGS_3 \
+ SYSCALL_REGS_2; \
+ register long r4 asm("4") = arg3
+#define SYSCALL_REGS_4 \
+ SYSCALL_REGS_3; \
+ register long r5 asm("5") = arg4
+#define SYSCALL_REGS_5 \
+ SYSCALL_REGS_4; \
+ register long r6 asm("6") = arg5
+#define SYSCALL_REGS_6 \
+ SYSCALL_REGS_5; \
+ register long r7 asm("7") = arg6
+
+#define GENERATE_SYSCALL_FUNC(nr) \
+static __always_inline \
+long syscall##nr(unsigned long syscall SYSCALL_PARM_##nr) \
+{ \
+ register unsigned long r1 asm ("1") = syscall; \
+ register long rc asm ("2"); \
+ SYSCALL_REGS_##nr; \
+ \
+ asm volatile ( \
+ " svc 0\n" \
+ : "=d" (rc) \
+ : "d" (r1) SYSCALL_FMT_##nr \
+ : "memory"); \
+ return rc; \
+}
+
+GENERATE_SYSCALL_FUNC(0)
+GENERATE_SYSCALL_FUNC(1)
+GENERATE_SYSCALL_FUNC(2)
+GENERATE_SYSCALL_FUNC(3)
+GENERATE_SYSCALL_FUNC(4)
+GENERATE_SYSCALL_FUNC(5)
+GENERATE_SYSCALL_FUNC(6)
+
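A hedged sketch of invoking one of the generated inline wrappers above; __NR_write comes from <asm/unistd.h>, while the file descriptor and message are example values only, and issuing write() from kernel context here serves purely to show the calling convention.

/* Illustrative only: three-argument system call via the generated wrapper. */
#include <asm/unistd.h>

static long example_syscall_write(void)
{
	static const char msg[] = "hello\n";

	return syscall3(__NR_write, 1, (long)msg, sizeof(msg) - 1);
}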
#endif /* _ASM_SYSCALL_H */
diff --git a/arch/s390/include/asm/syscall_wrapper.h b/arch/s390/include/asm/syscall_wrapper.h
index 3c3d6fe8e2f0..35c1d1b860d8 100644
--- a/arch/s390/include/asm/syscall_wrapper.h
+++ b/arch/s390/include/asm/syscall_wrapper.h
@@ -7,9 +7,13 @@
#ifndef _ASM_S390_SYSCALL_WRAPPER_H
#define _ASM_S390_SYSCALL_WRAPPER_H
+/* Mapping of registers to parameters for syscalls */
+#define SC_S390_REGS_TO_ARGS(x, ...) \
+ __MAP(x, __SC_ARGS \
+ ,, regs->orig_gpr2,, regs->gprs[3],, regs->gprs[4] \
+ ,, regs->gprs[5],, regs->gprs[6],, regs->gprs[7])
+
#ifdef CONFIG_COMPAT
-#define __SC_COMPAT_TYPE(t, a) \
- __typeof(__builtin_choose_expr(sizeof(t) > 4, 0L, (t)0)) a
#define __SC_COMPAT_CAST(t, a) \
({ \
@@ -29,107 +33,108 @@
(t)__ReS; \
})
-#define __S390_SYS_STUBx(x, name, ...) \
- asmlinkage long __s390_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__))\
- ALLOW_ERROR_INJECTION(__s390_sys##name, ERRNO); \
- asmlinkage long __s390_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__))\
- { \
- long ret = __s390x_sys##name(__MAP(x,__SC_COMPAT_CAST,__VA_ARGS__));\
- __MAP(x,__SC_TEST,__VA_ARGS__); \
- return ret; \
- }
-
/*
* To keep the naming coherent, re-define SYSCALL_DEFINE0 to create an alias
* named __s390x_sys_*()
*/
#define COMPAT_SYSCALL_DEFINE0(sname) \
- SYSCALL_METADATA(_##sname, 0); \
- asmlinkage long __s390_compat_sys_##sname(void); \
- ALLOW_ERROR_INJECTION(__s390_compat__sys_##sname, ERRNO); \
- asmlinkage long __s390_compat_sys_##sname(void)
+ long __s390_compat_sys_##sname(void); \
+ ALLOW_ERROR_INJECTION(__s390_compat_sys_##sname, ERRNO); \
+ long __s390_compat_sys_##sname(void)
#define SYSCALL_DEFINE0(sname) \
SYSCALL_METADATA(_##sname, 0); \
- asmlinkage long __s390x_sys_##sname(void); \
+ long __s390_sys_##sname(void); \
+ ALLOW_ERROR_INJECTION(__s390_sys_##sname, ERRNO); \
+ long __s390x_sys_##sname(void); \
ALLOW_ERROR_INJECTION(__s390x_sys_##sname, ERRNO); \
- asmlinkage long __s390_sys_##sname(void) \
- __attribute__((alias(__stringify(__s390x_sys_##sname)))); \
- asmlinkage long __s390x_sys_##sname(void)
+ static inline long __do_sys_##sname(void); \
+ long __s390_sys_##sname(void) \
+ { \
+ return __do_sys_##sname(); \
+ } \
+ long __s390x_sys_##sname(void) \
+ { \
+ return __do_sys_##sname(); \
+ } \
+ static inline long __do_sys_##sname(void)
#define COND_SYSCALL(name) \
cond_syscall(__s390x_sys_##name); \
cond_syscall(__s390_sys_##name)
-#define SYS_NI(name) \
- SYSCALL_ALIAS(__s390x_sys_##name, sys_ni_posix_timers); \
- SYSCALL_ALIAS(__s390_sys_##name, sys_ni_posix_timers)
-
-#define COMPAT_SYSCALL_DEFINEx(x, name, ...) \
- __diag_push(); \
- __diag_ignore(GCC, 8, "-Wattribute-alias", \
- "Type aliasing is used to sanitize syscall arguments");\
- asmlinkage long __s390_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \
- asmlinkage long __s390_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \
- __attribute__((alias(__stringify(__se_compat_sys##name)))); \
- ALLOW_ERROR_INJECTION(compat_sys##name, ERRNO); \
- static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\
- asmlinkage long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \
- asmlinkage long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \
- { \
- long ret = __do_compat_sys##name(__MAP(x,__SC_DELOUSE,__VA_ARGS__));\
- __MAP(x,__SC_TEST,__VA_ARGS__); \
- return ret; \
- } \
- __diag_pop(); \
- static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))
+#define COMPAT_SYSCALL_DEFINEx(x, name, ...) \
+ long __s390_compat_sys##name(struct pt_regs *regs); \
+ ALLOW_ERROR_INJECTION(__s390_compat_sys##name, ERRNO); \
+ static inline long __se_compat_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)); \
+ static inline long __do_compat_sys##name(__MAP(x, __SC_DECL, __VA_ARGS__)); \
+ long __s390_compat_sys##name(struct pt_regs *regs) \
+ { \
+ return __se_compat_sys##name(SC_S390_REGS_TO_ARGS(x, __VA_ARGS__)); \
+ } \
+ static inline long __se_compat_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)) \
+ { \
+ __MAP(x, __SC_TEST, __VA_ARGS__); \
+ return __do_compat_sys##name(__MAP(x, __SC_DELOUSE, __VA_ARGS__)); \
+ } \
+ static inline long __do_compat_sys##name(__MAP(x, __SC_DECL, __VA_ARGS__))
/*
* As some compat syscalls may not be implemented, we need to expand
- * COND_SYSCALL_COMPAT in kernel/sys_ni.c and COMPAT_SYS_NI in
- * kernel/time/posix-stubs.c to cover this case as well.
+ * COND_SYSCALL_COMPAT in kernel/sys_ni.c to cover this case as well.
*/
#define COND_SYSCALL_COMPAT(name) \
cond_syscall(__s390_compat_sys_##name)
-#define COMPAT_SYS_NI(name) \
- SYSCALL_ALIAS(__s390_compat_sys_##name, sys_ni_posix_timers)
+#define __S390_SYS_STUBx(x, name, ...) \
+ long __s390_sys##name(struct pt_regs *regs); \
+ ALLOW_ERROR_INJECTION(__s390_sys##name, ERRNO); \
+ static inline long ___se_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)); \
+ long __s390_sys##name(struct pt_regs *regs) \
+ { \
+ return ___se_sys##name(SC_S390_REGS_TO_ARGS(x, __VA_ARGS__)); \
+ } \
+ static inline long ___se_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)) \
+ { \
+ __MAP(x, __SC_TEST, __VA_ARGS__); \
+ return __do_sys##name(__MAP(x, __SC_COMPAT_CAST, __VA_ARGS__)); \
+ }
#else /* CONFIG_COMPAT */
-#define __S390_SYS_STUBx(x, fullname, name, ...)
-
#define SYSCALL_DEFINE0(sname) \
SYSCALL_METADATA(_##sname, 0); \
- asmlinkage long __s390x_sys_##sname(void); \
+ long __s390x_sys_##sname(void); \
ALLOW_ERROR_INJECTION(__s390x_sys_##sname, ERRNO); \
- asmlinkage long __s390x_sys_##sname(void)
+ static inline long __do_sys_##sname(void); \
+ long __s390x_sys_##sname(void) \
+ { \
+ return __do_sys_##sname(); \
+ } \
+ static inline long __do_sys_##sname(void)
#define COND_SYSCALL(name) \
cond_syscall(__s390x_sys_##name)
-#define SYS_NI(name) \
- SYSCALL_ALIAS(__s390x_sys_##name, sys_ni_posix_timers);
+#define __S390_SYS_STUBx(x, fullname, name, ...)
#endif /* CONFIG_COMPAT */
#define __SYSCALL_DEFINEx(x, name, ...) \
- __diag_push(); \
- __diag_ignore(GCC, 8, "-Wattribute-alias", \
- "Type aliasing is used to sanitize syscall arguments");\
- asmlinkage long __s390x_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \
- __attribute__((alias(__stringify(__se_sys##name)))); \
+ long __s390x_sys##name(struct pt_regs *regs); \
ALLOW_ERROR_INJECTION(__s390x_sys##name, ERRNO); \
- long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \
- static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \
- __S390_SYS_STUBx(x, name, __VA_ARGS__) \
- asmlinkage long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \
+ static inline long __se_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)); \
+ static inline long __do_sys##name(__MAP(x, __SC_DECL, __VA_ARGS__)); \
+ __S390_SYS_STUBx(x, name, __VA_ARGS__); \
+ long __s390x_sys##name(struct pt_regs *regs) \
+ { \
+ return __se_sys##name(SC_S390_REGS_TO_ARGS(x, __VA_ARGS__)); \
+ } \
+ static inline long __se_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)) \
{ \
- long ret = __do_sys##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \
- __MAP(x,__SC_TEST,__VA_ARGS__); \
- return ret; \
+ __MAP(x, __SC_TEST, __VA_ARGS__); \
+ return __do_sys##name(__MAP(x, __SC_CAST, __VA_ARGS__)); \
} \
- __diag_pop(); \
- static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))
+ static inline long __do_sys##name(__MAP(x, __SC_DECL, __VA_ARGS__))
-#endif /* _ASM_X86_SYSCALL_WRAPPER_H */
+#endif /* _ASM_S390_SYSCALL_WRAPPER_H */
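
Note: with this change the generated syscall stubs take a single struct pt_regs pointer and pull the C arguments out of the saved user registers via SC_S390_REGS_TO_ARGS() (orig_gpr2 plus gprs 3-7). A rough, hand-expanded sketch of what SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) now turns into; this is illustrative, not the literal preprocessor output, and the __SC_TEST checks are omitted for brevity:

	long __s390x_sys_dup2(struct pt_regs *regs);
	static inline long __se_sys_dup2(long oldfd, long newfd);
	static inline long __do_sys_dup2(unsigned int oldfd, unsigned int newfd);

	long __s390x_sys_dup2(struct pt_regs *regs)
	{
		/* arguments come from the saved user registers, not from C parameters */
		return __se_sys_dup2(regs->orig_gpr2, regs->gprs[3]);
	}

	static inline long __se_sys_dup2(long oldfd, long newfd)
	{
		return __do_sys_dup2(oldfd, newfd);
	}

	static inline long __do_sys_dup2(unsigned int oldfd, unsigned int newfd)
	{
		/* ... syscall body as written inside the SYSCALL_DEFINE2() block ... */
		return 0;	/* placeholder */
	}

With CONFIG_COMPAT the same macro additionally emits a __s390_sys_dup2() stub that narrows the arguments through __SC_COMPAT_CAST before calling the common __do_sys_dup2() body.
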
diff --git a/arch/s390/include/asm/sysinfo.h b/arch/s390/include/asm/sysinfo.h
index fe7b3f8f0791..edca5a751df4 100644
--- a/arch/s390/include/asm/sysinfo.h
+++ b/arch/s390/include/asm/sysinfo.h
@@ -40,6 +40,10 @@ struct sysinfo_1_1_1 {
unsigned int ncr;
unsigned int npr;
unsigned int ntr;
+ char reserved_3[4];
+ char model_var_cap[16];
+ unsigned int model_var_cap_rating;
+ unsigned int nvr;
};
struct sysinfo_1_2_1 {
@@ -67,12 +71,12 @@ struct sysinfo_1_2_2 {
unsigned short cpus_configured;
unsigned short cpus_standby;
unsigned short cpus_reserved;
- unsigned short adjustment[0];
+ unsigned short adjustment[];
};
struct sysinfo_1_2_2_extension {
unsigned int alt_capability;
- unsigned short alt_adjustment[0];
+ unsigned short alt_adjustment[];
};
struct sysinfo_2_2_1 {
@@ -181,7 +185,7 @@ struct sysinfo_15_1_x {
unsigned char reserved1;
unsigned char mnest;
unsigned char reserved2[4];
- union topology_entry tle[0];
+ union topology_entry tle[];
};
int stsi(void *sysinfo, int fc, int sel1, int sel2);
diff --git a/arch/s390/include/asm/termios.h b/arch/s390/include/asm/termios.h
deleted file mode 100644
index 46fa3020b41e..000000000000
--- a/arch/s390/include/asm/termios.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * S390 version
- *
- * Derived from "include/asm-i386/termios.h"
- */
-#ifndef _S390_TERMIOS_H
-#define _S390_TERMIOS_H
-
-#include <uapi/asm/termios.h>
-
-
-/* intr=^C quit=^\ erase=del kill=^U
- eof=^D vtime=\0 vmin=\1 sxtc=\0
- start=^Q stop=^S susp=^Z eol=\0
- reprint=^R discard=^U werase=^W lnext=^V
- eol2=\0
-*/
-#define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0"
-
-#define user_termios_to_kernel_termios(k, u) copy_from_user(k, u, sizeof(struct termios2))
-#define kernel_termios_to_user_termios(u, k) copy_to_user(u, k, sizeof(struct termios2))
-
-#include <asm-generic/termios-base.h>
-
-#endif /* _S390_TERMIOS_H */
diff --git a/arch/s390/include/asm/text-patching.h b/arch/s390/include/asm/text-patching.h
new file mode 100644
index 000000000000..b219056a8817
--- /dev/null
+++ b/arch/s390/include/asm/text-patching.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _ASM_S390_TEXT_PATCHING_H
+#define _ASM_S390_TEXT_PATCHING_H
+
+#include <asm/barrier.h>
+
+static __always_inline void sync_core(void)
+{
+ bcr_serialize();
+}
+
+void text_poke_sync(void);
+void text_poke_sync_lock(void);
+
+#endif /* _ASM_S390_TEXT_PATCHING_H */
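
Note: text-patching.h is new; sync_core() maps to bcr_serialize(), and the text_poke_sync*() helpers serialize all CPUs after instruction bytes have been rewritten. A hedged sketch of how a caller might combine this with s390_kernel_write() (declared later in this series); the 6-byte "brcl 0,0" no-op and the helper name are illustrative:

	static void demo_patch_to_nop(void *addr)
	{
		/* brcl 0,0 - the 6-byte s390 no-op commonly used when patching out a call */
		static const unsigned char brcl_nop[6] = { 0xc0, 0x04, 0x00, 0x00, 0x00, 0x00 };

		s390_kernel_write(addr, brcl_nop, sizeof(brcl_nop));
		text_poke_sync();	/* make all CPUs serialize before executing the new code */
	}
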
diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h
index e582fbe59e20..a674c7d25da5 100644
--- a/arch/s390/include/asm/thread_info.h
+++ b/arch/s390/include/asm/thread_info.h
@@ -9,6 +9,9 @@
#define _ASM_THREAD_INFO_H
#include <linux/bits.h>
+#ifndef ASM_OFFSETS_C
+#include <asm/asm-offsets.h>
+#endif
/*
* General size of kernel stacks
@@ -18,16 +21,14 @@
#else
#define THREAD_SIZE_ORDER 2
#endif
-#define BOOT_STACK_ORDER 2
+#define BOOT_STACK_SIZE (PAGE_SIZE << 2)
#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
+#define STACK_INIT_OFFSET (THREAD_SIZE - STACK_FRAME_OVERHEAD - __PT_SIZE)
+
#ifndef __ASSEMBLY__
#include <asm/lowcore.h>
#include <asm/page.h>
-#include <asm/processor.h>
-
-#define STACK_INIT_OFFSET \
- (THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs))
/*
* low level task data that entry.S needs immediate access to
@@ -37,6 +38,8 @@
*/
struct thread_info {
unsigned long flags; /* low level flags */
+ unsigned long syscall_work; /* SYSCALL_WORK_ flags */
+ unsigned int cpu; /* current CPU */
};
/*
@@ -47,8 +50,7 @@ struct thread_info {
.flags = 0, \
}
-void arch_release_task_struct(struct task_struct *tsk);
-int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
+struct task_struct;
void arch_setup_new_exec(void);
#define arch_setup_new_exec arch_setup_new_exec
@@ -66,8 +68,9 @@ void arch_setup_new_exec(void);
#define TIF_GUARDED_STORAGE 4 /* load guarded storage control block */
#define TIF_PATCH_PENDING 5 /* pending live patching update */
#define TIF_PGSTE 6 /* New mm's will use 4K page tables */
-#define TIF_ISOLATE_BP 8 /* Run process with isolated BP */
+#define TIF_NOTIFY_SIGNAL 7 /* signal notifications exist */
#define TIF_ISOLATE_BP_GUEST 9 /* Run KVM guests with isolated BP */
+#define TIF_PER_TRAP 10 /* Need to handle PER trap on exit to usermode */
#define TIF_31BIT 16 /* 32bit process */
#define TIF_MEMDIE 17 /* is terminating due to OOM killer */
@@ -83,13 +86,14 @@ void arch_setup_new_exec(void);
#define TIF_SYSCALL_TRACEPOINT 27 /* syscall tracepoint instrumentation */
#define _TIF_NOTIFY_RESUME BIT(TIF_NOTIFY_RESUME)
+#define _TIF_NOTIFY_SIGNAL BIT(TIF_NOTIFY_SIGNAL)
#define _TIF_SIGPENDING BIT(TIF_SIGPENDING)
#define _TIF_NEED_RESCHED BIT(TIF_NEED_RESCHED)
#define _TIF_UPROBE BIT(TIF_UPROBE)
#define _TIF_GUARDED_STORAGE BIT(TIF_GUARDED_STORAGE)
#define _TIF_PATCH_PENDING BIT(TIF_PATCH_PENDING)
-#define _TIF_ISOLATE_BP BIT(TIF_ISOLATE_BP)
#define _TIF_ISOLATE_BP_GUEST BIT(TIF_ISOLATE_BP_GUEST)
+#define _TIF_PER_TRAP BIT(TIF_PER_TRAP)
#define _TIF_31BIT BIT(TIF_31BIT)
#define _TIF_SINGLE_STEP BIT(TIF_SINGLE_STEP)
diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h
index 6da8885251d6..4d646659a5f5 100644
--- a/arch/s390/include/asm/timex.h
+++ b/arch/s390/include/asm/timex.h
@@ -19,6 +19,25 @@
extern u64 clock_comparator_max;
+union tod_clock {
+ __uint128_t val;
+ struct {
+ __uint128_t ei : 8; /* epoch index */
+ __uint128_t tod : 64; /* bits 0-63 of tod clock */
+ __uint128_t : 40;
+ __uint128_t pf : 16; /* programmable field */
+ };
+ struct {
+ __uint128_t eitod : 72; /* epoch index + bits 0-63 tod clock */
+ __uint128_t : 56;
+ };
+ struct {
+ __uint128_t us : 60; /* micro-seconds */
+ __uint128_t sus : 12; /* sub-microseconds */
+ __uint128_t : 56;
+ };
+} __packed;
+
/* Inline functions for clock register access. */
static inline int set_tod_clock(__u64 time)
{
@@ -32,26 +51,36 @@ static inline int set_tod_clock(__u64 time)
return cc;
}
-static inline int store_tod_clock(__u64 *time)
+static inline int store_tod_clock_ext_cc(union tod_clock *clk)
{
int cc;
asm volatile(
- " stck %1\n"
+ " stcke %1\n"
" ipm %0\n"
" srl %0,28\n"
- : "=d" (cc), "=Q" (*time) : : "cc");
+ : "=d" (cc), "=Q" (*clk) : : "cc");
return cc;
}
+static __always_inline void store_tod_clock_ext(union tod_clock *tod)
+{
+ asm volatile("stcke %0" : "=Q" (*tod) : : "cc");
+}
+
static inline void set_clock_comparator(__u64 time)
{
asm volatile("sckc %0" : : "Q" (time));
}
-static inline void store_clock_comparator(__u64 *time)
+static inline void set_tod_programmable_field(u16 val)
{
- asm volatile("stckc %0" : "=Q" (*time));
+ asm volatile(
+ " lgr 0,%[val]\n"
+ " sckpf\n"
+ :
+ : [val] "d" ((unsigned long)val)
+ : "0");
}
void clock_comparator_work(void);
@@ -72,10 +101,10 @@ extern unsigned char ptff_function_mask[16];
/* Query TOD offset result */
struct ptff_qto {
- unsigned long long physical_clock;
- unsigned long long tod_offset;
- unsigned long long logical_tod_offset;
- unsigned long long tod_epoch_difference;
+ unsigned long physical_clock;
+ unsigned long tod_offset;
+ unsigned long logical_tod_offset;
+ unsigned long tod_epoch_difference;
} __packed;
static inline int ptff_query(unsigned int nr)
@@ -112,22 +141,25 @@ struct ptff_qui {
#define ptff(ptff_block, len, func) \
({ \
struct addrtype { char _[len]; }; \
- register unsigned int reg0 asm("0") = func; \
- register unsigned long reg1 asm("1") = (unsigned long) (ptff_block);\
+ unsigned int reg0 = func; \
+ unsigned long reg1 = (unsigned long)(ptff_block); \
int rc; \
\
asm volatile( \
- " .word 0x0104\n" \
- " ipm %0\n" \
- " srl %0,28\n" \
- : "=d" (rc), "+m" (*(struct addrtype *) reg1) \
- : "d" (reg0), "d" (reg1) : "cc"); \
+ " lgr 0,%[reg0]\n" \
+ " lgr 1,%[reg1]\n" \
+ " ptff\n" \
+ " ipm %[rc]\n" \
+ " srl %[rc],28\n" \
+ : [rc] "=&d" (rc), "+m" (*(struct addrtype *)reg1) \
+ : [reg0] "d" (reg0), [reg1] "d" (reg1) \
+ : "cc", "0", "1"); \
rc; \
})
-static inline unsigned long long local_tick_disable(void)
+static inline unsigned long local_tick_disable(void)
{
- unsigned long long old;
+ unsigned long old;
old = S390_lowcore.clock_comparator;
S390_lowcore.clock_comparator = clock_comparator_max;
@@ -135,53 +167,47 @@ static inline unsigned long long local_tick_disable(void)
return old;
}
-static inline void local_tick_enable(unsigned long long comp)
+static inline void local_tick_enable(unsigned long comp)
{
S390_lowcore.clock_comparator = comp;
set_clock_comparator(S390_lowcore.clock_comparator);
}
#define CLOCK_TICK_RATE 1193180 /* Underlying HZ */
-#define STORE_CLOCK_EXT_SIZE 16 /* stcke writes 16 bytes */
-
-typedef unsigned long long cycles_t;
-static inline void get_tod_clock_ext(char *clk)
-{
- typedef struct { char _[STORE_CLOCK_EXT_SIZE]; } addrtype;
-
- asm volatile("stcke %0" : "=Q" (*(addrtype *) clk) : : "cc");
-}
+typedef unsigned long cycles_t;
-static inline unsigned long long get_tod_clock(void)
+static __always_inline unsigned long get_tod_clock(void)
{
- unsigned char clk[STORE_CLOCK_EXT_SIZE];
+ union tod_clock clk;
- get_tod_clock_ext(clk);
- return *((unsigned long long *)&clk[1]);
+ store_tod_clock_ext(&clk);
+ return clk.tod;
}
-static inline unsigned long long get_tod_clock_fast(void)
+static inline unsigned long get_tod_clock_fast(void)
{
-#ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES
- unsigned long long clk;
+ unsigned long clk;
asm volatile("stckf %0" : "=Q" (clk) : : "cc");
return clk;
-#else
- return get_tod_clock();
-#endif
}
static inline cycles_t get_cycles(void)
{
return (cycles_t) get_tod_clock() >> 2;
}
+#define get_cycles get_cycles
int get_phys_clock(unsigned long *clock);
void init_cpu_timer(void);
-extern unsigned char tod_clock_base[16] __aligned(8);
+extern union tod_clock tod_clock_base;
+
+static __always_inline unsigned long __get_tod_clock_monotonic(void)
+{
+ return get_tod_clock() - tod_clock_base.tod;
+}
/**
* get_clock_monotonic - returns current time in clock rate units
@@ -190,13 +216,13 @@ extern unsigned char tod_clock_base[16] __aligned(8);
* Therefore preemption must be disabled, otherwise the returned
* value is not guaranteed to be monotonic.
*/
-static inline unsigned long long get_tod_clock_monotonic(void)
+static inline unsigned long get_tod_clock_monotonic(void)
{
- unsigned long long tod;
+ unsigned long tod;
- preempt_disable();
- tod = get_tod_clock() - *(unsigned long long *) &tod_clock_base[1];
- preempt_enable();
+ preempt_disable_notrace();
+ tod = __get_tod_clock_monotonic();
+ preempt_enable_notrace();
return tod;
}
@@ -219,7 +245,7 @@ static inline unsigned long long get_tod_clock_monotonic(void)
* -> ns = (th * 125) + ((tl * 125) >> 9);
*
*/
-static inline unsigned long long tod_to_ns(unsigned long long todval)
+static __always_inline unsigned long tod_to_ns(unsigned long todval)
{
return ((todval >> 9) * 125) + (((todval & 0x1ff) * 125) >> 9);
}
@@ -231,10 +257,10 @@ static inline unsigned long long tod_to_ns(unsigned long long todval)
*
* Returns: true if a is later than b
*/
-static inline int tod_after(unsigned long long a, unsigned long long b)
+static inline int tod_after(unsigned long a, unsigned long b)
{
if (MACHINE_HAS_SCC)
- return (long long) a > (long long) b;
+ return (long) a > (long) b;
return a > b;
}
@@ -245,10 +271,10 @@ static inline int tod_after(unsigned long long a, unsigned long long b)
*
* Returns: true if a is later than b
*/
-static inline int tod_after_eq(unsigned long long a, unsigned long long b)
+static inline int tod_after_eq(unsigned long a, unsigned long b)
{
if (MACHINE_HAS_SCC)
- return (long long) a >= (long long) b;
+ return (long) a >= (long) b;
return a >= b;
}
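
Note: tod_to_ns() above relies on one TOD clock unit being 2^-12 microseconds, i.e. 125/512 ns, so ns = ((tod >> 9) * 125) + (((tod & 0x1ff) * 125) >> 9). A small stand-alone sketch of the same arithmetic, useful for checking the formula outside the kernel:

	#include <stdio.h>

	static unsigned long tod_to_ns(unsigned long todval)
	{
		return ((todval >> 9) * 125) + (((todval & 0x1ff) * 125) >> 9);
	}

	int main(void)
	{
		printf("%lu\n", tod_to_ns(4096));	/* 4096 TOD units = 1 us -> prints 1000 */
		return 0;
	}
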
diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
index aa406c05a350..d1455a601adc 100644
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -25,10 +25,8 @@
void __tlb_remove_table(void *_table);
static inline void tlb_flush(struct mmu_gather *tlb);
static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
- struct page *page, int page_size);
-
-#define tlb_start_vma(tlb, vma) do { } while (0)
-#define tlb_end_vma(tlb, vma) do { } while (0)
+ struct encoded_page *page,
+ int page_size);
#define tlb_flush tlb_flush
#define pte_free_tlb pte_free_tlb
@@ -36,7 +34,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
#define p4d_free_tlb p4d_free_tlb
#define pud_free_tlb pud_free_tlb
-#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm-generic/tlb.h>
@@ -44,11 +41,15 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
* Release the page cache reference for a pte removed by
* tlb_ptep_clear_flush. In both flush modes the tlb for a page cache page
* has already been freed, so just do free_page_and_swap_cache.
+ *
+ * s390 doesn't delay rmap removal, so there is nothing encoded in
+ * the page pointer.
*/
static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
- struct page *page, int page_size)
+ struct encoded_page *page,
+ int page_size)
{
- free_page_and_swap_cache(page);
+ free_page_and_swap_cache(encoded_page_ptr(page));
return false;
}
@@ -67,13 +68,10 @@ static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
__tlb_adjust_range(tlb, address, PAGE_SIZE);
tlb->mm->context.flush_mm = 1;
tlb->freed_tables = 1;
- tlb->cleared_ptes = 1;
- /*
- * page_table_free_rcu takes care of the allocation bit masks
- * of the 2K table fragments in the 4K page table page,
- * then calls tlb_remove_table.
- */
- page_table_free_rcu(tlb, (unsigned long *) pte, address);
+ tlb->cleared_pmds = 1;
+ if (mm_alloc_pgste(tlb->mm))
+ gmap_unlink(tlb->mm, (unsigned long *)pte, address);
+ tlb_remove_ptdesc(tlb, pte);
}
/*
@@ -88,12 +86,12 @@ static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
{
if (mm_pmd_folded(tlb->mm))
return;
- pgtable_pmd_page_dtor(virt_to_page(pmd));
+ pagetable_pmd_dtor(virt_to_ptdesc(pmd));
__tlb_adjust_range(tlb, address, PAGE_SIZE);
tlb->mm->context.flush_mm = 1;
tlb->freed_tables = 1;
tlb->cleared_puds = 1;
- tlb_remove_table(tlb, pmd);
+ tlb_remove_ptdesc(tlb, pmd);
}
/*
@@ -111,8 +109,7 @@ static inline void p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d,
__tlb_adjust_range(tlb, address, PAGE_SIZE);
tlb->mm->context.flush_mm = 1;
tlb->freed_tables = 1;
- tlb->cleared_p4ds = 1;
- tlb_remove_table(tlb, p4d);
+ tlb_remove_ptdesc(tlb, p4d);
}
/*
@@ -129,8 +126,8 @@ static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
return;
tlb->mm->context.flush_mm = 1;
tlb->freed_tables = 1;
- tlb->cleared_puds = 1;
- tlb_remove_table(tlb, pud);
+ tlb->cleared_p4ds = 1;
+ tlb_remove_ptdesc(tlb, pud);
}
diff --git a/arch/s390/include/asm/tlbflush.h b/arch/s390/include/asm/tlbflush.h
index 82703e03f35d..a6e2cd89b609 100644
--- a/arch/s390/include/asm/tlbflush.h
+++ b/arch/s390/include/asm/tlbflush.h
@@ -5,8 +5,6 @@
#include <linux/mm.h>
#include <linux/sched.h>
#include <asm/processor.h>
-#include <asm/pgalloc.h>
-#include <asm/pgtable.h>
/*
* Flush all TLB entries on the local CPU.
@@ -27,13 +25,9 @@ static inline void __tlb_flush_idte(unsigned long asce)
if (MACHINE_HAS_TLB_GUEST)
opt |= IDTE_GUEST_ASCE;
/* Global TLB flush for the mm */
- asm volatile(
- " .insn rrf,0xb98e0000,0,%0,%1,0"
- : : "a" (opt), "a" (asce) : "cc");
+ asm volatile("idte 0,%1,%0" : : "a" (opt), "a" (asce) : "cc");
}
-void smp_ptlb_all(void);
-
/*
* Flush all TLB entries on all CPUs.
*/
diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h
index cca406fdbe51..3a0ac0c7a9a3 100644
--- a/arch/s390/include/asm/topology.h
+++ b/arch/s390/include/asm/topology.h
@@ -16,8 +16,8 @@ struct cpu_topology_s390 {
unsigned short socket_id;
unsigned short book_id;
unsigned short drawer_id;
- unsigned short node_id;
unsigned short dedicated : 1;
+ int booted_cores;
cpumask_t thread_mask;
cpumask_t core_mask;
cpumask_t book_mask;
@@ -25,7 +25,6 @@ struct cpu_topology_s390 {
};
extern struct cpu_topology_s390 cpu_topology[NR_CPUS];
-extern cpumask_t cpus_with_topology;
#define topology_physical_package_id(cpu) (cpu_topology[cpu].socket_id)
#define topology_thread_id(cpu) (cpu_topology[cpu].thread_id)
@@ -37,6 +36,7 @@ extern cpumask_t cpus_with_topology;
#define topology_drawer_id(cpu) (cpu_topology[cpu].drawer_id)
#define topology_drawer_cpumask(cpu) (&cpu_topology[cpu].drawer_mask)
#define topology_cpu_dedicated(cpu) (cpu_topology[cpu].dedicated)
+#define topology_booted_cores(cpu) (cpu_topology[cpu].booted_cores)
#define mc_capable() 1
@@ -45,6 +45,7 @@ int topology_cpu_init(struct cpu *);
int topology_set_cpu_management(int fc);
void topology_schedule_update(void);
void store_topology(struct sysinfo_15_1_x *info);
+void update_cpu_masks(void);
void topology_expect_change(void);
const struct cpumask *cpu_coregroup_mask(int cpu);
@@ -54,6 +55,8 @@ static inline void topology_init_early(void) { }
static inline void topology_schedule_update(void) { }
static inline int topology_cpu_init(struct cpu *cpu) { return 0; }
static inline int topology_cpu_dedicated(int cpu_nr) { return 0; }
+static inline int topology_booted_cores(int cpu_nr) { return 1; }
+static inline void update_cpu_masks(void) { }
static inline void topology_expect_change(void) { }
#endif /* CONFIG_SCHED_TOPOLOGY */
@@ -71,20 +74,18 @@ static inline void topology_expect_change(void) { }
#define cpu_to_node cpu_to_node
static inline int cpu_to_node(int cpu)
{
- return cpu_topology[cpu].node_id;
+ return 0;
}
/* Returns a pointer to the cpumask of CPUs on node 'node'. */
#define cpumask_of_node cpumask_of_node
static inline const struct cpumask *cpumask_of_node(int node)
{
- return &node_to_cpumask_map[node];
+ return cpu_possible_mask;
}
#define pcibus_to_node(bus) __pcibus_to_node(bus)
-#define node_distance(a, b) __node_distance(a, b)
-
#else /* !CONFIG_NUMA */
#define numa_node_id numa_node_id
diff --git a/arch/s390/include/asm/tpi.h b/arch/s390/include/asm/tpi.h
new file mode 100644
index 000000000000..f76e5fdff23a
--- /dev/null
+++ b/arch/s390/include/asm/tpi.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_S390_TPI_H
+#define _ASM_S390_TPI_H
+
+#include <linux/types.h>
+#include <uapi/asm/schid.h>
+
+#ifndef __ASSEMBLY__
+
+/* I/O-Interruption Code as stored by TEST PENDING INTERRUPTION (TPI). */
+struct tpi_info {
+ struct subchannel_id schid;
+ u32 intparm;
+ u32 adapter_IO:1;
+ u32 directed_irq:1;
+ u32 isc:3;
+ u32 :12;
+ u32 type:3;
+ u32 :12;
+} __packed __aligned(4);
+
+/* I/O-Interruption Code as stored by TPI for an Adapter I/O */
+struct tpi_adapter_info {
+ u32 aism:8;
+ u32 :22;
+ u32 error:1;
+ u32 forward:1;
+ u32 reserved;
+ u32 adapter_IO:1;
+ u32 directed_irq:1;
+ u32 isc:3;
+ u32 :27;
+} __packed __aligned(4);
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_S390_TPI_H */
diff --git a/arch/s390/include/asm/types.h b/arch/s390/include/asm/types.h
new file mode 100644
index 000000000000..0b5d550a0478
--- /dev/null
+++ b/arch/s390/include/asm/types.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef _ASM_S390_TYPES_H
+#define _ASM_S390_TYPES_H
+
+#include <uapi/asm/types.h>
+
+#ifndef __ASSEMBLY__
+
+union register_pair {
+ unsigned __int128 pair;
+ struct {
+ unsigned long even;
+ unsigned long odd;
+ };
+};
+
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_S390_TYPES_H */
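
Note: union register_pair overlays two 64-bit values on a single unsigned __int128, so passing .pair as one inline-asm operand makes the compiler allocate an even/odd general register pair, which several instructions require. A stand-alone sketch of the layout on big-endian s390x; the union definition is repeated here only so the snippet compiles on its own:

	#include <stdio.h>

	union register_pair {
		unsigned __int128 pair;
		struct {
			unsigned long even;
			unsigned long odd;
		};
	};

	int main(void)
	{
		union register_pair rp = { .even = 0x10, .odd = 0x20 };

		/* big-endian: the even half occupies the upper 64 bits of .pair */
		printf("%#lx %#lx\n", (unsigned long)(rp.pair >> 64),
		       (unsigned long)rp.pair);	/* prints 0x10 0x20 */
		return 0;
	}
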
diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h
index a470f1fa9f2a..81ae8a98e7ec 100644
--- a/arch/s390/include/asm/uaccess.h
+++ b/arch/s390/include/asm/uaccess.h
@@ -3,7 +3,7 @@
* S390 version
* Copyright IBM Corp. 1999, 2000
* Author(s): Hartmut Penner (hp@de.ibm.com),
- * Martin Schwidefsky (schwidefsky@de.ibm.com)
+ * Martin Schwidefsky (schwidefsky@de.ibm.com)
*
* Derived from "include/asm-i386/uaccess.h"
*/
@@ -13,41 +13,13 @@
/*
* User space memory access functions
*/
+#include <asm/asm-extable.h>
#include <asm/processor.h>
-#include <asm/ctl_reg.h>
#include <asm/extable.h>
#include <asm/facility.h>
+#include <asm-generic/access_ok.h>
-/*
- * The fs value determines whether argument validity checking should be
- * performed or not. If get_fs() == USER_DS, checking is performed, with
- * get_fs() == KERNEL_DS, checking is bypassed.
- *
- * For historical reasons, these macros are grossly misnamed.
- */
-
-#define KERNEL_DS (0)
-#define KERNEL_DS_SACF (1)
-#define USER_DS (2)
-#define USER_DS_SACF (3)
-
-#define get_fs() (current->thread.mm_segment)
-#define segment_eq(a,b) (((a) & 2) == ((b) & 2))
-
-void set_fs(mm_segment_t fs);
-
-static inline int __range_ok(unsigned long addr, unsigned long size)
-{
- return 1;
-}
-
-#define __access_ok(addr, size) \
-({ \
- __chk_user_ptr(addr); \
- __range_ok((unsigned long)(addr), (size)); \
-})
-
-#define access_ok(addr, size) __access_ok(addr, size)
+void debug_user_asce(int exit);
unsigned long __must_check
raw_copy_from_user(void *to, const void __user *from, unsigned long n);
@@ -60,209 +32,246 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long n);
#define INLINE_COPY_TO_USER
#endif
-#ifdef CONFIG_HAVE_MARCH_Z10_FEATURES
-
-#define __put_get_user_asm(to, from, size, spec) \
-({ \
- register unsigned long __reg0 asm("0") = spec; \
- int __rc; \
- \
- asm volatile( \
- "0: mvcos %1,%3,%2\n" \
- "1: xr %0,%0\n" \
- "2:\n" \
- ".pushsection .fixup, \"ax\"\n" \
- "3: lhi %0,%5\n" \
- " jg 2b\n" \
- ".popsection\n" \
- EX_TABLE(0b,3b) EX_TABLE(1b,3b) \
- : "=d" (__rc), "+Q" (*(to)) \
- : "d" (size), "Q" (*(from)), \
- "d" (__reg0), "K" (-EFAULT) \
- : "cc"); \
- __rc; \
+unsigned long __must_check
+_copy_from_user_key(void *to, const void __user *from, unsigned long n, unsigned long key);
+
+static __always_inline unsigned long __must_check
+copy_from_user_key(void *to, const void __user *from, unsigned long n, unsigned long key)
+{
+ if (check_copy_size(to, n, false))
+ n = _copy_from_user_key(to, from, n, key);
+ return n;
+}
+
+unsigned long __must_check
+_copy_to_user_key(void __user *to, const void *from, unsigned long n, unsigned long key);
+
+static __always_inline unsigned long __must_check
+copy_to_user_key(void __user *to, const void *from, unsigned long n, unsigned long key)
+{
+ if (check_copy_size(from, n, true))
+ n = _copy_to_user_key(to, from, n, key);
+ return n;
+}
+
+union oac {
+ unsigned int val;
+ struct {
+ struct {
+ unsigned short key : 4;
+ unsigned short : 4;
+ unsigned short as : 2;
+ unsigned short : 4;
+ unsigned short k : 1;
+ unsigned short a : 1;
+ } oac1;
+ struct {
+ unsigned short key : 4;
+ unsigned short : 4;
+ unsigned short as : 2;
+ unsigned short : 4;
+ unsigned short k : 1;
+ unsigned short a : 1;
+ } oac2;
+ };
+};
+
+int __noreturn __put_user_bad(void);
+
+#define __put_user_asm(to, from, size) \
+({ \
+ union oac __oac_spec = { \
+ .oac1.as = PSW_BITS_AS_SECONDARY, \
+ .oac1.a = 1, \
+ }; \
+ int __rc; \
+ \
+ asm volatile( \
+ " lr 0,%[spec]\n" \
+ "0: mvcos %[_to],%[_from],%[_size]\n" \
+ "1: xr %[rc],%[rc]\n" \
+ "2:\n" \
+ EX_TABLE_UA_STORE(0b, 2b, %[rc]) \
+ EX_TABLE_UA_STORE(1b, 2b, %[rc]) \
+ : [rc] "=&d" (__rc), [_to] "+Q" (*(to)) \
+ : [_size] "d" (size), [_from] "Q" (*(from)), \
+ [spec] "d" (__oac_spec.val) \
+ : "cc", "0"); \
+ __rc; \
})
static __always_inline int __put_user_fn(void *x, void __user *ptr, unsigned long size)
{
- unsigned long spec = 0x010000UL;
int rc;
switch (size) {
case 1:
- rc = __put_get_user_asm((unsigned char __user *)ptr,
- (unsigned char *)x,
- size, spec);
+ rc = __put_user_asm((unsigned char __user *)ptr,
+ (unsigned char *)x,
+ size);
break;
case 2:
- rc = __put_get_user_asm((unsigned short __user *)ptr,
- (unsigned short *)x,
- size, spec);
+ rc = __put_user_asm((unsigned short __user *)ptr,
+ (unsigned short *)x,
+ size);
break;
case 4:
- rc = __put_get_user_asm((unsigned int __user *)ptr,
- (unsigned int *)x,
- size, spec);
+ rc = __put_user_asm((unsigned int __user *)ptr,
+ (unsigned int *)x,
+ size);
break;
case 8:
- rc = __put_get_user_asm((unsigned long __user *)ptr,
- (unsigned long *)x,
- size, spec);
+ rc = __put_user_asm((unsigned long __user *)ptr,
+ (unsigned long *)x,
+ size);
+ break;
+ default:
+ __put_user_bad();
break;
}
return rc;
}
+int __noreturn __get_user_bad(void);
+
+#define __get_user_asm(to, from, size) \
+({ \
+ union oac __oac_spec = { \
+ .oac2.as = PSW_BITS_AS_SECONDARY, \
+ .oac2.a = 1, \
+ }; \
+ int __rc; \
+ \
+ asm volatile( \
+ " lr 0,%[spec]\n" \
+ "0: mvcos 0(%[_to]),%[_from],%[_size]\n" \
+ "1: xr %[rc],%[rc]\n" \
+ "2:\n" \
+ EX_TABLE_UA_LOAD_MEM(0b, 2b, %[rc], %[_to], %[_ksize]) \
+ EX_TABLE_UA_LOAD_MEM(1b, 2b, %[rc], %[_to], %[_ksize]) \
+ : [rc] "=&d" (__rc), "=Q" (*(to)) \
+ : [_size] "d" (size), [_from] "Q" (*(from)), \
+ [spec] "d" (__oac_spec.val), [_to] "a" (to), \
+ [_ksize] "K" (size) \
+ : "cc", "0"); \
+ __rc; \
+})
+
static __always_inline int __get_user_fn(void *x, const void __user *ptr, unsigned long size)
{
- unsigned long spec = 0x01UL;
int rc;
switch (size) {
case 1:
- rc = __put_get_user_asm((unsigned char *)x,
- (unsigned char __user *)ptr,
- size, spec);
+ rc = __get_user_asm((unsigned char *)x,
+ (unsigned char __user *)ptr,
+ size);
break;
case 2:
- rc = __put_get_user_asm((unsigned short *)x,
- (unsigned short __user *)ptr,
- size, spec);
+ rc = __get_user_asm((unsigned short *)x,
+ (unsigned short __user *)ptr,
+ size);
break;
case 4:
- rc = __put_get_user_asm((unsigned int *)x,
- (unsigned int __user *)ptr,
- size, spec);
+ rc = __get_user_asm((unsigned int *)x,
+ (unsigned int __user *)ptr,
+ size);
break;
case 8:
- rc = __put_get_user_asm((unsigned long *)x,
- (unsigned long __user *)ptr,
- size, spec);
+ rc = __get_user_asm((unsigned long *)x,
+ (unsigned long __user *)ptr,
+ size);
+ break;
+ default:
+ __get_user_bad();
break;
}
return rc;
}
-#else /* CONFIG_HAVE_MARCH_Z10_FEATURES */
-
-static inline int __put_user_fn(void *x, void __user *ptr, unsigned long size)
-{
- size = raw_copy_to_user(ptr, x, size);
- return size ? -EFAULT : 0;
-}
-
-static inline int __get_user_fn(void *x, const void __user *ptr, unsigned long size)
-{
- size = raw_copy_from_user(x, ptr, size);
- return size ? -EFAULT : 0;
-}
-
-#endif /* CONFIG_HAVE_MARCH_Z10_FEATURES */
-
/*
* These are the main single-value transfer routines. They automatically
* use the right size if we just have the right pointer type.
*/
-#define __put_user(x, ptr) \
-({ \
- __typeof__(*(ptr)) __x = (x); \
- int __pu_err = -EFAULT; \
- __chk_user_ptr(ptr); \
- switch (sizeof (*(ptr))) { \
- case 1: \
- case 2: \
- case 4: \
- case 8: \
- __pu_err = __put_user_fn(&__x, ptr, \
- sizeof(*(ptr))); \
- break; \
- default: \
- __put_user_bad(); \
- break; \
- } \
- __builtin_expect(__pu_err, 0); \
+#define __put_user(x, ptr) \
+({ \
+ __typeof__(*(ptr)) __x = (x); \
+ int __pu_err = -EFAULT; \
+ \
+ __chk_user_ptr(ptr); \
+ switch (sizeof(*(ptr))) { \
+ case 1: \
+ case 2: \
+ case 4: \
+ case 8: \
+ __pu_err = __put_user_fn(&__x, ptr, sizeof(*(ptr))); \
+ break; \
+ default: \
+ __put_user_bad(); \
+ break; \
+ } \
+ __builtin_expect(__pu_err, 0); \
})
-#define put_user(x, ptr) \
-({ \
- might_fault(); \
- __put_user(x, ptr); \
+#define put_user(x, ptr) \
+({ \
+ might_fault(); \
+ __put_user(x, ptr); \
})
-
-int __put_user_bad(void) __attribute__((noreturn));
-
-#define __get_user(x, ptr) \
-({ \
- int __gu_err = -EFAULT; \
- __chk_user_ptr(ptr); \
- switch (sizeof(*(ptr))) { \
- case 1: { \
- unsigned char __x = 0; \
- __gu_err = __get_user_fn(&__x, ptr, \
- sizeof(*(ptr))); \
- (x) = *(__force __typeof__(*(ptr)) *) &__x; \
- break; \
- }; \
- case 2: { \
- unsigned short __x = 0; \
- __gu_err = __get_user_fn(&__x, ptr, \
- sizeof(*(ptr))); \
- (x) = *(__force __typeof__(*(ptr)) *) &__x; \
- break; \
- }; \
- case 4: { \
- unsigned int __x = 0; \
- __gu_err = __get_user_fn(&__x, ptr, \
- sizeof(*(ptr))); \
- (x) = *(__force __typeof__(*(ptr)) *) &__x; \
- break; \
- }; \
- case 8: { \
- unsigned long long __x = 0; \
- __gu_err = __get_user_fn(&__x, ptr, \
- sizeof(*(ptr))); \
- (x) = *(__force __typeof__(*(ptr)) *) &__x; \
- break; \
- }; \
- default: \
- __get_user_bad(); \
- break; \
- } \
- __builtin_expect(__gu_err, 0); \
+#define __get_user(x, ptr) \
+({ \
+ int __gu_err = -EFAULT; \
+ \
+ __chk_user_ptr(ptr); \
+ switch (sizeof(*(ptr))) { \
+ case 1: { \
+ unsigned char __x; \
+ \
+ __gu_err = __get_user_fn(&__x, ptr, sizeof(*(ptr))); \
+ (x) = *(__force __typeof__(*(ptr)) *)&__x; \
+ break; \
+ }; \
+ case 2: { \
+ unsigned short __x; \
+ \
+ __gu_err = __get_user_fn(&__x, ptr, sizeof(*(ptr))); \
+ (x) = *(__force __typeof__(*(ptr)) *)&__x; \
+ break; \
+ }; \
+ case 4: { \
+ unsigned int __x; \
+ \
+ __gu_err = __get_user_fn(&__x, ptr, sizeof(*(ptr))); \
+ (x) = *(__force __typeof__(*(ptr)) *)&__x; \
+ break; \
+ }; \
+ case 8: { \
+ unsigned long __x; \
+ \
+ __gu_err = __get_user_fn(&__x, ptr, sizeof(*(ptr))); \
+ (x) = *(__force __typeof__(*(ptr)) *)&__x; \
+ break; \
+ }; \
+ default: \
+ __get_user_bad(); \
+ break; \
+ } \
+ __builtin_expect(__gu_err, 0); \
})
-#define get_user(x, ptr) \
-({ \
- might_fault(); \
- __get_user(x, ptr); \
+#define get_user(x, ptr) \
+({ \
+ might_fault(); \
+ __get_user(x, ptr); \
})
-int __get_user_bad(void) __attribute__((noreturn));
-
-unsigned long __must_check
-raw_copy_in_user(void __user *to, const void __user *from, unsigned long n);
-
/*
* Copy a null terminated string from userspace.
*/
+long __must_check strncpy_from_user(char *dst, const char __user *src, long count);
-long __strncpy_from_user(char *dst, const char __user *src, long count);
-
-static inline long __must_check
-strncpy_from_user(char *dst, const char __user *src, long count)
-{
- might_fault();
- return __strncpy_from_user(dst, src, count);
-}
-
-unsigned long __must_check __strnlen_user(const char __user *src, unsigned long count);
-
-static inline unsigned long strnlen_user(const char __user *src, unsigned long n)
-{
- might_fault();
- return __strnlen_user(src, n);
-}
+long __must_check strnlen_user(const char __user *src, long count);
/*
* Zero Userspace
@@ -275,7 +284,317 @@ static inline unsigned long __must_check clear_user(void __user *to, unsigned lo
return __clear_user(to, n);
}
-int copy_to_user_real(void __user *dest, void *src, unsigned long count);
-void s390_kernel_write(void *dst, const void *src, size_t size);
+void *s390_kernel_write(void *dst, const void *src, size_t size);
+
+int __noreturn __put_kernel_bad(void);
+
+#define __put_kernel_asm(val, to, insn) \
+({ \
+ int __rc; \
+ \
+ asm volatile( \
+ "0: " insn " %[_val],%[_to]\n" \
+ "1: xr %[rc],%[rc]\n" \
+ "2:\n" \
+ EX_TABLE_UA_STORE(0b, 2b, %[rc]) \
+ EX_TABLE_UA_STORE(1b, 2b, %[rc]) \
+ : [rc] "=d" (__rc), [_to] "+Q" (*(to)) \
+ : [_val] "d" (val) \
+ : "cc"); \
+ __rc; \
+})
+
+#define __put_kernel_nofault(dst, src, type, err_label) \
+do { \
+ unsigned long __x = (unsigned long)(*((type *)(src))); \
+ int __pk_err; \
+ \
+ switch (sizeof(type)) { \
+ case 1: \
+ __pk_err = __put_kernel_asm(__x, (type *)(dst), "stc"); \
+ break; \
+ case 2: \
+ __pk_err = __put_kernel_asm(__x, (type *)(dst), "sth"); \
+ break; \
+ case 4: \
+ __pk_err = __put_kernel_asm(__x, (type *)(dst), "st"); \
+ break; \
+ case 8: \
+ __pk_err = __put_kernel_asm(__x, (type *)(dst), "stg"); \
+ break; \
+ default: \
+ __pk_err = __put_kernel_bad(); \
+ break; \
+ } \
+ if (unlikely(__pk_err)) \
+ goto err_label; \
+} while (0)
+
+int __noreturn __get_kernel_bad(void);
+
+#define __get_kernel_asm(val, from, insn) \
+({ \
+ int __rc; \
+ \
+ asm volatile( \
+ "0: " insn " %[_val],%[_from]\n" \
+ "1: xr %[rc],%[rc]\n" \
+ "2:\n" \
+ EX_TABLE_UA_LOAD_REG(0b, 2b, %[rc], %[_val]) \
+ EX_TABLE_UA_LOAD_REG(1b, 2b, %[rc], %[_val]) \
+ : [rc] "=d" (__rc), [_val] "=d" (val) \
+ : [_from] "Q" (*(from)) \
+ : "cc"); \
+ __rc; \
+})
+
+#define __get_kernel_nofault(dst, src, type, err_label) \
+do { \
+ int __gk_err; \
+ \
+ switch (sizeof(type)) { \
+ case 1: { \
+ unsigned char __x; \
+ \
+ __gk_err = __get_kernel_asm(__x, (type *)(src), "ic"); \
+ *((type *)(dst)) = (type)__x; \
+ break; \
+ }; \
+ case 2: { \
+ unsigned short __x; \
+ \
+ __gk_err = __get_kernel_asm(__x, (type *)(src), "lh"); \
+ *((type *)(dst)) = (type)__x; \
+ break; \
+ }; \
+ case 4: { \
+ unsigned int __x; \
+ \
+ __gk_err = __get_kernel_asm(__x, (type *)(src), "l"); \
+ *((type *)(dst)) = (type)__x; \
+ break; \
+ }; \
+ case 8: { \
+ unsigned long __x; \
+ \
+ __gk_err = __get_kernel_asm(__x, (type *)(src), "lg"); \
+ *((type *)(dst)) = (type)__x; \
+ break; \
+ }; \
+ default: \
+ __gk_err = __get_kernel_bad(); \
+ break; \
+ } \
+ if (unlikely(__gk_err)) \
+ goto err_label; \
+} while (0)
+
+void __cmpxchg_user_key_called_with_bad_pointer(void);
+
+#define CMPXCHG_USER_KEY_MAX_LOOPS 128
+
+static __always_inline int __cmpxchg_user_key(unsigned long address, void *uval,
+ __uint128_t old, __uint128_t new,
+ unsigned long key, int size)
+{
+ int rc = 0;
+
+ switch (size) {
+ case 1: {
+ unsigned int prev, shift, mask, _old, _new;
+ unsigned long count;
+
+ shift = (3 ^ (address & 3)) << 3;
+ address ^= address & 3;
+ _old = ((unsigned int)old & 0xff) << shift;
+ _new = ((unsigned int)new & 0xff) << shift;
+ mask = ~(0xff << shift);
+ asm volatile(
+ " spka 0(%[key])\n"
+ " sacf 256\n"
+ " llill %[count],%[max_loops]\n"
+ "0: l %[prev],%[address]\n"
+ "1: nr %[prev],%[mask]\n"
+ " xilf %[mask],0xffffffff\n"
+ " or %[new],%[prev]\n"
+ " or %[prev],%[tmp]\n"
+ "2: lr %[tmp],%[prev]\n"
+ "3: cs %[prev],%[new],%[address]\n"
+ "4: jnl 5f\n"
+ " xr %[tmp],%[prev]\n"
+ " xr %[new],%[tmp]\n"
+ " nr %[tmp],%[mask]\n"
+ " jnz 5f\n"
+ " brct %[count],2b\n"
+ "5: sacf 768\n"
+ " spka %[default_key]\n"
+ EX_TABLE_UA_LOAD_REG(0b, 5b, %[rc], %[prev])
+ EX_TABLE_UA_LOAD_REG(1b, 5b, %[rc], %[prev])
+ EX_TABLE_UA_LOAD_REG(3b, 5b, %[rc], %[prev])
+ EX_TABLE_UA_LOAD_REG(4b, 5b, %[rc], %[prev])
+ : [rc] "+&d" (rc),
+ [prev] "=&d" (prev),
+ [address] "+Q" (*(int *)address),
+ [tmp] "+&d" (_old),
+ [new] "+&d" (_new),
+ [mask] "+&d" (mask),
+ [count] "=a" (count)
+ : [key] "%[count]" (key << 4),
+ [default_key] "J" (PAGE_DEFAULT_KEY),
+ [max_loops] "J" (CMPXCHG_USER_KEY_MAX_LOOPS)
+ : "memory", "cc");
+ *(unsigned char *)uval = prev >> shift;
+ if (!count)
+ rc = -EAGAIN;
+ return rc;
+ }
+ case 2: {
+ unsigned int prev, shift, mask, _old, _new;
+ unsigned long count;
+
+ shift = (2 ^ (address & 2)) << 3;
+ address ^= address & 2;
+ _old = ((unsigned int)old & 0xffff) << shift;
+ _new = ((unsigned int)new & 0xffff) << shift;
+ mask = ~(0xffff << shift);
+ asm volatile(
+ " spka 0(%[key])\n"
+ " sacf 256\n"
+ " llill %[count],%[max_loops]\n"
+ "0: l %[prev],%[address]\n"
+ "1: nr %[prev],%[mask]\n"
+ " xilf %[mask],0xffffffff\n"
+ " or %[new],%[prev]\n"
+ " or %[prev],%[tmp]\n"
+ "2: lr %[tmp],%[prev]\n"
+ "3: cs %[prev],%[new],%[address]\n"
+ "4: jnl 5f\n"
+ " xr %[tmp],%[prev]\n"
+ " xr %[new],%[tmp]\n"
+ " nr %[tmp],%[mask]\n"
+ " jnz 5f\n"
+ " brct %[count],2b\n"
+ "5: sacf 768\n"
+ " spka %[default_key]\n"
+ EX_TABLE_UA_LOAD_REG(0b, 5b, %[rc], %[prev])
+ EX_TABLE_UA_LOAD_REG(1b, 5b, %[rc], %[prev])
+ EX_TABLE_UA_LOAD_REG(3b, 5b, %[rc], %[prev])
+ EX_TABLE_UA_LOAD_REG(4b, 5b, %[rc], %[prev])
+ : [rc] "+&d" (rc),
+ [prev] "=&d" (prev),
+ [address] "+Q" (*(int *)address),
+ [tmp] "+&d" (_old),
+ [new] "+&d" (_new),
+ [mask] "+&d" (mask),
+ [count] "=a" (count)
+ : [key] "%[count]" (key << 4),
+ [default_key] "J" (PAGE_DEFAULT_KEY),
+ [max_loops] "J" (CMPXCHG_USER_KEY_MAX_LOOPS)
+ : "memory", "cc");
+ *(unsigned short *)uval = prev >> shift;
+ if (!count)
+ rc = -EAGAIN;
+ return rc;
+ }
+ case 4: {
+ unsigned int prev = old;
+
+ asm volatile(
+ " spka 0(%[key])\n"
+ " sacf 256\n"
+ "0: cs %[prev],%[new],%[address]\n"
+ "1: sacf 768\n"
+ " spka %[default_key]\n"
+ EX_TABLE_UA_LOAD_REG(0b, 1b, %[rc], %[prev])
+ EX_TABLE_UA_LOAD_REG(1b, 1b, %[rc], %[prev])
+ : [rc] "+&d" (rc),
+ [prev] "+&d" (prev),
+ [address] "+Q" (*(int *)address)
+ : [new] "d" ((unsigned int)new),
+ [key] "a" (key << 4),
+ [default_key] "J" (PAGE_DEFAULT_KEY)
+ : "memory", "cc");
+ *(unsigned int *)uval = prev;
+ return rc;
+ }
+ case 8: {
+ unsigned long prev = old;
+
+ asm volatile(
+ " spka 0(%[key])\n"
+ " sacf 256\n"
+ "0: csg %[prev],%[new],%[address]\n"
+ "1: sacf 768\n"
+ " spka %[default_key]\n"
+ EX_TABLE_UA_LOAD_REG(0b, 1b, %[rc], %[prev])
+ EX_TABLE_UA_LOAD_REG(1b, 1b, %[rc], %[prev])
+ : [rc] "+&d" (rc),
+ [prev] "+&d" (prev),
+ [address] "+QS" (*(long *)address)
+ : [new] "d" ((unsigned long)new),
+ [key] "a" (key << 4),
+ [default_key] "J" (PAGE_DEFAULT_KEY)
+ : "memory", "cc");
+ *(unsigned long *)uval = prev;
+ return rc;
+ }
+ case 16: {
+ __uint128_t prev = old;
+
+ asm volatile(
+ " spka 0(%[key])\n"
+ " sacf 256\n"
+ "0: cdsg %[prev],%[new],%[address]\n"
+ "1: sacf 768\n"
+ " spka %[default_key]\n"
+ EX_TABLE_UA_LOAD_REGPAIR(0b, 1b, %[rc], %[prev])
+ EX_TABLE_UA_LOAD_REGPAIR(1b, 1b, %[rc], %[prev])
+ : [rc] "+&d" (rc),
+ [prev] "+&d" (prev),
+ [address] "+QS" (*(__int128_t *)address)
+ : [new] "d" (new),
+ [key] "a" (key << 4),
+ [default_key] "J" (PAGE_DEFAULT_KEY)
+ : "memory", "cc");
+ *(__uint128_t *)uval = prev;
+ return rc;
+ }
+ }
+ __cmpxchg_user_key_called_with_bad_pointer();
+ return rc;
+}
+
+/**
+ * cmpxchg_user_key() - cmpxchg with user space target, honoring storage keys
+ * @ptr: User space address of value to compare to @old and exchange with
+ * @new. Must be aligned to sizeof(*@ptr).
+ * @uval: Address where the old value of *@ptr is written to.
+ * @old: Old value. Compared to the content pointed to by @ptr in order to
+ * determine if the exchange occurs. The old value read from *@ptr is
+ * written to *@uval.
+ * @new: New value to place at *@ptr.
+ * @key: Access key to use for checking storage key protection.
+ *
+ * Perform a cmpxchg on a user space target, honoring storage key protection.
+ * @key alone determines how key checking is performed, neither
+ * storage-protection-override nor fetch-protection-override apply.
+ * The caller must compare *@uval and @old to determine if values have been
+ * exchanged. In case of an exception *@uval is set to zero.
+ *
+ * Return: 0: cmpxchg executed
+ * -EFAULT: an exception happened when trying to access *@ptr
+ * -EAGAIN: maxed out number of retries (byte and short only)
+ */
+#define cmpxchg_user_key(ptr, uval, old, new, key) \
+({ \
+ __typeof__(ptr) __ptr = (ptr); \
+ __typeof__(uval) __uval = (uval); \
+ \
+ BUILD_BUG_ON(sizeof(*(__ptr)) != sizeof(*(__uval))); \
+ might_fault(); \
+ __chk_user_ptr(__ptr); \
+ __cmpxchg_user_key((unsigned long)(__ptr), (void *)(__uval), \
+ (old), (new), (key), sizeof(*(__ptr))); \
+})
#endif /* __S390_UACCESS_H */
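
Note: putting the cmpxchg_user_key() kernel-doc above into practice, a hedged usage sketch: the caller passes the access key for the target address space, checks the return code, and compares *uval with old to find out whether the exchange actually happened. Everything except cmpxchg_user_key() itself is illustrative:

	static int demo_swap_word(unsigned int __user *ptr, unsigned int old,
				  unsigned int new, unsigned char access_key)
	{
		unsigned int uval;
		int rc;

		rc = cmpxchg_user_key(ptr, &uval, old, new, access_key);
		if (rc == -EFAULT)
			return rc;	/* *ptr was not accessible */
		if (rc == -EAGAIN)
			return rc;	/* retry limit hit; only possible for byte/short sizes */
		if (uval != old)
			return -EBUSY;	/* somebody else changed the value first (illustrative rc) */
		return 0;		/* exchange done */
	}
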
diff --git a/arch/s390/include/asm/unistd.h b/arch/s390/include/asm/unistd.h
index 9e9f75ef046a..4260bc5ce7f8 100644
--- a/arch/s390/include/asm/unistd.h
+++ b/arch/s390/include/asm/unistd.h
@@ -28,6 +28,7 @@
#define __ARCH_WANT_SYS_SIGPENDING
#define __ARCH_WANT_SYS_SIGPROCMASK
# ifdef CONFIG_COMPAT
+# define __ARCH_WANT_COMPAT_STAT
# define __ARCH_WANT_SYS_TIME32
# define __ARCH_WANT_SYS_UTIME32
# endif
diff --git a/arch/s390/include/asm/unwind.h b/arch/s390/include/asm/unwind.h
index de9006b0cfeb..b8ecf04e3468 100644
--- a/arch/s390/include/asm/unwind.h
+++ b/arch/s390/include/asm/unwind.h
@@ -4,6 +4,8 @@
#include <linux/sched.h>
#include <linux/ftrace.h>
+#include <linux/rethook.h>
+#include <linux/llist.h>
#include <asm/ptrace.h>
#include <asm/stacktrace.h>
@@ -36,10 +38,23 @@ struct unwind_state {
struct pt_regs *regs;
unsigned long sp, ip;
int graph_idx;
+ struct llist_node *kr_cur;
bool reliable;
bool error;
};
+/* Recover the return address modified by rethook and ftrace_graph. */
+static inline unsigned long unwind_recover_ret_addr(struct unwind_state *state,
+ unsigned long ip)
+{
+ ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, ip, (void *)state->sp);
+#ifdef CONFIG_RETHOOK
+ if (is_rethook_trampoline(ip))
+ ip = rethook_find_ret_addr(state->task, state->sp, &state->kr_cur);
+#endif
+ return ip;
+}
+
void __unwind_start(struct unwind_state *state, struct task_struct *task,
struct pt_regs *regs, unsigned long first_frame);
bool unwind_next_frame(struct unwind_state *state);
@@ -55,10 +70,10 @@ static inline bool unwind_error(struct unwind_state *state)
return state->error;
}
-static inline void unwind_start(struct unwind_state *state,
- struct task_struct *task,
- struct pt_regs *regs,
- unsigned long first_frame)
+static __always_inline void unwind_start(struct unwind_state *state,
+ struct task_struct *task,
+ struct pt_regs *regs,
+ unsigned long first_frame)
{
task = task ?: current;
first_frame = first_frame ?: get_stack_pointer(task, regs);
diff --git a/arch/s390/include/asm/user.h b/arch/s390/include/asm/user.h
index 0ca572ced21b..8e8aaf48582e 100644
--- a/arch/s390/include/asm/user.h
+++ b/arch/s390/include/asm/user.h
@@ -67,9 +67,5 @@ struct user {
unsigned long magic; /* To uniquely identify a core file */
char u_comm[32]; /* User command that was responsible */
};
-#define NBPG PAGE_SIZE
-#define UPAGES 1
-#define HOST_TEXT_START_ADDR (u.start_code)
-#define HOST_STACK_END_ADDR (u.start_stack + u.u_ssize * NBPG)
#endif /* _S390_USER_H */
diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h
index ef3c00b049ab..0e7bd3873907 100644
--- a/arch/s390/include/asm/uv.h
+++ b/arch/s390/include/asm/uv.h
@@ -2,7 +2,7 @@
/*
* Ultravisor Interfaces
*
- * Copyright IBM Corp. 2019
+ * Copyright IBM Corp. 2019, 2022
*
* Author(s):
* Vasily Gorbik <gor@linux.ibm.com>
@@ -14,23 +14,93 @@
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/bug.h>
+#include <linux/sched.h>
#include <asm/page.h>
+#include <asm/gmap.h>
+
+#define UVC_CC_OK 0
+#define UVC_CC_ERROR 1
+#define UVC_CC_BUSY 2
+#define UVC_CC_PARTIAL 3
#define UVC_RC_EXECUTED 0x0001
#define UVC_RC_INV_CMD 0x0002
#define UVC_RC_INV_STATE 0x0003
#define UVC_RC_INV_LEN 0x0005
#define UVC_RC_NO_RESUME 0x0007
+#define UVC_RC_NEED_DESTROY 0x8000
#define UVC_CMD_QUI 0x0001
+#define UVC_CMD_INIT_UV 0x000f
+#define UVC_CMD_CREATE_SEC_CONF 0x0100
+#define UVC_CMD_DESTROY_SEC_CONF 0x0101
+#define UVC_CMD_DESTROY_SEC_CONF_FAST 0x0102
+#define UVC_CMD_CREATE_SEC_CPU 0x0120
+#define UVC_CMD_DESTROY_SEC_CPU 0x0121
+#define UVC_CMD_CONV_TO_SEC_STOR 0x0200
+#define UVC_CMD_CONV_FROM_SEC_STOR 0x0201
+#define UVC_CMD_DESTR_SEC_STOR 0x0202
+#define UVC_CMD_SET_SEC_CONF_PARAMS 0x0300
+#define UVC_CMD_UNPACK_IMG 0x0301
+#define UVC_CMD_VERIFY_IMG 0x0302
+#define UVC_CMD_CPU_RESET 0x0310
+#define UVC_CMD_CPU_RESET_INITIAL 0x0311
+#define UVC_CMD_PREPARE_RESET 0x0320
+#define UVC_CMD_CPU_RESET_CLEAR 0x0321
+#define UVC_CMD_CPU_SET_STATE 0x0330
+#define UVC_CMD_SET_UNSHARE_ALL 0x0340
+#define UVC_CMD_PIN_PAGE_SHARED 0x0341
+#define UVC_CMD_UNPIN_PAGE_SHARED 0x0342
+#define UVC_CMD_DUMP_INIT 0x0400
+#define UVC_CMD_DUMP_CONF_STOR_STATE 0x0401
+#define UVC_CMD_DUMP_CPU 0x0402
+#define UVC_CMD_DUMP_COMPLETE 0x0403
#define UVC_CMD_SET_SHARED_ACCESS 0x1000
#define UVC_CMD_REMOVE_SHARED_ACCESS 0x1001
+#define UVC_CMD_RETR_ATTEST 0x1020
+#define UVC_CMD_ADD_SECRET 0x1031
+#define UVC_CMD_LIST_SECRETS 0x1033
+#define UVC_CMD_LOCK_SECRETS 0x1034
/* Bits in installed uv calls */
enum uv_cmds_inst {
BIT_UVC_CMD_QUI = 0,
+ BIT_UVC_CMD_INIT_UV = 1,
+ BIT_UVC_CMD_CREATE_SEC_CONF = 2,
+ BIT_UVC_CMD_DESTROY_SEC_CONF = 3,
+ BIT_UVC_CMD_CREATE_SEC_CPU = 4,
+ BIT_UVC_CMD_DESTROY_SEC_CPU = 5,
+ BIT_UVC_CMD_CONV_TO_SEC_STOR = 6,
+ BIT_UVC_CMD_CONV_FROM_SEC_STOR = 7,
BIT_UVC_CMD_SET_SHARED_ACCESS = 8,
BIT_UVC_CMD_REMOVE_SHARED_ACCESS = 9,
+ BIT_UVC_CMD_SET_SEC_PARMS = 11,
+ BIT_UVC_CMD_UNPACK_IMG = 13,
+ BIT_UVC_CMD_VERIFY_IMG = 14,
+ BIT_UVC_CMD_CPU_RESET = 15,
+ BIT_UVC_CMD_CPU_RESET_INITIAL = 16,
+ BIT_UVC_CMD_CPU_SET_STATE = 17,
+ BIT_UVC_CMD_PREPARE_RESET = 18,
+ BIT_UVC_CMD_CPU_PERFORM_CLEAR_RESET = 19,
+ BIT_UVC_CMD_UNSHARE_ALL = 20,
+ BIT_UVC_CMD_PIN_PAGE_SHARED = 21,
+ BIT_UVC_CMD_UNPIN_PAGE_SHARED = 22,
+ BIT_UVC_CMD_DESTROY_SEC_CONF_FAST = 23,
+ BIT_UVC_CMD_DUMP_INIT = 24,
+ BIT_UVC_CMD_DUMP_CONFIG_STOR_STATE = 25,
+ BIT_UVC_CMD_DUMP_CPU = 26,
+ BIT_UVC_CMD_DUMP_COMPLETE = 27,
+ BIT_UVC_CMD_RETR_ATTEST = 28,
+ BIT_UVC_CMD_ADD_SECRET = 29,
+ BIT_UVC_CMD_LIST_SECRETS = 30,
+ BIT_UVC_CMD_LOCK_SECRETS = 31,
+};
+
+enum uv_feat_ind {
+ BIT_UV_FEAT_MISC = 0,
+ BIT_UV_FEAT_AIV = 1,
+ BIT_UV_FEAT_AP = 4,
+ BIT_UV_FEAT_AP_INTR = 5,
};
struct uv_cb_header {
@@ -40,13 +110,158 @@ struct uv_cb_header {
u16 rrc; /* Return Reason Code */
} __packed __aligned(8);
+/* Query Ultravisor Information */
struct uv_cb_qui {
+ struct uv_cb_header header; /* 0x0000 */
+ u64 reserved08; /* 0x0008 */
+ u64 inst_calls_list[4]; /* 0x0010 */
+ u64 reserved30[2]; /* 0x0030 */
+ u64 uv_base_stor_len; /* 0x0040 */
+ u64 reserved48; /* 0x0048 */
+ u64 conf_base_phys_stor_len; /* 0x0050 */
+ u64 conf_base_virt_stor_len; /* 0x0058 */
+ u64 conf_virt_var_stor_len; /* 0x0060 */
+ u64 cpu_stor_len; /* 0x0068 */
+ u32 reserved70[3]; /* 0x0070 */
+ u32 max_num_sec_conf; /* 0x007c */
+ u64 max_guest_stor_addr; /* 0x0080 */
+ u8 reserved88[0x9e - 0x88]; /* 0x0088 */
+ u16 max_guest_cpu_id; /* 0x009e */
+ u64 uv_feature_indications; /* 0x00a0 */
+ u64 reserveda8; /* 0x00a8 */
+ u64 supp_se_hdr_versions; /* 0x00b0 */
+ u64 supp_se_hdr_pcf; /* 0x00b8 */
+ u64 reservedc0; /* 0x00c0 */
+ u64 conf_dump_storage_state_len; /* 0x00c8 */
+ u64 conf_dump_finalize_len; /* 0x00d0 */
+ u64 reservedd8; /* 0x00d8 */
+ u64 supp_att_req_hdr_ver; /* 0x00e0 */
+ u64 supp_att_pflags; /* 0x00e8 */
+ u64 reservedf0; /* 0x00f0 */
+ u64 supp_add_secret_req_ver; /* 0x00f8 */
+ u64 supp_add_secret_pcf; /* 0x0100 */
+ u64 supp_secret_types; /* 0x0108 */
+ u16 max_secrets; /* 0x0110 */
+ u8 reserved112[0x120 - 0x112]; /* 0x0112 */
+} __packed __aligned(8);
+
+/* Initialize Ultravisor */
+struct uv_cb_init {
+ struct uv_cb_header header;
+ u64 reserved08[2];
+ u64 stor_origin;
+ u64 stor_len;
+ u64 reserved28[4];
+} __packed __aligned(8);
+
+/* Create Guest Configuration */
+struct uv_cb_cgc {
+ struct uv_cb_header header;
+ u64 reserved08[2];
+ u64 guest_handle;
+ u64 conf_base_stor_origin;
+ u64 conf_virt_stor_origin;
+ u8 reserved30[6];
+ union {
+ struct {
+ u16 : 14;
+ u16 ap_instr_intr : 1;
+ u16 ap_allow_instr : 1;
+ };
+ u16 raw;
+ } flags;
+ u64 guest_stor_origin;
+ u64 guest_stor_len;
+ u64 guest_sca;
+ u64 guest_asce;
+ u64 reserved58[5];
+} __packed __aligned(8);
+
+/* Create Secure CPU */
+struct uv_cb_csc {
+ struct uv_cb_header header;
+ u64 reserved08[2];
+ u64 cpu_handle;
+ u64 guest_handle;
+ u64 stor_origin;
+ u8 reserved30[6];
+ u16 num;
+ u64 state_origin;
+ u64 reserved40[4];
+} __packed __aligned(8);
+
+/* Convert to Secure */
+struct uv_cb_cts {
+ struct uv_cb_header header;
+ u64 reserved08[2];
+ u64 guest_handle;
+ u64 gaddr;
+} __packed __aligned(8);
+
+/* Convert from Secure / Pin Page Shared */
+struct uv_cb_cfs {
+ struct uv_cb_header header;
+ u64 reserved08[2];
+ u64 paddr;
+} __packed __aligned(8);
+
+/* Set Secure Config Parameter */
+struct uv_cb_ssc {
+ struct uv_cb_header header;
+ u64 reserved08[2];
+ u64 guest_handle;
+ u64 sec_header_origin;
+ u32 sec_header_len;
+ u32 reserved2c;
+ u64 reserved30[4];
+} __packed __aligned(8);
+
+/* Unpack */
+struct uv_cb_unp {
+ struct uv_cb_header header;
+ u64 reserved08[2];
+ u64 guest_handle;
+ u64 gaddr;
+ u64 tweak[2];
+ u64 reserved38[3];
+} __packed __aligned(8);
+
+#define PV_CPU_STATE_OPR 1
+#define PV_CPU_STATE_STP 2
+#define PV_CPU_STATE_CHKSTP 3
+#define PV_CPU_STATE_OPR_LOAD 5
+
+struct uv_cb_cpu_set_state {
+ struct uv_cb_header header;
+ u64 reserved08[2];
+ u64 cpu_handle;
+ u8 reserved20[7];
+ u8 state;
+ u64 reserved28[5];
+};
+
+/*
+ * A common UV call struct for calls that take no payload
+ * Examples:
+ * Destroy cpu/config
+ * Verify
+ */
+struct uv_cb_nodata {
struct uv_cb_header header;
- u64 reserved08;
- u64 inst_calls_list[4];
- u64 reserved30[15];
+ u64 reserved08[2];
+ u64 handle;
+ u64 reserved20[4];
} __packed __aligned(8);
+/* Destroy Configuration Fast */
+struct uv_cb_destroy_fast {
+ struct uv_cb_header header;
+ u64 reserved08[2];
+ u64 handle;
+ u64 reserved20[5];
+} __packed __aligned(8);
+
+/* Set Shared Access */
struct uv_cb_share {
struct uv_cb_header header;
u64 reserved08[3];
@@ -54,21 +269,151 @@ struct uv_cb_share {
u64 reserved28;
} __packed __aligned(8);
-static inline int uv_call(unsigned long r1, unsigned long r2)
+/* Retrieve Attestation Measurement */
+struct uv_cb_attest {
+ struct uv_cb_header header; /* 0x0000 */
+ u64 reserved08[2]; /* 0x0008 */
+ u64 arcb_addr; /* 0x0018 */
+ u64 cont_token; /* 0x0020 */
+ u8 reserved28[6]; /* 0x0028 */
+ u16 user_data_len; /* 0x002e */
+ u8 user_data[256]; /* 0x0030 */
+ u32 reserved130[3]; /* 0x0130 */
+ u32 meas_len; /* 0x013c */
+ u64 meas_addr; /* 0x0140 */
+ u8 config_uid[16]; /* 0x0148 */
+ u32 reserved158; /* 0x0158 */
+ u32 add_data_len; /* 0x015c */
+ u64 add_data_addr; /* 0x0160 */
+ u64 reserved168[4]; /* 0x0168 */
+} __packed __aligned(8);
+
+struct uv_cb_dump_cpu {
+ struct uv_cb_header header;
+ u64 reserved08[2];
+ u64 cpu_handle;
+ u64 dump_area_origin;
+ u64 reserved28[5];
+} __packed __aligned(8);
+
+struct uv_cb_dump_stor_state {
+ struct uv_cb_header header;
+ u64 reserved08[2];
+ u64 config_handle;
+ u64 dump_area_origin;
+ u64 gaddr;
+ u64 reserved28[4];
+} __packed __aligned(8);
+
+struct uv_cb_dump_complete {
+ struct uv_cb_header header;
+ u64 reserved08[2];
+ u64 config_handle;
+ u64 dump_area_origin;
+ u64 reserved30[5];
+} __packed __aligned(8);
+
+/*
+ * A common UV call struct for pv guests that contains a single address
+ * Examples:
+ * Add Secret
+ * List Secrets
+ */
+struct uv_cb_guest_addr {
+ struct uv_cb_header header;
+ u64 reserved08[3];
+ u64 addr;
+ u64 reserved28[4];
+} __packed __aligned(8);
+
+static inline int __uv_call(unsigned long r1, unsigned long r2)
{
int cc;
asm volatile(
- "0: .insn rrf,0xB9A40000,%[r1],%[r2],0,0\n"
- " brc 3,0b\n"
- " ipm %[cc]\n"
- " srl %[cc],28\n"
+ " .insn rrf,0xB9A40000,%[r1],%[r2],0,0\n"
+ " ipm %[cc]\n"
+ " srl %[cc],28\n"
: [cc] "=d" (cc)
: [r1] "a" (r1), [r2] "a" (r2)
: "memory", "cc");
return cc;
}
+static inline int uv_call(unsigned long r1, unsigned long r2)
+{
+ int cc;
+
+ do {
+ cc = __uv_call(r1, r2);
+ } while (cc > 1);
+ return cc;
+}
+
+/* Low level uv_call that avoids stalls for long running busy conditions */
+static inline int uv_call_sched(unsigned long r1, unsigned long r2)
+{
+ int cc;
+
+ do {
+ cc = __uv_call(r1, r2);
+ cond_resched();
+ } while (cc > 1);
+ return cc;
+}
+
+/*
+ * special variant of uv_call that only transports the cpu or guest
+ * handle and the command, like destroy or verify.
+ */
+static inline int uv_cmd_nodata(u64 handle, u16 cmd, u16 *rc, u16 *rrc)
+{
+ struct uv_cb_nodata uvcb = {
+ .header.cmd = cmd,
+ .header.len = sizeof(uvcb),
+ .handle = handle,
+ };
+ int cc;
+
+ WARN(!handle, "No handle provided to Ultravisor call cmd %x\n", cmd);
+ cc = uv_call_sched(0, (u64)&uvcb);
+ *rc = uvcb.header.rc;
+ *rrc = uvcb.header.rrc;
+ return cc ? -EINVAL : 0;
+}
+
+struct uv_info {
+ unsigned long inst_calls_list[4];
+ unsigned long uv_base_stor_len;
+ unsigned long guest_base_stor_len;
+ unsigned long guest_virt_base_stor_len;
+ unsigned long guest_virt_var_stor_len;
+ unsigned long guest_cpu_stor_len;
+ unsigned long max_sec_stor_addr;
+ unsigned int max_num_sec_conf;
+ unsigned short max_guest_cpu_id;
+ unsigned long uv_feature_indications;
+ unsigned long supp_se_hdr_ver;
+ unsigned long supp_se_hdr_pcf;
+ unsigned long conf_dump_storage_state_len;
+ unsigned long conf_dump_finalize_len;
+ unsigned long supp_att_req_hdr_ver;
+ unsigned long supp_att_pflags;
+ unsigned long supp_add_secret_req_ver;
+ unsigned long supp_add_secret_pcf;
+ unsigned long supp_secret_types;
+ unsigned short max_secrets;
+};
+
+extern struct uv_info uv_info;
+
+static inline bool uv_has_feature(u8 feature_bit)
+{
+ if (feature_bit >= sizeof(uv_info.uv_feature_indications) * 8)
+ return false;
+ return test_bit_inv(feature_bit, &uv_info.uv_feature_indications);
+}
+
#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
extern int prot_virt_guest;
@@ -86,7 +431,7 @@ static inline int share(unsigned long addr, u16 cmd)
};
if (!is_prot_virt_guest())
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
/*
* Sharing is page wise, if we encounter addresses that are
* not page aligned, we assume something went wrong. If
@@ -121,12 +466,52 @@ static inline int uv_remove_shared(unsigned long addr)
return share(addr, UVC_CMD_REMOVE_SHARED_ACCESS);
}
-void uv_query_info(void);
#else
#define is_prot_virt_guest() 0
static inline int uv_set_shared(unsigned long addr) { return 0; }
static inline int uv_remove_shared(unsigned long addr) { return 0; }
-static inline void uv_query_info(void) {}
+#endif
+
+#if IS_ENABLED(CONFIG_KVM)
+extern int prot_virt_host;
+
+static inline int is_prot_virt_host(void)
+{
+ return prot_virt_host;
+}
+
+int uv_pin_shared(unsigned long paddr);
+int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb);
+int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr);
+int uv_destroy_owned_page(unsigned long paddr);
+int uv_convert_from_secure(unsigned long paddr);
+int uv_convert_owned_from_secure(unsigned long paddr);
+int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr);
+
+void setup_uv(void);
+#else
+#define is_prot_virt_host() 0
+static inline void setup_uv(void) {}
+
+static inline int uv_pin_shared(unsigned long paddr)
+{
+ return 0;
+}
+
+static inline int uv_destroy_owned_page(unsigned long paddr)
+{
+ return 0;
+}
+
+static inline int uv_convert_from_secure(unsigned long paddr)
+{
+ return 0;
+}
+
+static inline int uv_convert_owned_from_secure(unsigned long paddr)
+{
+ return 0;
+}
#endif
#endif /* _ASM_S390_UV_H */
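
A minimal sketch of how the handle-only helper uv_cmd_nodata() above might be driven, assuming the UVC_CMD_VERIFY command constant defined earlier in this header; this is an illustration, not part of the patch, and error handling is reduced to reporting the UV return codes.

/* Hedged sketch: issue a handle-only UV command and report the UV codes. */
static int example_pv_verify(u64 config_handle)
{
	u16 rc, rrc;
	int ret;

	ret = uv_cmd_nodata(config_handle, UVC_CMD_VERIFY, &rc, &rrc);
	if (ret)
		pr_warn("VERIFY UVC failed: rc %x rrc %x\n", rc, rrc);
	return ret;
}
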
diff --git a/arch/s390/include/asm/vdso.h b/arch/s390/include/asm/vdso.h
index 3bcfdeb01395..53165aa7813a 100644
--- a/arch/s390/include/asm/vdso.h
+++ b/arch/s390/include/asm/vdso.h
@@ -2,64 +2,33 @@
#ifndef __S390_VDSO_H__
#define __S390_VDSO_H__
-/* Default link addresses for the vDSOs */
-#define VDSO32_LBASE 0
-#define VDSO64_LBASE 0
-
-#define VDSO_VERSION_STRING LINUX_2.6.29
+#include <vdso/datapage.h>
#ifndef __ASSEMBLY__
-/*
- * Note about the vdso_data and vdso_per_cpu_data structures:
- *
- * NEVER USE THEM IN USERSPACE CODE DIRECTLY. The layout of the
- * structure is supposed to be known only to the function in the vdso
- * itself and may change without notice.
- */
+#include <generated/vdso64-offsets.h>
+#ifdef CONFIG_COMPAT
+#include <generated/vdso32-offsets.h>
+#endif
-struct vdso_data {
- __u64 tb_update_count; /* Timebase atomicity ctr 0x00 */
- __u64 xtime_tod_stamp; /* TOD clock for xtime 0x08 */
- __u64 xtime_clock_sec; /* Kernel time 0x10 */
- __u64 xtime_clock_nsec; /* 0x18 */
- __u64 xtime_coarse_sec; /* Coarse kernel time 0x20 */
- __u64 xtime_coarse_nsec; /* 0x28 */
- __u64 wtom_clock_sec; /* Wall to monotonic clock 0x30 */
- __u64 wtom_clock_nsec; /* 0x38 */
- __u64 wtom_coarse_sec; /* Coarse wall to monotonic 0x40 */
- __u64 wtom_coarse_nsec; /* 0x48 */
- __u32 tz_minuteswest; /* Minutes west of Greenwich 0x50 */
- __u32 tz_dsttime; /* Type of dst correction 0x54 */
- __u32 ectg_available; /* ECTG instruction present 0x58 */
- __u32 tk_mult; /* Mult. used for xtime_nsec 0x5c */
- __u32 tk_shift; /* Shift used for xtime_nsec 0x60 */
- __u32 ts_dir; /* TOD steering direction 0x64 */
- __u64 ts_end; /* TOD steering end 0x68 */
-};
-
-struct vdso_per_cpu_data {
- __u64 ectg_timer_base;
- __u64 ectg_user_time;
- /*
- * Note: node_id and cpu_nr must be at adjacent memory locations.
- * VDSO userspace must read both values with a single instruction.
- */
- union {
- __u64 getcpu_val;
- struct {
- __u32 node_id;
- __u32 cpu_nr;
- };
- };
-};
+#define VDSO64_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso64_offset_##name))
+#ifdef CONFIG_COMPAT
+#define VDSO32_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso32_offset_##name))
+#else
+#define VDSO32_SYMBOL(tsk, name) (-1UL)
+#endif
extern struct vdso_data *vdso_data;
-extern struct vdso_data boot_vdso_data;
-void vdso_alloc_boot_cpu(struct lowcore *lowcore);
-int vdso_alloc_per_cpu(struct lowcore *lowcore);
-void vdso_free_per_cpu(struct lowcore *lowcore);
+int vdso_getcpu_init(void);
#endif /* __ASSEMBLY__ */
+
+/* Default link address for the vDSO */
+#define VDSO_LBASE 0
+
+#define __VVAR_PAGES 2
+
+#define VDSO_VERSION_STRING LINUX_2.6.29
+
#endif /* __S390_VDSO_H__ */
diff --git a/arch/s390/include/asm/vdso/clocksource.h b/arch/s390/include/asm/vdso/clocksource.h
new file mode 100644
index 000000000000..a93eda0ce7bb
--- /dev/null
+++ b/arch/s390/include/asm/vdso/clocksource.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_VDSO_CLOCKSOURCE_H
+#define __ASM_VDSO_CLOCKSOURCE_H
+
+#define VDSO_ARCH_CLOCKMODES \
+ VDSO_CLOCKMODE_TOD
+
+#endif /* __ASM_VDSO_CLOCKSOURCE_H */
diff --git a/arch/s390/include/asm/vdso/data.h b/arch/s390/include/asm/vdso/data.h
new file mode 100644
index 000000000000..73ee89142666
--- /dev/null
+++ b/arch/s390/include/asm/vdso/data.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __S390_ASM_VDSO_DATA_H
+#define __S390_ASM_VDSO_DATA_H
+
+#include <linux/types.h>
+#include <vdso/datapage.h>
+
+struct arch_vdso_data {
+ __s64 tod_steering_delta;
+ __u64 tod_steering_end;
+};
+
+#endif /* __S390_ASM_VDSO_DATA_H */
diff --git a/arch/s390/include/asm/vdso/gettimeofday.h b/arch/s390/include/asm/vdso/gettimeofday.h
new file mode 100644
index 000000000000..db84942eb78f
--- /dev/null
+++ b/arch/s390/include/asm/vdso/gettimeofday.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ASM_VDSO_GETTIMEOFDAY_H
+#define ASM_VDSO_GETTIMEOFDAY_H
+
+#define VDSO_HAS_TIME 1
+
+#define VDSO_HAS_CLOCK_GETRES 1
+
+#include <asm/syscall.h>
+#include <asm/timex.h>
+#include <asm/unistd.h>
+#include <linux/compiler.h>
+
+#define vdso_calc_delta __arch_vdso_calc_delta
+static __always_inline u64 __arch_vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
+{
+ return (cycles - last) * mult;
+}
+
+static __always_inline const struct vdso_data *__arch_get_vdso_data(void)
+{
+ return _vdso_data;
+}
+
+static inline u64 __arch_get_hw_counter(s32 clock_mode, const struct vdso_data *vd)
+{
+ u64 adj, now;
+
+ now = get_tod_clock();
+ adj = vd->arch_data.tod_steering_end - now;
+ if (unlikely((s64) adj > 0))
+ now += (vd->arch_data.tod_steering_delta < 0) ? (adj >> 15) : -(adj >> 15);
+ return now;
+}
+
+static __always_inline
+long clock_gettime_fallback(clockid_t clkid, struct __kernel_timespec *ts)
+{
+ return syscall2(__NR_clock_gettime, (long)clkid, (long)ts);
+}
+
+static __always_inline
+long gettimeofday_fallback(register struct __kernel_old_timeval *tv,
+ register struct timezone *tz)
+{
+ return syscall2(__NR_gettimeofday, (long)tv, (long)tz);
+}
+
+static __always_inline
+long clock_getres_fallback(clockid_t clkid, struct __kernel_timespec *ts)
+{
+ return syscall2(__NR_clock_getres, (long)clkid, (long)ts);
+}
+
+#ifdef CONFIG_TIME_NS
+static __always_inline
+const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd)
+{
+ return _timens_data;
+}
+#endif
+
+#endif
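
The steering logic in __arch_get_hw_counter() above is easier to see in isolation. The following sketch (not part of the patch) restates the adjustment: while the raw TOD value lies before tod_steering_end, 1/32768 of the remaining distance is added or subtracted depending on the sign of tod_steering_delta, so the correction fades out linearly towards the end of the steering interval.

/* Hedged sketch restating the TOD steering adjustment shown above. */
static inline u64 example_steered_tod(u64 raw_tod, u64 steering_end, s64 steering_delta)
{
	u64 adj = steering_end - raw_tod;

	if ((s64)adj <= 0)
		return raw_tod;		/* steering interval already elapsed */
	/* Apply 1/32768 of the remaining distance, direction per delta sign. */
	return steering_delta < 0 ? raw_tod + (adj >> 15) : raw_tod - (adj >> 15);
}
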
diff --git a/arch/s390/include/asm/vdso/processor.h b/arch/s390/include/asm/vdso/processor.h
new file mode 100644
index 000000000000..cfcc3e117c4c
--- /dev/null
+++ b/arch/s390/include/asm/vdso/processor.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __ASM_VDSO_PROCESSOR_H
+#define __ASM_VDSO_PROCESSOR_H
+
+#define cpu_relax() barrier()
+
+#endif /* __ASM_VDSO_PROCESSOR_H */
diff --git a/arch/s390/include/asm/vdso/vsyscall.h b/arch/s390/include/asm/vdso/vsyscall.h
new file mode 100644
index 000000000000..6c67c08cefdd
--- /dev/null
+++ b/arch/s390/include/asm/vdso/vsyscall.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_VDSO_VSYSCALL_H
+#define __ASM_VDSO_VSYSCALL_H
+
+#ifndef __ASSEMBLY__
+
+#include <linux/hrtimer.h>
+#include <linux/timekeeper_internal.h>
+#include <vdso/datapage.h>
+#include <asm/vdso.h>
+/*
+ * Update the vDSO data page to keep in sync with kernel timekeeping.
+ */
+
+static __always_inline struct vdso_data *__s390_get_k_vdso_data(void)
+{
+ return vdso_data;
+}
+#define __arch_get_k_vdso_data __s390_get_k_vdso_data
+
+/* The asm-generic header needs to be included after the definitions above */
+#include <asm-generic/vdso/vsyscall.h>
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* __ASM_VDSO_VSYSCALL_H */
diff --git a/arch/s390/include/asm/vga.h b/arch/s390/include/asm/vga.h
deleted file mode 100644
index 605dc46bac5e..000000000000
--- a/arch/s390/include/asm/vga.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_S390_VGA_H
-#define _ASM_S390_VGA_H
-
-/* Avoid compile errors due to missing asm/vga.h */
-
-#endif /* _ASM_S390_VGA_H */
diff --git a/arch/s390/include/asm/vmalloc.h b/arch/s390/include/asm/vmalloc.h
new file mode 100644
index 000000000000..3ba3a6bdca25
--- /dev/null
+++ b/arch/s390/include/asm/vmalloc.h
@@ -0,0 +1,4 @@
+#ifndef _ASM_S390_VMALLOC_H
+#define _ASM_S390_VMALLOC_H
+
+#endif /* _ASM_S390_VMALLOC_H */
diff --git a/arch/s390/include/asm/vtime.h b/arch/s390/include/asm/vtime.h
index 3622d4ebc73a..fe17e448c0c5 100644
--- a/arch/s390/include/asm/vtime.h
+++ b/arch/s390/include/asm/vtime.h
@@ -2,7 +2,20 @@
#ifndef _S390_VTIME_H
#define _S390_VTIME_H
-#define __ARCH_HAS_VTIME_ACCOUNT
#define __ARCH_HAS_VTIME_TASK_SWITCH
+static inline void update_timer_sys(void)
+{
+ S390_lowcore.system_timer += S390_lowcore.last_update_timer - S390_lowcore.exit_timer;
+ S390_lowcore.user_timer += S390_lowcore.exit_timer - S390_lowcore.sys_enter_timer;
+ S390_lowcore.last_update_timer = S390_lowcore.sys_enter_timer;
+}
+
+static inline void update_timer_mcck(void)
+{
+ S390_lowcore.system_timer += S390_lowcore.last_update_timer - S390_lowcore.exit_timer;
+ S390_lowcore.user_timer += S390_lowcore.exit_timer - S390_lowcore.mcck_enter_timer;
+ S390_lowcore.last_update_timer = S390_lowcore.mcck_enter_timer;
+}
+
#endif /* _S390_VTIME_H */
diff --git a/arch/s390/include/asm/vtimer.h b/arch/s390/include/asm/vtimer.h
index 42f707d1c1e8..e601adaa6320 100644
--- a/arch/s390/include/asm/vtimer.h
+++ b/arch/s390/include/asm/vtimer.h
@@ -25,8 +25,6 @@ extern void add_virt_timer_periodic(struct vtimer_list *timer);
extern int mod_virt_timer(struct vtimer_list *timer, u64 expires);
extern int mod_virt_timer_periodic(struct vtimer_list *timer, u64 expires);
extern int del_virt_timer(struct vtimer_list *timer);
-
-extern void init_cpu_vtimer(void);
extern void vtime_init(void);
#endif /* _ASM_S390_TIMER_H */
diff --git a/arch/s390/include/asm/vx-insn-asm.h b/arch/s390/include/asm/vx-insn-asm.h
new file mode 100644
index 000000000000..360f8b36d962
--- /dev/null
+++ b/arch/s390/include/asm/vx-insn-asm.h
@@ -0,0 +1,681 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Support for Vector Instructions
+ *
+ * Assembler macros to generate .byte/.word code for particular
+ * vector instructions that are supported by recent binutils (>= 2.26) only.
+ *
+ * Copyright IBM Corp. 2015
+ * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
+ */
+
+#ifndef __ASM_S390_VX_INSN_INTERNAL_H
+#define __ASM_S390_VX_INSN_INTERNAL_H
+
+#ifndef __ASM_S390_VX_INSN_H
+#error only <asm/vx-insn.h> can be included directly
+#endif
+
+#ifdef __ASSEMBLY__
+
+/* Macros to generate vector instruction byte code */
+
+/* GR_NUM - Retrieve general-purpose register number
+ *
+ * @opd: Operand to store register number
+ * @gr: String designating the register in the format "%rN"
+ */
+.macro GR_NUM opd gr
+ \opd = 255
+ .ifc \gr,%r0
+ \opd = 0
+ .endif
+ .ifc \gr,%r1
+ \opd = 1
+ .endif
+ .ifc \gr,%r2
+ \opd = 2
+ .endif
+ .ifc \gr,%r3
+ \opd = 3
+ .endif
+ .ifc \gr,%r4
+ \opd = 4
+ .endif
+ .ifc \gr,%r5
+ \opd = 5
+ .endif
+ .ifc \gr,%r6
+ \opd = 6
+ .endif
+ .ifc \gr,%r7
+ \opd = 7
+ .endif
+ .ifc \gr,%r8
+ \opd = 8
+ .endif
+ .ifc \gr,%r9
+ \opd = 9
+ .endif
+ .ifc \gr,%r10
+ \opd = 10
+ .endif
+ .ifc \gr,%r11
+ \opd = 11
+ .endif
+ .ifc \gr,%r12
+ \opd = 12
+ .endif
+ .ifc \gr,%r13
+ \opd = 13
+ .endif
+ .ifc \gr,%r14
+ \opd = 14
+ .endif
+ .ifc \gr,%r15
+ \opd = 15
+ .endif
+ .if \opd == 255
+ \opd = \gr
+ .endif
+.endm
+
+/* VX_NUM - Retrieve vector register number
+ *
+ * @opd: Operand to store register number
+ * @vxr: String designating the register in the format "%vN"
+ *
+ * The vector register number is used as input to the
+ * instruction and, in addition, to compute the RXB field of the
+ * instruction.
+ */
+.macro VX_NUM opd vxr
+ \opd = 255
+ .ifc \vxr,%v0
+ \opd = 0
+ .endif
+ .ifc \vxr,%v1
+ \opd = 1
+ .endif
+ .ifc \vxr,%v2
+ \opd = 2
+ .endif
+ .ifc \vxr,%v3
+ \opd = 3
+ .endif
+ .ifc \vxr,%v4
+ \opd = 4
+ .endif
+ .ifc \vxr,%v5
+ \opd = 5
+ .endif
+ .ifc \vxr,%v6
+ \opd = 6
+ .endif
+ .ifc \vxr,%v7
+ \opd = 7
+ .endif
+ .ifc \vxr,%v8
+ \opd = 8
+ .endif
+ .ifc \vxr,%v9
+ \opd = 9
+ .endif
+ .ifc \vxr,%v10
+ \opd = 10
+ .endif
+ .ifc \vxr,%v11
+ \opd = 11
+ .endif
+ .ifc \vxr,%v12
+ \opd = 12
+ .endif
+ .ifc \vxr,%v13
+ \opd = 13
+ .endif
+ .ifc \vxr,%v14
+ \opd = 14
+ .endif
+ .ifc \vxr,%v15
+ \opd = 15
+ .endif
+ .ifc \vxr,%v16
+ \opd = 16
+ .endif
+ .ifc \vxr,%v17
+ \opd = 17
+ .endif
+ .ifc \vxr,%v18
+ \opd = 18
+ .endif
+ .ifc \vxr,%v19
+ \opd = 19
+ .endif
+ .ifc \vxr,%v20
+ \opd = 20
+ .endif
+ .ifc \vxr,%v21
+ \opd = 21
+ .endif
+ .ifc \vxr,%v22
+ \opd = 22
+ .endif
+ .ifc \vxr,%v23
+ \opd = 23
+ .endif
+ .ifc \vxr,%v24
+ \opd = 24
+ .endif
+ .ifc \vxr,%v25
+ \opd = 25
+ .endif
+ .ifc \vxr,%v26
+ \opd = 26
+ .endif
+ .ifc \vxr,%v27
+ \opd = 27
+ .endif
+ .ifc \vxr,%v28
+ \opd = 28
+ .endif
+ .ifc \vxr,%v29
+ \opd = 29
+ .endif
+ .ifc \vxr,%v30
+ \opd = 30
+ .endif
+ .ifc \vxr,%v31
+ \opd = 31
+ .endif
+ .if \opd == 255
+ \opd = \vxr
+ .endif
+.endm
+
+/* RXB - Compute most significant bit of the used vector registers
+ *
+ * @rxb: Operand to store computed RXB value
+ * @v1: First vector register designated operand
+ * @v2: Second vector register designated operand
+ * @v3: Third vector register designated operand
+ * @v4: Fourth vector register designated operand
+ */
+.macro RXB rxb v1 v2=0 v3=0 v4=0
+ \rxb = 0
+ .if \v1 & 0x10
+ \rxb = \rxb | 0x08
+ .endif
+ .if \v2 & 0x10
+ \rxb = \rxb | 0x04
+ .endif
+ .if \v3 & 0x10
+ \rxb = \rxb | 0x02
+ .endif
+ .if \v4 & 0x10
+ \rxb = \rxb | 0x01
+ .endif
+.endm
+
+/* MRXB - Generate Element Size Control and RXB value
+ *
+ * @m: Element size control
+ * @v1: First vector register designated operand (for RXB)
+ * @v2: Second vector register designated operand (for RXB)
+ * @v3: Third vector register designated operand (for RXB)
+ * @v4: Fourth vector register designated operand (for RXB)
+ */
+.macro MRXB m v1 v2=0 v3=0 v4=0
+ rxb = 0
+ RXB rxb, \v1, \v2, \v3, \v4
+ .byte (\m << 4) | rxb
+.endm
+
+/* MRXBOPC - Generate Element Size Control, RXB, and final Opcode fields
+ *
+ * @m: Element size control
+ * @opc: Opcode
+ * @v1: First vector register designated operand (for RXB)
+ * @v2: Second vector register designated operand (for RXB)
+ * @v3: Third vector register designated operand (for RXB)
+ * @v4: Fourth vector register designated operand (for RXB)
+ */
+.macro MRXBOPC m opc v1 v2=0 v3=0 v4=0
+ MRXB \m, \v1, \v2, \v3, \v4
+ .byte \opc
+.endm
+
+/* Vector support instructions */
+
+/* VECTOR GENERATE BYTE MASK */
+.macro VGBM vr imm2
+ VX_NUM v1, \vr
+ .word (0xE700 | ((v1&15) << 4))
+ .word \imm2
+ MRXBOPC 0, 0x44, v1
+.endm
+.macro VZERO vxr
+ VGBM \vxr, 0
+.endm
+.macro VONE vxr
+ VGBM \vxr, 0xFFFF
+.endm
+
+/* VECTOR LOAD VR ELEMENT FROM GR */
+.macro VLVG v, gr, disp, m
+ VX_NUM v1, \v
+ GR_NUM b2, "%r0"
+ GR_NUM r3, \gr
+ .word 0xE700 | ((v1&15) << 4) | r3
+ .word (b2 << 12) | (\disp)
+ MRXBOPC \m, 0x22, v1
+.endm
+.macro VLVGB v, gr, index
+ VLVG \v, \gr, \index, 0
+.endm
+.macro VLVGH v, gr, index
+ VLVG \v, \gr, \index, 1
+.endm
+.macro VLVGF v, gr, index
+ VLVG \v, \gr, \index, 2
+.endm
+.macro VLVGG v, gr, index
+ VLVG \v, \gr, \index, 3
+.endm
+
+/* VECTOR LOAD REGISTER */
+.macro VLR v1, v2
+ VX_NUM v1, \v1
+ VX_NUM v2, \v2
+ .word 0xE700 | ((v1&15) << 4) | (v2&15)
+ .word 0
+ MRXBOPC 0, 0x56, v1, v2
+.endm
+
+/* VECTOR LOAD */
+.macro VL v, disp, index="%r0", base
+ VX_NUM v1, \v
+ GR_NUM x2, \index
+ GR_NUM b2, \base
+ .word 0xE700 | ((v1&15) << 4) | x2
+ .word (b2 << 12) | (\disp)
+ MRXBOPC 0, 0x06, v1
+.endm
+
+/* VECTOR LOAD ELEMENT */
+.macro VLEx vr1, disp, index="%r0", base, m3, opc
+ VX_NUM v1, \vr1
+ GR_NUM x2, \index
+ GR_NUM b2, \base
+ .word 0xE700 | ((v1&15) << 4) | x2
+ .word (b2 << 12) | (\disp)
+ MRXBOPC \m3, \opc, v1
+.endm
+.macro VLEB vr1, disp, index="%r0", base, m3
+ VLEx \vr1, \disp, \index, \base, \m3, 0x00
+.endm
+.macro VLEH vr1, disp, index="%r0", base, m3
+ VLEx \vr1, \disp, \index, \base, \m3, 0x01
+.endm
+.macro VLEF vr1, disp, index="%r0", base, m3
+ VLEx \vr1, \disp, \index, \base, \m3, 0x03
+.endm
+.macro VLEG vr1, disp, index="%r0", base, m3
+ VLEx \vr1, \disp, \index, \base, \m3, 0x02
+.endm
+
+/* VECTOR LOAD ELEMENT IMMEDIATE */
+.macro VLEIx vr1, imm2, m3, opc
+ VX_NUM v1, \vr1
+ .word 0xE700 | ((v1&15) << 4)
+ .word \imm2
+ MRXBOPC \m3, \opc, v1
+.endm
+.macro VLEIB vr1, imm2, index
+ VLEIx \vr1, \imm2, \index, 0x40
+.endm
+.macro VLEIH vr1, imm2, index
+ VLEIx \vr1, \imm2, \index, 0x41
+.endm
+.macro VLEIF vr1, imm2, index
+ VLEIx \vr1, \imm2, \index, 0x43
+.endm
+.macro VLEIG vr1, imm2, index
+ VLEIx \vr1, \imm2, \index, 0x42
+.endm
+
+/* VECTOR LOAD GR FROM VR ELEMENT */
+.macro VLGV gr, vr, disp, base="%r0", m
+ GR_NUM r1, \gr
+ GR_NUM b2, \base
+ VX_NUM v3, \vr
+ .word 0xE700 | (r1 << 4) | (v3&15)
+ .word (b2 << 12) | (\disp)
+ MRXBOPC \m, 0x21, v3
+.endm
+.macro VLGVB gr, vr, disp, base="%r0"
+ VLGV \gr, \vr, \disp, \base, 0
+.endm
+.macro VLGVH gr, vr, disp, base="%r0"
+ VLGV \gr, \vr, \disp, \base, 1
+.endm
+.macro VLGVF gr, vr, disp, base="%r0"
+ VLGV \gr, \vr, \disp, \base, 2
+.endm
+.macro VLGVG gr, vr, disp, base="%r0"
+ VLGV \gr, \vr, \disp, \base, 3
+.endm
+
+/* VECTOR LOAD MULTIPLE */
+.macro VLM vfrom, vto, disp, base, hint=3
+ VX_NUM v1, \vfrom
+ VX_NUM v3, \vto
+ GR_NUM b2, \base
+ .word 0xE700 | ((v1&15) << 4) | (v3&15)
+ .word (b2 << 12) | (\disp)
+ MRXBOPC \hint, 0x36, v1, v3
+.endm
+
+/* VECTOR STORE */
+.macro VST vr1, disp, index="%r0", base
+ VX_NUM v1, \vr1
+ GR_NUM x2, \index
+ GR_NUM b2, \base
+ .word 0xE700 | ((v1&15) << 4) | (x2&15)
+ .word (b2 << 12) | (\disp)
+ MRXBOPC 0, 0x0E, v1
+.endm
+
+/* VECTOR STORE MULTIPLE */
+.macro VSTM vfrom, vto, disp, base, hint=3
+ VX_NUM v1, \vfrom
+ VX_NUM v3, \vto
+ GR_NUM b2, \base
+ .word 0xE700 | ((v1&15) << 4) | (v3&15)
+ .word (b2 << 12) | (\disp)
+ MRXBOPC \hint, 0x3E, v1, v3
+.endm
+
+/* VECTOR PERMUTE */
+.macro VPERM vr1, vr2, vr3, vr4
+ VX_NUM v1, \vr1
+ VX_NUM v2, \vr2
+ VX_NUM v3, \vr3
+ VX_NUM v4, \vr4
+ .word 0xE700 | ((v1&15) << 4) | (v2&15)
+ .word ((v3&15) << 12)
+ MRXBOPC (v4&15), 0x8C, v1, v2, v3, v4
+.endm
+
+/* VECTOR UNPACK LOGICAL LOW */
+.macro VUPLL vr1, vr2, m3
+ VX_NUM v1, \vr1
+ VX_NUM v2, \vr2
+ .word 0xE700 | ((v1&15) << 4) | (v2&15)
+ .word 0x0000
+ MRXBOPC \m3, 0xD4, v1, v2
+.endm
+.macro VUPLLB vr1, vr2
+ VUPLL \vr1, \vr2, 0
+.endm
+.macro VUPLLH vr1, vr2
+ VUPLL \vr1, \vr2, 1
+.endm
+.macro VUPLLF vr1, vr2
+ VUPLL \vr1, \vr2, 2
+.endm
+
+/* VECTOR PERMUTE DOUBLEWORD IMMEDIATE */
+.macro VPDI vr1, vr2, vr3, m4
+ VX_NUM v1, \vr1
+ VX_NUM v2, \vr2
+ VX_NUM v3, \vr3
+ .word 0xE700 | ((v1&15) << 4) | (v2&15)
+ .word ((v3&15) << 12)
+ MRXBOPC \m4, 0x84, v1, v2, v3
+.endm
+
+/* VECTOR REPLICATE */
+.macro VREP vr1, vr3, imm2, m4
+ VX_NUM v1, \vr1
+ VX_NUM v3, \vr3
+ .word 0xE700 | ((v1&15) << 4) | (v3&15)
+ .word \imm2
+ MRXBOPC \m4, 0x4D, v1, v3
+.endm
+.macro VREPB vr1, vr3, imm2
+ VREP \vr1, \vr3, \imm2, 0
+.endm
+.macro VREPH vr1, vr3, imm2
+ VREP \vr1, \vr3, \imm2, 1
+.endm
+.macro VREPF vr1, vr3, imm2
+ VREP \vr1, \vr3, \imm2, 2
+.endm
+.macro VREPG vr1, vr3, imm2
+ VREP \vr1, \vr3, \imm2, 3
+.endm
+
+/* VECTOR MERGE HIGH */
+.macro VMRH vr1, vr2, vr3, m4
+ VX_NUM v1, \vr1
+ VX_NUM v2, \vr2
+ VX_NUM v3, \vr3
+ .word 0xE700 | ((v1&15) << 4) | (v2&15)
+ .word ((v3&15) << 12)
+ MRXBOPC \m4, 0x61, v1, v2, v3
+.endm
+.macro VMRHB vr1, vr2, vr3
+ VMRH \vr1, \vr2, \vr3, 0
+.endm
+.macro VMRHH vr1, vr2, vr3
+ VMRH \vr1, \vr2, \vr3, 1
+.endm
+.macro VMRHF vr1, vr2, vr3
+ VMRH \vr1, \vr2, \vr3, 2
+.endm
+.macro VMRHG vr1, vr2, vr3
+ VMRH \vr1, \vr2, \vr3, 3
+.endm
+
+/* VECTOR MERGE LOW */
+.macro VMRL vr1, vr2, vr3, m4
+ VX_NUM v1, \vr1
+ VX_NUM v2, \vr2
+ VX_NUM v3, \vr3
+ .word 0xE700 | ((v1&15) << 4) | (v2&15)
+ .word ((v3&15) << 12)
+ MRXBOPC \m4, 0x60, v1, v2, v3
+.endm
+.macro VMRLB vr1, vr2, vr3
+ VMRL \vr1, \vr2, \vr3, 0
+.endm
+.macro VMRLH vr1, vr2, vr3
+ VMRL \vr1, \vr2, \vr3, 1
+.endm
+.macro VMRLF vr1, vr2, vr3
+ VMRL \vr1, \vr2, \vr3, 2
+.endm
+.macro VMRLG vr1, vr2, vr3
+ VMRL \vr1, \vr2, \vr3, 3
+.endm
+
+
+/* Vector integer instructions */
+
+/* VECTOR AND */
+.macro VN vr1, vr2, vr3
+ VX_NUM v1, \vr1
+ VX_NUM v2, \vr2
+ VX_NUM v3, \vr3
+ .word 0xE700 | ((v1&15) << 4) | (v2&15)
+ .word ((v3&15) << 12)
+ MRXBOPC 0, 0x68, v1, v2, v3
+.endm
+
+/* VECTOR EXCLUSIVE OR */
+.macro VX vr1, vr2, vr3
+ VX_NUM v1, \vr1
+ VX_NUM v2, \vr2
+ VX_NUM v3, \vr3
+ .word 0xE700 | ((v1&15) << 4) | (v2&15)
+ .word ((v3&15) << 12)
+ MRXBOPC 0, 0x6D, v1, v2, v3
+.endm
+
+/* VECTOR GALOIS FIELD MULTIPLY SUM */
+.macro VGFM vr1, vr2, vr3, m4
+ VX_NUM v1, \vr1
+ VX_NUM v2, \vr2
+ VX_NUM v3, \vr3
+ .word 0xE700 | ((v1&15) << 4) | (v2&15)
+ .word ((v3&15) << 12)
+ MRXBOPC \m4, 0xB4, v1, v2, v3
+.endm
+.macro VGFMB vr1, vr2, vr3
+ VGFM \vr1, \vr2, \vr3, 0
+.endm
+.macro VGFMH vr1, vr2, vr3
+ VGFM \vr1, \vr2, \vr3, 1
+.endm
+.macro VGFMF vr1, vr2, vr3
+ VGFM \vr1, \vr2, \vr3, 2
+.endm
+.macro VGFMG vr1, vr2, vr3
+ VGFM \vr1, \vr2, \vr3, 3
+.endm
+
+/* VECTOR GALOIS FIELD MULTIPLY SUM AND ACCUMULATE */
+.macro VGFMA vr1, vr2, vr3, vr4, m5
+ VX_NUM v1, \vr1
+ VX_NUM v2, \vr2
+ VX_NUM v3, \vr3
+ VX_NUM v4, \vr4
+ .word 0xE700 | ((v1&15) << 4) | (v2&15)
+ .word ((v3&15) << 12) | (\m5 << 8)
+ MRXBOPC (v4&15), 0xBC, v1, v2, v3, v4
+.endm
+.macro VGFMAB vr1, vr2, vr3, vr4
+ VGFMA \vr1, \vr2, \vr3, \vr4, 0
+.endm
+.macro VGFMAH vr1, vr2, vr3, vr4
+ VGFMA \vr1, \vr2, \vr3, \vr4, 1
+.endm
+.macro VGFMAF vr1, vr2, vr3, vr4
+ VGFMA \vr1, \vr2, \vr3, \vr4, 2
+.endm
+.macro VGFMAG vr1, vr2, vr3, vr4
+ VGFMA \vr1, \vr2, \vr3, \vr4, 3
+.endm
+
+/* VECTOR SHIFT RIGHT LOGICAL BY BYTE */
+.macro VSRLB vr1, vr2, vr3
+ VX_NUM v1, \vr1
+ VX_NUM v2, \vr2
+ VX_NUM v3, \vr3
+ .word 0xE700 | ((v1&15) << 4) | (v2&15)
+ .word ((v3&15) << 12)
+ MRXBOPC 0, 0x7D, v1, v2, v3
+.endm
+
+/* VECTOR REPLICATE IMMEDIATE */
+.macro VREPI vr1, imm2, m3
+ VX_NUM v1, \vr1
+ .word 0xE700 | ((v1&15) << 4)
+ .word \imm2
+ MRXBOPC \m3, 0x45, v1
+.endm
+.macro VREPIB vr1, imm2
+ VREPI \vr1, \imm2, 0
+.endm
+.macro VREPIH vr1, imm2
+ VREPI \vr1, \imm2, 1
+.endm
+.macro VREPIF vr1, imm2
+ VREPI \vr1, \imm2, 2
+.endm
+.macro VREPIG vr1, imm2
+ VREPI \vr1, \imm2, 3
+.endm
+
+/* VECTOR ADD */
+.macro VA vr1, vr2, vr3, m4
+ VX_NUM v1, \vr1
+ VX_NUM v2, \vr2
+ VX_NUM v3, \vr3
+ .word 0xE700 | ((v1&15) << 4) | (v2&15)
+ .word ((v3&15) << 12)
+ MRXBOPC \m4, 0xF3, v1, v2, v3
+.endm
+.macro VAB vr1, vr2, vr3
+ VA \vr1, \vr2, \vr3, 0
+.endm
+.macro VAH vr1, vr2, vr3
+ VA \vr1, \vr2, \vr3, 1
+.endm
+.macro VAF vr1, vr2, vr3
+ VA \vr1, \vr2, \vr3, 2
+.endm
+.macro VAG vr1, vr2, vr3
+ VA \vr1, \vr2, \vr3, 3
+.endm
+.macro VAQ vr1, vr2, vr3
+ VA \vr1, \vr2, \vr3, 4
+.endm
+
+/* VECTOR ELEMENT SHIFT RIGHT ARITHMETIC */
+.macro VESRAV vr1, vr2, vr3, m4
+ VX_NUM v1, \vr1
+ VX_NUM v2, \vr2
+ VX_NUM v3, \vr3
+ .word 0xE700 | ((v1&15) << 4) | (v2&15)
+ .word ((v3&15) << 12)
+ MRXBOPC \m4, 0x7A, v1, v2, v3
+.endm
+
+.macro VESRAVB vr1, vr2, vr3
+ VESRAV \vr1, \vr2, \vr3, 0
+.endm
+.macro VESRAVH vr1, vr2, vr3
+ VESRAV \vr1, \vr2, \vr3, 1
+.endm
+.macro VESRAVF vr1, vr2, vr3
+ VESRAV \vr1, \vr2, \vr3, 2
+.endm
+.macro VESRAVG vr1, vr2, vr3
+ VESRAV \vr1, \vr2, \vr3, 3
+.endm
+
+/* VECTOR ELEMENT ROTATE LEFT LOGICAL */
+.macro VERLL vr1, vr3, disp, base="%r0", m4
+ VX_NUM v1, \vr1
+ VX_NUM v3, \vr3
+ GR_NUM b2, \base
+ .word 0xE700 | ((v1&15) << 4) | (v3&15)
+ .word (b2 << 12) | (\disp)
+ MRXBOPC \m4, 0x33, v1, v3
+.endm
+.macro VERLLB vr1, vr3, disp, base="%r0"
+ VERLL \vr1, \vr3, \disp, \base, 0
+.endm
+.macro VERLLH vr1, vr3, disp, base="%r0"
+ VERLL \vr1, \vr3, \disp, \base, 1
+.endm
+.macro VERLLF vr1, vr3, disp, base="%r0"
+ VERLL \vr1, \vr3, \disp, \base, 2
+.endm
+.macro VERLLG vr1, vr3, disp, base="%r0"
+ VERLL \vr1, \vr3, \disp, \base, 3
+.endm
+
+/* VECTOR SHIFT LEFT DOUBLE BY BYTE */
+.macro VSLDB vr1, vr2, vr3, imm4
+ VX_NUM v1, \vr1
+ VX_NUM v2, \vr2
+ VX_NUM v3, \vr3
+ .word 0xE700 | ((v1&15) << 4) | (v2&15)
+ .word ((v3&15) << 12) | (\imm4)
+ MRXBOPC 0, 0x77, v1, v2, v3
+.endm
+
+#endif /* __ASSEMBLY__ */
+#endif /* __ASM_S390_VX_INSN_INTERNAL_H */
diff --git a/arch/s390/include/asm/vx-insn.h b/arch/s390/include/asm/vx-insn.h
index 0c05a673811c..8c188f1c6d27 100644
--- a/arch/s390/include/asm/vx-insn.h
+++ b/arch/s390/include/asm/vx-insn.h
@@ -2,560 +2,18 @@
/*
* Support for Vector Instructions
*
- * Assembler macros to generate .byte/.word code for particular
- * vector instructions that are supported by recent binutils (>= 2.26) only.
- *
- * Copyright IBM Corp. 2015
- * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
+ * This wrapper header file allows the vector instruction macros to be
+ * used both in assembler files and in inline assemblies in C files.
*/
#ifndef __ASM_S390_VX_INSN_H
#define __ASM_S390_VX_INSN_H
-#ifdef __ASSEMBLY__
-
-
-/* Macros to generate vector instruction byte code */
-
-/* GR_NUM - Retrieve general-purpose register number
- *
- * @opd: Operand to store register number
- * @r64: String designation register in the format "%rN"
- */
-.macro GR_NUM opd gr
- \opd = 255
- .ifc \gr,%r0
- \opd = 0
- .endif
- .ifc \gr,%r1
- \opd = 1
- .endif
- .ifc \gr,%r2
- \opd = 2
- .endif
- .ifc \gr,%r3
- \opd = 3
- .endif
- .ifc \gr,%r4
- \opd = 4
- .endif
- .ifc \gr,%r5
- \opd = 5
- .endif
- .ifc \gr,%r6
- \opd = 6
- .endif
- .ifc \gr,%r7
- \opd = 7
- .endif
- .ifc \gr,%r8
- \opd = 8
- .endif
- .ifc \gr,%r9
- \opd = 9
- .endif
- .ifc \gr,%r10
- \opd = 10
- .endif
- .ifc \gr,%r11
- \opd = 11
- .endif
- .ifc \gr,%r12
- \opd = 12
- .endif
- .ifc \gr,%r13
- \opd = 13
- .endif
- .ifc \gr,%r14
- \opd = 14
- .endif
- .ifc \gr,%r15
- \opd = 15
- .endif
- .if \opd == 255
- \opd = \gr
- .endif
-.endm
-
-/* VX_NUM - Retrieve vector register number
- *
- * @opd: Operand to store register number
- * @vxr: String designation register in the format "%vN"
- *
- * The vector register number is used for as input number to the
- * instruction and, as well as, to compute the RXB field of the
- * instruction.
- */
-.macro VX_NUM opd vxr
- \opd = 255
- .ifc \vxr,%v0
- \opd = 0
- .endif
- .ifc \vxr,%v1
- \opd = 1
- .endif
- .ifc \vxr,%v2
- \opd = 2
- .endif
- .ifc \vxr,%v3
- \opd = 3
- .endif
- .ifc \vxr,%v4
- \opd = 4
- .endif
- .ifc \vxr,%v5
- \opd = 5
- .endif
- .ifc \vxr,%v6
- \opd = 6
- .endif
- .ifc \vxr,%v7
- \opd = 7
- .endif
- .ifc \vxr,%v8
- \opd = 8
- .endif
- .ifc \vxr,%v9
- \opd = 9
- .endif
- .ifc \vxr,%v10
- \opd = 10
- .endif
- .ifc \vxr,%v11
- \opd = 11
- .endif
- .ifc \vxr,%v12
- \opd = 12
- .endif
- .ifc \vxr,%v13
- \opd = 13
- .endif
- .ifc \vxr,%v14
- \opd = 14
- .endif
- .ifc \vxr,%v15
- \opd = 15
- .endif
- .ifc \vxr,%v16
- \opd = 16
- .endif
- .ifc \vxr,%v17
- \opd = 17
- .endif
- .ifc \vxr,%v18
- \opd = 18
- .endif
- .ifc \vxr,%v19
- \opd = 19
- .endif
- .ifc \vxr,%v20
- \opd = 20
- .endif
- .ifc \vxr,%v21
- \opd = 21
- .endif
- .ifc \vxr,%v22
- \opd = 22
- .endif
- .ifc \vxr,%v23
- \opd = 23
- .endif
- .ifc \vxr,%v24
- \opd = 24
- .endif
- .ifc \vxr,%v25
- \opd = 25
- .endif
- .ifc \vxr,%v26
- \opd = 26
- .endif
- .ifc \vxr,%v27
- \opd = 27
- .endif
- .ifc \vxr,%v28
- \opd = 28
- .endif
- .ifc \vxr,%v29
- \opd = 29
- .endif
- .ifc \vxr,%v30
- \opd = 30
- .endif
- .ifc \vxr,%v31
- \opd = 31
- .endif
- .if \opd == 255
- \opd = \vxr
- .endif
-.endm
-
-/* RXB - Compute most significant bit used vector registers
- *
- * @rxb: Operand to store computed RXB value
- * @v1: First vector register designated operand
- * @v2: Second vector register designated operand
- * @v3: Third vector register designated operand
- * @v4: Fourth vector register designated operand
- */
-.macro RXB rxb v1 v2=0 v3=0 v4=0
- \rxb = 0
- .if \v1 & 0x10
- \rxb = \rxb | 0x08
- .endif
- .if \v2 & 0x10
- \rxb = \rxb | 0x04
- .endif
- .if \v3 & 0x10
- \rxb = \rxb | 0x02
- .endif
- .if \v4 & 0x10
- \rxb = \rxb | 0x01
- .endif
-.endm
-
-/* MRXB - Generate Element Size Control and RXB value
- *
- * @m: Element size control
- * @v1: First vector register designated operand (for RXB)
- * @v2: Second vector register designated operand (for RXB)
- * @v3: Third vector register designated operand (for RXB)
- * @v4: Fourth vector register designated operand (for RXB)
- */
-.macro MRXB m v1 v2=0 v3=0 v4=0
- rxb = 0
- RXB rxb, \v1, \v2, \v3, \v4
- .byte (\m << 4) | rxb
-.endm
-
-/* MRXBOPC - Generate Element Size Control, RXB, and final Opcode fields
- *
- * @m: Element size control
- * @opc: Opcode
- * @v1: First vector register designated operand (for RXB)
- * @v2: Second vector register designated operand (for RXB)
- * @v3: Third vector register designated operand (for RXB)
- * @v4: Fourth vector register designated operand (for RXB)
- */
-.macro MRXBOPC m opc v1 v2=0 v3=0 v4=0
- MRXB \m, \v1, \v2, \v3, \v4
- .byte \opc
-.endm
-
-/* Vector support instructions */
-
-/* VECTOR GENERATE BYTE MASK */
-.macro VGBM vr imm2
- VX_NUM v1, \vr
- .word (0xE700 | ((v1&15) << 4))
- .word \imm2
- MRXBOPC 0, 0x44, v1
-.endm
-.macro VZERO vxr
- VGBM \vxr, 0
-.endm
-.macro VONE vxr
- VGBM \vxr, 0xFFFF
-.endm
-
-/* VECTOR LOAD VR ELEMENT FROM GR */
-.macro VLVG v, gr, disp, m
- VX_NUM v1, \v
- GR_NUM b2, "%r0"
- GR_NUM r3, \gr
- .word 0xE700 | ((v1&15) << 4) | r3
- .word (b2 << 12) | (\disp)
- MRXBOPC \m, 0x22, v1
-.endm
-.macro VLVGB v, gr, index, base
- VLVG \v, \gr, \index, \base, 0
-.endm
-.macro VLVGH v, gr, index
- VLVG \v, \gr, \index, 1
-.endm
-.macro VLVGF v, gr, index
- VLVG \v, \gr, \index, 2
-.endm
-.macro VLVGG v, gr, index
- VLVG \v, \gr, \index, 3
-.endm
-
-/* VECTOR LOAD REGISTER */
-.macro VLR v1, v2
- VX_NUM v1, \v1
- VX_NUM v2, \v2
- .word 0xE700 | ((v1&15) << 4) | (v2&15)
- .word 0
- MRXBOPC 0, 0x56, v1, v2
-.endm
-
-/* VECTOR LOAD */
-.macro VL v, disp, index="%r0", base
- VX_NUM v1, \v
- GR_NUM x2, \index
- GR_NUM b2, \base
- .word 0xE700 | ((v1&15) << 4) | x2
- .word (b2 << 12) | (\disp)
- MRXBOPC 0, 0x06, v1
-.endm
-
-/* VECTOR LOAD ELEMENT */
-.macro VLEx vr1, disp, index="%r0", base, m3, opc
- VX_NUM v1, \vr1
- GR_NUM x2, \index
- GR_NUM b2, \base
- .word 0xE700 | ((v1&15) << 4) | x2
- .word (b2 << 12) | (\disp)
- MRXBOPC \m3, \opc, v1
-.endm
-.macro VLEB vr1, disp, index="%r0", base, m3
- VLEx \vr1, \disp, \index, \base, \m3, 0x00
-.endm
-.macro VLEH vr1, disp, index="%r0", base, m3
- VLEx \vr1, \disp, \index, \base, \m3, 0x01
-.endm
-.macro VLEF vr1, disp, index="%r0", base, m3
- VLEx \vr1, \disp, \index, \base, \m3, 0x03
-.endm
-.macro VLEG vr1, disp, index="%r0", base, m3
- VLEx \vr1, \disp, \index, \base, \m3, 0x02
-.endm
-
-/* VECTOR LOAD ELEMENT IMMEDIATE */
-.macro VLEIx vr1, imm2, m3, opc
- VX_NUM v1, \vr1
- .word 0xE700 | ((v1&15) << 4)
- .word \imm2
- MRXBOPC \m3, \opc, v1
-.endm
-.macro VLEIB vr1, imm2, index
- VLEIx \vr1, \imm2, \index, 0x40
-.endm
-.macro VLEIH vr1, imm2, index
- VLEIx \vr1, \imm2, \index, 0x41
-.endm
-.macro VLEIF vr1, imm2, index
- VLEIx \vr1, \imm2, \index, 0x43
-.endm
-.macro VLEIG vr1, imm2, index
- VLEIx \vr1, \imm2, \index, 0x42
-.endm
-
-/* VECTOR LOAD GR FROM VR ELEMENT */
-.macro VLGV gr, vr, disp, base="%r0", m
- GR_NUM r1, \gr
- GR_NUM b2, \base
- VX_NUM v3, \vr
- .word 0xE700 | (r1 << 4) | (v3&15)
- .word (b2 << 12) | (\disp)
- MRXBOPC \m, 0x21, v3
-.endm
-.macro VLGVB gr, vr, disp, base="%r0"
- VLGV \gr, \vr, \disp, \base, 0
-.endm
-.macro VLGVH gr, vr, disp, base="%r0"
- VLGV \gr, \vr, \disp, \base, 1
-.endm
-.macro VLGVF gr, vr, disp, base="%r0"
- VLGV \gr, \vr, \disp, \base, 2
-.endm
-.macro VLGVG gr, vr, disp, base="%r0"
- VLGV \gr, \vr, \disp, \base, 3
-.endm
-
-/* VECTOR LOAD MULTIPLE */
-.macro VLM vfrom, vto, disp, base, hint=3
- VX_NUM v1, \vfrom
- VX_NUM v3, \vto
- GR_NUM b2, \base /* Base register */
- .word 0xE700 | ((v1&15) << 4) | (v3&15)
- .word (b2 << 12) | (\disp)
- MRXBOPC \hint, 0x36, v1, v3
-.endm
-
-/* VECTOR STORE MULTIPLE */
-.macro VSTM vfrom, vto, disp, base, hint=3
- VX_NUM v1, \vfrom
- VX_NUM v3, \vto
- GR_NUM b2, \base /* Base register */
- .word 0xE700 | ((v1&15) << 4) | (v3&15)
- .word (b2 << 12) | (\disp)
- MRXBOPC \hint, 0x3E, v1, v3
-.endm
-
-/* VECTOR PERMUTE */
-.macro VPERM vr1, vr2, vr3, vr4
- VX_NUM v1, \vr1
- VX_NUM v2, \vr2
- VX_NUM v3, \vr3
- VX_NUM v4, \vr4
- .word 0xE700 | ((v1&15) << 4) | (v2&15)
- .word ((v3&15) << 12)
- MRXBOPC (v4&15), 0x8C, v1, v2, v3, v4
-.endm
-
-/* VECTOR UNPACK LOGICAL LOW */
-.macro VUPLL vr1, vr2, m3
- VX_NUM v1, \vr1
- VX_NUM v2, \vr2
- .word 0xE700 | ((v1&15) << 4) | (v2&15)
- .word 0x0000
- MRXBOPC \m3, 0xD4, v1, v2
-.endm
-.macro VUPLLB vr1, vr2
- VUPLL \vr1, \vr2, 0
-.endm
-.macro VUPLLH vr1, vr2
- VUPLL \vr1, \vr2, 1
-.endm
-.macro VUPLLF vr1, vr2
- VUPLL \vr1, \vr2, 2
-.endm
-
-
-/* Vector integer instructions */
-
-/* VECTOR AND */
-.macro VN vr1, vr2, vr3
- VX_NUM v1, \vr1
- VX_NUM v2, \vr2
- VX_NUM v3, \vr3
- .word 0xE700 | ((v1&15) << 4) | (v2&15)
- .word ((v3&15) << 12)
- MRXBOPC 0, 0x68, v1, v2, v3
-.endm
-
-/* VECTOR EXCLUSIVE OR */
-.macro VX vr1, vr2, vr3
- VX_NUM v1, \vr1
- VX_NUM v2, \vr2
- VX_NUM v3, \vr3
- .word 0xE700 | ((v1&15) << 4) | (v2&15)
- .word ((v3&15) << 12)
- MRXBOPC 0, 0x6D, v1, v2, v3
-.endm
-
-/* VECTOR GALOIS FIELD MULTIPLY SUM */
-.macro VGFM vr1, vr2, vr3, m4
- VX_NUM v1, \vr1
- VX_NUM v2, \vr2
- VX_NUM v3, \vr3
- .word 0xE700 | ((v1&15) << 4) | (v2&15)
- .word ((v3&15) << 12)
- MRXBOPC \m4, 0xB4, v1, v2, v3
-.endm
-.macro VGFMB vr1, vr2, vr3
- VGFM \vr1, \vr2, \vr3, 0
-.endm
-.macro VGFMH vr1, vr2, vr3
- VGFM \vr1, \vr2, \vr3, 1
-.endm
-.macro VGFMF vr1, vr2, vr3
- VGFM \vr1, \vr2, \vr3, 2
-.endm
-.macro VGFMG vr1, vr2, vr3
- VGFM \vr1, \vr2, \vr3, 3
-.endm
-
-/* VECTOR GALOIS FIELD MULTIPLY SUM AND ACCUMULATE */
-.macro VGFMA vr1, vr2, vr3, vr4, m5
- VX_NUM v1, \vr1
- VX_NUM v2, \vr2
- VX_NUM v3, \vr3
- VX_NUM v4, \vr4
- .word 0xE700 | ((v1&15) << 4) | (v2&15)
- .word ((v3&15) << 12) | (\m5 << 8)
- MRXBOPC (v4&15), 0xBC, v1, v2, v3, v4
-.endm
-.macro VGFMAB vr1, vr2, vr3, vr4
- VGFMA \vr1, \vr2, \vr3, \vr4, 0
-.endm
-.macro VGFMAH vr1, vr2, vr3, vr4
- VGFMA \vr1, \vr2, \vr3, \vr4, 1
-.endm
-.macro VGFMAF vr1, vr2, vr3, vr4
- VGFMA \vr1, \vr2, \vr3, \vr4, 2
-.endm
-.macro VGFMAG vr1, vr2, vr3, vr4
- VGFMA \vr1, \vr2, \vr3, \vr4, 3
-.endm
-
-/* VECTOR SHIFT RIGHT LOGICAL BY BYTE */
-.macro VSRLB vr1, vr2, vr3
- VX_NUM v1, \vr1
- VX_NUM v2, \vr2
- VX_NUM v3, \vr3
- .word 0xE700 | ((v1&15) << 4) | (v2&15)
- .word ((v3&15) << 12)
- MRXBOPC 0, 0x7D, v1, v2, v3
-.endm
-
-/* VECTOR REPLICATE IMMEDIATE */
-.macro VREPI vr1, imm2, m3
- VX_NUM v1, \vr1
- .word 0xE700 | ((v1&15) << 4)
- .word \imm2
- MRXBOPC \m3, 0x45, v1
-.endm
-.macro VREPIB vr1, imm2
- VREPI \vr1, \imm2, 0
-.endm
-.macro VREPIH vr1, imm2
- VREPI \vr1, \imm2, 1
-.endm
-.macro VREPIF vr1, imm2
- VREPI \vr1, \imm2, 2
-.endm
-.macro VREPIG vr1, imm2
- VREP \vr1, \imm2, 3
-.endm
-
-/* VECTOR ADD */
-.macro VA vr1, vr2, vr3, m4
- VX_NUM v1, \vr1
- VX_NUM v2, \vr2
- VX_NUM v3, \vr3
- .word 0xE700 | ((v1&15) << 4) | (v2&15)
- .word ((v3&15) << 12)
- MRXBOPC \m4, 0xF3, v1, v2, v3
-.endm
-.macro VAB vr1, vr2, vr3
- VA \vr1, \vr2, \vr3, 0
-.endm
-.macro VAH vr1, vr2, vr3
- VA \vr1, \vr2, \vr3, 1
-.endm
-.macro VAF vr1, vr2, vr3
- VA \vr1, \vr2, \vr3, 2
-.endm
-.macro VAG vr1, vr2, vr3
- VA \vr1, \vr2, \vr3, 3
-.endm
-.macro VAQ vr1, vr2, vr3
- VA \vr1, \vr2, \vr3, 4
-.endm
+#include <asm/vx-insn-asm.h>
-/* VECTOR ELEMENT SHIFT RIGHT ARITHMETIC */
-.macro VESRAV vr1, vr2, vr3, m4
- VX_NUM v1, \vr1
- VX_NUM v2, \vr2
- VX_NUM v3, \vr3
- .word 0xE700 | ((v1&15) << 4) | (v2&15)
- .word ((v3&15) << 12)
- MRXBOPC \m4, 0x7A, v1, v2, v3
-.endm
+#ifndef __ASSEMBLY__
-.macro VESRAVB vr1, vr2, vr3
- VESRAV \vr1, \vr2, \vr3, 0
-.endm
-.macro VESRAVH vr1, vr2, vr3
- VESRAV \vr1, \vr2, \vr3, 1
-.endm
-.macro VESRAVF vr1, vr2, vr3
- VESRAV \vr1, \vr2, \vr3, 2
-.endm
-.macro VESRAVG vr1, vr2, vr3
- VESRAV \vr1, \vr2, \vr3, 3
-.endm
+asm(".include \"asm/vx-insn-asm.h\"\n");
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLY__ */
#endif /* __ASM_S390_VX_INSN_H */
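
Because the wrapper makes the macros visible inside inline assemblies, a C file that includes <asm/vx-insn.h> can emit them directly. A minimal hedged sketch of the common register-save pattern (buffer layout and clobbers are assumptions for illustration, not taken from this patch):

/* Hedged sketch: store all 32 vector registers into a 512-byte buffer. */
static inline void example_save_vxrs(void *buf)
{
	asm volatile("lgr	1,%[buf]\n"
		     "VSTM	0,15,0,1\n"	/* save %v0-%v15 at offset 0 */
		     "VSTM	16,31,256,1\n"	/* save %v16-%v31 at offset 256 */
		     :
		     : [buf] "a" (buf)
		     : "1", "memory");
}
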
diff --git a/arch/s390/include/asm/word-at-a-time.h b/arch/s390/include/asm/word-at-a-time.h
new file mode 100644
index 000000000000..2579f1694b82
--- /dev/null
+++ b/arch/s390/include/asm/word-at-a-time.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_WORD_AT_A_TIME_H
+#define _ASM_WORD_AT_A_TIME_H
+
+#include <linux/kernel.h>
+#include <asm/asm-extable.h>
+#include <asm/bitsperlong.h>
+
+struct word_at_a_time {
+ const unsigned long bits;
+};
+
+#define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x7f) }
+
+static inline unsigned long prep_zero_mask(unsigned long val, unsigned long data, const struct word_at_a_time *c)
+{
+ return data;
+}
+
+static inline unsigned long create_zero_mask(unsigned long data)
+{
+ return __fls(data);
+}
+
+static inline unsigned long find_zero(unsigned long data)
+{
+ return (data ^ (BITS_PER_LONG - 1)) >> 3;
+}
+
+static inline unsigned long has_zero(unsigned long val, unsigned long *data, const struct word_at_a_time *c)
+{
+ unsigned long mask = (val & c->bits) + c->bits;
+
+ *data = ~(mask | val | c->bits);
+ return *data;
+}
+
+static inline unsigned long zero_bytemask(unsigned long data)
+{
+ return ~1UL << data;
+}
+
+/*
+ * Load an unaligned word from kernel space.
+ *
+ * In the (very unlikely) case of the word being a page-crosser
+ * and the next page not being mapped, take the exception and
+ * return zeroes in the non-existing part.
+ */
+static inline unsigned long load_unaligned_zeropad(const void *addr)
+{
+ unsigned long data;
+
+ asm volatile(
+ "0: lg %[data],0(%[addr])\n"
+ "1: nopr %%r7\n"
+ EX_TABLE_ZEROPAD(0b, 1b, %[data], %[addr])
+ EX_TABLE_ZEROPAD(1b, 1b, %[data], %[addr])
+ : [data] "=d" (data)
+ : [addr] "a" (addr), "m" (*(unsigned long *)addr));
+ return data;
+}
+
+#endif /* _ASM_WORD_AT_A_TIME_H */
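
The helpers above follow the generic word-at-a-time contract, so they combine in the usual way. A hedged sketch of the typical zero-byte scan (start alignment and length limits are omitted, and the function name is invented):

/* Hedged sketch: find the terminating zero byte one word at a time. */
static inline size_t example_scan_for_zero(const char *s)
{
	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
	unsigned long data, mask;
	size_t len = 0;

	for (;;) {
		data = load_unaligned_zeropad(s + len);
		if (has_zero(data, &mask, &constants)) {
			mask = prep_zero_mask(data, mask, &constants);
			mask = create_zero_mask(mask);
			return len + find_zero(mask);
		}
		len += sizeof(unsigned long);
	}
}
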
diff --git a/arch/s390/include/uapi/asm/cmb.h b/arch/s390/include/uapi/asm/cmb.h
index ecbe94941403..115434ab98fb 100644
--- a/arch/s390/include/uapi/asm/cmb.h
+++ b/arch/s390/include/uapi/asm/cmb.h
@@ -31,7 +31,7 @@
struct cmbdata {
__u64 size;
__u64 elapsed_time;
- /* basic and exended format: */
+ /* basic and extended format: */
__u64 ssch_rsch_count;
__u64 sample_count;
__u64 device_connect_time;
diff --git a/arch/s390/include/uapi/asm/dasd.h b/arch/s390/include/uapi/asm/dasd.h
index 9ec86fae9980..b11d98800458 100644
--- a/arch/s390/include/uapi/asm/dasd.h
+++ b/arch/s390/include/uapi/asm/dasd.h
@@ -24,7 +24,7 @@
/*
* struct dasd_information2_t
* represents any data about the device, which is visible to userspace.
- * including foramt and featueres.
+ * including format and features.
*/
typedef struct dasd_information2_t {
unsigned int devno; /* S/390 devno */
@@ -78,6 +78,7 @@ typedef struct dasd_information2_t {
* 0x040: give access to raw eckd data
* 0x080: enable discard support
* 0x100: enable autodisable for IFCC errors (default)
+ * 0x200: enable requeue of all requests on autoquiesce
*/
#define DASD_FEATURE_READONLY 0x001
#define DASD_FEATURE_USEDIAG 0x002
@@ -88,6 +89,7 @@ typedef struct dasd_information2_t {
#define DASD_FEATURE_USERAW 0x040
#define DASD_FEATURE_DISCARD 0x080
#define DASD_FEATURE_PATH_AUTODISABLE 0x100
+#define DASD_FEATURE_REQUEUEQUIESCE 0x200
#define DASD_FEATURE_DEFAULT DASD_FEATURE_PATH_AUTODISABLE
#define DASD_PARTN_BITS 2
@@ -183,6 +185,18 @@ typedef struct format_data_t {
} format_data_t;
/*
+ * struct dasd_copypair_swap_data_t
+ * represents all data necessary to issue a swap of the copy pair relation
+ */
+struct dasd_copypair_swap_data_t {
+ char primary[20]; /* BUSID of primary */
+ char secondary[20]; /* BUSID of secondary */
+
+ /* Reserved for future updates. */
+ __u8 reserved[64];
+};
+
+/*
* values to be used for format_data_t.intensity
* 0/8: normal format
* 1/9: also write record zero
@@ -326,6 +340,8 @@ struct dasd_snid_ioctl_data {
#define BIODASDSATTR _IOW(DASD_IOCTL_LETTER,2,attrib_data_t)
/* Release Allocated Space */
#define BIODASDRAS _IOW(DASD_IOCTL_LETTER, 3, format_data_t)
+/* Swap copy pair relation */
+#define BIODASDCOPYPAIRSWAP _IOW(DASD_IOCTL_LETTER, 4, struct dasd_copypair_swap_data_t)
/* Get Sense Path Group ID (SNID) data */
#define BIODASDSNID _IOWR(DASD_IOCTL_LETTER, 1, struct dasd_snid_ioctl_data)
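
For the new BIODASDCOPYPAIRSWAP ioctl, a hedged userspace sketch; the device node handling and the bus IDs below are invented for illustration only.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <asm/dasd.h>

/* Hedged sketch: swap the copy pair relation of a DASD (bus IDs invented). */
int example_swap_copy_pair(const char *device)
{
	struct dasd_copypair_swap_data_t data;
	int fd, rc;

	memset(&data, 0, sizeof(data));
	strncpy(data.primary, "0.0.1234", sizeof(data.primary) - 1);
	strncpy(data.secondary, "0.0.5678", sizeof(data.secondary) - 1);

	fd = open(device, O_RDONLY);
	if (fd < 0)
		return -1;
	rc = ioctl(fd, BIODASDCOPYPAIRSWAP, &data);
	if (rc)
		perror("BIODASDCOPYPAIRSWAP");
	close(fd);
	return rc;
}
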
diff --git a/arch/s390/include/uapi/asm/debug.h b/arch/s390/include/uapi/asm/debug.h
deleted file mode 100644
index c7c564d9aea4..000000000000
--- a/arch/s390/include/uapi/asm/debug.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * S/390 debug facility
- *
- * Copyright IBM Corp. 1999, 2000
- */
-
-#ifndef _UAPIDEBUG_H
-#define _UAPIDEBUG_H
-
-#include <linux/fs.h>
-
-/* Note:
- * struct __debug_entry must be defined outside of #ifdef __KERNEL__
- * in order to allow a user program to analyze the 'raw'-view.
- */
-
-struct __debug_entry{
- union {
- struct {
- unsigned long long clock:52;
- unsigned long long exception:1;
- unsigned long long level:3;
- unsigned long long cpuid:8;
- } fields;
-
- unsigned long long stck;
- } id;
- void* caller;
-} __attribute__((packed));
-
-
-#define __DEBUG_FEATURE_VERSION 2 /* version of debug feature */
-
-#endif /* _UAPIDEBUG_H */
diff --git a/arch/s390/include/uapi/asm/fs3270.h b/arch/s390/include/uapi/asm/fs3270.h
new file mode 100644
index 000000000000..c4bc1108af6a
--- /dev/null
+++ b/arch/s390/include/uapi/asm/fs3270.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef __ASM_S390_UAPI_FS3270_H
+#define __ASM_S390_UAPI_FS3270_H
+
+#include <linux/types.h>
+#include <asm/ioctl.h>
+
+/* ioctls for fullscreen 3270 */
+#define TUBICMD _IO('3', 3) /* set ccw command for fs reads. */
+#define TUBOCMD _IO('3', 4) /* set ccw command for fs writes. */
+#define TUBGETI _IO('3', 7) /* get ccw command for fs reads. */
+#define TUBGETO _IO('3', 8) /* get ccw command for fs writes. */
+#define TUBGETMOD _IO('3', 13) /* get characteristics like model, cols, rows */
+
+/* For TUBGETMOD */
+struct raw3270_iocb {
+ __u16 model;
+ __u16 line_cnt;
+ __u16 col_cnt;
+ __u16 pf_cnt;
+ __u16 re_cnt;
+ __u16 map;
+};
+
+#endif /* __ASM_S390_UAPI_FS3270_H */
diff --git a/arch/s390/include/uapi/asm/hwctrset.h b/arch/s390/include/uapi/asm/hwctrset.h
new file mode 100644
index 000000000000..e56b9dd23a4b
--- /dev/null
+++ b/arch/s390/include/uapi/asm/hwctrset.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Copyright IBM Corp. 2021
+ * Interface implementation for communication with the CPU Measurement
+ * counter facility device driver.
+ *
+ * Author(s): Thomas Richter <tmricht@linux.ibm.com>
+ *
+ * Define for ioctl() commands to communicate with the CPU Measurement
+ * counter facility device driver.
+ */
+
+#ifndef _PERF_CPUM_CF_DIAG_H
+#define _PERF_CPUM_CF_DIAG_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+#define S390_HWCTR_DEVICE "hwctr"
+#define S390_HWCTR_START_VERSION 1
+
+struct s390_ctrset_start { /* Set CPUs to operate on */
+ __u64 version; /* Version of interface */
+ __u64 data_bytes; /* # of bytes required */
+ __u64 cpumask_len; /* Length of CPU mask in bytes */
+ __u64 *cpumask; /* Pointer to CPU mask */
+ __u64 counter_sets; /* Bit mask of counter sets to get */
+};
+
+struct s390_ctrset_setdata { /* Counter set data */
+ __u32 set; /* Counter set number */
+ __u32 no_cnts; /* # of counters stored in cv[] */
+ __u64 cv[]; /* Counter values (variable length) */
+};
+
+struct s390_ctrset_cpudata { /* Counter set data per CPU */
+ __u32 cpu_nr; /* CPU number */
+ __u32 no_sets; /* # of counters sets in data[] */
+ struct s390_ctrset_setdata data[];
+};
+
+struct s390_ctrset_read { /* Structure to get all ctr sets */
+ __u64 no_cpus; /* Total # of CPUs data taken from */
+ struct s390_ctrset_cpudata data[];
+};
+
+#define S390_HWCTR_MAGIC 'C' /* Random magic # for ioctls */
+#define S390_HWCTR_START _IOWR(S390_HWCTR_MAGIC, 1, struct s390_ctrset_start)
+#define S390_HWCTR_STOP _IO(S390_HWCTR_MAGIC, 2)
+#define S390_HWCTR_READ _IOWR(S390_HWCTR_MAGIC, 3, struct s390_ctrset_read)
+#endif
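
The counter-set interface above is driven from userspace through a character device. A hedged sketch follows; the /dev/hwctr node name and the counter-set bit value are assumptions for illustration.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <asm/hwctrset.h>

/* Hedged sketch: start counter-set collection on CPU 0 only. */
int example_start_counters(void)
{
	__u64 cpumask = 1;			/* bit 0: CPU 0 */
	struct s390_ctrset_start start;
	int fd, rc;

	memset(&start, 0, sizeof(start));
	start.version = S390_HWCTR_START_VERSION;
	start.cpumask_len = sizeof(cpumask);
	start.cpumask = &cpumask;
	start.counter_sets = 1;			/* assumed: first counter set */

	fd = open("/dev/" S390_HWCTR_DEVICE, O_RDWR);
	if (fd < 0)
		return -1;
	rc = ioctl(fd, S390_HWCTR_START, &start);
	if (rc)
		perror("S390_HWCTR_START");
	return rc ? -1 : fd;		/* keep fd open for S390_HWCTR_READ */
}
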
diff --git a/arch/s390/include/uapi/asm/ipl.h b/arch/s390/include/uapi/asm/ipl.h
index 451ba7d08905..2cd28af50dd4 100644
--- a/arch/s390/include/uapi/asm/ipl.h
+++ b/arch/s390/include/uapi/asm/ipl.h
@@ -27,6 +27,8 @@ enum ipl_pbt {
IPL_PBT_FCP = 0,
IPL_PBT_SCP_DATA = 1,
IPL_PBT_CCW = 2,
+ IPL_PBT_ECKD = 3,
+ IPL_PBT_NVME = 4,
};
/* IPL Parameter Block 0 with common fields */
@@ -67,6 +69,30 @@ struct ipl_pb0_fcp {
#define IPL_PB0_FCP_OPT_IPL 0x10
#define IPL_PB0_FCP_OPT_DUMP 0x20
+/* IPL Parameter Block 0 for NVMe */
+struct ipl_pb0_nvme {
+ __u32 len;
+ __u8 pbt;
+ __u8 reserved1[3];
+ __u8 loadparm[8];
+ __u8 reserved2[304];
+ __u8 opt;
+ __u8 reserved3[3];
+ __u32 fid;
+ __u8 reserved4[12];
+ __u32 nsid;
+ __u8 reserved5[4];
+ __u32 bootprog;
+ __u8 reserved6[12];
+ __u64 br_lba;
+ __u32 scp_data_len;
+ __u8 reserved7[260];
+ __u8 scp_data[];
+} __packed;
+
+#define IPL_PB0_NVME_OPT_IPL 0x10
+#define IPL_PB0_NVME_OPT_DUMP 0x20
+
/* IPL Parameter Block 0 for CCW */
struct ipl_pb0_ccw {
__u32 len;
@@ -86,6 +112,34 @@ struct ipl_pb0_ccw {
__u8 reserved5[8];
} __packed;
+/* IPL Parameter Block 0 for ECKD */
+struct ipl_pb0_eckd {
+ __u32 len;
+ __u8 pbt;
+ __u8 reserved1[3];
+ __u32 reserved2[78];
+ __u8 opt;
+ __u8 reserved4[4];
+ __u8 reserved5:5;
+ __u8 ssid:3;
+ __u16 devno;
+ __u32 reserved6[5];
+ __u32 bootprog;
+ __u8 reserved7[12];
+ struct {
+ __u16 cyl;
+ __u8 head;
+ __u8 record;
+ __u32 reserved;
+ } br_chr __packed;
+ __u32 scp_data_len;
+ __u8 reserved8[260];
+ __u8 scp_data[];
+} __packed;
+
+#define IPL_PB0_ECKD_OPT_IPL 0x10
+#define IPL_PB0_ECKD_OPT_DUMP 0x20
+
#define IPL_PB0_CCW_VM_FLAG_NSS 0x80
#define IPL_PB0_CCW_VM_FLAG_VP 0x40
diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index 436ec7636927..abe926d43cbe 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -74,6 +74,7 @@ struct kvm_s390_io_adapter_req {
#define KVM_S390_VM_CRYPTO 2
#define KVM_S390_VM_CPU_MODEL 3
#define KVM_S390_VM_MIGRATION 4
+#define KVM_S390_VM_CPU_TOPOLOGY 5
/* kvm attributes for mem_ctrl */
#define KVM_S390_VM_MEM_ENABLE_CMMA 0
@@ -158,6 +159,22 @@ struct kvm_s390_vm_cpu_subfunc {
__u8 reserved[1728];
};
+#define KVM_S390_VM_CPU_PROCESSOR_UV_FEAT_GUEST 6
+#define KVM_S390_VM_CPU_MACHINE_UV_FEAT_GUEST 7
+
+#define KVM_S390_VM_CPU_UV_FEAT_NR_BITS 64
+struct kvm_s390_vm_cpu_uv_feat {
+ union {
+ struct {
+ __u64 : 4;
+ __u64 ap : 1; /* bit 4 */
+ __u64 ap_intr : 1; /* bit 5 */
+ __u64 : 58;
+ };
+ __u64 feat;
+ };
+};
+
/* kvm attributes for crypto */
#define KVM_S390_VM_CRYPTO_ENABLE_AES_KW 0
#define KVM_S390_VM_CRYPTO_ENABLE_DEA_KW 1
@@ -231,11 +248,13 @@ struct kvm_guest_debug_arch {
#define KVM_SYNC_GSCB (1UL << 9)
#define KVM_SYNC_BPBC (1UL << 10)
#define KVM_SYNC_ETOKEN (1UL << 11)
+#define KVM_SYNC_DIAG318 (1UL << 12)
#define KVM_SYNC_S390_VALID_FIELDS \
(KVM_SYNC_PREFIX | KVM_SYNC_GPRS | KVM_SYNC_ACRS | KVM_SYNC_CRS | \
KVM_SYNC_ARCH0 | KVM_SYNC_PFAULT | KVM_SYNC_VRS | KVM_SYNC_RICCB | \
- KVM_SYNC_FPRS | KVM_SYNC_GSCB | KVM_SYNC_BPBC | KVM_SYNC_ETOKEN)
+ KVM_SYNC_FPRS | KVM_SYNC_GSCB | KVM_SYNC_BPBC | KVM_SYNC_ETOKEN | \
+ KVM_SYNC_DIAG318)
/* length and alignment of the sdnx as a power of two */
#define SDNXC 8
@@ -264,7 +283,8 @@ struct kvm_sync_regs {
__u8 reserved2 : 7;
__u8 padding1[51]; /* riccb needs to be 64byte aligned */
__u8 riccb[64]; /* runtime instrumentation controls block */
- __u8 padding2[192]; /* sdnx needs to be 256byte aligned */
+ __u64 diag318; /* diagnose 0x318 info */
+ __u8 padding2[184]; /* sdnx needs to be 256byte aligned */
union {
__u8 sdnx[SDNXL]; /* state description annex */
struct {
diff --git a/arch/s390/include/uapi/asm/pkey.h b/arch/s390/include/uapi/asm/pkey.h
index e22f0720bbb8..5ad76471e73f 100644
--- a/arch/s390/include/uapi/asm/pkey.h
+++ b/arch/s390/include/uapi/asm/pkey.h
@@ -2,7 +2,7 @@
/*
* Userspace interface to the pkey device driver
*
- * Copyright IBM Corp. 2017, 2019
+ * Copyright IBM Corp. 2017, 2023
*
* Author: Harald Freudenberger <freude@de.ibm.com>
*
@@ -25,20 +25,31 @@
#define MAXPROTKEYSIZE 64 /* a protected key blob may be up to 64 bytes */
#define MAXCLRKEYSIZE 32 /* a clear key value may be up to 32 bytes */
#define MAXAESCIPHERKEYSIZE 136 /* our aes cipher keys have always 136 bytes */
+#define MINEP11AESKEYBLOBSIZE 256 /* min EP11 AES key blob size */
+#define MAXEP11AESKEYBLOBSIZE 336 /* max EP11 AES key blob size */
-/* Minimum and maximum size of a key blob */
+/* Minimum size of a key blob */
#define MINKEYBLOBSIZE SECKEYBLOBSIZE
-#define MAXKEYBLOBSIZE MAXAESCIPHERKEYSIZE
/* defines for the type field within the pkey_protkey struct */
-#define PKEY_KEYTYPE_AES_128 1
-#define PKEY_KEYTYPE_AES_192 2
-#define PKEY_KEYTYPE_AES_256 3
+#define PKEY_KEYTYPE_AES_128 1
+#define PKEY_KEYTYPE_AES_192 2
+#define PKEY_KEYTYPE_AES_256 3
+#define PKEY_KEYTYPE_ECC 4
+#define PKEY_KEYTYPE_ECC_P256 5
+#define PKEY_KEYTYPE_ECC_P384 6
+#define PKEY_KEYTYPE_ECC_P521 7
+#define PKEY_KEYTYPE_ECC_ED25519 8
+#define PKEY_KEYTYPE_ECC_ED448 9
/* the newer ioctls use a pkey_key_type enum for type information */
enum pkey_key_type {
PKEY_TYPE_CCA_DATA = (__u32) 1,
PKEY_TYPE_CCA_CIPHER = (__u32) 2,
+ PKEY_TYPE_EP11 = (__u32) 3,
+ PKEY_TYPE_CCA_ECC = (__u32) 0x1f,
+ PKEY_TYPE_EP11_AES = (__u32) 6,
+ PKEY_TYPE_EP11_ECC = (__u32) 7,
};
/* the newer ioctls use a pkey_key_size enum for key size information */
@@ -87,6 +98,20 @@ struct pkey_clrkey {
};
/*
+ * EP11 key blobs of type PKEY_TYPE_EP11_AES and PKEY_TYPE_EP11_ECC
+ * are ep11 blobs prepended by this header:
+ */
+struct ep11kblob_header {
+ __u8 type; /* always 0x00 */
+ __u8 hver; /* header version, currently needs to be 0x00 */
+ __u16 len; /* total length in bytes (including this header) */
+ __u8 version; /* PKEY_TYPE_EP11_AES or PKEY_TYPE_EP11_ECC */
+ __u8 res0; /* unused */
+ __u16 bitlen; /* clear key bit len, 0 for unknown */
+ __u8 res1[8]; /* unused */
+} __packed;
+
+/*
* Generate CCA AES secure key.
*/
struct pkey_genseck {
@@ -151,7 +176,7 @@ struct pkey_skey2pkey {
#define PKEY_SKEY2PKEY _IOWR(PKEY_IOCTL_MAGIC, 0x06, struct pkey_skey2pkey)
/*
- * Verify the given CCA AES secure key for being able to be useable with
+ * Verify the given CCA AES secure key for being able to be usable with
* the pkey module. Check for correct key type and check for having at
* least one crypto card being able to handle this key (master key
* or old master key verification pattern matches).
@@ -200,7 +225,7 @@ struct pkey_kblob2pkey {
/*
* Generate secure key, version 2.
- * Generate either a CCA AES secure key or a CCA AES cipher key.
+ * Generate CCA AES secure key, CCA AES cipher key or EP11 AES secure key.
* There needs to be a list of apqns given with at least one entry in there.
* All apqns in the list need to be exact apqns, 0xFFFF as ANY card or domain
* is not supported. The implementation walks through the list of apqns and
@@ -210,10 +235,13 @@ struct pkey_kblob2pkey {
* (return -1 with errno ENODEV). You may use the PKEY_APQNS4KT ioctl to
* generate a list of apqns based on the key type to generate.
* The keygenflags argument is passed to the low level generation functions
- * individual for the key type and has a key type specific meaning. Currently
- * only CCA AES cipher keys react to this parameter: Use one or more of the
- * PKEY_KEYGEN_* flags to widen the export possibilities. By default a cipher
- * key is only exportable for CPACF (PKEY_KEYGEN_XPRT_CPAC).
+ * individual for the key type and has a key type specific meaning. When
+ * generating CCA cipher keys you can use one or more of the PKEY_KEYGEN_*
+ * flags to widen the export possibilities. By default a cipher key is
+ * only exportable for CPACF (PKEY_KEYGEN_XPRT_CPAC).
+ * The keygenflag argument for generating an EP11 AES key should either be 0
+ * to use the defaults which are XCP_BLOB_ENCRYPT, XCP_BLOB_DECRYPT and
+ * XCP_BLOB_PROTKEY_EXTRACTABLE or a valid combination of XCP_BLOB_* flags.
*/
struct pkey_genseck2 {
struct pkey_apqn __user *apqns; /* in: ptr to list of apqn targets*/
@@ -229,8 +257,8 @@ struct pkey_genseck2 {
/*
* Generate secure key from clear key value, version 2.
- * Construct a CCA AES secure key or CCA AES cipher key from a given clear key
- * value.
+ * Construct a CCA AES secure key, CCA AES cipher key or EP11 AES secure
+ * key from a given clear key value.
* There needs to be a list of apqns given with at least one entry in there.
* All apqns in the list need to be exact apqns, 0xFFFF as ANY card or domain
* is not supported. The implementation walks through the list of apqns and
@@ -240,10 +268,13 @@ struct pkey_genseck2 {
* (return -1 with errno ENODEV). You may use the PKEY_APQNS4KT ioctl to
* generate a list of apqns based on the key type to generate.
* The keygenflags argument is passed to the low level generation functions
- * individual for the key type and has a key type specific meaning. Currently
- * only CCA AES cipher keys react to this parameter: Use one or more of the
- * PKEY_KEYGEN_* flags to widen the export possibilities. By default a cipher
- * key is only exportable for CPACF (PKEY_KEYGEN_XPRT_CPAC).
+ * individual for the key type and has a key type specific meaning. When
+ * generating CCA cipher keys you can use one or more of the PKEY_KEYGEN_*
+ * flags to widen the export possibilities. By default a cipher key is
+ * only exportable for CPACF (PKEY_KEYGEN_XPRT_CPAC).
+ * The keygenflag argument for generating an EP11 AES key should either be 0
+ * to use the defaults which are XCP_BLOB_ENCRYPT, XCP_BLOB_DECRYPT and
+ * XCP_BLOB_PROTKEY_EXTRACTABLE or a valid combination of XCP_BLOB_* flags.
*/
struct pkey_clr2seck2 {
struct pkey_apqn __user *apqns; /* in: ptr to list of apqn targets */
@@ -266,14 +297,19 @@ struct pkey_clr2seck2 {
* with one apqn able to handle this key.
* The function also checks for the master key verification patterns
* of the key matching to the current or alternate mkvp of the apqn.
- * Currently CCA AES secure keys and CCA AES cipher keys are supported.
- * The flags field is updated with some additional info about the apqn mkvp
+ * For CCA AES secure keys and CCA AES cipher keys this means checking
+ * the key's mkvp against the current or old mkvp of the apqns. The flags
+ * field is updated with some additional info about the apqn mkvp
* match: If the current mkvp matches to the key's mkvp then the
* PKEY_FLAGS_MATCH_CUR_MKVP bit is set, if the alternate mkvp matches to
* the key's mkvp the PKEY_FLAGS_MATCH_ALT_MKVP is set. For CCA keys the
* alternate mkvp is the old master key verification pattern.
* CCA AES secure keys are also checked to have the CPACF export allowed
* bit enabled (XPRTCPAC) in the kmf1 field.
+ * EP11 keys are also supported and the wkvp of the key is checked against
+ * the current wkvp of the apqns. There is no alternate wkvp for this type
+ * of key and so on a match the flag PKEY_FLAGS_MATCH_CUR_MKVP is always set.
+ * EP11 keys are also checked to have XCP_BLOB_PROTKEY_EXTRACTABLE set.
* The ioctl returns 0 as long as the given or found apqn matches with
* the current or alternate mkvp to the key's mkvp. If the given
* apqn does not match or there is no such apqn found, -1 with errno
@@ -291,7 +327,7 @@ struct pkey_verifykey2 {
#define PKEY_VERIFYKEY2 _IOWR(PKEY_IOCTL_MAGIC, 0x17, struct pkey_verifykey2)
/*
- * Transform a key blob (of any type) into a protected key, version 2.
+ * Transform a key blob into a protected key, version 2.
* There needs to be a list of apqns given with at least one entry in there.
* All apqns in the list need to be exact apqns, 0xFFFF as ANY card or domain
* is not supported. The implementation walks through the list of apqns and
@@ -300,6 +336,8 @@ struct pkey_verifykey2 {
* list is tried until success (return 0) or the end of the list is reached
* (return -1 with errno ENODEV). You may use the PKEY_APQNS4K ioctl to
* generate a list of apqns based on the key.
+ * Deriving ECC protected keys from ECC secure keys is not supported with
+ * this ioctl, use PKEY_KBLOB2PROTK3 for this purpose.
*/
struct pkey_kblob2pkey2 {
__u8 __user *key; /* in: pointer to key blob */
@@ -313,22 +351,26 @@ struct pkey_kblob2pkey2 {
/*
* Build a list of APQNs based on a key blob given.
* Is able to find out which type of secure key is given (CCA AES secure
- * key or CCA AES cipher key) and tries to find all matching crypto cards
- * based on the MKVP and maybe other criterias (like CCA AES cipher keys
- * need a CEX5C or higher). The list of APQNs is further filtered by the key's
- * mkvp which needs to match to either the current mkvp or the alternate mkvp
- * (which is the old mkvp on CCA adapters) of the apqns. The flags argument may
- * be used to limit the matching apqns. If the PKEY_FLAGS_MATCH_CUR_MKVP is
- * given, only the current mkvp of each apqn is compared. Likewise with the
- * PKEY_FLAGS_MATCH_ALT_MKVP. If both are given, it is assumed to
- * return apqns where either the current or the alternate mkvp
- * matches. At least one of the matching flags needs to be given.
+ * key, CCA AES cipher key, CCA ECC private key, EP11 AES key, EP11 ECC private
+ * key) and tries to find all matching crypto cards based on the MKVP and maybe
+ * other criteria (like CCA AES cipher keys need a CEX5C or higher, EP11 keys
+ * with BLOB_PKEY_EXTRACTABLE need a CEX7 and EP11 api version 4). The list of
+ * APQNs is further filtered by the key's mkvp which needs to match to either
+ * the current mkvp (CCA and EP11) or the alternate mkvp (old mkvp, CCA adapters
+ * only) of the apqns. The flags argument may be used to limit the matching
+ * apqns. If the PKEY_FLAGS_MATCH_CUR_MKVP is given, only the current mkvp of
+ * each apqn is compared. Likewise with the PKEY_FLAGS_MATCH_ALT_MKVP. If both
+ * are given, it is assumed to return apqns where either the current or the
+ * alternate mkvp matches. At least one of the matching flags needs to be given.
+ * The flags argument for EP11 keys has no further effect and is currently
+ * ignored (but needs to be given as PKEY_FLAGS_MATCH_CUR_MKVP), as there is
+ * only the wkvp from the key to match against the apqn's wkvp.
* The list of matching apqns is stored into the space given by the apqns
* argument and the number of stored entries goes into apqn_entries. If the list
* is empty (apqn_entries is 0) the apqn_entries field is updated to the number
* of apqn targets found and the ioctl returns with 0. If apqn_entries is > 0
* but the number of apqn targets does not fit into the list, the apqn_targets
- * field is updatedd with the number of reqired entries but there are no apqn
+ * field is updated with the number of required entries but there are no apqn
* values stored in the list and the ioctl returns with ENOSPC. If no matching
* APQN is found, the ioctl returns with 0 but the apqn_entries value is 0.
*/
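As a side note on the two-call pattern described in the comment above (query the required number of entries first, then fetch the list), a hedged userspace sketch might look as follows. The member names of struct pkey_apqns4key (key, keylen, flags, apqns, apqn_entries) are assumed from the documentation text, pkeyfd is assumed to be an open descriptor of the pkey misc device, and error handling is reduced to the minimum:

#include <stdlib.h>
#include <sys/ioctl.h>
#include <asm/pkey.h>

/* Return a newly allocated list of APQNs usable with the given key blob,
 * or NULL; *n receives the number of valid entries. */
static struct pkey_apqn *apqns_for_key(int pkeyfd, unsigned char *blob,
				       unsigned int bloblen, unsigned int *n)
{
	struct pkey_apqns4key a4k = {0};
	struct pkey_apqn *list;

	a4k.key = blob;
	a4k.keylen = bloblen;
	a4k.flags = PKEY_FLAGS_MATCH_CUR_MKVP | PKEY_FLAGS_MATCH_ALT_MKVP;
	a4k.apqns = NULL;
	a4k.apqn_entries = 0;		/* first call: only ask for the count */
	if (ioctl(pkeyfd, PKEY_APQNS4K, &a4k) || !a4k.apqn_entries)
		return NULL;
	list = calloc(a4k.apqn_entries, sizeof(*list));
	a4k.apqns = list;		/* second call: fetch the entries */
	if (ioctl(pkeyfd, PKEY_APQNS4K, &a4k)) {
		free(list);
		return NULL;
	}
	*n = a4k.apqn_entries;
	return list;
}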
@@ -348,20 +390,25 @@ struct pkey_apqns4key {
* restrict the list by given master key verification patterns.
* For different key types there may be different ways to match the
* master key verification patterns. For CCA keys (CCA data key and CCA
- * cipher key) the first 8 bytes of cur_mkvp refer to the current mkvp value
- * of the apqn and the first 8 bytes of the alt_mkvp refer to the old mkvp.
- * The flags argument controls if the apqns current and/or alternate mkvp
+ * cipher key) the first 8 bytes of cur_mkvp refer to the current AES mkvp value
+ * of the apqn and the first 8 bytes of the alt_mkvp refer to the old AES mkvp.
+ * For CCA ECC keys it is similar but the match is against the APKA current/old
+ * mkvp. The flags argument controls if the apqns current and/or alternate mkvp
* should match. If the PKEY_FLAGS_MATCH_CUR_MKVP is given, only the current
* mkvp of each apqn is compared. Likewise with the PKEY_FLAGS_MATCH_ALT_MKVP.
* If both are given, it is assumed to return apqns where either the
* current or the alternate mkvp matches. If no match flag is given
* (flags is 0) the mkvp values are ignored for the match process.
+ * For EP11 keys there is only the current wkvp. So if the apqns should also
+ * match to a given wkvp, then the PKEY_FLAGS_MATCH_CUR_MKVP flag should be
+ * set. The wkvp value is 32 bytes but only the leftmost 16 bytes are compared
+ * against the leftmost 16 bytes of the wkvp of the apqn.
* The list of matching apqns is stored into the space given by the apqns
* argument and the number of stored entries goes into apqn_entries. If the list
* is empty (apqn_entries is 0) the apqn_entries field is updated to the number
* of apqn targets found and the ioctl returns with 0. If apqn_entries is > 0
* but the number of apqn targets does not fit into the list, the apqn_targets
- * field is updatedd with the number of reqired entries but there are no apqn
+ * field is updated with the number of required entries but there are no apqn
* values stored in the list and the ioctl returns with ENOSPC. If no matching
* APQN is found, the ioctl returns with 0 but the apqn_entries value is 0.
*/
@@ -376,4 +423,30 @@ struct pkey_apqns4keytype {
};
#define PKEY_APQNS4KT _IOWR(PKEY_IOCTL_MAGIC, 0x1C, struct pkey_apqns4keytype)
+/*
+ * Transform a key blob into a protected key, version 3.
+ * The difference to version 2 of this ioctl is that the protected key
+ * buffer is now given explicitly and is no longer embedded in a struct
+ * pkey_protkey. So this ioctl is also able to handle EP11 and CCA ECC
+ * secure keys and provide ECC protected keys.
+ * There needs to be a list of apqns given with at least one entry in there.
+ * All apqns in the list need to be exact apqns, 0xFFFF as ANY card or domain
+ * is not supported. The implementation walks through the list of apqns and
+ * tries to send the request to each apqn without any further checking (like
+ * card type or online state). If the apqn fails, simply the next one in the
+ * list is tried until success (return 0) or the end of the list is reached
+ * (return -1 with errno ENODEV). You may use the PKEY_APQNS4K ioctl to
+ * generate a list of apqns based on the key.
+ */
+struct pkey_kblob2pkey3 {
+ __u8 __user *key; /* in: pointer to key blob */
+ __u32 keylen; /* in: key blob size */
+ struct pkey_apqn __user *apqns; /* in: ptr to list of apqn targets */
+ __u32 apqn_entries; /* in: # of apqn target list entries */
+ __u32 pkeytype; /* out: prot key type (enum pkey_key_type) */
+ __u32 pkeylen; /* in/out: size of pkey buffer/actual len of pkey */
+ __u8 __user *pkey; /* in: pkey blob buffer space ptr */
+};
+#define PKEY_KBLOB2PROTK3 _IOWR(PKEY_IOCTL_MAGIC, 0x1D, struct pkey_kblob2pkey3)
+
#endif /* _UAPI_PKEY_H */
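For orientation, a hedged sketch of how the new PKEY_KBLOB2PROTK3 ioctl is meant to be driven from userspace, based only on the struct layout shown above. The two-member layout of struct pkey_apqn (card, domain), the device descriptor pkeyfd and the card/domain values are assumptions; error handling is omitted:

#include <sys/ioctl.h>
#include <asm/pkey.h>

/* Derive a protected key from a CCA or EP11 secure key blob. */
static int blob_to_protkey3(int pkeyfd, unsigned char *blob, unsigned int bloblen,
			    unsigned char *pkeybuf, unsigned int *pkeylen,
			    unsigned int *pkeytype)
{
	struct pkey_apqn target = { .card = 0x02, .domain = 0x0031 };
	struct pkey_kblob2pkey3 k2p = {0};

	k2p.key = blob;			/* secure key blob */
	k2p.keylen = bloblen;
	k2p.apqns = &target;		/* exact APQN, 0xFFFF wildcards not allowed */
	k2p.apqn_entries = 1;
	k2p.pkey = pkeybuf;		/* caller provided protected key buffer */
	k2p.pkeylen = *pkeylen;		/* in: size of that buffer */
	if (ioctl(pkeyfd, PKEY_KBLOB2PROTK3, &k2p))
		return -1;
	*pkeylen = k2p.pkeylen;		/* out: actual protected key length */
	*pkeytype = k2p.pkeytype;	/* one of the pkey_key_type values */
	return 0;
}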
diff --git a/arch/s390/include/uapi/asm/ptrace.h b/arch/s390/include/uapi/asm/ptrace.h
index 543dd70e12c8..bb0826024bb9 100644
--- a/arch/s390/include/uapi/asm/ptrace.h
+++ b/arch/s390/include/uapi/asm/ptrace.h
@@ -8,6 +8,8 @@
#ifndef _UAPI_S390_PTRACE_H
#define _UAPI_S390_PTRACE_H
+#include <linux/const.h>
+
/*
* Offsets in the user_regs_struct. They are used for the ptrace
* system call and in entry.S
@@ -166,6 +168,64 @@
#endif /* __s390x__ */
+#ifndef __s390x__
+
+#define PSW_MASK_PER _AC(0x40000000, UL)
+#define PSW_MASK_DAT _AC(0x04000000, UL)
+#define PSW_MASK_IO _AC(0x02000000, UL)
+#define PSW_MASK_EXT _AC(0x01000000, UL)
+#define PSW_MASK_KEY _AC(0x00F00000, UL)
+#define PSW_MASK_BASE _AC(0x00080000, UL) /* always one */
+#define PSW_MASK_MCHECK _AC(0x00040000, UL)
+#define PSW_MASK_WAIT _AC(0x00020000, UL)
+#define PSW_MASK_PSTATE _AC(0x00010000, UL)
+#define PSW_MASK_ASC _AC(0x0000C000, UL)
+#define PSW_MASK_CC _AC(0x00003000, UL)
+#define PSW_MASK_PM _AC(0x00000F00, UL)
+#define PSW_MASK_RI _AC(0x00000000, UL)
+#define PSW_MASK_EA _AC(0x00000000, UL)
+#define PSW_MASK_BA _AC(0x00000000, UL)
+
+#define PSW_MASK_USER _AC(0x0000FF00, UL)
+
+#define PSW_ADDR_AMODE _AC(0x80000000, UL)
+#define PSW_ADDR_INSN _AC(0x7FFFFFFF, UL)
+
+#define PSW_ASC_PRIMARY _AC(0x00000000, UL)
+#define PSW_ASC_ACCREG _AC(0x00004000, UL)
+#define PSW_ASC_SECONDARY _AC(0x00008000, UL)
+#define PSW_ASC_HOME _AC(0x0000C000, UL)
+
+#else /* __s390x__ */
+
+#define PSW_MASK_PER _AC(0x4000000000000000, UL)
+#define PSW_MASK_DAT _AC(0x0400000000000000, UL)
+#define PSW_MASK_IO _AC(0x0200000000000000, UL)
+#define PSW_MASK_EXT _AC(0x0100000000000000, UL)
+#define PSW_MASK_BASE _AC(0x0000000000000000, UL)
+#define PSW_MASK_KEY _AC(0x00F0000000000000, UL)
+#define PSW_MASK_MCHECK _AC(0x0004000000000000, UL)
+#define PSW_MASK_WAIT _AC(0x0002000000000000, UL)
+#define PSW_MASK_PSTATE _AC(0x0001000000000000, UL)
+#define PSW_MASK_ASC _AC(0x0000C00000000000, UL)
+#define PSW_MASK_CC _AC(0x0000300000000000, UL)
+#define PSW_MASK_PM _AC(0x00000F0000000000, UL)
+#define PSW_MASK_RI _AC(0x0000008000000000, UL)
+#define PSW_MASK_EA _AC(0x0000000100000000, UL)
+#define PSW_MASK_BA _AC(0x0000000080000000, UL)
+
+#define PSW_MASK_USER _AC(0x0000FF0180000000, UL)
+
+#define PSW_ADDR_AMODE _AC(0x0000000000000000, UL)
+#define PSW_ADDR_INSN _AC(0xFFFFFFFFFFFFFFFF, UL)
+
+#define PSW_ASC_PRIMARY _AC(0x0000000000000000, UL)
+#define PSW_ASC_ACCREG _AC(0x0000400000000000, UL)
+#define PSW_ASC_SECONDARY _AC(0x0000800000000000, UL)
+#define PSW_ASC_HOME _AC(0x0000C00000000000, UL)
+
+#endif /* __s390x__ */
+
#define NUM_GPRS 16
#define NUM_FPRS 16
#define NUM_CRS 16
@@ -179,8 +239,9 @@
#define ACR_SIZE 4
-#define PTRACE_OLDSETOPTIONS 21
-
+#define PTRACE_OLDSETOPTIONS 21
+#define PTRACE_SYSEMU 31
+#define PTRACE_SYSEMU_SINGLESTEP 32
#ifndef __ASSEMBLY__
#include <linux/stddef.h>
#include <linux/types.h>
@@ -213,69 +274,6 @@ typedef struct {
unsigned long addr;
} __attribute__ ((aligned(8))) psw_t;
-#ifndef __s390x__
-
-#define PSW_MASK_PER 0x40000000UL
-#define PSW_MASK_DAT 0x04000000UL
-#define PSW_MASK_IO 0x02000000UL
-#define PSW_MASK_EXT 0x01000000UL
-#define PSW_MASK_KEY 0x00F00000UL
-#define PSW_MASK_BASE 0x00080000UL /* always one */
-#define PSW_MASK_MCHECK 0x00040000UL
-#define PSW_MASK_WAIT 0x00020000UL
-#define PSW_MASK_PSTATE 0x00010000UL
-#define PSW_MASK_ASC 0x0000C000UL
-#define PSW_MASK_CC 0x00003000UL
-#define PSW_MASK_PM 0x00000F00UL
-#define PSW_MASK_RI 0x00000000UL
-#define PSW_MASK_EA 0x00000000UL
-#define PSW_MASK_BA 0x00000000UL
-
-#define PSW_MASK_USER 0x0000FF00UL
-
-#define PSW_ADDR_AMODE 0x80000000UL
-#define PSW_ADDR_INSN 0x7FFFFFFFUL
-
-#define PSW_DEFAULT_KEY (((unsigned long) PAGE_DEFAULT_ACC) << 20)
-
-#define PSW_ASC_PRIMARY 0x00000000UL
-#define PSW_ASC_ACCREG 0x00004000UL
-#define PSW_ASC_SECONDARY 0x00008000UL
-#define PSW_ASC_HOME 0x0000C000UL
-
-#else /* __s390x__ */
-
-#define PSW_MASK_PER 0x4000000000000000UL
-#define PSW_MASK_DAT 0x0400000000000000UL
-#define PSW_MASK_IO 0x0200000000000000UL
-#define PSW_MASK_EXT 0x0100000000000000UL
-#define PSW_MASK_BASE 0x0000000000000000UL
-#define PSW_MASK_KEY 0x00F0000000000000UL
-#define PSW_MASK_MCHECK 0x0004000000000000UL
-#define PSW_MASK_WAIT 0x0002000000000000UL
-#define PSW_MASK_PSTATE 0x0001000000000000UL
-#define PSW_MASK_ASC 0x0000C00000000000UL
-#define PSW_MASK_CC 0x0000300000000000UL
-#define PSW_MASK_PM 0x00000F0000000000UL
-#define PSW_MASK_RI 0x0000008000000000UL
-#define PSW_MASK_EA 0x0000000100000000UL
-#define PSW_MASK_BA 0x0000000080000000UL
-
-#define PSW_MASK_USER 0x0000FF0180000000UL
-
-#define PSW_ADDR_AMODE 0x0000000000000000UL
-#define PSW_ADDR_INSN 0xFFFFFFFFFFFFFFFFUL
-
-#define PSW_DEFAULT_KEY (((unsigned long) PAGE_DEFAULT_ACC) << 52)
-
-#define PSW_ASC_PRIMARY 0x0000000000000000UL
-#define PSW_ASC_ACCREG 0x0000400000000000UL
-#define PSW_ASC_SECONDARY 0x0000800000000000UL
-#define PSW_ASC_HOME 0x0000C00000000000UL
-
-#endif /* __s390x__ */
-
-
/*
* The s390_regs structure is used to define the elf_gregset_t.
*/
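Moving the PSW constants outside the #ifndef __ASSEMBLY__ block works because _AC() from <linux/const.h> drops the type suffix when the header is pulled into assembler sources. Simplified, the helper looks like this, so C code still sees properly typed constants while .S files see bare numbers:

/* include/uapi/linux/const.h (simplified) */
#ifdef __ASSEMBLY__
#define _AC(X, Y)	X		/* assembler: PSW_MASK_DAT -> 0x0400000000000000 */
#else
#define __AC(X, Y)	(X##Y)
#define _AC(X, Y)	__AC(X, Y)	/* C: PSW_MASK_DAT -> (0x0400000000000000UL) */
#endif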
diff --git a/arch/s390/include/uapi/asm/raw3270.h b/arch/s390/include/uapi/asm/raw3270.h
new file mode 100644
index 000000000000..6676f102bd50
--- /dev/null
+++ b/arch/s390/include/uapi/asm/raw3270.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef __ASM_S390_UAPI_RAW3270_H
+#define __ASM_S390_UAPI_RAW3270_H
+
+/* Local Channel Commands */
+#define TC_WRITE 0x01 /* Write */
+#define TC_RDBUF 0x02 /* Read Buffer */
+#define TC_EWRITE 0x05 /* Erase write */
+#define TC_READMOD 0x06 /* Read modified */
+#define TC_EWRITEA 0x0d /* Erase write alternate */
+#define TC_WRITESF 0x11 /* Write structured field */
+
+/* Buffer Control Orders */
+#define TO_GE 0x08 /* Graphics Escape */
+#define TO_SF 0x1d /* Start field */
+#define TO_SBA 0x11 /* Set buffer address */
+#define TO_IC 0x13 /* Insert cursor */
+#define TO_PT 0x05 /* Program tab */
+#define TO_RA 0x3c /* Repeat to address */
+#define TO_SFE 0x29 /* Start field extended */
+#define TO_EUA 0x12 /* Erase unprotected to address */
+#define TO_MF 0x2c /* Modify field */
+#define TO_SA 0x28 /* Set attribute */
+
+/* Field Attribute Bytes */
+#define TF_INPUT 0x40 /* Visible input */
+#define TF_INPUTN 0x4c /* Invisible input */
+#define TF_INMDT 0xc1 /* Visible, Set-MDT */
+#define TF_LOG 0x60
+
+/* Character Attribute Bytes */
+#define TAT_RESET 0x00
+#define TAT_FIELD 0xc0
+#define TAT_EXTHI 0x41
+#define TAT_FGCOLOR 0x42
+#define TAT_CHARS 0x43
+#define TAT_BGCOLOR 0x45
+#define TAT_TRANS 0x46
+
+/* Extended-Highlighting Bytes */
+#define TAX_RESET 0x00
+#define TAX_BLINK 0xf1
+#define TAX_REVER 0xf2
+#define TAX_UNDER 0xf4
+
+/* Reset value */
+#define TAR_RESET 0x00
+
+/* Color values */
+#define TAC_RESET 0x00
+#define TAC_BLUE 0xf1
+#define TAC_RED 0xf2
+#define TAC_PINK 0xf3
+#define TAC_GREEN 0xf4
+#define TAC_TURQ 0xf5
+#define TAC_YELLOW 0xf6
+#define TAC_WHITE 0xf7
+#define TAC_DEFAULT 0x00
+
+/* Write Control Characters */
+#define TW_NONE 0x40 /* No particular action */
+#define TW_KR 0xc2 /* Keyboard restore */
+#define TW_PLUSALARM 0x04 /* Add this bit for alarm */
+
+#define RAW3270_FIRSTMINOR 1 /* First minor number */
+#define RAW3270_MAXDEVS 255 /* Max number of 3270 devices */
+
+#define AID_CLEAR 0x6d
+#define AID_ENTER 0x7d
+#define AID_PF3 0xf3
+#define AID_PF7 0xf7
+#define AID_PF8 0xf8
+#define AID_READ_PARTITION 0x88
+
+#endif /* __ASM_S390_UAPI_RAW3270_H */
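To show how these orders are typically combined, here is a hedged sketch of a minimal 3270 output data stream as it could be sent with a TC_EWRITE channel command. The two bytes after TO_SBA carry the 3270-encoded buffer address and are placeholders; the text bytes are "HELLO" in EBCDIC:

#include <asm/raw3270.h>

static const unsigned char hello_stream[] = {
	TW_KR,			/* write control character: restore keyboard */
	TO_SBA, 0x40, 0x40,	/* set buffer address (encoded address 0) */
	TO_SF, TF_LOG,		/* start an output field */
	0xc8, 0xc5, 0xd3, 0xd3, 0xd6,	/* H E L L O */
	TO_IC,			/* leave the cursor after the text */
};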
diff --git a/arch/s390/include/uapi/asm/schid.h b/arch/s390/include/uapi/asm/schid.h
index 58fca6f48410..a3e1cf168553 100644
--- a/arch/s390/include/uapi/asm/schid.h
+++ b/arch/s390/include/uapi/asm/schid.h
@@ -4,6 +4,8 @@
#include <linux/types.h>
+#ifndef __ASSEMBLY__
+
struct subchannel_id {
__u32 cssid : 8;
__u32 : 4;
@@ -13,5 +15,6 @@ struct subchannel_id {
__u32 sch_no : 16;
} __attribute__ ((packed, aligned(4)));
+#endif /* __ASSEMBLY__ */
#endif /* _UAPIASM_SCHID_H */
diff --git a/arch/s390/include/uapi/asm/setup.h b/arch/s390/include/uapi/asm/setup.h
index 1f8803a31079..598d769e76df 100644
--- a/arch/s390/include/uapi/asm/setup.h
+++ b/arch/s390/include/uapi/asm/setup.h
@@ -1,14 +1 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * S390 version
- * Copyright IBM Corp. 1999, 2010
- */
-
-#ifndef _UAPI_ASM_S390_SETUP_H
-#define _UAPI_ASM_S390_SETUP_H
-
-#define COMMAND_LINE_SIZE 4096
-
-#define ARCH_COMMAND_LINE_SIZE 896
-
-#endif /* _UAPI_ASM_S390_SETUP_H */
diff --git a/arch/s390/include/uapi/asm/sie.h b/arch/s390/include/uapi/asm/sie.h
index 6ca1e68d7103..ede318653c87 100644
--- a/arch/s390/include/uapi/asm/sie.h
+++ b/arch/s390/include/uapi/asm/sie.h
@@ -29,7 +29,7 @@
{ 0x13, "SIGP conditional emergency signal" }, \
{ 0x15, "SIGP sense running" }, \
{ 0x16, "SIGP set multithreading"}, \
- { 0x17, "SIGP store additional status ait address"}
+ { 0x17, "SIGP store additional status at address"}
#define icpt_prog_codes \
{ 0x0001, "Prog Operation" }, \
diff --git a/arch/s390/include/uapi/asm/signal.h b/arch/s390/include/uapi/asm/signal.h
index 9a14a611ed82..e74d6ba1bd3b 100644
--- a/arch/s390/include/uapi/asm/signal.h
+++ b/arch/s390/include/uapi/asm/signal.h
@@ -65,30 +65,6 @@ typedef unsigned long sigset_t;
#define SIGRTMIN 32
#define SIGRTMAX _NSIG
-/*
- * SA_FLAGS values:
- *
- * SA_ONSTACK indicates that a registered stack_t will be used.
- * SA_RESTART flag to get restarting signals (which were the default long ago)
- * SA_NOCLDSTOP flag to turn off SIGCHLD when children stop.
- * SA_RESETHAND clears the handler when the signal is delivered.
- * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies.
- * SA_NODEFER prevents the current signal from being masked in the handler.
- *
- * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single
- * Unix names RESETHAND and NODEFER respectively.
- */
-#define SA_NOCLDSTOP 0x00000001
-#define SA_NOCLDWAIT 0x00000002
-#define SA_SIGINFO 0x00000004
-#define SA_ONSTACK 0x08000000
-#define SA_RESTART 0x10000000
-#define SA_NODEFER 0x40000000
-#define SA_RESETHAND 0x80000000
-
-#define SA_NOMASK SA_NODEFER
-#define SA_ONESHOT SA_RESETHAND
-
#define SA_RESTORER 0x04000000
#define MINSIGSTKSZ 2048
@@ -132,7 +108,7 @@ struct sigaction {
typedef struct sigaltstack {
void __user *ss_sp;
int ss_flags;
- size_t ss_size;
+ __kernel_size_t ss_size;
} stack_t;
diff --git a/arch/s390/include/uapi/asm/statfs.h b/arch/s390/include/uapi/asm/statfs.h
index 72604f7792c3..f85b50723dd3 100644
--- a/arch/s390/include/uapi/asm/statfs.h
+++ b/arch/s390/include/uapi/asm/statfs.h
@@ -30,7 +30,7 @@ struct statfs {
unsigned int f_namelen;
unsigned int f_frsize;
unsigned int f_flags;
- unsigned int f_spare[4];
+ unsigned int f_spare[5];
};
struct statfs64 {
@@ -45,7 +45,7 @@ struct statfs64 {
unsigned int f_namelen;
unsigned int f_frsize;
unsigned int f_flags;
- unsigned int f_spare[4];
+ unsigned int f_spare[5];
};
#endif
diff --git a/arch/s390/include/uapi/asm/termios.h b/arch/s390/include/uapi/asm/termios.h
deleted file mode 100644
index 54223169c806..000000000000
--- a/arch/s390/include/uapi/asm/termios.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * S390 version
- *
- * Derived from "include/asm-i386/termios.h"
- */
-
-#ifndef _UAPI_S390_TERMIOS_H
-#define _UAPI_S390_TERMIOS_H
-
-#include <asm/termbits.h>
-#include <asm/ioctls.h>
-
-struct winsize {
- unsigned short ws_row;
- unsigned short ws_col;
- unsigned short ws_xpixel;
- unsigned short ws_ypixel;
-};
-
-#define NCC 8
-struct termio {
- unsigned short c_iflag; /* input mode flags */
- unsigned short c_oflag; /* output mode flags */
- unsigned short c_cflag; /* control mode flags */
- unsigned short c_lflag; /* local mode flags */
- unsigned char c_line; /* line discipline */
- unsigned char c_cc[NCC]; /* control characters */
-};
-
-/* modem lines */
-#define TIOCM_LE 0x001
-#define TIOCM_DTR 0x002
-#define TIOCM_RTS 0x004
-#define TIOCM_ST 0x008
-#define TIOCM_SR 0x010
-#define TIOCM_CTS 0x020
-#define TIOCM_CAR 0x040
-#define TIOCM_RNG 0x080
-#define TIOCM_DSR 0x100
-#define TIOCM_CD TIOCM_CAR
-#define TIOCM_RI TIOCM_RNG
-#define TIOCM_OUT1 0x2000
-#define TIOCM_OUT2 0x4000
-#define TIOCM_LOOP 0x8000
-
-/* ioctl (fd, TIOCSERGETLSR, &result) where result may be as below */
-
-
-#endif /* _UAPI_S390_TERMIOS_H */
diff --git a/arch/s390/include/uapi/asm/types.h b/arch/s390/include/uapi/asm/types.h
index da034c606314..84457dbb26b4 100644
--- a/arch/s390/include/uapi/asm/types.h
+++ b/arch/s390/include/uapi/asm/types.h
@@ -12,15 +12,18 @@
#ifndef __ASSEMBLY__
-/* A address type so that arithmetic can be done on it & it can be upgraded to
- 64 bit when necessary
-*/
-typedef unsigned long addr_t;
+typedef unsigned long addr_t;
typedef __signed__ long saddr_t;
typedef struct {
- __u32 u[4];
-} __vector128;
+ union {
+ struct {
+ __u64 high;
+ __u64 low;
+ };
+ __u32 u[4];
+ };
+} __attribute__((packed, aligned(4))) __vector128;
#endif /* __ASSEMBLY__ */
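A small illustration of the reworked __vector128: both views address the same 16 bytes, and on big-endian s390 the first two 32-bit words overlay the "high" half while the last two overlay the "low" half:

#include <asm/types.h>

static const __vector128 v = {
	.u = { 0x00112233, 0x44556677, 0x8899aabb, 0xccddeeff },
};
/* v.high == 0x0011223344556677  and  v.low == 0x8899aabbccddeeff */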
diff --git a/arch/s390/include/uapi/asm/uvdevice.h b/arch/s390/include/uapi/asm/uvdevice.h
new file mode 100644
index 000000000000..b9c2f14a6af3
--- /dev/null
+++ b/arch/s390/include/uapi/asm/uvdevice.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Copyright IBM Corp. 2022
+ * Author(s): Steffen Eiden <seiden@linux.ibm.com>
+ */
+#ifndef __S390_ASM_UVDEVICE_H
+#define __S390_ASM_UVDEVICE_H
+
+#include <linux/types.h>
+
+struct uvio_ioctl_cb {
+ __u32 flags;
+ __u16 uv_rc; /* UV header rc value */
+ __u16 uv_rrc; /* UV header rrc value */
+ __u64 argument_addr; /* Userspace address of uvio argument */
+ __u32 argument_len;
+ __u8 reserved14[0x40 - 0x14]; /* must be zero */
+};
+
+#define UVIO_ATT_USER_DATA_LEN 0x100
+#define UVIO_ATT_UID_LEN 0x10
+struct uvio_attest {
+ __u64 arcb_addr; /* 0x0000 */
+ __u64 meas_addr; /* 0x0008 */
+ __u64 add_data_addr; /* 0x0010 */
+ __u8 user_data[UVIO_ATT_USER_DATA_LEN]; /* 0x0018 */
+ __u8 config_uid[UVIO_ATT_UID_LEN]; /* 0x0118 */
+ __u32 arcb_len; /* 0x0128 */
+ __u32 meas_len; /* 0x012c */
+ __u32 add_data_len; /* 0x0130 */
+ __u16 user_data_len; /* 0x0134 */
+ __u16 reserved136; /* 0x0136 */
+};
+
+/**
+ * uvio_uvdev_info - Information about supported functions
+ * @supp_uvio_cmds - supported IOCTLs by this device
+ * @supp_uv_cmds - supported UVCs corresponding to the IOCTL
+ *
+ * UVIO request to get information about supported request types by this
+ * uvdevice and the Ultravisor. Everything is output. Bits are in LSB0
+ * ordering. If the bit is set in both, @supp_uvio_cmds and @supp_uv_cmds, the
+ * uvdevice and the Ultravisor support that call.
+ *
+ * Note that bit 0 (UVIO_IOCTL_UVDEV_INFO_NR) is always zero for `supp_uv_cmds`
+ * as there is no corresponding UV-call.
+ */
+struct uvio_uvdev_info {
+ /*
+ * If bit `n` is set, this device supports the IOCTL with nr `n`.
+ */
+ __u64 supp_uvio_cmds;
+ /*
+ * If bit `n` is set, the Ultravisor(UV) supports the UV-call
+ * corresponding to the IOCTL with nr `n` in the calling context (host
+ * or guest). The value is only valid if the corresponding bit in
+ * @supp_uvio_cmds is set as well.
+ */
+ __u64 supp_uv_cmds;
+};
+
+/*
+ * The following max values define an upper length for the IOCTL in/out buffers.
+ * However, they do not represent the maximum the Ultravisor allows which is
+ * often way smaller. By allowing larger buffer sizes we hopefully do not need
+ * to update the code with every machine update. It is therefore possible for
+ * userspace to request more memory than actually used by kernel/UV.
+ */
+#define UVIO_ATT_ARCB_MAX_LEN 0x100000
+#define UVIO_ATT_MEASUREMENT_MAX_LEN 0x8000
+#define UVIO_ATT_ADDITIONAL_MAX_LEN 0x8000
+#define UVIO_ADD_SECRET_MAX_LEN 0x100000
+#define UVIO_LIST_SECRETS_LEN 0x1000
+
+#define UVIO_DEVICE_NAME "uv"
+#define UVIO_TYPE_UVC 'u'
+
+enum UVIO_IOCTL_NR {
+ UVIO_IOCTL_UVDEV_INFO_NR = 0x00,
+ UVIO_IOCTL_ATT_NR,
+ UVIO_IOCTL_ADD_SECRET_NR,
+ UVIO_IOCTL_LIST_SECRETS_NR,
+ UVIO_IOCTL_LOCK_SECRETS_NR,
+ /* must be the last entry */
+ UVIO_IOCTL_NUM_IOCTLS
+};
+
+#define UVIO_IOCTL(nr) _IOWR(UVIO_TYPE_UVC, nr, struct uvio_ioctl_cb)
+#define UVIO_IOCTL_UVDEV_INFO UVIO_IOCTL(UVIO_IOCTL_UVDEV_INFO_NR)
+#define UVIO_IOCTL_ATT UVIO_IOCTL(UVIO_IOCTL_ATT_NR)
+#define UVIO_IOCTL_ADD_SECRET UVIO_IOCTL(UVIO_IOCTL_ADD_SECRET_NR)
+#define UVIO_IOCTL_LIST_SECRETS UVIO_IOCTL(UVIO_IOCTL_LIST_SECRETS_NR)
+#define UVIO_IOCTL_LOCK_SECRETS UVIO_IOCTL(UVIO_IOCTL_LOCK_SECRETS_NR)
+
+#define UVIO_SUPP_CALL(nr) (1ULL << (nr))
+#define UVIO_SUPP_UDEV_INFO UVIO_SUPP_CALL(UVIO_IOCTL_UVDEV_INFO_NR)
+#define UVIO_SUPP_ATT UVIO_SUPP_CALL(UVIO_IOCTL_ATT_NR)
+#define UVIO_SUPP_ADD_SECRET UVIO_SUPP_CALL(UVIO_IOCTL_ADD_SECRET_NR)
+#define UVIO_SUPP_LIST_SECRETS UVIO_SUPP_CALL(UVIO_IOCTL_LIST_SECRETS_NR)
+#define UVIO_SUPP_LOCK_SECRETS UVIO_SUPP_CALL(UVIO_IOCTL_LOCK_SECRETS_NR)
+
+#endif /* __S390_ASM_UVDEVICE_H */
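A hedged userspace sketch of the information request described above: the uvio_ioctl_cb carries the address and length of a struct uvio_uvdev_info buffer, and afterwards the bit for UVIO_IOCTL_ATT_NR in both masks tells whether attestation can be used. The device path is an assumption derived from UVIO_DEVICE_NAME and error handling is omitted:

#include <fcntl.h>
#include <sys/ioctl.h>
#include <asm/uvdevice.h>

static int attestation_supported(void)
{
	struct uvio_uvdev_info info = {0};
	struct uvio_ioctl_cb cb = {0};
	__u64 att = UVIO_SUPP_CALL(UVIO_IOCTL_ATT_NR);
	int fd = open("/dev/uv", O_RDWR);

	cb.argument_addr = (__u64)(unsigned long)&info;
	cb.argument_len = sizeof(info);
	if (ioctl(fd, UVIO_IOCTL_UVDEV_INFO, &cb))
		return 0;
	/* usable only if both the uvdevice and the Ultravisor support it */
	return (info.supp_uvio_cmds & att) && (info.supp_uv_cmds & att);
}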
diff --git a/arch/s390/include/uapi/asm/zcrypt.h b/arch/s390/include/uapi/asm/zcrypt.h
index f9e5e1f0821d..f4785abe1b9f 100644
--- a/arch/s390/include/uapi/asm/zcrypt.h
+++ b/arch/s390/include/uapi/asm/zcrypt.h
@@ -4,7 +4,7 @@
*
* zcrypt 2.2.1 (user-visible header)
*
- * Copyright IBM Corp. 2001, 2019
+ * Copyright IBM Corp. 2001, 2022
* Author(s): Robert Burroughs
* Eric Rossman (edrossma@us.ibm.com)
*
@@ -36,12 +36,12 @@
* - length(n_modulus) = inputdatalength
*/
struct ica_rsa_modexpo {
- char __user *inputdata;
- unsigned int inputdatalength;
- char __user *outputdata;
- unsigned int outputdatalength;
- char __user *b_key;
- char __user *n_modulus;
+ __u8 __user *inputdata;
+ __u32 inputdatalength;
+ __u8 __user *outputdata;
+ __u32 outputdatalength;
+ __u8 __user *b_key;
+ __u8 __user *n_modulus;
};
/**
@@ -59,15 +59,15 @@ struct ica_rsa_modexpo {
* - length(u_mult_inv) = inputdatalength/2 + 8
*/
struct ica_rsa_modexpo_crt {
- char __user *inputdata;
- unsigned int inputdatalength;
- char __user *outputdata;
- unsigned int outputdatalength;
- char __user *bp_key;
- char __user *bq_key;
- char __user *np_prime;
- char __user *nq_prime;
- char __user *u_mult_inv;
+ __u8 __user *inputdata;
+ __u32 inputdatalength;
+ __u8 __user *outputdata;
+ __u32 outputdatalength;
+ __u8 __user *bp_key;
+ __u8 __user *bq_key;
+ __u8 __user *np_prime;
+ __u8 __user *nq_prime;
+ __u8 __user *u_mult_inv;
};
/**
@@ -83,67 +83,66 @@ struct ica_rsa_modexpo_crt {
* key block
*/
struct CPRBX {
- unsigned short cprb_len; /* CPRB length 220 */
- unsigned char cprb_ver_id; /* CPRB version id. 0x02 */
- unsigned char pad_000[3]; /* Alignment pad bytes */
- unsigned char func_id[2]; /* function id 0x5432 */
- unsigned char cprb_flags[4]; /* Flags */
- unsigned int req_parml; /* request parameter buffer len */
- unsigned int req_datal; /* request data buffer */
- unsigned int rpl_msgbl; /* reply message block length */
- unsigned int rpld_parml; /* replied parameter block len */
- unsigned int rpl_datal; /* reply data block len */
- unsigned int rpld_datal; /* replied data block len */
- unsigned int req_extbl; /* request extension block len */
- unsigned char pad_001[4]; /* reserved */
- unsigned int rpld_extbl; /* replied extension block len */
- unsigned char padx000[16 - sizeof(char *)];
- unsigned char *req_parmb; /* request parm block 'address' */
- unsigned char padx001[16 - sizeof(char *)];
- unsigned char *req_datab; /* request data block 'address' */
- unsigned char padx002[16 - sizeof(char *)];
- unsigned char *rpl_parmb; /* reply parm block 'address' */
- unsigned char padx003[16 - sizeof(char *)];
- unsigned char *rpl_datab; /* reply data block 'address' */
- unsigned char padx004[16 - sizeof(char *)];
- unsigned char *req_extb; /* request extension block 'addr'*/
- unsigned char padx005[16 - sizeof(char *)];
- unsigned char *rpl_extb; /* reply extension block 'address'*/
- unsigned short ccp_rtcode; /* server return code */
- unsigned short ccp_rscode; /* server reason code */
- unsigned int mac_data_len; /* Mac Data Length */
- unsigned char logon_id[8]; /* Logon Identifier */
- unsigned char mac_value[8]; /* Mac Value */
- unsigned char mac_content_flgs;/* Mac content flag byte */
- unsigned char pad_002; /* Alignment */
- unsigned short domain; /* Domain */
- unsigned char usage_domain[4];/* Usage domain */
- unsigned char cntrl_domain[4];/* Control domain */
- unsigned char S390enf_mask[4];/* S/390 enforcement mask */
- unsigned char pad_004[36]; /* reserved */
+ __u16 cprb_len; /* CPRB length 220 */
+ __u8 cprb_ver_id; /* CPRB version id. 0x02 */
+ __u8 ctfm; /* Command Type Filtering Mask */
+ __u8 pad_000[2]; /* Alignment pad bytes */
+ __u8 func_id[2]; /* function id 0x5432 */
+ __u8 cprb_flags[4]; /* Flags */
+ __u32 req_parml; /* request parameter buffer len */
+ __u32 req_datal; /* request data buffer */
+ __u32 rpl_msgbl; /* reply message block length */
+ __u32 rpld_parml; /* replied parameter block len */
+ __u32 rpl_datal; /* reply data block len */
+ __u32 rpld_datal; /* replied data block len */
+ __u32 req_extbl; /* request extension block len */
+ __u8 _pad_001[4]; /* reserved */
+ __u32 rpld_extbl; /* replied extension block len */
+ __u8 _pad_002[16 - sizeof(__u8 *)];
+ __u8 __user *req_parmb; /* request parm block 'address' */
+ __u8 _pad_003[16 - sizeof(__u8 *)];
+ __u8 __user *req_datab; /* request data block 'address' */
+ __u8 _pad_004[16 - sizeof(__u8 *)];
+ __u8 __user *rpl_parmb; /* reply parm block 'address' */
+ __u8 _pad_005[16 - sizeof(__u8 *)];
+ __u8 __user *rpl_datab; /* reply data block 'address' */
+ __u8 _pad_006[16 - sizeof(__u8 *)];
+ __u8 __user *req_extb; /* request extension block 'addr'*/
+ __u8 _pad_007[16 - sizeof(__u8 *)];
+ __u8 __user *rpl_extb; /* reply extension block 'address'*/
+ __u16 ccp_rtcode; /* server return code */
+ __u16 ccp_rscode; /* server reason code */
+ __u32 mac_data_len; /* Mac Data Length */
+ __u8 logon_id[8]; /* Logon Identifier */
+ __u8 mac_value[8]; /* Mac Value */
+ __u8 mac_content_flgs; /* Mac content flag byte */
+ __u8 _pad_008; /* Alignment */
+ __u16 domain; /* Domain */
+ __u8 _pad_009[12]; /* reserved, checked for zeros */
+ __u8 _pad_010[36]; /* reserved */
} __attribute__((packed));
/**
* xcRB
*/
struct ica_xcRB {
- unsigned short agent_ID;
- unsigned int user_defined;
- unsigned short request_ID;
- unsigned int request_control_blk_length;
- unsigned char padding1[16 - sizeof(char *)];
- char __user *request_control_blk_addr;
- unsigned int request_data_length;
- char padding2[16 - sizeof(char *)];
- char __user *request_data_address;
- unsigned int reply_control_blk_length;
- char padding3[16 - sizeof(char *)];
- char __user *reply_control_blk_addr;
- unsigned int reply_data_length;
- char padding4[16 - sizeof(char *)];
- char __user *reply_data_addr;
- unsigned short priority_window;
- unsigned int status;
+ __u16 agent_ID;
+ __u32 user_defined;
+ __u16 request_ID;
+ __u32 request_control_blk_length;
+ __u8 _padding1[16 - sizeof(__u8 *)];
+ __u8 __user *request_control_blk_addr;
+ __u32 request_data_length;
+ __u8 _padding2[16 - sizeof(__u8 *)];
+ __u8 __user *request_data_address;
+ __u32 reply_control_blk_length;
+ __u8 _padding3[16 - sizeof(__u8 *)];
+ __u8 __user *reply_control_blk_addr;
+ __u32 reply_data_length;
+ __u8 __padding4[16 - sizeof(__u8 *)];
+ __u8 __user *reply_data_addr;
+ __u16 priority_window;
+ __u32 status;
} __attribute__((packed));
/**
@@ -161,17 +160,17 @@ struct ica_xcRB {
* @payload_len: Payload length
*/
struct ep11_cprb {
- __u16 cprb_len;
- unsigned char cprb_ver_id;
- unsigned char pad_000[2];
- unsigned char flags;
- unsigned char func_id[2];
- __u32 source_id;
- __u32 target_id;
- __u32 ret_code;
- __u32 reserved1;
- __u32 reserved2;
- __u32 payload_len;
+ __u16 cprb_len;
+ __u8 cprb_ver_id;
+ __u8 pad_000[2];
+ __u8 flags;
+ __u8 func_id[2];
+ __u32 source_id;
+ __u32 target_id;
+ __u32 ret_code;
+ __u32 reserved1;
+ __u32 reserved2;
+ __u32 payload_len;
} __attribute__((packed));
/**
@@ -197,13 +196,13 @@ struct ep11_target_dev {
*/
struct ep11_urb {
__u16 targets_num;
- __u64 targets;
+ __u8 __user *targets;
__u64 weight;
__u64 req_no;
__u64 req_len;
- __u64 req;
+ __u8 __user *req;
__u64 resp_len;
- __u64 resp;
+ __u8 __user *resp;
} __attribute__((packed));
/**
@@ -237,7 +236,9 @@ struct zcrypt_device_matrix_ext {
struct zcrypt_device_status_ext device[MAX_ZDEV_ENTRIES_EXT];
};
-#define AUTOSELECT 0xFFFFFFFF
+#define AUTOSELECT 0xFFFFFFFF
+#define AUTOSEL_AP ((__u16)0xFFFF)
+#define AUTOSEL_DOM ((__u16)0xFFFF)
#define ZCRYPT_IOCTL_MAGIC 'z'
@@ -286,7 +287,7 @@ struct zcrypt_device_matrix_ext {
* 0x08: CEX3A
* 0x0a: CEX4
* 0x0b: CEX5
- * 0x0c: CEX6 and CEX7
+ * 0x0c: CEX6, CEX7 or CEX8
* 0x0d: device is disabled
*
* ZCRYPT_QDEPTH_MASK
@@ -303,12 +304,12 @@ struct zcrypt_device_matrix_ext {
/**
* Supported ioctl calls
*/
-#define ICARSAMODEXPO _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x05, 0)
-#define ICARSACRT _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x06, 0)
-#define ZSECSENDCPRB _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x81, 0)
-#define ZSENDEP11CPRB _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x04, 0)
+#define ICARSAMODEXPO _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x05, 0)
+#define ICARSACRT _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x06, 0)
+#define ZSECSENDCPRB _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x81, 0)
+#define ZSENDEP11CPRB _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x04, 0)
-#define ZCRYPT_DEVICE_STATUS _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x5f, 0)
+#define ZCRYPT_DEVICE_STATUS _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x5f, 0)
#define ZCRYPT_STATUS_MASK _IOR(ZCRYPT_IOCTL_MAGIC, 0x58, char[MAX_ZDEV_CARDIDS_EXT])
#define ZCRYPT_QDEPTH_MASK _IOR(ZCRYPT_IOCTL_MAGIC, 0x59, char[MAX_ZDEV_CARDIDS_EXT])
#define ZCRYPT_PERDEV_REQCNT _IOR(ZCRYPT_IOCTL_MAGIC, 0x5a, int[MAX_ZDEV_CARDIDS_EXT])
@@ -350,7 +351,7 @@ struct zcrypt_device_matrix {
};
/* Deprecated: use ZCRYPT_DEVICE_STATUS */
-#define ZDEVICESTATUS _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x4f, 0)
+#define ZDEVICESTATUS _IOC(_IOC_READ | _IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x4f, 0)
/* Deprecated: use ZCRYPT_STATUS_MASK */
#define Z90STAT_STATUS_MASK _IOR(ZCRYPT_IOCTL_MAGIC, 0x48, char[64])
/* Deprecated: use ZCRYPT_QDEPTH_MASK */
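For orientation, a hedged sketch of how the retyped ica_rsa_modexpo structure is filled for an ICARSAMODEXPO request. As the header comment states, exponent and modulus must be inputdatalength bytes long; zfd is assumed to be an open descriptor of the zcrypt device and error handling is omitted:

#include <sys/ioctl.h>
#include <asm/zcrypt.h>

static int rsa_modexpo(int zfd, __u8 *msg, __u8 *exp, __u8 *mod,
		       __u8 *out, __u32 len)
{
	struct ica_rsa_modexpo me = {0};

	me.inputdata = msg;		/* len bytes, padded by the caller */
	me.inputdatalength = len;	/* modulus length in bytes */
	me.outputdata = out;
	me.outputdatalength = len;
	me.b_key = exp;			/* public exponent, len bytes */
	me.n_modulus = mod;		/* modulus, len bytes */
	return ioctl(zfd, ICARSAMODEXPO, &me);
}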
diff --git a/arch/s390/kernel/.gitignore b/arch/s390/kernel/.gitignore
index c5f676c3c224..bbb90f92d051 100644
--- a/arch/s390/kernel/.gitignore
+++ b/arch/s390/kernel/.gitignore
@@ -1 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
vmlinux.lds
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index 2b1203cf7be6..7a562b4199c8 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -10,6 +10,7 @@ CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE)
# Do not trace early setup code
CFLAGS_REMOVE_early.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_rethook.o = $(CC_FLAGS_FTRACE)
endif
@@ -33,51 +34,52 @@ CFLAGS_stacktrace.o += -fno-optimize-sibling-calls
CFLAGS_dumpstack.o += -fno-optimize-sibling-calls
CFLAGS_unwind_bc.o += -fno-optimize-sibling-calls
-#
-# Pass UTS_MACHINE for user_regset definition
-#
-CFLAGS_ptrace.o += -DUTS_MACHINE='"$(UTS_MACHINE)"'
-
-obj-y := traps.o time.o process.o base.o early.o setup.o idle.o vtime.o
-obj-y += processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o
-obj-y += debug.o irq.o ipl.o dis.o diag.o vdso.o
-obj-y += sysinfo.o lgr.o os_info.o machine_kexec.o pgm_check.o
+obj-y := head64.o traps.o time.o process.o earlypgm.o early.o setup.o idle.o vtime.o
+obj-y += processor.o syscall.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o
+obj-y += debug.o irq.o ipl.o dis.o diag.o vdso.o cpufeature.o
+obj-y += sysinfo.o lgr.o os_info.o ctlreg.o
obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o
-obj-y += entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o
+obj-y += entry.o reipl.o kdebugfs.o alternative.o
obj-y += nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o
-obj-y += smp.o
+obj-y += smp.o text_amode31.o stacktrace.o abs_lowcore.o facility.o
-extra-y += head64.o vmlinux.lds
+extra-y += vmlinux.lds
obj-$(CONFIG_SYSFS) += nospec-sysfs.o
CFLAGS_REMOVE_nospec-branch.o += $(CC_FLAGS_EXPOLINE)
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_SCHED_TOPOLOGY) += topology.o
-obj-$(CONFIG_HIBERNATION) += suspend.o swsusp.o
+obj-$(CONFIG_NUMA) += numa.o
obj-$(CONFIG_AUDIT) += audit.o
compat-obj-$(CONFIG_AUDIT) += compat_audit.o
obj-$(CONFIG_COMPAT) += compat_linux.o compat_signal.o
obj-$(CONFIG_COMPAT) += $(compat-obj-y)
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
-obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-$(CONFIG_KPROBES) += kprobes.o
-obj-$(CONFIG_FUNCTION_TRACER) += mcount.o ftrace.o
+obj-$(CONFIG_KPROBES) += kprobes_insn_page.o
+obj-$(CONFIG_KPROBES) += mcount.o
+obj-$(CONFIG_RETHOOK) += rethook.o
+obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o
+obj-$(CONFIG_FUNCTION_TRACER) += mcount.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
+obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o
obj-$(CONFIG_UPROBES) += uprobes.o
obj-$(CONFIG_JUMP_LABEL) += jump_label.o
obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o
obj-$(CONFIG_KEXEC_FILE) += kexec_elf.o
+obj-$(CONFIG_CERT_STORE) += cert_store.o
+obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o
-obj-$(CONFIG_IMA) += ima_arch.o
-
-obj-$(CONFIG_PERF_EVENTS) += perf_event.o perf_cpum_cf_common.o
+obj-$(CONFIG_PERF_EVENTS) += perf_event.o
obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf.o perf_cpum_sf.o
obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_events.o perf_regs.o
-obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_diag.o
+obj-$(CONFIG_PERF_EVENTS) += perf_pai_crypto.o perf_pai_ext.o
obj-$(CONFIG_TRACEPOINTS) += trace.o
+obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o
# vdso
obj-y += vdso64/
+obj-$(CONFIG_COMPAT) += vdso32/
diff --git a/arch/s390/kernel/abs_lowcore.c b/arch/s390/kernel/abs_lowcore.c
new file mode 100644
index 000000000000..f9efc54ec4b7
--- /dev/null
+++ b/arch/s390/kernel/abs_lowcore.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/pgtable.h>
+#include <asm/abs_lowcore.h>
+
+unsigned long __bootdata_preserved(__abs_lowcore);
+
+int abs_lowcore_map(int cpu, struct lowcore *lc, bool alloc)
+{
+ unsigned long addr = __abs_lowcore + (cpu * sizeof(struct lowcore));
+ unsigned long phys = __pa(lc);
+ int rc, i;
+
+ for (i = 0; i < LC_PAGES; i++) {
+ rc = __vmem_map_4k_page(addr, phys, PAGE_KERNEL, alloc);
+ if (rc) {
+ /*
+ * Do not unmap allocated page tables in case the
+ * allocation was not requested. In such a case the
+ * request is expected to come from an atomic context,
+ * while the unmap attempt might sleep.
+ */
+ if (alloc) {
+ for (--i; i >= 0; i--) {
+ addr -= PAGE_SIZE;
+ vmem_unmap_4k_page(addr);
+ }
+ }
+ return rc;
+ }
+ addr += PAGE_SIZE;
+ phys += PAGE_SIZE;
+ }
+ return 0;
+}
+
+void abs_lowcore_unmap(int cpu)
+{
+ unsigned long addr = __abs_lowcore + (cpu * sizeof(struct lowcore));
+ int i;
+
+ for (i = 0; i < LC_PAGES; i++) {
+ vmem_unmap_4k_page(addr);
+ addr += PAGE_SIZE;
+ }
+}
diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c
index 8e1f2aee85ef..e7bca29f9c34 100644
--- a/arch/s390/kernel/alternative.c
+++ b/arch/s390/kernel/alternative.c
@@ -1,11 +1,12 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/module.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <asm/text-patching.h>
#include <asm/alternative.h>
#include <asm/facility.h>
#include <asm/nospec-branch.h>
-#define MAX_PATCH_LEN (255 - 1)
-
static int __initdata_or_module alt_instr_disabled;
static int __init disable_alternative_instructions(char *str)
@@ -16,86 +17,30 @@ static int __init disable_alternative_instructions(char *str)
early_param("noaltinstr", disable_alternative_instructions);
-struct brcl_insn {
- u16 opc;
- s32 disp;
-} __packed;
-
-static u16 __initdata_or_module nop16 = 0x0700;
-static u32 __initdata_or_module nop32 = 0x47000000;
-static struct brcl_insn __initdata_or_module nop48 = {
- 0xc004, 0
-};
-
-static const void *nops[] __initdata_or_module = {
- &nop16,
- &nop32,
- &nop48
-};
-
-static void __init_or_module add_jump_padding(void *insns, unsigned int len)
-{
- struct brcl_insn brcl = {
- 0xc0f4,
- len / 2
- };
-
- memcpy(insns, &brcl, sizeof(brcl));
- insns += sizeof(brcl);
- len -= sizeof(brcl);
-
- while (len > 0) {
- memcpy(insns, &nop16, 2);
- insns += 2;
- len -= 2;
- }
-}
-
-static void __init_or_module add_padding(void *insns, unsigned int len)
-{
- if (len > 6)
- add_jump_padding(insns, len);
- else if (len >= 2)
- memcpy(insns, nops[len / 2 - 1], len);
-}
-
static void __init_or_module __apply_alternatives(struct alt_instr *start,
struct alt_instr *end)
{
struct alt_instr *a;
u8 *instr, *replacement;
- u8 insnbuf[MAX_PATCH_LEN];
/*
* The scan order should be from start to end. A later scanned
* alternative code can overwrite previously scanned alternative code.
*/
for (a = start; a < end; a++) {
- int insnbuf_sz = 0;
-
instr = (u8 *)&a->instr_offset + a->instr_offset;
replacement = (u8 *)&a->repl_offset + a->repl_offset;
- if (!__test_facility(a->facility,
- S390_lowcore.alt_stfle_fac_list))
+ if (!__test_facility(a->facility, alt_stfle_fac_list))
continue;
- if (unlikely(a->instrlen % 2 || a->replacementlen % 2)) {
+ if (unlikely(a->instrlen % 2)) {
WARN_ONCE(1, "cpu alternatives instructions length is "
"odd, skipping patching\n");
continue;
}
- memcpy(insnbuf, replacement, a->replacementlen);
- insnbuf_sz = a->replacementlen;
-
- if (a->instrlen > a->replacementlen) {
- add_padding(insnbuf + a->replacementlen,
- a->instrlen - a->replacementlen);
- insnbuf_sz += a->instrlen - a->replacementlen;
- }
-
- s390_kernel_write(instr, insnbuf, insnbuf_sz);
+ s390_kernel_write(instr, replacement, a->instrlen);
}
}
@@ -111,3 +56,20 @@ void __init apply_alternative_instructions(void)
{
apply_alternatives(__alt_instructions, __alt_instructions_end);
}
+
+static void do_sync_core(void *info)
+{
+ sync_core();
+}
+
+void text_poke_sync(void)
+{
+ on_each_cpu(do_sync_core, NULL, 1);
+}
+
+void text_poke_sync_lock(void)
+{
+ cpus_read_lock();
+ text_poke_sync();
+ cpus_read_unlock();
+}
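Since padding is now handled at build time by the alternative macros, __apply_alternatives() can copy each replacement verbatim. As a hedged sketch only, assuming the usual three-argument s390 ALTERNATIVE(oldinstr, altinstr, facility) inline-asm macro from <asm/alternative.h>; the facility number is an arbitrary placeholder and both instructions are no-ops, so the example has no side effects:

#include <asm/alternative.h>

static inline void alternative_demo(void)
{
	/* run "nopr %r7" instead of "nop" when facility 96 is installed;
	 * the patching loop above writes the replacement at boot time */
	asm volatile(ALTERNATIVE("nop", "nopr %r7", 96));
}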
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index ce33406cfe83..fa5f6885c74a 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -11,11 +11,10 @@
#include <linux/kvm_host.h>
#include <linux/sched.h>
#include <linux/purgatory.h>
+#include <linux/pgtable.h>
+#include <linux/ftrace.h>
#include <asm/idle.h>
-#include <asm/vdso.h>
-#include <asm/pgtable.h>
#include <asm/gmap.h>
-#include <asm/nmi.h>
#include <asm/stacktrace.h>
int main(void)
@@ -27,80 +26,55 @@ int main(void)
BLANK();
/* thread struct offsets */
OFFSET(__THREAD_ksp, thread_struct, ksp);
- OFFSET(__THREAD_sysc_table, thread_struct, sys_call_table);
- OFFSET(__THREAD_last_break, thread_struct, last_break);
- OFFSET(__THREAD_FPU_fpc, thread_struct, fpu.fpc);
- OFFSET(__THREAD_FPU_regs, thread_struct, fpu.regs);
- OFFSET(__THREAD_per_cause, thread_struct, per_event.cause);
- OFFSET(__THREAD_per_address, thread_struct, per_event.address);
- OFFSET(__THREAD_per_paid, thread_struct, per_event.paid);
- OFFSET(__THREAD_trap_tdb, thread_struct, trap_tdb);
BLANK();
/* thread info offsets */
OFFSET(__TI_flags, task_struct, thread_info.flags);
BLANK();
/* pt_regs offsets */
- OFFSET(__PT_ARGS, pt_regs, args);
OFFSET(__PT_PSW, pt_regs, psw);
OFFSET(__PT_GPRS, pt_regs, gprs);
+ OFFSET(__PT_R0, pt_regs, gprs[0]);
+ OFFSET(__PT_R1, pt_regs, gprs[1]);
+ OFFSET(__PT_R2, pt_regs, gprs[2]);
+ OFFSET(__PT_R3, pt_regs, gprs[3]);
+ OFFSET(__PT_R4, pt_regs, gprs[4]);
+ OFFSET(__PT_R5, pt_regs, gprs[5]);
+ OFFSET(__PT_R6, pt_regs, gprs[6]);
+ OFFSET(__PT_R7, pt_regs, gprs[7]);
+ OFFSET(__PT_R8, pt_regs, gprs[8]);
+ OFFSET(__PT_R9, pt_regs, gprs[9]);
+ OFFSET(__PT_R10, pt_regs, gprs[10]);
+ OFFSET(__PT_R11, pt_regs, gprs[11]);
+ OFFSET(__PT_R12, pt_regs, gprs[12]);
+ OFFSET(__PT_R13, pt_regs, gprs[13]);
+ OFFSET(__PT_R14, pt_regs, gprs[14]);
+ OFFSET(__PT_R15, pt_regs, gprs[15]);
OFFSET(__PT_ORIG_GPR2, pt_regs, orig_gpr2);
- OFFSET(__PT_INT_CODE, pt_regs, int_code);
- OFFSET(__PT_INT_PARM, pt_regs, int_parm);
- OFFSET(__PT_INT_PARM_LONG, pt_regs, int_parm_long);
OFFSET(__PT_FLAGS, pt_regs, flags);
+ OFFSET(__PT_CR1, pt_regs, cr1);
+ OFFSET(__PT_LAST_BREAK, pt_regs, last_break);
DEFINE(__PT_SIZE, sizeof(struct pt_regs));
BLANK();
/* stack_frame offsets */
OFFSET(__SF_BACKCHAIN, stack_frame, back_chain);
OFFSET(__SF_GPRS, stack_frame, gprs);
- OFFSET(__SF_EMPTY, stack_frame, empty1);
- OFFSET(__SF_SIE_CONTROL, stack_frame, empty1[0]);
- OFFSET(__SF_SIE_SAVEAREA, stack_frame, empty1[1]);
- OFFSET(__SF_SIE_REASON, stack_frame, empty1[2]);
- OFFSET(__SF_SIE_FLAGS, stack_frame, empty1[3]);
- BLANK();
- /* timeval/timezone offsets for use by vdso */
- OFFSET(__VDSO_UPD_COUNT, vdso_data, tb_update_count);
- OFFSET(__VDSO_XTIME_STAMP, vdso_data, xtime_tod_stamp);
- OFFSET(__VDSO_XTIME_SEC, vdso_data, xtime_clock_sec);
- OFFSET(__VDSO_XTIME_NSEC, vdso_data, xtime_clock_nsec);
- OFFSET(__VDSO_XTIME_CRS_SEC, vdso_data, xtime_coarse_sec);
- OFFSET(__VDSO_XTIME_CRS_NSEC, vdso_data, xtime_coarse_nsec);
- OFFSET(__VDSO_WTOM_SEC, vdso_data, wtom_clock_sec);
- OFFSET(__VDSO_WTOM_NSEC, vdso_data, wtom_clock_nsec);
- OFFSET(__VDSO_WTOM_CRS_SEC, vdso_data, wtom_coarse_sec);
- OFFSET(__VDSO_WTOM_CRS_NSEC, vdso_data, wtom_coarse_nsec);
- OFFSET(__VDSO_TIMEZONE, vdso_data, tz_minuteswest);
- OFFSET(__VDSO_ECTG_OK, vdso_data, ectg_available);
- OFFSET(__VDSO_TK_MULT, vdso_data, tk_mult);
- OFFSET(__VDSO_TK_SHIFT, vdso_data, tk_shift);
- OFFSET(__VDSO_TS_DIR, vdso_data, ts_dir);
- OFFSET(__VDSO_TS_END, vdso_data, ts_end);
- OFFSET(__VDSO_ECTG_BASE, vdso_per_cpu_data, ectg_timer_base);
- OFFSET(__VDSO_ECTG_USER, vdso_per_cpu_data, ectg_user_time);
- OFFSET(__VDSO_GETCPU_VAL, vdso_per_cpu_data, getcpu_val);
- BLANK();
- /* constants used by the vdso */
- DEFINE(__CLOCK_REALTIME, CLOCK_REALTIME);
- DEFINE(__CLOCK_MONOTONIC, CLOCK_MONOTONIC);
- DEFINE(__CLOCK_REALTIME_COARSE, CLOCK_REALTIME_COARSE);
- DEFINE(__CLOCK_MONOTONIC_COARSE, CLOCK_MONOTONIC_COARSE);
- DEFINE(__CLOCK_THREAD_CPUTIME_ID, CLOCK_THREAD_CPUTIME_ID);
- DEFINE(__CLOCK_REALTIME_RES, MONOTONIC_RES_NSEC);
- DEFINE(__CLOCK_COARSE_RES, LOW_RES_NSEC);
+ OFFSET(__SF_EMPTY, stack_frame, empty[0]);
+ OFFSET(__SF_SIE_CONTROL, stack_frame, sie_control_block);
+ OFFSET(__SF_SIE_SAVEAREA, stack_frame, sie_savearea);
+ OFFSET(__SF_SIE_REASON, stack_frame, sie_reason);
+ OFFSET(__SF_SIE_FLAGS, stack_frame, sie_flags);
+ OFFSET(__SF_SIE_CONTROL_PHYS, stack_frame, sie_control_block_phys);
+ DEFINE(STACK_FRAME_OVERHEAD, sizeof(struct stack_frame));
BLANK();
/* idle data offsets */
OFFSET(__CLOCK_IDLE_ENTER, s390_idle_data, clock_idle_enter);
- OFFSET(__CLOCK_IDLE_EXIT, s390_idle_data, clock_idle_exit);
OFFSET(__TIMER_IDLE_ENTER, s390_idle_data, timer_idle_enter);
- OFFSET(__TIMER_IDLE_EXIT, s390_idle_data, timer_idle_exit);
+ OFFSET(__MT_CYCLES_ENTER, s390_idle_data, mt_cycles_enter);
BLANK();
/* hardware defined lowcore locations 0x000 - 0x1ff */
OFFSET(__LC_EXT_PARAMS, lowcore, ext_params);
OFFSET(__LC_EXT_CPU_ADDR, lowcore, ext_cpu_addr);
OFFSET(__LC_EXT_INT_CODE, lowcore, ext_int_code);
- OFFSET(__LC_SVC_ILC, lowcore, svc_ilc);
- OFFSET(__LC_SVC_INT_CODE, lowcore, svc_code);
OFFSET(__LC_PGM_ILC, lowcore, pgm_ilc);
OFFSET(__LC_PGM_INT_CODE, lowcore, pgm_code);
OFFSET(__LC_DATA_EXC_CODE, lowcore, data_exc_code);
@@ -118,12 +92,12 @@ int main(void)
OFFSET(__LC_SUBCHANNEL_NR, lowcore, subchannel_nr);
OFFSET(__LC_IO_INT_PARM, lowcore, io_int_parm);
OFFSET(__LC_IO_INT_WORD, lowcore, io_int_word);
- OFFSET(__LC_STFL_FAC_LIST, lowcore, stfl_fac_list);
- OFFSET(__LC_STFLE_FAC_LIST, lowcore, stfle_fac_list);
OFFSET(__LC_MCCK_CODE, lowcore, mcck_interruption_code);
OFFSET(__LC_EXT_DAMAGE_CODE, lowcore, external_damage_code);
OFFSET(__LC_MCCK_FAIL_STOR_ADDR, lowcore, failing_storage_address);
- OFFSET(__LC_LAST_BREAK, lowcore, breaking_event_addr);
+ OFFSET(__LC_PGM_LAST_BREAK, lowcore, pgm_last_break);
+ OFFSET(__LC_RETURN_LPSWE, lowcore, return_lpswe);
+ OFFSET(__LC_RETURN_MCCK_LPSWE, lowcore, return_mcck_lpswe);
OFFSET(__LC_RST_OLD_PSW, lowcore, restart_old_psw);
OFFSET(__LC_EXT_OLD_PSW, lowcore, external_old_psw);
OFFSET(__LC_SVC_OLD_PSW, lowcore, svc_old_psw);
@@ -143,39 +117,33 @@ int main(void)
OFFSET(__LC_CPU_FLAGS, lowcore, cpu_flags);
OFFSET(__LC_RETURN_PSW, lowcore, return_psw);
OFFSET(__LC_RETURN_MCCK_PSW, lowcore, return_mcck_psw);
- OFFSET(__LC_SYNC_ENTER_TIMER, lowcore, sync_enter_timer);
- OFFSET(__LC_ASYNC_ENTER_TIMER, lowcore, async_enter_timer);
+ OFFSET(__LC_SYS_ENTER_TIMER, lowcore, sys_enter_timer);
OFFSET(__LC_MCCK_ENTER_TIMER, lowcore, mcck_enter_timer);
OFFSET(__LC_EXIT_TIMER, lowcore, exit_timer);
- OFFSET(__LC_USER_TIMER, lowcore, user_timer);
- OFFSET(__LC_SYSTEM_TIMER, lowcore, system_timer);
- OFFSET(__LC_STEAL_TIMER, lowcore, steal_timer);
OFFSET(__LC_LAST_UPDATE_TIMER, lowcore, last_update_timer);
OFFSET(__LC_LAST_UPDATE_CLOCK, lowcore, last_update_clock);
OFFSET(__LC_INT_CLOCK, lowcore, int_clock);
- OFFSET(__LC_MCCK_CLOCK, lowcore, mcck_clock);
- OFFSET(__LC_CLOCK_COMPARATOR, lowcore, clock_comparator);
OFFSET(__LC_BOOT_CLOCK, lowcore, boot_clock);
OFFSET(__LC_CURRENT, lowcore, current_task);
OFFSET(__LC_KERNEL_STACK, lowcore, kernel_stack);
OFFSET(__LC_ASYNC_STACK, lowcore, async_stack);
OFFSET(__LC_NODAT_STACK, lowcore, nodat_stack);
OFFSET(__LC_RESTART_STACK, lowcore, restart_stack);
+ OFFSET(__LC_MCCK_STACK, lowcore, mcck_stack);
OFFSET(__LC_RESTART_FN, lowcore, restart_fn);
OFFSET(__LC_RESTART_DATA, lowcore, restart_data);
OFFSET(__LC_RESTART_SOURCE, lowcore, restart_source);
+ OFFSET(__LC_RESTART_FLAGS, lowcore, restart_flags);
+ OFFSET(__LC_KERNEL_ASCE, lowcore, kernel_asce);
OFFSET(__LC_USER_ASCE, lowcore, user_asce);
- OFFSET(__LC_VDSO_ASCE, lowcore, vdso_asce);
OFFSET(__LC_LPP, lowcore, lpp);
OFFSET(__LC_CURRENT_PID, lowcore, current_pid);
- OFFSET(__LC_PERCPU_OFFSET, lowcore, percpu_offset);
- OFFSET(__LC_VDSO_PER_CPU, lowcore, vdso_per_cpu_data);
- OFFSET(__LC_MACHINE_FLAGS, lowcore, machine_flags);
- OFFSET(__LC_PREEMPT_COUNT, lowcore, preempt_count);
OFFSET(__LC_GMAP, lowcore, gmap);
- OFFSET(__LC_BR_R1, lowcore, br_r1_trampoline);
+ OFFSET(__LC_LAST_BREAK, lowcore, last_break);
/* software defined ABI-relevant lowcore locations 0xe00 - 0xe20 */
OFFSET(__LC_DUMP_REIPL, lowcore, ipib);
+ OFFSET(__LC_VMCORE_INFO, lowcore, vmcore_info);
+ OFFSET(__LC_OS_INFO, lowcore, os_info);
/* hardware defined lowcore locations 0x1000 - 0x18ff */
OFFSET(__LC_MCESAD, lowcore, mcesad);
OFFSET(__LC_EXT_PARAMS2, lowcore, ext_params2);
@@ -187,13 +155,11 @@ int main(void)
OFFSET(__LC_TOD_PROGREG_SAVE_AREA, lowcore, tod_progreg_save_area);
OFFSET(__LC_CPU_TIMER_SAVE_AREA, lowcore, cpu_timer_save_area);
OFFSET(__LC_CLOCK_COMP_SAVE_AREA, lowcore, clock_comp_save_area);
+ OFFSET(__LC_LAST_BREAK_SAVE_AREA, lowcore, last_break_save_area);
OFFSET(__LC_AREGS_SAVE_AREA, lowcore, access_regs_save_area);
OFFSET(__LC_CREGS_SAVE_AREA, lowcore, cregs_save_area);
OFFSET(__LC_PGM_TDB, lowcore, pgm_tdb);
BLANK();
- /* extended machine check save area */
- OFFSET(__MCESA_GS_SAVE_AREA, mcesa, guarded_storage_save_area);
- BLANK();
/* gmap/sie offsets */
OFFSET(__GMAP_ASCE, gmap, asce);
OFFSET(__SIE_PROG0C, kvm_s390_sie_block, prog0c);
@@ -202,5 +168,23 @@ int main(void)
OFFSET(__KEXEC_SHA_REGION_START, kexec_sha_region, start);
OFFSET(__KEXEC_SHA_REGION_LEN, kexec_sha_region, len);
DEFINE(__KEXEC_SHA_REGION_SIZE, sizeof(struct kexec_sha_region));
+ /* sizeof kernel parameter area */
+ DEFINE(__PARMAREA_SIZE, sizeof(struct parmarea));
+ /* kernel parameter area offsets */
+ DEFINE(IPL_DEVICE, PARMAREA + offsetof(struct parmarea, ipl_device));
+ DEFINE(INITRD_START, PARMAREA + offsetof(struct parmarea, initrd_start));
+ DEFINE(INITRD_SIZE, PARMAREA + offsetof(struct parmarea, initrd_size));
+ DEFINE(OLDMEM_BASE, PARMAREA + offsetof(struct parmarea, oldmem_base));
+ DEFINE(OLDMEM_SIZE, PARMAREA + offsetof(struct parmarea, oldmem_size));
+ DEFINE(COMMAND_LINE, PARMAREA + offsetof(struct parmarea, command_line));
+ DEFINE(MAX_COMMAND_LINE_SIZE, PARMAREA + offsetof(struct parmarea, max_command_line_size));
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ /* function graph return value tracing */
+ OFFSET(__FGRAPH_RET_GPR2, fgraph_ret_regs, gpr2);
+ OFFSET(__FGRAPH_RET_FP, fgraph_ret_regs, fp);
+ DEFINE(__FGRAPH_RET_SIZE, sizeof(struct fgraph_ret_regs));
+#endif
+ OFFSET(__FTRACE_REGS_PT_REGS, ftrace_regs, regs);
+ DEFINE(__FTRACE_REGS_SIZE, sizeof(struct ftrace_regs));
return 0;
}
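For context, the OFFSET()/DEFINE() helpers from <linux/kbuild.h> emit magic assembler strings that kbuild post-processes into include/generated/asm-offsets.h, so every symbol becomes a plain numeric #define usable from assembly. A hedged sketch of the round trip for one of the new pt_regs offsets; the assembler line is illustrative, not quoted from entry.S:

/* asm-offsets.c source line:
 *	OFFSET(__PT_R2, pt_regs, gprs[2]);
 * generated asm-offsets.h (value depends on the struct pt_regs layout):
 *	#define __PT_R2 <offsetof(struct pt_regs, gprs[2])>
 * illustrative consumer in assembly:
 *	lg	%r2,__PT_R2(%r11)	# load saved gpr 2 from pt_regs
 */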
diff --git a/arch/s390/kernel/audit.c b/arch/s390/kernel/audit.c
index d395c6c9944c..02051a596b87 100644
--- a/arch/s390/kernel/audit.c
+++ b/arch/s390/kernel/audit.c
@@ -47,15 +47,17 @@ int audit_classify_syscall(int abi, unsigned syscall)
#endif
switch(syscall) {
case __NR_open:
- return 2;
+ return AUDITSC_OPEN;
case __NR_openat:
- return 3;
+ return AUDITSC_OPENAT;
case __NR_socketcall:
- return 4;
+ return AUDITSC_SOCKETCALL;
case __NR_execve:
- return 5;
+ return AUDITSC_EXECVE;
+ case __NR_openat2:
+ return AUDITSC_OPENAT2;
default:
- return 0;
+ return AUDITSC_NATIVE;
}
}
diff --git a/arch/s390/kernel/base.S b/arch/s390/kernel/base.S
deleted file mode 100644
index b79e0fd571f8..000000000000
--- a/arch/s390/kernel/base.S
+++ /dev/null
@@ -1,63 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * arch/s390/kernel/base.S
- *
- * Copyright IBM Corp. 2006, 2007
- * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>
- * Michael Holzheu <holzheu@de.ibm.com>
- */
-
-#include <linux/linkage.h>
-#include <asm/asm-offsets.h>
-#include <asm/nospec-insn.h>
-#include <asm/ptrace.h>
-#include <asm/sigp.h>
-
- GEN_BR_THUNK %r9
- GEN_BR_THUNK %r14
-
-ENTRY(s390_base_ext_handler)
- stmg %r0,%r15,__LC_SAVE_AREA_ASYNC
- basr %r13,0
-0: aghi %r15,-STACK_FRAME_OVERHEAD
- larl %r1,s390_base_ext_handler_fn
- lg %r9,0(%r1)
- ltgr %r9,%r9
- jz 1f
- BASR_EX %r14,%r9
-1: lmg %r0,%r15,__LC_SAVE_AREA_ASYNC
- ni __LC_EXT_OLD_PSW+1,0xfd # clear wait state bit
- lpswe __LC_EXT_OLD_PSW
-ENDPROC(s390_base_ext_handler)
-
- .section .bss
- .align 8
- .globl s390_base_ext_handler_fn
-s390_base_ext_handler_fn:
- .quad 0
- .previous
-
-ENTRY(s390_base_pgm_handler)
- stmg %r0,%r15,__LC_SAVE_AREA_SYNC
- basr %r13,0
-0: aghi %r15,-STACK_FRAME_OVERHEAD
- larl %r1,s390_base_pgm_handler_fn
- lg %r9,0(%r1)
- ltgr %r9,%r9
- jz 1f
- BASR_EX %r14,%r9
- lmg %r0,%r15,__LC_SAVE_AREA_SYNC
- lpswe __LC_PGM_OLD_PSW
-1: lpswe disabled_wait_psw-0b(%r13)
-ENDPROC(s390_base_pgm_handler)
-
- .align 8
-disabled_wait_psw:
- .quad 0x0002000180000000,0x0000000000000000 + s390_base_pgm_handler
-
- .section .bss
- .align 8
- .globl s390_base_pgm_handler_fn
-s390_base_pgm_handler_fn:
- .quad 0
- .previous
diff --git a/arch/s390/kernel/cache.c b/arch/s390/kernel/cache.c
index d66825e53fce..56254fa06f99 100644
--- a/arch/s390/kernel/cache.c
+++ b/arch/s390/kernel/cache.c
@@ -3,7 +3,6 @@
* Extract CPU cache information and expose them via sysfs.
*
* Copyright IBM Corp. 2012
- * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>
*/
#include <linux/seq_file.h>
@@ -47,7 +46,7 @@ struct cache_info {
#define CACHE_MAX_LEVEL 8
union cache_topology {
struct cache_info ci[CACHE_MAX_LEVEL];
- unsigned long long raw;
+ unsigned long raw;
};
static const char * const cache_type_string[] = {
@@ -71,8 +70,6 @@ void show_cacheinfo(struct seq_file *m)
struct cacheinfo *cache;
int idx;
- if (!test_facility(34))
- return;
this_cpu_ci = get_cpu_cacheinfo(cpumask_any(cpu_online_mask));
for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) {
cache = this_cpu_ci->info_list + idx;
@@ -132,8 +129,6 @@ int init_cache_level(unsigned int cpu)
union cache_topology ct;
enum cache_type ctype;
- if (!test_facility(34))
- return -EOPNOTSUPP;
if (!this_cpu_ci)
return -EINVAL;
ct.raw = ecag(EXTRACT_TOPOLOGY, 0, 0);
@@ -157,8 +152,6 @@ int populate_cache_leaves(unsigned int cpu)
union cache_topology ct;
enum cache_type ctype;
- if (!test_facility(34))
- return -EOPNOTSUPP;
ct.raw = ecag(EXTRACT_TOPOLOGY, 0, 0);
for (idx = 0, level = 0; level < this_cpu_ci->num_levels &&
idx < this_cpu_ci->num_leaves; idx++, level++) {
diff --git a/arch/s390/kernel/cert_store.c b/arch/s390/kernel/cert_store.c
new file mode 100644
index 000000000000..554447768bdd
--- /dev/null
+++ b/arch/s390/kernel/cert_store.c
@@ -0,0 +1,812 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * DIAG 0x320 support and certificate store handling
+ *
+ * Copyright IBM Corp. 2023
+ * Author(s): Anastasia Eskova <anastasia.eskova@ibm.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/key-type.h>
+#include <linux/key.h>
+#include <linux/keyctl.h>
+#include <linux/kobject.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <crypto/sha2.h>
+#include <keys/user-type.h>
+#include <asm/debug.h>
+#include <asm/diag.h>
+#include <asm/ebcdic.h>
+#include <asm/sclp.h>
+
+#define DIAG_MAX_RETRIES 10
+
+#define VCE_FLAGS_VALID_MASK 0x80
+
+#define ISM_LEN_DWORDS 4
+#define VCSSB_LEN_BYTES 128
+#define VCSSB_LEN_NO_CERTS 4
+#define VCB_LEN_NO_CERTS 64
+#define VC_NAME_LEN_BYTES 64
+
+#define CERT_STORE_KEY_TYPE_NAME "cert_store_key"
+#define CERT_STORE_KEYRING_NAME "cert_store"
+
+static debug_info_t *cert_store_dbf;
+static debug_info_t *cert_store_hexdump;
+
+#define pr_dbf_msg(fmt, ...) \
+ debug_sprintf_event(cert_store_dbf, 3, fmt "\n", ## __VA_ARGS__)
+
+enum diag320_subcode {
+ DIAG320_SUBCODES = 0,
+ DIAG320_STORAGE = 1,
+ DIAG320_CERT_BLOCK = 2,
+};
+
+enum diag320_rc {
+ DIAG320_RC_OK = 0x0001,
+ DIAG320_RC_CS_NOMATCH = 0x0306,
+};
+
+/* Verification Certificates Store Support Block (VCSSB). */
+struct vcssb {
+ u32 vcssb_length;
+ u8 pad_0x04[3];
+ u8 version;
+ u8 pad_0x08[8];
+ u32 cs_token;
+ u8 pad_0x14[12];
+ u16 total_vc_index_count;
+ u16 max_vc_index_count;
+ u8 pad_0x24[28];
+ u32 max_vce_length;
+ u32 max_vcxe_length;
+ u8 pad_0x48[8];
+ u32 max_single_vcb_length;
+ u32 total_vcb_length;
+ u32 max_single_vcxb_length;
+ u32 total_vcxb_length;
+ u8 pad_0x60[32];
+} __packed __aligned(8);
+
+/* Verification Certificate Entry (VCE) Header. */
+struct vce_header {
+ u32 vce_length;
+ u8 flags;
+ u8 key_type;
+ u16 vc_index;
+ u8 vc_name[VC_NAME_LEN_BYTES]; /* EBCDIC */
+ u8 vc_format;
+ u8 pad_0x49;
+ u16 key_id_length;
+ u8 pad_0x4c;
+ u8 vc_hash_type;
+ u16 vc_hash_length;
+ u8 pad_0x50[4];
+ u32 vc_length;
+ u8 pad_0x58[8];
+ u16 vc_hash_offset;
+ u16 vc_offset;
+ u8 pad_0x64[28];
+} __packed __aligned(4);
+
+/* Verification Certificate Block (VCB) Header. */
+struct vcb_header {
+ u32 vcb_input_length;
+ u8 pad_0x04[4];
+ u16 first_vc_index;
+ u16 last_vc_index;
+ u32 pad_0x0c;
+ u32 cs_token;
+ u8 pad_0x14[12];
+ u32 vcb_output_length;
+ u8 pad_0x24[3];
+ u8 version;
+ u16 stored_vc_count;
+ u16 remaining_vc_count;
+ u8 pad_0x2c[20];
+} __packed __aligned(4);
+
+/* Verification Certificate Block (VCB). */
+struct vcb {
+ struct vcb_header vcb_hdr;
+ u8 vcb_buf[];
+} __packed __aligned(4);
+
+/* Verification Certificate Entry (VCE). */
+struct vce {
+ struct vce_header vce_hdr;
+ u8 cert_data_buf[];
+} __packed __aligned(4);
+
+static void cert_store_key_describe(const struct key *key, struct seq_file *m)
+{
+ char ascii[VC_NAME_LEN_BYTES + 1];
+
+ /*
+	 * The first 64 bytes of the key description are the key name in EBCDIC CP 500.
+	 * Convert it to ASCII for display in /proc/keys.
+ */
+ strscpy(ascii, key->description, sizeof(ascii));
+ EBCASC_500(ascii, VC_NAME_LEN_BYTES);
+ seq_puts(m, ascii);
+
+ seq_puts(m, &key->description[VC_NAME_LEN_BYTES]);
+ if (key_is_positive(key))
+ seq_printf(m, ": %u", key->datalen);
+}
+
+/*
+ * The certificate store key type takes over the properties of the
+ * user key type but cannot be updated.
+ */
+static struct key_type key_type_cert_store_key = {
+ .name = CERT_STORE_KEY_TYPE_NAME,
+ .preparse = user_preparse,
+ .free_preparse = user_free_preparse,
+ .instantiate = generic_key_instantiate,
+ .revoke = user_revoke,
+ .destroy = user_destroy,
+ .describe = cert_store_key_describe,
+ .read = user_read,
+};
+
+/* Logging functions. */
+static void pr_dbf_vcb(const struct vcb *b)
+{
+ pr_dbf_msg("VCB Header:");
+ pr_dbf_msg("vcb_input_length: %d", b->vcb_hdr.vcb_input_length);
+ pr_dbf_msg("first_vc_index: %d", b->vcb_hdr.first_vc_index);
+ pr_dbf_msg("last_vc_index: %d", b->vcb_hdr.last_vc_index);
+ pr_dbf_msg("cs_token: %d", b->vcb_hdr.cs_token);
+ pr_dbf_msg("vcb_output_length: %d", b->vcb_hdr.vcb_output_length);
+ pr_dbf_msg("version: %d", b->vcb_hdr.version);
+ pr_dbf_msg("stored_vc_count: %d", b->vcb_hdr.stored_vc_count);
+ pr_dbf_msg("remaining_vc_count: %d", b->vcb_hdr.remaining_vc_count);
+}
+
+static void pr_dbf_vce(const struct vce *e)
+{
+ unsigned char vc_name[VC_NAME_LEN_BYTES + 1];
+ char log_string[VC_NAME_LEN_BYTES + 40];
+
+ pr_dbf_msg("VCE Header:");
+ pr_dbf_msg("vce_hdr.vce_length: %d", e->vce_hdr.vce_length);
+ pr_dbf_msg("vce_hdr.flags: %d", e->vce_hdr.flags);
+ pr_dbf_msg("vce_hdr.key_type: %d", e->vce_hdr.key_type);
+ pr_dbf_msg("vce_hdr.vc_index: %d", e->vce_hdr.vc_index);
+ pr_dbf_msg("vce_hdr.vc_format: %d", e->vce_hdr.vc_format);
+ pr_dbf_msg("vce_hdr.key_id_length: %d", e->vce_hdr.key_id_length);
+ pr_dbf_msg("vce_hdr.vc_hash_type: %d", e->vce_hdr.vc_hash_type);
+ pr_dbf_msg("vce_hdr.vc_hash_length: %d", e->vce_hdr.vc_hash_length);
+ pr_dbf_msg("vce_hdr.vc_hash_offset: %d", e->vce_hdr.vc_hash_offset);
+ pr_dbf_msg("vce_hdr.vc_length: %d", e->vce_hdr.vc_length);
+ pr_dbf_msg("vce_hdr.vc_offset: %d", e->vce_hdr.vc_offset);
+
+ /* Certificate name in ASCII. */
+ memcpy(vc_name, e->vce_hdr.vc_name, VC_NAME_LEN_BYTES);
+ EBCASC_500(vc_name, VC_NAME_LEN_BYTES);
+ vc_name[VC_NAME_LEN_BYTES] = '\0';
+
+ snprintf(log_string, sizeof(log_string),
+ "index: %d vce_hdr.vc_name (ASCII): %s",
+ e->vce_hdr.vc_index, vc_name);
+ debug_text_event(cert_store_hexdump, 3, log_string);
+
+ /* Certificate data. */
+ debug_text_event(cert_store_hexdump, 3, "VCE: Certificate data start");
+ debug_event(cert_store_hexdump, 3, (u8 *)e->cert_data_buf, 128);
+ debug_text_event(cert_store_hexdump, 3, "VCE: Certificate data end");
+ debug_event(cert_store_hexdump, 3,
+ (u8 *)e->cert_data_buf + e->vce_hdr.vce_length - 128, 128);
+}
+
+static void pr_dbf_vcssb(const struct vcssb *s)
+{
+ debug_text_event(cert_store_hexdump, 3, "DIAG320 Subcode1");
+ debug_event(cert_store_hexdump, 3, (u8 *)s, VCSSB_LEN_BYTES);
+
+ pr_dbf_msg("VCSSB:");
+ pr_dbf_msg("vcssb_length: %u", s->vcssb_length);
+ pr_dbf_msg("version: %u", s->version);
+ pr_dbf_msg("cs_token: %u", s->cs_token);
+ pr_dbf_msg("total_vc_index_count: %u", s->total_vc_index_count);
+ pr_dbf_msg("max_vc_index_count: %u", s->max_vc_index_count);
+ pr_dbf_msg("max_vce_length: %u", s->max_vce_length);
+	pr_dbf_msg("max_vcxe_length: %u", s->max_vcxe_length);
+ pr_dbf_msg("max_single_vcb_length: %u", s->max_single_vcb_length);
+ pr_dbf_msg("total_vcb_length: %u", s->total_vcb_length);
+ pr_dbf_msg("max_single_vcxb_length: %u", s->max_single_vcxb_length);
+ pr_dbf_msg("total_vcxb_length: %u", s->total_vcxb_length);
+}
+
+static int __diag320(unsigned long subcode, void *addr)
+{
+ union register_pair rp = { .even = (unsigned long)addr, };
+
+ asm volatile(
+ " diag %[rp],%[subcode],0x320\n"
+ "0: nopr %%r7\n"
+ EX_TABLE(0b, 0b)
+ : [rp] "+d" (rp.pair)
+ : [subcode] "d" (subcode)
+ : "cc", "memory");
+
+ return rp.odd;
+}
+
+static int diag320(unsigned long subcode, void *addr)
+{
+ diag_stat_inc(DIAG_STAT_X320);
+
+ return __diag320(subcode, addr);
+}
+
+/*
+ * Calculate SHA256 hash of the VCE certificate and compare it to hash stored in
+ * VCE. Return -EINVAL if hashes don't match.
+ */
+static int check_certificate_hash(const struct vce *vce)
+{
+ u8 hash[SHA256_DIGEST_SIZE];
+ u16 vc_hash_length;
+ u8 *vce_hash;
+
+ vce_hash = (u8 *)vce + vce->vce_hdr.vc_hash_offset;
+ vc_hash_length = vce->vce_hdr.vc_hash_length;
+ sha256((u8 *)vce + vce->vce_hdr.vc_offset, vce->vce_hdr.vc_length, hash);
+ if (memcmp(vce_hash, hash, vc_hash_length) == 0)
+ return 0;
+
+ pr_dbf_msg("SHA256 hash of received certificate does not match");
+ debug_text_event(cert_store_hexdump, 3, "VCE hash:");
+ debug_event(cert_store_hexdump, 3, vce_hash, SHA256_DIGEST_SIZE);
+ debug_text_event(cert_store_hexdump, 3, "Calculated hash:");
+ debug_event(cert_store_hexdump, 3, hash, SHA256_DIGEST_SIZE);
+
+ return -EINVAL;
+}
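
check_certificate_hash() relies on the one-shot sha256() helper from <crypto/sha2.h>, so no crypto transform has to be allocated. A hedged sketch of the same pattern applied to an arbitrary buffer (the helper name is illustrative):

static bool buf_matches_sha256(const u8 *buf, unsigned int len,
			       const u8 expected[SHA256_DIGEST_SIZE])
{
	u8 digest[SHA256_DIGEST_SIZE];

	sha256(buf, len, digest);	/* one-shot, no transform needed */
	return memcmp(digest, expected, SHA256_DIGEST_SIZE) == 0;
}
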
+
+static int check_certificate_valid(const struct vce *vce)
+{
+ if (!(vce->vce_hdr.flags & VCE_FLAGS_VALID_MASK)) {
+ pr_dbf_msg("Certificate entry is invalid");
+ return -EINVAL;
+ }
+ if (vce->vce_hdr.vc_format != 1) {
+ pr_dbf_msg("Certificate format is not supported");
+ return -EINVAL;
+ }
+ if (vce->vce_hdr.vc_hash_type != 1) {
+ pr_dbf_msg("Hash type is not supported");
+ return -EINVAL;
+ }
+
+ return check_certificate_hash(vce);
+}
+
+static struct key *get_user_session_keyring(void)
+{
+ key_ref_t us_keyring_ref;
+
+ us_keyring_ref = lookup_user_key(KEY_SPEC_USER_SESSION_KEYRING,
+ KEY_LOOKUP_CREATE, KEY_NEED_LINK);
+ if (IS_ERR(us_keyring_ref)) {
+ pr_dbf_msg("Couldn't get user session keyring: %ld",
+ PTR_ERR(us_keyring_ref));
+ return ERR_PTR(-ENOKEY);
+ }
+ key_ref_put(us_keyring_ref);
+ return key_ref_to_ptr(us_keyring_ref);
+}
+
+/* Invalidate all keys from cert_store keyring. */
+static int invalidate_keyring_keys(struct key *keyring)
+{
+ unsigned long num_keys, key_index;
+ size_t keyring_payload_len;
+ key_serial_t *key_array;
+ struct key *current_key;
+ int rc;
+
+ keyring_payload_len = key_type_keyring.read(keyring, NULL, 0);
+ num_keys = keyring_payload_len / sizeof(key_serial_t);
+ key_array = kcalloc(num_keys, sizeof(key_serial_t), GFP_KERNEL);
+ if (!key_array)
+ return -ENOMEM;
+
+ rc = key_type_keyring.read(keyring, (char *)key_array, keyring_payload_len);
+ if (rc != keyring_payload_len) {
+ pr_dbf_msg("Couldn't read keyring payload");
+ goto out;
+ }
+
+ for (key_index = 0; key_index < num_keys; key_index++) {
+ current_key = key_lookup(key_array[key_index]);
+ pr_dbf_msg("Invalidating key %08x", current_key->serial);
+
+ key_invalidate(current_key);
+ key_put(current_key);
+ rc = key_unlink(keyring, current_key);
+ if (rc) {
+ pr_dbf_msg("Couldn't unlink key %08x: %d", current_key->serial, rc);
+ break;
+ }
+ }
+out:
+ kfree(key_array);
+ return rc;
+}
+
+static struct key *find_cs_keyring(void)
+{
+ key_ref_t cs_keyring_ref;
+ struct key *cs_keyring;
+
+ cs_keyring_ref = keyring_search(make_key_ref(get_user_session_keyring(), true),
+ &key_type_keyring, CERT_STORE_KEYRING_NAME,
+ false);
+ if (!IS_ERR(cs_keyring_ref)) {
+ cs_keyring = key_ref_to_ptr(cs_keyring_ref);
+ key_ref_put(cs_keyring_ref);
+ goto found;
+ }
+ /* Search default locations: thread, process, session keyrings */
+ cs_keyring = request_key(&key_type_keyring, CERT_STORE_KEYRING_NAME, NULL);
+ if (IS_ERR(cs_keyring))
+ return NULL;
+ key_put(cs_keyring);
+found:
+ return cs_keyring;
+}
+
+static void cleanup_cs_keys(void)
+{
+ struct key *cs_keyring;
+
+ cs_keyring = find_cs_keyring();
+ if (!cs_keyring)
+ return;
+
+ pr_dbf_msg("Found cert_store keyring. Purging...");
+ /*
+ * Remove cert_store_key_type in case invalidation
+ * of old cert_store keys failed (= severe error).
+ */
+ if (invalidate_keyring_keys(cs_keyring))
+ unregister_key_type(&key_type_cert_store_key);
+
+ keyring_clear(cs_keyring);
+ key_invalidate(cs_keyring);
+ key_put(cs_keyring);
+ key_unlink(get_user_session_keyring(), cs_keyring);
+}
+
+static struct key *create_cs_keyring(void)
+{
+ static struct key *cs_keyring;
+
+ /* Cleanup previous cs_keyring and all associated keys if any. */
+ cleanup_cs_keys();
+ cs_keyring = keyring_alloc(CERT_STORE_KEYRING_NAME, GLOBAL_ROOT_UID,
+ GLOBAL_ROOT_GID, current_cred(),
+ (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ,
+ KEY_ALLOC_NOT_IN_QUOTA | KEY_ALLOC_SET_KEEP,
+ NULL, get_user_session_keyring());
+ if (IS_ERR(cs_keyring)) {
+ pr_dbf_msg("Can't allocate cert_store keyring");
+ return NULL;
+ }
+
+ pr_dbf_msg("Successfully allocated cert_store keyring: %08x", cs_keyring->serial);
+
+ /*
+	 * In case a previous clean-up ran into an
+	 * error and unregistered the key type.
+ */
+ register_key_type(&key_type_cert_store_key);
+
+ return cs_keyring;
+}
+
+/*
+ * Allocate memory and create the key description in the format
+ * [key name in EBCDIC]:[VCE index]:[CS token].
+ * Return a pointer to the key description or NULL if the memory
+ * allocation failed. The memory must be freed by the caller.
+ */
+static char *get_key_description(struct vcssb *vcssb, const struct vce *vce)
+{
+ size_t len, name_len;
+ u32 cs_token;
+ char *desc;
+
+ cs_token = vcssb->cs_token;
+ /* Description string contains "%64s:%05u:%010u\0". */
+ name_len = sizeof(vce->vce_hdr.vc_name);
+ len = name_len + 1 + 5 + 1 + 10 + 1;
+ desc = kmalloc(len, GFP_KERNEL);
+ if (!desc)
+ return NULL;
+
+ memcpy(desc, vce->vce_hdr.vc_name, name_len);
+ snprintf(desc + name_len, len - name_len, ":%05u:%010u",
+ vce->vce_hdr.vc_index, cs_token);
+
+ return desc;
+}
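
As a purely illustrative example of the format built above: index 7 and token 305419896 produce the tail ":00007:0305419896", appended to the 64 raw EBCDIC name bytes.

static void demo_key_desc_tail(void)
{
	/* Hypothetical values, for illustration only. */
	u16 vc_index = 7;
	u32 cs_token = 305419896;
	char tail[1 + 5 + 1 + 10 + 1];

	snprintf(tail, sizeof(tail), ":%05u:%010u", vc_index, cs_token);
	/* tail now holds ":00007:0305419896" */
}
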
+
+/*
+ * Create a key of type "cert_store_key" using the data from VCE for key
+ * payload and key description. Link the key to "cert_store" keyring.
+ */
+static int create_key_from_vce(struct vcssb *vcssb, struct vce *vce,
+ struct key *keyring)
+{
+ key_ref_t newkey;
+ char *desc;
+ int rc;
+
+ desc = get_key_description(vcssb, vce);
+ if (!desc)
+ return -ENOMEM;
+
+ newkey = key_create_or_update(
+ make_key_ref(keyring, true), CERT_STORE_KEY_TYPE_NAME,
+ desc, (u8 *)vce + vce->vce_hdr.vc_offset,
+ vce->vce_hdr.vc_length,
+ (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ,
+ KEY_ALLOC_NOT_IN_QUOTA);
+
+ rc = PTR_ERR_OR_ZERO(newkey);
+ if (rc) {
+ pr_dbf_msg("Couldn't create a key from Certificate Entry (%d)", rc);
+ rc = -ENOKEY;
+ goto out;
+ }
+
+ key_ref_put(newkey);
+out:
+ kfree(desc);
+ return rc;
+}
+
+/* Get the Verification Certificate Store Support Block (VCSSB) with DIAG320 subcode 1. */
+static int get_vcssb(struct vcssb *vcssb)
+{
+ int diag320_rc;
+
+ memset(vcssb, 0, sizeof(*vcssb));
+ vcssb->vcssb_length = VCSSB_LEN_BYTES;
+ diag320_rc = diag320(DIAG320_STORAGE, vcssb);
+ pr_dbf_vcssb(vcssb);
+
+ if (diag320_rc != DIAG320_RC_OK) {
+ pr_dbf_msg("Diag 320 Subcode 1 returned bad RC: %04x", diag320_rc);
+ return -EIO;
+ }
+ if (vcssb->vcssb_length == VCSSB_LEN_NO_CERTS) {
+ pr_dbf_msg("No certificates available for current configuration");
+ return -ENOKEY;
+ }
+
+ return 0;
+}
+
+static u32 get_4k_mult_vcb_size(struct vcssb *vcssb)
+{
+ return round_up(vcssb->max_single_vcb_length, PAGE_SIZE);
+}
+
+/* Fill input fields of single-entry VCB that will be read by LPAR. */
+static void fill_vcb_input(struct vcssb *vcssb, struct vcb *vcb, u16 index)
+{
+ memset(vcb, 0, sizeof(*vcb));
+ vcb->vcb_hdr.vcb_input_length = get_4k_mult_vcb_size(vcssb);
+ vcb->vcb_hdr.cs_token = vcssb->cs_token;
+
+ /* Request single entry. */
+ vcb->vcb_hdr.first_vc_index = index;
+ vcb->vcb_hdr.last_vc_index = index;
+}
+
+static void extract_vce_from_sevcb(struct vcb *vcb, struct vce *vce)
+{
+ struct vce *extracted_vce;
+
+ extracted_vce = (struct vce *)vcb->vcb_buf;
+ memcpy(vce, vcb->vcb_buf, extracted_vce->vce_hdr.vce_length);
+ pr_dbf_vce(vce);
+}
+
+static int get_sevcb(struct vcssb *vcssb, u16 index, struct vcb *vcb)
+{
+ int rc, diag320_rc;
+
+ fill_vcb_input(vcssb, vcb, index);
+
+ diag320_rc = diag320(DIAG320_CERT_BLOCK, vcb);
+ pr_dbf_msg("Diag 320 Subcode2 RC %2x", diag320_rc);
+ pr_dbf_vcb(vcb);
+
+ switch (diag320_rc) {
+ case DIAG320_RC_OK:
+ rc = 0;
+ if (vcb->vcb_hdr.vcb_output_length == VCB_LEN_NO_CERTS) {
+ pr_dbf_msg("No certificate entry for index %u", index);
+ rc = -ENOKEY;
+ } else if (vcb->vcb_hdr.remaining_vc_count != 0) {
+ /* Retry on insufficient space. */
+ pr_dbf_msg("Couldn't get all requested certificates");
+ rc = -EAGAIN;
+ }
+ break;
+ case DIAG320_RC_CS_NOMATCH:
+ pr_dbf_msg("Certificate Store token mismatch");
+ rc = -EAGAIN;
+ break;
+ default:
+ pr_dbf_msg("Diag 320 Subcode2 returned bad rc (0x%4x)", diag320_rc);
+ rc = -EINVAL;
+ break;
+ }
+
+ return rc;
+}
+
+/*
+ * Allocate memory for a single-entry VCB, get the VCB via a DIAG320 subcode 2 call,
+ * extract the VCE and create a key from its certificate.
+ */
+static int create_key_from_sevcb(struct vcssb *vcssb, u16 index,
+ struct key *keyring)
+{
+ struct vcb *vcb;
+ struct vce *vce;
+ int rc;
+
+ rc = -ENOMEM;
+ vcb = vmalloc(get_4k_mult_vcb_size(vcssb));
+ vce = vmalloc(vcssb->max_single_vcb_length - sizeof(vcb->vcb_hdr));
+ if (!vcb || !vce)
+ goto out;
+
+ rc = get_sevcb(vcssb, index, vcb);
+ if (rc)
+ goto out;
+
+ extract_vce_from_sevcb(vcb, vce);
+ rc = check_certificate_valid(vce);
+ if (rc)
+ goto out;
+
+ rc = create_key_from_vce(vcssb, vce, keyring);
+ if (rc)
+ goto out;
+
+ pr_dbf_msg("Successfully created key from Certificate Entry %d", index);
+out:
+ vfree(vce);
+ vfree(vcb);
+ return rc;
+}
+
+/*
+ * Request a single-entry VCB for each VCE available for the partition.
+ * Create a key from it and link it to the cert_store keyring. If no keys
+ * could be created (i.e. all VCEs were invalid), return -ENOKEY.
+ */
+static int add_certificates_to_keyring(struct vcssb *vcssb, struct key *keyring)
+{
+ int rc, index, count, added;
+
+ count = 0;
+ added = 0;
+	/* Certificate Store entry indices start at 1 and have no gaps. */
+ for (index = 1; index < vcssb->total_vc_index_count + 1; index++) {
+ pr_dbf_msg("Creating key from VCE %u", index);
+ rc = create_key_from_sevcb(vcssb, index, keyring);
+ count++;
+
+ if (rc == -EAGAIN)
+ return rc;
+
+ if (rc)
+ pr_dbf_msg("Creating key from VCE %u failed (%d)", index, rc);
+ else
+ added++;
+ }
+
+ if (added == 0) {
+ pr_dbf_msg("Processed %d entries. No keys created", count);
+ return -ENOKEY;
+ }
+
+ pr_info("Added %d of %d keys to cert_store keyring", added, count);
+
+ /*
+	 * Do not allow linking more keys to the certificate store keyring after
+	 * all the VCEs have been processed.
+ */
+ rc = keyring_restrict(make_key_ref(keyring, true), NULL, NULL);
+ if (rc)
+ pr_dbf_msg("Failed to set restriction to cert_store keyring (%d)", rc);
+
+ return 0;
+}
+
+/*
+ * Check which DIAG320 subcodes are installed.
+ * Return -ENOENT if subcodes 1 or 2 are not available.
+ */
+static int query_diag320_subcodes(void)
+{
+ unsigned long ism[ISM_LEN_DWORDS];
+ int rc;
+
+ rc = diag320(0, ism);
+ if (rc != DIAG320_RC_OK) {
+ pr_dbf_msg("DIAG320 subcode query returned %04x", rc);
+ return -ENOENT;
+ }
+
+ debug_text_event(cert_store_hexdump, 3, "DIAG320 Subcode 0");
+ debug_event(cert_store_hexdump, 3, ism, sizeof(ism));
+
+ if (!test_bit_inv(1, ism) || !test_bit_inv(2, ism)) {
+ pr_dbf_msg("Not all required DIAG320 subcodes are installed");
+ return -ENOENT;
+ }
+
+ return 0;
+}
+
+/*
+ * Check if Certificate Store is supported by the firmware and DIAG320 subcodes
+ * 1 and 2 are installed. Create cert_store keyring and link all certificates
+ * available for the current partition to it as "cert_store_key" type
+ * keys. On refresh or error invalidate cert_store keyring and destroy
+ * all keys of "cert_store_key" type.
+ */
+static int fill_cs_keyring(void)
+{
+ struct key *cs_keyring;
+ struct vcssb *vcssb;
+ int rc;
+
+ rc = -ENOMEM;
+ vcssb = kmalloc(VCSSB_LEN_BYTES, GFP_KERNEL);
+ if (!vcssb)
+ goto cleanup_keys;
+
+ rc = -ENOENT;
+ if (!sclp.has_diag320) {
+ pr_dbf_msg("Certificate Store is not supported");
+ goto cleanup_keys;
+ }
+
+ rc = query_diag320_subcodes();
+ if (rc)
+ goto cleanup_keys;
+
+ rc = get_vcssb(vcssb);
+ if (rc)
+ goto cleanup_keys;
+
+ rc = -ENOMEM;
+ cs_keyring = create_cs_keyring();
+ if (!cs_keyring)
+ goto cleanup_keys;
+
+ rc = add_certificates_to_keyring(vcssb, cs_keyring);
+ if (rc)
+ goto cleanup_cs_keyring;
+
+ goto out;
+
+cleanup_cs_keyring:
+ key_put(cs_keyring);
+cleanup_keys:
+ cleanup_cs_keys();
+out:
+ kfree(vcssb);
+ return rc;
+}
+
+static DEFINE_MUTEX(cs_refresh_lock);
+static int cs_status_val = -1;
+
+static ssize_t cs_status_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ if (cs_status_val == -1)
+ return sysfs_emit(buf, "uninitialized\n");
+ else if (cs_status_val == 0)
+ return sysfs_emit(buf, "ok\n");
+
+ return sysfs_emit(buf, "failed (%d)\n", cs_status_val);
+}
+
+static struct kobj_attribute cs_status_attr = __ATTR_RO(cs_status);
+
+static ssize_t refresh_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int rc, retries;
+
+ pr_dbf_msg("Refresh certificate store information requested");
+ rc = mutex_lock_interruptible(&cs_refresh_lock);
+ if (rc)
+ return rc;
+
+ for (retries = 0; retries < DIAG_MAX_RETRIES; retries++) {
+ /* Request certificates from certificate store. */
+ rc = fill_cs_keyring();
+ if (rc)
+ pr_dbf_msg("Failed to refresh certificate store information (%d)", rc);
+ if (rc != -EAGAIN)
+ break;
+ }
+ cs_status_val = rc;
+ mutex_unlock(&cs_refresh_lock);
+
+ return rc ?: count;
+}
+
+static struct kobj_attribute refresh_attr = __ATTR_WO(refresh);
+
+static const struct attribute *cert_store_attrs[] __initconst = {
+ &cs_status_attr.attr,
+ &refresh_attr.attr,
+ NULL,
+};
+
+static struct kobject *cert_store_kobj;
+
+static int __init cert_store_init(void)
+{
+ int rc = -ENOMEM;
+
+ cert_store_dbf = debug_register("cert_store_msg", 10, 1, 64);
+ if (!cert_store_dbf)
+ goto cleanup_dbf;
+
+ cert_store_hexdump = debug_register("cert_store_hexdump", 3, 1, 128);
+ if (!cert_store_hexdump)
+ goto cleanup_dbf;
+
+ debug_register_view(cert_store_hexdump, &debug_hex_ascii_view);
+ debug_register_view(cert_store_dbf, &debug_sprintf_view);
+
+ /* Create directory /sys/firmware/cert_store. */
+ cert_store_kobj = kobject_create_and_add("cert_store", firmware_kobj);
+ if (!cert_store_kobj)
+ goto cleanup_dbf;
+
+ rc = sysfs_create_files(cert_store_kobj, cert_store_attrs);
+ if (rc)
+ goto cleanup_kobj;
+
+ register_key_type(&key_type_cert_store_key);
+
+ return rc;
+
+cleanup_kobj:
+ kobject_put(cert_store_kobj);
+cleanup_dbf:
+ debug_unregister(cert_store_dbf);
+ debug_unregister(cert_store_hexdump);
+
+ return rc;
+}
+device_initcall(cert_store_init);
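
Once the initcall has run, the store is driven entirely from user space: writing to /sys/firmware/cert_store/refresh (re)loads the certificates, cs_status reports the last result, and the keys appear under a "cert_store" keyring linked below the user session keyring. A small user-space sketch, assuming libkeyutils and sufficient privileges (error handling trimmed):

#include <fcntl.h>
#include <keyutils.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	long ring;
	int fd;

	/* Trigger a refresh; any write to the attribute will do. */
	fd = open("/sys/firmware/cert_store/refresh", O_WRONLY);
	if (fd >= 0) {
		write(fd, "1", 1);
		close(fd);
	}

	/* Locate the cert_store keyring below the user session keyring. */
	ring = keyctl_search(KEY_SPEC_USER_SESSION_KEYRING, "keyring",
			     "cert_store", 0);
	if (ring < 0) {
		perror("keyctl_search");
		return 1;
	}
	printf("cert_store keyring serial: %ld\n", ring);
	return 0;
}
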
diff --git a/arch/s390/kernel/compat_audit.c b/arch/s390/kernel/compat_audit.c
index 444fb1f66944..a7c46e8310f0 100644
--- a/arch/s390/kernel/compat_audit.c
+++ b/arch/s390/kernel/compat_audit.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#undef __s390x__
+#include <linux/audit_arch.h>
#include <asm/unistd.h>
#include "audit.h"
@@ -32,14 +33,16 @@ int s390_classify_syscall(unsigned syscall)
{
switch(syscall) {
case __NR_open:
- return 2;
+ return AUDITSC_OPEN;
case __NR_openat:
- return 3;
+ return AUDITSC_OPENAT;
case __NR_socketcall:
- return 4;
+ return AUDITSC_SOCKETCALL;
case __NR_execve:
- return 5;
+ return AUDITSC_EXECVE;
+ case __NR_openat2:
+ return AUDITSC_OPENAT2;
default:
- return 1;
+ return AUDITSC_COMPAT;
}
}
diff --git a/arch/s390/kernel/compat_linux.h b/arch/s390/kernel/compat_linux.h
index 64509e7dbd3b..ef23739b277c 100644
--- a/arch/s390/kernel/compat_linux.h
+++ b/arch/s390/kernel/compat_linux.h
@@ -5,69 +5,59 @@
#include <linux/compat.h>
#include <linux/socket.h>
#include <linux/syscalls.h>
+#include <asm/ptrace.h>
-/* Macro that masks the high order bit of an 32 bit pointer and converts it*/
-/* to a 64 bit pointer */
-#define A(__x) ((unsigned long)((__x) & 0x7FFFFFFFUL))
-#define AA(__x) \
- ((unsigned long)(__x))
+/*
+ * Macro that masks the high order bit of a 32 bit pointer and
+ * converts it to a 64 bit pointer.
+ */
+#define A(__x) ((unsigned long)((__x) & 0x7FFFFFFFUL))
+#define AA(__x) ((unsigned long)(__x))
/* Now 32bit compatibility types */
struct ipc_kludge_32 {
- __u32 msgp; /* pointer */
- __s32 msgtyp;
+ __u32 msgp; /* pointer */
+ __s32 msgtyp;
};
/* asm/sigcontext.h */
-typedef union
-{
- __u64 d;
- __u32 f;
+typedef union {
+ __u64 d;
+ __u32 f;
} freg_t32;
-typedef struct
-{
+typedef struct {
unsigned int fpc;
unsigned int pad;
- freg_t32 fprs[__NUM_FPRS];
+ freg_t32 fprs[__NUM_FPRS];
} _s390_fp_regs32;
-typedef struct
-{
- __u32 mask;
- __u32 addr;
-} _psw_t32 __attribute__ ((aligned(8)));
-
-typedef struct
-{
- _psw_t32 psw;
+typedef struct {
+ psw_t32 psw;
__u32 gprs[__NUM_GPRS];
__u32 acrs[__NUM_ACRS];
} _s390_regs_common32;
-typedef struct
-{
+typedef struct {
_s390_regs_common32 regs;
- _s390_fp_regs32 fpregs;
+ _s390_fp_regs32 fpregs;
} _sigregs32;
-typedef struct
-{
- __u32 gprs_high[__NUM_GPRS];
- __u64 vxrs_low[__NUM_VXRS_LOW];
- __vector128 vxrs_high[__NUM_VXRS_HIGH];
- __u8 __reserved[128];
+typedef struct {
+ __u32 gprs_high[__NUM_GPRS];
+ __u64 vxrs_low[__NUM_VXRS_LOW];
+ __vector128 vxrs_high[__NUM_VXRS_HIGH];
+ __u8 __reserved[128];
} _sigregs_ext32;
#define _SIGCONTEXT_NSIG32 64
#define _SIGCONTEXT_NSIG_BPW32 32
#define __SIGNAL_FRAMESIZE32 96
-#define _SIGMASK_COPY_SIZE32 (sizeof(u32)*2)
+#define _SIGMASK_COPY_SIZE32 (sizeof(u32) * 2)
-struct sigcontext32
-{
+struct sigcontext32 {
__u32 oldmask[_COMPAT_NSIG_WORDS];
- __u32 sregs; /* pointer */
+ __u32 sregs; /* pointer */
};
/* asm/signal.h */
@@ -75,11 +65,11 @@ struct sigcontext32
/* asm/ucontext.h */
struct ucontext32 {
__u32 uc_flags;
- __u32 uc_link; /* pointer */
+ __u32 uc_link; /* pointer */
compat_stack_t uc_stack;
_sigregs32 uc_mcontext;
compat_sigset_t uc_sigmask;
- /* Allow for uc_sigmask growth. Glibc uses a 1024-bit sigset_t. */
+ /* Allow for uc_sigmask growth. Glibc uses a 1024-bit sigset_t. */
unsigned char __unused[128 - sizeof(compat_sigset_t)];
_sigregs_ext32 uc_mcontext_ext;
};
@@ -88,25 +78,6 @@ struct stat64_emu31;
struct mmap_arg_struct_emu31;
struct fadvise64_64_args;
-long compat_sys_s390_chown16(const char __user *filename, u16 user, u16 group);
-long compat_sys_s390_lchown16(const char __user *filename, u16 user, u16 group);
-long compat_sys_s390_fchown16(unsigned int fd, u16 user, u16 group);
-long compat_sys_s390_setregid16(u16 rgid, u16 egid);
-long compat_sys_s390_setgid16(u16 gid);
-long compat_sys_s390_setreuid16(u16 ruid, u16 euid);
-long compat_sys_s390_setuid16(u16 uid);
-long compat_sys_s390_setresuid16(u16 ruid, u16 euid, u16 suid);
-long compat_sys_s390_getresuid16(u16 __user *ruid, u16 __user *euid, u16 __user *suid);
-long compat_sys_s390_setresgid16(u16 rgid, u16 egid, u16 sgid);
-long compat_sys_s390_getresgid16(u16 __user *rgid, u16 __user *egid, u16 __user *sgid);
-long compat_sys_s390_setfsuid16(u16 uid);
-long compat_sys_s390_setfsgid16(u16 gid);
-long compat_sys_s390_getgroups16(int gidsetsize, u16 __user *grouplist);
-long compat_sys_s390_setgroups16(int gidsetsize, u16 __user *grouplist);
-long compat_sys_s390_getuid16(void);
-long compat_sys_s390_geteuid16(void);
-long compat_sys_s390_getgid16(void);
-long compat_sys_s390_getegid16(void);
long compat_sys_s390_truncate64(const char __user *path, u32 high, u32 low);
long compat_sys_s390_ftruncate64(unsigned int fd, u32 high, u32 low);
long compat_sys_s390_pread64(unsigned int fd, char __user *ubuf, compat_size_t count, u32 high, u32 low);
@@ -118,8 +89,8 @@ long compat_sys_s390_fstat64(unsigned int fd, struct stat64_emu31 __user *statbu
long compat_sys_s390_fstatat64(unsigned int dfd, const char __user *filename, struct stat64_emu31 __user *statbuf, int flag);
long compat_sys_s390_old_mmap(struct mmap_arg_struct_emu31 __user *arg);
long compat_sys_s390_mmap2(struct mmap_arg_struct_emu31 __user *arg);
-long compat_sys_s390_read(unsigned int fd, char __user * buf, compat_size_t count);
-long compat_sys_s390_write(unsigned int fd, const char __user * buf, compat_size_t count);
+long compat_sys_s390_read(unsigned int fd, char __user *buf, compat_size_t count);
+long compat_sys_s390_write(unsigned int fd, const char __user *buf, compat_size_t count);
long compat_sys_s390_fadvise64(int fd, u32 high, u32 low, compat_size_t len, int advise);
long compat_sys_s390_fadvise64_64(struct fadvise64_64_args __user *args);
long compat_sys_s390_sync_file_range(int fd, u32 offhigh, u32 offlow, u32 nhigh, u32 nlow, unsigned int flags);
diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c
index 38d4bdbc34b9..f8fc6c25d051 100644
--- a/arch/s390/kernel/compat_signal.c
+++ b/arch/s390/kernel/compat_signal.c
@@ -28,6 +28,8 @@
#include <linux/uaccess.h>
#include <asm/lowcore.h>
#include <asm/switch_to.h>
+#include <asm/vdso.h>
+#include <asm/fpu/api.h>
#include "compat_linux.h"
#include "compat_ptrace.h"
#include "entry.h"
@@ -88,7 +90,7 @@ static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs)
_sigregs32 user_sregs;
int i;
- /* Alwys make any pending restarted system call return -EINTR */
+ /* Always make any pending restarted system call return -EINTR */
current->restart_block.fn = do_no_restart_syscall;
if (__copy_from_user(&user_sregs, &sregs->regs, sizeof(user_sregs)))
@@ -97,10 +99,6 @@ static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs)
if (!is_ri_task(current) && (user_sregs.regs.psw.mask & PSW32_MASK_RI))
return -EINVAL;
- /* Test the floating-point-control word. */
- if (test_fp_ctl(user_sregs.fpregs.fpc))
- return -EINVAL;
-
/* Use regs->psw.mask instead of PSW_USER_BITS to preserve PER bit. */
regs->psw.mask = (regs->psw.mask & ~(PSW_MASK_USER | PSW_MASK_RI)) |
(__u64)(user_sregs.regs.psw.mask & PSW32_MASK_USER) << 32 |
@@ -136,9 +134,9 @@ static int save_sigregs_ext32(struct pt_regs *regs,
return -EFAULT;
/* Save vector registers to signal stack */
- if (MACHINE_HAS_VX) {
+ if (cpu_has_vx()) {
for (i = 0; i < __NUM_VXRS_LOW; i++)
- vxrs[i] = *((__u64 *)(current->thread.fpu.vxrs + i) + 1);
+ vxrs[i] = current->thread.fpu.vxrs[i].low;
if (__copy_to_user(&sregs_ext->vxrs_low, vxrs,
sizeof(sregs_ext->vxrs_low)) ||
__copy_to_user(&sregs_ext->vxrs_high,
@@ -164,7 +162,7 @@ static int restore_sigregs_ext32(struct pt_regs *regs,
*(__u32 *)&regs->gprs[i] = gprs_high[i];
/* Restore vector registers from signal stack */
- if (MACHINE_HAS_VX) {
+ if (cpu_has_vx()) {
if (__copy_from_user(vxrs, &sregs_ext->vxrs_low,
sizeof(sregs_ext->vxrs_low)) ||
__copy_from_user(current->thread.fpu.vxrs + __NUM_VXRS_LOW,
@@ -172,7 +170,7 @@ static int restore_sigregs_ext32(struct pt_regs *regs,
sizeof(sregs_ext->vxrs_high)))
return -EFAULT;
for (i = 0; i < __NUM_VXRS_LOW; i++)
- *((__u64 *)(current->thread.fpu.vxrs + i) + 1) = vxrs[i];
+ current->thread.fpu.vxrs[i].low = vxrs[i];
}
return 0;
}
@@ -264,7 +262,7 @@ static int setup_frame32(struct ksignal *ksig, sigset_t *set,
* the machine supports it
*/
frame_size = sizeof(*frame) - sizeof(frame->sregs_ext.__reserved);
- if (!MACHINE_HAS_VX)
+ if (!cpu_has_vx())
frame_size -= sizeof(frame->sregs_ext.vxrs_low) +
sizeof(frame->sregs_ext.vxrs_high);
frame = get_sigframe(&ksig->ka, regs, frame_size);
@@ -303,11 +301,7 @@ static int setup_frame32(struct ksignal *ksig, sigset_t *set,
restorer = (unsigned long __force)
ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE;
} else {
- /* Signal frames without vectors registers are short ! */
- __u16 __user *svc = (void __user *) frame + frame_size - 2;
- if (__put_user(S390_SYSCALL_OPCODE | __NR_sigreturn, svc))
- return -EFAULT;
- restorer = (unsigned long __force) svc | PSW32_ADDR_AMODE;
+ restorer = VDSO32_SYMBOL(current, sigreturn);
}
/* Set up registers for signal handler */
@@ -351,11 +345,12 @@ static int setup_rt_frame32(struct ksignal *ksig, sigset_t *set,
* the machine supports it
*/
uc_flags = UC_GPRS_HIGH;
- if (MACHINE_HAS_VX) {
+ if (cpu_has_vx()) {
uc_flags |= UC_VXRS;
- } else
+ } else {
frame_size -= sizeof(frame->uc.uc_mcontext_ext.vxrs_low) +
sizeof(frame->uc.uc_mcontext_ext.vxrs_high);
+ }
frame = get_sigframe(&ksig->ka, regs, frame_size);
if (frame == (void __user *) -1UL)
return -EFAULT;
@@ -370,10 +365,7 @@ static int setup_rt_frame32(struct ksignal *ksig, sigset_t *set,
restorer = (unsigned long __force)
ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE;
} else {
- __u16 __user *svc = &frame->svc_insn;
- if (__put_user(S390_SYSCALL_OPCODE | __NR_rt_sigreturn, svc))
- return -EFAULT;
- restorer = (unsigned long __force) svc | PSW32_ADDR_AMODE;
+ restorer = VDSO32_SYMBOL(current, rt_sigreturn);
}
/* Create siginfo on the signal stack */
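
The vector-register hunks above replace pointer arithmetic on the 128-bit registers with a named accessor; on big-endian s390 the low doubleword is the second 8 bytes, which is exactly what the old expression addressed. A sketch of the equivalence (function names are illustrative, and the .low member is assumed from the new code above):

/* Both helpers read the same 8 bytes of vector register i. */
static inline __u64 vxr_low_by_address(__vector128 *vxrs, int i)
{
	return *((__u64 *)(vxrs + i) + 1);	/* old style: second doubleword */
}

static inline __u64 vxr_low_by_name(__vector128 *vxrs, int i)
{
	return vxrs[i].low;			/* new style: named accessor */
}
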
diff --git a/arch/s390/kernel/cpcmd.c b/arch/s390/kernel/cpcmd.c
index af013b4244d3..b210a29d3ee9 100644
--- a/arch/s390/kernel/cpcmd.c
+++ b/arch/s390/kernel/cpcmd.c
@@ -16,41 +16,45 @@
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/mm.h>
+#include <linux/io.h>
#include <asm/diag.h>
#include <asm/ebcdic.h>
#include <asm/cpcmd.h>
-#include <asm/io.h>
static DEFINE_SPINLOCK(cpcmd_lock);
static char cpcmd_buf[241];
static int diag8_noresponse(int cmdlen)
{
- register unsigned long reg2 asm ("2") = (addr_t) cpcmd_buf;
- register unsigned long reg3 asm ("3") = cmdlen;
-
asm volatile(
- " diag %1,%0,0x8\n"
- : "+d" (reg3) : "d" (reg2) : "cc");
- return reg3;
+ " diag %[rx],%[ry],0x8\n"
+ : [ry] "+&d" (cmdlen)
+ : [rx] "d" (__pa(cpcmd_buf))
+ : "cc");
+ return cmdlen;
}
static int diag8_response(int cmdlen, char *response, int *rlen)
{
- register unsigned long reg2 asm ("2") = (addr_t) cpcmd_buf;
- register unsigned long reg3 asm ("3") = (addr_t) response;
- register unsigned long reg4 asm ("4") = cmdlen | 0x40000000L;
- register unsigned long reg5 asm ("5") = *rlen;
+ union register_pair rx, ry;
+ int cc;
+ rx.even = __pa(cpcmd_buf);
+ rx.odd = __pa(response);
+ ry.even = cmdlen | 0x40000000L;
+ ry.odd = *rlen;
asm volatile(
- " diag %2,%0,0x8\n"
- " brc 8,1f\n"
- " agr %1,%4\n"
- "1:\n"
- : "+d" (reg4), "+d" (reg5)
- : "d" (reg2), "d" (reg3), "d" (*rlen) : "cc");
- *rlen = reg5;
- return reg4;
+ " diag %[rx],%[ry],0x8\n"
+ " ipm %[cc]\n"
+ " srl %[cc],28\n"
+ : [cc] "=&d" (cc), [ry] "+&d" (ry.pair)
+ : [rx] "d" (rx.pair)
+ : "cc");
+ if (cc)
+ *rlen += ry.odd;
+ else
+ *rlen = ry.odd;
+ return ry.even;
}
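
diag8_noresponse() and diag8_response() are the low-level halves of cpcmd(), which copies the command into cpcmd_buf, converts it to EBCDIC, issues DIAG 8 and converts any response back to ASCII. A hedged usage sketch (meaningful under z/VM only; the command string and helper name are illustrative):

static void demo_query_userid(void)
{
	char response[256];
	int cp_rc;

	/* Ask CP for the guest's user id; cp_rc receives the CP return code. */
	cpcmd("QUERY USERID", response, sizeof(response), &cp_rc);
	pr_info("CP said (rc=%d): %s\n", cp_rc, response);
}
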
/*
diff --git a/arch/s390/kernel/cpufeature.c b/arch/s390/kernel/cpufeature.c
new file mode 100644
index 000000000000..1b2ae42a0c15
--- /dev/null
+++ b/arch/s390/kernel/cpufeature.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corp. 2022
+ */
+
+#include <linux/cpufeature.h>
+#include <linux/bug.h>
+#include <asm/elf.h>
+
+enum {
+ TYPE_HWCAP,
+ TYPE_FACILITY,
+};
+
+struct s390_cpu_feature {
+ unsigned int type : 4;
+ unsigned int num : 28;
+};
+
+static struct s390_cpu_feature s390_cpu_features[MAX_CPU_FEATURES] = {
+ [S390_CPU_FEATURE_MSA] = {.type = TYPE_HWCAP, .num = HWCAP_NR_MSA},
+ [S390_CPU_FEATURE_VXRS] = {.type = TYPE_HWCAP, .num = HWCAP_NR_VXRS},
+ [S390_CPU_FEATURE_UV] = {.type = TYPE_FACILITY, .num = 158},
+};
+
+/*
+ * cpu_have_feature - Test CPU features on module initialization
+ */
+int cpu_have_feature(unsigned int num)
+{
+ struct s390_cpu_feature *feature;
+
+ if (WARN_ON_ONCE(num >= MAX_CPU_FEATURES))
+ return 0;
+ feature = &s390_cpu_features[num];
+ switch (feature->type) {
+ case TYPE_HWCAP:
+ return !!(elf_hwcap & BIT(feature->num));
+ case TYPE_FACILITY:
+ return test_facility(feature->num);
+ default:
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+}
+EXPORT_SYMBOL(cpu_have_feature);
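
Modules normally reach cpu_have_feature() through the generic cpufeature machinery: module_cpu_feature_match() ties autoloading and initialization to one of the features listed above. A minimal hedged sketch (module name and body are illustrative):

#include <linux/cpufeature.h>
#include <linux/module.h>

static int __init demo_init(void)
{
	return 0;	/* only runs when the MSA facility is available */
}

static void __exit demo_exit(void)
{
}

module_cpu_feature_match(S390_CPU_FEATURE_MSA, demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
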
diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c
index f96a5857bbfd..5c46c2659305 100644
--- a/arch/s390/kernel/crash_dump.c
+++ b/arch/s390/kernel/crash_dump.c
@@ -15,11 +15,14 @@
#include <linux/slab.h>
#include <linux/memblock.h>
#include <linux/elf.h>
+#include <linux/uio.h>
#include <asm/asm-offsets.h>
#include <asm/os_info.h>
#include <asm/elf.h>
#include <asm/ipl.h>
#include <asm/sclp.h>
+#include <asm/maccess.h>
+#include <asm/fpu/api.h>
#define PTR_ADD(x, y) (((char *) (x)) + ((unsigned long) (y)))
#define PTR_SUB(x, y) (((char *) (x)) - ((unsigned long) (y)))
@@ -44,7 +47,7 @@ struct save_area {
u64 fprs[16];
u32 fpc;
u32 prefix;
- u64 todpreg;
+ u32 todpreg;
u64 timer;
u64 todcmp;
u64 vxrs_low[16];
@@ -60,9 +63,9 @@ struct save_area * __init save_area_alloc(bool is_boot_cpu)
{
struct save_area *sa;
- sa = (void *) memblock_phys_alloc(sizeof(*sa), 8);
+ sa = memblock_alloc(sizeof(*sa), 8);
if (!sa)
- panic("Failed to allocate save area\n");
+ return NULL;
if (is_boot_cpu)
list_add(&sa->list, &dump_save_areas);
@@ -108,126 +111,65 @@ void __init save_area_add_vxrs(struct save_area *sa, __vector128 *vxrs)
/* Copy lower halves of vector registers 0-15 */
for (i = 0; i < 16; i++)
- memcpy(&sa->vxrs_low[i], &vxrs[i].u[2], 8);
+ sa->vxrs_low[i] = vxrs[i].low;
/* Copy vector registers 16-31 */
memcpy(sa->vxrs_high, vxrs + 16, 16 * sizeof(__vector128));
}
-/*
- * Return physical address for virtual address
- */
-static inline void *load_real_addr(void *addr)
-{
- unsigned long real_addr;
-
- asm volatile(
- " lra %0,0(%1)\n"
- " jz 0f\n"
- " la %0,0\n"
- "0:"
- : "=a" (real_addr) : "a" (addr) : "cc");
- return (void *)real_addr;
-}
-
-/*
- * Copy memory of the old, dumped system to a kernel space virtual address
- */
-int copy_oldmem_kernel(void *dst, void *src, size_t count)
+static size_t copy_oldmem_iter(struct iov_iter *iter, unsigned long src, size_t count)
{
- unsigned long from, len;
- void *ra;
- int rc;
+ size_t len, copied, res = 0;
while (count) {
- from = __pa(src);
- if (!OLDMEM_BASE && from < sclp.hsa_size) {
- /* Copy from zfcpdump HSA area */
- len = min(count, sclp.hsa_size - from);
- rc = memcpy_hsa_kernel(dst, from, len);
- if (rc)
- return rc;
+ if (!oldmem_data.start && src < sclp.hsa_size) {
+ /* Copy from zfcp/nvme dump HSA area */
+ len = min(count, sclp.hsa_size - src);
+ copied = memcpy_hsa_iter(iter, src, len);
} else {
/* Check for swapped kdump oldmem areas */
- if (OLDMEM_BASE && from - OLDMEM_BASE < OLDMEM_SIZE) {
- from -= OLDMEM_BASE;
- len = min(count, OLDMEM_SIZE - from);
- } else if (OLDMEM_BASE && from < OLDMEM_SIZE) {
- len = min(count, OLDMEM_SIZE - from);
- from += OLDMEM_BASE;
+ if (oldmem_data.start && src - oldmem_data.start < oldmem_data.size) {
+ src -= oldmem_data.start;
+ len = min(count, oldmem_data.size - src);
+ } else if (oldmem_data.start && src < oldmem_data.size) {
+ len = min(count, oldmem_data.size - src);
+ src += oldmem_data.start;
} else {
len = count;
}
- if (is_vmalloc_or_module_addr(dst)) {
- ra = load_real_addr(dst);
- len = min(PAGE_SIZE - offset_in_page(ra), len);
- } else {
- ra = dst;
- }
- if (memcpy_real(ra, (void *) from, len))
- return -EFAULT;
+ copied = memcpy_real_iter(iter, src, len);
}
- dst += len;
- src += len;
- count -= len;
+ count -= copied;
+ src += copied;
+ res += copied;
+ if (copied < len)
+ break;
}
- return 0;
+ return res;
}
-/*
- * Copy memory of the old, dumped system to a user space virtual address
- */
-static int copy_oldmem_user(void __user *dst, void *src, size_t count)
+int copy_oldmem_kernel(void *dst, unsigned long src, size_t count)
{
- unsigned long from, len;
- int rc;
+ struct iov_iter iter;
+ struct kvec kvec;
- while (count) {
- from = __pa(src);
- if (!OLDMEM_BASE && from < sclp.hsa_size) {
- /* Copy from zfcpdump HSA area */
- len = min(count, sclp.hsa_size - from);
- rc = memcpy_hsa_user(dst, from, len);
- if (rc)
- return rc;
- } else {
- /* Check for swapped kdump oldmem areas */
- if (OLDMEM_BASE && from - OLDMEM_BASE < OLDMEM_SIZE) {
- from -= OLDMEM_BASE;
- len = min(count, OLDMEM_SIZE - from);
- } else if (OLDMEM_BASE && from < OLDMEM_SIZE) {
- len = min(count, OLDMEM_SIZE - from);
- from += OLDMEM_BASE;
- } else {
- len = count;
- }
- rc = copy_to_user_real(dst, (void *) from, count);
- if (rc)
- return rc;
- }
- dst += len;
- src += len;
- count -= len;
- }
+ kvec.iov_base = dst;
+ kvec.iov_len = count;
+ iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count);
+ if (copy_oldmem_iter(&iter, src, count) < count)
+ return -EFAULT;
return 0;
}
/*
* Copy one page from "oldmem"
*/
-ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
- unsigned long offset, int userbuf)
+ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, size_t csize,
+ unsigned long offset)
{
- void *src;
- int rc;
+ unsigned long src;
- if (!csize)
- return 0;
- src = (void *) (pfn << PAGE_SHIFT) + offset;
- if (userbuf)
- rc = copy_oldmem_user((void __force __user *) buf, src, csize);
- else
- rc = copy_oldmem_kernel((void *) buf, src, csize);
- return rc;
+ src = pfn_to_phys(pfn) + offset;
+ return copy_oldmem_iter(iter, src, csize);
}
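
The former copy_oldmem_user() disappears because an iov_iter over a user buffer can feed the same copy_oldmem_iter(). A sketch of that direction, mirroring copy_oldmem_kernel() above (illustrative only; the real read path builds its iterator in the vmcore code):

static int copy_oldmem_user_sketch(void __user *dst, unsigned long src,
				   size_t count)
{
	struct iovec iov = {
		.iov_base = (void __force *)dst,
		.iov_len = count,
	};
	struct iov_iter iter;

	iov_iter_init(&iter, ITER_DEST, &iov, 1, count);
	if (copy_oldmem_iter(&iter, src, count) < count)
		return -EFAULT;
	return 0;
}
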
/*
@@ -243,10 +185,10 @@ static int remap_oldmem_pfn_range_kdump(struct vm_area_struct *vma,
unsigned long size_old;
int rc;
- if (pfn < OLDMEM_SIZE >> PAGE_SHIFT) {
- size_old = min(size, OLDMEM_SIZE - (pfn << PAGE_SHIFT));
+ if (pfn < oldmem_data.size >> PAGE_SHIFT) {
+ size_old = min(size, oldmem_data.size - (pfn << PAGE_SHIFT));
rc = remap_pfn_range(vma, from,
- pfn + (OLDMEM_BASE >> PAGE_SHIFT),
+ pfn + (oldmem_data.start >> PAGE_SHIFT),
size_old, prot);
if (rc || size == size_old)
return rc;
@@ -258,7 +200,7 @@ static int remap_oldmem_pfn_range_kdump(struct vm_area_struct *vma,
}
/*
- * Remap "oldmem" for zfcpdump
+ * Remap "oldmem" for zfcp/nvme dump
*
* We only map available memory above HSA size. Memory below HSA size
* is read on demand using the copy_oldmem_page() function.
@@ -283,12 +225,12 @@ static int remap_oldmem_pfn_range_zfcpdump(struct vm_area_struct *vma,
}
/*
- * Remap "oldmem" for kdump or zfcpdump
+ * Remap "oldmem" for kdump or zfcp/nvme dump
*/
int remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from,
unsigned long pfn, unsigned long size, pgprot_t prot)
{
- if (OLDMEM_BASE)
+ if (oldmem_data.start)
return remap_oldmem_pfn_range_kdump(vma, from, pfn, size, prot);
else
return remap_oldmem_pfn_range_zfcpdump(vma, from, pfn, size,
@@ -365,7 +307,7 @@ static void *fill_cpu_elf_notes(void *ptr, int cpu, struct save_area *sa)
memcpy(&nt_prstatus.pr_reg.gprs, sa->gprs, sizeof(sa->gprs));
memcpy(&nt_prstatus.pr_reg.psw, sa->psw, sizeof(sa->psw));
memcpy(&nt_prstatus.pr_reg.acrs, sa->acrs, sizeof(sa->acrs));
- nt_prstatus.pr_pid = cpu;
+ nt_prstatus.common.pr_pid = cpu;
/* Prepare fpregset (floating point) note */
memset(&nt_fpregset, 0, sizeof(nt_fpregset));
memcpy(&nt_fpregset.fpc, &sa->fpc, sizeof(sa->fpc));
@@ -378,7 +320,7 @@ static void *fill_cpu_elf_notes(void *ptr, int cpu, struct save_area *sa)
ptr = nt_init(ptr, NT_S390_TODPREG, &sa->todpreg, sizeof(sa->todpreg));
ptr = nt_init(ptr, NT_S390_CTRS, &sa->ctrs, sizeof(sa->ctrs));
ptr = nt_init(ptr, NT_S390_PREFIX, &sa->prefix, sizeof(sa->prefix));
- if (MACHINE_HAS_VX) {
+ if (cpu_has_vx()) {
ptr = nt_init(ptr, NT_S390_VXRS_HIGH,
&sa->vxrs_high, sizeof(sa->vxrs_high));
ptr = nt_init(ptr, NT_S390_VXRS_LOW,
@@ -402,7 +344,7 @@ static size_t get_cpu_elf_notes_size(void)
size += nt_size(NT_S390_TODPREG, sizeof(sa->todpreg));
size += nt_size(NT_S390_CTRS, sizeof(sa->ctrs));
size += nt_size(NT_S390_PREFIX, sizeof(sa->prefix));
- if (MACHINE_HAS_VX) {
+ if (cpu_has_vx()) {
size += nt_size(NT_S390_VXRS_HIGH, sizeof(sa->vxrs_high));
size += nt_size(NT_S390_VXRS_LOW, sizeof(sa->vxrs_low));
}
@@ -429,10 +371,10 @@ static void *nt_prpsinfo(void *ptr)
static void *get_vmcoreinfo_old(unsigned long *size)
{
char nt_name[11], *vmcoreinfo;
+ unsigned long addr;
Elf64_Nhdr note;
- void *addr;
- if (copy_oldmem_kernel(&addr, &S390_lowcore.vmcore_info, sizeof(addr)))
+ if (copy_oldmem_kernel(&addr, __LC_VMCORE_INFO, sizeof(addr)))
return NULL;
memset(nt_name, 0, sizeof(nt_name));
if (copy_oldmem_kernel(&note, addr, sizeof(note)))
@@ -549,8 +491,7 @@ static int get_mem_chunk_cnt(void)
int cnt = 0;
u64 idx;
- for_each_mem_range(idx, &memblock.physmem, &oldmem_type, NUMA_NO_NODE,
- MEMBLOCK_NONE, NULL, NULL, NULL)
+ for_each_physmem_range(idx, &oldmem_type, NULL, NULL)
cnt++;
return cnt;
}
@@ -558,17 +499,16 @@ static int get_mem_chunk_cnt(void)
/*
* Initialize ELF loads (new kernel)
*/
-static void loads_init(Elf64_Phdr *phdr, u64 loads_offset)
+static void loads_init(Elf64_Phdr *phdr)
{
phys_addr_t start, end;
u64 idx;
- for_each_mem_range(idx, &memblock.physmem, &oldmem_type, NUMA_NO_NODE,
- MEMBLOCK_NONE, &start, &end, NULL) {
+ for_each_physmem_range(idx, &oldmem_type, &start, &end) {
phdr->p_filesz = end - start;
phdr->p_type = PT_LOAD;
phdr->p_offset = start;
- phdr->p_vaddr = start;
+ phdr->p_vaddr = (unsigned long)__va(start);
phdr->p_paddr = start;
phdr->p_memsz = end - start;
phdr->p_flags = PF_R | PF_W | PF_X;
@@ -629,23 +569,23 @@ static size_t get_elfcorehdr_size(int mem_chunk_cnt)
int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size)
{
Elf64_Phdr *phdr_notes, *phdr_loads;
+ size_t alloc_size;
int mem_chunk_cnt;
void *ptr, *hdr;
- u32 alloc_size;
u64 hdr_off;
- /* If we are not in kdump or zfcpdump mode return */
- if (!OLDMEM_BASE && ipl_info.type != IPL_TYPE_FCP_DUMP)
+ /* If we are not in kdump or zfcp/nvme dump mode return */
+ if (!oldmem_data.start && !is_ipl_type_dump())
return 0;
- /* If we cannot get HSA size for zfcpdump return error */
- if (ipl_info.type == IPL_TYPE_FCP_DUMP && !sclp.hsa_size)
+ /* If we cannot get HSA size for zfcp/nvme dump return error */
+ if (is_ipl_type_dump() && !sclp.hsa_size)
return -ENODEV;
/* For kdump, exclude previous crashkernel memory */
- if (OLDMEM_BASE) {
- oldmem_region.base = OLDMEM_BASE;
- oldmem_region.size = OLDMEM_SIZE;
- oldmem_type.total_size = OLDMEM_SIZE;
+ if (oldmem_data.start) {
+ oldmem_region.base = oldmem_data.start;
+ oldmem_region.size = oldmem_data.size;
+ oldmem_type.total_size = oldmem_data.size;
}
mem_chunk_cnt = get_mem_chunk_cnt();
@@ -673,7 +613,7 @@ int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size)
ptr = notes_init(phdr_notes, ptr, ((unsigned long) hdr) + hdr_off);
/* Init loads */
hdr_off = PTR_DIFF(ptr, hdr);
- loads_init(phdr_loads, hdr_off);
+ loads_init(phdr_loads);
*addr = (unsigned long long) hdr;
*size = (unsigned long long) hdr_off;
BUG_ON(elfcorehdr_size > alloc_size);
diff --git a/arch/s390/kernel/ctlreg.c b/arch/s390/kernel/ctlreg.c
new file mode 100644
index 000000000000..8cc26cf2c64a
--- /dev/null
+++ b/arch/s390/kernel/ctlreg.c
@@ -0,0 +1,121 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corp. 1999, 2023
+ */
+
+#include <linux/irqflags.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/cache.h>
+#include <asm/abs_lowcore.h>
+#include <asm/ctlreg.h>
+
+/*
+ * system_ctl_lock guards access to global control register contents which
+ * are kept in the control register save area within absolute lowcore
+ * at physical address zero.
+ */
+static DEFINE_SPINLOCK(system_ctl_lock);
+
+void system_ctlreg_lock(void)
+ __acquires(&system_ctl_lock)
+{
+ spin_lock(&system_ctl_lock);
+}
+
+void system_ctlreg_unlock(void)
+ __releases(&system_ctl_lock)
+{
+ spin_unlock(&system_ctl_lock);
+}
+
+static bool system_ctlreg_area_init __ro_after_init;
+
+void __init system_ctlreg_init_save_area(struct lowcore *lc)
+{
+ struct lowcore *abs_lc;
+
+ abs_lc = get_abs_lowcore();
+ __local_ctl_store(0, 15, lc->cregs_save_area);
+ __local_ctl_store(0, 15, abs_lc->cregs_save_area);
+ put_abs_lowcore(abs_lc);
+ system_ctlreg_area_init = true;
+}
+
+struct ctlreg_parms {
+ unsigned long andval;
+ unsigned long orval;
+ unsigned long val;
+ int request;
+ int cr;
+};
+
+static void ctlreg_callback(void *info)
+{
+ struct ctlreg_parms *pp = info;
+ struct ctlreg regs[16];
+
+ __local_ctl_store(0, 15, regs);
+ if (pp->request == CTLREG_LOAD) {
+ regs[pp->cr].val = pp->val;
+ } else {
+ regs[pp->cr].val &= pp->andval;
+ regs[pp->cr].val |= pp->orval;
+ }
+ __local_ctl_load(0, 15, regs);
+}
+
+static void system_ctlreg_update(void *info)
+{
+ unsigned long flags;
+
+ if (system_state == SYSTEM_BOOTING) {
+ /*
+ * For very early calls do not call on_each_cpu()
+		 * since not everything might be set up yet.
+ */
+ local_irq_save(flags);
+ ctlreg_callback(info);
+ local_irq_restore(flags);
+ } else {
+ on_each_cpu(ctlreg_callback, info, 1);
+ }
+}
+
+void system_ctlreg_modify(unsigned int cr, unsigned long data, int request)
+{
+ struct ctlreg_parms pp = { .cr = cr, .request = request, };
+ struct lowcore *abs_lc;
+
+ switch (request) {
+ case CTLREG_SET_BIT:
+ pp.orval = 1UL << data;
+ pp.andval = -1UL;
+ break;
+ case CTLREG_CLEAR_BIT:
+ pp.orval = 0;
+ pp.andval = ~(1UL << data);
+ break;
+ case CTLREG_LOAD:
+ pp.val = data;
+ break;
+ }
+ if (system_ctlreg_area_init) {
+ system_ctlreg_lock();
+ abs_lc = get_abs_lowcore();
+ if (request == CTLREG_LOAD) {
+ abs_lc->cregs_save_area[cr].val = pp.val;
+ } else {
+ abs_lc->cregs_save_area[cr].val &= pp.andval;
+ abs_lc->cregs_save_area[cr].val |= pp.orval;
+ }
+ put_abs_lowcore(abs_lc);
+ system_ctlreg_update(&pp);
+ system_ctlreg_unlock();
+ } else {
+ system_ctlreg_update(&pp);
+ }
+}
+EXPORT_SYMBOL(system_ctlreg_modify);
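
Callers usually go through thin wrappers rather than invoking system_ctlreg_modify() directly; the shape of such wrappers, using only the request codes visible in this hunk (the names here are illustrative, the real ones live in asm/ctlreg.h):

/* Set or clear one control register bit on all CPUs and in the save area.
 * The bit number counts from the least significant bit, as in ctlreg_callback(). */
static inline void demo_system_ctl_set_bit(unsigned int cr, unsigned int bit)
{
	system_ctlreg_modify(cr, bit, CTLREG_SET_BIT);
}

static inline void demo_system_ctl_clear_bit(unsigned int cr, unsigned int bit)
{
	system_ctlreg_modify(cr, bit, CTLREG_CLEAR_BIT);
}
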
diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c
index 6d321f5f101d..85328a0ef3b6 100644
--- a/arch/s390/kernel/debug.c
+++ b/arch/s390/kernel/debug.c
@@ -2,7 +2,7 @@
/*
* S/390 debug facility
*
- * Copyright IBM Corp. 1999, 2012
+ * Copyright IBM Corp. 1999, 2020
*
* Author(s): Michael Holzheu (holzheu@de.ibm.com),
* Holger Smolinski (Holger.Smolinski@de.ibm.com)
@@ -24,6 +24,7 @@
#include <linux/export.h>
#include <linux/init.h>
#include <linux/fs.h>
+#include <linux/minmax.h>
#include <linux/debugfs.h>
#include <asm/debug.h>
@@ -59,7 +60,7 @@ typedef struct {
* except of floats, and long long (32 bit)
*
*/
- long args[0];
+ long args[];
} debug_sprintf_entry_t;
/* internal function prototypes */
@@ -90,27 +91,13 @@ static int debug_input_flush_fn(debug_info_t *id, struct debug_view *view,
size_t user_buf_size, loff_t *offset);
static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view,
char *out_buf, const char *in_buf);
-static int debug_raw_format_fn(debug_info_t *id,
- struct debug_view *view, char *out_buf,
- const char *in_buf);
-static int debug_raw_header_fn(debug_info_t *id, struct debug_view *view,
- int area, debug_entry_t *entry, char *out_buf);
-
static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view,
- char *out_buf, debug_sprintf_entry_t *curr_event);
+ char *out_buf, const char *inbuf);
+static void debug_areas_swap(debug_info_t *a, debug_info_t *b);
+static void debug_events_append(debug_info_t *dest, debug_info_t *src);
/* globals */
-struct debug_view debug_raw_view = {
- "raw",
- NULL,
- &debug_raw_header_fn,
- &debug_raw_format_fn,
- NULL,
- NULL
-};
-EXPORT_SYMBOL(debug_raw_view);
-
struct debug_view debug_hex_ascii_view = {
"hex_ascii",
NULL,
@@ -152,7 +139,7 @@ struct debug_view debug_sprintf_view = {
"sprintf",
NULL,
&debug_dflt_header_fn,
- (debug_format_proc_t *)&debug_sprintf_format_fn,
+ &debug_sprintf_format_fn,
NULL,
NULL
};
@@ -198,9 +185,10 @@ static debug_entry_t ***debug_areas_alloc(int pages_per_area, int nr_areas)
if (!areas)
goto fail_malloc_areas;
for (i = 0; i < nr_areas; i++) {
+		/* __GFP_NOWARN to avoid a user-triggerable WARN; failures are handled here */
areas[i] = kmalloc_array(pages_per_area,
sizeof(debug_entry_t *),
- GFP_KERNEL);
+ GFP_KERNEL | __GFP_NOWARN);
if (!areas[i])
goto fail_malloc_areas2;
for (j = 0; j < pages_per_area; j++) {
@@ -262,7 +250,7 @@ static debug_info_t *debug_info_alloc(const char *name, int pages_per_area,
rc->level = level;
rc->buf_size = buf_size;
rc->entry_size = sizeof(debug_entry_t) + buf_size;
- strlcpy(rc->name, name, sizeof(rc->name));
+ strscpy(rc->name, name, sizeof(rc->name));
memset(rc->views, 0, DEBUG_MAX_VIEWS * sizeof(struct debug_view *));
memset(rc->debugfs_entries, 0, DEBUG_MAX_VIEWS * sizeof(struct dentry *));
refcount_set(&(rc->ref_count), 0);
@@ -326,24 +314,6 @@ static debug_info_t *debug_info_create(const char *name, int pages_per_area,
goto out;
rc->mode = mode & ~S_IFMT;
-
- /* create root directory */
- rc->debugfs_root_entry = debugfs_create_dir(rc->name,
- debug_debugfs_root_entry);
-
- /* append new element to linked list */
- if (!debug_area_first) {
- /* first element in list */
- debug_area_first = rc;
- rc->prev = NULL;
- } else {
- /* append element to end of list */
- debug_area_last->next = rc;
- rc->prev = debug_area_last;
- }
- debug_area_last = rc;
- rc->next = NULL;
-
refcount_set(&rc->ref_count, 1);
out:
return rc;
@@ -403,27 +373,10 @@ static void debug_info_get(debug_info_t *db_info)
*/
static void debug_info_put(debug_info_t *db_info)
{
- int i;
-
if (!db_info)
return;
- if (refcount_dec_and_test(&db_info->ref_count)) {
- for (i = 0; i < DEBUG_MAX_VIEWS; i++) {
- if (!db_info->views[i])
- continue;
- debugfs_remove(db_info->debugfs_entries[i]);
- }
- debugfs_remove(db_info->debugfs_root_entry);
- if (db_info == debug_area_first)
- debug_area_first = db_info->next;
- if (db_info == debug_area_last)
- debug_area_last = db_info->prev;
- if (db_info->prev)
- db_info->prev->next = db_info->next;
- if (db_info->next)
- db_info->next->prev = db_info->prev;
+ if (refcount_dec_and_test(&db_info->ref_count))
debug_info_free(db_info);
- }
}
/*
@@ -448,7 +401,7 @@ static int debug_format_entry(file_private_info_t *p_info)
act_entry = (debug_entry_t *) ((char *)id_snap->areas[p_info->act_area]
[p_info->act_page] + p_info->act_entry);
- if (act_entry->id.stck == 0LL)
+ if (act_entry->clock == 0LL)
goto out; /* empty entry */
if (view->header_proc)
len += view->header_proc(id_snap, view, p_info->act_area,
@@ -647,6 +600,31 @@ static int debug_close(struct inode *inode, struct file *file)
return 0; /* success */
}
+/* Create debugfs entries and add to internal list. */
+static void _debug_register(debug_info_t *id)
+{
+ /* create root directory */
+ id->debugfs_root_entry = debugfs_create_dir(id->name,
+ debug_debugfs_root_entry);
+
+ /* append new element to linked list */
+ if (!debug_area_first) {
+ /* first element in list */
+ debug_area_first = id;
+ id->prev = NULL;
+ } else {
+ /* append element to end of list */
+ debug_area_last->next = id;
+ id->prev = debug_area_last;
+ }
+ debug_area_last = id;
+ id->next = NULL;
+
+ debug_register_view(id, &debug_level_view);
+ debug_register_view(id, &debug_flush_view);
+ debug_register_view(id, &debug_pages_view);
+}
+
/**
* debug_register_mode() - creates and initializes debug area.
*
@@ -676,19 +654,16 @@ debug_info_t *debug_register_mode(const char *name, int pages_per_area,
if ((uid != 0) || (gid != 0))
pr_warn("Root becomes the owner of all s390dbf files in sysfs\n");
BUG_ON(!initialized);
- mutex_lock(&debug_mutex);
/* create new debug_info */
rc = debug_info_create(name, pages_per_area, nr_areas, buf_size, mode);
- if (!rc)
- goto out;
- debug_register_view(rc, &debug_level_view);
- debug_register_view(rc, &debug_flush_view);
- debug_register_view(rc, &debug_pages_view);
-out:
- if (!rc)
+ if (rc) {
+ mutex_lock(&debug_mutex);
+ _debug_register(rc);
+ mutex_unlock(&debug_mutex);
+ } else {
pr_err("Registering debug feature %s failed\n", name);
- mutex_unlock(&debug_mutex);
+ }
return rc;
}
EXPORT_SYMBOL(debug_register_mode);
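
For context, this registration path is what drivers hit through debug_register(); the cert_store code earlier in this section is a typical consumer. A small hedged sketch of that usage (names are illustrative):

static debug_info_t *demo_dbf;

static int __init demo_dbf_init(void)
{
	/* 4 pages per area, 1 area, up to 64 bytes per sprintf entry. */
	demo_dbf = debug_register("demo", 4, 1, 64);
	if (!demo_dbf)
		return -ENOMEM;
	debug_register_view(demo_dbf, &debug_sprintf_view);
	debug_sprintf_event(demo_dbf, 3, "initialized at level %d\n", 3);
	return 0;
}
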
@@ -718,6 +693,82 @@ debug_info_t *debug_register(const char *name, int pages_per_area,
EXPORT_SYMBOL(debug_register);
/**
+ * debug_register_static() - registers a static debug area
+ *
+ * @id: Handle for static debug area
+ * @pages_per_area: Number of pages per area
+ * @nr_areas: Number of debug areas
+ *
+ * Register debug_info_t defined using DEFINE_STATIC_DEBUG_INFO.
+ *
+ * Note: This function is called automatically via an initcall generated by
+ * DEFINE_STATIC_DEBUG_INFO.
+ */
+void debug_register_static(debug_info_t *id, int pages_per_area, int nr_areas)
+{
+ unsigned long flags;
+ debug_info_t *copy;
+
+ if (!initialized) {
+ pr_err("Tried to register debug feature %s too early\n",
+ id->name);
+ return;
+ }
+
+ copy = debug_info_alloc("", pages_per_area, nr_areas, id->buf_size,
+ id->level, ALL_AREAS);
+ if (!copy) {
+ pr_err("Registering debug feature %s failed\n", id->name);
+
+ /* Clear pointers to prevent tracing into released initdata. */
+ spin_lock_irqsave(&id->lock, flags);
+ id->areas = NULL;
+ id->active_pages = NULL;
+ id->active_entries = NULL;
+ spin_unlock_irqrestore(&id->lock, flags);
+
+ return;
+ }
+
+ /* Replace static trace area with dynamic copy. */
+ spin_lock_irqsave(&id->lock, flags);
+ debug_events_append(copy, id);
+ debug_areas_swap(id, copy);
+ spin_unlock_irqrestore(&id->lock, flags);
+
+ /* Clear pointers to initdata and discard copy. */
+ copy->areas = NULL;
+ copy->active_pages = NULL;
+ copy->active_entries = NULL;
+ debug_info_free(copy);
+
+ mutex_lock(&debug_mutex);
+ _debug_register(id);
+ mutex_unlock(&debug_mutex);
+}
+
+/* Remove debugfs entries and remove from internal list. */
+static void _debug_unregister(debug_info_t *id)
+{
+ int i;
+
+ for (i = 0; i < DEBUG_MAX_VIEWS; i++) {
+ if (!id->views[i])
+ continue;
+ debugfs_remove(id->debugfs_entries[i]);
+ }
+ debugfs_remove(id->debugfs_root_entry);
+ if (id == debug_area_first)
+ debug_area_first = id->next;
+ if (id == debug_area_last)
+ debug_area_last = id->prev;
+ if (id->prev)
+ id->prev->next = id->next;
+ if (id->next)
+ id->next->prev = id->prev;
+}
+
+/**
* debug_unregister() - give back debug area.
*
* @id: handle for debug log
@@ -730,8 +781,10 @@ void debug_unregister(debug_info_t *id)
if (!id)
return;
mutex_lock(&debug_mutex);
- debug_info_put(id);
+ _debug_unregister(id);
mutex_unlock(&debug_mutex);
+
+ debug_info_put(id);
}
EXPORT_SYMBOL(debug_unregister);
@@ -741,35 +794,28 @@ EXPORT_SYMBOL(debug_unregister);
*/
static int debug_set_size(debug_info_t *id, int nr_areas, int pages_per_area)
{
- debug_entry_t ***new_areas;
+ debug_info_t *new_id;
unsigned long flags;
- int rc = 0;
if (!id || (nr_areas <= 0) || (pages_per_area < 0))
return -EINVAL;
- if (pages_per_area > 0) {
- new_areas = debug_areas_alloc(pages_per_area, nr_areas);
- if (!new_areas) {
- pr_info("Allocating memory for %i pages failed\n",
- pages_per_area);
- rc = -ENOMEM;
- goto out;
- }
- } else {
- new_areas = NULL;
+
+ new_id = debug_info_alloc("", pages_per_area, nr_areas, id->buf_size,
+ id->level, ALL_AREAS);
+ if (!new_id) {
+ pr_info("Allocating memory for %i pages failed\n",
+ pages_per_area);
+ return -ENOMEM;
}
+
spin_lock_irqsave(&id->lock, flags);
- debug_areas_free(id);
- id->areas = new_areas;
- id->nr_areas = nr_areas;
- id->pages_per_area = pages_per_area;
- id->active_area = 0;
- memset(id->active_entries, 0, sizeof(int)*id->nr_areas);
- memset(id->active_pages, 0, sizeof(int)*id->nr_areas);
+ debug_events_append(new_id, id);
+ debug_areas_swap(new_id, id);
+ debug_info_free(new_id);
spin_unlock_irqrestore(&id->lock, flags);
pr_info("%s: set new size (%i pages)\n", id->name, pages_per_area);
-out:
- return rc;
+
+ return 0;
}
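
In practice this path is taken when the page count of an existing debug area is changed through the 'pages' debugfs view; with the rework above, events recorded so far are carried over by debug_events_append() into the freshly allocated areas before the swap, rather than being discarded together with the old areas as before.
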
/**
@@ -787,16 +833,17 @@ void debug_set_level(debug_info_t *id, int new_level)
if (!id)
return;
- spin_lock_irqsave(&id->lock, flags);
+
if (new_level == DEBUG_OFF_LEVEL) {
- id->level = DEBUG_OFF_LEVEL;
pr_info("%s: switched off\n", id->name);
} else if ((new_level > DEBUG_MAX_LEVEL) || (new_level < 0)) {
pr_info("%s: level %i is out of range (%i - %i)\n",
id->name, new_level, 0, DEBUG_MAX_LEVEL);
- } else {
- id->level = new_level;
+ return;
}
+
+ spin_lock_irqsave(&id->lock, flags);
+ id->level = new_level;
spin_unlock_irqrestore(&id->lock, flags);
}
EXPORT_SYMBOL(debug_set_level);
@@ -836,6 +883,42 @@ static inline debug_entry_t *get_active_entry(debug_info_t *id)
id->active_entries[id->active_area]);
}
+/* Swap debug areas of a and b. */
+static void debug_areas_swap(debug_info_t *a, debug_info_t *b)
+{
+ swap(a->nr_areas, b->nr_areas);
+ swap(a->pages_per_area, b->pages_per_area);
+ swap(a->areas, b->areas);
+ swap(a->active_area, b->active_area);
+ swap(a->active_pages, b->active_pages);
+ swap(a->active_entries, b->active_entries);
+}
+
+/* Append all debug events in active area from source to destination log. */
+static void debug_events_append(debug_info_t *dest, debug_info_t *src)
+{
+ debug_entry_t *from, *to, *last;
+
+ if (!src->areas || !dest->areas)
+ return;
+
+ /* Loop over all entries in src, starting with oldest. */
+ from = get_active_entry(src);
+ last = from;
+ do {
+ if (from->clock != 0LL) {
+ to = get_active_entry(dest);
+ memset(to, 0, dest->entry_size);
+ memcpy(to, from, min(src->entry_size,
+ dest->entry_size));
+ proceed_active_entry(dest);
+ }
+
+ proceed_active_entry(src);
+ from = get_active_entry(src);
+ } while (from != last);
+}
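
Because the active entry of a wrapped ring buffer is also its oldest one, starting the walk at get_active_entry(src) and skipping slots whose clock is still zero visits the recorded events oldest-first; each copied entry is truncated to the smaller of the two entry sizes.
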
+
/*
* debug_finish_entry:
* - set timestamp, caller address, cpu number etc.
@@ -844,12 +927,17 @@ static inline debug_entry_t *get_active_entry(debug_info_t *id)
static inline void debug_finish_entry(debug_info_t *id, debug_entry_t *active,
int level, int exception)
{
- active->id.stck = get_tod_clock_fast() -
- *(unsigned long long *) &tod_clock_base[1];
- active->id.fields.cpuid = smp_processor_id();
+ unsigned long timestamp;
+ union tod_clock clk;
+
+ store_tod_clock_ext(&clk);
+ timestamp = clk.us;
+ timestamp -= TOD_UNIX_EPOCH >> 12;
+ active->clock = timestamp;
+ active->cpu = smp_processor_id();
active->caller = __builtin_return_address(0);
- active->id.fields.exception = exception;
- active->id.fields.level = level;
+ active->exception = exception;
+ active->level = level;
proceed_active_entry(id);
if (exception)
proceed_active_area(id);
@@ -867,7 +955,7 @@ static int debug_active = 1;
* if debug_active is already off
*/
static int s390dbf_procactive(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
if (!write || debug_stoppable || !debug_active)
return proc_dointvec(table, write, buffer, lenp, ppos);
@@ -890,17 +978,6 @@ static struct ctl_table s390dbf_table[] = {
.mode = S_IRUGO | S_IWUSR,
.proc_handler = s390dbf_procactive,
},
- { }
-};
-
-static struct ctl_table s390dbf_dir_table[] = {
- {
- .procname = "s390dbf",
- .maxlen = 0,
- .mode = S_IRUGO | S_IXUGO,
- .child = s390dbf_table,
- },
- { }
};
static struct ctl_table_header *s390dbf_sysctl_header;
@@ -1121,16 +1198,17 @@ int debug_register_view(debug_info_t *id, struct debug_view *view)
break;
}
if (i == DEBUG_MAX_VIEWS) {
- pr_err("Registering view %s/%s would exceed the maximum "
- "number of views %i\n", id->name, view->name, i);
rc = -1;
} else {
id->views[i] = view;
id->debugfs_entries[i] = pde;
}
spin_unlock_irqrestore(&id->lock, flags);
- if (rc)
+ if (rc) {
+ pr_err("Registering view %s/%s would exceed the maximum "
+ "number of views %i\n", id->name, view->name, i);
debugfs_remove(pde);
+ }
out:
return rc;
}
@@ -1385,32 +1463,6 @@ out:
}
/*
- * prints debug header in raw format
- */
-static int debug_raw_header_fn(debug_info_t *id, struct debug_view *view,
- int area, debug_entry_t *entry, char *out_buf)
-{
- int rc;
-
- rc = sizeof(debug_entry_t);
- memcpy(out_buf, entry, sizeof(debug_entry_t));
- return rc;
-}
-
-/*
- * prints debug data in raw format
- */
-static int debug_raw_format_fn(debug_info_t *id, struct debug_view *view,
- char *out_buf, const char *in_buf)
-{
- int rc;
-
- rc = id->buf_size;
- memcpy(out_buf, in_buf, id->buf_size);
- return rc;
-}
-
-/*
* prints debug data in hex/ascii format
*/
static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view,
@@ -1439,25 +1491,24 @@ static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view,
int debug_dflt_header_fn(debug_info_t *id, struct debug_view *view,
int area, debug_entry_t *entry, char *out_buf)
{
- unsigned long base, sec, usec;
+ unsigned long sec, usec;
unsigned long caller;
unsigned int level;
char *except_str;
int rc = 0;
- level = entry->id.fields.level;
- base = (*(unsigned long *) &tod_clock_base[0]) >> 4;
- sec = (entry->id.stck >> 12) + base - (TOD_UNIX_EPOCH >> 12);
+ level = entry->level;
+ sec = entry->clock;
usec = do_div(sec, USEC_PER_SEC);
- if (entry->id.fields.exception)
+ if (entry->exception)
except_str = "*";
else
except_str = "-";
caller = (unsigned long) entry->caller;
- rc += sprintf(out_buf, "%02i %011ld:%06lu %1u %1s %02i %pK ",
+ rc += sprintf(out_buf, "%02i %011ld:%06lu %1u %1s %04u %px ",
area, sec, usec, level, except_str,
- entry->id.fields.cpuid, (void *)caller);
+ entry->cpu, (void *)caller);
return rc;
}
EXPORT_SYMBOL(debug_dflt_header_fn);
@@ -1470,8 +1521,9 @@ EXPORT_SYMBOL(debug_dflt_header_fn);
#define DEBUG_SPRINTF_MAX_ARGS 10
static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view,
- char *out_buf, debug_sprintf_entry_t *curr_event)
+ char *out_buf, const char *inbuf)
{
+ debug_sprintf_entry_t *curr_event = (debug_sprintf_entry_t *)inbuf;
int num_longs, num_used_args = 0, i, rc = 0;
int index[DEBUG_SPRINTF_MAX_ARGS];
@@ -1511,7 +1563,7 @@ out:
*/
static int __init debug_init(void)
{
- s390dbf_sysctl_header = register_sysctl_table(s390dbf_dir_table);
+ s390dbf_sysctl_header = register_sysctl("s390dbf", s390dbf_table);
mutex_lock(&debug_mutex);
debug_debugfs_root_entry = debugfs_create_dir(DEBUG_DIR_ROOT, NULL);
initialized = 1;
diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c
index e9dac9a24d3f..92fdc35f028c 100644
--- a/arch/s390/kernel/diag.c
+++ b/arch/s390/kernel/diag.c
@@ -11,9 +11,12 @@
#include <linux/cpu.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
+#include <linux/vmalloc.h>
+#include <asm/asm-extable.h>
#include <asm/diag.h>
#include <asm/trace/diag.h>
#include <asm/sections.h>
+#include "entry.h"
struct diag_stat {
unsigned int counter[NR_DIAG_STAT];
@@ -33,6 +36,7 @@ static const struct diag_desc diag_map[NR_DIAG_STAT] = {
[DIAG_STAT_X014] = { .code = 0x014, .name = "Spool File Services" },
[DIAG_STAT_X044] = { .code = 0x044, .name = "Voluntary Timeslice End" },
[DIAG_STAT_X064] = { .code = 0x064, .name = "NSS Manipulation" },
+ [DIAG_STAT_X08C] = { .code = 0x08c, .name = "Access 3270 Display Device Information" },
[DIAG_STAT_X09C] = { .code = 0x09c, .name = "Relinquish Timeslice" },
[DIAG_STAT_X0DC] = { .code = 0x0dc, .name = "Appldata Control" },
[DIAG_STAT_X204] = { .code = 0x204, .name = "Logical-CPU Utilization" },
@@ -47,11 +51,24 @@ static const struct diag_desc diag_map[NR_DIAG_STAT] = {
[DIAG_STAT_X304] = { .code = 0x304, .name = "Partition-Resource Service" },
[DIAG_STAT_X308] = { .code = 0x308, .name = "List-Directed IPL" },
[DIAG_STAT_X318] = { .code = 0x318, .name = "CP Name and Version Codes" },
+ [DIAG_STAT_X320] = { .code = 0x320, .name = "Certificate Store" },
[DIAG_STAT_X500] = { .code = 0x500, .name = "Virtio Service" },
};
-struct diag_ops __bootdata_preserved(diag_dma_ops);
-struct diag210 *__bootdata_preserved(__diag210_tmp_dma);
+struct diag_ops __amode31_ref diag_amode31_ops = {
+ .diag210 = _diag210_amode31,
+ .diag26c = _diag26c_amode31,
+ .diag14 = _diag14_amode31,
+ .diag0c = _diag0c_amode31,
+ .diag8c = _diag8c_amode31,
+ .diag308_reset = _diag308_reset_amode31
+};
+
+static struct diag210 _diag210_tmp_amode31 __section(".amode31.data");
+struct diag210 __amode31_ref *__diag210_tmp_amode31 = &_diag210_tmp_amode31;
+
+static struct diag8c _diag8c_tmp_amode31 __section(".amode31.data");
+static struct diag8c __amode31_ref *__diag8c_tmp_amode31 = &_diag8c_tmp_amode31;
static int show_diag_stat(struct seq_file *m, void *v)
{
@@ -59,7 +76,7 @@ static int show_diag_stat(struct seq_file *m, void *v)
unsigned long n = (unsigned long) v - 1;
int cpu, prec, tmp;
- get_online_cpus();
+ cpus_read_lock();
if (n == 0) {
seq_puts(m, " ");
@@ -78,13 +95,13 @@ static int show_diag_stat(struct seq_file *m, void *v)
}
seq_printf(m, " %s\n", diag_map[n-1].name);
}
- put_online_cpus();
+ cpus_read_unlock();
return 0;
}
static void *show_diag_stat_start(struct seq_file *m, loff_t *pos)
{
- return *pos <= nr_cpu_ids ? (void *)((unsigned long) *pos + 1) : NULL;
+ return *pos <= NR_DIAG_STAT ? (void *)((unsigned long) *pos + 1) : NULL;
}
static void *show_diag_stat_next(struct seq_file *m, void *v, loff_t *pos)
@@ -104,18 +121,7 @@ static const struct seq_operations show_diag_stat_sops = {
.show = show_diag_stat,
};
-static int show_diag_stat_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &show_diag_stat_sops);
-}
-
-static const struct file_operations show_diag_stat_fops = {
- .open = show_diag_stat_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
+DEFINE_SEQ_ATTRIBUTE(show_diag_stat);
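
For reference, DEFINE_SEQ_ATTRIBUTE() from <linux/seq_file.h> generates roughly the boilerplate removed above; a simplified sketch of its expansion (the real macro additionally propagates inode->i_private into the seq_file):

/* Roughly what DEFINE_SEQ_ATTRIBUTE(show_diag_stat) expands to. */
static int show_diag_stat_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &show_diag_stat_sops);
}

static const struct file_operations show_diag_stat_fops = {
	.owner   = THIS_MODULE,
	.open    = show_diag_stat_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};
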
static int __init show_diag_stat_init(void)
{
@@ -133,7 +139,7 @@ void diag_stat_inc(enum diag_stat_enum nr)
}
EXPORT_SYMBOL(diag_stat_inc);
-void diag_stat_inc_norecursion(enum diag_stat_enum nr)
+void notrace diag_stat_inc_norecursion(enum diag_stat_enum nr)
{
this_cpu_inc(diag_stat.counter[nr]);
trace_s390_diagnose_norecursion(diag_map[nr].code);
@@ -146,26 +152,46 @@ EXPORT_SYMBOL(diag_stat_inc_norecursion);
int diag14(unsigned long rx, unsigned long ry1, unsigned long subcode)
{
diag_stat_inc(DIAG_STAT_X014);
- return diag_dma_ops.diag14(rx, ry1, subcode);
+ return diag_amode31_ops.diag14(rx, ry1, subcode);
}
EXPORT_SYMBOL(diag14);
static inline int __diag204(unsigned long *subcode, unsigned long size, void *addr)
{
- register unsigned long _subcode asm("0") = *subcode;
- register unsigned long _size asm("1") = size;
+ union register_pair rp = { .even = *subcode, .odd = size };
asm volatile(
- " diag %2,%0,0x204\n"
+ " diag %[addr],%[rp],0x204\n"
"0: nopr %%r7\n"
EX_TABLE(0b,0b)
- : "+d" (_subcode), "+d" (_size) : "d" (addr) : "memory");
- *subcode = _subcode;
- return _size;
+ : [rp] "+&d" (rp.pair) : [addr] "d" (addr) : "memory");
+ *subcode = rp.even;
+ return rp.odd;
}
+/**
+ * diag204() - Issue diagnose 204 call.
+ * @subcode: Subcode of diagnose 204 to be executed.
+ * @size: Size of area in pages which @addr points to, if given.
+ * @addr: Vmalloc'ed memory area where the result is written to.
+ *
+ * Execute diagnose 204 with the given subcode and write the result to the
+ * memory area specified with @addr. For subcodes which do not write a
+ * result to memory, both @size and @addr must be zero. If @addr is
+ * specified it must be page aligned and must have been allocated with
+ * vmalloc(). Conversion to real / physical addresses will be handled by
+ * this function if required.
+ */
int diag204(unsigned long subcode, unsigned long size, void *addr)
{
+ if (addr) {
+ if (WARN_ON_ONCE(!is_vmalloc_addr(addr)))
+ return -1;
+ if (WARN_ON_ONCE(!IS_ALIGNED((unsigned long)addr, PAGE_SIZE)))
+ return -1;
+ }
+ if ((subcode & DIAG204_SUBCODE_MASK) == DIAG204_SUBC_STIB4)
+ addr = (void *)pfn_to_phys(vmalloc_to_pfn(addr));
diag_stat_inc(DIAG_STAT_X204);
size = __diag204(&subcode, size, addr);
if (subcode)
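
A hedged sketch of a caller honoring the constraints documented for diag204() above; the subcode and flag names (DIAG204_SUBC_STIB6, DIAG204_INFO_EXT) and the way the page count is obtained are assumptions modeled on typical hypfs usage, not part of this patch:

#include <linux/vmalloc.h>
#include <linux/overflow.h>
#include <asm/diag.h>

/* Illustrative only: fetch extended partition data into a vmalloc'ed buffer. */
static void *diag204_fetch(unsigned long pages)
{
	void *buf;
	int rc;

	/* vmalloc() returns page-aligned memory, as diag204() requires. */
	buf = vmalloc(array_size(pages, PAGE_SIZE));
	if (!buf)
		return NULL;
	/* Subcode/flag names are assumed; a real caller would first query
	 * the required number of pages (e.g. via the RSI subcode). */
	rc = diag204(DIAG204_SUBC_STIB6 | DIAG204_INFO_EXT, pages, buf);
	if (rc < 0) {
		vfree(buf);
		return NULL;
	}
	return buf;
}
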
@@ -184,20 +210,42 @@ int diag210(struct diag210 *addr)
int ccode;
spin_lock_irqsave(&diag210_lock, flags);
- *__diag210_tmp_dma = *addr;
+ *__diag210_tmp_amode31 = *addr;
diag_stat_inc(DIAG_STAT_X210);
- ccode = diag_dma_ops.diag210(__diag210_tmp_dma);
+ ccode = diag_amode31_ops.diag210(__diag210_tmp_amode31);
- *addr = *__diag210_tmp_dma;
+ *addr = *__diag210_tmp_amode31;
spin_unlock_irqrestore(&diag210_lock, flags);
return ccode;
}
EXPORT_SYMBOL(diag210);
+/*
+ * Diagnose 8C: Access 3270 Display Device Information
+ */
+int diag8c(struct diag8c *addr, struct ccw_dev_id *devno)
+{
+ static DEFINE_SPINLOCK(diag8c_lock);
+ unsigned long flags;
+ int ccode;
+
+ spin_lock_irqsave(&diag8c_lock, flags);
+
+ diag_stat_inc(DIAG_STAT_X08C);
+ ccode = diag_amode31_ops.diag8c(__diag8c_tmp_amode31, devno, sizeof(*addr));
+
+ *addr = *__diag8c_tmp_amode31;
+ spin_unlock_irqrestore(&diag8c_lock, flags);
+
+ return ccode;
+}
+EXPORT_SYMBOL(diag8c);
+
int diag224(void *ptr)
{
+ unsigned long addr = __pa(ptr);
int rc = -EOPNOTSUPP;
diag_stat_inc(DIAG_STAT_X224);
@@ -206,7 +254,7 @@ int diag224(void *ptr)
"0: lhi %0,0x0\n"
"1:\n"
EX_TABLE(0b,1b)
- : "+d" (rc) :"d" (0), "d" (ptr) : "memory");
+ : "+d" (rc) :"d" (0), "d" (addr) : "memory");
return rc;
}
EXPORT_SYMBOL(diag224);
@@ -217,6 +265,6 @@ EXPORT_SYMBOL(diag224);
int diag26c(void *req, void *resp, enum diag26c_sc subcode)
{
diag_stat_inc(DIAG_STAT_X26C);
- return diag_dma_ops.diag26c(req, resp, subcode);
+ return diag_amode31_ops.diag26c(req, resp, subcode);
}
EXPORT_SYMBOL(diag26c);
diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c
index f304802ecf7b..89dc826a8d2e 100644
--- a/arch/s390/kernel/dis.c
+++ b/arch/s390/kernel/dis.c
@@ -24,8 +24,8 @@
#include <linux/kdebug.h>
#include <linux/uaccess.h>
#include <linux/atomic.h>
+#include <linux/io.h>
#include <asm/dis.h>
-#include <asm/io.h>
#include <asm/cpcmd.h>
#include <asm/lowcore.h>
#include <asm/debug.h>
@@ -278,6 +278,7 @@ static const unsigned char formats[][6] = {
[INSTR_SIL_RDI] = { D_20, B_16, I16_32, 0, 0, 0 },
[INSTR_SIL_RDU] = { D_20, B_16, U16_32, 0, 0, 0 },
[INSTR_SIY_IRD] = { D20_20, B_16, I8_8, 0, 0, 0 },
+ [INSTR_SIY_RD] = { D20_20, B_16, 0, 0, 0, 0 },
[INSTR_SIY_URD] = { D20_20, B_16, U8_8, 0, 0, 0 },
[INSTR_SI_RD] = { D_20, B_16, 0, 0, 0, 0 },
[INSTR_SI_URD] = { D_20, B_16, U8_8, 0, 0, 0 },
@@ -312,10 +313,12 @@ static const unsigned char formats[][6] = {
[INSTR_VRR_VV] = { V_8, V_12, 0, 0, 0, 0 },
[INSTR_VRR_VV0U] = { V_8, V_12, U4_32, 0, 0, 0 },
[INSTR_VRR_VV0U0U] = { V_8, V_12, U4_32, U4_24, 0, 0 },
+ [INSTR_VRR_VV0U2] = { V_8, V_12, U4_24, 0, 0, 0 },
[INSTR_VRR_VV0UU2] = { V_8, V_12, U4_32, U4_28, 0, 0 },
[INSTR_VRR_VV0UUU] = { V_8, V_12, U4_32, U4_28, U4_24, 0 },
[INSTR_VRR_VVV] = { V_8, V_12, V_16, 0, 0, 0 },
[INSTR_VRR_VVV0U] = { V_8, V_12, V_16, U4_32, 0, 0 },
+ [INSTR_VRR_VVV0U0] = { V_8, V_12, V_16, U4_24, 0, 0 },
[INSTR_VRR_VVV0U0U] = { V_8, V_12, V_16, U4_32, U4_24, 0 },
[INSTR_VRR_VVV0UU] = { V_8, V_12, V_16, U4_32, U4_28, 0 },
[INSTR_VRR_VVV0UUU] = { V_8, V_12, V_16, U4_32, U4_28, U4_24 },
@@ -482,32 +485,38 @@ static int print_insn(char *buffer, unsigned char *code, unsigned long addr)
return (int) (ptr - buffer);
}
+static int copy_from_regs(struct pt_regs *regs, void *dst, void *src, int len)
+{
+ if (user_mode(regs)) {
+ if (copy_from_user(dst, (char __user *)src, len))
+ return -EFAULT;
+ } else {
+ if (copy_from_kernel_nofault(dst, src, len))
+ return -EFAULT;
+ }
+ return 0;
+}
+
void show_code(struct pt_regs *regs)
{
char *mode = user_mode(regs) ? "User" : "Krnl";
unsigned char code[64];
char buffer[128], *ptr;
- mm_segment_t old_fs;
unsigned long addr;
int start, end, opsize, hops, i;
/* Get a snapshot of the 64 bytes surrounding the fault address. */
- old_fs = get_fs();
- set_fs(user_mode(regs) ? USER_DS : KERNEL_DS);
for (start = 32; start && regs->psw.addr >= 34 - start; start -= 2) {
addr = regs->psw.addr - 34 + start;
- if (__copy_from_user(code + start - 2,
- (char __user *) addr, 2))
+ if (copy_from_regs(regs, code + start - 2, (void *)addr, 2))
break;
}
for (end = 32; end < 64; end += 2) {
addr = regs->psw.addr + end - 32;
- if (__copy_from_user(code + end,
- (char __user *) addr, 2))
+ if (copy_from_regs(regs, code + end, (void *)addr, 2))
break;
}
- set_fs(old_fs);
- /* Code snapshot useable ? */
+ /* Code snapshot usable ? */
if ((regs->psw.addr & 1) || start >= end) {
printk("%s Code: Bad PSW.\n", mode);
return;
@@ -557,7 +566,7 @@ void show_code(struct pt_regs *regs)
void print_fn_code(unsigned char *code, unsigned long len)
{
- char buffer[64], *ptr;
+ char buffer[128], *ptr;
int opsize, i;
while (len) {
diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c
index d306fe04489a..d2012635b093 100644
--- a/arch/s390/kernel/dumpstack.c
+++ b/arch/s390/kernel/dumpstack.c
@@ -41,51 +41,50 @@ const char *stack_type_name(enum stack_type type)
EXPORT_SYMBOL_GPL(stack_type_name);
static inline bool in_stack(unsigned long sp, struct stack_info *info,
- enum stack_type type, unsigned long low,
- unsigned long high)
+ enum stack_type type, unsigned long stack)
{
- if (sp < low || sp >= high)
+ if (sp < stack || sp >= stack + THREAD_SIZE)
return false;
info->type = type;
- info->begin = low;
- info->end = high;
+ info->begin = stack;
+ info->end = stack + THREAD_SIZE;
return true;
}
static bool in_task_stack(unsigned long sp, struct task_struct *task,
struct stack_info *info)
{
- unsigned long stack;
+ unsigned long stack = (unsigned long)task_stack_page(task);
- stack = (unsigned long) task_stack_page(task);
- return in_stack(sp, info, STACK_TYPE_TASK, stack, stack + THREAD_SIZE);
+ return in_stack(sp, info, STACK_TYPE_TASK, stack);
}
static bool in_irq_stack(unsigned long sp, struct stack_info *info)
{
- unsigned long frame_size, top;
+ unsigned long stack = S390_lowcore.async_stack - STACK_INIT_OFFSET;
- frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs);
- top = S390_lowcore.async_stack + frame_size;
- return in_stack(sp, info, STACK_TYPE_IRQ, top - THREAD_SIZE, top);
+ return in_stack(sp, info, STACK_TYPE_IRQ, stack);
}
static bool in_nodat_stack(unsigned long sp, struct stack_info *info)
{
- unsigned long frame_size, top;
+ unsigned long stack = S390_lowcore.nodat_stack - STACK_INIT_OFFSET;
- frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs);
- top = S390_lowcore.nodat_stack + frame_size;
- return in_stack(sp, info, STACK_TYPE_NODAT, top - THREAD_SIZE, top);
+ return in_stack(sp, info, STACK_TYPE_NODAT, stack);
+}
+
+static bool in_mcck_stack(unsigned long sp, struct stack_info *info)
+{
+ unsigned long stack = S390_lowcore.mcck_stack - STACK_INIT_OFFSET;
+
+ return in_stack(sp, info, STACK_TYPE_MCCK, stack);
}
static bool in_restart_stack(unsigned long sp, struct stack_info *info)
{
- unsigned long frame_size, top;
+ unsigned long stack = S390_lowcore.restart_stack - STACK_INIT_OFFSET;
- frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs);
- top = S390_lowcore.restart_stack + frame_size;
- return in_stack(sp, info, STACK_TYPE_RESTART, top - THREAD_SIZE, top);
+ return in_stack(sp, info, STACK_TYPE_RESTART, stack);
}
int get_stack_info(unsigned long sp, struct task_struct *task,
@@ -108,7 +107,8 @@ int get_stack_info(unsigned long sp, struct task_struct *task,
/* Check per-cpu stacks */
if (!in_irq_stack(sp, info) &&
!in_nodat_stack(sp, info) &&
- !in_restart_stack(sp, info))
+ !in_restart_stack(sp, info) &&
+ !in_mcck_stack(sp, info))
goto unknown;
recursion_check:
@@ -126,22 +126,29 @@ unknown:
return -EINVAL;
}
-void show_stack(struct task_struct *task, unsigned long *stack)
+void show_stack(struct task_struct *task, unsigned long *stack,
+ const char *loglvl)
{
struct unwind_state state;
- printk("Call Trace:\n");
+ printk("%sCall Trace:\n", loglvl);
unwind_for_each_frame(&state, task, NULL, (unsigned long) stack)
- printk(state.reliable ? " [<%016lx>] %pSR \n" :
- "([<%016lx>] %pSR)\n",
- state.ip, (void *) state.ip);
+ printk(state.reliable ? "%s [<%016lx>] %pSR \n" :
+ "%s([<%016lx>] %pSR)\n",
+ loglvl, state.ip, (void *) state.ip);
debug_show_held_locks(task ? : current);
}
static void show_last_breaking_event(struct pt_regs *regs)
{
printk("Last Breaking-Event-Address:\n");
- printk(" [<%016lx>] %pSR\n", regs->args[0], (void *)regs->args[0]);
+ printk(" [<%016lx>] ", regs->last_break);
+ if (user_mode(regs)) {
+ print_vma_addr(KERN_CONT, regs->last_break);
+ pr_cont("\n");
+ } else {
+ pr_cont("%pSR\n", (void *)regs->last_break);
+ }
}
void show_registers(struct pt_regs *regs)
@@ -175,13 +182,13 @@ void show_regs(struct pt_regs *regs)
show_registers(regs);
/* Show stack backtrace if pt_regs is from kernel mode */
if (!user_mode(regs))
- show_stack(NULL, (unsigned long *) regs->gprs[15]);
+ show_stack(NULL, (unsigned long *) regs->gprs[15], KERN_DEFAULT);
show_last_breaking_event(regs);
}
static DEFINE_SPINLOCK(die_lock);
-void die(struct pt_regs *regs, const char *str)
+void __noreturn die(struct pt_regs *regs, const char *str)
{
static int die_counter;
@@ -195,6 +202,8 @@ void die(struct pt_regs *regs, const char *str)
regs->int_code >> 17, ++die_counter);
#ifdef CONFIG_PREEMPT
pr_cont("PREEMPT ");
+#elif defined(CONFIG_PREEMPT_RT)
+ pr_cont("PREEMPT_RT ");
#endif
pr_cont("SMP ");
if (debug_pagealloc_enabled())
@@ -211,5 +220,5 @@ void die(struct pt_regs *regs, const char *str)
if (panic_on_oops)
panic("Fatal exception: panic_on_oops");
oops_exit();
- do_exit(SIGSEGV);
+ make_task_dead(SIGSEGV);
}
diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c
index db32a55daaec..2345ea332b97 100644
--- a/arch/s390/kernel/early.c
+++ b/arch/s390/kernel/early.c
@@ -2,7 +2,6 @@
/*
* Copyright IBM Corp. 2007, 2009
* Author(s): Hongjie Yang <hongjie@us.ibm.com>,
- * Heiko Carstens <heiko.carstens@de.ibm.com>
*/
#define KMSG_COMPONENT "setup"
@@ -18,6 +17,8 @@
#include <linux/pfn.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
+#include <asm/asm-extable.h>
+#include <linux/memblock.h>
#include <asm/diag.h>
#include <asm/ebcdic.h>
#include <asm/ipl.h>
@@ -33,18 +34,43 @@
#include <asm/switch_to.h>
#include "entry.h"
+#define decompressor_handled_param(param) \
+static int __init ignore_decompressor_param_##param(char *s) \
+{ \
+ return 0; \
+} \
+early_param(#param, ignore_decompressor_param_##param)
+
+decompressor_handled_param(mem);
+decompressor_handled_param(vmalloc);
+decompressor_handled_param(dfltcc);
+decompressor_handled_param(facilities);
+decompressor_handled_param(nokaslr);
+decompressor_handled_param(cmma);
+#if IS_ENABLED(CONFIG_KVM)
+decompressor_handled_param(prot_virt);
+#endif
+
+static void __init kasan_early_init(void)
+{
+#ifdef CONFIG_KASAN
+ init_task.kasan_depth = 0;
+ sclp_early_printk("KernelAddressSanitizer initialized\n");
+#endif
+}
+
static void __init reset_tod_clock(void)
{
- u64 time;
+ union tod_clock clk;
- if (store_tod_clock(&time) == 0)
+ if (store_tod_clock_ext_cc(&clk) == 0)
return;
/* TOD clock not running. Set the clock to Unix Epoch. */
- if (set_tod_clock(TOD_UNIX_EPOCH) != 0 || store_tod_clock(&time) != 0)
+ if (set_tod_clock(TOD_UNIX_EPOCH) || store_tod_clock_ext_cc(&clk))
disabled_wait();
- memset(tod_clock_base, 0, 16);
- *(__u64 *) &tod_clock_base[1] = TOD_UNIX_EPOCH;
+ memset(&tod_clock_base, 0, sizeof(tod_clock_base));
+ tod_clock_base.tod = TOD_UNIX_EPOCH;
S390_lowcore.last_update_clock = TOD_UNIX_EPOCH;
}
@@ -147,44 +173,27 @@ static __init void setup_topology(void)
topology_max_mnest = max_mnest;
}
-static void early_pgm_check_handler(void)
+void __do_early_pgm_check(struct pt_regs *regs)
{
- const struct exception_table_entry *fixup;
- unsigned long cr0, cr0_new;
- unsigned long addr;
-
- addr = S390_lowcore.program_old_psw.addr;
- fixup = s390_search_extables(addr);
- if (!fixup)
+ if (!fixup_exception(regs))
disabled_wait();
- /* Disable low address protection before storing into lowcore. */
- __ctl_store(cr0, 0, 0);
- cr0_new = cr0 & ~(1UL << 28);
- __ctl_load(cr0_new, 0, 0);
- S390_lowcore.program_old_psw.addr = extable_fixup(fixup);
- __ctl_load(cr0, 0, 0);
}
static noinline __init void setup_lowcore_early(void)
{
psw_t psw;
- psw.mask = PSW_MASK_BASE | PSW_DEFAULT_KEY | PSW_MASK_EA | PSW_MASK_BA;
- psw.addr = (unsigned long) s390_base_ext_handler;
- S390_lowcore.external_new_psw = psw;
- psw.addr = (unsigned long) s390_base_pgm_handler;
+ psw.addr = (unsigned long)early_pgm_check_handler;
+ psw.mask = PSW_KERNEL_BITS;
S390_lowcore.program_new_psw = psw;
- s390_base_pgm_handler_fn = early_pgm_check_handler;
S390_lowcore.preempt_count = INIT_PREEMPT_COUNT;
}
static noinline __init void setup_facility_list(void)
{
- memcpy(S390_lowcore.alt_stfle_fac_list,
- S390_lowcore.stfle_fac_list,
- sizeof(S390_lowcore.alt_stfle_fac_list));
+ memcpy(alt_stfle_fac_list, stfle_fac_list, sizeof(alt_stfle_fac_list));
if (!IS_ENABLED(CONFIG_KERNEL_NOBP))
- __clear_facility(82, S390_lowcore.alt_stfle_fac_list);
+ __clear_facility(82, alt_stfle_fac_list);
}
static __init void detect_diag9c(void)
@@ -204,26 +213,11 @@ static __init void detect_diag9c(void)
S390_lowcore.machine_flags |= MACHINE_FLAG_DIAG9C;
}
-static __init void detect_diag44(void)
-{
- int rc;
-
- diag_stat_inc(DIAG_STAT_X044);
- asm volatile(
- " diag 0,0,0x44\n"
- "0: la %0,0\n"
- "1:\n"
- EX_TABLE(0b,1b)
- : "=d" (rc) : "0" (-EOPNOTSUPP) : "cc");
- if (!rc)
- S390_lowcore.machine_flags |= MACHINE_FLAG_DIAG44;
-}
-
static __init void detect_machine_facilities(void)
{
if (test_facility(8)) {
S390_lowcore.machine_flags |= MACHINE_FLAG_EDAT1;
- __ctl_set_bit(0, 23);
+ system_ctl_set_bit(0, CR0_EDAT_BIT);
}
if (test_facility(78))
S390_lowcore.machine_flags |= MACHINE_FLAG_EDAT2;
@@ -231,26 +225,28 @@ static __init void detect_machine_facilities(void)
S390_lowcore.machine_flags |= MACHINE_FLAG_IDTE;
if (test_facility(50) && test_facility(73)) {
S390_lowcore.machine_flags |= MACHINE_FLAG_TE;
- __ctl_set_bit(0, 55);
+ system_ctl_set_bit(0, CR0_TRANSACTIONAL_EXECUTION_BIT);
}
if (test_facility(51))
S390_lowcore.machine_flags |= MACHINE_FLAG_TLB_LC;
- if (test_facility(129)) {
- S390_lowcore.machine_flags |= MACHINE_FLAG_VX;
- __ctl_set_bit(0, 17);
- }
- if (test_facility(130) && !noexec_disabled) {
+ if (test_facility(129))
+ system_ctl_set_bit(0, CR0_VECTOR_BIT);
+ if (test_facility(130))
S390_lowcore.machine_flags |= MACHINE_FLAG_NX;
- __ctl_set_bit(0, 20);
- }
if (test_facility(133))
S390_lowcore.machine_flags |= MACHINE_FLAG_GS;
- if (test_facility(139) && (tod_clock_base[1] & 0x80)) {
+ if (test_facility(139) && (tod_clock_base.tod >> 63)) {
/* Enabled signed clock comparator comparisons */
S390_lowcore.machine_flags |= MACHINE_FLAG_SCC;
clock_comparator_max = -1ULL >> 1;
- __ctl_set_bit(0, 53);
+ system_ctl_set_bit(0, CR0_CLOCK_COMPARATOR_SIGN_BIT);
}
+ if (IS_ENABLED(CONFIG_PCI) && test_facility(153)) {
+ S390_lowcore.machine_flags |= MACHINE_FLAG_PCI_MIO;
+ /* the control bit is set during PCI initialization */
+ }
+ if (test_facility(194))
+ S390_lowcore.machine_flags |= MACHINE_FLAG_RDP;
}
static inline void save_vector_registers(void)
@@ -261,15 +257,9 @@ static inline void save_vector_registers(void)
#endif
}
-static inline void setup_control_registers(void)
+static inline void setup_low_address_protection(void)
{
- unsigned long reg;
-
- __ctl_store(reg, 0, 0);
- reg |= CR0_LOW_ADDRESS_PROTECTION;
- reg |= CR0_EMERGENCY_SIGNAL_SUBMASK;
- reg |= CR0_EXTERNAL_CALL_SUBMASK;
- __ctl_load(reg, 0, 0);
+ system_ctl_set_bit(0, CR0_LOW_ADDRESS_PROTECTION_BIT);
}
static inline void setup_access_registers(void)
@@ -279,64 +269,37 @@ static inline void setup_access_registers(void)
restore_access_regs(acrs);
}
-static int __init disable_vector_extension(char *str)
-{
- S390_lowcore.machine_flags &= ~MACHINE_FLAG_VX;
- __ctl_clear_bit(0, 17);
- return 0;
-}
-early_param("novx", disable_vector_extension);
-
-static int __init cad_setup(char *str)
-{
- bool enabled;
- int rc;
-
- rc = kstrtobool(str, &enabled);
- if (!rc && enabled && test_facility(128))
- /* Enable problem state CAD. */
- __ctl_set_bit(2, 3);
- return rc;
-}
-early_param("cad", cad_setup);
-
char __bootdata(early_command_line)[COMMAND_LINE_SIZE];
static void __init setup_boot_command_line(void)
{
/* copy arch command line */
- strlcpy(boot_command_line, early_command_line, ARCH_COMMAND_LINE_SIZE);
+ strscpy(boot_command_line, early_command_line, COMMAND_LINE_SIZE);
}
-static void __init check_image_bootable(void)
+static void __init sort_amode31_extable(void)
{
- if (!memcmp(EP_STRING, (void *)EP_OFFSET, strlen(EP_STRING)))
- return;
-
- sclp_early_printk("Linux kernel boot failure: An attempt to boot a vmlinux ELF image failed.\n");
- sclp_early_printk("This image does not contain all parts necessary for starting up. Use\n");
- sclp_early_printk("bzImage or arch/s390/boot/compressed/vmlinux instead.\n");
- disabled_wait();
+ sort_extable(__start_amode31_ex_table, __stop_amode31_ex_table);
}
void __init startup_init(void)
{
+ kasan_early_init();
reset_tod_clock();
- check_image_bootable();
time_early_init();
init_kernel_storage_key();
lockdep_off();
+ sort_amode31_extable();
setup_lowcore_early();
setup_facility_list();
detect_machine_type();
setup_arch_string();
setup_boot_command_line();
detect_diag9c();
- detect_diag44();
detect_machine_facilities();
save_vector_registers();
setup_topology();
sclp_early_detect();
- setup_control_registers();
+ setup_low_address_protection();
setup_access_registers();
lockdep_on();
}
diff --git a/arch/s390/kernel/early_printk.c b/arch/s390/kernel/early_printk.c
index 6f24d83bc5dc..d9d53f44008a 100644
--- a/arch/s390/kernel/early_printk.c
+++ b/arch/s390/kernel/early_printk.c
@@ -10,7 +10,7 @@
static void sclp_early_write(struct console *con, const char *s, unsigned int len)
{
- __sclp_early_printk(s, len, 0);
+ __sclp_early_printk(s, len);
}
static struct console sclp_early_console = {
diff --git a/arch/s390/kernel/earlypgm.S b/arch/s390/kernel/earlypgm.S
new file mode 100644
index 000000000000..c634871f0d90
--- /dev/null
+++ b/arch/s390/kernel/earlypgm.S
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright IBM Corp. 2006, 2007
+ * Author(s): Michael Holzheu <holzheu@de.ibm.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
+
+SYM_CODE_START(early_pgm_check_handler)
+ stmg %r8,%r15,__LC_SAVE_AREA_SYNC
+ aghi %r15,-(STACK_FRAME_OVERHEAD+__PT_SIZE)
+ la %r11,STACK_FRAME_OVERHEAD(%r15)
+ xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
+ stmg %r0,%r7,__PT_R0(%r11)
+ mvc __PT_PSW(16,%r11),__LC_PGM_OLD_PSW
+ mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC
+ lgr %r2,%r11
+ brasl %r14,__do_early_pgm_check
+ mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15)
+ lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15)
+ lpswe __LC_RETURN_PSW
+SYM_CODE_END(early_pgm_check_handler)
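
This stub replaces the old C early program-check handler: instead of patching the old PSW in lowcore (which required temporarily clearing low-address protection), it builds a pt_regs frame on the stack, lets __do_early_pgm_check()/fixup_exception() adjust the PSW through the regular extable machinery, and returns via the PSW copied back from pt_regs.
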
diff --git a/arch/s390/kernel/ebcdic.c b/arch/s390/kernel/ebcdic.c
index 7f8246c9be08..0e51fa537262 100644
--- a/arch/s390/kernel/ebcdic.c
+++ b/arch/s390/kernel/ebcdic.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * ECBDIC -> ASCII, ASCII -> ECBDIC,
+ * EBCDIC -> ASCII, ASCII -> EBCDIC,
* upper to lower case (EBCDIC) conversion tables.
*
* S390 version
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 270d1d145761..49a11f6dd7ae 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -6,15 +6,15 @@
* Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com),
* Hartmut Penner (hp@de.ibm.com),
* Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com),
- * Heiko Carstens <heiko.carstens@de.ibm.com>
*/
+#include <linux/export.h>
#include <linux/init.h>
#include <linux/linkage.h>
+#include <asm/asm-extable.h>
#include <asm/alternative-asm.h>
#include <asm/processor.h>
#include <asm/cache.h>
-#include <asm/ctl_reg.h>
#include <asm/dwarf.h>
#include <asm/errno.h>
#include <asm/ptrace.h>
@@ -27,67 +27,29 @@
#include <asm/vx-insn.h>
#include <asm/setup.h>
#include <asm/nmi.h>
-#include <asm/export.h>
#include <asm/nospec-insn.h>
-__PT_R0 = __PT_GPRS
-__PT_R1 = __PT_GPRS + 8
-__PT_R2 = __PT_GPRS + 16
-__PT_R3 = __PT_GPRS + 24
-__PT_R4 = __PT_GPRS + 32
-__PT_R5 = __PT_GPRS + 40
-__PT_R6 = __PT_GPRS + 48
-__PT_R7 = __PT_GPRS + 56
-__PT_R8 = __PT_GPRS + 64
-__PT_R9 = __PT_GPRS + 72
-__PT_R10 = __PT_GPRS + 80
-__PT_R11 = __PT_GPRS + 88
-__PT_R12 = __PT_GPRS + 96
-__PT_R13 = __PT_GPRS + 104
-__PT_R14 = __PT_GPRS + 112
-__PT_R15 = __PT_GPRS + 120
-
-STACK_SHIFT = PAGE_SHIFT + THREAD_SIZE_ORDER
-STACK_SIZE = 1 << STACK_SHIFT
-STACK_INIT = STACK_SIZE - STACK_FRAME_OVERHEAD - __PT_SIZE
-
-_TIF_WORK = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \
- _TIF_UPROBE | _TIF_GUARDED_STORAGE | _TIF_PATCH_PENDING)
-_TIF_TRACE = (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \
- _TIF_SYSCALL_TRACEPOINT)
-_CIF_WORK = (_CIF_MCCK_PENDING | _CIF_ASCE_PRIMARY | \
- _CIF_ASCE_SECONDARY | _CIF_FPU)
-_PIF_WORK = (_PIF_PER_TRAP | _PIF_SYSCALL_RESTART)
-
_LPP_OFFSET = __LC_LPP
-#define BASED(name) name-cleanup_critical(%r13)
+ .macro STBEAR address
+ ALTERNATIVE "nop", ".insn s,0xb2010000,\address", 193
+ .endm
- .macro TRACE_IRQS_ON
-#ifdef CONFIG_TRACE_IRQFLAGS
- basr %r2,%r0
- brasl %r14,trace_hardirqs_on_caller
-#endif
+ .macro LBEAR address
+ ALTERNATIVE "nop", ".insn s,0xb2000000,\address", 193
.endm
- .macro TRACE_IRQS_OFF
-#ifdef CONFIG_TRACE_IRQFLAGS
- basr %r2,%r0
- brasl %r14,trace_hardirqs_off_caller
-#endif
+ .macro LPSWEY address,lpswe
+ ALTERNATIVE "b \lpswe; nopr", ".insn siy,0xeb0000000071,\address,0", 193
.endm
- .macro LOCKDEP_SYS_EXIT
-#ifdef CONFIG_LOCKDEP
- tm __PT_PSW+1(%r11),0x01 # returning to user ?
- jz .+10
- brasl %r14,lockdep_sys_exit
-#endif
+ .macro MBEAR reg
+ ALTERNATIVE "brcl 0,0", __stringify(mvc __PT_LAST_BREAK(8,\reg),__LC_LAST_BREAK), 193
.endm
.macro CHECK_STACK savearea
#ifdef CONFIG_CHECK_STACK
- tml %r15,STACK_SIZE - CONFIG_STACK_GUARD
+ tml %r15,THREAD_SIZE - CONFIG_STACK_GUARD
lghi %r14,\savearea
jz stack_overflow
#endif
@@ -96,12 +58,14 @@ _LPP_OFFSET = __LC_LPP
.macro CHECK_VMAP_STACK savearea,oklabel
#ifdef CONFIG_VMAP_STACK
lgr %r14,%r15
- nill %r14,0x10000 - STACK_SIZE
- oill %r14,STACK_INIT
+ nill %r14,0x10000 - THREAD_SIZE
+ oill %r14,STACK_INIT_OFFSET
clg %r14,__LC_KERNEL_STACK
je \oklabel
clg %r14,__LC_ASYNC_STACK
je \oklabel
+ clg %r14,__LC_MCCK_STACK
+ je \oklabel
clg %r14,__LC_NODAT_STACK
je \oklabel
clg %r14,__LC_RESTART_STACK
@@ -113,56 +77,6 @@ _LPP_OFFSET = __LC_LPP
#endif
.endm
- .macro SWITCH_ASYNC savearea,timer
- tmhh %r8,0x0001 # interrupting from user ?
- jnz 1f
- lgr %r14,%r9
- slg %r14,BASED(.Lcritical_start)
- clg %r14,BASED(.Lcritical_length)
- jhe 0f
- lghi %r11,\savearea # inside critical section, do cleanup
- brasl %r14,cleanup_critical
- tmhh %r8,0x0001 # retest problem state after cleanup
- jnz 1f
-0: lg %r14,__LC_ASYNC_STACK # are we already on the target stack?
- slgr %r14,%r15
- srag %r14,%r14,STACK_SHIFT
- jnz 2f
- CHECK_STACK \savearea
- aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE)
- j 3f
-1: UPDATE_VTIME %r14,%r15,\timer
- BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP
-2: lg %r15,__LC_ASYNC_STACK # load async stack
-3: la %r11,STACK_FRAME_OVERHEAD(%r15)
- .endm
-
- .macro UPDATE_VTIME w1,w2,enter_timer
- lg \w1,__LC_EXIT_TIMER
- lg \w2,__LC_LAST_UPDATE_TIMER
- slg \w1,\enter_timer
- slg \w2,__LC_EXIT_TIMER
- alg \w1,__LC_USER_TIMER
- alg \w2,__LC_SYSTEM_TIMER
- stg \w1,__LC_USER_TIMER
- stg \w2,__LC_SYSTEM_TIMER
- mvc __LC_LAST_UPDATE_TIMER(8),\enter_timer
- .endm
-
- .macro REENABLE_IRQS
- stg %r8,__LC_RETURN_PSW
- ni __LC_RETURN_PSW,0xbf
- ssm __LC_RETURN_PSW
- .endm
-
- .macro STCK savearea
-#ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES
- .insn s,0xb27c0000,\savearea # store clock fast
-#else
- .insn s,0xb2050000,\savearea # store clock
-#endif
- .endm
-
/*
* The TSTMSK macro generates a test-under-mask instruction by
* calculating the memory offset for the specified mask value.
@@ -186,46 +100,76 @@ _LPP_OFFSET = __LC_LPP
.endm
.macro BPOFF
- ALTERNATIVE "", ".long 0xb2e8c000", 82
+ ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,12,0", 82
.endm
.macro BPON
- ALTERNATIVE "", ".long 0xb2e8d000", 82
+ ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,13,0", 82
.endm
.macro BPENTER tif_ptr,tif_mask
- ALTERNATIVE "TSTMSK \tif_ptr,\tif_mask; jz .+8; .long 0xb2e8d000", \
- "", 82
+ ALTERNATIVE "TSTMSK \tif_ptr,\tif_mask; jz .+8; .insn rrf,0xb2e80000,0,0,13,0", \
+ "j .+12; nop; nop", 82
.endm
.macro BPEXIT tif_ptr,tif_mask
TSTMSK \tif_ptr,\tif_mask
- ALTERNATIVE "jz .+8; .long 0xb2e8c000", \
- "jnz .+8; .long 0xb2e8d000", 82
+ ALTERNATIVE "jz .+8; .insn rrf,0xb2e80000,0,0,12,0", \
+ "jnz .+8; .insn rrf,0xb2e80000,0,0,13,0", 82
+ .endm
+
+#if IS_ENABLED(CONFIG_KVM)
+ /*
+ * The OUTSIDE macro jumps to the provided label in case the value
+ * in the provided register is outside of the provided range. The
+ * macro is useful for checking whether a PSW stored in a register
+ * pair points inside or outside of a block of instructions.
+ * @reg: register to check
+ * @start: start of the range
+ * @end: end of the range
+ * @outside_label: jump here if @reg is outside of [@start..@end)
+ */
+ .macro OUTSIDE reg,start,end,outside_label
+ lgr %r14,\reg
+ larl %r13,\start
+ slgr %r14,%r13
+ clgfrl %r14,.Lrange_size\@
+ jhe \outside_label
+ .section .rodata, "a"
+ .balign 4
+.Lrange_size\@:
+ .long \end - \start
+ .previous
+ .endm
+
+ .macro SIEEXIT
+ lg %r9,__SF_SIE_CONTROL(%r15) # get control block pointer
+ ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE
+ lctlg %c1,%c1,__LC_KERNEL_ASCE # load primary asce
+ larl %r9,sie_exit # skip forward to sie_exit
+ .endm
+#endif
+
+ .macro STACKLEAK_ERASE
+#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+ brasl %r14,stackleak_erase_on_task_stack
+#endif
.endm
- GEN_BR_THUNK %r9
GEN_BR_THUNK %r14
- GEN_BR_THUNK %r14,%r11
.section .kprobes.text, "ax"
.Ldummy:
/*
- * This nop exists only in order to avoid that __switch_to starts at
- * the beginning of the kprobes text section. In that case we would
- * have several symbols at the same address. E.g. objdump would take
- * an arbitrary symbol name when disassembling this code.
- * With the added nop in between the __switch_to symbol is unique
- * again.
+ * The following nop exists only in order to avoid that the next
+ * symbol starts at the beginning of the kprobes text section.
+ * In that case there would be several symbols at the same address.
+ * E.g. objdump would take an arbitrary symbol when disassembling
+ * the code.
+ * With the added nop in between this cannot happen.
*/
nop 0
-ENTRY(__bpon)
- .globl __bpon
- BPON
- BR_EX %r14
-ENDPROC(__bpon)
-
/*
* Scheduler resume function, called by switch_to
* gpr2 = (task_struct *) prev
@@ -233,11 +177,11 @@ ENDPROC(__bpon)
* Returns:
* gpr2 = prev
*/
-ENTRY(__switch_to)
+SYM_FUNC_START(__switch_to)
stmg %r6,%r15,__SF_GPRS(%r15) # store gprs of prev task
lghi %r4,__TASK_stack
lghi %r1,__TASK_thread
- llill %r5,STACK_INIT
+ llill %r5,STACK_INIT_OFFSET
stg %r15,__THREAD_ksp(%r1,%r2) # store kernel stack of prev
lg %r15,0(%r4,%r3) # start of kernel stack of next
agr %r15,%r5 # end of kernel stack of next
@@ -247,30 +191,26 @@ ENTRY(__switch_to)
aghi %r3,__TASK_pid
mvc __LC_CURRENT_PID(4,%r0),0(%r3) # store pid of next
lmg %r6,%r15,__SF_GPRS(%r15) # load gprs of next task
- ALTERNATIVE "", ".insn s,0xb2800000,_LPP_OFFSET", 40
+ ALTERNATIVE "nop", "lpp _LPP_OFFSET", 40
BR_EX %r14
-ENDPROC(__switch_to)
-
-.L__critical_start:
+SYM_FUNC_END(__switch_to)
#if IS_ENABLED(CONFIG_KVM)
/*
- * sie64a calling convention:
- * %r2 pointer to sie control block
- * %r3 guest register save area
+ * __sie64a calling convention:
+ * %r2 pointer to sie control block phys
+ * %r3 pointer to sie control block virt
+ * %r4 guest register save area
*/
-ENTRY(sie64a)
+SYM_FUNC_START(__sie64a)
stmg %r6,%r14,__SF_GPRS(%r15) # save kernel registers
lg %r12,__LC_CURRENT
- stg %r2,__SF_SIE_CONTROL(%r15) # save control block pointer
- stg %r3,__SF_SIE_SAVEAREA(%r15) # save guest register save area
+ stg %r2,__SF_SIE_CONTROL_PHYS(%r15) # save sie block physical..
+ stg %r3,__SF_SIE_CONTROL(%r15) # ...and virtual addresses
+ stg %r4,__SF_SIE_SAVEAREA(%r15) # save guest register save area
xc __SF_SIE_REASON(8,%r15),__SF_SIE_REASON(%r15) # reason code = 0
mvc __SF_SIE_FLAGS(8,%r15),__TI_flags(%r12) # copy thread flags
- TSTMSK __LC_CPU_FLAGS,_CIF_FPU # load guest fp/vx registers ?
- jno .Lsie_load_guest_gprs
- brasl %r14,load_fpu_regs # load guest fp/vx regs
-.Lsie_load_guest_gprs:
- lmg %r0,%r13,0(%r3) # load guest gprs 0-13
+ lmg %r0,%r13,0(%r4) # load guest gprs 0-13
lg %r14,__LC_GMAP # get gmap pointer
ltgr %r14,%r14
jz .Lsie_gmap
@@ -282,35 +222,37 @@ ENTRY(sie64a)
jnz .Lsie_skip
TSTMSK __LC_CPU_FLAGS,_CIF_FPU
jo .Lsie_skip # exit if fp/vx regs changed
- BPEXIT __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
+ lg %r14,__SF_SIE_CONTROL_PHYS(%r15) # get sie block phys addr
+ BPEXIT __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST
.Lsie_entry:
sie 0(%r14)
-.Lsie_exit:
+# Let the next instruction be NOP to avoid triggering a machine check
+# and handling it in a guest as a result of the instruction execution.
+ nopr 7
+.Lsie_leave:
BPOFF
- BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
+ BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST
.Lsie_skip:
+ lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer
ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE
- lctlg %c1,%c1,__LC_USER_ASCE # load primary asce
+ lctlg %c1,%c1,__LC_KERNEL_ASCE # load primary asce
.Lsie_done:
# some program checks are suppressing. C code (e.g. do_protection_exception)
# will rewind the PSW by the ILC, which is often 4 bytes in case of SIE. There
# are some corner cases (e.g. runtime instrumentation) where ILC is unpredictable.
-# Other instructions between sie64a and .Lsie_done should not cause program
+# Other instructions between __sie64a and .Lsie_done should not cause program
# interrupts. So lets use 3 nops as a landing pad for all possible rewinds.
-# See also .Lcleanup_sie
.Lrewind_pad6:
nopr 7
.Lrewind_pad4:
nopr 7
.Lrewind_pad2:
nopr 7
- .globl sie_exit
-sie_exit:
+SYM_INNER_LABEL(sie_exit, SYM_L_GLOBAL)
lg %r14,__SF_SIE_SAVEAREA(%r15) # load guest register save area
stmg %r0,%r13,0(%r14) # save guest gprs 0-13
xgr %r0,%r0 # clear guest registers to
xgr %r1,%r1 # prevent speculative use
- xgr %r2,%r2
xgr %r3,%r3
xgr %r4,%r4
xgr %r5,%r5
@@ -326,838 +268,244 @@ sie_exit:
EX_TABLE(.Lrewind_pad4,.Lsie_fault)
EX_TABLE(.Lrewind_pad2,.Lsie_fault)
EX_TABLE(sie_exit,.Lsie_fault)
-ENDPROC(sie64a)
-EXPORT_SYMBOL(sie64a)
+SYM_FUNC_END(__sie64a)
+EXPORT_SYMBOL(__sie64a)
EXPORT_SYMBOL(sie_exit)
#endif
/*
* SVC interrupt handler routine. System calls are synchronous events and
- * are executed with interrupts enabled.
+ * are entered with interrupts disabled.
*/
-ENTRY(system_call)
- stpt __LC_SYNC_ENTER_TIMER
-.Lsysc_stmg:
+SYM_CODE_START(system_call)
+ stpt __LC_SYS_ENTER_TIMER
stmg %r8,%r15,__LC_SAVE_AREA_SYNC
BPOFF
- lg %r12,__LC_CURRENT
- lghi %r13,__TASK_thread
- lghi %r14,_PIF_SYSCALL
+ lghi %r14,0
.Lsysc_per:
+ STBEAR __LC_LAST_BREAK
+ lctlg %c1,%c1,__LC_KERNEL_ASCE
lg %r15,__LC_KERNEL_STACK
- la %r11,STACK_FRAME_OVERHEAD(%r15) # pointer to pt_regs
-.Lsysc_vtime:
- UPDATE_VTIME %r8,%r9,__LC_SYNC_ENTER_TIMER
- BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP
- stmg %r0,%r7,__PT_R0(%r11)
- mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC
- mvc __PT_PSW(16,%r11),__LC_SVC_OLD_PSW
- mvc __PT_INT_CODE(4,%r11),__LC_SVC_ILC
- stg %r14,__PT_FLAGS(%r11)
-.Lsysc_do_svc:
+ xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
+ stmg %r0,%r7,STACK_FRAME_OVERHEAD+__PT_R0(%r15)
# clear user controlled register to prevent speculative use
xgr %r0,%r0
- # load address of system call table
- lg %r10,__THREAD_sysc_table(%r13,%r12)
- llgh %r8,__PT_INT_CODE+2(%r11)
- slag %r8,%r8,3 # shift and test for svc 0
- jnz .Lsysc_nr_ok
- # svc 0: system call number in %r1
- llgfr %r1,%r1 # clear high word in r1
- cghi %r1,NR_syscalls
- jnl .Lsysc_nr_ok
- sth %r1,__PT_INT_CODE+2(%r11)
- slag %r8,%r1,3
-.Lsysc_nr_ok:
- xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
- stg %r2,__PT_ORIG_GPR2(%r11)
- stg %r7,STACK_FRAME_OVERHEAD(%r15)
- lg %r9,0(%r8,%r10) # get system call add.
- TSTMSK __TI_flags(%r12),_TIF_TRACE
- jnz .Lsysc_tracesys
- BASR_EX %r14,%r9 # call sys_xxxx
- stg %r2,__PT_R2(%r11) # store return value
-
-.Lsysc_return:
-#ifdef CONFIG_DEBUG_RSEQ
- lgr %r2,%r11
- brasl %r14,rseq_syscall
-#endif
- LOCKDEP_SYS_EXIT
-.Lsysc_tif:
- TSTMSK __PT_FLAGS(%r11),_PIF_WORK
- jnz .Lsysc_work
- TSTMSK __TI_flags(%r12),_TIF_WORK
- jnz .Lsysc_work # check for work
- TSTMSK __LC_CPU_FLAGS,_CIF_WORK
- jnz .Lsysc_work
- BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP
-.Lsysc_restore:
- lg %r14,__LC_VDSO_PER_CPU
- lmg %r0,%r10,__PT_R0(%r11)
- mvc __LC_RETURN_PSW(16),__PT_PSW(%r11)
-.Lsysc_exit_timer:
+ xgr %r1,%r1
+ xgr %r4,%r4
+ xgr %r5,%r5
+ xgr %r6,%r6
+ xgr %r7,%r7
+ xgr %r8,%r8
+ xgr %r9,%r9
+ xgr %r10,%r10
+ xgr %r11,%r11
+ la %r2,STACK_FRAME_OVERHEAD(%r15) # pointer to pt_regs
+ mvc __PT_R8(64,%r2),__LC_SAVE_AREA_SYNC
+ MBEAR %r2
+ lgr %r3,%r14
+ brasl %r14,__do_syscall
+ STACKLEAK_ERASE
+ lctlg %c1,%c1,__LC_USER_ASCE
+ mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15)
+ BPON
+ LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15)
+ lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15)
stpt __LC_EXIT_TIMER
- mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER
- lmg %r11,%r15,__PT_R11(%r11)
- lpswe __LC_RETURN_PSW
-.Lsysc_done:
-
-#
-# One of the work bits is on. Find out which one.
-#
-.Lsysc_work:
- TSTMSK __LC_CPU_FLAGS,_CIF_MCCK_PENDING
- jo .Lsysc_mcck_pending
- TSTMSK __TI_flags(%r12),_TIF_NEED_RESCHED
- jo .Lsysc_reschedule
- TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL_RESTART
- jo .Lsysc_syscall_restart
-#ifdef CONFIG_UPROBES
- TSTMSK __TI_flags(%r12),_TIF_UPROBE
- jo .Lsysc_uprobe_notify
-#endif
- TSTMSK __TI_flags(%r12),_TIF_GUARDED_STORAGE
- jo .Lsysc_guarded_storage
- TSTMSK __PT_FLAGS(%r11),_PIF_PER_TRAP
- jo .Lsysc_singlestep
-#ifdef CONFIG_LIVEPATCH
- TSTMSK __TI_flags(%r12),_TIF_PATCH_PENDING
- jo .Lsysc_patch_pending # handle live patching just before
- # signals and possible syscall restart
-#endif
- TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL_RESTART
- jo .Lsysc_syscall_restart
- TSTMSK __TI_flags(%r12),_TIF_SIGPENDING
- jo .Lsysc_sigpending
- TSTMSK __TI_flags(%r12),_TIF_NOTIFY_RESUME
- jo .Lsysc_notify_resume
- TSTMSK __LC_CPU_FLAGS,_CIF_FPU
- jo .Lsysc_vxrs
- TSTMSK __LC_CPU_FLAGS,(_CIF_ASCE_PRIMARY|_CIF_ASCE_SECONDARY)
- jnz .Lsysc_asce
- j .Lsysc_return # beware of critical section cleanup
-
-#
-# _TIF_NEED_RESCHED is set, call schedule
-#
-.Lsysc_reschedule:
- larl %r14,.Lsysc_return
- jg schedule
-
-#
-# _CIF_MCCK_PENDING is set, call handler
-#
-.Lsysc_mcck_pending:
- larl %r14,.Lsysc_return
- jg s390_handle_mcck # TIF bit will be cleared by handler
-
-#
-# _CIF_ASCE_PRIMARY and/or _CIF_ASCE_SECONDARY set, load user space asce
-#
-.Lsysc_asce:
- ni __LC_CPU_FLAGS+7,255-_CIF_ASCE_SECONDARY
- lctlg %c7,%c7,__LC_VDSO_ASCE # load secondary asce
- TSTMSK __LC_CPU_FLAGS,_CIF_ASCE_PRIMARY
- jz .Lsysc_return
-#ifndef CONFIG_HAVE_MARCH_Z10_FEATURES
- tm __LC_STFLE_FAC_LIST+3,0x10 # has MVCOS ?
- jnz .Lsysc_set_fs_fixup
- ni __LC_CPU_FLAGS+7,255-_CIF_ASCE_PRIMARY
- lctlg %c1,%c1,__LC_USER_ASCE # load primary asce
- j .Lsysc_return
-.Lsysc_set_fs_fixup:
-#endif
- larl %r14,.Lsysc_return
- jg set_fs_fixup
-
-#
-# CIF_FPU is set, restore floating-point controls and floating-point registers.
-#
-.Lsysc_vxrs:
- larl %r14,.Lsysc_return
- jg load_fpu_regs
-
-#
-# _TIF_SIGPENDING is set, call do_signal
-#
-.Lsysc_sigpending:
- lgr %r2,%r11 # pass pointer to pt_regs
- brasl %r14,do_signal
- TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL
- jno .Lsysc_return
-.Lsysc_do_syscall:
- lghi %r13,__TASK_thread
- lmg %r2,%r7,__PT_R2(%r11) # load svc arguments
- lghi %r1,0 # svc 0 returns -ENOSYS
- j .Lsysc_do_svc
-
-#
-# _TIF_NOTIFY_RESUME is set, call do_notify_resume
-#
-.Lsysc_notify_resume:
- lgr %r2,%r11 # pass pointer to pt_regs
- larl %r14,.Lsysc_return
- jg do_notify_resume
-
-#
-# _TIF_UPROBE is set, call uprobe_notify_resume
-#
-#ifdef CONFIG_UPROBES
-.Lsysc_uprobe_notify:
- lgr %r2,%r11 # pass pointer to pt_regs
- larl %r14,.Lsysc_return
- jg uprobe_notify_resume
-#endif
-
-#
-# _TIF_GUARDED_STORAGE is set, call guarded_storage_load
-#
-.Lsysc_guarded_storage:
- lgr %r2,%r11 # pass pointer to pt_regs
- larl %r14,.Lsysc_return
- jg gs_load_bc_cb
-#
-# _TIF_PATCH_PENDING is set, call klp_update_patch_state
-#
-#ifdef CONFIG_LIVEPATCH
-.Lsysc_patch_pending:
- lg %r2,__LC_CURRENT # pass pointer to task struct
- larl %r14,.Lsysc_return
- jg klp_update_patch_state
-#endif
-
-#
-# _PIF_PER_TRAP is set, call do_per_trap
-#
-.Lsysc_singlestep:
- ni __PT_FLAGS+7(%r11),255-_PIF_PER_TRAP
- lgr %r2,%r11 # pass pointer to pt_regs
- larl %r14,.Lsysc_return
- jg do_per_trap
-
-#
-# _PIF_SYSCALL_RESTART is set, repeat the current system call
-#
-.Lsysc_syscall_restart:
- ni __PT_FLAGS+7(%r11),255-_PIF_SYSCALL_RESTART
- lmg %r1,%r7,__PT_R1(%r11) # load svc arguments
- lg %r2,__PT_ORIG_GPR2(%r11)
- j .Lsysc_do_svc
-
-#
-# call tracehook_report_syscall_entry/tracehook_report_syscall_exit before
-# and after the system call
-#
-.Lsysc_tracesys:
- lgr %r2,%r11 # pass pointer to pt_regs
- la %r3,0
- llgh %r0,__PT_INT_CODE+2(%r11)
- stg %r0,__PT_R2(%r11)
- brasl %r14,do_syscall_trace_enter
- lghi %r0,NR_syscalls
- clgr %r0,%r2
- jnh .Lsysc_tracenogo
- sllg %r8,%r2,3
- lg %r9,0(%r8,%r10)
-.Lsysc_tracego:
- lmg %r3,%r7,__PT_R3(%r11)
- stg %r7,STACK_FRAME_OVERHEAD(%r15)
- lg %r2,__PT_ORIG_GPR2(%r11)
- BASR_EX %r14,%r9 # call sys_xxx
- stg %r2,__PT_R2(%r11) # store return value
-.Lsysc_tracenogo:
- TSTMSK __TI_flags(%r12),_TIF_TRACE
- jz .Lsysc_return
- lgr %r2,%r11 # pass pointer to pt_regs
- larl %r14,.Lsysc_return
- jg do_syscall_trace_exit
-ENDPROC(system_call)
+ LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE
+SYM_CODE_END(system_call)
#
# a new process exits the kernel with ret_from_fork
#
-ENTRY(ret_from_fork)
- la %r11,STACK_FRAME_OVERHEAD(%r15)
- lg %r12,__LC_CURRENT
- brasl %r14,schedule_tail
- TRACE_IRQS_ON
- ssm __LC_SVC_NEW_PSW # reenable interrupts
- tm __PT_PSW+1(%r11),0x01 # forking a kernel thread ?
- jne .Lsysc_tracenogo
- # it's a kernel thread
- lmg %r9,%r10,__PT_R9(%r11) # load gprs
- la %r2,0(%r10)
- BASR_EX %r14,%r9
- j .Lsysc_tracenogo
-ENDPROC(ret_from_fork)
-
-ENTRY(kernel_thread_starter)
- la %r2,0(%r10)
- BASR_EX %r14,%r9
- j .Lsysc_tracenogo
-ENDPROC(kernel_thread_starter)
+SYM_CODE_START(ret_from_fork)
+ lgr %r3,%r11
+ brasl %r14,__ret_from_fork
+ STACKLEAK_ERASE
+ lctlg %c1,%c1,__LC_USER_ASCE
+ mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15)
+ BPON
+ LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15)
+ lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15)
+ stpt __LC_EXIT_TIMER
+ LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE
+SYM_CODE_END(ret_from_fork)
/*
* Program check handler routine
*/
-ENTRY(pgm_check_handler)
- stpt __LC_SYNC_ENTER_TIMER
+SYM_CODE_START(pgm_check_handler)
+ stpt __LC_SYS_ENTER_TIMER
BPOFF
stmg %r8,%r15,__LC_SAVE_AREA_SYNC
- lg %r10,__LC_LAST_BREAK
- lg %r12,__LC_CURRENT
- lghi %r11,0
- larl %r13,cleanup_critical
+ lghi %r10,0
lmg %r8,%r9,__LC_PGM_OLD_PSW
- tmhh %r8,0x0001 # test problem state bit
- jnz 2f # -> fault in user space
+ tmhh %r8,0x0001 # coming from user space?
+ jno .Lpgm_skip_asce
+ lctlg %c1,%c1,__LC_KERNEL_ASCE
+ j 3f # -> fault in user space
+.Lpgm_skip_asce:
#if IS_ENABLED(CONFIG_KVM)
- # cleanup critical section for program checks in sie64a
- lgr %r14,%r9
- slg %r14,BASED(.Lsie_critical_start)
- clg %r14,BASED(.Lsie_critical_length)
- jhe 0f
- lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer
- ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE
- lctlg %c1,%c1,__LC_USER_ASCE # load primary asce
- larl %r9,sie_exit # skip forward to sie_exit
- lghi %r11,_PIF_GUEST_FAULT
+ # cleanup critical section for program checks in __sie64a
+ OUTSIDE %r9,.Lsie_gmap,.Lsie_done,1f
+ BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST
+ SIEEXIT
+ lghi %r10,_PIF_GUEST_FAULT
#endif
-0: tmhh %r8,0x4000 # PER bit set in old PSW ?
- jnz 1f # -> enabled, can't be a double fault
+1: tmhh %r8,0x4000 # PER bit set in old PSW ?
+ jnz 2f # -> enabled, can't be a double fault
tm __LC_PGM_ILC+3,0x80 # check for per exception
jnz .Lpgm_svcper # -> single stepped svc
-1: CHECK_STACK __LC_SAVE_AREA_SYNC
+2: CHECK_STACK __LC_SAVE_AREA_SYNC
aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE)
# CHECK_VMAP_STACK branches to stack_overflow or 4f
CHECK_VMAP_STACK __LC_SAVE_AREA_SYNC,4f
-2: UPDATE_VTIME %r14,%r15,__LC_SYNC_ENTER_TIMER
- BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP
- lg %r15,__LC_KERNEL_STACK
- lgr %r14,%r12
- aghi %r14,__TASK_thread # pointer to thread_struct
- lghi %r13,__LC_PGM_TDB
- tm __LC_PGM_ILC+2,0x02 # check for transaction abort
- jz 3f
- mvc __THREAD_trap_tdb(256,%r14),0(%r13)
-3: stg %r10,__THREAD_last_break(%r14)
-4: lgr %r13,%r11
- la %r11,STACK_FRAME_OVERHEAD(%r15)
+3: lg %r15,__LC_KERNEL_STACK
+4: la %r11,STACK_FRAME_OVERHEAD(%r15)
+ stg %r10,__PT_FLAGS(%r11)
+ xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
stmg %r0,%r7,__PT_R0(%r11)
+ mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC
+ mvc __PT_LAST_BREAK(8,%r11),__LC_PGM_LAST_BREAK
+ stmg %r8,%r9,__PT_PSW(%r11)
+
# clear user controlled registers to prevent speculative use
xgr %r0,%r0
xgr %r1,%r1
- xgr %r2,%r2
xgr %r3,%r3
xgr %r4,%r4
xgr %r5,%r5
xgr %r6,%r6
xgr %r7,%r7
- mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC
- stmg %r8,%r9,__PT_PSW(%r11)
- mvc __PT_INT_CODE(4,%r11),__LC_PGM_ILC
- mvc __PT_INT_PARM_LONG(8,%r11),__LC_TRANS_EXC_CODE
- stg %r13,__PT_FLAGS(%r11)
- stg %r10,__PT_ARGS(%r11)
- tm __LC_PGM_ILC+3,0x80 # check for per exception
- jz 5f
- tmhh %r8,0x0001 # kernel per event ?
- jz .Lpgm_kprobe
- oi __PT_FLAGS+7(%r11),_PIF_PER_TRAP
- mvc __THREAD_per_address(8,%r14),__LC_PER_ADDRESS
- mvc __THREAD_per_cause(2,%r14),__LC_PER_CODE
- mvc __THREAD_per_paid(1,%r14),__LC_PER_ACCESS_ID
-5: REENABLE_IRQS
- xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
- larl %r1,pgm_check_table
- llgh %r10,__PT_INT_CODE+2(%r11)
- nill %r10,0x007f
- sll %r10,3
- je .Lpgm_return
- lg %r9,0(%r10,%r1) # load address of handler routine
- lgr %r2,%r11 # pass pointer to pt_regs
- BASR_EX %r14,%r9 # branch to interrupt-handler
-.Lpgm_return:
- LOCKDEP_SYS_EXIT
- tm __PT_PSW+1(%r11),0x01 # returning to user ?
- jno .Lsysc_restore
- TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL
- jo .Lsysc_do_syscall
- j .Lsysc_tif
-
-#
-# PER event in supervisor state, must be kprobes
-#
-.Lpgm_kprobe:
- REENABLE_IRQS
- xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
- lgr %r2,%r11 # pass pointer to pt_regs
- brasl %r14,do_per_trap
- j .Lpgm_return
+ lgr %r2,%r11
+ brasl %r14,__do_pgm_check
+ tmhh %r8,0x0001 # returning to user space?
+ jno .Lpgm_exit_kernel
+ STACKLEAK_ERASE
+ lctlg %c1,%c1,__LC_USER_ASCE
+ BPON
+ stpt __LC_EXIT_TIMER
+.Lpgm_exit_kernel:
+ mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15)
+ LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15)
+ lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15)
+ LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE
#
# single stepped system call
#
.Lpgm_svcper:
mvc __LC_RETURN_PSW(8),__LC_SVC_NEW_PSW
- lghi %r13,__TASK_thread
larl %r14,.Lsysc_per
stg %r14,__LC_RETURN_PSW+8
- lghi %r14,_PIF_SYSCALL | _PIF_PER_TRAP
- lpswe __LC_RETURN_PSW # branch to .Lsysc_per and enable irqs
-ENDPROC(pgm_check_handler)
+ lghi %r14,1
+ LBEAR __LC_PGM_LAST_BREAK
+ LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE # branch to .Lsysc_per
+SYM_CODE_END(pgm_check_handler)
/*
- * IO interrupt handler routine
+ * Interrupt handler macro used for external and IO interrupts.
*/
-ENTRY(io_int_handler)
- STCK __LC_INT_CLOCK
- stpt __LC_ASYNC_ENTER_TIMER
+.macro INT_HANDLER name,lc_old_psw,handler
+SYM_CODE_START(\name)
+ stckf __LC_INT_CLOCK
+ stpt __LC_SYS_ENTER_TIMER
+ STBEAR __LC_LAST_BREAK
BPOFF
stmg %r8,%r15,__LC_SAVE_AREA_ASYNC
- lg %r12,__LC_CURRENT
- larl %r13,cleanup_critical
- lmg %r8,%r9,__LC_IO_OLD_PSW
- SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_ENTER_TIMER
+ lmg %r8,%r9,\lc_old_psw
+ tmhh %r8,0x0001 # interrupting from user ?
+ jnz 1f
+#if IS_ENABLED(CONFIG_KVM)
+ OUTSIDE %r9,.Lsie_gmap,.Lsie_done,0f
+ BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST
+ SIEEXIT
+#endif
+0: CHECK_STACK __LC_SAVE_AREA_ASYNC
+ aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE)
+ j 2f
+1: lctlg %c1,%c1,__LC_KERNEL_ASCE
+ lg %r15,__LC_KERNEL_STACK
+2: xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
+ la %r11,STACK_FRAME_OVERHEAD(%r15)
stmg %r0,%r7,__PT_R0(%r11)
# clear user controlled registers to prevent speculative use
xgr %r0,%r0
xgr %r1,%r1
- xgr %r2,%r2
xgr %r3,%r3
xgr %r4,%r4
xgr %r5,%r5
xgr %r6,%r6
xgr %r7,%r7
xgr %r10,%r10
+ xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11)
mvc __PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC
+ MBEAR %r11
stmg %r8,%r9,__PT_PSW(%r11)
- mvc __PT_INT_CODE(12,%r11),__LC_SUBCHANNEL_ID
- xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11)
- TSTMSK __LC_CPU_FLAGS,_CIF_IGNORE_IRQ
- jo .Lio_restore
- TRACE_IRQS_OFF
- xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
-.Lio_loop:
lgr %r2,%r11 # pass pointer to pt_regs
- lghi %r3,IO_INTERRUPT
- tm __PT_INT_CODE+8(%r11),0x80 # adapter interrupt ?
- jz .Lio_call
- lghi %r3,THIN_INTERRUPT
-.Lio_call:
- brasl %r14,do_IRQ
- TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_LPAR
- jz .Lio_return
- tpi 0
- jz .Lio_return
- mvc __PT_INT_CODE(12,%r11),__LC_SUBCHANNEL_ID
- j .Lio_loop
-.Lio_return:
- LOCKDEP_SYS_EXIT
- TRACE_IRQS_ON
-.Lio_tif:
- TSTMSK __TI_flags(%r12),_TIF_WORK
- jnz .Lio_work # there is work to do (signals etc.)
- TSTMSK __LC_CPU_FLAGS,_CIF_WORK
- jnz .Lio_work
-.Lio_restore:
- lg %r14,__LC_VDSO_PER_CPU
- lmg %r0,%r10,__PT_R0(%r11)
+ brasl %r14,\handler
mvc __LC_RETURN_PSW(16),__PT_PSW(%r11)
- tm __PT_PSW+1(%r11),0x01 # returning to user ?
- jno .Lio_exit_kernel
- BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP
-.Lio_exit_timer:
+ tmhh %r8,0x0001 # returning to user ?
+ jno 2f
+ STACKLEAK_ERASE
+ lctlg %c1,%c1,__LC_USER_ASCE
+ BPON
stpt __LC_EXIT_TIMER
- mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER
-.Lio_exit_kernel:
- lmg %r11,%r15,__PT_R11(%r11)
- lpswe __LC_RETURN_PSW
-.Lio_done:
-
-#
-# There is work todo, find out in which context we have been interrupted:
-# 1) if we return to user space we can do all _TIF_WORK work
-# 2) if we return to kernel code and kvm is enabled check if we need to
-# modify the psw to leave SIE
-# 3) if we return to kernel code and preemptive scheduling is enabled check
-# the preemption counter and if it is zero call preempt_schedule_irq
-# Before any work can be done, a switch to the kernel stack is required.
-#
-.Lio_work:
- tm __PT_PSW+1(%r11),0x01 # returning to user ?
- jo .Lio_work_user # yes -> do resched & signal
-#ifdef CONFIG_PREEMPT
- # check for preemptive scheduling
- icm %r0,15,__LC_PREEMPT_COUNT
- jnz .Lio_restore # preemption is disabled
- TSTMSK __TI_flags(%r12),_TIF_NEED_RESCHED
- jno .Lio_restore
- # switch to kernel stack
- lg %r1,__PT_R15(%r11)
- aghi %r1,-(STACK_FRAME_OVERHEAD + __PT_SIZE)
- mvc STACK_FRAME_OVERHEAD(__PT_SIZE,%r1),0(%r11)
- xc __SF_BACKCHAIN(8,%r1),__SF_BACKCHAIN(%r1)
- la %r11,STACK_FRAME_OVERHEAD(%r1)
- lgr %r15,%r1
- # TRACE_IRQS_ON already done at .Lio_return, call
- # TRACE_IRQS_OFF to keep things symmetrical
- TRACE_IRQS_OFF
- brasl %r14,preempt_schedule_irq
- j .Lio_return
-#else
- j .Lio_restore
-#endif
-
-#
-# Need to do work before returning to userspace, switch to kernel stack
-#
-.Lio_work_user:
- lg %r1,__LC_KERNEL_STACK
- mvc STACK_FRAME_OVERHEAD(__PT_SIZE,%r1),0(%r11)
- xc __SF_BACKCHAIN(8,%r1),__SF_BACKCHAIN(%r1)
- la %r11,STACK_FRAME_OVERHEAD(%r1)
- lgr %r15,%r1
-
-#
-# One of the work bits is on. Find out which one.
-#
-.Lio_work_tif:
- TSTMSK __LC_CPU_FLAGS,_CIF_MCCK_PENDING
- jo .Lio_mcck_pending
- TSTMSK __TI_flags(%r12),_TIF_NEED_RESCHED
- jo .Lio_reschedule
-#ifdef CONFIG_LIVEPATCH
- TSTMSK __TI_flags(%r12),_TIF_PATCH_PENDING
- jo .Lio_patch_pending
-#endif
- TSTMSK __TI_flags(%r12),_TIF_SIGPENDING
- jo .Lio_sigpending
- TSTMSK __TI_flags(%r12),_TIF_NOTIFY_RESUME
- jo .Lio_notify_resume
- TSTMSK __TI_flags(%r12),_TIF_GUARDED_STORAGE
- jo .Lio_guarded_storage
- TSTMSK __LC_CPU_FLAGS,_CIF_FPU
- jo .Lio_vxrs
- TSTMSK __LC_CPU_FLAGS,(_CIF_ASCE_PRIMARY|_CIF_ASCE_SECONDARY)
- jnz .Lio_asce
- j .Lio_return # beware of critical section cleanup
-
-#
-# _CIF_MCCK_PENDING is set, call handler
-#
-.Lio_mcck_pending:
- # TRACE_IRQS_ON already done at .Lio_return
- brasl %r14,s390_handle_mcck # TIF bit will be cleared by handler
- TRACE_IRQS_OFF
- j .Lio_return
+2: LBEAR __PT_LAST_BREAK(%r11)
+ lmg %r0,%r15,__PT_R0(%r11)
+ LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE
+SYM_CODE_END(\name)
+.endm
-#
-# _CIF_ASCE_PRIMARY and/or CIF_ASCE_SECONDARY set, load user space asce
-#
-.Lio_asce:
- ni __LC_CPU_FLAGS+7,255-_CIF_ASCE_SECONDARY
- lctlg %c7,%c7,__LC_VDSO_ASCE # load secondary asce
- TSTMSK __LC_CPU_FLAGS,_CIF_ASCE_PRIMARY
- jz .Lio_return
-#ifndef CONFIG_HAVE_MARCH_Z10_FEATURES
- tm __LC_STFLE_FAC_LIST+3,0x10 # has MVCOS ?
- jnz .Lio_set_fs_fixup
- ni __LC_CPU_FLAGS+7,255-_CIF_ASCE_PRIMARY
- lctlg %c1,%c1,__LC_USER_ASCE # load primary asce
- j .Lio_return
-.Lio_set_fs_fixup:
-#endif
- larl %r14,.Lio_return
- jg set_fs_fixup
-
-#
-# CIF_FPU is set, restore floating-point controls and floating-point registers.
-#
-.Lio_vxrs:
- larl %r14,.Lio_return
- jg load_fpu_regs
-
-#
-# _TIF_GUARDED_STORAGE is set, call guarded_storage_load
-#
-.Lio_guarded_storage:
- # TRACE_IRQS_ON already done at .Lio_return
- ssm __LC_SVC_NEW_PSW # reenable interrupts
- lgr %r2,%r11 # pass pointer to pt_regs
- brasl %r14,gs_load_bc_cb
- ssm __LC_PGM_NEW_PSW # disable I/O and ext. interrupts
- TRACE_IRQS_OFF
- j .Lio_return
-
-#
-# _TIF_NEED_RESCHED is set, call schedule
-#
-.Lio_reschedule:
- # TRACE_IRQS_ON already done at .Lio_return
- ssm __LC_SVC_NEW_PSW # reenable interrupts
- brasl %r14,schedule # call scheduler
- ssm __LC_PGM_NEW_PSW # disable I/O and ext. interrupts
- TRACE_IRQS_OFF
- j .Lio_return
-
-#
-# _TIF_PATCH_PENDING is set, call klp_update_patch_state
-#
-#ifdef CONFIG_LIVEPATCH
-.Lio_patch_pending:
- lg %r2,__LC_CURRENT # pass pointer to task struct
- larl %r14,.Lio_return
- jg klp_update_patch_state
-#endif
-
-#
-# _TIF_SIGPENDING or is set, call do_signal
-#
-.Lio_sigpending:
- # TRACE_IRQS_ON already done at .Lio_return
- ssm __LC_SVC_NEW_PSW # reenable interrupts
- lgr %r2,%r11 # pass pointer to pt_regs
- brasl %r14,do_signal
- ssm __LC_PGM_NEW_PSW # disable I/O and ext. interrupts
- TRACE_IRQS_OFF
- j .Lio_return
-
-#
-# _TIF_NOTIFY_RESUME or is set, call do_notify_resume
-#
-.Lio_notify_resume:
- # TRACE_IRQS_ON already done at .Lio_return
- ssm __LC_SVC_NEW_PSW # reenable interrupts
- lgr %r2,%r11 # pass pointer to pt_regs
- brasl %r14,do_notify_resume
- ssm __LC_PGM_NEW_PSW # disable I/O and ext. interrupts
- TRACE_IRQS_OFF
- j .Lio_return
-ENDPROC(io_int_handler)
+INT_HANDLER ext_int_handler,__LC_EXT_OLD_PSW,do_ext_irq
+INT_HANDLER io_int_handler,__LC_IO_OLD_PSW,do_io_irq
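The two instantiations above differ only in the old-PSW lowcore slot they read and the C routine they branch to; the stack selection, register save and clear, BEAR handling and the LPSWEY exit all come from the shared macro body. Below is a stand-alone C analogy of that parameterization, purely illustrative: the lowcore indices and handler bodies are made up, only the do_ext_irq()/do_io_irq() names match the prototypes this series adds to entry.h further down.

#include <stdio.h>

struct pt_regs { unsigned long psw; };

enum { __LC_EXT_OLD_PSW, __LC_IO_OLD_PSW };   /* stand-ins for lowcore offsets */

static void do_ext_irq(struct pt_regs *regs) { printf("ext, psw=%lx\n", regs->psw); }
static void do_io_irq(struct pt_regs *regs)  { printf("io,  psw=%lx\n", regs->psw); }

/* One macro body, two handlers: mirrors what INT_HANDLER does in assembly. */
#define INT_HANDLER(name, old_psw_slot, handler)                        \
        static void name(const unsigned long *lowcore)                  \
        {                                                               \
                struct pt_regs regs = { .psw = lowcore[old_psw_slot] }; \
                handler(&regs);                                         \
        }

INT_HANDLER(ext_int_handler, __LC_EXT_OLD_PSW, do_ext_irq)
INT_HANDLER(io_int_handler,  __LC_IO_OLD_PSW,  do_io_irq)

int main(void)
{
        unsigned long lowcore[2] = { 0x1, 0x2 };

        ext_int_handler(lowcore);
        io_int_handler(lowcore);
        return 0;
}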
/*
- * External interrupt handler routine
+ * Load idle PSW.
*/
-ENTRY(ext_int_handler)
- STCK __LC_INT_CLOCK
- stpt __LC_ASYNC_ENTER_TIMER
- BPOFF
- stmg %r8,%r15,__LC_SAVE_AREA_ASYNC
- lg %r12,__LC_CURRENT
- larl %r13,cleanup_critical
- lmg %r8,%r9,__LC_EXT_OLD_PSW
- SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_ENTER_TIMER
- stmg %r0,%r7,__PT_R0(%r11)
- # clear user controlled registers to prevent speculative use
- xgr %r0,%r0
- xgr %r1,%r1
- xgr %r2,%r2
- xgr %r3,%r3
- xgr %r4,%r4
- xgr %r5,%r5
- xgr %r6,%r6
- xgr %r7,%r7
- xgr %r10,%r10
- mvc __PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC
- stmg %r8,%r9,__PT_PSW(%r11)
- lghi %r1,__LC_EXT_PARAMS2
- mvc __PT_INT_CODE(4,%r11),__LC_EXT_CPU_ADDR
- mvc __PT_INT_PARM(4,%r11),__LC_EXT_PARAMS
- mvc __PT_INT_PARM_LONG(8,%r11),0(%r1)
- xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11)
- TSTMSK __LC_CPU_FLAGS,_CIF_IGNORE_IRQ
- jo .Lio_restore
- TRACE_IRQS_OFF
- xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
- lgr %r2,%r11 # pass pointer to pt_regs
- lghi %r3,EXT_INTERRUPT
- brasl %r14,do_IRQ
- j .Lio_return
-ENDPROC(ext_int_handler)
-
-/*
- * Load idle PSW. The second "half" of this function is in .Lcleanup_idle.
- */
-ENTRY(psw_idle)
+SYM_FUNC_START(psw_idle)
+ stg %r14,(__SF_GPRS+8*8)(%r15)
stg %r3,__SF_EMPTY(%r15)
- larl %r1,.Lpsw_idle_lpsw+4
+ larl %r1,psw_idle_exit
stg %r1,__SF_EMPTY+8(%r15)
larl %r1,smp_cpu_mtid
llgf %r1,0(%r1)
ltgr %r1,%r1
jz .Lpsw_idle_stcctm
- .insn rsy,0xeb0000000017,%r1,5,__SF_EMPTY+16(%r15)
+ .insn rsy,0xeb0000000017,%r1,5,__MT_CYCLES_ENTER(%r2)
.Lpsw_idle_stcctm:
oi __LC_CPU_FLAGS+7,_CIF_ENABLED_WAIT
BPON
- STCK __CLOCK_IDLE_ENTER(%r2)
+ stckf __CLOCK_IDLE_ENTER(%r2)
stpt __TIMER_IDLE_ENTER(%r2)
-.Lpsw_idle_lpsw:
lpswe __SF_EMPTY(%r15)
+SYM_INNER_LABEL(psw_idle_exit, SYM_L_GLOBAL)
BR_EX %r14
-.Lpsw_idle_end:
-ENDPROC(psw_idle)
-
-/*
- * Store floating-point controls and floating-point or vector register
- * depending whether the vector facility is available. A critical section
- * cleanup assures that the registers are stored even if interrupted for
- * some other work. The CIF_FPU flag is set to trigger a lazy restore
- * of the register contents at return from io or a system call.
- */
-ENTRY(save_fpu_regs)
- lg %r2,__LC_CURRENT
- aghi %r2,__TASK_thread
- TSTMSK __LC_CPU_FLAGS,_CIF_FPU
- jo .Lsave_fpu_regs_exit
- stfpc __THREAD_FPU_fpc(%r2)
- lg %r3,__THREAD_FPU_regs(%r2)
- TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_VX
- jz .Lsave_fpu_regs_fp # no -> store FP regs
- VSTM %v0,%v15,0,%r3 # vstm 0,15,0(3)
- VSTM %v16,%v31,256,%r3 # vstm 16,31,256(3)
- j .Lsave_fpu_regs_done # -> set CIF_FPU flag
-.Lsave_fpu_regs_fp:
- std 0,0(%r3)
- std 1,8(%r3)
- std 2,16(%r3)
- std 3,24(%r3)
- std 4,32(%r3)
- std 5,40(%r3)
- std 6,48(%r3)
- std 7,56(%r3)
- std 8,64(%r3)
- std 9,72(%r3)
- std 10,80(%r3)
- std 11,88(%r3)
- std 12,96(%r3)
- std 13,104(%r3)
- std 14,112(%r3)
- std 15,120(%r3)
-.Lsave_fpu_regs_done:
- oi __LC_CPU_FLAGS+7,_CIF_FPU
-.Lsave_fpu_regs_exit:
- BR_EX %r14
-.Lsave_fpu_regs_end:
-ENDPROC(save_fpu_regs)
-EXPORT_SYMBOL(save_fpu_regs)
-
-/*
- * Load floating-point controls and floating-point or vector registers.
- * A critical section cleanup assures that the register contents are
- * loaded even if interrupted for some other work.
- *
- * There are special calling conventions to fit into sysc and io return work:
- * %r15: <kernel stack>
- * The function requires:
- * %r4
- */
-load_fpu_regs:
- lg %r4,__LC_CURRENT
- aghi %r4,__TASK_thread
- TSTMSK __LC_CPU_FLAGS,_CIF_FPU
- jno .Lload_fpu_regs_exit
- lfpc __THREAD_FPU_fpc(%r4)
- TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_VX
- lg %r4,__THREAD_FPU_regs(%r4) # %r4 <- reg save area
- jz .Lload_fpu_regs_fp # -> no VX, load FP regs
- VLM %v0,%v15,0,%r4
- VLM %v16,%v31,256,%r4
- j .Lload_fpu_regs_done
-.Lload_fpu_regs_fp:
- ld 0,0(%r4)
- ld 1,8(%r4)
- ld 2,16(%r4)
- ld 3,24(%r4)
- ld 4,32(%r4)
- ld 5,40(%r4)
- ld 6,48(%r4)
- ld 7,56(%r4)
- ld 8,64(%r4)
- ld 9,72(%r4)
- ld 10,80(%r4)
- ld 11,88(%r4)
- ld 12,96(%r4)
- ld 13,104(%r4)
- ld 14,112(%r4)
- ld 15,120(%r4)
-.Lload_fpu_regs_done:
- ni __LC_CPU_FLAGS+7,255-_CIF_FPU
-.Lload_fpu_regs_exit:
- BR_EX %r14
-.Lload_fpu_regs_end:
-ENDPROC(load_fpu_regs)
-
-.L__critical_end:
+SYM_FUNC_END(psw_idle)
/*
* Machine check handler routines
*/
-ENTRY(mcck_int_handler)
- STCK __LC_MCCK_CLOCK
+SYM_CODE_START(mcck_int_handler)
BPOFF
la %r1,4095 # validate r1
spt __LC_CPU_TIMER_SAVE_AREA-4095(%r1) # validate cpu timer
- sckc __LC_CLOCK_COMPARATOR # validate comparator
- lam %a0,%a15,__LC_AREGS_SAVE_AREA-4095(%r1) # validate acrs
- lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)# validate gprs
- lg %r12,__LC_CURRENT
- larl %r13,cleanup_critical
+ LBEAR __LC_LAST_BREAK_SAVE_AREA-4095(%r1) # validate bear
+ lmg %r0,%r15,__LC_GPREGS_SAVE_AREA # validate gprs
lmg %r8,%r9,__LC_MCK_OLD_PSW
TSTMSK __LC_MCCK_CODE,MCCK_CODE_SYSTEM_DAMAGE
jo .Lmcck_panic # yes -> rest of mcck code invalid
TSTMSK __LC_MCCK_CODE,MCCK_CODE_CR_VALID
jno .Lmcck_panic # control registers invalid -> panic
- la %r14,4095
- lctlg %c0,%c15,__LC_CREGS_SAVE_AREA-4095(%r14) # validate ctl regs
+ lctlg %c0,%c15,__LC_CREGS_SAVE_AREA # validate ctl regs
ptlb
- lg %r11,__LC_MCESAD-4095(%r14) # extended machine check save area
- nill %r11,0xfc00 # MCESA_ORIGIN_MASK
- TSTMSK __LC_CREGS_SAVE_AREA+16-4095(%r14),CR2_GUARDED_STORAGE
- jno 0f
- TSTMSK __LC_MCCK_CODE,MCCK_CODE_GS_VALID
- jno 0f
- .insn rxy,0xe3000000004d,0,__MCESA_GS_SAVE_AREA(%r11) # LGSC
-0: l %r14,__LC_FP_CREG_SAVE_AREA-4095(%r14)
- TSTMSK __LC_MCCK_CODE,MCCK_CODE_FC_VALID
- jo 0f
- sr %r14,%r14
-0: sfpc %r14
- TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_VX
- jo 0f
- lghi %r14,__LC_FPREGS_SAVE_AREA
- ld %f0,0(%r14)
- ld %f1,8(%r14)
- ld %f2,16(%r14)
- ld %f3,24(%r14)
- ld %f4,32(%r14)
- ld %f5,40(%r14)
- ld %f6,48(%r14)
- ld %f7,56(%r14)
- ld %f8,64(%r14)
- ld %f9,72(%r14)
- ld %f10,80(%r14)
- ld %f11,88(%r14)
- ld %f12,96(%r14)
- ld %f13,104(%r14)
- ld %f14,112(%r14)
- ld %f15,120(%r14)
- j 1f
-0: VLM %v0,%v15,0,%r11
- VLM %v16,%v31,256,%r11
-1: lghi %r14,__LC_CPU_TIMER_SAVE_AREA
+ lghi %r14,__LC_CPU_TIMER_SAVE_AREA
mvc __LC_MCCK_ENTER_TIMER(8),0(%r14)
TSTMSK __LC_MCCK_CODE,MCCK_CODE_CPU_TIMER_VALID
jo 3f
- la %r14,__LC_SYNC_ENTER_TIMER
- clc 0(8,%r14),__LC_ASYNC_ENTER_TIMER
- jl 0f
- la %r14,__LC_ASYNC_ENTER_TIMER
-0: clc 0(8,%r14),__LC_EXIT_TIMER
+ la %r14,__LC_SYS_ENTER_TIMER
+ clc 0(8,%r14),__LC_EXIT_TIMER
jl 1f
la %r14,__LC_EXIT_TIMER
1: clc 0(8,%r14),__LC_LAST_UPDATE_TIMER
@@ -1168,18 +516,27 @@ ENTRY(mcck_int_handler)
3: TSTMSK __LC_MCCK_CODE,MCCK_CODE_PSW_MWP_VALID
jno .Lmcck_panic
tmhh %r8,0x0001 # interrupting from user ?
- jnz 4f
+ jnz .Lmcck_user
TSTMSK __LC_MCCK_CODE,MCCK_CODE_PSW_IA_VALID
jno .Lmcck_panic
-4: ssm __LC_PGM_NEW_PSW # turn dat on, keep irqs off
- SWITCH_ASYNC __LC_GPREGS_SAVE_AREA+64,__LC_MCCK_ENTER_TIMER
-.Lmcck_skip:
+#if IS_ENABLED(CONFIG_KVM)
+ OUTSIDE %r9,.Lsie_gmap,.Lsie_done,.Lmcck_user
+ OUTSIDE %r9,.Lsie_entry,.Lsie_leave,4f
+ oi __LC_CPU_FLAGS+7, _CIF_MCCK_GUEST
+4: BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST
+ SIEEXIT
+#endif
+.Lmcck_user:
+ lg %r15,__LC_MCCK_STACK
+ la %r11,STACK_FRAME_OVERHEAD(%r15)
+ stctg %c1,%c1,__PT_CR1(%r11)
+ lctlg %c1,%c1,__LC_KERNEL_ASCE
+ xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
lghi %r14,__LC_GPREGS_SAVE_AREA+64
stmg %r0,%r7,__PT_R0(%r11)
# clear user controlled registers to prevent speculative use
xgr %r0,%r0
xgr %r1,%r1
- xgr %r2,%r2
xgr %r3,%r3
xgr %r4,%r4
xgr %r5,%r5
@@ -1192,42 +549,57 @@ ENTRY(mcck_int_handler)
xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
lgr %r2,%r11 # pass pointer to pt_regs
brasl %r14,s390_do_machine_check
- tm __PT_PSW+1(%r11),0x01 # returning to user ?
- jno .Lmcck_return
- lg %r1,__LC_KERNEL_STACK # switch to kernel stack
- mvc STACK_FRAME_OVERHEAD(__PT_SIZE,%r1),0(%r11)
- xc __SF_BACKCHAIN(8,%r1),__SF_BACKCHAIN(%r1)
- la %r11,STACK_FRAME_OVERHEAD(%r1)
- lgr %r15,%r1
- TSTMSK __LC_CPU_FLAGS,_CIF_MCCK_PENDING
- jno .Lmcck_return
- TRACE_IRQS_OFF
- brasl %r14,s390_handle_mcck
- TRACE_IRQS_ON
-.Lmcck_return:
- lg %r14,__LC_VDSO_PER_CPU
+ lctlg %c1,%c1,__PT_CR1(%r11)
lmg %r0,%r10,__PT_R0(%r11)
mvc __LC_RETURN_MCCK_PSW(16),__PT_PSW(%r11) # move return PSW
tm __LC_RETURN_MCCK_PSW+1,0x01 # returning to user ?
jno 0f
- BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP
+ BPON
stpt __LC_EXIT_TIMER
- mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER
-0: lmg %r11,%r15,__PT_R11(%r11)
- lpswe __LC_RETURN_MCCK_PSW
+0: ALTERNATIVE "nop", __stringify(lghi %r12,__LC_LAST_BREAK_SAVE_AREA),193
+ LBEAR 0(%r12)
+ lmg %r11,%r15,__PT_R11(%r11)
+ LPSWEY __LC_RETURN_MCCK_PSW,__LC_RETURN_MCCK_LPSWE
.Lmcck_panic:
- lg %r15,__LC_NODAT_STACK
- la %r11,STACK_FRAME_OVERHEAD(%r15)
- j .Lmcck_skip
-ENDPROC(mcck_int_handler)
-
-#
-# PSW restart interrupt handler
-#
-ENTRY(restart_int_handler)
- ALTERNATIVE "", ".insn s,0xb2800000,_LPP_OFFSET", 40
+ /*
+ * Iterate over all possible CPU addresses in the range 0..0xffff
+ * and stop each CPU using signal processor. Use compare and swap
+ * to allow just one CPU-stopper and prevent concurrent CPUs from
+ * stopping each other while leaving the others running.
+ */
+ lhi %r5,0
+ lhi %r6,1
+ larl %r7,stop_lock
+ cs %r5,%r6,0(%r7) # single CPU-stopper only
+ jnz 4f
+ larl %r7,this_cpu
+ stap 0(%r7) # this CPU address
+ lh %r4,0(%r7)
+ nilh %r4,0
+ lhi %r0,1
+ sll %r0,16 # CPU counter
+ lhi %r3,0 # next CPU address
+0: cr %r3,%r4
+ je 2f
+1: sigp %r1,%r3,SIGP_STOP # stop next CPU
+ brc SIGP_CC_BUSY,1b
+2: ahi %r3,1
+ brct %r0,0b
+3: sigp %r1,%r4,SIGP_STOP # stop this CPU
+ brc SIGP_CC_BUSY,3b
+4: j 4b
+SYM_CODE_END(mcck_int_handler)
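The stop_lock/this_cpu variables defined further below in the .data section back this panic path: the compare-and-swap on stop_lock elects exactly one CPU-stopper, which walks all possible CPU addresses, stops each one with sigp, and finally stops itself. A stand-alone sketch of the same election pattern, using C11 atomics in place of cs and a print in place of sigp, purely illustrative:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int stop_lock;

static void stop_all_cpus(int self, int ncpus)
{
        int expected = 0;

        /* cs on stop_lock in the real code: only the first caller wins. */
        if (!atomic_compare_exchange_strong(&stop_lock, &expected, 1))
                return;                 /* another CPU is already the stopper */

        for (int cpu = 0; cpu < ncpus; cpu++) {
                if (cpu == self)
                        continue;
                printf("sigp stop -> cpu %d\n", cpu);
        }
        printf("sigp stop -> cpu %d (self, last)\n", self);
}

int main(void)
{
        stop_all_cpus(2, 4);
        return 0;
}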
+
+SYM_CODE_START(restart_int_handler)
+ ALTERNATIVE "nop", "lpp _LPP_OFFSET", 40
stg %r15,__LC_SAVE_AREA_RESTART
+ TSTMSK __LC_RESTART_FLAGS,RESTART_FLAG_CTLREGS,4
+ jz 0f
+ lctlg %c0,%c15,__LC_CREGS_SAVE_AREA
+0: larl %r15,daton_psw
+ lpswe 0(%r15) # turn dat on, keep irqs off
+.Ldaton:
lg %r15,__LC_RESTART_STACK
xc STACK_FRAME_OVERHEAD(__PT_SIZE,%r15),STACK_FRAME_OVERHEAD(%r15)
stmg %r0,%r14,STACK_FRAME_OVERHEAD+__PT_R0(%r15)
@@ -1236,7 +608,7 @@ ENTRY(restart_int_handler)
xc 0(STACK_FRAME_OVERHEAD,%r15),0(%r15)
lg %r1,__LC_RESTART_FN # load fn, parm & source cpu
lg %r2,__LC_RESTART_DATA
- lg %r3,__LC_RESTART_SOURCE
+ lgf %r3,__LC_RESTART_SOURCE
ltgr %r3,%r3 # test source cpu address
jm 1f # negative -> skip source stop
0: sigp %r4,%r3,SIGP_SENSE # sigp sense to source cpu
@@ -1247,7 +619,7 @@ ENTRY(restart_int_handler)
2: sigp %r4,%r3,SIGP_STOP # sigp stop to current cpu
brc 2,2b
3: j 3b
-ENDPROC(restart_int_handler)
+SYM_CODE_END(restart_int_handler)
.section .kprobes.text, "ax"
@@ -1257,7 +629,7 @@ ENDPROC(restart_int_handler)
* No need to properly save the registers, we are going to panic anyway.
* Setup a pt_regs so that show_trace can provide a good call trace.
*/
-ENTRY(stack_overflow)
+SYM_CODE_START(stack_overflow)
lg %r15,__LC_NODAT_STACK # change to panic stack
la %r11,STACK_FRAME_OVERHEAD(%r15)
stmg %r0,%r7,__PT_R0(%r11)
@@ -1267,278 +639,31 @@ ENTRY(stack_overflow)
xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
lgr %r2,%r11 # pass pointer to pt_regs
jg kernel_stack_overflow
-ENDPROC(stack_overflow)
+SYM_CODE_END(stack_overflow)
#endif
-ENTRY(cleanup_critical)
-#if IS_ENABLED(CONFIG_KVM)
- clg %r9,BASED(.Lcleanup_table_sie) # .Lsie_gmap
- jl 0f
- clg %r9,BASED(.Lcleanup_table_sie+8)# .Lsie_done
- jl .Lcleanup_sie
-#endif
- clg %r9,BASED(.Lcleanup_table) # system_call
- jl 0f
- clg %r9,BASED(.Lcleanup_table+8) # .Lsysc_do_svc
- jl .Lcleanup_system_call
- clg %r9,BASED(.Lcleanup_table+16) # .Lsysc_tif
- jl 0f
- clg %r9,BASED(.Lcleanup_table+24) # .Lsysc_restore
- jl .Lcleanup_sysc_tif
- clg %r9,BASED(.Lcleanup_table+32) # .Lsysc_done
- jl .Lcleanup_sysc_restore
- clg %r9,BASED(.Lcleanup_table+40) # .Lio_tif
- jl 0f
- clg %r9,BASED(.Lcleanup_table+48) # .Lio_restore
- jl .Lcleanup_io_tif
- clg %r9,BASED(.Lcleanup_table+56) # .Lio_done
- jl .Lcleanup_io_restore
- clg %r9,BASED(.Lcleanup_table+64) # psw_idle
- jl 0f
- clg %r9,BASED(.Lcleanup_table+72) # .Lpsw_idle_end
- jl .Lcleanup_idle
- clg %r9,BASED(.Lcleanup_table+80) # save_fpu_regs
- jl 0f
- clg %r9,BASED(.Lcleanup_table+88) # .Lsave_fpu_regs_end
- jl .Lcleanup_save_fpu_regs
- clg %r9,BASED(.Lcleanup_table+96) # load_fpu_regs
- jl 0f
- clg %r9,BASED(.Lcleanup_table+104) # .Lload_fpu_regs_end
- jl .Lcleanup_load_fpu_regs
-0: BR_EX %r14,%r11
-ENDPROC(cleanup_critical)
-
- .align 8
-.Lcleanup_table:
- .quad system_call
- .quad .Lsysc_do_svc
- .quad .Lsysc_tif
- .quad .Lsysc_restore
- .quad .Lsysc_done
- .quad .Lio_tif
- .quad .Lio_restore
- .quad .Lio_done
- .quad psw_idle
- .quad .Lpsw_idle_end
- .quad save_fpu_regs
- .quad .Lsave_fpu_regs_end
- .quad load_fpu_regs
- .quad .Lload_fpu_regs_end
-
-#if IS_ENABLED(CONFIG_KVM)
-.Lcleanup_table_sie:
- .quad .Lsie_gmap
- .quad .Lsie_done
-
-.Lcleanup_sie:
- cghi %r11,__LC_SAVE_AREA_ASYNC #Is this in normal interrupt?
- je 1f
- slg %r9,BASED(.Lsie_crit_mcck_start)
- clg %r9,BASED(.Lsie_crit_mcck_length)
- jh 1f
- oi __LC_CPU_FLAGS+7, _CIF_MCCK_GUEST
-1: BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
- lg %r9,__SF_SIE_CONTROL(%r15) # get control block pointer
- ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE
- lctlg %c1,%c1,__LC_USER_ASCE # load primary asce
- larl %r9,sie_exit # skip forward to sie_exit
- BR_EX %r14,%r11
-#endif
+ .section .data, "aw"
+ .balign 4
+SYM_DATA_LOCAL(stop_lock, .long 0)
+SYM_DATA_LOCAL(this_cpu, .short 0)
+ .balign 8
+SYM_DATA_START_LOCAL(daton_psw)
+ .quad PSW_KERNEL_BITS
+ .quad .Ldaton
+SYM_DATA_END(daton_psw)
-.Lcleanup_system_call:
- # check if stpt has been executed
- clg %r9,BASED(.Lcleanup_system_call_insn)
- jh 0f
- mvc __LC_SYNC_ENTER_TIMER(8),__LC_ASYNC_ENTER_TIMER
- cghi %r11,__LC_SAVE_AREA_ASYNC
- je 0f
- mvc __LC_SYNC_ENTER_TIMER(8),__LC_MCCK_ENTER_TIMER
-0: # check if stmg has been executed
- clg %r9,BASED(.Lcleanup_system_call_insn+8)
- jh 0f
- mvc __LC_SAVE_AREA_SYNC(64),0(%r11)
-0: # check if base register setup + TIF bit load has been done
- clg %r9,BASED(.Lcleanup_system_call_insn+16)
- jhe 0f
- # set up saved register r12 task struct pointer
- stg %r12,32(%r11)
- # set up saved register r13 __TASK_thread offset
- mvc 40(8,%r11),BASED(.Lcleanup_system_call_const)
-0: # check if the user time update has been done
- clg %r9,BASED(.Lcleanup_system_call_insn+24)
- jh 0f
- lg %r15,__LC_EXIT_TIMER
- slg %r15,__LC_SYNC_ENTER_TIMER
- alg %r15,__LC_USER_TIMER
- stg %r15,__LC_USER_TIMER
-0: # check if the system time update has been done
- clg %r9,BASED(.Lcleanup_system_call_insn+32)
- jh 0f
- lg %r15,__LC_LAST_UPDATE_TIMER
- slg %r15,__LC_EXIT_TIMER
- alg %r15,__LC_SYSTEM_TIMER
- stg %r15,__LC_SYSTEM_TIMER
-0: # update accounting time stamp
- mvc __LC_LAST_UPDATE_TIMER(8),__LC_SYNC_ENTER_TIMER
- BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP
- # set up saved register r11
- lg %r15,__LC_KERNEL_STACK
- la %r9,STACK_FRAME_OVERHEAD(%r15)
- stg %r9,24(%r11) # r11 pt_regs pointer
- # fill pt_regs
- mvc __PT_R8(64,%r9),__LC_SAVE_AREA_SYNC
- stmg %r0,%r7,__PT_R0(%r9)
- mvc __PT_PSW(16,%r9),__LC_SVC_OLD_PSW
- mvc __PT_INT_CODE(4,%r9),__LC_SVC_ILC
- xc __PT_FLAGS(8,%r9),__PT_FLAGS(%r9)
- mvi __PT_FLAGS+7(%r9),_PIF_SYSCALL
- # setup saved register r15
- stg %r15,56(%r11) # r15 stack pointer
- # set new psw address and exit
- larl %r9,.Lsysc_do_svc
- BR_EX %r14,%r11
-.Lcleanup_system_call_insn:
- .quad system_call
- .quad .Lsysc_stmg
- .quad .Lsysc_per
- .quad .Lsysc_vtime+36
- .quad .Lsysc_vtime+42
-.Lcleanup_system_call_const:
- .quad __TASK_thread
-
-.Lcleanup_sysc_tif:
- larl %r9,.Lsysc_tif
- BR_EX %r14,%r11
-
-.Lcleanup_sysc_restore:
- # check if stpt has been executed
- clg %r9,BASED(.Lcleanup_sysc_restore_insn)
- jh 0f
- mvc __LC_EXIT_TIMER(8),__LC_ASYNC_ENTER_TIMER
- cghi %r11,__LC_SAVE_AREA_ASYNC
- je 0f
- mvc __LC_EXIT_TIMER(8),__LC_MCCK_ENTER_TIMER
-0: clg %r9,BASED(.Lcleanup_sysc_restore_insn+8)
- je 1f
- lg %r9,24(%r11) # get saved pointer to pt_regs
- mvc __LC_RETURN_PSW(16),__PT_PSW(%r9)
- mvc 0(64,%r11),__PT_R8(%r9)
- lmg %r0,%r7,__PT_R0(%r9)
-1: lmg %r8,%r9,__LC_RETURN_PSW
- BR_EX %r14,%r11
-.Lcleanup_sysc_restore_insn:
- .quad .Lsysc_exit_timer
- .quad .Lsysc_done - 4
-
-.Lcleanup_io_tif:
- larl %r9,.Lio_tif
- BR_EX %r14,%r11
-
-.Lcleanup_io_restore:
- # check if stpt has been executed
- clg %r9,BASED(.Lcleanup_io_restore_insn)
- jh 0f
- mvc __LC_EXIT_TIMER(8),__LC_MCCK_ENTER_TIMER
-0: clg %r9,BASED(.Lcleanup_io_restore_insn+8)
- je 1f
- lg %r9,24(%r11) # get saved r11 pointer to pt_regs
- mvc __LC_RETURN_PSW(16),__PT_PSW(%r9)
- mvc 0(64,%r11),__PT_R8(%r9)
- lmg %r0,%r7,__PT_R0(%r9)
-1: lmg %r8,%r9,__LC_RETURN_PSW
- BR_EX %r14,%r11
-.Lcleanup_io_restore_insn:
- .quad .Lio_exit_timer
- .quad .Lio_done - 4
-
-.Lcleanup_idle:
- ni __LC_CPU_FLAGS+7,255-_CIF_ENABLED_WAIT
- # copy interrupt clock & cpu timer
- mvc __CLOCK_IDLE_EXIT(8,%r2),__LC_INT_CLOCK
- mvc __TIMER_IDLE_EXIT(8,%r2),__LC_ASYNC_ENTER_TIMER
- cghi %r11,__LC_SAVE_AREA_ASYNC
- je 0f
- mvc __CLOCK_IDLE_EXIT(8,%r2),__LC_MCCK_CLOCK
- mvc __TIMER_IDLE_EXIT(8,%r2),__LC_MCCK_ENTER_TIMER
-0: # check if stck & stpt have been executed
- clg %r9,BASED(.Lcleanup_idle_insn)
- jhe 1f
- mvc __CLOCK_IDLE_ENTER(8,%r2),__CLOCK_IDLE_EXIT(%r2)
- mvc __TIMER_IDLE_ENTER(8,%r2),__TIMER_IDLE_EXIT(%r2)
-1: # calculate idle cycles
- clg %r9,BASED(.Lcleanup_idle_insn)
- jl 3f
- larl %r1,smp_cpu_mtid
- llgf %r1,0(%r1)
- ltgr %r1,%r1
- jz 3f
- .insn rsy,0xeb0000000017,%r1,5,__SF_EMPTY+80(%r15)
- larl %r3,mt_cycles
- ag %r3,__LC_PERCPU_OFFSET
- la %r4,__SF_EMPTY+16(%r15)
-2: lg %r0,0(%r3)
- slg %r0,0(%r4)
- alg %r0,64(%r4)
- stg %r0,0(%r3)
- la %r3,8(%r3)
- la %r4,8(%r4)
- brct %r1,2b
-3: # account system time going idle
- lg %r9,__LC_STEAL_TIMER
- alg %r9,__CLOCK_IDLE_ENTER(%r2)
- slg %r9,__LC_LAST_UPDATE_CLOCK
- stg %r9,__LC_STEAL_TIMER
- mvc __LC_LAST_UPDATE_CLOCK(8),__CLOCK_IDLE_EXIT(%r2)
- lg %r9,__LC_SYSTEM_TIMER
- alg %r9,__LC_LAST_UPDATE_TIMER
- slg %r9,__TIMER_IDLE_ENTER(%r2)
- stg %r9,__LC_SYSTEM_TIMER
- mvc __LC_LAST_UPDATE_TIMER(8),__TIMER_IDLE_EXIT(%r2)
- # prepare return psw
- nihh %r8,0xfcfd # clear irq & wait state bits
- lg %r9,48(%r11) # return from psw_idle
- BR_EX %r14,%r11
-.Lcleanup_idle_insn:
- .quad .Lpsw_idle_lpsw
-
-.Lcleanup_save_fpu_regs:
- larl %r9,save_fpu_regs
- BR_EX %r14,%r11
-
-.Lcleanup_load_fpu_regs:
- larl %r9,load_fpu_regs
- BR_EX %r14,%r11
-
-/*
- * Integer constants
- */
- .align 8
-.Lcritical_start:
- .quad .L__critical_start
-.Lcritical_length:
- .quad .L__critical_end - .L__critical_start
-#if IS_ENABLED(CONFIG_KVM)
-.Lsie_critical_start:
- .quad .Lsie_gmap
-.Lsie_critical_length:
- .quad .Lsie_done - .Lsie_gmap
-.Lsie_crit_mcck_start:
- .quad .Lsie_entry
-.Lsie_crit_mcck_length:
- .quad .Lsie_skip - .Lsie_entry
-#endif
.section .rodata, "a"
#define SYSCALL(esame,emu) .quad __s390x_ ## esame
- .globl sys_call_table
-sys_call_table:
+SYM_DATA_START(sys_call_table)
#include "asm/syscall_table.h"
+SYM_DATA_END(sys_call_table)
#undef SYSCALL
#ifdef CONFIG_COMPAT
#define SYSCALL(esame,emu) .quad __s390_ ## emu
- .globl sys_call_table_emu
-sys_call_table_emu:
+SYM_DATA_START(sys_call_table_emu)
#include "asm/syscall_table.h"
+SYM_DATA_END(sys_call_table_emu)
#undef SYSCALL
#endif
diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h
index b2956d49b6ad..9f41853f36b9 100644
--- a/arch/s390/kernel/entry.h
+++ b/arch/s390/kernel/entry.h
@@ -5,11 +5,11 @@
#include <linux/percpu.h>
#include <linux/types.h>
#include <linux/signal.h>
+#include <asm/extable.h>
#include <asm/ptrace.h>
#include <asm/idle.h>
extern void *restart_stack;
-extern unsigned long suspend_zero_pages;
void system_call(void);
void pgm_check_handler(void);
@@ -17,53 +17,29 @@ void ext_int_handler(void);
void io_int_handler(void);
void mcck_int_handler(void);
void restart_int_handler(void);
-void restart_call_handler(void);
+void early_pgm_check_handler(void);
-asmlinkage long do_syscall_trace_enter(struct pt_regs *regs);
-asmlinkage void do_syscall_trace_exit(struct pt_regs *regs);
+void __ret_from_fork(struct task_struct *prev, struct pt_regs *regs);
+void __do_pgm_check(struct pt_regs *regs);
+void __do_syscall(struct pt_regs *regs, int per_trap);
+void __do_early_pgm_check(struct pt_regs *regs);
void do_protection_exception(struct pt_regs *regs);
void do_dat_exception(struct pt_regs *regs);
-
-void addressing_exception(struct pt_regs *regs);
-void data_exception(struct pt_regs *regs);
-void default_trap_handler(struct pt_regs *regs);
-void divide_exception(struct pt_regs *regs);
-void execute_exception(struct pt_regs *regs);
-void hfp_divide_exception(struct pt_regs *regs);
-void hfp_overflow_exception(struct pt_regs *regs);
-void hfp_significance_exception(struct pt_regs *regs);
-void hfp_sqrt_exception(struct pt_regs *regs);
-void hfp_underflow_exception(struct pt_regs *regs);
-void illegal_op(struct pt_regs *regs);
-void operand_exception(struct pt_regs *regs);
-void overflow_exception(struct pt_regs *regs);
-void privileged_op(struct pt_regs *regs);
-void space_switch_exception(struct pt_regs *regs);
-void special_op_exception(struct pt_regs *regs);
-void specification_exception(struct pt_regs *regs);
-void transaction_exception(struct pt_regs *regs);
-void translation_exception(struct pt_regs *regs);
-void vector_exception(struct pt_regs *regs);
-
-void do_per_trap(struct pt_regs *regs);
+void do_secure_storage_access(struct pt_regs *regs);
+void do_non_secure_storage_access(struct pt_regs *regs);
+void do_secure_storage_violation(struct pt_regs *regs);
void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str);
-void syscall_trace(struct pt_regs *regs, int entryexit);
void kernel_stack_overflow(struct pt_regs * regs);
-void do_signal(struct pt_regs *regs);
void handle_signal32(struct ksignal *ksig, sigset_t *oldset,
struct pt_regs *regs);
-void do_notify_resume(struct pt_regs *regs);
-void __init init_IRQ(void);
-void do_IRQ(struct pt_regs *regs, int irq);
-void do_restart(void);
-void __init startup_init_nobss(void);
+void do_io_irq(struct pt_regs *regs);
+void do_ext_irq(struct pt_regs *regs);
+void do_restart(void *arg);
void __init startup_init(void);
void die(struct pt_regs *regs, const char *str);
int setup_profiling_timer(unsigned int multiplier);
-void __init time_init(void);
-void s390_early_resume(void);
unsigned long prepare_ftrace_return(unsigned long parent, unsigned long sp, unsigned long ip);
struct s390_mmap_arg_struct;
@@ -82,10 +58,18 @@ long sys_s390_sthyi(unsigned long function_code, void __user *buffer, u64 __user
DECLARE_PER_CPU(u64, mt_cycles[8]);
-void gs_load_bc_cb(struct pt_regs *regs);
-void set_fs_fixup(void);
-
unsigned long stack_alloc(void);
void stack_free(unsigned long stack);
+extern char kprobes_insn_page[];
+
+extern char _samode31[], _eamode31[];
+extern char _stext_amode31[], _etext_amode31[];
+extern struct exception_table_entry _start_amode31_ex_table[];
+extern struct exception_table_entry _stop_amode31_ex_table[];
+
+#define __amode31_data __section(".amode31.data")
+#define __amode31_ref __section(".amode31.refs")
+extern long _start_amode31_refs[], _end_amode31_refs[];
+
#endif /* _ENTRY_H */
diff --git a/arch/s390/kernel/facility.c b/arch/s390/kernel/facility.c
new file mode 100644
index 000000000000..f02127219a27
--- /dev/null
+++ b/arch/s390/kernel/facility.c
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corp. 2023
+ */
+
+#include <asm/facility.h>
+
+unsigned int stfle_size(void)
+{
+ static unsigned int size;
+ unsigned int r;
+ u64 dummy;
+
+ r = READ_ONCE(size);
+ if (!r) {
+ r = __stfle_asm(&dummy, 1) + 1;
+ WRITE_ONCE(size, r);
+ }
+ return r;
+}
+EXPORT_SYMBOL(stfle_size);
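stfle_size() caches the result of the first stfle query: the instruction reports how many doublewords the complete facility list occupies, so later callers can size their buffers without re-executing stfle. A hypothetical caller might look like the fragment below; it is not part of this patch, and it assumes the existing __stfle() helper from <asm/facility.h>.

        /* Hypothetical user of stfle_size(); every name except stfle_size()
         * and __stfle() is made up for illustration. */
        static u64 *alloc_facility_list(unsigned int *nr_dwords)
        {
                u64 *list;

                *nr_dwords = stfle_size();
                list = kcalloc(*nr_dwords, sizeof(*list), GFP_KERNEL);
                if (list)
                        __stfle(list, *nr_dwords);
                return list;
        }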
diff --git a/arch/s390/kernel/fpu.c b/arch/s390/kernel/fpu.c
index 0da378e2eb25..a4f3449cc814 100644
--- a/arch/s390/kernel/fpu.c
+++ b/arch/s390/kernel/fpu.c
@@ -10,8 +10,7 @@
#include <linux/sched.h>
#include <asm/fpu/types.h>
#include <asm/fpu/api.h>
-
-asm(".include \"asm/vx-insn.h\"\n");
+#include <asm/vx-insn.h>
void __kernel_fpu_begin(struct kernel_fpu *state, u32 flags)
{
@@ -25,7 +24,7 @@ void __kernel_fpu_begin(struct kernel_fpu *state, u32 flags)
/* Save floating point control */
asm volatile("stfpc %0" : "=Q" (state->fpc));
- if (!MACHINE_HAS_VX) {
+ if (!cpu_has_vx()) {
if (flags & KERNEL_VXR_V0V7) {
/* Save floating-point registers */
asm volatile("std 0,%0" : "=Q" (state->fprs[0]));
@@ -107,7 +106,7 @@ void __kernel_fpu_end(struct kernel_fpu *state, u32 flags)
/* Restore floating-point controls */
asm volatile("lfpc %0" : : "Q" (state->fpc));
- if (!MACHINE_HAS_VX) {
+ if (!cpu_has_vx()) {
if (flags & KERNEL_VXR_V0V7) {
/* Restore floating-point registers */
asm volatile("ld 0,%0" : : "Q" (state->fprs[0]));
@@ -175,3 +174,90 @@ void __kernel_fpu_end(struct kernel_fpu *state, u32 flags)
: "1", "cc");
}
EXPORT_SYMBOL(__kernel_fpu_end);
+
+void __load_fpu_regs(void)
+{
+ unsigned long *regs = current->thread.fpu.regs;
+ struct fpu *state = &current->thread.fpu;
+
+ sfpc_safe(state->fpc);
+ if (likely(cpu_has_vx())) {
+ asm volatile("lgr 1,%0\n"
+ "VLM 0,15,0,1\n"
+ "VLM 16,31,256,1\n"
+ :
+ : "d" (regs)
+ : "1", "cc", "memory");
+ } else {
+ asm volatile("ld 0,%0" : : "Q" (regs[0]));
+ asm volatile("ld 1,%0" : : "Q" (regs[1]));
+ asm volatile("ld 2,%0" : : "Q" (regs[2]));
+ asm volatile("ld 3,%0" : : "Q" (regs[3]));
+ asm volatile("ld 4,%0" : : "Q" (regs[4]));
+ asm volatile("ld 5,%0" : : "Q" (regs[5]));
+ asm volatile("ld 6,%0" : : "Q" (regs[6]));
+ asm volatile("ld 7,%0" : : "Q" (regs[7]));
+ asm volatile("ld 8,%0" : : "Q" (regs[8]));
+ asm volatile("ld 9,%0" : : "Q" (regs[9]));
+ asm volatile("ld 10,%0" : : "Q" (regs[10]));
+ asm volatile("ld 11,%0" : : "Q" (regs[11]));
+ asm volatile("ld 12,%0" : : "Q" (regs[12]));
+ asm volatile("ld 13,%0" : : "Q" (regs[13]));
+ asm volatile("ld 14,%0" : : "Q" (regs[14]));
+ asm volatile("ld 15,%0" : : "Q" (regs[15]));
+ }
+ clear_cpu_flag(CIF_FPU);
+}
+
+void load_fpu_regs(void)
+{
+ raw_local_irq_disable();
+ __load_fpu_regs();
+ raw_local_irq_enable();
+}
+EXPORT_SYMBOL(load_fpu_regs);
+
+void save_fpu_regs(void)
+{
+ unsigned long flags, *regs;
+ struct fpu *state;
+
+ local_irq_save(flags);
+
+ if (test_cpu_flag(CIF_FPU))
+ goto out;
+
+ state = &current->thread.fpu;
+ regs = current->thread.fpu.regs;
+
+ asm volatile("stfpc %0" : "=Q" (state->fpc));
+ if (likely(cpu_has_vx())) {
+ asm volatile("lgr 1,%0\n"
+ "VSTM 0,15,0,1\n"
+ "VSTM 16,31,256,1\n"
+ :
+ : "d" (regs)
+ : "1", "cc", "memory");
+ } else {
+ asm volatile("std 0,%0" : "=Q" (regs[0]));
+ asm volatile("std 1,%0" : "=Q" (regs[1]));
+ asm volatile("std 2,%0" : "=Q" (regs[2]));
+ asm volatile("std 3,%0" : "=Q" (regs[3]));
+ asm volatile("std 4,%0" : "=Q" (regs[4]));
+ asm volatile("std 5,%0" : "=Q" (regs[5]));
+ asm volatile("std 6,%0" : "=Q" (regs[6]));
+ asm volatile("std 7,%0" : "=Q" (regs[7]));
+ asm volatile("std 8,%0" : "=Q" (regs[8]));
+ asm volatile("std 9,%0" : "=Q" (regs[9]));
+ asm volatile("std 10,%0" : "=Q" (regs[10]));
+ asm volatile("std 11,%0" : "=Q" (regs[11]));
+ asm volatile("std 12,%0" : "=Q" (regs[12]));
+ asm volatile("std 13,%0" : "=Q" (regs[13]));
+ asm volatile("std 14,%0" : "=Q" (regs[14]));
+ asm volatile("std 15,%0" : "=Q" (regs[15]));
+ }
+ set_cpu_flag(CIF_FPU);
+out:
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(save_fpu_regs);
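save_fpu_regs() and __load_fpu_regs() implement in C the CIF_FPU protocol that the deleted assembler versions used to provide: saving the user's floating-point/vector state into current->thread.fpu sets CIF_FPU, marking the hardware registers as free for kernel use, and before control returns to user space the saved state is reloaded and the flag cleared again. A minimal stand-alone sketch of that protocol, with plain variables standing in for the lowcore CPU flag and the register file:

#include <stdbool.h>
#include <stdio.h>

static bool cif_fpu;            /* "user FPU state lives in memory" flag  */
static int  hw_fpu;             /* stand-in for the real register file    */
static int  saved_fpu;          /* stand-in for current->thread.fpu       */

static void save_fpu_regs_sketch(void)
{
        if (cif_fpu)
                return;          /* already saved, nothing to do */
        saved_fpu = hw_fpu;      /* stfpc/std/VSTM in the real function */
        cif_fpu = true;
}

static void load_fpu_regs_sketch(void)
{
        hw_fpu = saved_fpu;      /* lfpc/ld/VLM in the real function */
        cif_fpu = false;
}

int main(void)
{
        hw_fpu = 42;             /* user state currently in the registers */
        save_fpu_regs_sketch();  /* kernel code wants to use the FPU      */
        hw_fpu = 7;              /* ...and clobbers the registers         */
        load_fpu_regs_sketch();  /* done on the way back to user space    */
        printf("%d (expect 42)\n", hw_fpu);
        return 0;
}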
diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c
index 1bb85f60c0dd..c46381ea04ec 100644
--- a/arch/s390/kernel/ftrace.c
+++ b/arch/s390/kernel/ftrace.c
@@ -4,8 +4,7 @@
*
* Copyright IBM Corp. 2009,2014
*
- * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>,
- * Martin Schwidefsky <schwidefsky@de.ibm.com>
+ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
*/
#include <linux/moduleloader.h>
@@ -17,179 +16,217 @@
#include <linux/kprobes.h>
#include <trace/syscall.h>
#include <asm/asm-offsets.h>
+#include <asm/text-patching.h>
#include <asm/cacheflush.h>
+#include <asm/ftrace.lds.h>
+#include <asm/nospec-branch.h>
#include <asm/set_memory.h>
#include "entry.h"
+#include "ftrace.h"
/*
- * The mcount code looks like this:
- * stg %r14,8(%r15) # offset 0
- * larl %r1,<&counter> # offset 6
- * brasl %r14,_mcount # offset 12
- * lg %r14,8(%r15) # offset 18
- * Total length is 24 bytes. Only the first instruction will be patched
- * by ftrace_make_call / ftrace_make_nop.
- * The enabled ftrace code block looks like this:
+ * To generate function prologue either gcc's hotpatch feature (since gcc 4.8)
+ * or a combination of -pg -mrecord-mcount -mnop-mcount -mfentry flags
+ * (since gcc 9 / clang 10) is used.
+ * In both cases the original and also the disabled function prologue contains
+ * only a single six byte instruction and looks like this:
+ * > brcl 0,0 # offset 0
+ * To enable ftrace the code gets patched like above and afterwards looks
+ * like this:
* > brasl %r0,ftrace_caller # offset 0
- * larl %r1,<&counter> # offset 6
- * brasl %r14,_mcount # offset 12
- * lg %r14,8(%r15) # offset 18
+ *
+ * The instruction will be patched by ftrace_make_call / ftrace_make_nop.
* The ftrace function gets called with a non-standard C function call ABI
* where r0 contains the return address. It is also expected that the called
* function only clobbers r0 and r1, but restores r2-r15.
* For module code we can't directly jump to ftrace caller, but need a
* trampoline (ftrace_plt), which clobbers also r1.
- * The return point of the ftrace function has offset 24, so execution
- * continues behind the mcount block.
- * The disabled ftrace code block looks like this:
- * > jg .+24 # offset 0
- * larl %r1,<&counter> # offset 6
- * brasl %r14,_mcount # offset 12
- * lg %r14,8(%r15) # offset 18
- * The jg instruction branches to offset 24 to skip as many instructions
- * as possible.
- * In case we use gcc's hotpatch feature the original and also the disabled
- * function prologue contains only a single six byte instruction and looks
- * like this:
- * > brcl 0,0 # offset 0
- * To enable ftrace the code gets patched like above and afterwards looks
- * like this:
- * > brasl %r0,ftrace_caller # offset 0
*/
-unsigned long ftrace_plt;
+void *ftrace_func __read_mostly = ftrace_stub;
+struct ftrace_insn {
+ u16 opc;
+ s32 disp;
+} __packed;
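In the trampoline-based variant implemented by the C code that follows, struct ftrace_insn describes the six byte patch site: a two byte opcode/mask half followed by a 32 bit displacement that, like every s390 relative branch, is counted in halfwords (hence the divisions by two further down). Disabled is brcl 0,... (bytes 0xc0 0x04, a branch that is never taken), enabled is brcl 15,... (0xc0 0xf4), so enabling or disabling a function only rewrites the second byte; the displacement, pointing at the per-function trampoline, is written once by ftrace_init_nop(). A stand-alone illustration of that layout, user space and endian-explicit, not kernel code:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* Disabled patch site: "brcl 0,target" -- a six byte nop. */
        uint8_t insn[6] = { 0xc0, 0x04, 0, 0, 0, 0 };
        unsigned long ip = 0x1000, trampoline = 0x3000;
        int32_t disp = (int32_t)((trampoline - ip) / 2);  /* halfwords */

        /* Displacement bytes are big-endian on s390. */
        insn[2] = disp >> 24;
        insn[3] = disp >> 16;
        insn[4] = disp >> 8;
        insn[5] = disp;

        /* Enabling only flips the mask: brcl 0,target -> brcl 15,target. */
        insn[1] = 0xf4;

        printf("opc=%02x%02x disp=%d halfwords\n", insn[0], insn[1], disp);
        return 0;
}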
-static inline void ftrace_generate_orig_insn(struct ftrace_insn *insn)
+#ifdef CONFIG_MODULES
+static char *ftrace_plt;
+#endif /* CONFIG_MODULES */
+
+static const char *ftrace_shared_hotpatch_trampoline(const char **end)
{
-#if defined(CC_USING_HOTPATCH) || defined(CC_USING_NOP_MCOUNT)
- /* brcl 0,0 */
- insn->opc = 0xc004;
- insn->disp = 0;
-#else
- /* stg r14,8(r15) */
- insn->opc = 0xe3e0;
- insn->disp = 0xf0080024;
-#endif
+ const char *tstart, *tend;
+
+ tstart = ftrace_shared_hotpatch_trampoline_br;
+ tend = ftrace_shared_hotpatch_trampoline_br_end;
+#ifdef CONFIG_EXPOLINE
+ if (!nospec_disable) {
+ tstart = ftrace_shared_hotpatch_trampoline_exrl;
+ tend = ftrace_shared_hotpatch_trampoline_exrl_end;
+ }
+#endif /* CONFIG_EXPOLINE */
+ if (end)
+ *end = tend;
+ return tstart;
}
-static inline int is_kprobe_on_ftrace(struct ftrace_insn *insn)
+bool ftrace_need_init_nop(void)
{
-#ifdef CONFIG_KPROBES
- if (insn->opc == BREAKPOINT_INSTRUCTION)
- return 1;
-#endif
- return 0;
+ return true;
}
-static inline void ftrace_generate_kprobe_nop_insn(struct ftrace_insn *insn)
+int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec)
{
-#ifdef CONFIG_KPROBES
- insn->opc = BREAKPOINT_INSTRUCTION;
- insn->disp = KPROBE_ON_FTRACE_NOP;
+ static struct ftrace_hotpatch_trampoline *next_vmlinux_trampoline =
+ __ftrace_hotpatch_trampolines_start;
+ static const char orig[6] = { 0xc0, 0x04, 0x00, 0x00, 0x00, 0x00 };
+ static struct ftrace_hotpatch_trampoline *trampoline;
+ struct ftrace_hotpatch_trampoline **next_trampoline;
+ struct ftrace_hotpatch_trampoline *trampolines_end;
+ struct ftrace_hotpatch_trampoline tmp;
+ struct ftrace_insn *insn;
+ const char *shared;
+ s32 disp;
+
+ BUILD_BUG_ON(sizeof(struct ftrace_hotpatch_trampoline) !=
+ SIZEOF_FTRACE_HOTPATCH_TRAMPOLINE);
+
+ next_trampoline = &next_vmlinux_trampoline;
+ trampolines_end = __ftrace_hotpatch_trampolines_end;
+ shared = ftrace_shared_hotpatch_trampoline(NULL);
+#ifdef CONFIG_MODULES
+ if (mod) {
+ next_trampoline = &mod->arch.next_trampoline;
+ trampolines_end = mod->arch.trampolines_end;
+ shared = ftrace_plt;
+ }
#endif
+
+ if (WARN_ON_ONCE(*next_trampoline >= trampolines_end))
+ return -ENOMEM;
+ trampoline = (*next_trampoline)++;
+
+ /* Check for the compiler-generated fentry nop (brcl 0, .). */
+ if (WARN_ON_ONCE(memcmp((const void *)rec->ip, &orig, sizeof(orig))))
+ return -EINVAL;
+
+ /* Generate the trampoline. */
+ tmp.brasl_opc = 0xc015; /* brasl %r1, shared */
+ tmp.brasl_disp = (shared - (const char *)&trampoline->brasl_opc) / 2;
+ tmp.interceptor = FTRACE_ADDR;
+ tmp.rest_of_intercepted_function = rec->ip + sizeof(struct ftrace_insn);
+ s390_kernel_write(trampoline, &tmp, sizeof(tmp));
+
+ /* Generate a jump to the trampoline. */
+ disp = ((char *)trampoline - (char *)rec->ip) / 2;
+ insn = (struct ftrace_insn *)rec->ip;
+ s390_kernel_write(&insn->disp, &disp, sizeof(disp));
+
+ return 0;
}
-static inline void ftrace_generate_kprobe_call_insn(struct ftrace_insn *insn)
+static struct ftrace_hotpatch_trampoline *ftrace_get_trampoline(struct dyn_ftrace *rec)
{
-#ifdef CONFIG_KPROBES
- insn->opc = BREAKPOINT_INSTRUCTION;
- insn->disp = KPROBE_ON_FTRACE_CALL;
-#endif
+ struct ftrace_hotpatch_trampoline *trampoline;
+ struct ftrace_insn insn;
+ s64 disp;
+ u16 opc;
+
+ if (copy_from_kernel_nofault(&insn, (void *)rec->ip, sizeof(insn)))
+ return ERR_PTR(-EFAULT);
+ disp = (s64)insn.disp * 2;
+ trampoline = (void *)(rec->ip + disp);
+ if (get_kernel_nofault(opc, &trampoline->brasl_opc))
+ return ERR_PTR(-EFAULT);
+ if (opc != 0xc015)
+ return ERR_PTR(-EINVAL);
+ return trampoline;
}
int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
unsigned long addr)
{
+ struct ftrace_hotpatch_trampoline *trampoline;
+ u64 old;
+
+ trampoline = ftrace_get_trampoline(rec);
+ if (IS_ERR(trampoline))
+ return PTR_ERR(trampoline);
+ if (get_kernel_nofault(old, &trampoline->interceptor))
+ return -EFAULT;
+ if (old != old_addr)
+ return -EINVAL;
+ s390_kernel_write(&trampoline->interceptor, &addr, sizeof(addr));
return 0;
}
-int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec,
- unsigned long addr)
+static int ftrace_patch_branch_mask(void *addr, u16 expected, bool enable)
{
- struct ftrace_insn orig, new, old;
+ u16 old;
+ u8 op;
- if (probe_kernel_read(&old, (void *) rec->ip, sizeof(old)))
+ if (get_kernel_nofault(old, addr))
return -EFAULT;
- if (addr == MCOUNT_ADDR) {
- /* Initial code replacement */
- ftrace_generate_orig_insn(&orig);
- ftrace_generate_nop_insn(&new);
- } else if (is_kprobe_on_ftrace(&old)) {
- /*
- * If we find a breakpoint instruction, a kprobe has been
- * placed at the beginning of the function. We write the
- * constant KPROBE_ON_FTRACE_NOP into the remaining four
- * bytes of the original instruction so that the kprobes
- * handler can execute a nop, if it reaches this breakpoint.
- */
- ftrace_generate_kprobe_call_insn(&orig);
- ftrace_generate_kprobe_nop_insn(&new);
- } else {
- /* Replace ftrace call with a nop. */
- ftrace_generate_call_insn(&orig, rec->ip);
- ftrace_generate_nop_insn(&new);
- }
- /* Verify that the to be replaced code matches what we expect. */
- if (memcmp(&orig, &old, sizeof(old)))
+ if (old != expected)
return -EINVAL;
- s390_kernel_write((void *) rec->ip, &new, sizeof(new));
+ /* set mask field to all ones or zeroes */
+ op = enable ? 0xf4 : 0x04;
+ s390_kernel_write((char *)addr + 1, &op, sizeof(op));
return 0;
}
+int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec,
+ unsigned long addr)
+{
+ /* Expect brcl 0xf,... */
+ return ftrace_patch_branch_mask((void *)rec->ip, 0xc0f4, false);
+}
+
int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
{
- struct ftrace_insn orig, new, old;
+ struct ftrace_hotpatch_trampoline *trampoline;
- if (probe_kernel_read(&old, (void *) rec->ip, sizeof(old)))
- return -EFAULT;
- if (is_kprobe_on_ftrace(&old)) {
- /*
- * If we find a breakpoint instruction, a kprobe has been
- * placed at the beginning of the function. We write the
- * constant KPROBE_ON_FTRACE_CALL into the remaining four
- * bytes of the original instruction so that the kprobes
- * handler can execute a brasl if it reaches this breakpoint.
- */
- ftrace_generate_kprobe_nop_insn(&orig);
- ftrace_generate_kprobe_call_insn(&new);
- } else {
- /* Replace nop with an ftrace call. */
- ftrace_generate_nop_insn(&orig);
- ftrace_generate_call_insn(&new, rec->ip);
- }
- /* Verify that the to be replaced code matches what we expect. */
- if (memcmp(&orig, &old, sizeof(old)))
- return -EINVAL;
- s390_kernel_write((void *) rec->ip, &new, sizeof(new));
- return 0;
+ trampoline = ftrace_get_trampoline(rec);
+ if (IS_ERR(trampoline))
+ return PTR_ERR(trampoline);
+ s390_kernel_write(&trampoline->interceptor, &addr, sizeof(addr));
+ /* Expect brcl 0x0,... */
+ return ftrace_patch_branch_mask((void *)rec->ip, 0xc004, true);
}
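Note the division of labour between the two patching levels: ftrace_make_nop() only clears the brcl mask at the call site, ftrace_make_call() above re-arms the trampoline's interceptor slot and then sets the mask, and ftrace_modify_call() retargets an already enabled function by rewriting nothing but that 64 bit interceptor word. A compact sketch of the retargeting step, illustrative only:

#include <stdint.h>
#include <stdio.h>

struct trampoline {
        uint64_t rest_of_intercepted_function;
        uint64_t interceptor;            /* address of the current handler */
};

int main(void)
{
        struct trampoline t = { .interceptor = 0x1000 }; /* old handler */

        t.interceptor = 0x2000;          /* the only word that changes */
        printf("interceptor=0x%llx\n", (unsigned long long)t.interceptor);
        return 0;
}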
int ftrace_update_ftrace_func(ftrace_func_t func)
{
+ ftrace_func = func;
return 0;
}
-int __init ftrace_dyn_arch_init(void)
+void arch_ftrace_update_code(int command)
{
- return 0;
+ ftrace_modify_all_code(command);
+}
+
+void ftrace_arch_code_modify_post_process(void)
+{
+ /*
+ * Flush any pre-fetched instructions on all
+ * CPUs to make the new code visible.
+ */
+ text_poke_sync_lock();
}
#ifdef CONFIG_MODULES
static int __init ftrace_plt_init(void)
{
- unsigned int *ip;
+ const char *start, *end;
- ftrace_plt = (unsigned long) module_alloc(PAGE_SIZE);
+ ftrace_plt = module_alloc(PAGE_SIZE);
if (!ftrace_plt)
panic("cannot allocate ftrace plt\n");
- ip = (unsigned int *) ftrace_plt;
- ip[0] = 0x0d10e310; /* basr 1,0; lg 1,10(1); br 1 */
- ip[1] = 0x100a0004;
- ip[2] = 0x07f10000;
- ip[3] = FTRACE_ADDR >> 32;
- ip[4] = FTRACE_ADDR & 0xffffffff;
- set_memory_ro(ftrace_plt, 1);
+
+ start = ftrace_shared_hotpatch_trampoline(&end);
+ memcpy(ftrace_plt, start, end - start);
+ set_memory_rox((unsigned long)ftrace_plt, 1);
return 0;
}
device_initcall(ftrace_plt_init);
@@ -226,18 +263,78 @@ NOKPROBE_SYMBOL(prepare_ftrace_return);
*/
int ftrace_enable_ftrace_graph_caller(void)
{
- u8 op = 0x04; /* set mask field to zero */
+ int rc;
- s390_kernel_write(__va(ftrace_graph_caller)+1, &op, sizeof(op));
+ /* Expect brc 0xf,... */
+ rc = ftrace_patch_branch_mask(ftrace_graph_caller, 0xa7f4, false);
+ if (rc)
+ return rc;
+ text_poke_sync_lock();
return 0;
}
int ftrace_disable_ftrace_graph_caller(void)
{
- u8 op = 0xf4; /* set mask field to all ones */
+ int rc;
- s390_kernel_write(__va(ftrace_graph_caller)+1, &op, sizeof(op));
+ /* Expect brc 0x0,... */
+ rc = ftrace_patch_branch_mask(ftrace_graph_caller, 0xa704, true);
+ if (rc)
+ return rc;
+ text_poke_sync_lock();
return 0;
}
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+#ifdef CONFIG_KPROBES_ON_FTRACE
+void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
+{
+ struct kprobe_ctlblk *kcb;
+ struct pt_regs *regs;
+ struct kprobe *p;
+ int bit;
+
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
+ if (bit < 0)
+ return;
+
+ regs = ftrace_get_regs(fregs);
+ p = get_kprobe((kprobe_opcode_t *)ip);
+ if (!regs || unlikely(!p) || kprobe_disabled(p))
+ goto out;
+
+ if (kprobe_running()) {
+ kprobes_inc_nmissed_count(p);
+ goto out;
+ }
+
+ __this_cpu_write(current_kprobe, p);
+
+ kcb = get_kprobe_ctlblk();
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+
+ instruction_pointer_set(regs, ip);
+
+ if (!p->pre_handler || !p->pre_handler(p, regs)) {
+
+ instruction_pointer_set(regs, ip + MCOUNT_INSN_SIZE);
+
+ if (unlikely(p->post_handler)) {
+ kcb->kprobe_status = KPROBE_HIT_SSDONE;
+ p->post_handler(p, regs, 0);
+ }
+ }
+ __this_cpu_write(current_kprobe, NULL);
+out:
+ ftrace_test_recursion_unlock(bit);
+}
+NOKPROBE_SYMBOL(kprobe_ftrace_handler);
+
+int arch_prepare_kprobe_ftrace(struct kprobe *p)
+{
+ p->ainsn.insn = NULL;
+ return 0;
+}
+#endif
diff --git a/arch/s390/kernel/ftrace.h b/arch/s390/kernel/ftrace.h
new file mode 100644
index 000000000000..7f75a9616406
--- /dev/null
+++ b/arch/s390/kernel/ftrace.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FTRACE_H
+#define _FTRACE_H
+
+#include <asm/types.h>
+
+struct ftrace_hotpatch_trampoline {
+ u16 brasl_opc;
+ s32 brasl_disp;
+ s16: 16;
+ u64 rest_of_intercepted_function;
+ u64 interceptor;
+} __packed;
+
+extern struct ftrace_hotpatch_trampoline __ftrace_hotpatch_trampolines_start[];
+extern struct ftrace_hotpatch_trampoline __ftrace_hotpatch_trampolines_end[];
+extern const char ftrace_shared_hotpatch_trampoline_br[];
+extern const char ftrace_shared_hotpatch_trampoline_br_end[];
+extern const char ftrace_shared_hotpatch_trampoline_exrl[];
+extern const char ftrace_shared_hotpatch_trampoline_exrl_end[];
+extern const char ftrace_plt_template[];
+extern const char ftrace_plt_template_end[];
+
+#endif /* _FTRACE_H */
diff --git a/arch/s390/kernel/guarded_storage.c b/arch/s390/kernel/guarded_storage.c
index d14dd1c2e524..0b68168d9566 100644
--- a/arch/s390/kernel/guarded_storage.c
+++ b/arch/s390/kernel/guarded_storage.c
@@ -28,7 +28,7 @@ static int gs_enable(void)
return -ENOMEM;
gs_cb->gsd = 25;
preempt_disable();
- __ctl_set_bit(2, 4);
+ local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT);
load_gs_cb(gs_cb);
current->thread.gs_cb = gs_cb;
preempt_enable();
@@ -42,7 +42,7 @@ static int gs_disable(void)
preempt_disable();
kfree(current->thread.gs_cb);
current->thread.gs_cb = NULL;
- __ctl_clear_bit(2, 4);
+ local_ctl_clear_bit(2, CR2_GUARDED_STORAGE_BIT);
preempt_enable();
}
return 0;
@@ -84,7 +84,7 @@ void gs_load_bc_cb(struct pt_regs *regs)
if (gs_cb) {
kfree(current->thread.gs_cb);
current->thread.gs_bc_cb = NULL;
- __ctl_set_bit(2, 4);
+ local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT);
load_gs_cb(gs_cb);
current->thread.gs_cb = gs_cb;
}
diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S
index 8b88dbbda7df..45413b04efc5 100644
--- a/arch/s390/kernel/head64.S
+++ b/arch/s390/kernel/head64.S
@@ -5,7 +5,6 @@
* Author(s): Hartmut Penner <hp@de.ibm.com>
* Martin Schwidefsky <schwidefsky@de.ibm.com>
* Rob van der Heij <rvdhei@iae.nl>
- * Heiko Carstens <heiko.carstens@de.ibm.com>
*
*/
@@ -17,32 +16,25 @@
#include <asm/ptrace.h>
__HEAD
-ENTRY(startup_continue)
- tm __LC_STFLE_FAC_LIST+5,0x80 # LPP available ?
- jz 0f
- xc __LC_LPP+1(7,0),__LC_LPP+1 # clear lpp and current_pid
- mvi __LC_LPP,0x80 # and set LPP_MAGIC
- .insn s,0xb2800000,__LC_LPP # load program parameter
-0: larl %r1,tod_clock_base
+SYM_CODE_START(startup_continue)
+ larl %r1,tod_clock_base
mvc 0(16,%r1),__LC_BOOT_CLOCK
- larl %r13,.LPG1 # get base
#
# Setup stack
#
larl %r14,init_task
stg %r14,__LC_CURRENT
- larl %r15,init_thread_union+THREAD_SIZE-STACK_FRAME_OVERHEAD-__PT_SIZE
-#ifdef CONFIG_KASAN
- brasl %r14,kasan_early_init
-#endif
+ larl %r15,init_thread_union+STACK_INIT_OFFSET
+ stg %r15,__LC_KERNEL_STACK
+ brasl %r14,sclp_early_adjust_va # allow sclp_early_printk
brasl %r14,startup_init # s390 specific early init
brasl %r14,start_kernel # common init code
#
# We returned from start_kernel ?!? PANIK
#
basr %r13,0
- lpswe .Ldw-.(%r13) # load disabled wait psw
+ lpswe dw_psw-.(%r13) # load disabled wait psw
+SYM_CODE_END(startup_continue)
- .align 16
-.LPG1:
-.Ldw: .quad 0x0002000180000000,0x0000000000000000
+ .balign 16
+SYM_DATA_LOCAL(dw_psw, .quad 0x0002000180000000,0x0000000000000000)
diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c
index 8f8456816d83..e7239aaf428b 100644
--- a/arch/s390/kernel/idle.c
+++ b/arch/s390/kernel/idle.c
@@ -9,134 +9,86 @@
#include <linux/kernel.h>
#include <linux/kernel_stat.h>
-#include <linux/kprobes.h>
#include <linux/notifier.h>
#include <linux/init.h>
#include <linux/cpu.h>
-#include <linux/sched/cputime.h>
+#include <trace/events/power.h>
+#include <asm/cpu_mf.h>
+#include <asm/cputime.h>
#include <asm/nmi.h>
#include <asm/smp.h>
#include "entry.h"
static DEFINE_PER_CPU(struct s390_idle_data, s390_idle);
-void enabled_wait(void)
+void account_idle_time_irq(void)
{
struct s390_idle_data *idle = this_cpu_ptr(&s390_idle);
- unsigned long long idle_time;
- unsigned long psw_mask;
+ unsigned long idle_time;
+ u64 cycles_new[8];
+ int i;
+
+ if (smp_cpu_mtid) {
+ stcctm(MT_DIAG, smp_cpu_mtid, cycles_new);
+ for (i = 0; i < smp_cpu_mtid; i++)
+ this_cpu_add(mt_cycles[i], cycles_new[i] - idle->mt_cycles_enter[i]);
+ }
- trace_hardirqs_on();
+ idle_time = S390_lowcore.int_clock - idle->clock_idle_enter;
- /* Wait for external, I/O or machine check interrupt. */
- psw_mask = PSW_KERNEL_BITS | PSW_MASK_WAIT | PSW_MASK_DAT |
- PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK;
- clear_cpu_flag(CIF_NOHZ_DELAY);
+ S390_lowcore.steal_timer += idle->clock_idle_enter - S390_lowcore.last_update_clock;
+ S390_lowcore.last_update_clock = S390_lowcore.int_clock;
- /* Call the assembler magic in entry.S */
- psw_idle(idle, psw_mask);
-
- trace_hardirqs_off();
+ S390_lowcore.system_timer += S390_lowcore.last_update_timer - idle->timer_idle_enter;
+ S390_lowcore.last_update_timer = S390_lowcore.sys_enter_timer;
/* Account time spent with enabled wait psw loaded as idle time. */
- write_seqcount_begin(&idle->seqcount);
- idle_time = idle->clock_idle_exit - idle->clock_idle_enter;
- idle->clock_idle_enter = idle->clock_idle_exit = 0ULL;
- idle->idle_time += idle_time;
- idle->idle_count++;
+ WRITE_ONCE(idle->idle_time, READ_ONCE(idle->idle_time) + idle_time);
+ WRITE_ONCE(idle->idle_count, READ_ONCE(idle->idle_count) + 1);
account_idle_time(cputime_to_nsecs(idle_time));
- write_seqcount_end(&idle->seqcount);
}
-NOKPROBE_SYMBOL(enabled_wait);
+
+void noinstr arch_cpu_idle(void)
+{
+ struct s390_idle_data *idle = this_cpu_ptr(&s390_idle);
+ unsigned long psw_mask;
+
+ /* Wait for external, I/O or machine check interrupt. */
+ psw_mask = PSW_KERNEL_BITS | PSW_MASK_WAIT |
+ PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK;
+ clear_cpu_flag(CIF_NOHZ_DELAY);
+
+ /* psw_idle() returns with interrupts disabled. */
+ psw_idle(idle, psw_mask);
+}
static ssize_t show_idle_count(struct device *dev,
- struct device_attribute *attr, char *buf)
+ struct device_attribute *attr, char *buf)
{
struct s390_idle_data *idle = &per_cpu(s390_idle, dev->id);
- unsigned long long idle_count;
- unsigned int seq;
-
- do {
- seq = read_seqcount_begin(&idle->seqcount);
- idle_count = READ_ONCE(idle->idle_count);
- if (READ_ONCE(idle->clock_idle_enter))
- idle_count++;
- } while (read_seqcount_retry(&idle->seqcount, seq));
- return sprintf(buf, "%llu\n", idle_count);
+
+ return sysfs_emit(buf, "%lu\n", READ_ONCE(idle->idle_count));
}
DEVICE_ATTR(idle_count, 0444, show_idle_count, NULL);
static ssize_t show_idle_time(struct device *dev,
- struct device_attribute *attr, char *buf)
+ struct device_attribute *attr, char *buf)
{
- unsigned long long now, idle_time, idle_enter, idle_exit, in_idle;
struct s390_idle_data *idle = &per_cpu(s390_idle, dev->id);
- unsigned int seq;
-
- do {
- seq = read_seqcount_begin(&idle->seqcount);
- idle_time = READ_ONCE(idle->idle_time);
- idle_enter = READ_ONCE(idle->clock_idle_enter);
- idle_exit = READ_ONCE(idle->clock_idle_exit);
- } while (read_seqcount_retry(&idle->seqcount, seq));
- in_idle = 0;
- now = get_tod_clock();
- if (idle_enter) {
- if (idle_exit) {
- in_idle = idle_exit - idle_enter;
- } else if (now > idle_enter) {
- in_idle = now - idle_enter;
- }
- }
- idle_time += in_idle;
- return sprintf(buf, "%llu\n", idle_time >> 12);
-}
-DEVICE_ATTR(idle_time_us, 0444, show_idle_time, NULL);
-u64 arch_cpu_idle_time(int cpu)
-{
- struct s390_idle_data *idle = &per_cpu(s390_idle, cpu);
- unsigned long long now, idle_enter, idle_exit, in_idle;
- unsigned int seq;
-
- do {
- seq = read_seqcount_begin(&idle->seqcount);
- idle_enter = READ_ONCE(idle->clock_idle_enter);
- idle_exit = READ_ONCE(idle->clock_idle_exit);
- } while (read_seqcount_retry(&idle->seqcount, seq));
- in_idle = 0;
- now = get_tod_clock();
- if (idle_enter) {
- if (idle_exit) {
- in_idle = idle_exit - idle_enter;
- } else if (now > idle_enter) {
- in_idle = now - idle_enter;
- }
- }
- return cputime_to_nsecs(in_idle);
+ return sysfs_emit(buf, "%lu\n", READ_ONCE(idle->idle_time) >> 12);
}
+DEVICE_ATTR(idle_time_us, 0444, show_idle_time, NULL);
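The counters kept in struct s390_idle_data are TOD clock deltas. Bit 51 of the TOD clock advances once per microsecond, so the right shift by 12 in show_idle_time() converts the accumulated value into the microseconds reported by the idle_time_us attribute. A one-line illustration of the conversion:

#include <stdio.h>

int main(void)
{
        unsigned long tod_delta = 2500UL << 12;   /* 2500 us worth of TOD ticks */

        printf("%lu us\n", tod_delta >> 12);      /* prints 2500 */
        return 0;
}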
void arch_cpu_idle_enter(void)
{
- local_mcck_disable();
-}
-
-void arch_cpu_idle(void)
-{
- if (!test_cpu_flag(CIF_MCCK_PENDING))
- /* Halt the cpu and keep track of cpu time accounting. */
- enabled_wait();
- local_irq_enable();
}
void arch_cpu_idle_exit(void)
{
- local_mcck_enable();
- if (test_cpu_flag(CIF_MCCK_PENDING))
- s390_handle_mcck();
}
-void arch_cpu_idle_dead(void)
+void __noreturn arch_cpu_idle_dead(void)
{
cpu_die();
}
diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c
index 6837affc19e8..ba75f6bee774 100644
--- a/arch/s390/kernel/ipl.c
+++ b/arch/s390/kernel/ipl.c
@@ -4,7 +4,6 @@
*
* Copyright IBM Corp. 2005, 2012
* Author(s): Michael Holzheu <holzheu@de.ibm.com>
- * Heiko Carstens <heiko.carstens@de.ibm.com>
* Volker Sameske <sameske@de.ibm.com>
*/
@@ -13,12 +12,15 @@
#include <linux/init.h>
#include <linux/device.h>
#include <linux/delay.h>
+#include <linux/kstrtox.h>
+#include <linux/panic_notifier.h>
#include <linux/reboot.h>
#include <linux/ctype.h>
#include <linux/fs.h>
#include <linux/gfp.h>
#include <linux/crash_dump.h>
#include <linux/debug_locks.h>
+#include <asm/asm-extable.h>
#include <asm/diag.h>
#include <asm/ipl.h>
#include <asm/smp.h>
@@ -28,6 +30,7 @@
#include <asm/sclp.h>
#include <asm/checksum.h>
#include <asm/debug.h>
+#include <asm/abs_lowcore.h>
#include <asm/os_info.h>
#include <asm/sections.h>
#include <asm/boot_data.h>
@@ -37,12 +40,18 @@
#define IPL_UNKNOWN_STR "unknown"
#define IPL_CCW_STR "ccw"
+#define IPL_ECKD_STR "eckd"
+#define IPL_ECKD_DUMP_STR "eckd_dump"
#define IPL_FCP_STR "fcp"
#define IPL_FCP_DUMP_STR "fcp_dump"
+#define IPL_NVME_STR "nvme"
+#define IPL_NVME_DUMP_STR "nvme_dump"
#define IPL_NSS_STR "nss"
#define DUMP_CCW_STR "ccw"
+#define DUMP_ECKD_STR "eckd"
#define DUMP_FCP_STR "fcp"
+#define DUMP_NVME_STR "nvme"
#define DUMP_NONE_STR "none"
/*
@@ -87,12 +96,20 @@ static char *ipl_type_str(enum ipl_type type)
switch (type) {
case IPL_TYPE_CCW:
return IPL_CCW_STR;
+ case IPL_TYPE_ECKD:
+ return IPL_ECKD_STR;
+ case IPL_TYPE_ECKD_DUMP:
+ return IPL_ECKD_DUMP_STR;
case IPL_TYPE_FCP:
return IPL_FCP_STR;
case IPL_TYPE_FCP_DUMP:
return IPL_FCP_DUMP_STR;
case IPL_TYPE_NSS:
return IPL_NSS_STR;
+ case IPL_TYPE_NVME:
+ return IPL_NVME_STR;
+ case IPL_TYPE_NVME_DUMP:
+ return IPL_NVME_DUMP_STR;
case IPL_TYPE_UNKNOWN:
default:
return IPL_UNKNOWN_STR;
@@ -103,6 +120,8 @@ enum dump_type {
DUMP_TYPE_NONE = 1,
DUMP_TYPE_CCW = 2,
DUMP_TYPE_FCP = 4,
+ DUMP_TYPE_NVME = 8,
+ DUMP_TYPE_ECKD = 16,
};
static char *dump_type_str(enum dump_type type)
@@ -112,8 +131,12 @@ static char *dump_type_str(enum dump_type type)
return DUMP_NONE_STR;
case DUMP_TYPE_CCW:
return DUMP_CCW_STR;
+ case DUMP_TYPE_ECKD:
+ return DUMP_ECKD_STR;
case DUMP_TYPE_FCP:
return DUMP_FCP_STR;
+ case DUMP_TYPE_NVME:
+ return DUMP_NVME_STR;
default:
return NULL;
}
@@ -133,37 +156,48 @@ static int reipl_capabilities = IPL_TYPE_UNKNOWN;
static enum ipl_type reipl_type = IPL_TYPE_UNKNOWN;
static struct ipl_parameter_block *reipl_block_fcp;
+static struct ipl_parameter_block *reipl_block_nvme;
static struct ipl_parameter_block *reipl_block_ccw;
+static struct ipl_parameter_block *reipl_block_eckd;
static struct ipl_parameter_block *reipl_block_nss;
static struct ipl_parameter_block *reipl_block_actual;
static int dump_capabilities = DUMP_TYPE_NONE;
static enum dump_type dump_type = DUMP_TYPE_NONE;
static struct ipl_parameter_block *dump_block_fcp;
+static struct ipl_parameter_block *dump_block_nvme;
static struct ipl_parameter_block *dump_block_ccw;
+static struct ipl_parameter_block *dump_block_eckd;
static struct sclp_ipl_info sclp_ipl_info;
-static inline int __diag308(unsigned long subcode, void *addr)
+static bool reipl_nvme_clear;
+static bool reipl_fcp_clear;
+static bool reipl_ccw_clear;
+static bool reipl_eckd_clear;
+
+static unsigned long os_info_flags;
+
+static inline int __diag308(unsigned long subcode, unsigned long addr)
{
- register unsigned long _addr asm("0") = (unsigned long) addr;
- register unsigned long _rc asm("1") = 0;
+ union register_pair r1;
+ r1.even = addr;
+ r1.odd = 0;
asm volatile(
- " diag %0,%2,0x308\n"
+ " diag %[r1],%[subcode],0x308\n"
"0: nopr %%r7\n"
EX_TABLE(0b,0b)
- : "+d" (_addr), "+d" (_rc)
- : "d" (subcode) : "cc", "memory");
- return _rc;
+ : [r1] "+&d" (r1.pair)
+ : [subcode] "d" (subcode)
+ : "cc", "memory");
+ return r1.odd;
}
int diag308(unsigned long subcode, void *addr)
{
- if (IS_ENABLED(CONFIG_KASAN))
- __arch_local_irq_stosm(0x04); /* enable DAT */
diag_stat_inc(DIAG_STAT_X308);
- return __diag308(subcode, addr);
+ return __diag308(subcode, addr ? virt_to_phys(addr) : 0);
}
EXPORT_SYMBOL_GPL(diag308);
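
In the reworked __diag308() above, the even half of the register pair carries the (now physical) address of the IPL parameter block and the odd half returns the diagnose response code. A rough stand-alone illustration of the even/odd packing, using a simplified stand-in for the kernel's union register_pair (the demo type, field layout and values are assumptions, not taken from the patch):

	#include <stdio.h>

	/* Simplified demo type: on s390 the compiler can place such a 128-bit
	 * value in an even/odd general register pair. */
	union reg_pair_demo {
		unsigned __int128 pair;
		struct {
			unsigned long even;	/* in:  physical address of the parameter block */
			unsigned long odd;	/* out: diagnose 0x308 response code */
		};
	};

	int main(void)
	{
		union reg_pair_demo r1;

		r1.even = 0x2000;	/* made-up physical address */
		r1.odd = 0;		/* a real diag 0x308 would overwrite this */
		printf("even=%#lx odd=%#lx\n", r1.even, r1.odd);
		return 0;
	}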
@@ -174,7 +208,7 @@ static ssize_t sys_##_prefix##_##_name##_show(struct kobject *kobj, \
struct kobj_attribute *attr, \
char *page) \
{ \
- return snprintf(page, PAGE_SIZE, _format, ##args); \
+ return scnprintf(page, PAGE_SIZE, _format, ##args); \
}
#define IPL_ATTR_CCW_STORE_FN(_prefix, _name, _ipl_blk) \
@@ -200,14 +234,14 @@ IPL_ATTR_SHOW_FN(_prefix, _name, "0.%x.%04x\n", \
_ipl_blk.ssid, _ipl_blk.devno); \
IPL_ATTR_CCW_STORE_FN(_prefix, _name, _ipl_blk); \
static struct kobj_attribute sys_##_prefix##_##_name##_attr = \
- __ATTR(_name, (S_IRUGO | S_IWUSR), \
+ __ATTR(_name, 0644, \
sys_##_prefix##_##_name##_show, \
sys_##_prefix##_##_name##_store) \
#define DEFINE_IPL_ATTR_RO(_prefix, _name, _format, _value) \
IPL_ATTR_SHOW_FN(_prefix, _name, _format, _value) \
static struct kobj_attribute sys_##_prefix##_##_name##_attr = \
- __ATTR(_name, S_IRUGO, sys_##_prefix##_##_name##_show, NULL)
+ __ATTR(_name, 0444, sys_##_prefix##_##_name##_show, NULL)
#define DEFINE_IPL_ATTR_RW(_prefix, _name, _fmt_out, _fmt_in, _value) \
IPL_ATTR_SHOW_FN(_prefix, _name, _fmt_out, (unsigned long long) _value) \
@@ -222,7 +256,7 @@ static ssize_t sys_##_prefix##_##_name##_store(struct kobject *kobj, \
return len; \
} \
static struct kobj_attribute sys_##_prefix##_##_name##_attr = \
- __ATTR(_name,(S_IRUGO | S_IWUSR), \
+ __ATTR(_name, 0644, \
sys_##_prefix##_##_name##_show, \
sys_##_prefix##_##_name##_store)
@@ -232,12 +266,12 @@ static ssize_t sys_##_prefix##_##_name##_store(struct kobject *kobj, \
struct kobj_attribute *attr, \
const char *buf, size_t len) \
{ \
- strncpy(_value, buf, sizeof(_value) - 1); \
+ strscpy(_value, buf, sizeof(_value)); \
strim(_value); \
return len; \
} \
static struct kobj_attribute sys_##_prefix##_##_name##_attr = \
- __ATTR(_name,(S_IRUGO | S_IWUSR), \
+ __ATTR(_name, 0644, \
sys_##_prefix##_##_name##_show, \
sys_##_prefix##_##_name##_store)
@@ -258,6 +292,16 @@ static __init enum ipl_type get_ipl_type(void)
return IPL_TYPE_FCP_DUMP;
else
return IPL_TYPE_FCP;
+ case IPL_PBT_NVME:
+ if (ipl_block.nvme.opt == IPL_PB0_NVME_OPT_DUMP)
+ return IPL_TYPE_NVME_DUMP;
+ else
+ return IPL_TYPE_NVME;
+ case IPL_PBT_ECKD:
+ if (ipl_block.eckd.opt == IPL_PB0_ECKD_OPT_DUMP)
+ return IPL_TYPE_ECKD_DUMP;
+ else
+ return IPL_TYPE_ECKD;
}
return IPL_TYPE_UNKNOWN;
}
@@ -302,7 +346,7 @@ static ssize_t ipl_vm_parm_show(struct kobject *kobj,
}
static struct kobj_attribute sys_ipl_vm_parm_attr =
- __ATTR(parm, S_IRUGO, ipl_vm_parm_show, NULL);
+ __ATTR(parm, 0444, ipl_vm_parm_show, NULL);
static ssize_t sys_ipl_device_show(struct kobject *kobj,
struct kobj_attribute *attr, char *page)
@@ -311,16 +355,23 @@ static ssize_t sys_ipl_device_show(struct kobject *kobj,
case IPL_TYPE_CCW:
return sprintf(page, "0.%x.%04x\n", ipl_block.ccw.ssid,
ipl_block.ccw.devno);
+ case IPL_TYPE_ECKD:
+ case IPL_TYPE_ECKD_DUMP:
+ return sprintf(page, "0.%x.%04x\n", ipl_block.eckd.ssid,
+ ipl_block.eckd.devno);
case IPL_TYPE_FCP:
case IPL_TYPE_FCP_DUMP:
return sprintf(page, "0.0.%04x\n", ipl_block.fcp.devno);
+ case IPL_TYPE_NVME:
+ case IPL_TYPE_NVME_DUMP:
+ return sprintf(page, "%08ux\n", ipl_block.nvme.fid);
default:
return 0;
}
}
static struct kobj_attribute sys_ipl_device_attr =
- __ATTR(device, S_IRUGO, sys_ipl_device_show, NULL);
+ __ATTR(device, 0444, sys_ipl_device_show, NULL);
static ssize_t ipl_parameter_read(struct file *filp, struct kobject *kobj,
struct bin_attribute *attr, char *buf,
@@ -330,7 +381,7 @@ static ssize_t ipl_parameter_read(struct file *filp, struct kobject *kobj,
ipl_block.hdr.len);
}
static struct bin_attribute ipl_parameter_attr =
- __BIN_ATTR(binary_parameter, S_IRUGO, ipl_parameter_read, NULL,
+ __BIN_ATTR(binary_parameter, 0444, ipl_parameter_read, NULL,
PAGE_SIZE);
static ssize_t ipl_scp_data_read(struct file *filp, struct kobject *kobj,
@@ -342,8 +393,35 @@ static ssize_t ipl_scp_data_read(struct file *filp, struct kobject *kobj,
return memory_read_from_buffer(buf, count, &off, scp_data, size);
}
+
+static ssize_t ipl_nvme_scp_data_read(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *attr, char *buf,
+ loff_t off, size_t count)
+{
+ unsigned int size = ipl_block.nvme.scp_data_len;
+ void *scp_data = &ipl_block.nvme.scp_data;
+
+ return memory_read_from_buffer(buf, count, &off, scp_data, size);
+}
+
+static ssize_t ipl_eckd_scp_data_read(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *attr, char *buf,
+ loff_t off, size_t count)
+{
+ unsigned int size = ipl_block.eckd.scp_data_len;
+ void *scp_data = &ipl_block.eckd.scp_data;
+
+ return memory_read_from_buffer(buf, count, &off, scp_data, size);
+}
+
static struct bin_attribute ipl_scp_data_attr =
- __BIN_ATTR(scp_data, S_IRUGO, ipl_scp_data_read, NULL, PAGE_SIZE);
+ __BIN_ATTR(scp_data, 0444, ipl_scp_data_read, NULL, PAGE_SIZE);
+
+static struct bin_attribute ipl_nvme_scp_data_attr =
+ __BIN_ATTR(scp_data, 0444, ipl_nvme_scp_data_read, NULL, PAGE_SIZE);
+
+static struct bin_attribute ipl_eckd_scp_data_attr =
+ __BIN_ATTR(scp_data, 0444, ipl_eckd_scp_data_read, NULL, PAGE_SIZE);
static struct bin_attribute *ipl_fcp_bin_attrs[] = {
&ipl_parameter_attr,
@@ -351,6 +429,18 @@ static struct bin_attribute *ipl_fcp_bin_attrs[] = {
NULL,
};
+static struct bin_attribute *ipl_nvme_bin_attrs[] = {
+ &ipl_parameter_attr,
+ &ipl_nvme_scp_data_attr,
+ NULL,
+};
+
+static struct bin_attribute *ipl_eckd_bin_attrs[] = {
+ &ipl_parameter_attr,
+ &ipl_eckd_scp_data_attr,
+ NULL,
+};
+
/* FCP ipl device attributes */
DEFINE_IPL_ATTR_RO(ipl_fcp, wwpn, "0x%016llx\n",
@@ -362,6 +452,94 @@ DEFINE_IPL_ATTR_RO(ipl_fcp, bootprog, "%lld\n",
DEFINE_IPL_ATTR_RO(ipl_fcp, br_lba, "%lld\n",
(unsigned long long)ipl_block.fcp.br_lba);
+/* NVMe ipl device attributes */
+DEFINE_IPL_ATTR_RO(ipl_nvme, fid, "0x%08llx\n",
+ (unsigned long long)ipl_block.nvme.fid);
+DEFINE_IPL_ATTR_RO(ipl_nvme, nsid, "0x%08llx\n",
+ (unsigned long long)ipl_block.nvme.nsid);
+DEFINE_IPL_ATTR_RO(ipl_nvme, bootprog, "%lld\n",
+ (unsigned long long)ipl_block.nvme.bootprog);
+DEFINE_IPL_ATTR_RO(ipl_nvme, br_lba, "%lld\n",
+ (unsigned long long)ipl_block.nvme.br_lba);
+
+/* ECKD ipl device attributes */
+DEFINE_IPL_ATTR_RO(ipl_eckd, bootprog, "%lld\n",
+ (unsigned long long)ipl_block.eckd.bootprog);
+
+#define IPL_ATTR_BR_CHR_SHOW_FN(_name, _ipb) \
+static ssize_t eckd_##_name##_br_chr_show(struct kobject *kobj, \
+ struct kobj_attribute *attr, \
+ char *buf) \
+{ \
+ struct ipl_pb0_eckd *ipb = &(_ipb); \
+ \
+ if (!ipb->br_chr.cyl && \
+ !ipb->br_chr.head && \
+ !ipb->br_chr.record) \
+ return sprintf(buf, "auto\n"); \
+ \
+ return sprintf(buf, "0x%x,0x%x,0x%x\n", \
+ ipb->br_chr.cyl, \
+ ipb->br_chr.head, \
+ ipb->br_chr.record); \
+}
+
+#define IPL_ATTR_BR_CHR_STORE_FN(_name, _ipb) \
+static ssize_t eckd_##_name##_br_chr_store(struct kobject *kobj, \
+ struct kobj_attribute *attr, \
+ const char *buf, size_t len) \
+{ \
+ struct ipl_pb0_eckd *ipb = &(_ipb); \
+ unsigned long args[3] = { 0 }; \
+ char *p, *p1, *tmp = NULL; \
+ int i, rc; \
+ \
+ if (!strncmp(buf, "auto", 4)) \
+ goto out; \
+ \
+ tmp = kstrdup(buf, GFP_KERNEL); \
+ p = tmp; \
+ for (i = 0; i < 3; i++) { \
+ p1 = strsep(&p, ", "); \
+ if (!p1) { \
+ rc = -EINVAL; \
+ goto err; \
+ } \
+ rc = kstrtoul(p1, 0, args + i); \
+ if (rc) \
+ goto err; \
+ } \
+ \
+ rc = -EINVAL; \
+ if (i != 3) \
+ goto err; \
+ \
+ if ((args[0] || args[1]) && !args[2]) \
+ goto err; \
+ \
+ if (args[0] > UINT_MAX || args[1] > 255 || args[2] > 255) \
+ goto err; \
+ \
+out: \
+ ipb->br_chr.cyl = args[0]; \
+ ipb->br_chr.head = args[1]; \
+ ipb->br_chr.record = args[2]; \
+ rc = len; \
+err: \
+ kfree(tmp); \
+ return rc; \
+}
+
+IPL_ATTR_BR_CHR_SHOW_FN(ipl, ipl_block.eckd);
+static struct kobj_attribute sys_ipl_eckd_br_chr_attr =
+ __ATTR(br_chr, 0644, eckd_ipl_br_chr_show, NULL);
+
+IPL_ATTR_BR_CHR_SHOW_FN(reipl, reipl_block_eckd->eckd);
+IPL_ATTR_BR_CHR_STORE_FN(reipl, reipl_block_eckd->eckd);
+
+static struct kobj_attribute sys_reipl_eckd_br_chr_attr =
+ __ATTR(br_chr, 0644, eckd_reipl_br_chr_show, eckd_reipl_br_chr_store);
+
static ssize_t ipl_ccw_loadparm_show(struct kobject *kobj,
struct kobj_attribute *attr, char *page)
{
@@ -379,15 +557,12 @@ static struct kobj_attribute sys_ipl_ccw_loadparm_attr =
__ATTR(loadparm, 0444, ipl_ccw_loadparm_show, NULL);
static struct attribute *ipl_fcp_attrs[] = {
- &sys_ipl_type_attr.attr,
&sys_ipl_device_attr.attr,
&sys_ipl_fcp_wwpn_attr.attr,
&sys_ipl_fcp_lun_attr.attr,
&sys_ipl_fcp_bootprog_attr.attr,
&sys_ipl_fcp_br_lba_attr.attr,
&sys_ipl_ccw_loadparm_attr.attr,
- &sys_ipl_secure_attr.attr,
- &sys_ipl_has_secure_attr.attr,
NULL,
};
@@ -396,24 +571,45 @@ static struct attribute_group ipl_fcp_attr_group = {
.bin_attrs = ipl_fcp_bin_attrs,
};
+static struct attribute *ipl_nvme_attrs[] = {
+ &sys_ipl_nvme_fid_attr.attr,
+ &sys_ipl_nvme_nsid_attr.attr,
+ &sys_ipl_nvme_bootprog_attr.attr,
+ &sys_ipl_nvme_br_lba_attr.attr,
+ &sys_ipl_ccw_loadparm_attr.attr,
+ NULL,
+};
+
+static struct attribute_group ipl_nvme_attr_group = {
+ .attrs = ipl_nvme_attrs,
+ .bin_attrs = ipl_nvme_bin_attrs,
+};
+
+static struct attribute *ipl_eckd_attrs[] = {
+ &sys_ipl_eckd_bootprog_attr.attr,
+ &sys_ipl_eckd_br_chr_attr.attr,
+ &sys_ipl_ccw_loadparm_attr.attr,
+ &sys_ipl_device_attr.attr,
+ NULL,
+};
+
+static struct attribute_group ipl_eckd_attr_group = {
+ .attrs = ipl_eckd_attrs,
+ .bin_attrs = ipl_eckd_bin_attrs,
+};
+
/* CCW ipl device attributes */
static struct attribute *ipl_ccw_attrs_vm[] = {
- &sys_ipl_type_attr.attr,
&sys_ipl_device_attr.attr,
&sys_ipl_ccw_loadparm_attr.attr,
&sys_ipl_vm_parm_attr.attr,
- &sys_ipl_secure_attr.attr,
- &sys_ipl_has_secure_attr.attr,
NULL,
};
static struct attribute *ipl_ccw_attrs_lpar[] = {
- &sys_ipl_type_attr.attr,
&sys_ipl_device_attr.attr,
&sys_ipl_ccw_loadparm_attr.attr,
- &sys_ipl_secure_attr.attr,
- &sys_ipl_has_secure_attr.attr,
NULL,
};
@@ -425,22 +621,21 @@ static struct attribute_group ipl_ccw_attr_group_lpar = {
.attrs = ipl_ccw_attrs_lpar
};
-/* UNKNOWN ipl device attributes */
-
-static struct attribute *ipl_unknown_attrs[] = {
+static struct attribute *ipl_common_attrs[] = {
&sys_ipl_type_attr.attr,
+ &sys_ipl_secure_attr.attr,
+ &sys_ipl_has_secure_attr.attr,
NULL,
};
-static struct attribute_group ipl_unknown_attr_group = {
- .attrs = ipl_unknown_attrs,
+static struct attribute_group ipl_common_attr_group = {
+ .attrs = ipl_common_attrs,
};
static struct kset *ipl_kset;
static void __ipl_run(void *unused)
{
- __bpon();
diag308(DIAG308_LOAD_CLEAR, NULL);
}
@@ -458,6 +653,9 @@ static int __init ipl_init(void)
rc = -ENOMEM;
goto out;
}
+ rc = sysfs_create_group(&ipl_kset->kobj, &ipl_common_attr_group);
+ if (rc)
+ goto out;
switch (ipl_info.type) {
case IPL_TYPE_CCW:
if (MACHINE_IS_VM)
@@ -467,13 +665,19 @@ static int __init ipl_init(void)
rc = sysfs_create_group(&ipl_kset->kobj,
&ipl_ccw_attr_group_lpar);
break;
+ case IPL_TYPE_ECKD:
+ case IPL_TYPE_ECKD_DUMP:
+ rc = sysfs_create_group(&ipl_kset->kobj, &ipl_eckd_attr_group);
+ break;
case IPL_TYPE_FCP:
case IPL_TYPE_FCP_DUMP:
rc = sysfs_create_group(&ipl_kset->kobj, &ipl_fcp_attr_group);
break;
+ case IPL_TYPE_NVME:
+ case IPL_TYPE_NVME_DUMP:
+ rc = sysfs_create_group(&ipl_kset->kobj, &ipl_nvme_attr_group);
+ break;
default:
- rc = sysfs_create_group(&ipl_kset->kobj,
- &ipl_unknown_attr_group);
break;
}
out:
@@ -564,11 +768,11 @@ static ssize_t reipl_ccw_vmparm_store(struct kobject *kobj,
}
static struct kobj_attribute sys_reipl_nss_vmparm_attr =
- __ATTR(parm, S_IRUGO | S_IWUSR, reipl_nss_vmparm_show,
- reipl_nss_vmparm_store);
+ __ATTR(parm, 0644, reipl_nss_vmparm_show,
+ reipl_nss_vmparm_store);
static struct kobj_attribute sys_reipl_ccw_vmparm_attr =
- __ATTR(parm, S_IRUGO | S_IWUSR, reipl_ccw_vmparm_show,
- reipl_ccw_vmparm_store);
+ __ATTR(parm, 0644, reipl_ccw_vmparm_show,
+ reipl_ccw_vmparm_store);
/* FCP reipl device attributes */
@@ -608,7 +812,7 @@ static ssize_t reipl_fcp_scpdata_write(struct file *filp, struct kobject *kobj,
return count;
}
static struct bin_attribute sys_reipl_fcp_scp_data_attr =
- __BIN_ATTR(scp_data, (S_IRUGO | S_IWUSR), reipl_fcp_scpdata_read,
+ __BIN_ATTR(scp_data, 0644, reipl_fcp_scpdata_read,
reipl_fcp_scpdata_write, DIAG308_SCPDATA_SIZE);
static struct bin_attribute *reipl_fcp_bin_attrs[] = {
@@ -673,24 +877,43 @@ static ssize_t reipl_generic_loadparm_store(struct ipl_parameter_block *ipb,
return len;
}
-/* FCP wrapper */
-static ssize_t reipl_fcp_loadparm_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *page)
+#define DEFINE_GENERIC_LOADPARM(name) \
+static ssize_t reipl_##name##_loadparm_show(struct kobject *kobj, \
+ struct kobj_attribute *attr, char *page) \
+{ \
+ return reipl_generic_loadparm_show(reipl_block_##name, page); \
+} \
+static ssize_t reipl_##name##_loadparm_store(struct kobject *kobj, \
+ struct kobj_attribute *attr, \
+ const char *buf, size_t len) \
+{ \
+ return reipl_generic_loadparm_store(reipl_block_##name, buf, len); \
+} \
+static struct kobj_attribute sys_reipl_##name##_loadparm_attr = \
+ __ATTR(loadparm, 0644, reipl_##name##_loadparm_show, \
+ reipl_##name##_loadparm_store)
+
+DEFINE_GENERIC_LOADPARM(fcp);
+DEFINE_GENERIC_LOADPARM(nvme);
+DEFINE_GENERIC_LOADPARM(ccw);
+DEFINE_GENERIC_LOADPARM(nss);
+DEFINE_GENERIC_LOADPARM(eckd);
+
+static ssize_t reipl_fcp_clear_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *page)
{
- return reipl_generic_loadparm_show(reipl_block_fcp, page);
+ return sprintf(page, "%u\n", reipl_fcp_clear);
}
-static ssize_t reipl_fcp_loadparm_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t len)
+static ssize_t reipl_fcp_clear_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t len)
{
- return reipl_generic_loadparm_store(reipl_block_fcp, buf, len);
+ if (kstrtobool(buf, &reipl_fcp_clear) < 0)
+ return -EINVAL;
+ return len;
}
-static struct kobj_attribute sys_reipl_fcp_loadparm_attr =
- __ATTR(loadparm, S_IRUGO | S_IWUSR, reipl_fcp_loadparm_show,
- reipl_fcp_loadparm_store);
-
static struct attribute *reipl_fcp_attrs[] = {
&sys_reipl_fcp_device_attr.attr,
&sys_reipl_fcp_wwpn_attr.attr,
@@ -706,51 +929,129 @@ static struct attribute_group reipl_fcp_attr_group = {
.bin_attrs = reipl_fcp_bin_attrs,
};
-/* CCW reipl device attributes */
-DEFINE_IPL_CCW_ATTR_RW(reipl_ccw, device, reipl_block_ccw->ccw);
+static struct kobj_attribute sys_reipl_fcp_clear_attr =
+ __ATTR(clear, 0644, reipl_fcp_clear_show, reipl_fcp_clear_store);
-/* NSS wrapper */
-static ssize_t reipl_nss_loadparm_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *page)
+/* NVME reipl device attributes */
+
+static ssize_t reipl_nvme_scpdata_read(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *attr,
+ char *buf, loff_t off, size_t count)
{
- return reipl_generic_loadparm_show(reipl_block_nss, page);
+ size_t size = reipl_block_nvme->nvme.scp_data_len;
+ void *scp_data = reipl_block_nvme->nvme.scp_data;
+
+ return memory_read_from_buffer(buf, count, &off, scp_data, size);
}
-static ssize_t reipl_nss_loadparm_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t len)
+static ssize_t reipl_nvme_scpdata_write(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *attr,
+ char *buf, loff_t off, size_t count)
{
- return reipl_generic_loadparm_store(reipl_block_nss, buf, len);
+ size_t scpdata_len = count;
+ size_t padding;
+
+ if (off)
+ return -EINVAL;
+
+ memcpy(reipl_block_nvme->nvme.scp_data, buf, count);
+ if (scpdata_len % 8) {
+ padding = 8 - (scpdata_len % 8);
+ memset(reipl_block_nvme->nvme.scp_data + scpdata_len,
+ 0, padding);
+ scpdata_len += padding;
+ }
+
+ reipl_block_nvme->hdr.len = IPL_BP_FCP_LEN + scpdata_len;
+ reipl_block_nvme->nvme.len = IPL_BP0_FCP_LEN + scpdata_len;
+ reipl_block_nvme->nvme.scp_data_len = scpdata_len;
+
+ return count;
}
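
Each of the scp_data write helpers (FCP above, NVMe here, ECKD further down) pads the user-supplied SCP data with zero bytes up to the next multiple of 8 before the header and block lengths are recomputed. The padding step in isolation (helper name invented for illustration; the caller must guarantee room for up to 7 extra bytes):

	#include <string.h>	/* <linux/string.h> in kernel context */

	static size_t scpdata_pad8(unsigned char *data, size_t len)
	{
		size_t padding;

		if (len % 8) {
			padding = 8 - (len % 8);
			memset(data + len, 0, padding);
			len += padding;
		}
		return len;
	}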
-/* CCW wrapper */
-static ssize_t reipl_ccw_loadparm_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *page)
+static struct bin_attribute sys_reipl_nvme_scp_data_attr =
+ __BIN_ATTR(scp_data, 0644, reipl_nvme_scpdata_read,
+ reipl_nvme_scpdata_write, DIAG308_SCPDATA_SIZE);
+
+static struct bin_attribute *reipl_nvme_bin_attrs[] = {
+ &sys_reipl_nvme_scp_data_attr,
+ NULL,
+};
+
+DEFINE_IPL_ATTR_RW(reipl_nvme, fid, "0x%08llx\n", "%llx\n",
+ reipl_block_nvme->nvme.fid);
+DEFINE_IPL_ATTR_RW(reipl_nvme, nsid, "0x%08llx\n", "%llx\n",
+ reipl_block_nvme->nvme.nsid);
+DEFINE_IPL_ATTR_RW(reipl_nvme, bootprog, "%lld\n", "%lld\n",
+ reipl_block_nvme->nvme.bootprog);
+DEFINE_IPL_ATTR_RW(reipl_nvme, br_lba, "%lld\n", "%lld\n",
+ reipl_block_nvme->nvme.br_lba);
+
+static struct attribute *reipl_nvme_attrs[] = {
+ &sys_reipl_nvme_fid_attr.attr,
+ &sys_reipl_nvme_nsid_attr.attr,
+ &sys_reipl_nvme_bootprog_attr.attr,
+ &sys_reipl_nvme_br_lba_attr.attr,
+ &sys_reipl_nvme_loadparm_attr.attr,
+ NULL,
+};
+
+static struct attribute_group reipl_nvme_attr_group = {
+ .attrs = reipl_nvme_attrs,
+ .bin_attrs = reipl_nvme_bin_attrs
+};
+
+static ssize_t reipl_nvme_clear_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *page)
+{
+ return sprintf(page, "%u\n", reipl_nvme_clear);
+}
+
+static ssize_t reipl_nvme_clear_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ if (kstrtobool(buf, &reipl_nvme_clear) < 0)
+ return -EINVAL;
+ return len;
+}
+
+static struct kobj_attribute sys_reipl_nvme_clear_attr =
+ __ATTR(clear, 0644, reipl_nvme_clear_show, reipl_nvme_clear_store);
+
+/* CCW reipl device attributes */
+DEFINE_IPL_CCW_ATTR_RW(reipl_ccw, device, reipl_block_ccw->ccw);
+
+static ssize_t reipl_ccw_clear_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *page)
{
- return reipl_generic_loadparm_show(reipl_block_ccw, page);
+ return sprintf(page, "%u\n", reipl_ccw_clear);
}
-static ssize_t reipl_ccw_loadparm_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t len)
+static ssize_t reipl_ccw_clear_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t len)
{
- return reipl_generic_loadparm_store(reipl_block_ccw, buf, len);
+ if (kstrtobool(buf, &reipl_ccw_clear) < 0)
+ return -EINVAL;
+ return len;
}
-static struct kobj_attribute sys_reipl_ccw_loadparm_attr =
- __ATTR(loadparm, S_IRUGO | S_IWUSR, reipl_ccw_loadparm_show,
- reipl_ccw_loadparm_store);
+static struct kobj_attribute sys_reipl_ccw_clear_attr =
+ __ATTR(clear, 0644, reipl_ccw_clear_show, reipl_ccw_clear_store);
static struct attribute *reipl_ccw_attrs_vm[] = {
&sys_reipl_ccw_device_attr.attr,
&sys_reipl_ccw_loadparm_attr.attr,
&sys_reipl_ccw_vmparm_attr.attr,
+ &sys_reipl_ccw_clear_attr.attr,
NULL,
};
static struct attribute *reipl_ccw_attrs_lpar[] = {
&sys_reipl_ccw_device_attr.attr,
&sys_reipl_ccw_loadparm_attr.attr,
+ &sys_reipl_ccw_clear_attr.attr,
NULL,
};
@@ -764,6 +1065,86 @@ static struct attribute_group reipl_ccw_attr_group_lpar = {
.attrs = reipl_ccw_attrs_lpar,
};
+/* ECKD reipl device attributes */
+
+static ssize_t reipl_eckd_scpdata_read(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *attr,
+ char *buf, loff_t off, size_t count)
+{
+ size_t size = reipl_block_eckd->eckd.scp_data_len;
+ void *scp_data = reipl_block_eckd->eckd.scp_data;
+
+ return memory_read_from_buffer(buf, count, &off, scp_data, size);
+}
+
+static ssize_t reipl_eckd_scpdata_write(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *attr,
+ char *buf, loff_t off, size_t count)
+{
+ size_t scpdata_len = count;
+ size_t padding;
+
+ if (off)
+ return -EINVAL;
+
+ memcpy(reipl_block_eckd->eckd.scp_data, buf, count);
+ if (scpdata_len % 8) {
+ padding = 8 - (scpdata_len % 8);
+ memset(reipl_block_eckd->eckd.scp_data + scpdata_len,
+ 0, padding);
+ scpdata_len += padding;
+ }
+
+ reipl_block_eckd->hdr.len = IPL_BP_ECKD_LEN + scpdata_len;
+ reipl_block_eckd->eckd.len = IPL_BP0_ECKD_LEN + scpdata_len;
+ reipl_block_eckd->eckd.scp_data_len = scpdata_len;
+
+ return count;
+}
+
+static struct bin_attribute sys_reipl_eckd_scp_data_attr =
+ __BIN_ATTR(scp_data, 0644, reipl_eckd_scpdata_read,
+ reipl_eckd_scpdata_write, DIAG308_SCPDATA_SIZE);
+
+static struct bin_attribute *reipl_eckd_bin_attrs[] = {
+ &sys_reipl_eckd_scp_data_attr,
+ NULL,
+};
+
+DEFINE_IPL_CCW_ATTR_RW(reipl_eckd, device, reipl_block_eckd->eckd);
+DEFINE_IPL_ATTR_RW(reipl_eckd, bootprog, "%lld\n", "%lld\n",
+ reipl_block_eckd->eckd.bootprog);
+
+static struct attribute *reipl_eckd_attrs[] = {
+ &sys_reipl_eckd_device_attr.attr,
+ &sys_reipl_eckd_bootprog_attr.attr,
+ &sys_reipl_eckd_br_chr_attr.attr,
+ &sys_reipl_eckd_loadparm_attr.attr,
+ NULL,
+};
+
+static struct attribute_group reipl_eckd_attr_group = {
+ .attrs = reipl_eckd_attrs,
+ .bin_attrs = reipl_eckd_bin_attrs
+};
+
+static ssize_t reipl_eckd_clear_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *page)
+{
+ return sprintf(page, "%u\n", reipl_eckd_clear);
+}
+
+static ssize_t reipl_eckd_clear_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ if (kstrtobool(buf, &reipl_eckd_clear) < 0)
+ return -EINVAL;
+ return len;
+}
+
+static struct kobj_attribute sys_reipl_eckd_clear_attr =
+ __ATTR(clear, 0644, reipl_eckd_clear_show, reipl_eckd_clear_store);
/* NSS reipl device attributes */
static void reipl_get_ascii_nss_name(char *dst,
@@ -811,12 +1192,8 @@ static ssize_t reipl_nss_name_store(struct kobject *kobj,
}
static struct kobj_attribute sys_reipl_nss_name_attr =
- __ATTR(name, S_IRUGO | S_IWUSR, reipl_nss_name_show,
- reipl_nss_name_store);
-
-static struct kobj_attribute sys_reipl_nss_loadparm_attr =
- __ATTR(loadparm, S_IRUGO | S_IWUSR, reipl_nss_loadparm_show,
- reipl_nss_loadparm_store);
+ __ATTR(name, 0644, reipl_nss_name_show,
+ reipl_nss_name_store);
static struct attribute *reipl_nss_attrs[] = {
&sys_reipl_nss_name_attr.attr,
@@ -847,9 +1224,15 @@ static int reipl_set_type(enum ipl_type type)
case IPL_TYPE_CCW:
reipl_block_actual = reipl_block_ccw;
break;
+ case IPL_TYPE_ECKD:
+ reipl_block_actual = reipl_block_eckd;
+ break;
case IPL_TYPE_FCP:
reipl_block_actual = reipl_block_fcp;
break;
+ case IPL_TYPE_NVME:
+ reipl_block_actual = reipl_block_nvme;
+ break;
case IPL_TYPE_NSS:
reipl_block_actual = reipl_block_nss;
break;
@@ -874,8 +1257,12 @@ static ssize_t reipl_type_store(struct kobject *kobj,
if (strncmp(buf, IPL_CCW_STR, strlen(IPL_CCW_STR)) == 0)
rc = reipl_set_type(IPL_TYPE_CCW);
+ else if (strncmp(buf, IPL_ECKD_STR, strlen(IPL_ECKD_STR)) == 0)
+ rc = reipl_set_type(IPL_TYPE_ECKD);
else if (strncmp(buf, IPL_FCP_STR, strlen(IPL_FCP_STR)) == 0)
rc = reipl_set_type(IPL_TYPE_FCP);
+ else if (strncmp(buf, IPL_NVME_STR, strlen(IPL_NVME_STR)) == 0)
+ rc = reipl_set_type(IPL_TYPE_NVME);
else if (strncmp(buf, IPL_NSS_STR, strlen(IPL_NSS_STR)) == 0)
rc = reipl_set_type(IPL_TYPE_NSS);
return (rc != 0) ? rc : len;
@@ -886,17 +1273,39 @@ static struct kobj_attribute reipl_type_attr =
static struct kset *reipl_kset;
static struct kset *reipl_fcp_kset;
+static struct kset *reipl_nvme_kset;
+static struct kset *reipl_eckd_kset;
static void __reipl_run(void *unused)
{
switch (reipl_type) {
case IPL_TYPE_CCW:
diag308(DIAG308_SET, reipl_block_ccw);
- diag308(DIAG308_LOAD_CLEAR, NULL);
+ if (reipl_ccw_clear)
+ diag308(DIAG308_LOAD_CLEAR, NULL);
+ else
+ diag308(DIAG308_LOAD_NORMAL_DUMP, NULL);
+ break;
+ case IPL_TYPE_ECKD:
+ diag308(DIAG308_SET, reipl_block_eckd);
+ if (reipl_eckd_clear)
+ diag308(DIAG308_LOAD_CLEAR, NULL);
+ else
+ diag308(DIAG308_LOAD_NORMAL, NULL);
break;
case IPL_TYPE_FCP:
diag308(DIAG308_SET, reipl_block_fcp);
- diag308(DIAG308_LOAD_CLEAR, NULL);
+ if (reipl_fcp_clear)
+ diag308(DIAG308_LOAD_CLEAR, NULL);
+ else
+ diag308(DIAG308_LOAD_NORMAL, NULL);
+ break;
+ case IPL_TYPE_NVME:
+ diag308(DIAG308_SET, reipl_block_nvme);
+ if (reipl_nvme_clear)
+ diag308(DIAG308_LOAD_CLEAR, NULL);
+ else
+ diag308(DIAG308_LOAD_NORMAL, NULL);
break;
case IPL_TYPE_NSS:
diag308(DIAG308_SET, reipl_block_nss);
@@ -906,6 +1315,8 @@ static void __reipl_run(void *unused)
diag308(DIAG308_LOAD_CLEAR, NULL);
break;
case IPL_TYPE_FCP_DUMP:
+ case IPL_TYPE_NVME_DUMP:
+ case IPL_TYPE_ECKD_DUMP:
break;
}
disabled_wait();
@@ -1008,10 +1419,16 @@ static int __init reipl_fcp_init(void)
}
rc = sysfs_create_group(&reipl_fcp_kset->kobj, &reipl_fcp_attr_group);
- if (rc) {
- kset_unregister(reipl_fcp_kset);
- free_page((unsigned long) reipl_block_fcp);
- return rc;
+ if (rc)
+ goto out1;
+
+ if (test_facility(141)) {
+ rc = sysfs_create_file(&reipl_fcp_kset->kobj,
+ &sys_reipl_fcp_clear_attr.attr);
+ if (rc)
+ goto out2;
+ } else {
+ reipl_fcp_clear = true;
}
if (ipl_info.type == IPL_TYPE_FCP) {
@@ -1032,6 +1449,121 @@ static int __init reipl_fcp_init(void)
}
reipl_capabilities |= IPL_TYPE_FCP;
return 0;
+
+out2:
+ sysfs_remove_group(&reipl_fcp_kset->kobj, &reipl_fcp_attr_group);
+out1:
+ kset_unregister(reipl_fcp_kset);
+ free_page((unsigned long) reipl_block_fcp);
+ return rc;
+}
+
+static int __init reipl_nvme_init(void)
+{
+ int rc;
+
+ reipl_block_nvme = (void *) get_zeroed_page(GFP_KERNEL);
+ if (!reipl_block_nvme)
+ return -ENOMEM;
+
+ /* sysfs: create kset for mixing attr group and bin attrs */
+ reipl_nvme_kset = kset_create_and_add(IPL_NVME_STR, NULL,
+ &reipl_kset->kobj);
+ if (!reipl_nvme_kset) {
+ free_page((unsigned long) reipl_block_nvme);
+ return -ENOMEM;
+ }
+
+ rc = sysfs_create_group(&reipl_nvme_kset->kobj, &reipl_nvme_attr_group);
+ if (rc)
+ goto out1;
+
+ if (test_facility(141)) {
+ rc = sysfs_create_file(&reipl_nvme_kset->kobj,
+ &sys_reipl_nvme_clear_attr.attr);
+ if (rc)
+ goto out2;
+ } else {
+ reipl_nvme_clear = true;
+ }
+
+ if (ipl_info.type == IPL_TYPE_NVME) {
+ memcpy(reipl_block_nvme, &ipl_block, sizeof(ipl_block));
+ /*
+ * Fix loadparm: There are systems where the (SCSI) LOADPARM
+ * is invalid in the IPL parameter block, so take it
+ * always from sclp_ipl_info.
+ */
+ memcpy(reipl_block_nvme->nvme.loadparm, sclp_ipl_info.loadparm,
+ LOADPARM_LEN);
+ } else {
+ reipl_block_nvme->hdr.len = IPL_BP_NVME_LEN;
+ reipl_block_nvme->hdr.version = IPL_PARM_BLOCK_VERSION;
+ reipl_block_nvme->nvme.len = IPL_BP0_NVME_LEN;
+ reipl_block_nvme->nvme.pbt = IPL_PBT_NVME;
+ reipl_block_nvme->nvme.opt = IPL_PB0_NVME_OPT_IPL;
+ }
+ reipl_capabilities |= IPL_TYPE_NVME;
+ return 0;
+
+out2:
+ sysfs_remove_group(&reipl_nvme_kset->kobj, &reipl_nvme_attr_group);
+out1:
+ kset_unregister(reipl_nvme_kset);
+ free_page((unsigned long) reipl_block_nvme);
+ return rc;
+}
+
+static int __init reipl_eckd_init(void)
+{
+ int rc;
+
+ if (!sclp.has_sipl_eckd)
+ return 0;
+
+ reipl_block_eckd = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!reipl_block_eckd)
+ return -ENOMEM;
+
+ /* sysfs: create kset for mixing attr group and bin attrs */
+ reipl_eckd_kset = kset_create_and_add(IPL_ECKD_STR, NULL,
+ &reipl_kset->kobj);
+ if (!reipl_eckd_kset) {
+ free_page((unsigned long)reipl_block_eckd);
+ return -ENOMEM;
+ }
+
+ rc = sysfs_create_group(&reipl_eckd_kset->kobj, &reipl_eckd_attr_group);
+ if (rc)
+ goto out1;
+
+ if (test_facility(141)) {
+ rc = sysfs_create_file(&reipl_eckd_kset->kobj,
+ &sys_reipl_eckd_clear_attr.attr);
+ if (rc)
+ goto out2;
+ } else {
+ reipl_eckd_clear = true;
+ }
+
+ if (ipl_info.type == IPL_TYPE_ECKD) {
+ memcpy(reipl_block_eckd, &ipl_block, sizeof(ipl_block));
+ } else {
+ reipl_block_eckd->hdr.len = IPL_BP_ECKD_LEN;
+ reipl_block_eckd->hdr.version = IPL_PARM_BLOCK_VERSION;
+ reipl_block_eckd->eckd.len = IPL_BP0_ECKD_LEN;
+ reipl_block_eckd->eckd.pbt = IPL_PBT_ECKD;
+ reipl_block_eckd->eckd.opt = IPL_PB0_ECKD_OPT_IPL;
+ }
+ reipl_capabilities |= IPL_TYPE_ECKD;
+ return 0;
+
+out2:
+ sysfs_remove_group(&reipl_eckd_kset->kobj, &reipl_eckd_attr_group);
+out1:
+ kset_unregister(reipl_eckd_kset);
+ free_page((unsigned long)reipl_block_eckd);
+ return rc;
}
static int __init reipl_type_init(void)
@@ -1049,9 +1581,15 @@ static int __init reipl_type_init(void)
if (reipl_block->pb0_hdr.pbt == IPL_PBT_FCP) {
memcpy(reipl_block_fcp, reipl_block, size);
reipl_type = IPL_TYPE_FCP;
+ } else if (reipl_block->pb0_hdr.pbt == IPL_PBT_NVME) {
+ memcpy(reipl_block_nvme, reipl_block, size);
+ reipl_type = IPL_TYPE_NVME;
} else if (reipl_block->pb0_hdr.pbt == IPL_PBT_CCW) {
memcpy(reipl_block_ccw, reipl_block, size);
reipl_type = IPL_TYPE_CCW;
+ } else if (reipl_block->pb0_hdr.pbt == IPL_PBT_ECKD) {
+ memcpy(reipl_block_eckd, reipl_block, size);
+ reipl_type = IPL_TYPE_ECKD;
}
out:
return reipl_set_type(reipl_type);
@@ -1072,9 +1610,15 @@ static int __init reipl_init(void)
rc = reipl_ccw_init();
if (rc)
return rc;
+ rc = reipl_eckd_init();
+ if (rc)
+ return rc;
rc = reipl_fcp_init();
if (rc)
return rc;
+ rc = reipl_nvme_init();
+ if (rc)
+ return rc;
rc = reipl_nss_init();
if (rc)
return rc;
@@ -1118,6 +1662,52 @@ static struct attribute_group dump_fcp_attr_group = {
.attrs = dump_fcp_attrs,
};
+/* NVME dump device attributes */
+DEFINE_IPL_ATTR_RW(dump_nvme, fid, "0x%08llx\n", "%llx\n",
+ dump_block_nvme->nvme.fid);
+DEFINE_IPL_ATTR_RW(dump_nvme, nsid, "0x%08llx\n", "%llx\n",
+ dump_block_nvme->nvme.nsid);
+DEFINE_IPL_ATTR_RW(dump_nvme, bootprog, "%lld\n", "%llx\n",
+ dump_block_nvme->nvme.bootprog);
+DEFINE_IPL_ATTR_RW(dump_nvme, br_lba, "%lld\n", "%llx\n",
+ dump_block_nvme->nvme.br_lba);
+
+static struct attribute *dump_nvme_attrs[] = {
+ &sys_dump_nvme_fid_attr.attr,
+ &sys_dump_nvme_nsid_attr.attr,
+ &sys_dump_nvme_bootprog_attr.attr,
+ &sys_dump_nvme_br_lba_attr.attr,
+ NULL,
+};
+
+static struct attribute_group dump_nvme_attr_group = {
+ .name = IPL_NVME_STR,
+ .attrs = dump_nvme_attrs,
+};
+
+/* ECKD dump device attributes */
+DEFINE_IPL_CCW_ATTR_RW(dump_eckd, device, dump_block_eckd->eckd);
+DEFINE_IPL_ATTR_RW(dump_eckd, bootprog, "%lld\n", "%llx\n",
+ dump_block_eckd->eckd.bootprog);
+
+IPL_ATTR_BR_CHR_SHOW_FN(dump, dump_block_eckd->eckd);
+IPL_ATTR_BR_CHR_STORE_FN(dump, dump_block_eckd->eckd);
+
+static struct kobj_attribute sys_dump_eckd_br_chr_attr =
+ __ATTR(br_chr, 0644, eckd_dump_br_chr_show, eckd_dump_br_chr_store);
+
+static struct attribute *dump_eckd_attrs[] = {
+ &sys_dump_eckd_device_attr.attr,
+ &sys_dump_eckd_bootprog_attr.attr,
+ &sys_dump_eckd_br_chr_attr.attr,
+ NULL,
+};
+
+static struct attribute_group dump_eckd_attr_group = {
+ .name = IPL_ECKD_STR,
+ .attrs = dump_eckd_attrs,
+};
+
/* CCW dump device attributes */
DEFINE_IPL_CCW_ATTR_RW(dump_ccw, device, dump_block_ccw->ccw);
@@ -1157,8 +1747,12 @@ static ssize_t dump_type_store(struct kobject *kobj,
rc = dump_set_type(DUMP_TYPE_NONE);
else if (strncmp(buf, DUMP_CCW_STR, strlen(DUMP_CCW_STR)) == 0)
rc = dump_set_type(DUMP_TYPE_CCW);
+ else if (strncmp(buf, DUMP_ECKD_STR, strlen(DUMP_ECKD_STR)) == 0)
+ rc = dump_set_type(DUMP_TYPE_ECKD);
else if (strncmp(buf, DUMP_FCP_STR, strlen(DUMP_FCP_STR)) == 0)
rc = dump_set_type(DUMP_TYPE_FCP);
+ else if (strncmp(buf, DUMP_NVME_STR, strlen(DUMP_NVME_STR)) == 0)
+ rc = dump_set_type(DUMP_TYPE_NVME);
return (rc != 0) ? rc : len;
}
@@ -1173,7 +1767,7 @@ static void diag308_dump(void *dump_block)
while (1) {
if (diag308(DIAG308_LOAD_NORMAL_DUMP, NULL) != 0x302)
break;
- udelay_simple(USEC_PER_SEC);
+ udelay(USEC_PER_SEC);
}
}
@@ -1183,9 +1777,15 @@ static void __dump_run(void *unused)
case DUMP_TYPE_CCW:
diag308_dump(dump_block_ccw);
break;
+ case DUMP_TYPE_ECKD:
+ diag308_dump(dump_block_eckd);
+ break;
case DUMP_TYPE_FCP:
diag308_dump(dump_block_fcp);
break;
+ case DUMP_TYPE_NVME:
+ diag308_dump(dump_block_nvme);
+ break;
default:
break;
}
@@ -1242,6 +1842,52 @@ static int __init dump_fcp_init(void)
return 0;
}
+static int __init dump_nvme_init(void)
+{
+ int rc;
+
+ if (!sclp_ipl_info.has_dump)
+ return 0; /* LDIPL DUMP is not installed */
+ dump_block_nvme = (void *) get_zeroed_page(GFP_KERNEL);
+ if (!dump_block_nvme)
+ return -ENOMEM;
+ rc = sysfs_create_group(&dump_kset->kobj, &dump_nvme_attr_group);
+ if (rc) {
+ free_page((unsigned long)dump_block_nvme);
+ return rc;
+ }
+ dump_block_nvme->hdr.len = IPL_BP_NVME_LEN;
+ dump_block_nvme->hdr.version = IPL_PARM_BLOCK_VERSION;
+ dump_block_nvme->fcp.len = IPL_BP0_NVME_LEN;
+ dump_block_nvme->fcp.pbt = IPL_PBT_NVME;
+ dump_block_nvme->fcp.opt = IPL_PB0_NVME_OPT_DUMP;
+ dump_capabilities |= DUMP_TYPE_NVME;
+ return 0;
+}
+
+static int __init dump_eckd_init(void)
+{
+ int rc;
+
+ if (!sclp_ipl_info.has_dump || !sclp.has_sipl_eckd)
+ return 0; /* LDIPL DUMP is not installed */
+ dump_block_eckd = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!dump_block_eckd)
+ return -ENOMEM;
+ rc = sysfs_create_group(&dump_kset->kobj, &dump_eckd_attr_group);
+ if (rc) {
+ free_page((unsigned long)dump_block_eckd);
+ return rc;
+ }
+ dump_block_eckd->hdr.len = IPL_BP_ECKD_LEN;
+ dump_block_eckd->hdr.version = IPL_PARM_BLOCK_VERSION;
+ dump_block_eckd->eckd.len = IPL_BP0_ECKD_LEN;
+ dump_block_eckd->eckd.pbt = IPL_PBT_ECKD;
+ dump_block_eckd->eckd.opt = IPL_PB0_ECKD_OPT_DUMP;
+ dump_capabilities |= DUMP_TYPE_ECKD;
+ return 0;
+}
+
static int __init dump_init(void)
{
int rc;
@@ -1257,9 +1903,15 @@ static int __init dump_init(void)
rc = dump_ccw_init();
if (rc)
return rc;
+ rc = dump_eckd_init();
+ if (rc)
+ return rc;
rc = dump_fcp_init();
if (rc)
return rc;
+ rc = dump_nvme_init();
+ if (rc)
+ return rc;
dump_set_type(DUMP_TYPE_NONE);
return 0;
}
@@ -1272,13 +1924,29 @@ static struct shutdown_action __refdata dump_action = {
static void dump_reipl_run(struct shutdown_trigger *trigger)
{
- unsigned long ipib = (unsigned long) reipl_block_actual;
+ struct lowcore *abs_lc;
unsigned int csum;
+ /*
+ * Set REIPL_CLEAR flag in os_info flags entry indicating
+ * 'clear' sysfs attribute has been set on the panicked system
+ * for specified reipl type.
+ * Always set for IPL_TYPE_NSS and IPL_TYPE_UNKNOWN.
+ */
+ if ((reipl_type == IPL_TYPE_CCW && reipl_ccw_clear) ||
+ (reipl_type == IPL_TYPE_ECKD && reipl_eckd_clear) ||
+ (reipl_type == IPL_TYPE_FCP && reipl_fcp_clear) ||
+ (reipl_type == IPL_TYPE_NVME && reipl_nvme_clear) ||
+ reipl_type == IPL_TYPE_NSS ||
+ reipl_type == IPL_TYPE_UNKNOWN)
+ os_info_flags |= OS_INFO_FLAG_REIPL_CLEAR;
+ os_info_entry_add(OS_INFO_FLAGS_ENTRY, &os_info_flags, sizeof(os_info_flags));
csum = (__force unsigned int)
csum_partial(reipl_block_actual, reipl_block_actual->hdr.len, 0);
- mem_assign_absolute(S390_lowcore.ipib, ipib);
- mem_assign_absolute(S390_lowcore.ipib_checksum, csum);
+ abs_lc = get_abs_lowcore();
+ abs_lc->ipib = __pa(reipl_block_actual);
+ abs_lc->ipib_checksum = csum;
+ put_abs_lowcore(abs_lc);
dump_run(trigger);
}
@@ -1472,7 +2140,6 @@ static struct kobj_attribute on_restart_attr = __ATTR_RW(on_restart);
static void __do_restart(void *ignore)
{
- __arch_local_irq_stosm(0x04); /* enable DAT */
smp_send_stop();
#ifdef CONFIG_CRASH_DUMP
crash_kexec(NULL);
@@ -1481,12 +2148,12 @@ static void __do_restart(void *ignore)
stop_run(&on_restart_trigger);
}
-void do_restart(void)
+void do_restart(void *arg)
{
tracing_off();
debug_locks_off();
lgr_info_log();
- smp_call_online_cpu(__do_restart, NULL);
+ smp_call_online_cpu(__do_restart, arg);
}
/* on halt */
@@ -1684,6 +2351,11 @@ void __init setup_ipl(void)
ipl_info.data.ccw.dev_id.ssid = ipl_block.ccw.ssid;
ipl_info.data.ccw.dev_id.devno = ipl_block.ccw.devno;
break;
+ case IPL_TYPE_ECKD:
+ case IPL_TYPE_ECKD_DUMP:
+ ipl_info.data.eckd.dev_id.ssid = ipl_block.eckd.ssid;
+ ipl_info.data.eckd.dev_id.devno = ipl_block.eckd.devno;
+ break;
case IPL_TYPE_FCP:
case IPL_TYPE_FCP_DUMP:
ipl_info.data.fcp.dev_id.ssid = 0;
@@ -1691,6 +2363,11 @@ void __init setup_ipl(void)
ipl_info.data.fcp.wwpn = ipl_block.fcp.wwpn;
ipl_info.data.fcp.lun = ipl_block.fcp.lun;
break;
+ case IPL_TYPE_NVME:
+ case IPL_TYPE_NVME_DUMP:
+ ipl_info.data.nvme.fid = ipl_block.nvme.fid;
+ ipl_info.data.nvme.nsid = ipl_block.nvme.nsid;
+ break;
case IPL_TYPE_NSS:
case IPL_TYPE_UNKNOWN:
/* We have no info to copy */
@@ -1705,8 +2382,8 @@ void s390_reset_system(void)
set_prefix(0);
/* Disable lowcore protection */
- __ctl_clear_bit(0, 28);
- diag_dma_ops.diag308_reset();
+ local_ctl_clear_bit(0, CR0_LOW_ADDRESS_PROTECTION_BIT);
+ diag_amode31_ops.diag308_reset();
}
#ifdef CONFIG_KEXEC_FILE
@@ -1783,7 +2460,7 @@ void *ipl_report_finish(struct ipl_report *report)
buf = vzalloc(report->size);
if (!buf)
- return ERR_PTR(-ENOMEM);
+ goto out;
ptr = buf;
memcpy(ptr, report->ipib, report->ipib->hdr.len);
@@ -1822,6 +2499,7 @@ void *ipl_report_finish(struct ipl_report *report)
}
BUG_ON(ptr > buf + report->size);
+out:
return buf;
}
diff --git a/arch/s390/kernel/ipl_vmparm.c b/arch/s390/kernel/ipl_vmparm.c
index af43535a976d..b5245fadcfb0 100644
--- a/arch/s390/kernel/ipl_vmparm.c
+++ b/arch/s390/kernel/ipl_vmparm.c
@@ -1,4 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/minmax.h>
+#include <linux/string.h>
#include <asm/ebcdic.h>
#include <asm/ipl.h>
diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c
index 8371855042dc..6f71b0ce1068 100644
--- a/arch/s390/kernel/irq.c
+++ b/arch/s390/kernel/irq.c
@@ -21,12 +21,14 @@
#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/irq.h>
+#include <linux/entry-common.h>
#include <asm/irq_regs.h>
#include <asm/cputime.h>
#include <asm/lowcore.h>
#include <asm/irq.h>
#include <asm/hw_irq.h>
#include <asm/stacktrace.h>
+#include <asm/softirq_stack.h>
#include "entry.h"
DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_stat, irq_stat);
@@ -95,27 +97,106 @@ static const struct irq_class irqclass_sub_desc[] = {
{.irq = CPU_RST, .name = "RST", .desc = "[CPU] CPU Restart"},
};
-void __init init_IRQ(void)
+static void do_IRQ(struct pt_regs *regs, int irq)
{
- BUILD_BUG_ON(ARRAY_SIZE(irqclass_sub_desc) != NR_ARCH_IRQS);
- init_cio_interrupts();
- init_airq_interrupts();
- init_ext_interrupts();
-}
-
-void do_IRQ(struct pt_regs *regs, int irq)
-{
- struct pt_regs *old_regs;
-
- old_regs = set_irq_regs(regs);
- irq_enter();
if (tod_after_eq(S390_lowcore.int_clock,
S390_lowcore.clock_comparator))
/* Serve timer interrupts first. */
clock_comparator_work();
generic_handle_irq(irq);
- irq_exit();
+}
+
+static int on_async_stack(void)
+{
+ unsigned long frame = current_frame_address();
+
+ return ((S390_lowcore.async_stack ^ frame) & ~(THREAD_SIZE - 1)) == 0;
+}
+
+static void do_irq_async(struct pt_regs *regs, int irq)
+{
+ if (on_async_stack()) {
+ do_IRQ(regs, irq);
+ } else {
+ call_on_stack(2, S390_lowcore.async_stack, void, do_IRQ,
+ struct pt_regs *, regs, int, irq);
+ }
+}
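
do_irq_async() switches to the asynchronous stack unless the handler is already running there; on_async_stack() decides that by XOR-ing the current frame address with the async stack pointer and masking off the low THREAD_SIZE bits, i.e. it compares the two THREAD_SIZE-aligned bases. A stand-alone demonstration of that test (stack size and addresses invented for the demo):

	#include <stdio.h>

	#define DEMO_THREAD_SIZE (16UL * 1024)	/* assumed stack size for the demo */

	static int same_stack_region(unsigned long stack, unsigned long frame)
	{
		/* true iff both addresses share the same THREAD_SIZE-aligned base */
		return ((stack ^ frame) & ~(DEMO_THREAD_SIZE - 1)) == 0;
	}

	int main(void)
	{
		unsigned long async_stack = 0x84000;

		printf("%d\n", same_stack_region(async_stack, 0x84fe0));	/* 1: same region */
		printf("%d\n", same_stack_region(async_stack, 0x90010));	/* 0: different region */
		return 0;
	}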
+
+static int irq_pending(struct pt_regs *regs)
+{
+ int cc;
+
+ asm volatile("tpi 0\n"
+ "ipm %0" : "=d" (cc) : : "cc");
+ return cc >> 28;
+}
+
+void noinstr do_io_irq(struct pt_regs *regs)
+{
+ irqentry_state_t state = irqentry_enter(regs);
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ bool from_idle;
+
+ irq_enter_rcu();
+
+ if (user_mode(regs)) {
+ update_timer_sys();
+ if (static_branch_likely(&cpu_has_bear))
+ current->thread.last_break = regs->last_break;
+ }
+
+ from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT);
+ if (from_idle)
+ account_idle_time_irq();
+
+ do {
+ regs->tpi_info = S390_lowcore.tpi_info;
+ if (S390_lowcore.tpi_info.adapter_IO)
+ do_irq_async(regs, THIN_INTERRUPT);
+ else
+ do_irq_async(regs, IO_INTERRUPT);
+ } while (MACHINE_IS_LPAR && irq_pending(regs));
+
+ irq_exit_rcu();
+
set_irq_regs(old_regs);
+ irqentry_exit(regs, state);
+
+ if (from_idle)
+ regs->psw.mask &= ~(PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_WAIT);
+}
+
+void noinstr do_ext_irq(struct pt_regs *regs)
+{
+ irqentry_state_t state = irqentry_enter(regs);
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ bool from_idle;
+
+ irq_enter_rcu();
+
+ if (user_mode(regs)) {
+ update_timer_sys();
+ if (static_branch_likely(&cpu_has_bear))
+ current->thread.last_break = regs->last_break;
+ }
+
+ regs->int_code = S390_lowcore.ext_int_code_addr;
+ regs->int_parm = S390_lowcore.ext_params;
+ regs->int_parm_long = S390_lowcore.ext_params2;
+
+ from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT);
+ if (from_idle)
+ account_idle_time_irq();
+
+ do_irq_async(regs, EXT_INTERRUPT);
+
+ irq_exit_rcu();
+ set_irq_regs(old_regs);
+ irqentry_exit(regs, state);
+
+ if (from_idle)
+ regs->psw.mask &= ~(PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_WAIT);
}
static void show_msi_interrupt(struct seq_file *p, int irq)
@@ -124,7 +205,7 @@ static void show_msi_interrupt(struct seq_file *p, int irq)
unsigned long flags;
int cpu;
- irq_lock_sparse();
+ rcu_read_lock();
desc = irq_to_desc(irq);
if (!desc)
goto out;
@@ -132,7 +213,7 @@ static void show_msi_interrupt(struct seq_file *p, int irq)
raw_spin_lock_irqsave(&desc->lock, flags);
seq_printf(p, "%3d: ", irq);
for_each_online_cpu(cpu)
- seq_printf(p, "%10u ", kstat_irqs_cpu(irq, cpu));
+ seq_printf(p, "%10u ", irq_desc_kstat_cpu(desc, cpu));
if (desc->irq_data.chip)
seq_printf(p, " %8s", desc->irq_data.chip->name);
@@ -143,7 +224,7 @@ static void show_msi_interrupt(struct seq_file *p, int irq)
seq_putc(p, '\n');
raw_spin_unlock_irqrestore(&desc->lock, flags);
out:
- irq_unlock_sparse();
+ rcu_read_unlock();
}
/*
@@ -154,7 +235,7 @@ int show_interrupts(struct seq_file *p, void *v)
int index = *(loff_t *) v;
int cpu, irq;
- get_online_cpus();
+ cpus_read_lock();
if (index == 0) {
seq_puts(p, " ");
for_each_online_cpu(cpu)
@@ -184,7 +265,7 @@ int show_interrupts(struct seq_file *p, void *v)
seq_putc(p, '\n');
}
out:
- put_online_cpus();
+ cpus_read_unlock();
return 0;
}
@@ -194,24 +275,6 @@ unsigned int arch_dynirq_lower_bound(unsigned int from)
}
/*
- * Switch to the asynchronous interrupt stack for softirq execution.
- */
-void do_softirq_own_stack(void)
-{
- unsigned long old, new;
-
- old = current_stack_pointer();
- /* Check against async. stack address range. */
- new = S390_lowcore.async_stack;
- if (((new - old) >> (PAGE_SHIFT + THREAD_SIZE_ORDER)) != 0) {
- CALL_ON_STACK(__do_softirq, new, 0);
- } else {
- /* We are already on the async stack. */
- __do_softirq();
- }
-}
-
-/*
* ext_int_hash[index] is the list head for all external interrupts that hash
* to this index.
*/
@@ -279,7 +342,7 @@ static irqreturn_t do_ext_interrupt(int irq, void *dummy)
struct ext_int_info *p;
int index;
- ext_code = *(struct ext_code *) &regs->int_code;
+ ext_code.int_code = regs->int_code;
if (ext_code.code != EXT_IRQ_CLK_COMP)
set_cpu_flag(CIF_NOHZ_DELAY);
@@ -294,12 +357,7 @@ static irqreturn_t do_ext_interrupt(int irq, void *dummy)
return IRQ_HANDLED;
}
-static struct irqaction external_interrupt = {
- .name = "EXT",
- .handler = do_ext_interrupt,
-};
-
-void __init init_ext_interrupts(void)
+static void __init init_ext_interrupts(void)
{
int idx;
@@ -308,7 +366,16 @@ void __init init_ext_interrupts(void)
irq_set_chip_and_handler(EXT_INTERRUPT,
&dummy_irq_chip, handle_percpu_irq);
- setup_irq(EXT_INTERRUPT, &external_interrupt);
+ if (request_irq(EXT_INTERRUPT, do_ext_interrupt, 0, "EXT", NULL))
+ panic("Failed to register EXT interrupt\n");
+}
+
+void __init init_IRQ(void)
+{
+ BUILD_BUG_ON(ARRAY_SIZE(irqclass_sub_desc) != NR_ARCH_IRQS);
+ init_cio_interrupts();
+ init_airq_interrupts();
+ init_ext_interrupts();
}
static DEFINE_SPINLOCK(irq_subclass_lock);
@@ -318,7 +385,7 @@ void irq_subclass_register(enum irq_subclass subclass)
{
spin_lock(&irq_subclass_lock);
if (!irq_subclass_refcount[subclass])
- ctl_set_bit(0, subclass);
+ system_ctl_set_bit(0, subclass);
irq_subclass_refcount[subclass]++;
spin_unlock(&irq_subclass_lock);
}
@@ -329,7 +396,7 @@ void irq_subclass_unregister(enum irq_subclass subclass)
spin_lock(&irq_subclass_lock);
irq_subclass_refcount[subclass]--;
if (!irq_subclass_refcount[subclass])
- ctl_clear_bit(0, subclass);
+ system_ctl_clear_bit(0, subclass);
spin_unlock(&irq_subclass_lock);
}
EXPORT_SYMBOL(irq_subclass_unregister);
diff --git a/arch/s390/kernel/jump_label.c b/arch/s390/kernel/jump_label.c
index ab584e8e3527..e808bb8bc0da 100644
--- a/arch/s390/kernel/jump_label.c
+++ b/arch/s390/kernel/jump_label.c
@@ -6,8 +6,9 @@
* Author(s): Jan Glauber <jang@linux.vnet.ibm.com>
*/
#include <linux/uaccess.h>
-#include <linux/stop_machine.h>
#include <linux/jump_label.h>
+#include <linux/module.h>
+#include <asm/text-patching.h>
#include <asm/ipl.h>
struct insn {
@@ -36,21 +37,15 @@ static void jump_label_bug(struct jump_entry *entry, struct insn *expected,
unsigned char *ipe = (unsigned char *)expected;
unsigned char *ipn = (unsigned char *)new;
- pr_emerg("Jump label code mismatch at %pS [%p]\n", ipc, ipc);
+ pr_emerg("Jump label code mismatch at %pS [%px]\n", ipc, ipc);
pr_emerg("Found: %6ph\n", ipc);
pr_emerg("Expected: %6ph\n", ipe);
pr_emerg("New: %6ph\n", ipn);
panic("Corrupted kernel text");
}
-static struct insn orignop = {
- .opcode = 0xc004,
- .offset = JUMP_LABEL_NOP_OFFSET >> 1,
-};
-
-static void __jump_label_transform(struct jump_entry *entry,
- enum jump_label_type type,
- int init)
+static void jump_label_transform(struct jump_entry *entry,
+ enum jump_label_type type)
{
void *code = (void *)jump_entry_code(entry);
struct insn old, new;
@@ -62,29 +57,26 @@ static void __jump_label_transform(struct jump_entry *entry,
jump_label_make_branch(entry, &old);
jump_label_make_nop(entry, &new);
}
- if (init) {
- if (memcmp(code, &orignop, sizeof(orignop)))
- jump_label_bug(entry, &orignop, &new);
- } else {
- if (memcmp(code, &old, sizeof(old)))
- jump_label_bug(entry, &old, &new);
- }
+ if (memcmp(code, &old, sizeof(old)))
+ jump_label_bug(entry, &old, &new);
s390_kernel_write(code, &new, sizeof(new));
}
-static void __jump_label_sync(void *dummy)
+void arch_jump_label_transform(struct jump_entry *entry,
+ enum jump_label_type type)
{
+ jump_label_transform(entry, type);
+ text_poke_sync();
}
-void arch_jump_label_transform(struct jump_entry *entry,
- enum jump_label_type type)
+bool arch_jump_label_transform_queue(struct jump_entry *entry,
+ enum jump_label_type type)
{
- __jump_label_transform(entry, type, 0);
- smp_call_function(__jump_label_sync, NULL, 1);
+ jump_label_transform(entry, type);
+ return true;
}
-void arch_jump_label_transform_static(struct jump_entry *entry,
- enum jump_label_type type)
+void arch_jump_label_transform_apply(void)
{
- __jump_label_transform(entry, type, 1);
+ text_poke_sync();
}
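
With the queue/apply split above, the costly serialization step (text_poke_sync()) happens once per batch of jump entries instead of once per entry, as it did with the old smp_call_function() based arch_jump_label_transform(). Roughly, the generic jump-label core is expected to drive the two hooks like this (simplified sketch of the calling pattern, not copied from kernel/jump_label.c):

	/* Simplified sketch: patch a batch of entries, then synchronize once. */
	static void patch_batch_demo(struct jump_entry *entries, int n,
				     enum jump_label_type type)
	{
		int i;

		for (i = 0; i < n; i++) {
			/* on s390 the entry is patched immediately and always queued */
			if (!arch_jump_label_transform_queue(&entries[i], type))
				break;	/* the real core flushes and retries the entry here */
		}
		arch_jump_label_transform_apply();	/* one text_poke_sync() for the batch */
	}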
diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c
index 6f1388391620..f0cf20d4b3c5 100644
--- a/arch/s390/kernel/kprobes.c
+++ b/arch/s390/kernel/kprobes.c
@@ -7,6 +7,9 @@
* s390 port, used ppc64 as template. Mike Grundy <grundym@us.ibm.com>
*/
+#define pr_fmt(fmt) "kprobes: " fmt
+
+#include <linux/moduleloader.h>
#include <linux/kprobes.h>
#include <linux/ptrace.h>
#include <linux/preempt.h>
@@ -21,28 +24,36 @@
#include <asm/set_memory.h>
#include <asm/sections.h>
#include <asm/dis.h>
+#include "kprobes.h"
+#include "entry.h"
DEFINE_PER_CPU(struct kprobe *, current_kprobe);
DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
struct kretprobe_blackpoint kretprobe_blacklist[] = { };
-DEFINE_INSN_CACHE_OPS(s390_insn);
-
static int insn_page_in_use;
-static char insn_page[PAGE_SIZE] __aligned(PAGE_SIZE);
+
+void *alloc_insn_page(void)
+{
+ void *page;
+
+ page = module_alloc(PAGE_SIZE);
+ if (!page)
+ return NULL;
+ set_memory_rox((unsigned long)page, 1);
+ return page;
+}
static void *alloc_s390_insn_page(void)
{
if (xchg(&insn_page_in_use, 1) == 1)
return NULL;
- set_memory_x((unsigned long) &insn_page, 1);
- return &insn_page;
+ return &kprobes_insn_page;
}
static void free_s390_insn_page(void *page)
{
- set_memory_nx((unsigned long) page, 1);
xchg(&insn_page_in_use, 0);
}
@@ -56,44 +67,32 @@ struct kprobe_insn_cache kprobe_s390_insn_slots = {
static void copy_instruction(struct kprobe *p)
{
- unsigned long ip = (unsigned long) p->addr;
+ kprobe_opcode_t insn[MAX_INSN_SIZE];
s64 disp, new_disp;
u64 addr, new_addr;
+ unsigned int len;
- if (ftrace_location(ip) == ip) {
+ len = insn_length(*p->addr >> 8);
+ memcpy(&insn, p->addr, len);
+ p->opcode = insn[0];
+ if (probe_is_insn_relative_long(&insn[0])) {
/*
- * If kprobes patches the instruction that is morphed by
- * ftrace make sure that kprobes always sees the branch
- * "jg .+24" that skips the mcount block or the "brcl 0,0"
- * in case of hotpatch.
+ * For pc-relative instructions in RIL-b or RIL-c format patch
+ * the RI2 displacement field. We have already made sure that
+ * the insn slot for the patched instruction is within the same
+ * 2GB area as the original instruction (either kernel image or
+ * module area). Therefore the new displacement will always fit.
*/
- ftrace_generate_nop_insn((struct ftrace_insn *)p->ainsn.insn);
- p->ainsn.is_ftrace_insn = 1;
- } else
- memcpy(p->ainsn.insn, p->addr, insn_length(*p->addr >> 8));
- p->opcode = p->ainsn.insn[0];
- if (!probe_is_insn_relative_long(p->ainsn.insn))
- return;
- /*
- * For pc-relative instructions in RIL-b or RIL-c format patch the
- * RI2 displacement field. We have already made sure that the insn
- * slot for the patched instruction is within the same 2GB area
- * as the original instruction (either kernel image or module area).
- * Therefore the new displacement will always fit.
- */
- disp = *(s32 *)&p->ainsn.insn[1];
- addr = (u64)(unsigned long)p->addr;
- new_addr = (u64)(unsigned long)p->ainsn.insn;
- new_disp = ((addr + (disp * 2)) - new_addr) / 2;
- *(s32 *)&p->ainsn.insn[1] = new_disp;
+ disp = *(s32 *)&insn[1];
+ addr = (u64)(unsigned long)p->addr;
+ new_addr = (u64)(unsigned long)p->ainsn.insn;
+ new_disp = ((addr + (disp * 2)) - new_addr) / 2;
+ *(s32 *)&insn[1] = new_disp;
+ }
+ s390_kernel_write(p->ainsn.insn, &insn, len);
}
NOKPROBE_SYMBOL(copy_instruction);
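
The displacement fix-up in copy_instruction() keeps a pc-relative RIL instruction pointing at the same absolute target after it has been copied into the insn slot: RI2 counts halfwords, so the original target is addr + 2 * disp, and the copy at new_addr needs (target - new_addr) / 2. A worked example with invented addresses:

	#include <stdio.h>

	int main(void)
	{
		unsigned long addr     = 0x1000000;	/* original instruction */
		unsigned long new_addr = 0x1400000;	/* copy in the insn slot */
		long disp = 0x800;			/* original RI2 field, in halfwords */
		unsigned long target = addr + disp * 2;			/* 0x1001000 */
		long new_disp = ((long)(target - new_addr)) / 2;	/* -0x1ff800 */

		printf("target %#lx, new RI2 %ld halfwords\n", target, new_disp);
		printf("copy resolves to %#lx\n", new_addr + new_disp * 2);	/* same target */
		return 0;
	}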
-static inline int is_kernel_addr(void *addr)
-{
- return addr < (void *)_end;
-}
-
static int s390_get_insn_slot(struct kprobe *p)
{
/*
@@ -102,7 +101,7 @@ static int s390_get_insn_slot(struct kprobe *p)
* field can be patched and executed within the insn slot.
*/
p->ainsn.insn = NULL;
- if (is_kernel_addr(p->addr))
+ if (is_kernel((unsigned long)p->addr))
p->ainsn.insn = get_s390_insn_slot();
else if (is_module_addr(p->addr))
p->ainsn.insn = get_insn_slot();
@@ -114,7 +113,7 @@ static void s390_free_insn_slot(struct kprobe *p)
{
if (!p->ainsn.insn)
return;
- if (is_kernel_addr(p->addr))
+ if (is_kernel((unsigned long)p->addr))
free_s390_insn_slot(p->ainsn.insn, 0);
else
free_insn_slot(p->ainsn.insn, 0);
@@ -122,9 +121,55 @@ static void s390_free_insn_slot(struct kprobe *p)
}
NOKPROBE_SYMBOL(s390_free_insn_slot);
+/* Check if paddr is at an instruction boundary */
+static bool can_probe(unsigned long paddr)
+{
+ unsigned long addr, offset = 0;
+ kprobe_opcode_t insn;
+ struct kprobe *kp;
+
+ if (paddr & 0x01)
+ return false;
+
+ if (!kallsyms_lookup_size_offset(paddr, NULL, &offset))
+ return false;
+
+ /* Decode instructions */
+ addr = paddr - offset;
+ while (addr < paddr) {
+ if (copy_from_kernel_nofault(&insn, (void *)addr, sizeof(insn)))
+ return false;
+
+ if (insn >> 8 == 0) {
+ if (insn != BREAKPOINT_INSTRUCTION) {
+ /*
+ * Note that QEMU inserts opcode 0x0000 to implement
+ * software breakpoints for guests. Since the size of
+ * the original instruction is unknown, stop following
+ * instructions and prevent setting a kprobe.
+ */
+ return false;
+ }
+ /*
+ * Check if the instruction has been modified by another
+ * kprobe, in which case the original instruction is
+ * decoded.
+ */
+ kp = get_kprobe((void *)addr);
+ if (!kp) {
+ /* not a kprobe */
+ return false;
+ }
+ insn = kp->opcode;
+ }
+ addr += insn_length(insn >> 8);
+ }
+ return addr == paddr;
+}
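
can_probe() can walk forward from the symbol start because an s390 instruction encodes its own length in the two most significant bits of its first opcode byte, which is what insn_length() evaluates. A minimal reimplementation of that mapping for illustration (invented name; the kernel helper computes the same result arithmetically):

	/* s390 instruction length from the first opcode byte:
	 * top bits 00 -> 2 bytes, 01 or 10 -> 4 bytes, 11 -> 6 bytes. */
	static unsigned int insn_length_demo(unsigned char opcode)
	{
		switch (opcode >> 6) {
		case 0:
			return 2;
		case 1:
		case 2:
			return 4;
		default:
			return 6;
		}
	}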
+
int arch_prepare_kprobe(struct kprobe *p)
{
- if ((unsigned long) p->addr & 0x01)
+ if (!can_probe((unsigned long)p->addr))
return -EINVAL;
/* Make sure the probe isn't going on a difficult instruction */
if (probe_is_prohibited_opcode(p->addr))
@@ -136,11 +181,6 @@ int arch_prepare_kprobe(struct kprobe *p)
}
NOKPROBE_SYMBOL(arch_prepare_kprobe);
-int arch_check_ftrace_location(struct kprobe *p)
-{
- return 0;
-}
-
struct swap_insn_args {
struct kprobe *p;
unsigned int arm_kprobe : 1;
@@ -149,28 +189,11 @@ struct swap_insn_args {
static int swap_instruction(void *data)
{
struct swap_insn_args *args = data;
- struct ftrace_insn new_insn, *insn;
struct kprobe *p = args->p;
- size_t len;
-
- new_insn.opc = args->arm_kprobe ? BREAKPOINT_INSTRUCTION : p->opcode;
- len = sizeof(new_insn.opc);
- if (!p->ainsn.is_ftrace_insn)
- goto skip_ftrace;
- len = sizeof(new_insn);
- insn = (struct ftrace_insn *) p->addr;
- if (args->arm_kprobe) {
- if (is_ftrace_nop(insn))
- new_insn.disp = KPROBE_ON_FTRACE_NOP;
- else
- new_insn.disp = KPROBE_ON_FTRACE_CALL;
- } else {
- ftrace_generate_call_insn(&new_insn, (unsigned long)p->addr);
- if (insn->disp == KPROBE_ON_FTRACE_NOP)
- ftrace_generate_nop_insn(&new_insn);
- }
-skip_ftrace:
- s390_kernel_write(p->addr, &new_insn, len);
+ u16 opc;
+
+ opc = args->arm_kprobe ? BREAKPOINT_INSTRUCTION : p->opcode;
+ s390_kernel_write(p->addr, &opc, sizeof(opc));
return 0;
}
NOKPROBE_SYMBOL(swap_instruction);
@@ -201,20 +224,27 @@ static void enable_singlestep(struct kprobe_ctlblk *kcb,
struct pt_regs *regs,
unsigned long ip)
{
- struct per_regs per_kprobe;
+ union {
+ struct ctlreg regs[3];
+ struct {
+ struct ctlreg control;
+ struct ctlreg start;
+ struct ctlreg end;
+ };
+ } per_kprobe;
/* Set up the PER control registers %cr9-%cr11 */
- per_kprobe.control = PER_EVENT_IFETCH;
- per_kprobe.start = ip;
- per_kprobe.end = ip;
+ per_kprobe.control.val = PER_EVENT_IFETCH;
+ per_kprobe.start.val = ip;
+ per_kprobe.end.val = ip;
/* Save control regs and psw mask */
- __ctl_store(kcb->kprobe_saved_ctl, 9, 11);
+ __local_ctl_store(9, 11, kcb->kprobe_saved_ctl);
kcb->kprobe_saved_imask = regs->psw.mask &
(PSW_MASK_PER | PSW_MASK_IO | PSW_MASK_EXT);
/* Set PER control regs, turns on single step for the given address */
- __ctl_load(per_kprobe, 9, 11);
+ __local_ctl_load(9, 11, per_kprobe.regs);
regs->psw.mask |= PSW_MASK_PER;
regs->psw.mask &= ~(PSW_MASK_IO | PSW_MASK_EXT);
regs->psw.addr = ip;
@@ -226,7 +256,7 @@ static void disable_singlestep(struct kprobe_ctlblk *kcb,
unsigned long ip)
{
/* Restore control regs and psw mask, set new psw address */
- __ctl_load(kcb->kprobe_saved_ctl, 9, 11);
+ __local_ctl_load(9, 11, kcb->kprobe_saved_ctl);
regs->psw.mask &= ~PSW_MASK_PER;
regs->psw.mask |= kcb->kprobe_saved_imask;
regs->psw.addr = ip;
@@ -255,18 +285,10 @@ static void pop_kprobe(struct kprobe_ctlblk *kcb)
{
__this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
kcb->kprobe_status = kcb->prev_kprobe.status;
+ kcb->prev_kprobe.kp = NULL;
}
NOKPROBE_SYMBOL(pop_kprobe);
-void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
-{
- ri->ret_addr = (kprobe_opcode_t *) regs->gprs[14];
-
- /* Replace the return addr with trampoline addr */
- regs->gprs[14] = (unsigned long) &kretprobe_trampoline;
-}
-NOKPROBE_SYMBOL(arch_prepare_kretprobe);
-
static void kprobe_reenter_check(struct kprobe_ctlblk *kcb, struct kprobe *p)
{
switch (kcb->kprobe_status) {
@@ -282,7 +304,7 @@ static void kprobe_reenter_check(struct kprobe_ctlblk *kcb, struct kprobe *p)
* is a BUG. The code path resides in the .kprobes.text
* section and is executed with interrupts disabled.
*/
- pr_err("Invalid kprobe detected.\n");
+ pr_err("Failed to recover from reentered kprobes.\n");
dump_kprobe(p);
BUG();
}
@@ -348,109 +370,6 @@ static int kprobe_handler(struct pt_regs *regs)
NOKPROBE_SYMBOL(kprobe_handler);
/*
- * Function return probe trampoline:
- * - init_kprobes() establishes a probepoint here
- * - When the probed function returns, this probe
- * causes the handlers to fire
- */
-static void __used kretprobe_trampoline_holder(void)
-{
- asm volatile(".global kretprobe_trampoline\n"
- "kretprobe_trampoline: bcr 0,0\n");
-}
-
-/*
- * Called when the probe at kretprobe trampoline is hit
- */
-static int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
-{
- struct kretprobe_instance *ri;
- struct hlist_head *head, empty_rp;
- struct hlist_node *tmp;
- unsigned long flags, orig_ret_address;
- unsigned long trampoline_address;
- kprobe_opcode_t *correct_ret_addr;
-
- INIT_HLIST_HEAD(&empty_rp);
- kretprobe_hash_lock(current, &head, &flags);
-
- /*
- * It is possible to have multiple instances associated with a given
- * task either because multiple functions in the call path
- * have a return probe installed on them, and/or more than one
- * return probe was registered for a target function.
- *
- * We can handle this because:
- * - instances are always inserted at the head of the list
- * - when multiple return probes are registered for the same
- * function, the first instance's ret_addr will point to the
- * real return address, and all the rest will point to
- * kretprobe_trampoline
- */
- ri = NULL;
- orig_ret_address = 0;
- correct_ret_addr = NULL;
- trampoline_address = (unsigned long) &kretprobe_trampoline;
- hlist_for_each_entry_safe(ri, tmp, head, hlist) {
- if (ri->task != current)
- /* another task is sharing our hash bucket */
- continue;
-
- orig_ret_address = (unsigned long) ri->ret_addr;
-
- if (orig_ret_address != trampoline_address)
- /*
- * This is the real return address. Any other
- * instances associated with this task are for
- * other calls deeper on the call stack
- */
- break;
- }
-
- kretprobe_assert(ri, orig_ret_address, trampoline_address);
-
- correct_ret_addr = ri->ret_addr;
- hlist_for_each_entry_safe(ri, tmp, head, hlist) {
- if (ri->task != current)
- /* another task is sharing our hash bucket */
- continue;
-
- orig_ret_address = (unsigned long) ri->ret_addr;
-
- if (ri->rp && ri->rp->handler) {
- ri->ret_addr = correct_ret_addr;
- ri->rp->handler(ri, regs);
- }
-
- recycle_rp_inst(ri, &empty_rp);
-
- if (orig_ret_address != trampoline_address)
- /*
- * This is the real return address. Any other
- * instances associated with this task are for
- * other calls deeper on the call stack
- */
- break;
- }
-
- regs->psw.addr = orig_ret_address;
-
- kretprobe_hash_unlock(current, &flags);
-
- hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
- hlist_del(&ri->hlist);
- kfree(ri);
- }
- /*
- * By returning a non-zero value, we are telling
- * kprobe_handler() that we don't want the post_handler
- * to run (and have re-enabled preemption)
- */
- return 1;
-}
-NOKPROBE_SYMBOL(trampoline_probe_handler);
-
-/*
* Called after single-stepping. p->addr is the address of the
* instruction whose first byte has been replaced by the "breakpoint"
* instruction. To avoid the SMP problems that can occur when we
@@ -464,24 +383,6 @@ static void resume_execution(struct kprobe *p, struct pt_regs *regs)
unsigned long ip = regs->psw.addr;
int fixup = probe_get_fixup_type(p->ainsn.insn);
- /* Check if the kprobes location is an enabled ftrace caller */
- if (p->ainsn.is_ftrace_insn) {
- struct ftrace_insn *insn = (struct ftrace_insn *) p->addr;
- struct ftrace_insn call_insn;
-
- ftrace_generate_call_insn(&call_insn, (unsigned long) p->addr);
- /*
- * A kprobe on an enabled ftrace call site actually single
- * stepped an unconditional branch (ftrace nop equivalent).
- * Now we need to fixup things and pretend that a brasl r0,...
- * was executed instead.
- */
- if (insn->disp == KPROBE_ON_FTRACE_CALL) {
- ip += call_insn.disp * 2 - MCOUNT_INSN_SIZE;
- regs->gprs[0] = (unsigned long)p->addr + sizeof(*insn);
- }
- }
-
if (fixup & FIXUP_PSW_NORMAL)
ip += (unsigned long) p->addr - (unsigned long) p->ainsn.insn;
@@ -509,12 +410,11 @@ static int post_kprobe_handler(struct pt_regs *regs)
if (!p)
return 0;
+ resume_execution(p, regs);
if (kcb->kprobe_status != KPROBE_REENTER && p->post_handler) {
kcb->kprobe_status = KPROBE_HIT_SSDONE;
p->post_handler(p, regs, 0);
}
-
- resume_execution(p, regs);
pop_kprobe(kcb);
preempt_enable_no_resched();
@@ -534,7 +434,6 @@ static int kprobe_trap_handler(struct pt_regs *regs, int trapnr)
{
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
struct kprobe *p = kprobe_running();
- const struct exception_table_entry *entry;
switch(kcb->kprobe_status) {
case KPROBE_HIT_SS:
@@ -553,32 +452,11 @@ static int kprobe_trap_handler(struct pt_regs *regs, int trapnr)
case KPROBE_HIT_ACTIVE:
case KPROBE_HIT_SSDONE:
/*
- * We increment the nmissed count for accounting,
- * we can also use npre/npostfault count for accounting
- * these specific fault cases.
- */
- kprobes_inc_nmissed_count(p);
-
- /*
- * We come here because instructions in the pre/post
- * handler caused the page_fault, this could happen
- * if handler tries to access user space by
- * copy_from_user(), get_user() etc. Let the
- * user-specified handler try to fix it first.
- */
- if (p->fault_handler && p->fault_handler(p, regs, trapnr))
- return 1;
-
- /*
* In case the user-specified fault handler returned
* zero, try to fix up.
*/
- entry = s390_search_extables(regs->psw.addr);
- if (entry) {
- regs->psw.addr = extable_fixup(entry);
+ if (fixup_exception(regs))
return 1;
- }
-
/*
* fixup_exception() could not handle it,
* Let do_page_fault() fix it.
@@ -642,18 +520,13 @@ int kprobe_exceptions_notify(struct notifier_block *self,
}
NOKPROBE_SYMBOL(kprobe_exceptions_notify);
-static struct kprobe trampoline = {
- .addr = (kprobe_opcode_t *) &kretprobe_trampoline,
- .pre_handler = trampoline_probe_handler
-};
-
int __init arch_init_kprobes(void)
{
- return register_kprobe(&trampoline);
+ return 0;
}
int arch_trampoline_kprobe(struct kprobe *p)
{
- return p->addr == (kprobe_opcode_t *) &kretprobe_trampoline;
+ return 0;
}
NOKPROBE_SYMBOL(arch_trampoline_kprobe);
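The new can_probe() above accepts an address only if it can be reached by decoding forward from the start of the function, which guarantees the kprobe lands on an instruction boundary. Below is a minimal, hypothetical user-space sketch of that forward-decode idea; it relies only on the s390 rule that the top two bits of the first opcode byte select a 2-, 4- or 6-byte instruction, and it leaves out the special cases handled above (opcode 0x0000 inserted by QEMU for software breakpoints, and instructions already rewritten by another kprobe).

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* s390: bits 0-1 of the first opcode byte select 2, 4 or 6 byte insns */
static size_t insn_len(uint8_t first_byte)
{
	switch (first_byte >> 6) {
	case 0:
		return 2;
	case 3:
		return 6;
	default:
		return 4;
	}
}

/* Walk forward from the function start; true if target is an insn start */
static bool lands_on_boundary(const uint8_t *func, size_t func_len, size_t target)
{
	size_t off = 0;

	while (off < target && off < func_len)
		off += insn_len(func[off]);
	return off == target;
}

int main(void)
{
	/* made-up byte stream: one 2-byte, one 4-byte, one 6-byte instruction */
	const uint8_t code[] = { 0x07, 0xfe, 0x47, 0x00, 0x00, 0x00,
				 0xc0, 0x04, 0x00, 0x00, 0x00, 0x00 };

	printf("offset 2: %d\n", lands_on_boundary(code, sizeof(code), 2)); /* 1 */
	printf("offset 3: %d\n", lands_on_boundary(code, sizeof(code), 3)); /* 0 */
	return 0;
}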
diff --git a/arch/s390/kernel/kprobes.h b/arch/s390/kernel/kprobes.h
new file mode 100644
index 000000000000..dc3ed5098ee7
--- /dev/null
+++ b/arch/s390/kernel/kprobes.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+#ifndef _ARCH_S390_KPROBES_H
+#define _ARCH_S390_KPROBES_H
+
+#include <linux/kprobes.h>
+
+DEFINE_INSN_CACHE_OPS(s390_insn);
+
+#endif
diff --git a/arch/s390/kernel/kprobes_insn_page.S b/arch/s390/kernel/kprobes_insn_page.S
new file mode 100644
index 000000000000..0fe4d725e98b
--- /dev/null
+++ b/arch/s390/kernel/kprobes_insn_page.S
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/linkage.h>
+
+/*
+ * insn_page is a special 4k aligned dummy function for kprobes.
+ * It will contain all kprobed instructions that are executed out of line.
+ * The page must be within the kernel image to guarantee that the
+ * out-of-line instructions are within 2GB distance of their original
+ * location. Using a dummy function ensures that the insn_page is within
+ * the text section of the kernel and mapped read-only/executable from
+ * the start, thus avoiding the need to split large mappings if the page
+ * were placed in the data section instead.
+ */
+ .section .kprobes.text, "ax"
+ .balign 4096
+SYM_CODE_START(kprobes_insn_page)
+ .rept 2048
+ .word 0x07fe
+ .endr
+SYM_CODE_END(kprobes_insn_page)
+ .previous
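kprobes.h above declares an instruction slot cache with DEFINE_INSN_CACHE_OPS(s390_insn), and kprobes_insn_page provides the 4k-aligned, read-only text page backing it. A rough sketch of how such a slot would typically be consumed, assuming the get_s390_insn_slot() helper that the macro generates and the insn_length()/s390_kernel_write() helpers seen elsewhere in this diff (the declaration header for s390_kernel_write() and the remaining arch_prepare_kprobe() bookkeeping are omitted):

#include <linux/errno.h>
#include <linux/kprobes.h>
#include <asm/dis.h>	/* insn_length() */
#include "kprobes.h"

/*
 * Sketch only: copy the probed instruction into an out-of-line slot so it
 * can be single stepped from within the kernel image.  The slot page is
 * mapped read-only, hence the copy goes through s390_kernel_write().
 */
static int copy_insn_out_of_line(struct kprobe *p)
{
	size_t len = insn_length(*p->addr >> 8);

	p->ainsn.insn = get_s390_insn_slot();
	if (!p->ainsn.insn)
		return -ENOMEM;
	s390_kernel_write(p->ainsn.insn, p->addr, len);
	return 0;
}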
diff --git a/arch/s390/kernel/lgr.c b/arch/s390/kernel/lgr.c
index 452502f9a0d9..6652e54cf3db 100644
--- a/arch/s390/kernel/lgr.c
+++ b/arch/s390/kernel/lgr.c
@@ -88,8 +88,7 @@ static void lgr_stsi_2_2_2(struct lgr_info *lgr_info)
if (stsi(si, 2, 2, 2))
return;
cpascii(lgr_info->name, si->name, sizeof(si->name));
- memcpy(&lgr_info->lpar_number, &si->lpar_number,
- sizeof(lgr_info->lpar_number));
+ lgr_info->lpar_number = si->lpar_number;
}
/*
@@ -167,7 +166,7 @@ static struct timer_list lgr_timer;
*/
static void lgr_timer_set(void)
{
- mod_timer(&lgr_timer, jiffies + LGR_TIMER_INTERVAL_SECS * HZ);
+ mod_timer(&lgr_timer, jiffies + msecs_to_jiffies(LGR_TIMER_INTERVAL_SECS * MSEC_PER_SEC));
}
/*
diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c
index cb8b1cc285c9..aa22ffc16bcd 100644
--- a/arch/s390/kernel/machine_kexec.c
+++ b/arch/s390/kernel/machine_kexec.c
@@ -3,7 +3,6 @@
* Copyright IBM Corp. 2005, 2011
*
* Author(s): Rolf Adelsberger,
- * Heiko Carstens <heiko.carstens@de.ibm.com>
* Michael Holzheu <holzheu@linux.vnet.ibm.com>
*/
@@ -14,24 +13,25 @@
#include <linux/reboot.h>
#include <linux/ftrace.h>
#include <linux/debug_locks.h>
-#include <linux/suspend.h>
+#include <asm/pfault.h>
#include <asm/cio.h>
#include <asm/setup.h>
-#include <asm/pgtable.h>
-#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/ipl.h>
#include <asm/diag.h>
#include <asm/elf.h>
#include <asm/asm-offsets.h>
#include <asm/cacheflush.h>
+#include <asm/abs_lowcore.h>
#include <asm/os_info.h>
#include <asm/set_memory.h>
#include <asm/stacktrace.h>
#include <asm/switch_to.h>
#include <asm/nmi.h>
+#include <asm/sclp.h>
-typedef void (*relocate_kernel_t)(kimage_entry_t *, unsigned long);
+typedef void (*relocate_kernel_t)(unsigned long, unsigned long, unsigned long);
+typedef int (*purgatory_t)(int);
extern const unsigned char relocate_kernel[];
extern const unsigned long long relocate_kernel_len;
@@ -39,44 +39,17 @@ extern const unsigned long long relocate_kernel_len;
#ifdef CONFIG_CRASH_DUMP
/*
- * PM notifier callback for kdump
- */
-static int machine_kdump_pm_cb(struct notifier_block *nb, unsigned long action,
- void *ptr)
-{
- switch (action) {
- case PM_SUSPEND_PREPARE:
- case PM_HIBERNATION_PREPARE:
- if (kexec_crash_image)
- arch_kexec_unprotect_crashkres();
- break;
- case PM_POST_SUSPEND:
- case PM_POST_HIBERNATION:
- if (kexec_crash_image)
- arch_kexec_protect_crashkres();
- break;
- default:
- return NOTIFY_DONE;
- }
- return NOTIFY_OK;
-}
-
-static int __init machine_kdump_pm_init(void)
-{
- pm_notifier(machine_kdump_pm_cb, 0);
- return 0;
-}
-arch_initcall(machine_kdump_pm_init);
-
-/*
* Reset the system, copy boot CPU registers to absolute zero,
* and jump to the kdump image
*/
-static void __do_machine_kdump(void *image)
+static void __do_machine_kdump(void *data)
{
- int (*start_kdump)(int);
+ struct kimage *image = data;
+ purgatory_t purgatory;
unsigned long prefix;
+ purgatory = (purgatory_t)image->start;
+
/* store_status() saved the prefix register to lowcore */
prefix = (unsigned long) S390_lowcore.prefixreg_save_area;
@@ -88,14 +61,12 @@ static void __do_machine_kdump(void *image)
 * This needs to be done *after* s390_reset_system has set the
* prefix register of this CPU to zero
*/
- memcpy((void *) __LC_FPREGS_SAVE_AREA,
- (void *)(prefix + __LC_FPREGS_SAVE_AREA), 512);
+ memcpy(absolute_pointer(__LC_FPREGS_SAVE_AREA),
+ phys_to_virt(prefix + __LC_FPREGS_SAVE_AREA), 512);
- __load_psw_mask(PSW_MASK_BASE | PSW_DEFAULT_KEY | PSW_MASK_EA | PSW_MASK_BA);
- start_kdump = (void *)((struct kimage *) image)->start;
- start_kdump(1);
+ call_nodat(1, int, purgatory, int, 1);
- /* Die if start_kdump returns */
+ /* Die if kdump returns */
disabled_wait();
}
@@ -119,16 +90,16 @@ static noinline void __machine_kdump(void *image)
continue;
}
/* Store status of the boot CPU */
- mcesa = (struct mcesa *)(S390_lowcore.mcesad & MCESA_ORIGIN_MASK);
- if (MACHINE_HAS_VX)
+ mcesa = __va(S390_lowcore.mcesad & MCESA_ORIGIN_MASK);
+ if (cpu_has_vx())
save_vx_regs((__vector128 *) mcesa->vector_save_area);
if (MACHINE_HAS_GS) {
- __ctl_store(cr2_old.val, 2, 2);
+ local_ctl_store(2, &cr2_old.reg);
cr2_new = cr2_old;
cr2_new.gse = 1;
- __ctl_load(cr2_new.val, 2, 2);
+ local_ctl_load(2, &cr2_new.reg);
save_gs_cb((struct gs_cb *) mcesa->guarded_storage_save_area);
- __ctl_load(cr2_old.val, 2, 2);
+ local_ctl_load(2, &cr2_old.reg);
}
/*
* To create a good backchain for this CPU in the dump store_status
@@ -142,18 +113,6 @@ static noinline void __machine_kdump(void *image)
store_status(__do_machine_kdump, image);
}
-static unsigned long do_start_kdump(unsigned long addr)
-{
- struct kimage *image = (struct kimage *) addr;
- int (*start_kdump)(int) = (void *)image->start;
- int rc;
-
- __arch_local_irq_stnsm(0xfb); /* disable DAT */
- rc = start_kdump(0);
- __arch_local_irq_stosm(0x04); /* enable DAT */
- return rc;
-}
-
#endif /* CONFIG_CRASH_DUMP */
/*
@@ -162,11 +121,10 @@ static unsigned long do_start_kdump(unsigned long addr)
static bool kdump_csum_valid(struct kimage *image)
{
#ifdef CONFIG_CRASH_DUMP
+ purgatory_t purgatory = (purgatory_t)image->start;
int rc;
- preempt_disable();
- rc = CALL_ON_STACK(do_start_kdump, S390_lowcore.nodat_stack, 1, image);
- preempt_enable();
+ rc = call_nodat(1, int, purgatory, int, 0);
return rc == 0;
#else
return false;
@@ -240,7 +198,7 @@ int machine_kexec_prepare(struct kimage *image)
return -EINVAL;
/* Get the destination where the assembler code should be copied to.*/
- reboot_code_buffer = (void *) page_to_phys(image->control_code_page);
+ reboot_code_buffer = page_to_virt(image->control_code_page);
/* Then copy it */
memcpy(reboot_code_buffer, relocate_kernel, relocate_kernel_len);
@@ -253,13 +211,17 @@ void machine_kexec_cleanup(struct kimage *image)
void arch_crash_save_vmcoreinfo(void)
{
+ struct lowcore *abs_lc;
+
VMCOREINFO_SYMBOL(lowcore_ptr);
VMCOREINFO_SYMBOL(high_memory);
VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS);
- vmcoreinfo_append_str("SDMA=%lx\n", __sdma);
- vmcoreinfo_append_str("EDMA=%lx\n", __edma);
+ vmcoreinfo_append_str("SAMODE31=%lx\n", (unsigned long)__samode31);
+ vmcoreinfo_append_str("EAMODE31=%lx\n", (unsigned long)__eamode31);
vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset());
- mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note());
+ abs_lc = get_abs_lowcore();
+ abs_lc->vmcore_info = paddr_vmcoreinfo_note();
+ put_abs_lowcore(abs_lc);
}
void machine_shutdown(void)
@@ -276,15 +238,20 @@ void machine_crash_shutdown(struct pt_regs *regs)
*/
static void __do_machine_kexec(void *data)
{
- relocate_kernel_t data_mover;
+ unsigned long data_mover, entry, diag308_subcode;
struct kimage *image = data;
+ data_mover = page_to_phys(image->control_code_page);
+ entry = virt_to_phys(&image->head);
+ diag308_subcode = DIAG308_CLEAR_RESET;
+ if (sclp.has_iplcc)
+ diag308_subcode |= DIAG308_FLAG_EI;
s390_reset_system();
- data_mover = (relocate_kernel_t) page_to_phys(image->control_code_page);
- __arch_local_irq_stnsm(0xfb); /* disable DAT - avoid no-execute */
- /* Call the moving routine */
- (*data_mover)(&image->head, image->start);
+ call_nodat(3, void, (relocate_kernel_t)data_mover,
+ unsigned long, entry,
+ unsigned long, image->start,
+ unsigned long, diag308_subcode);
/* Die if kexec returns */
disabled_wait();
@@ -295,7 +262,6 @@ static void __do_machine_kexec(void *data)
*/
static void __machine_kexec(void *data)
{
- __arch_local_irq_stosm(0x04); /* enable DAT */
pfault_fini();
tracing_off();
debug_locks_off();
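The CALL_ON_STACK()/do_start_kdump() construct is gone. Judging by the call sites above, call_nodat() takes the argument count, the return type, the callee, and then (type, value) pairs, and runs the callee with DAT disabled. A small illustrative wrapper under that assumption, reusing the file-local purgatory_t typedef from this patch (the header providing call_nodat() is not spelled out here): stage 0 merely verifies the checksum of the loaded kdump image, as in kdump_csum_valid(), while stage 1 does not return and jumps into the kdump kernel, as in __do_machine_kdump().

#include <linux/kexec.h>

/* Illustration only: run the kexec purgatory entry point with DAT off. */
static int run_purgatory_nodat(struct kimage *image, int stage)
{
	purgatory_t purgatory = (purgatory_t)image->start;

	return call_nodat(1, int, purgatory, int, stage);
}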
diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c
index 8415ae7d2a23..8d207b82d9fe 100644
--- a/arch/s390/kernel/machine_kexec_file.c
+++ b/arch/s390/kernel/machine_kexec_file.c
@@ -7,11 +7,14 @@
* Author(s): Philipp Rudo <prudo@linux.vnet.ibm.com>
*/
+#define pr_fmt(fmt) "kexec: " fmt
+
#include <linux/elf.h>
#include <linux/errno.h>
#include <linux/kexec.h>
#include <linux/module_signature.h>
#include <linux/verification.h>
+#include <linux/vmalloc.h>
#include <asm/boot_data.h>
#include <asm/ipl.h>
#include <asm/setup.h>
@@ -28,6 +31,7 @@ int s390_verify_sig(const char *kernel, unsigned long kernel_len)
const unsigned long marker_len = sizeof(MODULE_SIG_STRING) - 1;
struct module_signature *ms;
unsigned long sig_len;
+ int ret;
/* Skip signature verification when not secure IPLed. */
if (!ipl_secure_flag)
@@ -62,11 +66,18 @@ int s390_verify_sig(const char *kernel, unsigned long kernel_len)
return -EBADMSG;
}
- return verify_pkcs7_signature(kernel, kernel_len,
- kernel + kernel_len, sig_len,
- VERIFY_USE_PLATFORM_KEYRING,
- VERIFYING_MODULE_SIGNATURE,
- NULL, NULL);
+ ret = verify_pkcs7_signature(kernel, kernel_len,
+ kernel + kernel_len, sig_len,
+ VERIFY_USE_SECONDARY_KEYRING,
+ VERIFYING_MODULE_SIGNATURE,
+ NULL, NULL);
+ if (ret == -ENOKEY && IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING))
+ ret = verify_pkcs7_signature(kernel, kernel_len,
+ kernel + kernel_len, sig_len,
+ VERIFY_USE_PLATFORM_KEYRING,
+ VERIFYING_MODULE_SIGNATURE,
+ NULL, NULL);
+ return ret;
}
#endif /* CONFIG_KEXEC_SIG */
@@ -151,7 +162,7 @@ static int kexec_file_add_initrd(struct kimage *image,
buf.mem += crashk_res.start;
buf.memsz = buf.bufsz;
- data->parm->initrd_start = buf.mem;
+ data->parm->initrd_start = data->memsz;
data->parm->initrd_size = buf.memsz;
data->memsz += buf.memsz;
@@ -170,15 +181,14 @@ static int kexec_file_add_ipl_report(struct kimage *image,
struct kexec_buf buf;
unsigned long addr;
void *ptr, *end;
+ int ret;
buf.image = image;
data->memsz = ALIGN(data->memsz, PAGE_SIZE);
buf.mem = data->memsz;
- if (image->type == KEXEC_TYPE_CRASH)
- buf.mem += crashk_res.start;
- ptr = (void *)ipl_cert_list_addr;
+ ptr = __va(ipl_cert_list_addr);
end = ptr + ipl_cert_list_size;
ncerts = 0;
while (ptr < end) {
@@ -190,7 +200,7 @@ static int kexec_file_add_ipl_report(struct kimage *image,
addr = data->memsz + data->report->size;
addr += ncerts * sizeof(struct ipl_rb_certificate_entry);
- ptr = (void *)ipl_cert_list_addr;
+ ptr = __va(ipl_cert_list_addr);
while (ptr < end) {
len = *(unsigned int *)ptr;
ptr += sizeof(len);
@@ -199,9 +209,13 @@ static int kexec_file_add_ipl_report(struct kimage *image,
ptr += len;
}
+ ret = -ENOMEM;
buf.buffer = ipl_report_finish(data->report);
+ if (!buf.buffer)
+ goto out;
buf.bufsz = data->report->size;
buf.memsz = buf.bufsz;
+ image->arch.ipl_buf = buf.buffer;
data->memsz += buf.memsz;
@@ -209,14 +223,21 @@ static int kexec_file_add_ipl_report(struct kimage *image,
data->kernel_buf + offsetof(struct lowcore, ipl_parmblock_ptr);
*lc_ipl_parmblock_ptr = (__u32)buf.mem;
- return kexec_add_buffer(&buf);
+ if (image->type == KEXEC_TYPE_CRASH)
+ buf.mem += crashk_res.start;
+
+ ret = kexec_add_buffer(&buf);
+out:
+ return ret;
}
void *kexec_file_add_components(struct kimage *image,
int (*add_kernel)(struct kimage *image,
struct s390_load_data *data))
{
+ unsigned long max_command_line_size = LEGACY_COMMAND_LINE_SIZE;
struct s390_load_data data = {0};
+ unsigned long minsize;
int ret;
data.report = ipl_report_init(&ipl_block);
@@ -227,10 +248,23 @@ void *kexec_file_add_components(struct kimage *image,
if (ret)
goto out;
- if (image->cmdline_buf_len >= ARCH_COMMAND_LINE_SIZE) {
- ret = -EINVAL;
+ ret = -EINVAL;
+ minsize = PARMAREA + offsetof(struct parmarea, command_line);
+ if (image->kernel_buf_len < minsize)
goto out;
- }
+
+ if (data.parm->max_command_line_size)
+ max_command_line_size = data.parm->max_command_line_size;
+
+ if (minsize + max_command_line_size < minsize)
+ goto out;
+
+ if (image->kernel_buf_len < minsize + max_command_line_size)
+ goto out;
+
+ if (image->cmdline_buf_len >= max_command_line_size)
+ goto out;
+
memcpy(data.parm->command_line, image->cmdline_buf,
image->cmdline_buf_len);
@@ -267,8 +301,16 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
const Elf_Shdr *relsec,
const Elf_Shdr *symtab)
{
+ const char *strtab, *name, *shstrtab;
+ const Elf_Shdr *sechdrs;
Elf_Rela *relas;
int i, r_type;
+ int ret;
+
+ /* String & section header string table */
+ sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff;
+ strtab = (char *)pi->ehdr + sechdrs[symtab->sh_link].sh_offset;
+ shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset;
relas = (void *)pi->ehdr + relsec->sh_offset;
@@ -281,15 +323,27 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
sym = (void *)pi->ehdr + symtab->sh_offset;
sym += ELF64_R_SYM(relas[i].r_info);
- if (sym->st_shndx == SHN_UNDEF)
+ if (sym->st_name)
+ name = strtab + sym->st_name;
+ else
+ name = shstrtab + sechdrs[sym->st_shndx].sh_name;
+
+ if (sym->st_shndx == SHN_UNDEF) {
+ pr_err("Undefined symbol: %s\n", name);
return -ENOEXEC;
+ }
- if (sym->st_shndx == SHN_COMMON)
+ if (sym->st_shndx == SHN_COMMON) {
+ pr_err("symbol '%s' in common section\n", name);
return -ENOEXEC;
+ }
if (sym->st_shndx >= pi->ehdr->e_shnum &&
- sym->st_shndx != SHN_ABS)
+ sym->st_shndx != SHN_ABS) {
+ pr_err("Invalid section %d for symbol %s\n",
+ sym->st_shndx, name);
return -ENOEXEC;
+ }
loc = pi->purgatory_buf;
loc += section->sh_offset;
@@ -303,21 +357,23 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
addr = section->sh_addr + relas[i].r_offset;
r_type = ELF64_R_TYPE(relas[i].r_info);
- arch_kexec_do_relocs(r_type, loc, val, addr);
+
+ if (r_type == R_390_PLT32DBL)
+ r_type = R_390_PC32DBL;
+
+ ret = arch_kexec_do_relocs(r_type, loc, val, addr);
+ if (ret) {
+ pr_err("Unknown rela relocation: %d\n", r_type);
+ return -ENOEXEC;
+ }
}
return 0;
}
-int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
- unsigned long buf_len)
+int arch_kimage_file_post_load_cleanup(struct kimage *image)
{
- /* A kernel must be at least large enough to contain head.S. During
- * load memory in head.S will be accessed, e.g. to register the next
- * command line. If the next kernel were smaller the current kernel
- * will panic at load.
- */
- if (buf_len < HEAD_END)
- return -ENOEXEC;
-
- return kexec_image_probe_default(image, buf, buf_len);
+ vfree(image->arch.ipl_buf);
+ image->arch.ipl_buf = NULL;
+
+ return kexec_image_post_load_cleanup_default(image);
}
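The added checks in kexec_file_add_components() bound the passed command line against max_command_line_size read from the loaded image's parm area, including a guard against integer wrap-around when that untrusted value is added to the minimum kernel size. A standalone illustration of that wrap-around guard (the helper name and the values are made up):

#include <stdbool.h>
#include <stdio.h>

/*
 * Same guard as above: adding an untrusted size to a known minimum must
 * not wrap, otherwise a later "buffer length >= minsize + max" test could
 * be defeated by a huge max value.
 */
static bool cmdline_area_fits(unsigned long buf_len, unsigned long minsize,
			      unsigned long max_cmdline)
{
	if (minsize + max_cmdline < minsize)	/* wrapped around */
		return false;
	return buf_len >= minsize + max_cmdline;
}

int main(void)
{
	printf("%d\n", cmdline_area_fits(1UL << 20, 0x10480, 896));  /* 1 */
	printf("%d\n", cmdline_area_fits(1UL << 20, 0x10480, ~0UL)); /* 0 */
	return 0;
}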
diff --git a/arch/s390/kernel/machine_kexec_reloc.c b/arch/s390/kernel/machine_kexec_reloc.c
index d5035de9020e..b7182cec48dc 100644
--- a/arch/s390/kernel/machine_kexec_reloc.c
+++ b/arch/s390/kernel/machine_kexec_reloc.c
@@ -28,6 +28,7 @@ int arch_kexec_do_relocs(int r_type, void *loc, unsigned long val,
break;
case R_390_64: /* Direct 64 bit. */
case R_390_GLOB_DAT:
+ case R_390_JMP_SLOT:
*(u64 *)loc = val;
break;
case R_390_PC16: /* PC relative 16 bit. */
diff --git a/arch/s390/kernel/mcount.S b/arch/s390/kernel/mcount.S
index 9e1660a6b9db..ae4d4fd9afcd 100644
--- a/arch/s390/kernel/mcount.S
+++ b/arch/s390/kernel/mcount.S
@@ -2,8 +2,6 @@
/*
* Copyright IBM Corp. 2008, 2009
*
- * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>,
- *
*/
#include <linux/linkage.h>
@@ -11,84 +9,187 @@
#include <asm/ftrace.h>
#include <asm/nospec-insn.h>
#include <asm/ptrace.h>
-#include <asm/export.h>
+
+#define STACK_FRAME_SIZE_PTREGS (STACK_FRAME_OVERHEAD + __PT_SIZE)
+#define STACK_PTREGS (STACK_FRAME_OVERHEAD)
+#define STACK_PTREGS_GPRS (STACK_PTREGS + __PT_GPRS)
+#define STACK_PTREGS_PSW (STACK_PTREGS + __PT_PSW)
+
+#define STACK_FRAME_SIZE_FREGS (STACK_FRAME_OVERHEAD + __FTRACE_REGS_SIZE)
+#define STACK_FREGS (STACK_FRAME_OVERHEAD)
+#define STACK_FREGS_PTREGS (STACK_FRAME_OVERHEAD + __FTRACE_REGS_PT_REGS)
+#define STACK_FREGS_PTREGS_GPRS (STACK_FREGS_PTREGS + __PT_GPRS)
+#define STACK_FREGS_PTREGS_PSW (STACK_FREGS_PTREGS + __PT_PSW)
+#define STACK_FREGS_PTREGS_ORIG_GPR2 (STACK_FREGS_PTREGS + __PT_ORIG_GPR2)
+#define STACK_FREGS_PTREGS_FLAGS (STACK_FREGS_PTREGS + __PT_FLAGS)
+
+/* packed stack: allocate just enough for r14, r15 and backchain */
+#define TRACED_FUNC_FRAME_SIZE 24
+
+#ifdef CONFIG_FUNCTION_TRACER
GEN_BR_THUNK %r1
GEN_BR_THUNK %r14
.section .kprobes.text, "ax"
-ENTRY(ftrace_stub)
+SYM_FUNC_START(ftrace_stub)
BR_EX %r14
-ENDPROC(ftrace_stub)
+SYM_FUNC_END(ftrace_stub)
-#define STACK_FRAME_SIZE (STACK_FRAME_OVERHEAD + __PT_SIZE)
-#define STACK_PTREGS (STACK_FRAME_OVERHEAD)
-#define STACK_PTREGS_GPRS (STACK_PTREGS + __PT_GPRS)
-#define STACK_PTREGS_PSW (STACK_PTREGS + __PT_PSW)
+SYM_CODE_START(ftrace_stub_direct_tramp)
+ lgr %r1, %r0
+ BR_EX %r1
+SYM_CODE_END(ftrace_stub_direct_tramp)
-ENTRY(_mcount)
- BR_EX %r14
-ENDPROC(_mcount)
-EXPORT_SYMBOL(_mcount)
+ .macro ftrace_regs_entry, allregs=0
+ stg %r14,(__SF_GPRS+8*8)(%r15) # save traced function caller
+
+ .if \allregs == 1
+ # save psw mask
+ # don't put any instructions clobbering CC before this point
+ epsw %r1,%r14
+ risbg %r14,%r1,0,31,32
+ .endif
-ENTRY(ftrace_caller)
- .globl ftrace_regs_caller
- .set ftrace_regs_caller,ftrace_caller
lgr %r1,%r15
-#if !(defined(CC_USING_HOTPATCH) || defined(CC_USING_NOP_MCOUNT))
- aghi %r0,MCOUNT_RETURN_FIXUP
-#endif
- aghi %r15,-STACK_FRAME_SIZE
+ # allocate stack frame for ftrace_caller to contain traced function
+ aghi %r15,-TRACED_FUNC_FRAME_SIZE
stg %r1,__SF_BACKCHAIN(%r15)
- stg %r1,(STACK_PTREGS_GPRS+15*8)(%r15)
- stg %r0,(STACK_PTREGS_PSW+8)(%r15)
- stmg %r2,%r14,(STACK_PTREGS_GPRS+2*8)(%r15)
+ stg %r0,(__SF_GPRS+8*8)(%r15)
+ stg %r15,(__SF_GPRS+9*8)(%r15)
+ # allocate ftrace_regs and stack frame for ftrace_trace_function
+ aghi %r15,-STACK_FRAME_SIZE_FREGS
+ stg %r1,(STACK_FREGS_PTREGS_GPRS+15*8)(%r15)
+ xc STACK_FREGS_PTREGS_ORIG_GPR2(8,%r15),STACK_FREGS_PTREGS_ORIG_GPR2(%r15)
+
+ .if \allregs == 1
+ stg %r14,(STACK_FREGS_PTREGS_PSW)(%r15)
+ mvghi STACK_FREGS_PTREGS_FLAGS(%r15),_PIF_FTRACE_FULL_REGS
+ .else
+ xc STACK_FREGS_PTREGS_FLAGS(8,%r15),STACK_FREGS_PTREGS_FLAGS(%r15)
+ .endif
+
+ lg %r14,(__SF_GPRS+8*8)(%r1) # restore original return address
+ aghi %r1,-TRACED_FUNC_FRAME_SIZE
+ stg %r1,__SF_BACKCHAIN(%r15)
+ stg %r0,(STACK_FREGS_PTREGS_PSW+8)(%r15)
+ stmg %r2,%r14,(STACK_FREGS_PTREGS_GPRS+2*8)(%r15)
+ .endm
+
+SYM_CODE_START(ftrace_regs_caller)
+ ftrace_regs_entry 1
+ j ftrace_common
+SYM_CODE_END(ftrace_regs_caller)
+
+SYM_CODE_START(ftrace_caller)
+ ftrace_regs_entry 0
+ j ftrace_common
+SYM_CODE_END(ftrace_caller)
+
+SYM_CODE_START(ftrace_common)
#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
aghik %r2,%r0,-MCOUNT_INSN_SIZE
lgrl %r4,function_trace_op
- lgrl %r1,ftrace_trace_function
+ lgrl %r1,ftrace_func
#else
lgr %r2,%r0
aghi %r2,-MCOUNT_INSN_SIZE
larl %r4,function_trace_op
lg %r4,0(%r4)
- larl %r1,ftrace_trace_function
+ larl %r1,ftrace_func
lg %r1,0(%r1)
#endif
lgr %r3,%r14
- la %r5,STACK_PTREGS(%r15)
+ la %r5,STACK_FREGS(%r15)
BASR_EX %r14,%r1
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
# The j instruction gets runtime patched to a nop instruction.
# See ftrace_enable_ftrace_graph_caller.
- .globl ftrace_graph_caller
-ftrace_graph_caller:
- j ftrace_graph_caller_end
- lmg %r2,%r3,(STACK_PTREGS_GPRS+14*8)(%r15)
- lg %r4,(STACK_PTREGS_PSW+8)(%r15)
+SYM_INNER_LABEL(ftrace_graph_caller, SYM_L_GLOBAL)
+ j .Lftrace_graph_caller_end
+ lmg %r2,%r3,(STACK_FREGS_PTREGS_GPRS+14*8)(%r15)
+ lg %r4,(STACK_FREGS_PTREGS_PSW+8)(%r15)
brasl %r14,prepare_ftrace_return
- stg %r2,(STACK_PTREGS_GPRS+14*8)(%r15)
-ftrace_graph_caller_end:
- .globl ftrace_graph_caller_end
+ stg %r2,(STACK_FREGS_PTREGS_GPRS+14*8)(%r15)
+.Lftrace_graph_caller_end:
+#endif
+ lg %r0,(STACK_FREGS_PTREGS_PSW+8)(%r15)
+#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
+ ltg %r1,STACK_FREGS_PTREGS_ORIG_GPR2(%r15)
+ locgrz %r1,%r0
+#else
+ lg %r1,STACK_FREGS_PTREGS_ORIG_GPR2(%r15)
+ ltgr %r1,%r1
+ jnz 0f
+ lgr %r1,%r0
#endif
- lg %r1,(STACK_PTREGS_PSW+8)(%r15)
- lmg %r2,%r15,(STACK_PTREGS_GPRS+2*8)(%r15)
+0: lmg %r2,%r15,(STACK_FREGS_PTREGS_GPRS+2*8)(%r15)
BR_EX %r1
-ENDPROC(ftrace_caller)
+SYM_CODE_END(ftrace_common)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-ENTRY(return_to_handler)
+SYM_FUNC_START(return_to_handler)
stmg %r2,%r5,32(%r15)
lgr %r1,%r15
- aghi %r15,-STACK_FRAME_OVERHEAD
+ aghi %r15,-(STACK_FRAME_OVERHEAD+__FGRAPH_RET_SIZE)
stg %r1,__SF_BACKCHAIN(%r15)
+ la %r3,STACK_FRAME_OVERHEAD(%r15)
+ stg %r1,__FGRAPH_RET_FP(%r3)
+ stg %r2,__FGRAPH_RET_GPR2(%r3)
+ lgr %r2,%r3
brasl %r14,ftrace_return_to_handler
- aghi %r15,STACK_FRAME_OVERHEAD
+ aghi %r15,STACK_FRAME_OVERHEAD+__FGRAPH_RET_SIZE
lgr %r14,%r2
lmg %r2,%r5,32(%r15)
BR_EX %r14
-ENDPROC(return_to_handler)
+SYM_FUNC_END(return_to_handler)
#endif
+#endif /* CONFIG_FUNCTION_TRACER */
+
+SYM_CODE_START(ftrace_shared_hotpatch_trampoline_br)
+ lmg %r0,%r1,2(%r1)
+ br %r1
+SYM_INNER_LABEL(ftrace_shared_hotpatch_trampoline_br_end, SYM_L_GLOBAL)
+SYM_CODE_END(ftrace_shared_hotpatch_trampoline_br)
+
+#ifdef CONFIG_EXPOLINE
+SYM_CODE_START(ftrace_shared_hotpatch_trampoline_exrl)
+ lmg %r0,%r1,2(%r1)
+ exrl %r0,0f
+ j .
+0: br %r1
+SYM_INNER_LABEL(ftrace_shared_hotpatch_trampoline_exrl_end, SYM_L_GLOBAL)
+SYM_CODE_END(ftrace_shared_hotpatch_trampoline_exrl)
+#endif /* CONFIG_EXPOLINE */
+
+#ifdef CONFIG_RETHOOK
+
+SYM_CODE_START(arch_rethook_trampoline)
+ stg %r14,(__SF_GPRS+8*8)(%r15)
+ lay %r15,-STACK_FRAME_SIZE_PTREGS(%r15)
+ stmg %r0,%r14,STACK_PTREGS_GPRS(%r15)
+
+ # store original stack pointer in backchain and pt_regs
+ lay %r7,STACK_FRAME_SIZE_PTREGS(%r15)
+ stg %r7,__SF_BACKCHAIN(%r15)
+ stg %r7,STACK_PTREGS_GPRS+(15*8)(%r15)
+
+ # store full psw
+ epsw %r2,%r3
+ risbg %r3,%r2,0,31,32
+ stg %r3,STACK_PTREGS_PSW(%r15)
+ larl %r1,arch_rethook_trampoline
+ stg %r1,STACK_PTREGS_PSW+8(%r15)
+
+ lay %r2,STACK_PTREGS(%r15)
+ brasl %r14,arch_rethook_trampoline_callback
+
+ mvc __SF_EMPTY(16,%r7),STACK_PTREGS_PSW(%r15)
+ lmg %r0,%r15,STACK_PTREGS_GPRS(%r15)
+ lpswe __SF_EMPTY(%r15)
+SYM_CODE_END(arch_rethook_trampoline)
+
+#endif /* CONFIG_RETHOOK */
diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c
index ba8f19bb438b..42215f9404af 100644
--- a/arch/s390/kernel/module.c
+++ b/arch/s390/kernel/module.c
@@ -14,14 +14,19 @@
#include <linux/elf.h>
#include <linux/vmalloc.h>
#include <linux/fs.h>
+#include <linux/ftrace.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/kasan.h>
#include <linux/moduleloader.h>
#include <linux/bug.h>
+#include <linux/memory.h>
#include <asm/alternative.h>
#include <asm/nospec-branch.h>
#include <asm/facility.h>
+#include <asm/ftrace.lds.h>
+#include <asm/set_memory.h>
+#include <asm/setup.h>
#if 0
#define DEBUGP printk
@@ -29,24 +34,52 @@
#define DEBUGP(fmt , ...)
#endif
-#define PLT_ENTRY_SIZE 20
+#define PLT_ENTRY_SIZE 22
+
+static unsigned long get_module_load_offset(void)
+{
+ static DEFINE_MUTEX(module_kaslr_mutex);
+ static unsigned long module_load_offset;
+
+ if (!kaslr_enabled())
+ return 0;
+ /*
+ * Calculate the module_load_offset the first time this code
+ * is called. Once calculated it stays the same until reboot.
+ */
+ mutex_lock(&module_kaslr_mutex);
+ if (!module_load_offset)
+ module_load_offset = get_random_u32_inclusive(1, 1024) * PAGE_SIZE;
+ mutex_unlock(&module_kaslr_mutex);
+ return module_load_offset;
+}
void *module_alloc(unsigned long size)
{
+ gfp_t gfp_mask = GFP_KERNEL;
void *p;
if (PAGE_ALIGN(size) > MODULES_LEN)
return NULL;
- p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END,
- GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
- __builtin_return_address(0));
- if (p && (kasan_module_alloc(p, size) < 0)) {
+ p = __vmalloc_node_range(size, MODULE_ALIGN,
+ MODULES_VADDR + get_module_load_offset(),
+ MODULES_END, gfp_mask, PAGE_KERNEL,
+ VM_FLUSH_RESET_PERMS | VM_DEFER_KMEMLEAK,
+ NUMA_NO_NODE, __builtin_return_address(0));
+ if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
vfree(p);
return NULL;
}
return p;
}
+#ifdef CONFIG_FUNCTION_TRACER
+void module_arch_cleanup(struct module *mod)
+{
+ module_memfree(mod->arch.trampolines_start);
+}
+#endif
+
void module_arch_freeing_init(struct module *mod)
{
if (is_livepatch_module(mod) &&
@@ -114,6 +147,7 @@ int module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
Elf_Rela *rela;
char *strings;
int nrela, i, j;
+ struct module_memory *mod_mem;
/* Find symbol table and string table. */
symtab = NULL;
@@ -161,23 +195,26 @@ int module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
/* Increase core size by size of got & plt and set start
offsets for got and plt. */
- me->core_layout.size = ALIGN(me->core_layout.size, 4);
- me->arch.got_offset = me->core_layout.size;
- me->core_layout.size += me->arch.got_size;
- me->arch.plt_offset = me->core_layout.size;
+ mod_mem = &me->mem[MOD_TEXT];
+ mod_mem->size = ALIGN(mod_mem->size, 4);
+ me->arch.got_offset = mod_mem->size;
+ mod_mem->size += me->arch.got_size;
+ me->arch.plt_offset = mod_mem->size;
if (me->arch.plt_size) {
if (IS_ENABLED(CONFIG_EXPOLINE) && !nospec_disable)
me->arch.plt_size += PLT_ENTRY_SIZE;
- me->core_layout.size += me->arch.plt_size;
+ mod_mem->size += me->arch.plt_size;
}
return 0;
}
static int apply_rela_bits(Elf_Addr loc, Elf_Addr val,
- int sign, int bits, int shift)
+ int sign, int bits, int shift,
+ void *(*write)(void *dest, const void *src, size_t len))
{
unsigned long umax;
long min, max;
+ void *dest = (void *)loc;
if (val & ((1UL << shift) - 1))
return -ENOEXEC;
@@ -194,26 +231,33 @@ static int apply_rela_bits(Elf_Addr loc, Elf_Addr val,
return -ENOEXEC;
}
- if (bits == 8)
- *(unsigned char *) loc = val;
- else if (bits == 12)
- *(unsigned short *) loc = (val & 0xfff) |
+ if (bits == 8) {
+ unsigned char tmp = val;
+ write(dest, &tmp, 1);
+ } else if (bits == 12) {
+ unsigned short tmp = (val & 0xfff) |
(*(unsigned short *) loc & 0xf000);
- else if (bits == 16)
- *(unsigned short *) loc = val;
- else if (bits == 20)
- *(unsigned int *) loc = (val & 0xfff) << 16 |
- (val & 0xff000) >> 4 |
- (*(unsigned int *) loc & 0xf00000ff);
- else if (bits == 32)
- *(unsigned int *) loc = val;
- else if (bits == 64)
- *(unsigned long *) loc = val;
+ write(dest, &tmp, 2);
+ } else if (bits == 16) {
+ unsigned short tmp = val;
+ write(dest, &tmp, 2);
+ } else if (bits == 20) {
+ unsigned int tmp = (val & 0xfff) << 16 |
+ (val & 0xff000) >> 4 | (*(unsigned int *) loc & 0xf00000ff);
+ write(dest, &tmp, 4);
+ } else if (bits == 32) {
+ unsigned int tmp = val;
+ write(dest, &tmp, 4);
+ } else if (bits == 64) {
+ unsigned long tmp = val;
+ write(dest, &tmp, 8);
+ }
return 0;
}
static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab,
- const char *strtab, struct module *me)
+ const char *strtab, struct module *me,
+ void *(*write)(void *dest, const void *src, size_t len))
{
struct mod_arch_syminfo *info;
Elf_Addr loc, val;
@@ -241,17 +285,17 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab,
case R_390_64: /* Direct 64 bit. */
val += rela->r_addend;
if (r_type == R_390_8)
- rc = apply_rela_bits(loc, val, 0, 8, 0);
+ rc = apply_rela_bits(loc, val, 0, 8, 0, write);
else if (r_type == R_390_12)
- rc = apply_rela_bits(loc, val, 0, 12, 0);
+ rc = apply_rela_bits(loc, val, 0, 12, 0, write);
else if (r_type == R_390_16)
- rc = apply_rela_bits(loc, val, 0, 16, 0);
+ rc = apply_rela_bits(loc, val, 0, 16, 0, write);
else if (r_type == R_390_20)
- rc = apply_rela_bits(loc, val, 1, 20, 0);
+ rc = apply_rela_bits(loc, val, 1, 20, 0, write);
else if (r_type == R_390_32)
- rc = apply_rela_bits(loc, val, 0, 32, 0);
+ rc = apply_rela_bits(loc, val, 0, 32, 0, write);
else if (r_type == R_390_64)
- rc = apply_rela_bits(loc, val, 0, 64, 0);
+ rc = apply_rela_bits(loc, val, 0, 64, 0, write);
break;
case R_390_PC16: /* PC relative 16 bit. */
case R_390_PC16DBL: /* PC relative 16 bit shifted by 1. */
@@ -260,15 +304,15 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab,
case R_390_PC64: /* PC relative 64 bit. */
val += rela->r_addend - loc;
if (r_type == R_390_PC16)
- rc = apply_rela_bits(loc, val, 1, 16, 0);
+ rc = apply_rela_bits(loc, val, 1, 16, 0, write);
else if (r_type == R_390_PC16DBL)
- rc = apply_rela_bits(loc, val, 1, 16, 1);
+ rc = apply_rela_bits(loc, val, 1, 16, 1, write);
else if (r_type == R_390_PC32DBL)
- rc = apply_rela_bits(loc, val, 1, 32, 1);
+ rc = apply_rela_bits(loc, val, 1, 32, 1, write);
else if (r_type == R_390_PC32)
- rc = apply_rela_bits(loc, val, 1, 32, 0);
+ rc = apply_rela_bits(loc, val, 1, 32, 0, write);
else if (r_type == R_390_PC64)
- rc = apply_rela_bits(loc, val, 1, 64, 0);
+ rc = apply_rela_bits(loc, val, 1, 64, 0, write);
break;
case R_390_GOT12: /* 12 bit GOT offset. */
case R_390_GOT16: /* 16 bit GOT offset. */
@@ -283,33 +327,34 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab,
case R_390_GOTPLT64: /* 64 bit offset to jump slot. */
case R_390_GOTPLTENT: /* 32 bit rel. offset to jump slot >> 1. */
if (info->got_initialized == 0) {
- Elf_Addr *gotent;
+ Elf_Addr *gotent = me->mem[MOD_TEXT].base +
+ me->arch.got_offset +
+ info->got_offset;
- gotent = me->core_layout.base + me->arch.got_offset +
- info->got_offset;
- *gotent = val;
+ write(gotent, &val, sizeof(*gotent));
info->got_initialized = 1;
}
val = info->got_offset + rela->r_addend;
if (r_type == R_390_GOT12 ||
r_type == R_390_GOTPLT12)
- rc = apply_rela_bits(loc, val, 0, 12, 0);
+ rc = apply_rela_bits(loc, val, 0, 12, 0, write);
else if (r_type == R_390_GOT16 ||
r_type == R_390_GOTPLT16)
- rc = apply_rela_bits(loc, val, 0, 16, 0);
+ rc = apply_rela_bits(loc, val, 0, 16, 0, write);
else if (r_type == R_390_GOT20 ||
r_type == R_390_GOTPLT20)
- rc = apply_rela_bits(loc, val, 1, 20, 0);
+ rc = apply_rela_bits(loc, val, 1, 20, 0, write);
else if (r_type == R_390_GOT32 ||
r_type == R_390_GOTPLT32)
- rc = apply_rela_bits(loc, val, 0, 32, 0);
+ rc = apply_rela_bits(loc, val, 0, 32, 0, write);
else if (r_type == R_390_GOT64 ||
r_type == R_390_GOTPLT64)
- rc = apply_rela_bits(loc, val, 0, 64, 0);
+ rc = apply_rela_bits(loc, val, 0, 64, 0, write);
else if (r_type == R_390_GOTENT ||
r_type == R_390_GOTPLTENT) {
- val += (Elf_Addr) me->core_layout.base - loc;
- rc = apply_rela_bits(loc, val, 1, 32, 1);
+ val += (Elf_Addr)me->mem[MOD_TEXT].base +
+ me->arch.got_offset - loc;
+ rc = apply_rela_bits(loc, val, 1, 32, 1, write);
}
break;
case R_390_PLT16DBL: /* 16 bit PC rel. PLT shifted by 1. */
@@ -320,25 +365,28 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab,
case R_390_PLTOFF32: /* 32 bit offset from GOT to PLT. */
case R_390_PLTOFF64: /* 16 bit offset from GOT to PLT. */
if (info->plt_initialized == 0) {
- unsigned int *ip;
- ip = me->core_layout.base + me->arch.plt_offset +
- info->plt_offset;
- ip[0] = 0x0d10e310; /* basr 1,0 */
- ip[1] = 0x100a0004; /* lg 1,10(1) */
+ unsigned char insn[PLT_ENTRY_SIZE];
+ char *plt_base;
+ char *ip;
+
+ plt_base = me->mem[MOD_TEXT].base + me->arch.plt_offset;
+ ip = plt_base + info->plt_offset;
+ *(int *)insn = 0x0d10e310; /* basr 1,0 */
+ *(int *)&insn[4] = 0x100c0004; /* lg 1,12(1) */
if (IS_ENABLED(CONFIG_EXPOLINE) && !nospec_disable) {
- unsigned int *ij;
- ij = me->core_layout.base +
- me->arch.plt_offset +
- me->arch.plt_size - PLT_ENTRY_SIZE;
- ip[2] = 0xa7f40000 + /* j __jump_r1 */
- (unsigned int)(u16)
- (((unsigned long) ij - 8 -
- (unsigned long) ip) / 2);
+ char *jump_r1;
+
+ jump_r1 = plt_base + me->arch.plt_size -
+ PLT_ENTRY_SIZE;
+ /* brcl 0xf,__jump_r1 */
+ *(short *)&insn[8] = 0xc0f4;
+ *(int *)&insn[10] = (jump_r1 - (ip + 8)) / 2;
} else {
- ip[2] = 0x07f10000; /* br %r1 */
+ *(int *)&insn[8] = 0x07f10000; /* br %r1 */
}
- ip[3] = (unsigned int) (val >> 32);
- ip[4] = (unsigned int) val;
+ *(long *)&insn[14] = val;
+
+ write(ip, insn, sizeof(insn));
info->plt_initialized = 1;
}
if (r_type == R_390_PLTOFF16 ||
@@ -351,44 +399,44 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab,
val - loc + 0xffffUL < 0x1ffffeUL) ||
(r_type == R_390_PLT32DBL &&
val - loc + 0xffffffffULL < 0x1fffffffeULL)))
- val = (Elf_Addr) me->core_layout.base +
+ val = (Elf_Addr) me->mem[MOD_TEXT].base +
me->arch.plt_offset +
info->plt_offset;
val += rela->r_addend - loc;
}
if (r_type == R_390_PLT16DBL)
- rc = apply_rela_bits(loc, val, 1, 16, 1);
+ rc = apply_rela_bits(loc, val, 1, 16, 1, write);
else if (r_type == R_390_PLTOFF16)
- rc = apply_rela_bits(loc, val, 0, 16, 0);
+ rc = apply_rela_bits(loc, val, 0, 16, 0, write);
else if (r_type == R_390_PLT32DBL)
- rc = apply_rela_bits(loc, val, 1, 32, 1);
+ rc = apply_rela_bits(loc, val, 1, 32, 1, write);
else if (r_type == R_390_PLT32 ||
r_type == R_390_PLTOFF32)
- rc = apply_rela_bits(loc, val, 0, 32, 0);
+ rc = apply_rela_bits(loc, val, 0, 32, 0, write);
else if (r_type == R_390_PLT64 ||
r_type == R_390_PLTOFF64)
- rc = apply_rela_bits(loc, val, 0, 64, 0);
+ rc = apply_rela_bits(loc, val, 0, 64, 0, write);
break;
case R_390_GOTOFF16: /* 16 bit offset to GOT. */
case R_390_GOTOFF32: /* 32 bit offset to GOT. */
case R_390_GOTOFF64: /* 64 bit offset to GOT. */
val = val + rela->r_addend -
- ((Elf_Addr) me->core_layout.base + me->arch.got_offset);
+ ((Elf_Addr) me->mem[MOD_TEXT].base + me->arch.got_offset);
if (r_type == R_390_GOTOFF16)
- rc = apply_rela_bits(loc, val, 0, 16, 0);
+ rc = apply_rela_bits(loc, val, 0, 16, 0, write);
else if (r_type == R_390_GOTOFF32)
- rc = apply_rela_bits(loc, val, 0, 32, 0);
+ rc = apply_rela_bits(loc, val, 0, 32, 0, write);
else if (r_type == R_390_GOTOFF64)
- rc = apply_rela_bits(loc, val, 0, 64, 0);
+ rc = apply_rela_bits(loc, val, 0, 64, 0, write);
break;
case R_390_GOTPC: /* 32 bit PC relative offset to GOT. */
case R_390_GOTPCDBL: /* 32 bit PC rel. off. to GOT shifted by 1. */
- val = (Elf_Addr) me->core_layout.base + me->arch.got_offset +
+ val = (Elf_Addr) me->mem[MOD_TEXT].base + me->arch.got_offset +
rela->r_addend - loc;
if (r_type == R_390_GOTPC)
- rc = apply_rela_bits(loc, val, 1, 32, 0);
+ rc = apply_rela_bits(loc, val, 1, 32, 0, write);
else if (r_type == R_390_GOTPCDBL)
- rc = apply_rela_bits(loc, val, 1, 32, 1);
+ rc = apply_rela_bits(loc, val, 1, 32, 1, write);
break;
case R_390_COPY:
case R_390_GLOB_DAT: /* Create GOT entry. */
@@ -412,9 +460,10 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab,
return 0;
}
-int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
+static int __apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
unsigned int symindex, unsigned int relsec,
- struct module *me)
+ struct module *me,
+ void *(*write)(void *dest, const void *src, size_t len))
{
Elf_Addr base;
Elf_Sym *symtab;
@@ -430,13 +479,51 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
n = sechdrs[relsec].sh_size / sizeof(Elf_Rela);
for (i = 0; i < n; i++, rela++) {
- rc = apply_rela(rela, base, symtab, strtab, me);
+ rc = apply_rela(rela, base, symtab, strtab, me, write);
if (rc)
return rc;
}
return 0;
}
+int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
+ unsigned int symindex, unsigned int relsec,
+ struct module *me)
+{
+ bool early = me->state == MODULE_STATE_UNFORMED;
+ void *(*write)(void *, const void *, size_t) = memcpy;
+
+ if (!early)
+ write = s390_kernel_write;
+
+ return __apply_relocate_add(sechdrs, strtab, symindex, relsec, me,
+ write);
+}
+
+#ifdef CONFIG_FUNCTION_TRACER
+static int module_alloc_ftrace_hotpatch_trampolines(struct module *me,
+ const Elf_Shdr *s)
+{
+ char *start, *end;
+ int numpages;
+ size_t size;
+
+ size = FTRACE_HOTPATCH_TRAMPOLINES_SIZE(s->sh_size);
+ numpages = DIV_ROUND_UP(size, PAGE_SIZE);
+ start = module_alloc(numpages * PAGE_SIZE);
+ if (!start)
+ return -ENOMEM;
+ set_memory_rox((unsigned long)start, numpages);
+ end = start + size;
+
+ me->arch.trampolines_start = (struct ftrace_hotpatch_trampoline *)start;
+ me->arch.trampolines_end = (struct ftrace_hotpatch_trampoline *)end;
+ me->arch.next_trampoline = me->arch.trampolines_start;
+
+ return 0;
+}
+#endif /* CONFIG_FUNCTION_TRACER */
+
int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *sechdrs,
struct module *me)
@@ -444,22 +531,19 @@ int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *s;
char *secstrings, *secname;
void *aseg;
+#ifdef CONFIG_FUNCTION_TRACER
+ int ret;
+#endif
if (IS_ENABLED(CONFIG_EXPOLINE) &&
!nospec_disable && me->arch.plt_size) {
unsigned int *ij;
- ij = me->core_layout.base + me->arch.plt_offset +
+ ij = me->mem[MOD_TEXT].base + me->arch.plt_offset +
me->arch.plt_size - PLT_ENTRY_SIZE;
- if (test_facility(35)) {
- ij[0] = 0xc6000000; /* exrl %r0,.+10 */
- ij[1] = 0x0005a7f4; /* j . */
- ij[2] = 0x000007f1; /* br %r1 */
- } else {
- ij[0] = 0x44000000 | (unsigned int)
- offsetof(struct lowcore, br_r1_trampoline);
- ij[1] = 0xa7f40000; /* j . */
- }
+ ij[0] = 0xc6000000; /* exrl %r0,.+10 */
+ ij[1] = 0x0005a7f4; /* j . */
+ ij[2] = 0x000007f1; /* br %r1 */
}
secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
@@ -478,8 +562,15 @@ int module_finalize(const Elf_Ehdr *hdr,
if (IS_ENABLED(CONFIG_EXPOLINE) &&
(str_has_prefix(secname, ".s390_return")))
nospec_revert(aseg, aseg + s->sh_size);
+
+#ifdef CONFIG_FUNCTION_TRACER
+ if (!strcmp(FTRACE_CALLSITE_SECTION, secname)) {
+ ret = module_alloc_ftrace_hotpatch_trampolines(me, s);
+ if (ret < 0)
+ return ret;
+ }
+#endif /* CONFIG_FUNCTION_TRACER */
}
- jump_label_apply_nops(me);
return 0;
}
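apply_rela_bits() now routes every store through a write callback: plain memcpy() while the module is still unformed, s390_kernel_write() once the text has been mapped read-only. The bits == 20 case above splits a 20-bit displacement across two fields of the 32-bit instruction word; a small, self-contained illustration of that packing on a made-up instruction word:

#include <stdint.h>
#include <stdio.h>

/*
 * Pack a 20 bit displacement the same way the bits == 20 case does: the
 * low 12 bits and the high 8 bits land in separate fields, all other
 * bits of the instruction word are preserved.
 */
static uint32_t pack_disp20(uint32_t word, uint32_t val)
{
	return (val & 0xfff) << 16 | (val & 0xff000) >> 4 | (word & 0xf00000ff);
}

int main(void)
{
	printf("%08x\n", pack_disp20(0xe3102000, 0x12345)); /* prints e3451200 */
	return 0;
}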
diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c
index 0a487fae763e..9ad44c26d1a2 100644
--- a/arch/s390/kernel/nmi.c
+++ b/arch/s390/kernel/nmi.c
@@ -6,12 +6,12 @@
* Author(s): Ingo Adlung <adlung@de.ibm.com>,
* Martin Schwidefsky <schwidefsky@de.ibm.com>,
* Cornelia Huck <cornelia.huck@de.ibm.com>,
- * Heiko Carstens <heiko.carstens@de.ibm.com>,
*/
#include <linux/kernel_stat.h>
#include <linux/init.h>
#include <linux/errno.h>
+#include <linux/entry-common.h>
#include <linux/hardirq.h>
#include <linux/log2.h>
#include <linux/kprobes.h>
@@ -19,18 +19,20 @@
#include <linux/time.h>
#include <linux/module.h>
#include <linux/sched/signal.h>
-
+#include <linux/kvm_host.h>
#include <linux/export.h>
#include <asm/lowcore.h>
+#include <asm/ctlreg.h>
#include <asm/smp.h>
#include <asm/stp.h>
#include <asm/cputime.h>
#include <asm/nmi.h>
#include <asm/crw.h>
#include <asm/switch_to.h>
-#include <asm/ctl_reg.h>
#include <asm/asm-offsets.h>
-#include <linux/kvm_host.h>
+#include <asm/pai.h>
+#include <asm/vx-insn.h>
+#include <asm/fpu/api.h>
struct mcck_struct {
unsigned int kill_task : 1;
@@ -41,116 +43,134 @@ struct mcck_struct {
};
static DEFINE_PER_CPU(struct mcck_struct, cpu_mcck);
-static struct kmem_cache *mcesa_cache;
-static unsigned long mcesa_origin_lc;
static inline int nmi_needs_mcesa(void)
{
- return MACHINE_HAS_VX || MACHINE_HAS_GS;
-}
-
-static inline unsigned long nmi_get_mcesa_size(void)
-{
- if (MACHINE_HAS_GS)
- return MCESA_MAX_SIZE;
- return MCESA_MIN_SIZE;
+ return cpu_has_vx() || MACHINE_HAS_GS;
}
/*
* The initial machine check extended save area for the boot CPU.
- * It will be replaced by nmi_init() with an allocated structure.
- * The structure is required for machine check happening early in
- * the boot process.
+ * It will be replaced on the boot CPU reinit with an allocated
+ * structure. The structure is required for machine check happening
+ * early in the boot process.
*/
-static struct mcesa boot_mcesa __initdata __aligned(MCESA_MAX_SIZE);
+static struct mcesa boot_mcesa __aligned(MCESA_MAX_SIZE);
-void __init nmi_alloc_boot_cpu(struct lowcore *lc)
+void __init nmi_alloc_mcesa_early(u64 *mcesad)
{
if (!nmi_needs_mcesa())
return;
- lc->mcesad = (unsigned long) &boot_mcesa;
+ *mcesad = __pa(&boot_mcesa);
if (MACHINE_HAS_GS)
- lc->mcesad |= ilog2(MCESA_MAX_SIZE);
+ *mcesad |= ilog2(MCESA_MAX_SIZE);
}
-static int __init nmi_init(void)
+int nmi_alloc_mcesa(u64 *mcesad)
{
- unsigned long origin, cr0, size;
+ unsigned long size;
+ void *origin;
+ *mcesad = 0;
if (!nmi_needs_mcesa())
return 0;
- size = nmi_get_mcesa_size();
- if (size > MCESA_MIN_SIZE)
- mcesa_origin_lc = ilog2(size);
- /* create slab cache for the machine-check-extended-save-areas */
- mcesa_cache = kmem_cache_create("nmi_save_areas", size, size, 0, NULL);
- if (!mcesa_cache)
- panic("Couldn't create nmi save area cache");
- origin = (unsigned long) kmem_cache_alloc(mcesa_cache, GFP_KERNEL);
+ size = MACHINE_HAS_GS ? MCESA_MAX_SIZE : MCESA_MIN_SIZE;
+ origin = kmalloc(size, GFP_KERNEL);
if (!origin)
- panic("Couldn't allocate nmi save area");
+ return -ENOMEM;
/* The pointer is stored with mcesa_bits ORed in */
- kmemleak_not_leak((void *) origin);
- __ctl_store(cr0, 0, 0);
- __ctl_clear_bit(0, 28); /* disable lowcore protection */
- /* Replace boot_mcesa on the boot CPU */
- S390_lowcore.mcesad = origin | mcesa_origin_lc;
- __ctl_load(cr0, 0, 0);
+ kmemleak_not_leak(origin);
+ *mcesad = __pa(origin);
+ if (MACHINE_HAS_GS)
+ *mcesad |= ilog2(MCESA_MAX_SIZE);
return 0;
}
-early_initcall(nmi_init);
-int nmi_alloc_per_cpu(struct lowcore *lc)
+void nmi_free_mcesa(u64 *mcesad)
{
- unsigned long origin;
-
if (!nmi_needs_mcesa())
- return 0;
- origin = (unsigned long) kmem_cache_alloc(mcesa_cache, GFP_KERNEL);
- if (!origin)
- return -ENOMEM;
- /* The pointer is stored with mcesa_bits ORed in */
- kmemleak_not_leak((void *) origin);
- lc->mcesad = origin | mcesa_origin_lc;
- return 0;
+ return;
+ kfree(__va(*mcesad & MCESA_ORIGIN_MASK));
}
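nmi_alloc_mcesa() and nmi_alloc_mcesa_early() hand out the machine check extended save area as a physical address and, when guarded storage is available, encode ilog2() of the larger area size in its low bits; MCESA_ORIGIN_MASK strips that size hint again, as in the kfree() above. A small user-space illustration of the encoding; the DEMO_* constants are stand-ins, not the real MCESA_* definitions from the arch headers:

#include <stdio.h>

#define DEMO_MAX_SIZE		2048UL
#define DEMO_ORIGIN_MASK	(~(DEMO_MAX_SIZE - 1))

int main(void)
{
	unsigned long origin = 0x12345000UL;	/* suitably aligned save area */
	unsigned long mcesad = origin | 11;	/* ilog2(DEMO_MAX_SIZE) == 11 */

	printf("origin %#lx, size %lu\n",
	       mcesad & DEMO_ORIGIN_MASK, 1UL << (mcesad & ~DEMO_ORIGIN_MASK));
	return 0;
}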
-void nmi_free_per_cpu(struct lowcore *lc)
+static __always_inline char *nmi_puts(char *dest, const char *src)
{
- if (!nmi_needs_mcesa())
- return;
- kmem_cache_free(mcesa_cache, (void *)(lc->mcesad & MCESA_ORIGIN_MASK));
+ while (*src)
+ *dest++ = *src++;
+ *dest = 0;
+ return dest;
+}
+
+static __always_inline char *u64_to_hex(char *dest, u64 val)
+{
+ int i, num;
+
+ for (i = 1; i <= 16; i++) {
+ num = (val >> (64 - 4 * i)) & 0xf;
+ if (num >= 10)
+ *dest++ = 'A' + num - 10;
+ else
+ *dest++ = '0' + num;
+ }
+ *dest = 0;
+ return dest;
}
static notrace void s390_handle_damage(void)
{
+ union ctlreg0 cr0, cr0_new;
+ char message[100];
+ psw_t psw_save;
+ char *ptr;
+
smp_emergency_stop();
+ diag_amode31_ops.diag308_reset();
+ ptr = nmi_puts(message, "System stopped due to unrecoverable machine check, code: 0x");
+ u64_to_hex(ptr, S390_lowcore.mcck_interruption_code);
+
+ /*
+	 * Disable low address protection and turn the machine check new PSW
+	 * into a disabled wait PSW. Any additional machine check cannot be handled.
+ */
+ local_ctl_store(0, &cr0.reg);
+ cr0_new = cr0;
+ cr0_new.lap = 0;
+ local_ctl_load(0, &cr0_new.reg);
+ psw_save = S390_lowcore.mcck_new_psw;
+ psw_bits(S390_lowcore.mcck_new_psw).io = 0;
+ psw_bits(S390_lowcore.mcck_new_psw).ext = 0;
+ psw_bits(S390_lowcore.mcck_new_psw).wait = 1;
+ sclp_emergency_printk(message);
+
+ /*
+ * Restore machine check new PSW and control register 0 to original
+	 * values. This makes system dump analysis easier.
+ */
+ S390_lowcore.mcck_new_psw = psw_save;
+ local_ctl_load(0, &cr0.reg);
disabled_wait();
while (1);
}
NOKPROBE_SYMBOL(s390_handle_damage);
/*
- * Main machine check handler function. Will be called with interrupts enabled
- * or disabled and machine checks enabled or disabled.
+ * Main machine check handler function. Will be called with interrupts disabled
+ * and machine checks enabled.
*/
void s390_handle_mcck(void)
{
- unsigned long flags;
struct mcck_struct mcck;
+ unsigned long mflags;
/*
* Disable machine checks and get the current state of accumulated
* machine checks. Afterwards delete the old state and enable machine
* checks again.
*/
- local_irq_save(flags);
- local_mcck_disable();
+ local_mcck_save(mflags);
mcck = *this_cpu_ptr(&cpu_mcck);
memset(this_cpu_ptr(&cpu_mcck), 0, sizeof(mcck));
- clear_cpu_flag(CIF_MCCK_PENDING);
- local_mcck_enable();
- local_irq_restore(flags);
+ local_mcck_restore(mflags);
if (mcck.channel_report)
crw_handle_channel_report();
@@ -167,134 +187,150 @@ void s390_handle_mcck(void)
static int mchchk_wng_posted = 0;
/* Use single cpu clear, as we cannot handle smp here. */
- __ctl_clear_bit(14, 24); /* Disable WARNING MCH */
+ local_ctl_clear_bit(14, CR14_WARNING_SUBMASK_BIT);
if (xchg(&mchchk_wng_posted, 1) == 0)
kill_cad_pid(SIGPWR, 1);
}
if (mcck.stp_queue)
stp_queue_work();
if (mcck.kill_task) {
- local_irq_enable();
printk(KERN_EMERG "mcck: Terminating task because of machine "
"malfunction (code 0x%016lx).\n", mcck.mcck_code);
printk(KERN_EMERG "mcck: task: %s, pid: %d.\n",
current->comm, current->pid);
- do_exit(SIGSEGV);
+ if (is_global_init(current))
+ panic("mcck: Attempting to kill init!\n");
+ do_send_sig_info(SIGKILL, SEND_SIG_PRIV, current, PIDTYPE_PID);
}
}
-EXPORT_SYMBOL_GPL(s390_handle_mcck);
/*
- * returns 0 if all required registers are available
+ * returns 0 if register contents could be validated
* returns 1 otherwise
*/
-static int notrace s390_check_registers(union mci mci, int umode)
+static int notrace s390_validate_registers(union mci mci)
{
+ struct mcesa *mcesa;
+ void *fpt_save_area;
union ctlreg2 cr2;
int kill_task;
+ u64 zero;
kill_task = 0;
+ zero = 0;
- if (!mci.gr) {
- /*
- * General purpose registers couldn't be restored and have
- * unknown contents. Stop system or terminate process.
- */
- if (!umode)
- s390_handle_damage();
+ if (!mci.gr || !mci.fp)
kill_task = 1;
- }
- /* Check control registers */
- if (!mci.cr) {
- /*
- * Control registers have unknown contents.
- * Can't recover and therefore stopping machine.
- */
- s390_handle_damage();
- }
- if (!mci.fp) {
- /*
- * Floating point registers can't be restored. If the
- * kernel currently uses floating point registers the
- * system is stopped. If the process has its floating
- * pointer registers loaded it is terminated.
- */
- if (S390_lowcore.fpu_flags & KERNEL_VXR_V0V7)
- s390_handle_damage();
- if (!test_cpu_flag(CIF_FPU))
- kill_task = 1;
- }
+ fpt_save_area = &S390_lowcore.floating_pt_save_area;
if (!mci.fc) {
- /*
- * Floating point control register can't be restored.
- * If the kernel currently uses the floating pointer
- * registers and needs the FPC register the system is
- * stopped. If the process has its floating pointer
- * registers loaded it is terminated.
- */
- if (S390_lowcore.fpu_flags & KERNEL_FPC)
- s390_handle_damage();
- if (!test_cpu_flag(CIF_FPU))
- kill_task = 1;
+ kill_task = 1;
+ asm volatile(
+ " lfpc %0\n"
+ :
+ : "Q" (zero));
+ } else {
+ asm volatile(
+ " lfpc %0\n"
+ :
+ : "Q" (S390_lowcore.fpt_creg_save_area));
}
- if (MACHINE_HAS_VX) {
- if (!mci.vr) {
- /*
- * Vector registers can't be restored. If the kernel
- * currently uses vector registers the system is
- * stopped. If the process has its vector registers
- * loaded it is terminated.
- */
- if (S390_lowcore.fpu_flags & KERNEL_VXR)
- s390_handle_damage();
- if (!test_cpu_flag(CIF_FPU))
- kill_task = 1;
- }
- }
- /* Check if access registers are valid */
- if (!mci.ar) {
+ mcesa = __va(S390_lowcore.mcesad & MCESA_ORIGIN_MASK);
+ if (!cpu_has_vx()) {
+ /* Validate floating point registers */
+ asm volatile(
+ " ld 0,0(%0)\n"
+ " ld 1,8(%0)\n"
+ " ld 2,16(%0)\n"
+ " ld 3,24(%0)\n"
+ " ld 4,32(%0)\n"
+ " ld 5,40(%0)\n"
+ " ld 6,48(%0)\n"
+ " ld 7,56(%0)\n"
+ " ld 8,64(%0)\n"
+ " ld 9,72(%0)\n"
+ " ld 10,80(%0)\n"
+ " ld 11,88(%0)\n"
+ " ld 12,96(%0)\n"
+ " ld 13,104(%0)\n"
+ " ld 14,112(%0)\n"
+ " ld 15,120(%0)\n"
+ :
+ : "a" (fpt_save_area)
+ : "memory");
+ } else {
+ /* Validate vector registers */
+ union ctlreg0 cr0;
+
/*
- * Access registers have unknown contents.
- * Terminating task.
+ * The vector validity must only be checked if not running a
+ * KVM guest. For KVM guests the machine check is forwarded by
+ * KVM and it is the responsibility of the guest to take
+ * appropriate actions. The host vector or FPU values have been
+ * saved by KVM and will be restored by KVM.
*/
- kill_task = 1;
+ if (!mci.vr && !test_cpu_flag(CIF_MCCK_GUEST))
+ kill_task = 1;
+ cr0.reg = S390_lowcore.cregs_save_area[0];
+ cr0.afp = cr0.vx = 1;
+ local_ctl_load(0, &cr0.reg);
+ asm volatile(
+ " la 1,%0\n"
+ " VLM 0,15,0,1\n"
+ " VLM 16,31,256,1\n"
+ :
+ : "Q" (*(struct vx_array *)mcesa->vector_save_area)
+ : "1");
+ local_ctl_load(0, &S390_lowcore.cregs_save_area[0]);
}
- /* Check guarded storage registers */
- cr2.val = S390_lowcore.cregs_save_area[2];
+ /* Validate access registers */
+ asm volatile(
+ " lam 0,15,0(%0)\n"
+ :
+ : "a" (&S390_lowcore.access_regs_save_area)
+ : "memory");
+ if (!mci.ar)
+ kill_task = 1;
+ /* Validate guarded storage registers */
+ cr2.reg = S390_lowcore.cregs_save_area[2];
if (cr2.gse) {
if (!mci.gs) {
/*
- * Guarded storage register can't be restored and
- * the current processes uses guarded storage.
- * It has to be terminated.
+ * 2 cases:
+ * - machine check in kernel or userspace
+ * - machine check while running SIE (KVM guest)
+ * For kernel or userspace the userspace values of
+ * guarded storage control can not be recreated, the
+ * process must be terminated.
+ * For SIE the guest values of guarded storage can not
+ * be recreated. This is either due to a bug or due to
+ * GS being disabled in the guest. The guest will be
+			 * notified by KVM code and the guest's machine check
+ * handling must take care of this. The host values
+ * are saved by KVM and are not affected.
*/
- kill_task = 1;
+ if (!test_cpu_flag(CIF_MCCK_GUEST))
+ kill_task = 1;
+ } else {
+ load_gs_cb((struct gs_cb *)mcesa->guarded_storage_save_area);
}
}
- /* Check if old PSW is valid */
- if (!mci.wp) {
- /*
- * Can't tell if we come from user or kernel mode
- * -> stopping machine.
- */
- s390_handle_damage();
- }
- /* Check for invalid kernel instruction address */
- if (!mci.ia && !umode) {
- /*
- * The instruction address got lost while running
- * in the kernel -> stopping machine.
- */
- s390_handle_damage();
- }
+ /*
+ * The getcpu vdso syscall reads CPU number from the programmable
+ * field of the TOD clock. Disregard the TOD programmable register
+ * validity bit and load the CPU number into the TOD programmable
+ * field unconditionally.
+ */
+ set_tod_programmable_field(raw_smp_processor_id());
+ /* Validate clock comparator register */
+ set_clock_comparator(S390_lowcore.clock_comparator);
if (!mci.ms || !mci.pm || !mci.ia)
kill_task = 1;
return kill_task;
}
-NOKPROBE_SYMBOL(s390_check_registers);
+NOKPROBE_SYMBOL(s390_validate_registers);
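
The rewritten s390_validate_registers() always reloads register contents from their save areas and only reports back whether the interrupted user context has lost state. A condensed sketch of that decision, with the guarded-storage case omitted; the helper name registers_need_kill_task() is invented for illustration, while the mci bit names follow union mci as used above:

static int registers_need_kill_task(union mci mci, int mcck_guest)
{
	int kill_task = 0;

	if (!mci.fc)					/* FP control register lost */
		kill_task = 1;
	if (cpu_has_vx() && !mci.vr && !mcck_guest)	/* vector registers lost */
		kill_task = 1;
	if (!mci.ar)					/* access registers lost */
		kill_task = 1;
	if (!mci.ms || !mci.pm || !mci.ia)		/* PSW validity bits missing */
		kill_task = 1;
	return kill_task;
}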
/*
* Backup the guest's machine check info to its description block
@@ -305,8 +341,7 @@ static void notrace s390_backup_mcck_info(struct pt_regs *regs)
struct sie_page *sie_page;
/* r14 contains the sie block, which was set in sie64a */
- struct kvm_s390_sie_block *sie_block =
- (struct kvm_s390_sie_block *) regs->gprs[14];
+ struct kvm_s390_sie_block *sie_block = phys_to_virt(regs->gprs[14]);
if (sie_block == NULL)
/* Something's seriously wrong, stop system. */
@@ -340,19 +375,19 @@ void notrace s390_do_machine_check(struct pt_regs *regs)
static unsigned long long last_ipd;
struct mcck_struct *mcck;
unsigned long long tmp;
+ irqentry_state_t irq_state;
union mci mci;
unsigned long mcck_dam_code;
+ int mcck_pending = 0;
+
+ irq_state = irqentry_nmi_enter(regs);
- nmi_enter();
+ if (user_mode(regs))
+ update_timer_mcck();
inc_irq_stat(NMI_NMI);
mci.val = S390_lowcore.mcck_interruption_code;
mcck = this_cpu_ptr(&cpu_mcck);
- if (mci.sd) {
- /* System damage -> stopping machine */
- s390_handle_damage();
- }
-
/*
* Reinject the instruction processing damages' machine checks
* including Delayed Access Exception into the guest
@@ -393,14 +428,16 @@ void notrace s390_do_machine_check(struct pt_regs *regs)
s390_handle_damage();
}
}
- if (s390_check_registers(mci, user_mode(regs))) {
+ if (s390_validate_registers(mci)) {
+ if (!user_mode(regs))
+ s390_handle_damage();
/*
* Couldn't restore all register contents for the
* user space process -> mark task for termination.
*/
mcck->kill_task = 1;
mcck->mcck_code = mci.val;
- set_cpu_flag(CIF_MCCK_PENDING);
+ mcck_pending = 1;
}
/*
@@ -420,34 +457,32 @@ void notrace s390_do_machine_check(struct pt_regs *regs)
mcck->stp_queue |= stp_sync_check();
if (S390_lowcore.external_damage_code & (1U << ED_STP_ISLAND))
mcck->stp_queue |= stp_island_check();
- if (mcck->stp_queue)
- set_cpu_flag(CIF_MCCK_PENDING);
+ mcck_pending = 1;
}
-
/*
* Reinject storage related machine checks into the guest if they
* happen when the guest is running.
*/
if (!test_cpu_flag(CIF_MCCK_GUEST)) {
+ /* Storage error uncorrected */
if (mci.se)
- /* Storage error uncorrected */
s390_handle_damage();
+ /* Storage key-error uncorrected */
if (mci.ke)
- /* Storage key-error uncorrected */
s390_handle_damage();
+ /* Storage degradation */
if (mci.ds && mci.fa)
- /* Storage degradation */
s390_handle_damage();
}
if (mci.cp) {
/* Channel report word pending */
mcck->channel_report = 1;
- set_cpu_flag(CIF_MCCK_PENDING);
+ mcck_pending = 1;
}
if (mci.w) {
/* Warning pending */
mcck->warning = 1;
- set_cpu_flag(CIF_MCCK_PENDING);
+ mcck_pending = 1;
}
/*
@@ -462,15 +497,19 @@ void notrace s390_do_machine_check(struct pt_regs *regs)
*((long *)(regs->gprs[15] + __SF_SIE_REASON)) = -EINTR;
}
clear_cpu_flag(CIF_MCCK_GUEST);
- nmi_exit();
+
+ if (mcck_pending)
+ schedule_mcck_handler();
+
+ irqentry_nmi_exit(regs, irq_state);
}
NOKPROBE_SYMBOL(s390_do_machine_check);
static int __init machine_check_init(void)
{
- ctl_set_bit(14, 25); /* enable external damage MCH */
- ctl_set_bit(14, 27); /* enable system recovery MCH */
- ctl_set_bit(14, 24); /* enable warning MCH */
+ system_ctl_set_bit(14, CR14_EXTERNAL_DAMAGE_SUBMASK_BIT);
+ system_ctl_set_bit(14, CR14_RECOVERY_SUBMASK_BIT);
+ system_ctl_set_bit(14, CR14_WARNING_SUBMASK_BIT);
return 0;
}
early_initcall(machine_check_init);
diff --git a/arch/s390/kernel/nospec-branch.c b/arch/s390/kernel/nospec-branch.c
index 29e511f5bf06..d1b16d83e49a 100644
--- a/arch/s390/kernel/nospec-branch.c
+++ b/arch/s390/kernel/nospec-branch.c
@@ -14,14 +14,14 @@ static int __init nobp_setup_early(char *str)
return rc;
if (enabled && test_facility(82)) {
/*
- * The user explicitely requested nobp=1, enable it and
+ * The user explicitly requested nobp=1, enable it and
* disable the expoline support.
*/
- __set_facility(82, S390_lowcore.alt_stfle_fac_list);
+ __set_facility(82, alt_stfle_fac_list);
if (IS_ENABLED(CONFIG_EXPOLINE))
nospec_disable = 1;
} else {
- __clear_facility(82, S390_lowcore.alt_stfle_fac_list);
+ __clear_facility(82, alt_stfle_fac_list);
}
return 0;
}
@@ -29,7 +29,7 @@ early_param("nobp", nobp_setup_early);
static int __init nospec_setup_early(char *str)
{
- __clear_facility(82, S390_lowcore.alt_stfle_fac_list);
+ __clear_facility(82, alt_stfle_fac_list);
return 0;
}
early_param("nospec", nospec_setup_early);
@@ -38,9 +38,9 @@ static int __init nospec_report(void)
{
if (test_facility(156))
pr_info("Spectre V2 mitigation: etokens\n");
- if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable)
+ if (nospec_uses_trampoline())
pr_info("Spectre V2 mitigation: execute trampolines\n");
- if (__test_facility(82, S390_lowcore.alt_stfle_fac_list))
+ if (__test_facility(82, alt_stfle_fac_list))
pr_info("Spectre V2 mitigation: limited branch prediction\n");
return 0;
}
@@ -66,14 +66,14 @@ void __init nospec_auto_detect(void)
*/
if (__is_defined(CC_USING_EXPOLINE))
nospec_disable = 1;
- __clear_facility(82, S390_lowcore.alt_stfle_fac_list);
+ __clear_facility(82, alt_stfle_fac_list);
} else if (__is_defined(CC_USING_EXPOLINE)) {
/*
* The kernel has been compiled with expolines.
* Keep expolines enabled and disable nobp.
*/
nospec_disable = 0;
- __clear_facility(82, S390_lowcore.alt_stfle_fac_list);
+ __clear_facility(82, alt_stfle_fac_list);
}
/*
* If the kernel has not been compiled with expolines the
@@ -86,7 +86,7 @@ static int __init spectre_v2_setup_early(char *str)
{
if (str && !strncmp(str, "on", 2)) {
nospec_disable = 0;
- __clear_facility(82, S390_lowcore.alt_stfle_fac_list);
+ __clear_facility(82, alt_stfle_fac_list);
}
if (str && !strncmp(str, "off", 3))
nospec_disable = 1;
@@ -99,11 +99,13 @@ early_param("spectre_v2", spectre_v2_setup_early);
static void __init_or_module __nospec_revert(s32 *start, s32 *end)
{
enum { BRCL_EXPOLINE, BRASL_EXPOLINE } type;
+ static const u8 branch[] = { 0x47, 0x00, 0x07, 0x00 };
u8 *instr, *thunk, *br;
u8 insnbuf[6];
s32 *epo;
/* Second part of the instruction replace is always a nop */
+ memcpy(insnbuf + 2, branch, sizeof(branch));
for (epo = start; epo < end; epo++) {
instr = (u8 *) epo + *epo;
if (instr[0] == 0xc0 && (instr[1] & 0x0f) == 0x04)
@@ -116,42 +118,20 @@ static void __init_or_module __nospec_revert(s32 *start, s32 *end)
if (thunk[0] == 0xc6 && thunk[1] == 0x00)
/* exrl %r0,<target-br> */
br = thunk + (*(int *)(thunk + 2)) * 2;
- else if (thunk[0] == 0xc0 && (thunk[1] & 0x0f) == 0x00 &&
- thunk[6] == 0x44 && thunk[7] == 0x00 &&
- (thunk[8] & 0x0f) == 0x00 && thunk[9] == 0x00 &&
- (thunk[1] & 0xf0) == (thunk[8] & 0xf0))
- /* larl %rx,<target br> + ex %r0,0(%rx) */
- br = thunk + (*(int *)(thunk + 2)) * 2;
else
continue;
- /* Check for unconditional branch 0x07f? or 0x47f???? */
- if ((br[0] & 0xbf) != 0x07 || (br[1] & 0xf0) != 0xf0)
+ if (br[0] != 0x07 || (br[1] & 0xf0) != 0xf0)
continue;
-
- memcpy(insnbuf + 2, (char[]) { 0x47, 0x00, 0x07, 0x00 }, 4);
switch (type) {
case BRCL_EXPOLINE:
+ /* brcl to thunk, replace with br + nop */
insnbuf[0] = br[0];
insnbuf[1] = (instr[1] & 0xf0) | (br[1] & 0x0f);
- if (br[0] == 0x47) {
- /* brcl to b, replace with bc + nopr */
- insnbuf[2] = br[2];
- insnbuf[3] = br[3];
- } else {
- /* brcl to br, replace with bcr + nop */
- }
break;
case BRASL_EXPOLINE:
+ /* brasl to thunk, replace with basr + nop */
+ insnbuf[0] = 0x0d;
insnbuf[1] = (instr[1] & 0xf0) | (br[1] & 0x0f);
- if (br[0] == 0x47) {
- /* brasl to b, replace with bas + nopr */
- insnbuf[0] = 0x4d;
- insnbuf[2] = br[2];
- insnbuf[3] = br[3];
- } else {
- /* brasl to br, replace with basr + nop */
- insnbuf[0] = 0x0d;
- }
break;
}
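
The simplified __nospec_revert() above only handles expoline thunks that end in a plain br: the 6-byte brcl/brasl to the thunk is overwritten in place with a 2-byte bcr/basr to the same register, followed by the 4-byte nop 0x47000700. A standalone toy program, not kernel code, showing how those six bytes are assembled for a brasl %r14 whose thunk branches via br %r14 (the byte values follow the comments in the function above):

#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned char instr[6] = { 0xc0, 0xe5, 0, 0, 0, 0 };	/* brasl %r14,<thunk> */
	unsigned char br[2] = { 0x07, 0xfe };			/* br %r14 inside thunk */
	static const unsigned char nop[4] = { 0x47, 0x00, 0x07, 0x00 };
	unsigned char insnbuf[6];
	size_t i;

	memcpy(insnbuf + 2, nop, sizeof(nop));			/* second half is a nop */
	insnbuf[0] = 0x0d;					/* basr opcode */
	insnbuf[1] = (instr[1] & 0xf0) | (br[1] & 0x0f);	/* basr %r14,%r14 */
	for (i = 0; i < sizeof(insnbuf); i++)
		printf("%02x ", insnbuf[i]);
	printf("\n");						/* prints: 0d ee 47 00 07 00 */
	return 0;
}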
diff --git a/arch/s390/kernel/nospec-sysfs.c b/arch/s390/kernel/nospec-sysfs.c
index 48f472bf9290..52d4353188ad 100644
--- a/arch/s390/kernel/nospec-sysfs.c
+++ b/arch/s390/kernel/nospec-sysfs.c
@@ -15,9 +15,9 @@ ssize_t cpu_show_spectre_v2(struct device *dev,
{
if (test_facility(156))
return sprintf(buf, "Mitigation: etokens\n");
- if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable)
+ if (nospec_uses_trampoline())
return sprintf(buf, "Mitigation: execute trampolines\n");
- if (__test_facility(82, S390_lowcore.alt_stfle_fac_list))
+ if (__test_facility(82, alt_stfle_fac_list))
return sprintf(buf, "Mitigation: limited branch prediction\n");
return sprintf(buf, "Vulnerable\n");
}
diff --git a/arch/s390/kernel/numa.c b/arch/s390/kernel/numa.c
new file mode 100644
index 000000000000..23ab9f02f278
--- /dev/null
+++ b/arch/s390/kernel/numa.c
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NUMA support for s390
+ *
+ * Implement NUMA core code.
+ *
+ * Copyright IBM Corp. 2015
+ */
+
+#include <linux/kernel.h>
+#include <linux/mmzone.h>
+#include <linux/cpumask.h>
+#include <linux/memblock.h>
+#include <linux/node.h>
+#include <asm/numa.h>
+
+struct pglist_data *node_data[MAX_NUMNODES];
+EXPORT_SYMBOL(node_data);
+
+void __init numa_setup(void)
+{
+ int nid;
+
+ nodes_clear(node_possible_map);
+ node_set(0, node_possible_map);
+ node_set_online(0);
+ for (nid = 0; nid < MAX_NUMNODES; nid++) {
+ NODE_DATA(nid) = memblock_alloc(sizeof(pg_data_t), 8);
+ if (!NODE_DATA(nid))
+ panic("%s: Failed to allocate %zu bytes align=0x%x\n",
+ __func__, sizeof(pg_data_t), 8);
+ }
+ NODE_DATA(0)->node_spanned_pages = memblock_end_of_DRAM() >> PAGE_SHIFT;
+ NODE_DATA(0)->node_id = 0;
+}
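
numa_setup() implements the flat model: a single online node that spans all of memory, so node_spanned_pages is simply the end of DRAM expressed in pages. A trivial standalone example of that conversion, assuming 4 GiB of memory and the s390 PAGE_SHIFT of 12:

#include <stdio.h>

int main(void)
{
	unsigned long long end_of_dram = 4ULL << 30;	/* memblock_end_of_DRAM() */
	unsigned int page_shift = 12;			/* PAGE_SHIFT */

	printf("node 0 spans %llu pages\n", end_of_dram >> page_shift);
	return 0;
}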
diff --git a/arch/s390/kernel/os_info.c b/arch/s390/kernel/os_info.c
index 0a5e4bafb6ad..6e1824141b29 100644
--- a/arch/s390/kernel/os_info.c
+++ b/arch/s390/kernel/os_info.c
@@ -13,8 +13,10 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <asm/checksum.h>
-#include <asm/lowcore.h>
+#include <asm/abs_lowcore.h>
#include <asm/os_info.h>
+#include <asm/maccess.h>
+#include <asm/asm-offsets.h>
/*
* OS info structure has to be page aligned
@@ -45,24 +47,26 @@ void os_info_crashkernel_add(unsigned long base, unsigned long size)
*/
void os_info_entry_add(int nr, void *ptr, u64 size)
{
- os_info.entry[nr].addr = (u64)(unsigned long)ptr;
+ os_info.entry[nr].addr = __pa(ptr);
os_info.entry[nr].size = size;
os_info.entry[nr].csum = (__force u32)csum_partial(ptr, size, 0);
os_info.csum = os_info_csum(&os_info);
}
/*
- * Initialize OS info struture and set lowcore pointer
+ * Initialize OS info structure and set lowcore pointer
*/
void __init os_info_init(void)
{
- void *ptr = &os_info;
+ struct lowcore *abs_lc;
os_info.version_major = OS_INFO_VERSION_MAJOR;
os_info.version_minor = OS_INFO_VERSION_MINOR;
os_info.magic = OS_INFO_MAGIC;
os_info.csum = os_info_csum(&os_info);
- mem_assign_absolute(S390_lowcore.os_info, (unsigned long) ptr);
+ abs_lc = get_abs_lowcore();
+ abs_lc->os_info = __pa(&os_info);
+ put_abs_lowcore(abs_lc);
}
#ifdef CONFIG_CRASH_DUMP
@@ -90,7 +94,7 @@ static void os_info_old_alloc(int nr, int align)
goto fail;
}
buf_align = PTR_ALIGN(buf, align);
- if (copy_oldmem_kernel(buf_align, (void *) addr, size)) {
+ if (copy_oldmem_kernel(buf_align, addr, size)) {
msg = "copy failed";
goto fail_free;
}
@@ -121,17 +125,16 @@ static void os_info_old_init(void)
if (os_info_init)
return;
- if (!OLDMEM_BASE)
+ if (!oldmem_data.start)
goto fail;
- if (copy_oldmem_kernel(&addr, &S390_lowcore.os_info, sizeof(addr)))
+ if (copy_oldmem_kernel(&addr, __LC_OS_INFO, sizeof(addr)))
goto fail;
if (addr == 0 || addr % PAGE_SIZE)
goto fail;
os_info_old = kzalloc(sizeof(*os_info_old), GFP_KERNEL);
if (!os_info_old)
goto fail;
- if (copy_oldmem_kernel(os_info_old, (void *) addr,
- sizeof(*os_info_old)))
+ if (copy_oldmem_kernel(os_info_old, addr, sizeof(*os_info_old)))
goto fail_free;
if (os_info_old->magic != OS_INFO_MAGIC)
goto fail_free;
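
os_info_entry_add() now records the physical address of a buffer together with its size and a checksum, so the dump kernel can locate and verify the data after a crash. A hedged sketch of the consumer side that mirrors the same checksum convention; the helper name os_info_entry_get() is invented for illustration, while copy_oldmem_kernel() and csum_partial() are used as in the surrounding code:

/*
 * Sketch only: copy one os_info entry from the crashed kernel and accept it
 * only if the checksum over the copied bytes matches the recorded value.
 */
static void *os_info_entry_get(struct os_info *oi, int nr, void *buf)
{
	u32 csum;

	if (!oi->entry[nr].addr || !oi->entry[nr].size)
		return NULL;
	if (copy_oldmem_kernel(buf, oi->entry[nr].addr, oi->entry[nr].size))
		return NULL;
	csum = (__force u32)csum_partial(buf, oi->entry[nr].size, 0);
	return csum == oi->entry[nr].csum ? buf : NULL;
}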
diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c
index 0eb1d1cc53a8..41ed6e0f0a2a 100644
--- a/arch/s390/kernel/perf_cpum_cf.c
+++ b/arch/s390/kernel/perf_cpum_cf.c
@@ -2,8 +2,9 @@
/*
* Performance event support for s390x - CPU-measurement Counter Facility
*
- * Copyright IBM Corp. 2012, 2019
+ * Copyright IBM Corp. 2012, 2023
* Author(s): Hendrik Brueckner <brueckner@linux.ibm.com>
+ * Thomas Richter <tmricht@linux.ibm.com>
*/
#define KMSG_COMPONENT "cpum_cf"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
@@ -14,7 +15,574 @@
#include <linux/notifier.h>
#include <linux/init.h>
#include <linux/export.h>
-#include <asm/cpu_mcf.h>
+#include <linux/miscdevice.h>
+#include <linux/perf_event.h>
+
+#include <asm/cpu_mf.h>
+#include <asm/hwctrset.h>
+#include <asm/debug.h>
+
+enum cpumf_ctr_set {
+ CPUMF_CTR_SET_BASIC = 0, /* Basic Counter Set */
+ CPUMF_CTR_SET_USER = 1, /* Problem-State Counter Set */
+ CPUMF_CTR_SET_CRYPTO = 2, /* Crypto-Activity Counter Set */
+ CPUMF_CTR_SET_EXT = 3, /* Extended Counter Set */
+ CPUMF_CTR_SET_MT_DIAG = 4, /* MT-diagnostic Counter Set */
+
+ /* Maximum number of counter sets */
+ CPUMF_CTR_SET_MAX,
+};
+
+#define CPUMF_LCCTL_ENABLE_SHIFT 16
+#define CPUMF_LCCTL_ACTCTL_SHIFT 0
+
+static inline void ctr_set_enable(u64 *state, u64 ctrsets)
+{
+ *state |= ctrsets << CPUMF_LCCTL_ENABLE_SHIFT;
+}
+
+static inline void ctr_set_disable(u64 *state, u64 ctrsets)
+{
+ *state &= ~(ctrsets << CPUMF_LCCTL_ENABLE_SHIFT);
+}
+
+static inline void ctr_set_start(u64 *state, u64 ctrsets)
+{
+ *state |= ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT;
+}
+
+static inline void ctr_set_stop(u64 *state, u64 ctrsets)
+{
+ *state &= ~(ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT);
+}
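
The ctr_set_*() helpers pack the counter-set bit mask into one 64-bit word for lcctl(): activation control in the low 16 bits and enable control shifted left by CPUMF_LCCTL_ENABLE_SHIFT. A standalone worked example using the basic (0x02) and problem-state (0x04) set bits from the cpumf_ctr_ctl[] table further down:

#include <stdio.h>

#define CPUMF_LCCTL_ENABLE_SHIFT	16
#define CPUMF_LCCTL_ACTCTL_SHIFT	0

int main(void)
{
	unsigned long long state = 0;
	unsigned long long ctrsets = 0x02 | 0x04;	/* basic + problem-state set */

	state |= ctrsets << CPUMF_LCCTL_ENABLE_SHIFT;	/* ctr_set_enable() */
	state |= ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT;	/* ctr_set_start()  */
	printf("lcctl operand: %#llx\n", state);	/* prints 0x60006 */
	return 0;
}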
+
+static inline int ctr_stcctm(enum cpumf_ctr_set set, u64 range, u64 *dest)
+{
+ switch (set) {
+ case CPUMF_CTR_SET_BASIC:
+ return stcctm(BASIC, range, dest);
+ case CPUMF_CTR_SET_USER:
+ return stcctm(PROBLEM_STATE, range, dest);
+ case CPUMF_CTR_SET_CRYPTO:
+ return stcctm(CRYPTO_ACTIVITY, range, dest);
+ case CPUMF_CTR_SET_EXT:
+ return stcctm(EXTENDED, range, dest);
+ case CPUMF_CTR_SET_MT_DIAG:
+ return stcctm(MT_DIAG_CLEARING, range, dest);
+ case CPUMF_CTR_SET_MAX:
+ return 3;
+ }
+ return 3;
+}
+
+struct cpu_cf_events {
+ refcount_t refcnt; /* Reference count */
+ atomic_t ctr_set[CPUMF_CTR_SET_MAX];
+ u64 state; /* For perf_event_open SVC */
+ u64 dev_state; /* For /dev/hwctr */
+ unsigned int flags;
+ size_t used; /* Bytes used in data */
+ size_t usedss; /* Bytes used in start/stop */
+ unsigned char start[PAGE_SIZE]; /* Counter set at event add */
+ unsigned char stop[PAGE_SIZE]; /* Counter set at event delete */
+ unsigned char data[PAGE_SIZE]; /* Counter set at /dev/hwctr */
+ unsigned int sets; /* # Counter set saved in memory */
+};
+
+static unsigned int cfdiag_cpu_speed; /* CPU speed for CF_DIAG trailer */
+static debug_info_t *cf_dbg;
+
+/*
+ * The CPU Measurement query counter information instruction contains
+ * information which varies per machine generation, but is constant and
+ * does not change when running on a particular machine, such as counter
+ * first and second version number. This is needed to determine the size
+ * of counter sets. Extract this information at device driver initialization.
+ */
+static struct cpumf_ctr_info cpumf_ctr_info;
+
+struct cpu_cf_ptr {
+ struct cpu_cf_events *cpucf;
+};
+
+static struct cpu_cf_root { /* Anchor to per CPU data */
+ refcount_t refcnt; /* Overall active events */
+ struct cpu_cf_ptr __percpu *cfptr;
+} cpu_cf_root;
+
+/*
+ * Serialize event initialization and event removal. Both are called from
+ * user space in task context with perf_event_open() and close()
+ * system calls.
+ *
+ * This mutex serializes functions cpum_cf_alloc_cpu() called at event
+ * initialization via cpumf_pmu_event_init() and function cpum_cf_free_cpu()
+ * called at event removal via call back function hw_perf_event_destroy()
+ * when the event is deleted. They are serialized to enforce correct
+ * bookkeeping of pointer and reference counts anchored by
+ * struct cpu_cf_root and the access to cpu_cf_root::refcnt and the
+ * per CPU pointers stored in cpu_cf_root::cfptr.
+ */
+static DEFINE_MUTEX(pmc_reserve_mutex);
+
+/*
+ * Get pointer to per-cpu structure.
+ *
+ * Function get_cpu_cfhw() is called from
+ * - cfset_copy_all(): This function is protected by cpus_read_lock(), so
+ * CPU hot plug remove can not happen. Event removal requires a close()
+ * first.
+ *
+ * Function this_cpu_cfhw() is called from perf common code functions:
+ * - pmu_{en|dis}able(), pmu_{add|del}()and pmu_{start|stop}():
+ * All functions execute with interrupts disabled on that particular CPU.
+ * - cfset_ioctl_{on|off}, cfset_cpu_read(): see comment cfset_copy_all().
+ *
+ * Therefore it is safe to access the CPU specific pointer to the event.
+ */
+static struct cpu_cf_events *get_cpu_cfhw(int cpu)
+{
+ struct cpu_cf_ptr __percpu *p = cpu_cf_root.cfptr;
+
+ if (p) {
+ struct cpu_cf_ptr *q = per_cpu_ptr(p, cpu);
+
+ return q->cpucf;
+ }
+ return NULL;
+}
+
+static struct cpu_cf_events *this_cpu_cfhw(void)
+{
+ return get_cpu_cfhw(smp_processor_id());
+}
+
+/* Disable counter sets on dedicated CPU */
+static void cpum_cf_reset_cpu(void *flags)
+{
+ lcctl(0);
+}
+
+/* Free per CPU data when the last event is removed. */
+static void cpum_cf_free_root(void)
+{
+ if (!refcount_dec_and_test(&cpu_cf_root.refcnt))
+ return;
+ free_percpu(cpu_cf_root.cfptr);
+ cpu_cf_root.cfptr = NULL;
+ irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT);
+ on_each_cpu(cpum_cf_reset_cpu, NULL, 1);
+ debug_sprintf_event(cf_dbg, 4, "%s root.refcnt %u cfptr %d\n",
+ __func__, refcount_read(&cpu_cf_root.refcnt),
+ !cpu_cf_root.cfptr);
+}
+
+/*
+ * On initialization of first event also allocate per CPU data dynamically.
+ * Start with an array of pointers, the array size is the maximum number of
+ * CPUs possible, which might be larger than the number of CPUs currently
+ * online.
+ */
+static int cpum_cf_alloc_root(void)
+{
+ int rc = 0;
+
+ if (refcount_inc_not_zero(&cpu_cf_root.refcnt))
+ return rc;
+
+ /* The memory is already zeroed. */
+ cpu_cf_root.cfptr = alloc_percpu(struct cpu_cf_ptr);
+ if (cpu_cf_root.cfptr) {
+ refcount_set(&cpu_cf_root.refcnt, 1);
+ on_each_cpu(cpum_cf_reset_cpu, NULL, 1);
+ irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT);
+ } else {
+ rc = -ENOMEM;
+ }
+
+ return rc;
+}
+
+/* Free CPU counter data structure for a PMU */
+static void cpum_cf_free_cpu(int cpu)
+{
+ struct cpu_cf_events *cpuhw;
+ struct cpu_cf_ptr *p;
+
+ mutex_lock(&pmc_reserve_mutex);
+ /*
+ * When invoked via CPU hotplug handler, there might be no events
+ * installed or that particular CPU might not have an
+ * event installed. This anchor pointer can be NULL!
+ */
+ if (!cpu_cf_root.cfptr)
+ goto out;
+ p = per_cpu_ptr(cpu_cf_root.cfptr, cpu);
+ cpuhw = p->cpucf;
+ /*
+ * Might be zero when called from CPU hotplug handler and no event
+ * installed on that CPU, but on different CPUs.
+ */
+ if (!cpuhw)
+ goto out;
+
+ if (refcount_dec_and_test(&cpuhw->refcnt)) {
+ kfree(cpuhw);
+ p->cpucf = NULL;
+ }
+ cpum_cf_free_root();
+out:
+ mutex_unlock(&pmc_reserve_mutex);
+}
+
+/* Allocate CPU counter data structure for a PMU. Called under mutex lock. */
+static int cpum_cf_alloc_cpu(int cpu)
+{
+ struct cpu_cf_events *cpuhw;
+ struct cpu_cf_ptr *p;
+ int rc;
+
+ mutex_lock(&pmc_reserve_mutex);
+ rc = cpum_cf_alloc_root();
+ if (rc)
+ goto unlock;
+ p = per_cpu_ptr(cpu_cf_root.cfptr, cpu);
+ cpuhw = p->cpucf;
+
+ if (!cpuhw) {
+ cpuhw = kzalloc(sizeof(*cpuhw), GFP_KERNEL);
+ if (cpuhw) {
+ p->cpucf = cpuhw;
+ refcount_set(&cpuhw->refcnt, 1);
+ } else {
+ rc = -ENOMEM;
+ }
+ } else {
+ refcount_inc(&cpuhw->refcnt);
+ }
+ if (rc) {
+ /*
+ * Error in allocation of event, decrement anchor. Since
+		 * cpu_cf_event is not created, its destroy() function is not
+ * invoked. Adjust the reference counter for the anchor.
+ */
+ cpum_cf_free_root();
+ }
+unlock:
+ mutex_unlock(&pmc_reserve_mutex);
+ return rc;
+}
+
+/*
+ * Create/delete per CPU data structures for /dev/hwctr interface and events
+ * created by perf_event_open().
+ * If cpu is -1, track task on all available CPUs. This requires
+ * allocation of hardware data structures for all CPUs. This setup handles
+ * perf_event_open() with task context and /dev/hwctr interface.
+ * If cpu is non-zero install event on this CPU only. This setup handles
+ * perf_event_open() with CPU context.
+ */
+static int cpum_cf_alloc(int cpu)
+{
+ cpumask_var_t mask;
+ int rc;
+
+ if (cpu == -1) {
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+ for_each_online_cpu(cpu) {
+ rc = cpum_cf_alloc_cpu(cpu);
+ if (rc) {
+ for_each_cpu(cpu, mask)
+ cpum_cf_free_cpu(cpu);
+ break;
+ }
+ cpumask_set_cpu(cpu, mask);
+ }
+ free_cpumask_var(mask);
+ } else {
+ rc = cpum_cf_alloc_cpu(cpu);
+ }
+ return rc;
+}
+
+static void cpum_cf_free(int cpu)
+{
+ if (cpu == -1) {
+ for_each_online_cpu(cpu)
+ cpum_cf_free_cpu(cpu);
+ } else {
+ cpum_cf_free_cpu(cpu);
+ }
+}
+
+#define CF_DIAG_CTRSET_DEF 0xfeef /* Counter set header mark */
+ /* interval in seconds */
+
+/* Counter sets are stored as data stream in a page sized memory buffer and
+ * exported to user space via raw data attached to the event sample data.
+ * Each counter set starts with an eight byte header consisting of:
+ * - a two byte eye catcher (0xfeef)
+ * - a one byte counter set number
+ * - a two byte counter set size (indicates the number of counters in this set)
+ * - a three byte reserved value (must be zero) to make the header the same
+ * size as a counter value.
+ * All counter values are eight byte in size.
+ *
+ * All counter sets are followed by a 64 byte trailer.
+ * The trailer consists of a:
+ * - flag field indicating valid fields when corresponding bit set
+ * - the counter facility first and second version number
+ * - the CPU speed if nonzero
+ * - the time stamp the counter sets have been collected
+ * - the time of day (TOD) base value
+ * - the machine type.
+ *
+ * The counter sets are saved when the process is prepared to be executed on a
+ * CPU and saved again when the process is going to be removed from a CPU.
+ * The difference of both counter sets is calculated and stored in the event
+ * sample data area.
+ */
+struct cf_ctrset_entry { /* CPU-M CF counter set entry (8 byte) */
+ unsigned int def:16; /* 0-15 Data Entry Format */
+ unsigned int set:16; /* 16-31 Counter set identifier */
+ unsigned int ctr:16; /* 32-47 Number of stored counters */
+ unsigned int res1:16; /* 48-63 Reserved */
+};
+
+struct cf_trailer_entry { /* CPU-M CF_DIAG trailer (64 byte) */
+ /* 0 - 7 */
+ union {
+ struct {
+ unsigned int clock_base:1; /* TOD clock base set */
+ unsigned int speed:1; /* CPU speed set */
+ /* Measurement alerts */
+ unsigned int mtda:1; /* Loss of MT ctr. data alert */
+ unsigned int caca:1; /* Counter auth. change alert */
+ unsigned int lcda:1; /* Loss of counter data alert */
+ };
+ unsigned long flags; /* 0-63 All indicators */
+ };
+ /* 8 - 15 */
+ unsigned int cfvn:16; /* 64-79 Ctr First Version */
+ unsigned int csvn:16; /* 80-95 Ctr Second Version */
+ unsigned int cpu_speed:32; /* 96-127 CPU speed */
+ /* 16 - 23 */
+ unsigned long timestamp; /* 128-191 Timestamp (TOD) */
+ /* 24 - 55 */
+ union {
+ struct {
+ unsigned long progusage1;
+ unsigned long progusage2;
+ unsigned long progusage3;
+ unsigned long tod_base;
+ };
+ unsigned long progusage[4];
+ };
+ /* 56 - 63 */
+ unsigned int mach_type:16; /* Machine type */
+ unsigned int res1:16; /* Reserved */
+ unsigned int res2:32; /* Reserved */
+};
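
Given the stream layout described above, a consumer can walk the buffer by following the cf_ctrset_entry headers until the trailer is reached. A minimal sketch, not a kernel helper, that reuses the struct definitions above and assumes the buffer was produced by cfdiag_getctr():

/*
 * Sketch: walk a counter-set data stream. Iteration stops at the first
 * entry whose eye catcher is not CF_DIAG_CTRSET_DEF, i.e. the trailer.
 */
static void walk_ctrset_stream(void *data)
{
	struct cf_ctrset_entry *cs = data;

	while (cs->def == CF_DIAG_CTRSET_DEF) {
		u64 *ctrs = (u64 *)(cs + 1);	/* counters follow the header */
		unsigned int i;

		for (i = 0; i < cs->ctr; i++)
			pr_debug("set %u ctr %u: %llu\n", cs->set, i, ctrs[i]);
		cs = (struct cf_ctrset_entry *)(ctrs + cs->ctr);
	}
	/* 'cs' now points at the struct cf_trailer_entry */
}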
+
+/* Create the trailer data at the end of a page. */
+static void cfdiag_trailer(struct cf_trailer_entry *te)
+{
+ struct cpuid cpuid;
+
+ te->cfvn = cpumf_ctr_info.cfvn; /* Counter version numbers */
+ te->csvn = cpumf_ctr_info.csvn;
+
+ get_cpu_id(&cpuid); /* Machine type */
+ te->mach_type = cpuid.machine;
+ te->cpu_speed = cfdiag_cpu_speed;
+ if (te->cpu_speed)
+ te->speed = 1;
+ te->clock_base = 1; /* Save clock base */
+ te->tod_base = tod_clock_base.tod;
+ te->timestamp = get_tod_clock_fast();
+}
+
+/*
+ * The number of counters per counter set varies between machine generations,
+ * but is constant when running on a particular machine generation.
+ * Determine each counter set size at device driver initialization and
+ * retrieve it later.
+ */
+static size_t cpumf_ctr_setsizes[CPUMF_CTR_SET_MAX];
+static void cpum_cf_make_setsize(enum cpumf_ctr_set ctrset)
+{
+ size_t ctrset_size = 0;
+
+ switch (ctrset) {
+ case CPUMF_CTR_SET_BASIC:
+ if (cpumf_ctr_info.cfvn >= 1)
+ ctrset_size = 6;
+ break;
+ case CPUMF_CTR_SET_USER:
+ if (cpumf_ctr_info.cfvn == 1)
+ ctrset_size = 6;
+ else if (cpumf_ctr_info.cfvn >= 3)
+ ctrset_size = 2;
+ break;
+ case CPUMF_CTR_SET_CRYPTO:
+ if (cpumf_ctr_info.csvn >= 1 && cpumf_ctr_info.csvn <= 5)
+ ctrset_size = 16;
+ else if (cpumf_ctr_info.csvn == 6 || cpumf_ctr_info.csvn == 7)
+ ctrset_size = 20;
+ break;
+ case CPUMF_CTR_SET_EXT:
+ if (cpumf_ctr_info.csvn == 1)
+ ctrset_size = 32;
+ else if (cpumf_ctr_info.csvn == 2)
+ ctrset_size = 48;
+ else if (cpumf_ctr_info.csvn >= 3 && cpumf_ctr_info.csvn <= 5)
+ ctrset_size = 128;
+ else if (cpumf_ctr_info.csvn == 6 || cpumf_ctr_info.csvn == 7)
+ ctrset_size = 160;
+ break;
+ case CPUMF_CTR_SET_MT_DIAG:
+ if (cpumf_ctr_info.csvn > 3)
+ ctrset_size = 48;
+ break;
+ case CPUMF_CTR_SET_MAX:
+ break;
+ }
+ cpumf_ctr_setsizes[ctrset] = ctrset_size;
+}
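
With the sizes cached by cpum_cf_make_setsize(), reading a set reduces to reserving that many 64-bit slots and issuing ctr_stcctm(). A minimal sketch under the assumption that the caller runs with the basic set authorized and active; read_basic_set() is an invented helper, not part of the patch:

/*
 * Sketch: read the basic counter set into a caller supplied buffer using
 * the size determined at initialization. Returns the number of counters
 * read, or 0 when the set is empty, too large, or stcctm reports cc 3.
 */
static size_t read_basic_set(u64 *buf, size_t slots)
{
	size_t n = cpum_cf_read_setsize(CPUMF_CTR_SET_BASIC);

	if (!n || n > slots)
		return 0;
	if (ctr_stcctm(CPUMF_CTR_SET_BASIC, n, buf) == 3)
		return 0;
	return n;
}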
+
+/*
+ * Return the maximum possible counter set size (in number of 8 byte counters)
+ * depending on type and model number.
+ */
+static size_t cpum_cf_read_setsize(enum cpumf_ctr_set ctrset)
+{
+ return cpumf_ctr_setsizes[ctrset];
+}
+
+/* Read a counter set. The counter set number determines the counter set and
+ * the CPUM-CF first and second version number determine the number of
+ * available counters in each counter set.
+ * Each counter set starts with header containing the counter set number and
+ * the number of eight byte counters.
+ *
+ * The function returns the number of bytes occupied by this counter set
+ * including the header.
+ * If there is no counter in the counter set, this counter set is useless and
+ * zero is returned in this case.
+ *
+ * Note that the counter sets may not be enabled or active and the stcctm
+ * instruction might return error 3. Depending on error_ok value this is ok,
+ * for example when called from cpumf_pmu_start() call back function.
+ */
+static size_t cfdiag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset,
+ size_t room, bool error_ok)
+{
+ size_t ctrset_size, need = 0;
+ int rc = 3; /* Assume write failure */
+
+ ctrdata->def = CF_DIAG_CTRSET_DEF;
+ ctrdata->set = ctrset;
+ ctrdata->res1 = 0;
+ ctrset_size = cpum_cf_read_setsize(ctrset);
+
+ if (ctrset_size) { /* Save data */
+ need = ctrset_size * sizeof(u64) + sizeof(*ctrdata);
+ if (need <= room) {
+ rc = ctr_stcctm(ctrset, ctrset_size,
+ (u64 *)(ctrdata + 1));
+ }
+ if (rc != 3 || error_ok)
+ ctrdata->ctr = ctrset_size;
+ else
+ need = 0;
+ }
+
+ return need;
+}
+
+static const u64 cpumf_ctr_ctl[CPUMF_CTR_SET_MAX] = {
+ [CPUMF_CTR_SET_BASIC] = 0x02,
+ [CPUMF_CTR_SET_USER] = 0x04,
+ [CPUMF_CTR_SET_CRYPTO] = 0x08,
+ [CPUMF_CTR_SET_EXT] = 0x01,
+ [CPUMF_CTR_SET_MT_DIAG] = 0x20,
+};
+
+/* Read out all counter sets and save them in the provided data buffer.
+ * The last 64 bytes host an artificial trailer entry.
+ */
+static size_t cfdiag_getctr(void *data, size_t sz, unsigned long auth,
+ bool error_ok)
+{
+ struct cf_trailer_entry *trailer;
+ size_t offset = 0, done;
+ int i;
+
+ memset(data, 0, sz);
+ sz -= sizeof(*trailer); /* Always room for trailer */
+ for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
+ struct cf_ctrset_entry *ctrdata = data + offset;
+
+ if (!(auth & cpumf_ctr_ctl[i]))
+ continue; /* Counter set not authorized */
+
+ done = cfdiag_getctrset(ctrdata, i, sz - offset, error_ok);
+ offset += done;
+ }
+ trailer = data + offset;
+ cfdiag_trailer(trailer);
+ return offset + sizeof(*trailer);
+}
+
+/* Calculate the difference for each counter in a counter set. */
+static void cfdiag_diffctrset(u64 *pstart, u64 *pstop, int counters)
+{
+ for (; --counters >= 0; ++pstart, ++pstop)
+ if (*pstop >= *pstart)
+ *pstop -= *pstart;
+ else
+ *pstop = *pstart - *pstop + 1;
+}
+
+/* Scan the counter sets and calculate the difference of each counter
+ * in each set. The result is the increment of each counter during the
+ * period the counter set has been activated.
+ *
+ * Return true on success.
+ */
+static int cfdiag_diffctr(struct cpu_cf_events *cpuhw, unsigned long auth)
+{
+ struct cf_trailer_entry *trailer_start, *trailer_stop;
+ struct cf_ctrset_entry *ctrstart, *ctrstop;
+ size_t offset = 0;
+
+ auth &= (1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1;
+ do {
+ ctrstart = (struct cf_ctrset_entry *)(cpuhw->start + offset);
+ ctrstop = (struct cf_ctrset_entry *)(cpuhw->stop + offset);
+
+ if (memcmp(ctrstop, ctrstart, sizeof(*ctrstop))) {
+ pr_err_once("cpum_cf_diag counter set compare error "
+ "in set %i\n", ctrstart->set);
+ return 0;
+ }
+ auth &= ~cpumf_ctr_ctl[ctrstart->set];
+ if (ctrstart->def == CF_DIAG_CTRSET_DEF) {
+ cfdiag_diffctrset((u64 *)(ctrstart + 1),
+ (u64 *)(ctrstop + 1), ctrstart->ctr);
+ offset += ctrstart->ctr * sizeof(u64) +
+ sizeof(*ctrstart);
+ }
+ } while (ctrstart->def && auth);
+
+ /* Save time_stamp from start of event in stop's trailer */
+ trailer_start = (struct cf_trailer_entry *)(cpuhw->start + offset);
+ trailer_stop = (struct cf_trailer_entry *)(cpuhw->stop + offset);
+ trailer_stop->progusage[0] = trailer_start->timestamp;
+
+ return 1;
+}
static enum cpumf_ctr_set get_counter_set(u64 event)
{
@@ -34,39 +602,35 @@ static enum cpumf_ctr_set get_counter_set(u64 event)
return set;
}
-static int validate_ctr_version(const struct hw_perf_event *hwc)
+static int validate_ctr_version(const u64 config, enum cpumf_ctr_set set)
{
- struct cpu_cf_events *cpuhw;
- int err = 0;
u16 mtdiag_ctl;
-
- cpuhw = &get_cpu_var(cpu_cf_events);
+ int err = 0;
/* check required version for counter sets */
- switch (hwc->config_base) {
+ switch (set) {
case CPUMF_CTR_SET_BASIC:
case CPUMF_CTR_SET_USER:
- if (cpuhw->info.cfvn < 1)
+ if (cpumf_ctr_info.cfvn < 1)
err = -EOPNOTSUPP;
break;
case CPUMF_CTR_SET_CRYPTO:
- if ((cpuhw->info.csvn >= 1 && cpuhw->info.csvn <= 5 &&
- hwc->config > 79) ||
- (cpuhw->info.csvn >= 6 && hwc->config > 83))
+ if ((cpumf_ctr_info.csvn >= 1 && cpumf_ctr_info.csvn <= 5 &&
+ config > 79) || (cpumf_ctr_info.csvn >= 6 && config > 83))
err = -EOPNOTSUPP;
break;
case CPUMF_CTR_SET_EXT:
- if (cpuhw->info.csvn < 1)
+ if (cpumf_ctr_info.csvn < 1)
err = -EOPNOTSUPP;
- if ((cpuhw->info.csvn == 1 && hwc->config > 159) ||
- (cpuhw->info.csvn == 2 && hwc->config > 175) ||
- (cpuhw->info.csvn >= 3 && cpuhw->info.csvn <= 5
- && hwc->config > 255) ||
- (cpuhw->info.csvn >= 6 && hwc->config > 287))
+ if ((cpumf_ctr_info.csvn == 1 && config > 159) ||
+ (cpumf_ctr_info.csvn == 2 && config > 175) ||
+ (cpumf_ctr_info.csvn >= 3 && cpumf_ctr_info.csvn <= 5 &&
+ config > 255) ||
+ (cpumf_ctr_info.csvn >= 6 && config > 287))
err = -EOPNOTSUPP;
break;
case CPUMF_CTR_SET_MT_DIAG:
- if (cpuhw->info.csvn <= 3)
+ if (cpumf_ctr_info.csvn <= 3)
err = -EOPNOTSUPP;
/*
* MT-diagnostic counters are read-only. The counter set
@@ -81,35 +645,15 @@ static int validate_ctr_version(const struct hw_perf_event *hwc)
* counter set is enabled and active.
*/
mtdiag_ctl = cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG];
- if (!((cpuhw->info.auth_ctl & mtdiag_ctl) &&
- (cpuhw->info.enable_ctl & mtdiag_ctl) &&
- (cpuhw->info.act_ctl & mtdiag_ctl)))
+ if (!((cpumf_ctr_info.auth_ctl & mtdiag_ctl) &&
+ (cpumf_ctr_info.enable_ctl & mtdiag_ctl) &&
+ (cpumf_ctr_info.act_ctl & mtdiag_ctl)))
err = -EOPNOTSUPP;
break;
+ case CPUMF_CTR_SET_MAX:
+ err = -EOPNOTSUPP;
}
- put_cpu_var(cpu_cf_events);
- return err;
-}
-
-static int validate_ctr_auth(const struct hw_perf_event *hwc)
-{
- struct cpu_cf_events *cpuhw;
- u64 ctrs_state;
- int err = 0;
-
- cpuhw = &get_cpu_var(cpu_cf_events);
-
- /* Check authorization for cpu counter sets.
- * If the particular CPU counter set is not authorized,
- * return with -ENOENT in order to fall back to other
- * PMUs that might suffice the event request.
- */
- ctrs_state = cpumf_ctr_ctl[hwc->config_base];
- if (!(ctrs_state & cpuhw->info.auth_ctl))
- err = -ENOENT;
-
- put_cpu_var(cpu_cf_events);
return err;
}
@@ -120,20 +664,17 @@ static int validate_ctr_auth(const struct hw_perf_event *hwc)
*/
static void cpumf_pmu_enable(struct pmu *pmu)
{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+ struct cpu_cf_events *cpuhw = this_cpu_cfhw();
int err;
- if (cpuhw->flags & PMU_F_ENABLED)
- return;
-
- err = lcctl(cpuhw->state);
- if (err) {
- pr_err("Enabling the performance measuring unit "
- "failed with rc=%x\n", err);
+ if (!cpuhw || (cpuhw->flags & PMU_F_ENABLED))
return;
- }
- cpuhw->flags |= PMU_F_ENABLED;
+ err = lcctl(cpuhw->state | cpuhw->dev_state);
+ if (err)
+ pr_err("Enabling the performance measuring unit failed with rc=%x\n", err);
+ else
+ cpuhw->flags |= PMU_F_ENABLED;
}
/*
@@ -143,39 +684,26 @@ static void cpumf_pmu_enable(struct pmu *pmu)
*/
static void cpumf_pmu_disable(struct pmu *pmu)
{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
- int err;
+ struct cpu_cf_events *cpuhw = this_cpu_cfhw();
u64 inactive;
+ int err;
- if (!(cpuhw->flags & PMU_F_ENABLED))
+ if (!cpuhw || !(cpuhw->flags & PMU_F_ENABLED))
return;
inactive = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1);
+ inactive |= cpuhw->dev_state;
err = lcctl(inactive);
- if (err) {
- pr_err("Disabling the performance measuring unit "
- "failed with rc=%x\n", err);
- return;
- }
-
- cpuhw->flags &= ~PMU_F_ENABLED;
+ if (err)
+ pr_err("Disabling the performance measuring unit failed with rc=%x\n", err);
+ else
+ cpuhw->flags &= ~PMU_F_ENABLED;
}
-
-/* Number of perf events counting hardware events */
-static atomic_t num_events = ATOMIC_INIT(0);
-/* Used to avoid races in calling reserve/release_cpumf_hardware */
-static DEFINE_MUTEX(pmc_reserve_mutex);
-
/* Release the PMU if event is the last perf event */
static void hw_perf_event_destroy(struct perf_event *event)
{
- if (!atomic_add_unless(&num_events, -1, 1)) {
- mutex_lock(&pmc_reserve_mutex);
- if (atomic_dec_return(&num_events) == 0)
- __kernel_cpumcf_end();
- mutex_unlock(&pmc_reserve_mutex);
- }
+ cpum_cf_free(event->cpu);
}
/* CPUMF <-> perf event mappings for kernel+userspace (basic set) */
@@ -199,12 +727,17 @@ static const int cpumf_generic_events_user[] = {
[PERF_COUNT_HW_BUS_CYCLES] = -1,
};
+static int is_userspace_event(u64 ev)
+{
+ return cpumf_generic_events_user[PERF_COUNT_HW_CPU_CYCLES] == ev ||
+ cpumf_generic_events_user[PERF_COUNT_HW_INSTRUCTIONS] == ev;
+}
+
static int __hw_perf_event_init(struct perf_event *event, unsigned int type)
{
struct perf_event_attr *attr = &event->attr;
struct hw_perf_event *hwc = &event->hw;
enum cpumf_ctr_set set;
- int err = 0;
u64 ev;
switch (type) {
@@ -221,21 +754,26 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type)
if (is_sampling_event(event)) /* No sampling support */
return -ENOENT;
ev = attr->config;
- /* Count user space (problem-state) only */
if (!attr->exclude_user && attr->exclude_kernel) {
- if (ev >= ARRAY_SIZE(cpumf_generic_events_user))
- return -EOPNOTSUPP;
- ev = cpumf_generic_events_user[ev];
-
- /* No support for kernel space counters only */
+ /*
+ * Count user space (problem-state) only
+ * Handle events 32 and 33 as 0:u and 1:u
+ */
+ if (!is_userspace_event(ev)) {
+ if (ev >= ARRAY_SIZE(cpumf_generic_events_user))
+ return -EOPNOTSUPP;
+ ev = cpumf_generic_events_user[ev];
+ }
} else if (!attr->exclude_kernel && attr->exclude_user) {
+ /* No support for kernel space counters only */
return -EOPNOTSUPP;
-
- /* Count user and kernel space */
} else {
- if (ev >= ARRAY_SIZE(cpumf_generic_events_basic))
- return -EOPNOTSUPP;
- ev = cpumf_generic_events_basic[ev];
+ /* Count user and kernel space, incl. events 32 + 33 */
+ if (!is_userspace_event(ev)) {
+ if (ev >= ARRAY_SIZE(cpumf_generic_events_basic))
+ return -EOPNOTSUPP;
+ ev = cpumf_generic_events_basic[ev];
+ }
}
break;
@@ -260,36 +798,51 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type)
/*
* Use the hardware perf event structure to store the
* counter number in the 'config' member and the counter
- * set number in the 'config_base'. The counter set number
- * is then later used to enable/disable the counter(s).
+ * set number in the 'config_base' as bit mask.
+ * It is later used to enable/disable the counter(s).
*/
hwc->config = ev;
- hwc->config_base = set;
+ hwc->config_base = cpumf_ctr_ctl[set];
break;
case CPUMF_CTR_SET_MAX:
/* The counter could not be associated to a counter set */
return -EINVAL;
- };
+ }
/* Initialize for using the CPU-measurement counter facility */
- if (!atomic_inc_not_zero(&num_events)) {
- mutex_lock(&pmc_reserve_mutex);
- if (atomic_read(&num_events) == 0 && __kernel_cpumcf_begin())
- err = -EBUSY;
- else
- atomic_inc(&num_events);
- mutex_unlock(&pmc_reserve_mutex);
- }
- if (err)
- return err;
+ if (cpum_cf_alloc(event->cpu))
+ return -ENOMEM;
event->destroy = hw_perf_event_destroy;
- /* Finally, validate version and authorization of the counter set */
- err = validate_ctr_auth(hwc);
- if (!err)
- err = validate_ctr_version(hwc);
+ /*
+ * Finally, validate version and authorization of the counter set.
+ * If the particular CPU counter set is not authorized,
+ * return with -ENOENT in order to fall back to other
+ * PMUs that might suffice the event request.
+ */
+ if (!(hwc->config_base & cpumf_ctr_info.auth_ctl))
+ return -ENOENT;
+ return validate_ctr_version(hwc->config, set);
+}
- return err;
+/* Events CPU_CYCLES and INSTRUCTIONS can be submitted with two different
+ * attribute::type values:
+ * - PERF_TYPE_HARDWARE:
+ * - pmu->type:
+ * Handle both types of invocation identically. They address the same hardware.
+ * The result is different when event modifiers exclude_kernel and/or
+ * exclude_user are also set.
+ */
+static int cpumf_pmu_event_type(struct perf_event *event)
+{
+ u64 ev = event->attr.config;
+
+ if (cpumf_generic_events_basic[PERF_COUNT_HW_CPU_CYCLES] == ev ||
+ cpumf_generic_events_basic[PERF_COUNT_HW_INSTRUCTIONS] == ev ||
+ cpumf_generic_events_user[PERF_COUNT_HW_CPU_CYCLES] == ev ||
+ cpumf_generic_events_user[PERF_COUNT_HW_INSTRUCTIONS] == ev)
+ return PERF_TYPE_HARDWARE;
+ return PERF_TYPE_RAW;
}
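
In practice this means a user-space tool can open raw counters 32/33 on the cpum_cf PMU and still get the exclude_kernel semantics of 0:u/1:u. A hedged user-space sketch, not kernel code: the dynamic PMU type id has to be read from /sys/bus/event_source/devices/cpum_cf/type, and counter 32 as the problem-state cycle counter follows the comment above.

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

/* Open raw counter 32 (problem-state CPU cycles) as a user-only event. */
static int open_user_cycles(int pmu_type, int cpu)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = pmu_type;		/* dynamic type of the cpum_cf PMU */
	attr.config = 32;		/* treated like PERF_COUNT_HW_CPU_CYCLES:u */
	attr.exclude_kernel = 1;	/* count user space only */
	return syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
}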
static int cpumf_pmu_event_init(struct perf_event *event)
@@ -301,7 +854,7 @@ static int cpumf_pmu_event_init(struct perf_event *event)
err = __hw_perf_event_init(event, type);
else if (event->pmu->type == type)
/* Registered as unknown PMU */
- err = __hw_perf_event_init(event, PERF_TYPE_RAW);
+ err = __hw_perf_event_init(event, cpumf_pmu_event_type(event));
else
return -ENOENT;
@@ -361,18 +914,13 @@ static void cpumf_pmu_read(struct perf_event *event)
static void cpumf_pmu_start(struct perf_event *event, int flags)
{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+ struct cpu_cf_events *cpuhw = this_cpu_cfhw();
struct hw_perf_event *hwc = &event->hw;
+ int i;
- if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
- return;
-
- if (WARN_ON_ONCE(hwc->config == -1))
+ if (!(hwc->state & PERF_HES_STOPPED))
return;
- if (flags & PERF_EF_RELOAD)
- WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
-
hwc->state = 0;
/* (Re-)enable and activate the counter set */
@@ -384,45 +932,95 @@ static void cpumf_pmu_start(struct perf_event *event, int flags)
* needs to be synchronized. At this point, the counter set can be in
* the inactive or disabled state.
*/
- hw_perf_event_reset(event);
+ if (hwc->config == PERF_EVENT_CPUM_CF_DIAG) {
+ cpuhw->usedss = cfdiag_getctr(cpuhw->start,
+ sizeof(cpuhw->start),
+ hwc->config_base, true);
+ } else {
+ hw_perf_event_reset(event);
+ }
- /* increment refcount for this counter set */
- atomic_inc(&cpuhw->ctr_set[hwc->config_base]);
+ /* Increment refcount for counter sets */
+ for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i)
+ if ((hwc->config_base & cpumf_ctr_ctl[i]))
+ atomic_inc(&cpuhw->ctr_set[i]);
+}
+
+/* Create perf event sample with the counter sets as raw data. The sample
+ * is then pushed to the event subsystem and the function checks for
+ * possible event overflows. If an event overflow occurs, the PMU is
+ * stopped.
+ *
+ * Return non-zero if an event overflow occurred.
+ */
+static int cfdiag_push_sample(struct perf_event *event,
+ struct cpu_cf_events *cpuhw)
+{
+ struct perf_sample_data data;
+ struct perf_raw_record raw;
+ struct pt_regs regs;
+ int overflow;
+
+ /* Setup perf sample */
+ perf_sample_data_init(&data, 0, event->hw.last_period);
+ memset(&regs, 0, sizeof(regs));
+ memset(&raw, 0, sizeof(raw));
+
+ if (event->attr.sample_type & PERF_SAMPLE_CPU)
+ data.cpu_entry.cpu = event->cpu;
+ if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+ raw.frag.size = cpuhw->usedss;
+ raw.frag.data = cpuhw->stop;
+ perf_sample_save_raw_data(&data, &raw);
+ }
+
+ overflow = perf_event_overflow(event, &data, &regs);
+ if (overflow)
+ event->pmu->stop(event, 0);
+
+ perf_event_update_userpage(event);
+ return overflow;
}
static void cpumf_pmu_stop(struct perf_event *event, int flags)
{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+ struct cpu_cf_events *cpuhw = this_cpu_cfhw();
struct hw_perf_event *hwc = &event->hw;
+ int i;
if (!(hwc->state & PERF_HES_STOPPED)) {
/* Decrement reference count for this counter set and if this
* is the last used counter in the set, clear activation
* control and set the counter set state to inactive.
*/
- if (!atomic_dec_return(&cpuhw->ctr_set[hwc->config_base]))
- ctr_set_stop(&cpuhw->state, hwc->config_base);
- event->hw.state |= PERF_HES_STOPPED;
+ for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
+ if (!(hwc->config_base & cpumf_ctr_ctl[i]))
+ continue;
+ if (!atomic_dec_return(&cpuhw->ctr_set[i]))
+ ctr_set_stop(&cpuhw->state, cpumf_ctr_ctl[i]);
+ }
+ hwc->state |= PERF_HES_STOPPED;
}
if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
- hw_perf_event_update(event);
- event->hw.state |= PERF_HES_UPTODATE;
+ if (hwc->config == PERF_EVENT_CPUM_CF_DIAG) {
+ local64_inc(&event->count);
+ cpuhw->usedss = cfdiag_getctr(cpuhw->stop,
+ sizeof(cpuhw->stop),
+ event->hw.config_base,
+ false);
+ if (cfdiag_diffctr(cpuhw, event->hw.config_base))
+ cfdiag_push_sample(event, cpuhw);
+ } else {
+ hw_perf_event_update(event);
+ }
+ hwc->state |= PERF_HES_UPTODATE;
}
}
static int cpumf_pmu_add(struct perf_event *event, int flags)
{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-
- /* Check authorization for the counter set to which this
- * counter belongs.
- * For group events transaction, the authorization check is
- * done in cpumf_pmu_commit_txn().
- */
- if (!(cpuhw->txn_flags & PERF_PMU_TXN_ADD))
- if (validate_ctr_auth(&event->hw))
- return -ENOENT;
+ struct cpu_cf_events *cpuhw = this_cpu_cfhw();
ctr_set_enable(&cpuhw->state, event->hw.config_base);
event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
@@ -430,14 +1028,13 @@ static int cpumf_pmu_add(struct perf_event *event, int flags)
if (flags & PERF_EF_START)
cpumf_pmu_start(event, PERF_EF_RELOAD);
- perf_event_update_userpage(event);
-
return 0;
}
static void cpumf_pmu_del(struct perf_event *event, int flags)
{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+ struct cpu_cf_events *cpuhw = this_cpu_cfhw();
+ int i;
cpumf_pmu_stop(event, PERF_EF_UPDATE);
@@ -449,112 +1046,905 @@ static void cpumf_pmu_del(struct perf_event *event, int flags)
* clear enable control and resets all counters in a set. Therefore,
* cpumf_pmu_start() always has to reenable a counter set.
*/
- if (!atomic_read(&cpuhw->ctr_set[event->hw.config_base]))
- ctr_set_disable(&cpuhw->state, event->hw.config_base);
-
- perf_event_update_userpage(event);
+ for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i)
+ if (!atomic_read(&cpuhw->ctr_set[i]))
+ ctr_set_disable(&cpuhw->state, cpumf_ctr_ctl[i]);
}
+/* Performance monitoring unit for s390x */
+static struct pmu cpumf_pmu = {
+ .task_ctx_nr = perf_sw_context,
+ .capabilities = PERF_PMU_CAP_NO_INTERRUPT,
+ .pmu_enable = cpumf_pmu_enable,
+ .pmu_disable = cpumf_pmu_disable,
+ .event_init = cpumf_pmu_event_init,
+ .add = cpumf_pmu_add,
+ .del = cpumf_pmu_del,
+ .start = cpumf_pmu_start,
+ .stop = cpumf_pmu_stop,
+ .read = cpumf_pmu_read,
+};
+
+static struct cfset_session { /* CPUs and counter set bit mask */
+ struct list_head head; /* Head of list of active processes */
+} cfset_session = {
+ .head = LIST_HEAD_INIT(cfset_session.head)
+};
+
+static refcount_t cfset_opencnt = REFCOUNT_INIT(0); /* Access count */
/*
- * Start group events scheduling transaction.
- * Set flags to perform a single test at commit time.
+ * Synchronize access to device /dev/hwctr. This mutex protects against
+ * concurrent access to functions cfset_open() and cfset_release().
+ * Same for CPU hotplug add and remove events triggering
+ * cpum_cf_online_cpu() and cpum_cf_offline_cpu().
+ * It also serializes concurrent device ioctl access from multiple
+ * processes accessing /dev/hwc.
*
- * We only support PERF_PMU_TXN_ADD transactions. Save the
- * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD
- * transactions.
+ * The mutex protects concurrent access to the /dev/hwctr session management
+ * struct cfset_session and reference counting variable cfset_opencnt.
*/
-static void cpumf_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags)
+static DEFINE_MUTEX(cfset_ctrset_mutex);
+
+/*
+ * CPU hotplug handles only /dev/hwctr device.
+ * For perf_event_open() the CPU hotplug handling is done in kernel common
+ * code:
+ * - CPU add: Nothing is done since a file descriptor can not be created
+ * and returned to the user.
+ * - CPU delete: Handled by common code via pmu_disable(), pmu_stop() and
+ * pmu_delete(). The event itself is removed when the file descriptor is
+ * closed.
+ */
+static int cfset_online_cpu(unsigned int cpu);
+
+static int cpum_cf_online_cpu(unsigned int cpu)
{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+ int rc = 0;
+
+ /*
+ * Ignore notification for perf_event_open().
+ * Handle only /dev/hwctr device sessions.
+ */
+ mutex_lock(&cfset_ctrset_mutex);
+ if (refcount_read(&cfset_opencnt)) {
+ rc = cpum_cf_alloc_cpu(cpu);
+ if (!rc)
+ cfset_online_cpu(cpu);
+ }
+ mutex_unlock(&cfset_ctrset_mutex);
+ return rc;
+}
+
+static int cfset_offline_cpu(unsigned int cpu);
+
+static int cpum_cf_offline_cpu(unsigned int cpu)
+{
+ /*
+ * During task exit processing of grouped perf events triggered by CPU
+ * hotplug processing, pmu_disable() is called as part of perf context
+ * removal process. Therefore do not trigger event removal now for
+ * perf_event_open() created events. Perf common code triggers event
+ * destruction when the event file descriptor is closed.
+ *
+ * Handle only /dev/hwctr device sessions.
+ */
+ mutex_lock(&cfset_ctrset_mutex);
+ if (refcount_read(&cfset_opencnt)) {
+ cfset_offline_cpu(cpu);
+ cpum_cf_free_cpu(cpu);
+ }
+ mutex_unlock(&cfset_ctrset_mutex);
+ return 0;
+}
+
+/* Return true if store counter set multiple instruction is available */
+static inline int stccm_avail(void)
+{
+ return test_facility(142);
+}
+
+/* CPU-measurement alerts for the counter facility */
+static void cpumf_measurement_alert(struct ext_code ext_code,
+ unsigned int alert, unsigned long unused)
+{
+ struct cpu_cf_events *cpuhw;
+
+ if (!(alert & CPU_MF_INT_CF_MASK))
+ return;
- WARN_ON_ONCE(cpuhw->txn_flags); /* txn already in flight */
+ inc_irq_stat(IRQEXT_CMC);
- cpuhw->txn_flags = txn_flags;
- if (txn_flags & ~PERF_PMU_TXN_ADD)
+ /*
+ * Measurement alerts are shared and might happen when the PMU
+ * is not reserved. Ignore these alerts in this case.
+ */
+ cpuhw = this_cpu_cfhw();
+ if (!cpuhw)
return;
- perf_pmu_disable(pmu);
- cpuhw->tx_state = cpuhw->state;
+ /* counter authorization change alert */
+ if (alert & CPU_MF_INT_CF_CACA)
+ qctri(&cpumf_ctr_info);
+
+ /* loss of counter data alert */
+ if (alert & CPU_MF_INT_CF_LCDA)
+ pr_err("CPU[%i] Counter data was lost\n", smp_processor_id());
+
+ /* loss of MT counter data alert */
+ if (alert & CPU_MF_INT_CF_MTDA)
+ pr_warn("CPU[%i] MT counter data was lost\n",
+ smp_processor_id());
}
-/*
- * Stop and cancel a group events scheduling tranctions.
- * Assumes cpumf_pmu_del() is called for each successful added
- * cpumf_pmu_add() during the transaction.
+static int cfset_init(void);
+static int __init cpumf_pmu_init(void)
+{
+ int rc;
+
+ /* Extract counter measurement facility information */
+ if (!cpum_cf_avail() || qctri(&cpumf_ctr_info))
+ return -ENODEV;
+
+ /* Determine and store counter set sizes for later reference */
+ for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc)
+ cpum_cf_make_setsize(rc);
+
+ /*
+ * Clear bit 15 of cr0 to unauthorize problem-state to
+ * extract measurement counters
+ */
+ system_ctl_clear_bit(0, CR0_CPUMF_EXTRACTION_AUTH_BIT);
+
+ /* register handler for measurement-alert interruptions */
+ rc = register_external_irq(EXT_IRQ_MEASURE_ALERT,
+ cpumf_measurement_alert);
+ if (rc) {
+ pr_err("Registering for CPU-measurement alerts failed with rc=%i\n", rc);
+ return rc;
+ }
+
+ /* Setup s390dbf facility */
+ cf_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128);
+ if (!cf_dbg) {
+ pr_err("Registration of s390dbf(cpum_cf) failed\n");
+ rc = -ENOMEM;
+ goto out1;
+ }
+ debug_register_view(cf_dbg, &debug_sprintf_view);
+
+ cpumf_pmu.attr_groups = cpumf_cf_event_group();
+ rc = perf_pmu_register(&cpumf_pmu, "cpum_cf", -1);
+ if (rc) {
+ pr_err("Registering the cpum_cf PMU failed with rc=%i\n", rc);
+ goto out2;
+ } else if (stccm_avail()) { /* Setup counter set device */
+ cfset_init();
+ }
+
+ rc = cpuhp_setup_state(CPUHP_AP_PERF_S390_CF_ONLINE,
+ "perf/s390/cf:online",
+ cpum_cf_online_cpu, cpum_cf_offline_cpu);
+ return rc;
+
+out2:
+ debug_unregister_view(cf_dbg, &debug_sprintf_view);
+ debug_unregister(cf_dbg);
+out1:
+ unregister_external_irq(EXT_IRQ_MEASURE_ALERT, cpumf_measurement_alert);
+ return rc;
+}
+
+/* Support for the CPU Measurement Facility counter set extraction using
+ * device /dev/hwctr. This allows user space programs to extract complete
+ * counter sets via normal file operations.
+ */
+
+struct cfset_call_on_cpu_parm { /* Parm struct for smp_call_on_cpu */
+ unsigned int sets; /* Counter set bit mask */
+ atomic_t cpus_ack; /* # CPUs successfully executed func */
+};
+
+struct cfset_request { /* CPUs and counter set bit mask */
+ unsigned long ctrset; /* Bit mask of counter set to read */
+ cpumask_t mask; /* CPU mask to read from */
+ struct list_head node; /* Chain to cfset_session.head */
+};
+
+static void cfset_session_init(void)
+{
+ INIT_LIST_HEAD(&cfset_session.head);
+}
+
+/* Remove current request from global bookkeeping. Maintain a counter set bit
+ * mask on a per CPU basis.
+ * Done in process context under mutex protection.
*/
-static void cpumf_pmu_cancel_txn(struct pmu *pmu)
+static void cfset_session_del(struct cfset_request *p)
{
- unsigned int txn_flags;
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+ list_del(&p->node);
+}
- WARN_ON_ONCE(!cpuhw->txn_flags); /* no txn in flight */
+/* Add current request to global bookkeeping. Maintain a counter set bit mask
+ * on a per CPU basis.
+ * Done in process context under mutex protection.
+ */
+static void cfset_session_add(struct cfset_request *p)
+{
+ list_add(&p->node, &cfset_session.head);
+}
- txn_flags = cpuhw->txn_flags;
- cpuhw->txn_flags = 0;
- if (txn_flags & ~PERF_PMU_TXN_ADD)
- return;
+/* The /dev/hwctr device access uses PMU_F_IN_USE to mark the device access
+ * path is currently used.
+ * The cpu_cf_events::dev_state is used to denote counter sets in use by this
+ * interface. It is always or'ed in. If this interface is not active, its
+ * value is zero and no additional counter sets will be included.
+ *
+ * The cpu_cf_events::state is used by the perf_event_open SVC and remains
+ * unchanged.
+ *
+ * perf_pmu_enable() and perf_pmu_disable() and their callbacks
+ * cpumf_pmu_enable() and cpumf_pmu_disable() are called by the
+ * performance measurement subsystem to enable per process
+ * CPU Measurement counter facility.
+ * The XXX_enable() and XXX_disable functions are used to turn off
+ * x86 performance monitoring interrupt (PMI) during scheduling.
+ * s390 uses these calls to temporarily stop and resume the active CPU
+ * counter sets during scheduling.
+ *
+ * We do allow concurrent access of perf_event_open() SVC and /dev/hwctr
+ * device access. The perf_event_open() SVC interface makes a lot of effort
+ * to only run the counters while the calling process is actively scheduled
+ * to run.
+ * When /dev/hwctr interface is also used at the same time, the counter sets
+ * will keep running, even when the process is scheduled off a CPU.
+ * However this is not a problem and does not lead to wrong counter values
+ * for the perf_event_open() SVC. The current counter value will be recorded
+ * during schedule-in. At schedule-out time the current counter value is
+ * extracted again and the delta is calculated and added to the event.
+ */
+/* Stop all counter sets via ioctl interface */
+static void cfset_ioctl_off(void *parm)
+{
+ struct cpu_cf_events *cpuhw = this_cpu_cfhw();
+ struct cfset_call_on_cpu_parm *p = parm;
+ int rc;
+
+ /* Check if any counter set used by /dev/hwctr */
+ for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc)
+ if ((p->sets & cpumf_ctr_ctl[rc])) {
+ if (!atomic_dec_return(&cpuhw->ctr_set[rc])) {
+ ctr_set_disable(&cpuhw->dev_state,
+ cpumf_ctr_ctl[rc]);
+ ctr_set_stop(&cpuhw->dev_state,
+ cpumf_ctr_ctl[rc]);
+ }
+ }
+ /* Keep perf_event_open counter sets */
+ rc = lcctl(cpuhw->dev_state | cpuhw->state);
+ if (rc)
+ pr_err("Counter set stop %#llx of /dev/%s failed rc=%i\n",
+ cpuhw->state, S390_HWCTR_DEVICE, rc);
+ if (!cpuhw->dev_state)
+ cpuhw->flags &= ~PMU_F_IN_USE;
+}
+
+/* Start counter sets on particular CPU */
+static void cfset_ioctl_on(void *parm)
+{
+ struct cpu_cf_events *cpuhw = this_cpu_cfhw();
+ struct cfset_call_on_cpu_parm *p = parm;
+ int rc;
+
+ cpuhw->flags |= PMU_F_IN_USE;
+ ctr_set_enable(&cpuhw->dev_state, p->sets);
+ ctr_set_start(&cpuhw->dev_state, p->sets);
+ for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc)
+ if ((p->sets & cpumf_ctr_ctl[rc]))
+ atomic_inc(&cpuhw->ctr_set[rc]);
+ rc = lcctl(cpuhw->dev_state | cpuhw->state); /* Start counter sets */
+ if (!rc)
+ atomic_inc(&p->cpus_ack);
+ else
+ pr_err("Counter set start %#llx of /dev/%s failed rc=%i\n",
+ cpuhw->dev_state | cpuhw->state, S390_HWCTR_DEVICE, rc);
+}
+
+static void cfset_release_cpu(void *p)
+{
+ struct cpu_cf_events *cpuhw = this_cpu_cfhw();
+ int rc;
+
+ cpuhw->dev_state = 0;
+ rc = lcctl(cpuhw->state); /* Keep perf_event_open counter sets */
+ if (rc)
+ pr_err("Counter set release %#llx of /dev/%s failed rc=%i\n",
+ cpuhw->state, S390_HWCTR_DEVICE, rc);
+}
+
+/* This modifies the process CPU mask to adapt it to the currently online
+ * CPUs. Offline CPUs can not be addressed. This call terminates the access
+ * and is usually followed by close() or a new ioctl(..., START, ...) which
+ * creates a new request structure.
+ */
+static void cfset_all_stop(struct cfset_request *req)
+{
+ struct cfset_call_on_cpu_parm p = {
+ .sets = req->ctrset,
+ };
- WARN_ON(cpuhw->tx_state != cpuhw->state);
+ cpumask_and(&req->mask, &req->mask, cpu_online_mask);
+ on_each_cpu_mask(&req->mask, cfset_ioctl_off, &p, 1);
+}
- perf_pmu_enable(pmu);
+/* Release function is also called when application gets terminated without
+ * doing a proper ioctl(..., S390_HWCTR_STOP, ...) command.
+ */
+static int cfset_release(struct inode *inode, struct file *file)
+{
+ mutex_lock(&cfset_ctrset_mutex);
+ /* Open followed by close/exit has no private_data */
+ if (file->private_data) {
+ cfset_all_stop(file->private_data);
+ cfset_session_del(file->private_data);
+ kfree(file->private_data);
+ file->private_data = NULL;
+ }
+ if (refcount_dec_and_test(&cfset_opencnt)) { /* Last close */
+ on_each_cpu(cfset_release_cpu, NULL, 1);
+ cpum_cf_free(-1);
+ }
+ mutex_unlock(&cfset_ctrset_mutex);
+ return 0;
}
/*
- * Commit the group events scheduling transaction. On success, the
- * transaction is closed. On error, the transaction is kept open
- * until cpumf_pmu_cancel_txn() is called.
+ * Open via /dev/hwctr device. Allocate all per CPU resources on the first
+ * open of the device. The last close releases all per CPU resources.
+ * Parallel perf_event_open system calls also use per CPU resources.
+ * These invocations are handled via reference counting on the per CPU data
+ * structures.
*/
-static int cpumf_pmu_commit_txn(struct pmu *pmu)
+static int cfset_open(struct inode *inode, struct file *file)
{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
- u64 state;
+ int rc = 0;
+
+ if (!perfmon_capable())
+ return -EPERM;
+ file->private_data = NULL;
+
+ mutex_lock(&cfset_ctrset_mutex);
+ if (!refcount_inc_not_zero(&cfset_opencnt)) { /* First open */
+ rc = cpum_cf_alloc(-1);
+ if (!rc) {
+ cfset_session_init();
+ refcount_set(&cfset_opencnt, 1);
+ }
+ }
+ mutex_unlock(&cfset_ctrset_mutex);
- WARN_ON_ONCE(!cpuhw->txn_flags); /* no txn in flight */
+ /* nonseekable_open() never fails */
+ return rc ?: nonseekable_open(inode, file);
+}
- if (cpuhw->txn_flags & ~PERF_PMU_TXN_ADD) {
- cpuhw->txn_flags = 0;
- return 0;
+static int cfset_all_start(struct cfset_request *req)
+{
+ struct cfset_call_on_cpu_parm p = {
+ .sets = req->ctrset,
+ .cpus_ack = ATOMIC_INIT(0),
+ };
+ cpumask_var_t mask;
+ int rc = 0;
+
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+ cpumask_and(mask, &req->mask, cpu_online_mask);
+ on_each_cpu_mask(mask, cfset_ioctl_on, &p, 1);
+ if (atomic_read(&p.cpus_ack) != cpumask_weight(mask)) {
+ on_each_cpu_mask(mask, cfset_ioctl_off, &p, 1);
+ rc = -EIO;
}
+ free_cpumask_var(mask);
+ return rc;
+}
- /* check if the updated state can be scheduled */
- state = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1);
- state >>= CPUMF_LCCTL_ENABLE_SHIFT;
- if ((state & cpuhw->info.auth_ctl) != state)
- return -ENOENT;
+/* Return the maximum required space for all possible CPUs in case a
+ * CPU is set online during the START, READ, STOP cycles.
+ * To find out the size of the counter sets, any one CPU will do. They
+ * all have the same counter sets.
+ */
+static size_t cfset_needspace(unsigned int sets)
+{
+ size_t bytes = 0;
+ int i;
+
+ for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
+ if (!(sets & cpumf_ctr_ctl[i]))
+ continue;
+ bytes += cpum_cf_read_setsize(i) * sizeof(u64) +
+ sizeof(((struct s390_ctrset_setdata *)0)->set) +
+ sizeof(((struct s390_ctrset_setdata *)0)->no_cnts);
+ }
+ bytes = sizeof(((struct s390_ctrset_read *)0)->no_cpus) + nr_cpu_ids *
+ (bytes + sizeof(((struct s390_ctrset_cpudata *)0)->cpu_nr) +
+ sizeof(((struct s390_ctrset_cpudata *)0)->no_sets));
+ return bytes;
+}
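Editor's note: the space calculated above follows this shape (a sketch; the 4-byte widths of the set/no_cnts and cpu_nr/no_sets fields are assumptions about the uapi layout):

    per_set = 8 + no_cnts * 8              /* set header fields + 8-byte counters */
    per_cpu = 8 + sum of per_set           /* cpu_nr + no_sets + requested sets   */
    total   = sizeof(no_cpus) + nr_cpu_ids * per_cpu

For example, requesting only the basic counter set (6 counters) with nr_cpu_ids == 4 gives sizeof(no_cpus) + 4 * (8 + 8 + 48) = sizeof(no_cpus) + 256 bytes.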
+
+static int cfset_all_copy(unsigned long arg, cpumask_t *mask)
+{
+ struct s390_ctrset_read __user *ctrset_read;
+ unsigned int cpu, cpus, rc = 0;
+ void __user *uptr;
+
+ ctrset_read = (struct s390_ctrset_read __user *)arg;
+ uptr = ctrset_read->data;
+ for_each_cpu(cpu, mask) {
+ struct cpu_cf_events *cpuhw = get_cpu_cfhw(cpu);
+ struct s390_ctrset_cpudata __user *ctrset_cpudata;
+
+ ctrset_cpudata = uptr;
+ rc = put_user(cpu, &ctrset_cpudata->cpu_nr);
+ rc |= put_user(cpuhw->sets, &ctrset_cpudata->no_sets);
+ rc |= copy_to_user(ctrset_cpudata->data, cpuhw->data,
+ cpuhw->used);
+ if (rc) {
+ rc = -EFAULT;
+ goto out;
+ }
+ uptr += sizeof(struct s390_ctrset_cpudata) + cpuhw->used;
+ cond_resched();
+ }
+ cpus = cpumask_weight(mask);
+ if (put_user(cpus, &ctrset_read->no_cpus))
+ rc = -EFAULT;
+out:
+ return rc;
+}
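Editor's illustration: a user-space consumer could walk the buffer filled in above roughly as follows. This is a hedged sketch: the struct member names appear in the code above, but the header name and the flexible-array layout are assumptions.

    #include <linux/types.h>
    #include <asm/hwctrset.h>        /* assumed uapi header providing the structs */

    static void walk_read_buffer(void *buf)
    {
            struct s390_ctrset_read *rd = buf;
            unsigned char *p = (unsigned char *)rd->data;
            unsigned long cpu, set;

            for (cpu = 0; cpu < rd->no_cpus; cpu++) {
                    struct s390_ctrset_cpudata *cd = (void *)p;

                    p = (unsigned char *)cd->data;
                    for (set = 0; set < cd->no_sets; set++) {
                            struct s390_ctrset_setdata *sd = (void *)p;

                            /* sd->set names the counter set, sd->no_cnts is
                             * the number of 8-byte counters in sd->cv[].
                             */
                            p += sizeof(*sd) + sd->no_cnts * sizeof(__u64);
                    }
            }
    }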
- cpuhw->txn_flags = 0;
- perf_pmu_enable(pmu);
+static size_t cfset_cpuset_read(struct s390_ctrset_setdata *p, int ctrset,
+ int ctrset_size, size_t room)
+{
+ size_t need = 0;
+ int rc = -1;
+
+ need = sizeof(*p) + sizeof(u64) * ctrset_size;
+ if (need <= room) {
+ p->set = cpumf_ctr_ctl[ctrset];
+ p->no_cnts = ctrset_size;
+ rc = ctr_stcctm(ctrset, ctrset_size, (u64 *)p->cv);
+ if (rc == 3) /* Nothing stored */
+ need = 0;
+ }
+ return need;
+}
+
+/* Read all counter sets. */
+static void cfset_cpu_read(void *parm)
+{
+ struct cpu_cf_events *cpuhw = this_cpu_cfhw();
+ struct cfset_call_on_cpu_parm *p = parm;
+ int set, set_size;
+ size_t space;
+
+ /* No data saved yet */
+ cpuhw->used = 0;
+ cpuhw->sets = 0;
+ memset(cpuhw->data, 0, sizeof(cpuhw->data));
+
+ /* Scan the counter sets */
+ for (set = CPUMF_CTR_SET_BASIC; set < CPUMF_CTR_SET_MAX; ++set) {
+ struct s390_ctrset_setdata *sp = (void *)cpuhw->data +
+ cpuhw->used;
+
+ if (!(p->sets & cpumf_ctr_ctl[set]))
+ continue; /* Counter set not in list */
+ set_size = cpum_cf_read_setsize(set);
+ space = sizeof(cpuhw->data) - cpuhw->used;
+ space = cfset_cpuset_read(sp, set, set_size, space);
+ if (space) {
+ cpuhw->used += space;
+ cpuhw->sets += 1;
+ }
+ }
+}
+
+static int cfset_all_read(unsigned long arg, struct cfset_request *req)
+{
+ struct cfset_call_on_cpu_parm p;
+ cpumask_var_t mask;
+ int rc;
+
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ p.sets = req->ctrset;
+ cpumask_and(mask, &req->mask, cpu_online_mask);
+ on_each_cpu_mask(mask, cfset_cpu_read, &p, 1);
+ rc = cfset_all_copy(arg, mask);
+ free_cpumask_var(mask);
+ return rc;
+}
+
+static long cfset_ioctl_read(unsigned long arg, struct cfset_request *req)
+{
+ int ret = -ENODATA;
+
+ if (req && req->ctrset)
+ ret = cfset_all_read(arg, req);
+ return ret;
+}
+
+static long cfset_ioctl_stop(struct file *file)
+{
+ struct cfset_request *req = file->private_data;
+ int ret = -ENXIO;
+
+ if (req) {
+ cfset_all_stop(req);
+ cfset_session_del(req);
+ kfree(req);
+ file->private_data = NULL;
+ ret = 0;
+ }
+ return ret;
+}
+
+static long cfset_ioctl_start(unsigned long arg, struct file *file)
+{
+ struct s390_ctrset_start __user *ustart;
+ struct s390_ctrset_start start;
+ struct cfset_request *preq;
+ void __user *umask;
+ unsigned int len;
+ int ret = 0;
+ size_t need;
+
+ if (file->private_data)
+ return -EBUSY;
+ ustart = (struct s390_ctrset_start __user *)arg;
+ if (copy_from_user(&start, ustart, sizeof(start)))
+ return -EFAULT;
+ if (start.version != S390_HWCTR_START_VERSION)
+ return -EINVAL;
+ if (start.counter_sets & ~(cpumf_ctr_ctl[CPUMF_CTR_SET_BASIC] |
+ cpumf_ctr_ctl[CPUMF_CTR_SET_USER] |
+ cpumf_ctr_ctl[CPUMF_CTR_SET_CRYPTO] |
+ cpumf_ctr_ctl[CPUMF_CTR_SET_EXT] |
+ cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG]))
+ return -EINVAL; /* Invalid counter set */
+ if (!start.counter_sets)
+ return -EINVAL; /* No counter set at all? */
+
+ preq = kzalloc(sizeof(*preq), GFP_KERNEL);
+ if (!preq)
+ return -ENOMEM;
+ cpumask_clear(&preq->mask);
+ len = min_t(u64, start.cpumask_len, cpumask_size());
+ umask = (void __user *)start.cpumask;
+ if (copy_from_user(&preq->mask, umask, len)) {
+ kfree(preq);
+ return -EFAULT;
+ }
+ if (cpumask_empty(&preq->mask)) {
+ kfree(preq);
+ return -EINVAL;
+ }
+ need = cfset_needspace(start.counter_sets);
+ if (put_user(need, &ustart->data_bytes)) {
+ kfree(preq);
+ return -EFAULT;
+ }
+ preq->ctrset = start.counter_sets;
+ ret = cfset_all_start(preq);
+ if (!ret) {
+ cfset_session_add(preq);
+ file->private_data = preq;
+ } else {
+ kfree(preq);
+ }
+ return ret;
+}
+
+/* Entry point to the /dev/hwctr device interface.
+ * The ioctl system call supports three subcommands:
+ * S390_HWCTR_START: Start the specified counter sets on a CPU list. The
+ * counter set keeps running until explicitly stopped. Returns the number
+ * of bytes needed to store the counter values. If another S390_HWCTR_START
+ * ioctl subcommand is called without a previous S390_HWCTR_STOP
+ * command on the same file descriptor, -EBUSY is returned.
+ * S390_HWCTR_READ: Read the counter set values from the CPU list given
+ * with the S390_HWCTR_START command.
+ * S390_HWCTR_STOP: Stop the counter sets on the CPU list given with the
+ * previous S390_HWCTR_START subcommand.
+ */
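Editor's illustration: putting the three subcommands together, a user-space session could look roughly like the sketch below. The S390_HWCTR_* names and struct fields come from the code in this patch; the header name, the counter-set bit value and the omitted error handling are assumptions.

    #include <asm/hwctrset.h>        /* assumed uapi header */
    #include <fcntl.h>
    #include <sched.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    int main(void)
    {
            struct s390_ctrset_start start = { .version = S390_HWCTR_START_VERSION };
            cpu_set_t cpus;
            void *buf;
            int fd;

            CPU_ZERO(&cpus);
            CPU_SET(0, &cpus);                    /* measure CPU 0 only */
            start.counter_sets = 0x02;            /* assumed: basic counter set bit */
            start.cpumask_len = sizeof(cpus);
            start.cpumask = (void *)&cpus;
            fd = open("/dev/hwctr", O_RDWR);
            ioctl(fd, S390_HWCTR_START, &start);  /* kernel fills start.data_bytes */
            buf = malloc(start.data_bytes);
            ioctl(fd, S390_HWCTR_READ, buf);      /* buffer holds struct s390_ctrset_read */
            ioctl(fd, S390_HWCTR_STOP, 0);
            free(buf);
            close(fd);
            return 0;
    }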
+static long cfset_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ int ret;
+
+ cpus_read_lock();
+ mutex_lock(&cfset_ctrset_mutex);
+ switch (cmd) {
+ case S390_HWCTR_START:
+ ret = cfset_ioctl_start(arg, file);
+ break;
+ case S390_HWCTR_STOP:
+ ret = cfset_ioctl_stop(file);
+ break;
+ case S390_HWCTR_READ:
+ ret = cfset_ioctl_read(arg, file->private_data);
+ break;
+ default:
+ ret = -ENOTTY;
+ break;
+ }
+ mutex_unlock(&cfset_ctrset_mutex);
+ cpus_read_unlock();
+ return ret;
+}
+
+static const struct file_operations cfset_fops = {
+ .owner = THIS_MODULE,
+ .open = cfset_open,
+ .release = cfset_release,
+ .unlocked_ioctl = cfset_ioctl,
+ .compat_ioctl = cfset_ioctl,
+ .llseek = no_llseek
+};
+
+static struct miscdevice cfset_dev = {
+ .name = S390_HWCTR_DEVICE,
+ .minor = MISC_DYNAMIC_MINOR,
+ .fops = &cfset_fops,
+ .mode = 0666,
+};
+
+/* Hotplug add of a CPU. Scan through all active processes and add
+ * that CPU to the list of CPUs supplied with ioctl(..., START, ...).
+ */
+static int cfset_online_cpu(unsigned int cpu)
+{
+ struct cfset_call_on_cpu_parm p;
+ struct cfset_request *rp;
+
+ if (!list_empty(&cfset_session.head)) {
+ list_for_each_entry(rp, &cfset_session.head, node) {
+ p.sets = rp->ctrset;
+ cfset_ioctl_on(&p);
+ cpumask_set_cpu(cpu, &rp->mask);
+ }
+ }
return 0;
}
-/* Performance monitoring unit for s390x */
-static struct pmu cpumf_pmu = {
+/* Hotplug remove of a CPU. Scan through all active processes and clear
+ * that CPU from the list of CPUs supplied with ioctl(..., START, ...).
+ * Adjust reference counts.
+ */
+static int cfset_offline_cpu(unsigned int cpu)
+{
+ struct cfset_call_on_cpu_parm p;
+ struct cfset_request *rp;
+
+ if (!list_empty(&cfset_session.head)) {
+ list_for_each_entry(rp, &cfset_session.head, node) {
+ p.sets = rp->ctrset;
+ cfset_ioctl_off(&p);
+ cpumask_clear_cpu(cpu, &rp->mask);
+ }
+ }
+ return 0;
+}
+
+static void cfdiag_read(struct perf_event *event)
+{
+}
+
+static int get_authctrsets(void)
+{
+ unsigned long auth = 0;
+ enum cpumf_ctr_set i;
+
+ for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
+ if (cpumf_ctr_info.auth_ctl & cpumf_ctr_ctl[i])
+ auth |= cpumf_ctr_ctl[i];
+ }
+ return auth;
+}
+
+/* Setup the event. Test for authorized counter sets and only include counter
+ * sets which are authorized at the time of the setup. Including unauthorized
+ * counter sets results in a specification exception (and panic).
+ */
+static int cfdiag_event_init2(struct perf_event *event)
+{
+ struct perf_event_attr *attr = &event->attr;
+ int err = 0;
+
+ /* Set sample_period to indicate sampling */
+ event->hw.config = attr->config;
+ event->hw.sample_period = attr->sample_period;
+ local64_set(&event->hw.period_left, event->hw.sample_period);
+ local64_set(&event->count, 0);
+ event->hw.last_period = event->hw.sample_period;
+
+ /* Add all authorized counter sets to config_base. The
+ * hardware init function is either called per-cpu or just once
+ * for all CPUs (event->cpu == -1). This depends on whether
+ * counting is started for all CPUs or on a per-workload basis where
+ * the perf event moves from one CPU to another CPU.
+ * Checking the authorization on any CPU is fine as the hardware
+ * applies the same authorization settings to all CPUs.
+ */
+ event->hw.config_base = get_authctrsets();
+
+ /* No authorized counter sets, nothing to count/sample */
+ if (!event->hw.config_base)
+ err = -EINVAL;
+
+ return err;
+}
+
+static int cfdiag_event_init(struct perf_event *event)
+{
+ struct perf_event_attr *attr = &event->attr;
+ int err = -ENOENT;
+
+ if (event->attr.config != PERF_EVENT_CPUM_CF_DIAG ||
+ event->attr.type != event->pmu->type)
+ goto out;
+
+ /* Raw events are used to access counters directly,
+ * hence do not permit excludes.
+ * This event is useless without PERF_SAMPLE_RAW to return counter set
+ * values as raw data.
+ */
+ if (attr->exclude_kernel || attr->exclude_user || attr->exclude_hv ||
+ !(attr->sample_type & (PERF_SAMPLE_CPU | PERF_SAMPLE_RAW))) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ /* Initialize for using the CPU-measurement counter facility */
+ if (cpum_cf_alloc(event->cpu))
+ return -ENOMEM;
+ event->destroy = hw_perf_event_destroy;
+
+ err = cfdiag_event_init2(event);
+ if (unlikely(err))
+ event->destroy(event);
+out:
+ return err;
+}
+
+/* Create the cf_diag/events/CF_DIAG event sysfs file. This counter is used
+ * to collect the complete counter sets for a scheduled process. The goal is
+ * to attach the complete counter sets as raw data to this artificial event.
+ * This makes the complete counter sets available when a process is
+ * scheduled. They contain the delta of every counter while the process was
+ * running.
+ */
+CPUMF_EVENT_ATTR(CF_DIAG, CF_DIAG, PERF_EVENT_CPUM_CF_DIAG);
+
+static struct attribute *cfdiag_events_attr[] = {
+ CPUMF_EVENT_PTR(CF_DIAG, CF_DIAG),
+ NULL,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-63");
+
+static struct attribute *cfdiag_format_attr[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+
+static struct attribute_group cfdiag_events_group = {
+ .name = "events",
+ .attrs = cfdiag_events_attr,
+};
+static struct attribute_group cfdiag_format_group = {
+ .name = "format",
+ .attrs = cfdiag_format_attr,
+};
+static const struct attribute_group *cfdiag_attr_groups[] = {
+ &cfdiag_events_group,
+ &cfdiag_format_group,
+ NULL,
+};
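Editor's illustration: from user space, the CF_DIAG event described above is opened through perf_event_open(). A hedged sketch; the PMU type and the CF_DIAG config value would be read from sysfs (e.g. /sys/bus/event_source/devices/cpum_cf_diag/), which is treated as an assumption and passed in as parameters here:

    #include <linux/perf_event.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Excludes must stay zero and sample_type must include PERF_SAMPLE_RAW
     * or PERF_SAMPLE_CPU, otherwise cfdiag_event_init() rejects the event.
     */
    static int open_cf_diag(int cpu, __u32 pmu_type, __u64 cf_diag_config)
    {
            struct perf_event_attr attr = {
                    .type = pmu_type,              /* sysfs "type" file, assumed */
                    .size = sizeof(attr),
                    .config = cf_diag_config,      /* sysfs events/CF_DIAG, assumed */
                    .sample_type = PERF_SAMPLE_CPU | PERF_SAMPLE_RAW,
            };

            return syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
    }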
+
+/* Performance monitoring unit for event CF_DIAG. Since this event
+ * is also started and stopped via the perf_event_open() system call, use
+ * the same event enable/disable callback functions. They do not
+ * have a pointer to the perf_event structure as first parameter.
+ *
+ * The functions XXX_add, XXX_del, XXX_start and XXX_stop are also common.
+ * Reuse them and distinguish the event (always first parameter) via
+ * 'config' member.
+ */
+static struct pmu cf_diag = {
.task_ctx_nr = perf_sw_context,
- .capabilities = PERF_PMU_CAP_NO_INTERRUPT,
+ .event_init = cfdiag_event_init,
.pmu_enable = cpumf_pmu_enable,
.pmu_disable = cpumf_pmu_disable,
- .event_init = cpumf_pmu_event_init,
.add = cpumf_pmu_add,
.del = cpumf_pmu_del,
.start = cpumf_pmu_start,
.stop = cpumf_pmu_stop,
- .read = cpumf_pmu_read,
- .start_txn = cpumf_pmu_start_txn,
- .commit_txn = cpumf_pmu_commit_txn,
- .cancel_txn = cpumf_pmu_cancel_txn,
+ .read = cfdiag_read,
+
+ .attr_groups = cfdiag_attr_groups
};
-static int __init cpumf_pmu_init(void)
+/* Calculate memory needed to store all counter sets together with header and
+ * trailer data. This is independent of the counter set authorization which
+ * can vary depending on the configuration.
+ */
+static size_t cfdiag_maxsize(struct cpumf_ctr_info *info)
+{
+ size_t max_size = sizeof(struct cf_trailer_entry);
+ enum cpumf_ctr_set i;
+
+ for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
+ size_t size = cpum_cf_read_setsize(i);
+
+ if (size)
+ max_size += size * sizeof(u64) +
+ sizeof(struct cf_ctrset_entry);
+ }
+ return max_size;
+}
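Editor's note: a worked example of the buffer-size check performed in cfset_init() below, using the per-set counter counts listed in the removed cf_diag_ctrset_size() later in this patch for a machine with cfvn 3 and csvn 6 (treated as an assumption about current hardware): basic 6, user 2, crypto 20, extended 160, MT-diagnostic 48 counters. Each set costs an 8-byte header plus 8 bytes per counter, and the 64-byte trailer is added once:

    64 + (8 + 48) + (8 + 16) + (8 + 160) + (8 + 1280) + (8 + 384) = 1992 bytes

which fits into the predefined per-CPU buffer (PAGE_SIZE in the removed implementation, assumed unchanged here).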
+
+/* Get the CPU speed, try sampling facility first and CPU attributes second. */
+static void cfdiag_get_cpu_speed(void)
+{
+ unsigned long mhz;
+
+ if (cpum_sf_avail()) { /* Sampling facility first */
+ struct hws_qsi_info_block si;
+
+ memset(&si, 0, sizeof(si));
+ if (!qsi(&si)) {
+ cfdiag_cpu_speed = si.cpu_speed;
+ return;
+ }
+ }
+
+ /* Fallback: extract the static part of the CPU speed. Used in case
+ * the CPU Measurement Sampling Facility is turned off.
+ */
+ mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0);
+ if (mhz != -1UL)
+ cfdiag_cpu_speed = mhz & 0xffffffff;
+}
+
+static int cfset_init(void)
{
+ size_t need;
int rc;
- if (!kernel_cpumcf_avail())
- return -ENODEV;
+ cfdiag_get_cpu_speed();
+ /* Make sure the counter set data fits into predefined buffer. */
+ need = cfdiag_maxsize(&cpumf_ctr_info);
+ if (need > sizeof(((struct cpu_cf_events *)0)->start)) {
+ pr_err("Insufficient memory for PMU(cpum_cf_diag) need=%zu\n",
+ need);
+ return -ENOMEM;
+ }
- cpumf_pmu.attr_groups = cpumf_cf_event_group();
- rc = perf_pmu_register(&cpumf_pmu, "cpum_cf", -1);
- if (rc)
- pr_err("Registering the cpum_cf PMU failed with rc=%i\n", rc);
+ rc = misc_register(&cfset_dev);
+ if (rc) {
+ pr_err("Registration of /dev/%s failed rc=%i\n",
+ cfset_dev.name, rc);
+ goto out;
+ }
+
+ rc = perf_pmu_register(&cf_diag, "cpum_cf_diag", -1);
+ if (rc) {
+ misc_deregister(&cfset_dev);
+ pr_err("Registration of PMU(cpum_cf_diag) failed with rc=%i\n",
+ rc);
+ }
+out:
return rc;
}
-subsys_initcall(cpumf_pmu_init);
+
+device_initcall(cpumf_pmu_init);
diff --git a/arch/s390/kernel/perf_cpum_cf_common.c b/arch/s390/kernel/perf_cpum_cf_common.c
deleted file mode 100644
index 3bced89caffb..000000000000
--- a/arch/s390/kernel/perf_cpum_cf_common.c
+++ /dev/null
@@ -1,201 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * CPU-Measurement Counter Facility Support - Common Layer
- *
- * Copyright IBM Corp. 2019
- * Author(s): Hendrik Brueckner <brueckner@linux.ibm.com>
- */
-#define KMSG_COMPONENT "cpum_cf_common"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/kernel_stat.h>
-#include <linux/percpu.h>
-#include <linux/notifier.h>
-#include <linux/init.h>
-#include <linux/export.h>
-#include <asm/ctl_reg.h>
-#include <asm/irq.h>
-#include <asm/cpu_mcf.h>
-
-/* Per-CPU event structure for the counter facility */
-DEFINE_PER_CPU(struct cpu_cf_events, cpu_cf_events) = {
- .ctr_set = {
- [CPUMF_CTR_SET_BASIC] = ATOMIC_INIT(0),
- [CPUMF_CTR_SET_USER] = ATOMIC_INIT(0),
- [CPUMF_CTR_SET_CRYPTO] = ATOMIC_INIT(0),
- [CPUMF_CTR_SET_EXT] = ATOMIC_INIT(0),
- [CPUMF_CTR_SET_MT_DIAG] = ATOMIC_INIT(0),
- },
- .alert = ATOMIC64_INIT(0),
- .state = 0,
- .flags = 0,
- .txn_flags = 0,
-};
-/* Indicator whether the CPU-Measurement Counter Facility Support is ready */
-static bool cpum_cf_initalized;
-
-/* CPU-measurement alerts for the counter facility */
-static void cpumf_measurement_alert(struct ext_code ext_code,
- unsigned int alert, unsigned long unused)
-{
- struct cpu_cf_events *cpuhw;
-
- if (!(alert & CPU_MF_INT_CF_MASK))
- return;
-
- inc_irq_stat(IRQEXT_CMC);
- cpuhw = this_cpu_ptr(&cpu_cf_events);
-
- /* Measurement alerts are shared and might happen when the PMU
- * is not reserved. Ignore these alerts in this case. */
- if (!(cpuhw->flags & PMU_F_RESERVED))
- return;
-
- /* counter authorization change alert */
- if (alert & CPU_MF_INT_CF_CACA)
- qctri(&cpuhw->info);
-
- /* loss of counter data alert */
- if (alert & CPU_MF_INT_CF_LCDA)
- pr_err("CPU[%i] Counter data was lost\n", smp_processor_id());
-
- /* loss of MT counter data alert */
- if (alert & CPU_MF_INT_CF_MTDA)
- pr_warn("CPU[%i] MT counter data was lost\n",
- smp_processor_id());
-
- /* store alert for special handling by in-kernel users */
- atomic64_or(alert, &cpuhw->alert);
-}
-
-#define PMC_INIT 0
-#define PMC_RELEASE 1
-static void cpum_cf_setup_cpu(void *flags)
-{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-
- switch (*((int *) flags)) {
- case PMC_INIT:
- memset(&cpuhw->info, 0, sizeof(cpuhw->info));
- qctri(&cpuhw->info);
- cpuhw->flags |= PMU_F_RESERVED;
- break;
-
- case PMC_RELEASE:
- cpuhw->flags &= ~PMU_F_RESERVED;
- break;
- }
-
- /* Disable CPU counter sets */
- lcctl(0);
-}
-
-bool kernel_cpumcf_avail(void)
-{
- return cpum_cf_initalized;
-}
-EXPORT_SYMBOL(kernel_cpumcf_avail);
-
-
-/* Reserve/release functions for sharing perf hardware */
-static DEFINE_SPINLOCK(cpumcf_owner_lock);
-static void *cpumcf_owner;
-
-/* Initialize the CPU-measurement counter facility */
-int __kernel_cpumcf_begin(void)
-{
- int flags = PMC_INIT;
- int err = 0;
-
- spin_lock(&cpumcf_owner_lock);
- if (cpumcf_owner)
- err = -EBUSY;
- else
- cpumcf_owner = __builtin_return_address(0);
- spin_unlock(&cpumcf_owner_lock);
- if (err)
- return err;
-
- on_each_cpu(cpum_cf_setup_cpu, &flags, 1);
- irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT);
-
- return 0;
-}
-EXPORT_SYMBOL(__kernel_cpumcf_begin);
-
-/* Obtain the CPU-measurement alerts for the counter facility */
-unsigned long kernel_cpumcf_alert(int clear)
-{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
- unsigned long alert;
-
- alert = atomic64_read(&cpuhw->alert);
- if (clear)
- atomic64_set(&cpuhw->alert, 0);
-
- return alert;
-}
-EXPORT_SYMBOL(kernel_cpumcf_alert);
-
-/* Release the CPU-measurement counter facility */
-void __kernel_cpumcf_end(void)
-{
- int flags = PMC_RELEASE;
-
- on_each_cpu(cpum_cf_setup_cpu, &flags, 1);
- irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT);
-
- spin_lock(&cpumcf_owner_lock);
- cpumcf_owner = NULL;
- spin_unlock(&cpumcf_owner_lock);
-}
-EXPORT_SYMBOL(__kernel_cpumcf_end);
-
-static int cpum_cf_setup(unsigned int cpu, int flags)
-{
- local_irq_disable();
- cpum_cf_setup_cpu(&flags);
- local_irq_enable();
- return 0;
-}
-
-static int cpum_cf_online_cpu(unsigned int cpu)
-{
- return cpum_cf_setup(cpu, PMC_INIT);
-}
-
-static int cpum_cf_offline_cpu(unsigned int cpu)
-{
- return cpum_cf_setup(cpu, PMC_RELEASE);
-}
-
-static int __init cpum_cf_init(void)
-{
- int rc;
-
- if (!cpum_cf_avail())
- return -ENODEV;
-
- /* clear bit 15 of cr0 to unauthorize problem-state to
- * extract measurement counters */
- ctl_clear_bit(0, 48);
-
- /* register handler for measurement-alert interruptions */
- rc = register_external_irq(EXT_IRQ_MEASURE_ALERT,
- cpumf_measurement_alert);
- if (rc) {
- pr_err("Registering for CPU-measurement alerts "
- "failed with rc=%i\n", rc);
- return rc;
- }
-
- rc = cpuhp_setup_state(CPUHP_AP_PERF_S390_CF_ONLINE,
- "perf/s390/cf:online",
- cpum_cf_online_cpu, cpum_cf_offline_cpu);
- if (!rc)
- cpum_cf_initalized = true;
-
- return rc;
-}
-early_initcall(cpum_cf_init);
diff --git a/arch/s390/kernel/perf_cpum_cf_diag.c b/arch/s390/kernel/perf_cpum_cf_diag.c
deleted file mode 100644
index e949ab832ed7..000000000000
--- a/arch/s390/kernel/perf_cpum_cf_diag.c
+++ /dev/null
@@ -1,705 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Performance event support for s390x - CPU-measurement Counter Sets
- *
- * Copyright IBM Corp. 2019
- * Author(s): Hendrik Brueckner <brueckner@linux.ibm.com>
- * Thomas Richer <tmricht@linux.ibm.com>
- */
-#define KMSG_COMPONENT "cpum_cf_diag"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/kernel_stat.h>
-#include <linux/percpu.h>
-#include <linux/notifier.h>
-#include <linux/init.h>
-#include <linux/export.h>
-#include <linux/slab.h>
-#include <linux/processor.h>
-
-#include <asm/ctl_reg.h>
-#include <asm/irq.h>
-#include <asm/cpu_mcf.h>
-#include <asm/timex.h>
-#include <asm/debug.h>
-
-#define CF_DIAG_CTRSET_DEF 0xfeef /* Counter set header mark */
-
-static unsigned int cf_diag_cpu_speed;
-static debug_info_t *cf_diag_dbg;
-
-struct cf_diag_csd { /* Counter set data per CPU */
- size_t used; /* Bytes used in data/start */
- unsigned char start[PAGE_SIZE]; /* Counter set at event start */
- unsigned char data[PAGE_SIZE]; /* Counter set at event delete */
-};
-static DEFINE_PER_CPU(struct cf_diag_csd, cf_diag_csd);
-
-/* Counter sets are stored as data stream in a page sized memory buffer and
- * exported to user space via raw data attached to the event sample data.
- * Each counter set starts with an eight byte header consisting of:
- * - a two byte eye catcher (0xfeef)
- * - a one byte counter set number
- * - a two byte counter set size (indicates the number of counters in this set)
- * - a three byte reserved value (must be zero) to make the header the same
- * size as a counter value.
- * All counter values are eight byte in size.
- *
- * All counter sets are followed by a 64 byte trailer.
- * The trailer consists of a:
- * - flag field indicating valid fields when corresponding bit set
- * - the counter facility first and second version number
- * - the CPU speed if nonzero
- * - the time stamp the counter sets have been collected
- * - the time of day (TOD) base value
- * - the machine type.
- *
- * The counter sets are saved when the process is prepared to be executed on a
- * CPU and saved again when the process is going to be removed from a CPU.
- * The difference of both counter sets are calculated and stored in the event
- * sample data area.
- */
-
-struct cf_ctrset_entry { /* CPU-M CF counter set entry (8 byte) */
- unsigned int def:16; /* 0-15 Data Entry Format */
- unsigned int set:16; /* 16-31 Counter set identifier */
- unsigned int ctr:16; /* 32-47 Number of stored counters */
- unsigned int res1:16; /* 48-63 Reserved */
-};
-
-struct cf_trailer_entry { /* CPU-M CF_DIAG trailer (64 byte) */
- /* 0 - 7 */
- union {
- struct {
- unsigned int clock_base:1; /* TOD clock base set */
- unsigned int speed:1; /* CPU speed set */
- /* Measurement alerts */
- unsigned int mtda:1; /* Loss of MT ctr. data alert */
- unsigned int caca:1; /* Counter auth. change alert */
- unsigned int lcda:1; /* Loss of counter data alert */
- };
- unsigned long flags; /* 0-63 All indicators */
- };
- /* 8 - 15 */
- unsigned int cfvn:16; /* 64-79 Ctr First Version */
- unsigned int csvn:16; /* 80-95 Ctr Second Version */
- unsigned int cpu_speed:32; /* 96-127 CPU speed */
- /* 16 - 23 */
- unsigned long timestamp; /* 128-191 Timestamp (TOD) */
- /* 24 - 55 */
- union {
- struct {
- unsigned long progusage1;
- unsigned long progusage2;
- unsigned long progusage3;
- unsigned long tod_base;
- };
- unsigned long progusage[4];
- };
- /* 56 - 63 */
- unsigned int mach_type:16; /* Machine type */
- unsigned int res1:16; /* Reserved */
- unsigned int res2:32; /* Reserved */
-};
-
-/* Create the trailer data at the end of a page. */
-static void cf_diag_trailer(struct cf_trailer_entry *te)
-{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
- struct cpuid cpuid;
-
- te->cfvn = cpuhw->info.cfvn; /* Counter version numbers */
- te->csvn = cpuhw->info.csvn;
-
- get_cpu_id(&cpuid); /* Machine type */
- te->mach_type = cpuid.machine;
- te->cpu_speed = cf_diag_cpu_speed;
- if (te->cpu_speed)
- te->speed = 1;
- te->clock_base = 1; /* Save clock base */
- memcpy(&te->tod_base, &tod_clock_base[1], 8);
- store_tod_clock((__u64 *)&te->timestamp);
-}
-
-/*
- * Change the CPUMF state to active.
- * Enable and activate the CPU-counter sets according
- * to the per-cpu control state.
- */
-static void cf_diag_enable(struct pmu *pmu)
-{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
- int err;
-
- debug_sprintf_event(cf_diag_dbg, 5,
- "%s pmu %p cpu %d flags %#x state %#llx\n",
- __func__, pmu, smp_processor_id(), cpuhw->flags,
- cpuhw->state);
- if (cpuhw->flags & PMU_F_ENABLED)
- return;
-
- err = lcctl(cpuhw->state);
- if (err) {
- pr_err("Enabling the performance measuring unit "
- "failed with rc=%x\n", err);
- return;
- }
- cpuhw->flags |= PMU_F_ENABLED;
-}
-
-/*
- * Change the CPUMF state to inactive.
- * Disable and enable (inactive) the CPU-counter sets according
- * to the per-cpu control state.
- */
-static void cf_diag_disable(struct pmu *pmu)
-{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
- u64 inactive;
- int err;
-
- debug_sprintf_event(cf_diag_dbg, 5,
- "%s pmu %p cpu %d flags %#x state %#llx\n",
- __func__, pmu, smp_processor_id(), cpuhw->flags,
- cpuhw->state);
- if (!(cpuhw->flags & PMU_F_ENABLED))
- return;
-
- inactive = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1);
- err = lcctl(inactive);
- if (err) {
- pr_err("Disabling the performance measuring unit "
- "failed with rc=%x\n", err);
- return;
- }
- cpuhw->flags &= ~PMU_F_ENABLED;
-}
-
-/* Number of perf events counting hardware events */
-static atomic_t cf_diag_events = ATOMIC_INIT(0);
-
-/* Release the PMU if event is the last perf event */
-static void cf_diag_perf_event_destroy(struct perf_event *event)
-{
- debug_sprintf_event(cf_diag_dbg, 5,
- "%s event %p cpu %d cf_diag_events %d\n",
- __func__, event, event->cpu,
- atomic_read(&cf_diag_events));
- if (atomic_dec_return(&cf_diag_events) == 0)
- __kernel_cpumcf_end();
-}
-
-/* Setup the event. Test for authorized counter sets and only include counter
- * sets which are authorized at the time of the setup. Including unauthorized
- * counter sets result in specification exception (and panic).
- */
-static int __hw_perf_event_init(struct perf_event *event)
-{
- struct perf_event_attr *attr = &event->attr;
- struct cpu_cf_events *cpuhw;
- enum cpumf_ctr_set i;
- int err = 0;
-
- debug_sprintf_event(cf_diag_dbg, 5, "%s event %p cpu %d\n", __func__,
- event, event->cpu);
-
- event->hw.config = attr->config;
- event->hw.config_base = 0;
-
- /* Add all authorized counter sets to config_base. The
- * the hardware init function is either called per-cpu or just once
- * for all CPUS (event->cpu == -1). This depends on the whether
- * counting is started for all CPUs or on a per workload base where
- * the perf event moves from one CPU to another CPU.
- * Checking the authorization on any CPU is fine as the hardware
- * applies the same authorization settings to all CPUs.
- */
- cpuhw = &get_cpu_var(cpu_cf_events);
- for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i)
- if (cpuhw->info.auth_ctl & cpumf_ctr_ctl[i])
- event->hw.config_base |= cpumf_ctr_ctl[i];
- put_cpu_var(cpu_cf_events);
-
- /* No authorized counter sets, nothing to count/sample */
- if (!event->hw.config_base) {
- err = -EINVAL;
- goto out;
- }
-
- /* Set sample_period to indicate sampling */
- event->hw.sample_period = attr->sample_period;
- local64_set(&event->hw.period_left, event->hw.sample_period);
- event->hw.last_period = event->hw.sample_period;
-out:
- debug_sprintf_event(cf_diag_dbg, 5, "%s err %d config_base %#lx\n",
- __func__, err, event->hw.config_base);
- return err;
-}
-
-static int cf_diag_event_init(struct perf_event *event)
-{
- struct perf_event_attr *attr = &event->attr;
- int err = -ENOENT;
-
- debug_sprintf_event(cf_diag_dbg, 5,
- "%s event %p cpu %d config %#llx type:%u "
- "sample_type %#llx cf_diag_events %d\n", __func__,
- event, event->cpu, attr->config, event->pmu->type,
- attr->sample_type, atomic_read(&cf_diag_events));
-
- if (event->attr.config != PERF_EVENT_CPUM_CF_DIAG ||
- event->attr.type != event->pmu->type)
- goto out;
-
- /* Raw events are used to access counters directly,
- * hence do not permit excludes.
- * This event is usesless without PERF_SAMPLE_RAW to return counter set
- * values as raw data.
- */
- if (attr->exclude_kernel || attr->exclude_user || attr->exclude_hv ||
- !(attr->sample_type & (PERF_SAMPLE_CPU | PERF_SAMPLE_RAW))) {
- err = -EOPNOTSUPP;
- goto out;
- }
-
- /* Initialize for using the CPU-measurement counter facility */
- if (atomic_inc_return(&cf_diag_events) == 1) {
- if (__kernel_cpumcf_begin()) {
- atomic_dec(&cf_diag_events);
- err = -EBUSY;
- goto out;
- }
- }
- event->destroy = cf_diag_perf_event_destroy;
-
- err = __hw_perf_event_init(event);
- if (unlikely(err))
- event->destroy(event);
-out:
- debug_sprintf_event(cf_diag_dbg, 5, "%s err %d\n", __func__, err);
- return err;
-}
-
-static void cf_diag_read(struct perf_event *event)
-{
- debug_sprintf_event(cf_diag_dbg, 5, "%s event %p\n", __func__, event);
-}
-
-/* Return the maximum possible counter set size (in number of 8 byte counters)
- * depending on type and model number.
- */
-static size_t cf_diag_ctrset_size(enum cpumf_ctr_set ctrset,
- struct cpumf_ctr_info *info)
-{
- size_t ctrset_size = 0;
-
- switch (ctrset) {
- case CPUMF_CTR_SET_BASIC:
- if (info->cfvn >= 1)
- ctrset_size = 6;
- break;
- case CPUMF_CTR_SET_USER:
- if (info->cfvn == 1)
- ctrset_size = 6;
- else if (info->cfvn >= 3)
- ctrset_size = 2;
- break;
- case CPUMF_CTR_SET_CRYPTO:
- if (info->csvn >= 1 && info->csvn <= 5)
- ctrset_size = 16;
- else if (info->csvn == 6)
- ctrset_size = 20;
- break;
- case CPUMF_CTR_SET_EXT:
- if (info->csvn == 1)
- ctrset_size = 32;
- else if (info->csvn == 2)
- ctrset_size = 48;
- else if (info->csvn >= 3 && info->csvn <= 5)
- ctrset_size = 128;
- else if (info->csvn == 6)
- ctrset_size = 160;
- break;
- case CPUMF_CTR_SET_MT_DIAG:
- if (info->csvn > 3)
- ctrset_size = 48;
- break;
- case CPUMF_CTR_SET_MAX:
- break;
- }
-
- return ctrset_size;
-}
-
-/* Calculate memory needed to store all counter sets together with header and
- * trailer data. This is independend of the counter set authorization which
- * can vary depending on the configuration.
- */
-static size_t cf_diag_ctrset_maxsize(struct cpumf_ctr_info *info)
-{
- size_t max_size = sizeof(struct cf_trailer_entry);
- enum cpumf_ctr_set i;
-
- for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
- size_t size = cf_diag_ctrset_size(i, info);
-
- if (size)
- max_size += size * sizeof(u64) +
- sizeof(struct cf_ctrset_entry);
- }
- debug_sprintf_event(cf_diag_dbg, 5, "%s max_size %zu\n", __func__,
- max_size);
-
- return max_size;
-}
-
-/* Read a counter set. The counter set number determines which counter set and
- * the CPUM-CF first and second version number determine the number of
- * available counters in this counter set.
- * Each counter set starts with header containing the counter set number and
- * the number of 8 byte counters.
- *
- * The functions returns the number of bytes occupied by this counter set
- * including the header.
- * If there is no counter in the counter set, this counter set is useless and
- * zero is returned on this case.
- */
-static size_t cf_diag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset,
- size_t room)
-{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
- size_t ctrset_size, need = 0;
- int rc = 3; /* Assume write failure */
-
- ctrdata->def = CF_DIAG_CTRSET_DEF;
- ctrdata->set = ctrset;
- ctrdata->res1 = 0;
- ctrset_size = cf_diag_ctrset_size(ctrset, &cpuhw->info);
-
- if (ctrset_size) { /* Save data */
- need = ctrset_size * sizeof(u64) + sizeof(*ctrdata);
- if (need <= room)
- rc = ctr_stcctm(ctrset, ctrset_size,
- (u64 *)(ctrdata + 1));
- if (rc != 3)
- ctrdata->ctr = ctrset_size;
- else
- need = 0;
- }
-
- debug_sprintf_event(cf_diag_dbg, 6,
- "%s ctrset %d ctrset_size %zu cfvn %d csvn %d"
- " need %zd rc %d\n",
- __func__, ctrset, ctrset_size, cpuhw->info.cfvn,
- cpuhw->info.csvn, need, rc);
- return need;
-}
-
-/* Read out all counter sets and save them in the provided data buffer.
- * The last 64 byte host an artificial trailer entry.
- */
-static size_t cf_diag_getctr(void *data, size_t sz, unsigned long auth)
-{
- struct cf_trailer_entry *trailer;
- size_t offset = 0, done;
- int i;
-
- memset(data, 0, sz);
- sz -= sizeof(*trailer); /* Always room for trailer */
- for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
- struct cf_ctrset_entry *ctrdata = data + offset;
-
- if (!(auth & cpumf_ctr_ctl[i]))
- continue; /* Counter set not authorized */
-
- done = cf_diag_getctrset(ctrdata, i, sz - offset);
- offset += done;
- debug_sprintf_event(cf_diag_dbg, 6,
- "%s ctrset %d offset %zu done %zu\n",
- __func__, i, offset, done);
- }
- trailer = data + offset;
- cf_diag_trailer(trailer);
- return offset + sizeof(*trailer);
-}
-
-/* Calculate the difference for each counter in a counter set. */
-static void cf_diag_diffctrset(u64 *pstart, u64 *pstop, int counters)
-{
- for (; --counters >= 0; ++pstart, ++pstop)
- if (*pstop >= *pstart)
- *pstop -= *pstart;
- else
- *pstop = *pstart - *pstop;
-}
-
-/* Scan the counter sets and calculate the difference of each counter
- * in each set. The result is the increment of each counter during the
- * period the counter set has been activated.
- *
- * Return true on success.
- */
-static int cf_diag_diffctr(struct cf_diag_csd *csd, unsigned long auth)
-{
- struct cf_trailer_entry *trailer_start, *trailer_stop;
- struct cf_ctrset_entry *ctrstart, *ctrstop;
- size_t offset = 0;
-
- auth &= (1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1;
- do {
- ctrstart = (struct cf_ctrset_entry *)(csd->start + offset);
- ctrstop = (struct cf_ctrset_entry *)(csd->data + offset);
-
- if (memcmp(ctrstop, ctrstart, sizeof(*ctrstop))) {
- pr_err("cpum_cf_diag counter set compare error "
- "in set %i\n", ctrstart->set);
- return 0;
- }
- auth &= ~cpumf_ctr_ctl[ctrstart->set];
- if (ctrstart->def == CF_DIAG_CTRSET_DEF) {
- cf_diag_diffctrset((u64 *)(ctrstart + 1),
- (u64 *)(ctrstop + 1), ctrstart->ctr);
- offset += ctrstart->ctr * sizeof(u64) +
- sizeof(*ctrstart);
- }
- debug_sprintf_event(cf_diag_dbg, 6,
- "%s set %d ctr %d offset %zu auth %lx\n",
- __func__, ctrstart->set, ctrstart->ctr,
- offset, auth);
- } while (ctrstart->def && auth);
-
- /* Save time_stamp from start of event in stop's trailer */
- trailer_start = (struct cf_trailer_entry *)(csd->start + offset);
- trailer_stop = (struct cf_trailer_entry *)(csd->data + offset);
- trailer_stop->progusage[0] = trailer_start->timestamp;
-
- return 1;
-}
-
-/* Create perf event sample with the counter sets as raw data. The sample
- * is then pushed to the event subsystem and the function checks for
- * possible event overflows. If an event overflow occurs, the PMU is
- * stopped.
- *
- * Return non-zero if an event overflow occurred.
- */
-static int cf_diag_push_sample(struct perf_event *event,
- struct cf_diag_csd *csd)
-{
- struct perf_sample_data data;
- struct perf_raw_record raw;
- struct pt_regs regs;
- int overflow;
-
- /* Setup perf sample */
- perf_sample_data_init(&data, 0, event->hw.last_period);
- memset(&regs, 0, sizeof(regs));
- memset(&raw, 0, sizeof(raw));
-
- if (event->attr.sample_type & PERF_SAMPLE_CPU)
- data.cpu_entry.cpu = event->cpu;
- if (event->attr.sample_type & PERF_SAMPLE_RAW) {
- raw.frag.size = csd->used;
- raw.frag.data = csd->data;
- raw.size = csd->used;
- data.raw = &raw;
- }
-
- overflow = perf_event_overflow(event, &data, &regs);
- debug_sprintf_event(cf_diag_dbg, 6,
- "%s event %p cpu %d sample_type %#llx raw %d "
- "ov %d\n", __func__, event, event->cpu,
- event->attr.sample_type, raw.size, overflow);
- if (overflow)
- event->pmu->stop(event, 0);
-
- perf_event_update_userpage(event);
- return overflow;
-}
-
-static void cf_diag_start(struct perf_event *event, int flags)
-{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
- struct cf_diag_csd *csd = this_cpu_ptr(&cf_diag_csd);
- struct hw_perf_event *hwc = &event->hw;
-
- debug_sprintf_event(cf_diag_dbg, 5,
- "%s event %p cpu %d flags %#x hwc-state %#x\n",
- __func__, event, event->cpu, flags, hwc->state);
- if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
- return;
-
- /* (Re-)enable and activate all counter sets */
- lcctl(0); /* Reset counter sets */
- hwc->state = 0;
- ctr_set_multiple_enable(&cpuhw->state, hwc->config_base);
- lcctl(cpuhw->state); /* Enable counter sets */
- csd->used = cf_diag_getctr(csd->start, sizeof(csd->start),
- event->hw.config_base);
- ctr_set_multiple_start(&cpuhw->state, hwc->config_base);
- /* Function cf_diag_enable() starts the counter sets. */
-}
-
-static void cf_diag_stop(struct perf_event *event, int flags)
-{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
- struct cf_diag_csd *csd = this_cpu_ptr(&cf_diag_csd);
- struct hw_perf_event *hwc = &event->hw;
-
- debug_sprintf_event(cf_diag_dbg, 5,
- "%s event %p cpu %d flags %#x hwc-state %#x\n",
- __func__, event, event->cpu, flags, hwc->state);
-
- /* Deactivate all counter sets */
- ctr_set_multiple_stop(&cpuhw->state, hwc->config_base);
- local64_inc(&event->count);
- csd->used = cf_diag_getctr(csd->data, sizeof(csd->data),
- event->hw.config_base);
- if (cf_diag_diffctr(csd, event->hw.config_base))
- cf_diag_push_sample(event, csd);
- hwc->state |= PERF_HES_STOPPED;
-}
-
-static int cf_diag_add(struct perf_event *event, int flags)
-{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
- int err = 0;
-
- debug_sprintf_event(cf_diag_dbg, 5,
- "%s event %p cpu %d flags %#x cpuhw %p\n",
- __func__, event, event->cpu, flags, cpuhw);
-
- if (cpuhw->flags & PMU_F_IN_USE) {
- err = -EAGAIN;
- goto out;
- }
-
- event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
-
- cpuhw->flags |= PMU_F_IN_USE;
- if (flags & PERF_EF_START)
- cf_diag_start(event, PERF_EF_RELOAD);
-out:
- debug_sprintf_event(cf_diag_dbg, 5, "%s err %d\n", __func__, err);
- return err;
-}
-
-static void cf_diag_del(struct perf_event *event, int flags)
-{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-
- debug_sprintf_event(cf_diag_dbg, 5,
- "%s event %p cpu %d flags %#x\n",
- __func__, event, event->cpu, flags);
-
- cf_diag_stop(event, PERF_EF_UPDATE);
- ctr_set_multiple_stop(&cpuhw->state, event->hw.config_base);
- ctr_set_multiple_disable(&cpuhw->state, event->hw.config_base);
- cpuhw->flags &= ~PMU_F_IN_USE;
-}
-
-CPUMF_EVENT_ATTR(CF_DIAG, CF_DIAG, PERF_EVENT_CPUM_CF_DIAG);
-
-static struct attribute *cf_diag_events_attr[] = {
- CPUMF_EVENT_PTR(CF_DIAG, CF_DIAG),
- NULL,
-};
-
-PMU_FORMAT_ATTR(event, "config:0-63");
-
-static struct attribute *cf_diag_format_attr[] = {
- &format_attr_event.attr,
- NULL,
-};
-
-static struct attribute_group cf_diag_events_group = {
- .name = "events",
- .attrs = cf_diag_events_attr,
-};
-static struct attribute_group cf_diag_format_group = {
- .name = "format",
- .attrs = cf_diag_format_attr,
-};
-static const struct attribute_group *cf_diag_attr_groups[] = {
- &cf_diag_events_group,
- &cf_diag_format_group,
- NULL,
-};
-
-/* Performance monitoring unit for s390x */
-static struct pmu cf_diag = {
- .task_ctx_nr = perf_sw_context,
- .pmu_enable = cf_diag_enable,
- .pmu_disable = cf_diag_disable,
- .event_init = cf_diag_event_init,
- .add = cf_diag_add,
- .del = cf_diag_del,
- .start = cf_diag_start,
- .stop = cf_diag_stop,
- .read = cf_diag_read,
-
- .attr_groups = cf_diag_attr_groups
-};
-
-/* Get the CPU speed, try sampling facility first and CPU attributes second. */
-static void cf_diag_get_cpu_speed(void)
-{
- if (cpum_sf_avail()) { /* Sampling facility first */
- struct hws_qsi_info_block si;
-
- memset(&si, 0, sizeof(si));
- if (!qsi(&si)) {
- cf_diag_cpu_speed = si.cpu_speed;
- return;
- }
- }
-
- if (test_facility(34)) { /* CPU speed extract static part */
- unsigned long mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0);
-
- if (mhz != -1UL)
- cf_diag_cpu_speed = mhz & 0xffffffff;
- }
-}
-
-/* Initialize the counter set PMU to generate complete counter set data as
- * event raw data. This relies on the CPU Measurement Counter Facility device
- * already being loaded and initialized.
- */
-static int __init cf_diag_init(void)
-{
- struct cpumf_ctr_info info;
- size_t need;
- int rc;
-
- if (!kernel_cpumcf_avail() || !stccm_avail() || qctri(&info))
- return -ENODEV;
- cf_diag_get_cpu_speed();
-
- /* Make sure the counter set data fits into predefined buffer. */
- need = cf_diag_ctrset_maxsize(&info);
- if (need > sizeof(((struct cf_diag_csd *)0)->start)) {
- pr_err("Insufficient memory for PMU(cpum_cf_diag) need=%zu\n",
- need);
- return -ENOMEM;
- }
-
- /* Setup s390dbf facility */
- cf_diag_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128);
- if (!cf_diag_dbg) {
- pr_err("Registration of s390dbf(cpum_cf_diag) failed\n");
- return -ENOMEM;
- }
- debug_register_view(cf_diag_dbg, &debug_sprintf_view);
-
- rc = perf_pmu_register(&cf_diag, "cpum_cf_diag", -1);
- if (rc) {
- debug_unregister_view(cf_diag_dbg, &debug_sprintf_view);
- debug_unregister(cf_diag_dbg);
- pr_err("Registration of PMU(cpum_cf_diag) failed with rc=%i\n",
- rc);
- }
- return rc;
-}
-arch_initcall(cf_diag_init);
diff --git a/arch/s390/kernel/perf_cpum_cf_events.c b/arch/s390/kernel/perf_cpum_cf_events.c
index 8b33e03e47b8..0d64aafd158f 100644
--- a/arch/s390/kernel/perf_cpum_cf_events.c
+++ b/arch/s390/kernel/perf_cpum_cf_events.c
@@ -238,6 +238,134 @@ CPUMF_EVENT_ATTR(cf_z14, TX_C_TABORT_SPECIAL, 0x00f5);
CPUMF_EVENT_ATTR(cf_z14, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0);
CPUMF_EVENT_ATTR(cf_z14, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1);
+CPUMF_EVENT_ATTR(cf_z15, L1D_RO_EXCL_WRITES, 0x0080);
+CPUMF_EVENT_ATTR(cf_z15, DTLB2_WRITES, 0x0081);
+CPUMF_EVENT_ATTR(cf_z15, DTLB2_MISSES, 0x0082);
+CPUMF_EVENT_ATTR(cf_z15, DTLB2_HPAGE_WRITES, 0x0083);
+CPUMF_EVENT_ATTR(cf_z15, DTLB2_GPAGE_WRITES, 0x0084);
+CPUMF_EVENT_ATTR(cf_z15, L1D_L2D_SOURCED_WRITES, 0x0085);
+CPUMF_EVENT_ATTR(cf_z15, ITLB2_WRITES, 0x0086);
+CPUMF_EVENT_ATTR(cf_z15, ITLB2_MISSES, 0x0087);
+CPUMF_EVENT_ATTR(cf_z15, L1I_L2I_SOURCED_WRITES, 0x0088);
+CPUMF_EVENT_ATTR(cf_z15, TLB2_PTE_WRITES, 0x0089);
+CPUMF_EVENT_ATTR(cf_z15, TLB2_CRSTE_WRITES, 0x008a);
+CPUMF_EVENT_ATTR(cf_z15, TLB2_ENGINES_BUSY, 0x008b);
+CPUMF_EVENT_ATTR(cf_z15, TX_C_TEND, 0x008c);
+CPUMF_EVENT_ATTR(cf_z15, TX_NC_TEND, 0x008d);
+CPUMF_EVENT_ATTR(cf_z15, L1C_TLB2_MISSES, 0x008f);
+CPUMF_EVENT_ATTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES, 0x0090);
+CPUMF_EVENT_ATTR(cf_z15, L1D_ONCHIP_MEMORY_SOURCED_WRITES, 0x0091);
+CPUMF_EVENT_ATTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES_IV, 0x0092);
+CPUMF_EVENT_ATTR(cf_z15, L1D_ONCLUSTER_L3_SOURCED_WRITES, 0x0093);
+CPUMF_EVENT_ATTR(cf_z15, L1D_ONCLUSTER_MEMORY_SOURCED_WRITES, 0x0094);
+CPUMF_EVENT_ATTR(cf_z15, L1D_ONCLUSTER_L3_SOURCED_WRITES_IV, 0x0095);
+CPUMF_EVENT_ATTR(cf_z15, L1D_OFFCLUSTER_L3_SOURCED_WRITES, 0x0096);
+CPUMF_EVENT_ATTR(cf_z15, L1D_OFFCLUSTER_MEMORY_SOURCED_WRITES, 0x0097);
+CPUMF_EVENT_ATTR(cf_z15, L1D_OFFCLUSTER_L3_SOURCED_WRITES_IV, 0x0098);
+CPUMF_EVENT_ATTR(cf_z15, L1D_OFFDRAWER_L3_SOURCED_WRITES, 0x0099);
+CPUMF_EVENT_ATTR(cf_z15, L1D_OFFDRAWER_MEMORY_SOURCED_WRITES, 0x009a);
+CPUMF_EVENT_ATTR(cf_z15, L1D_OFFDRAWER_L3_SOURCED_WRITES_IV, 0x009b);
+CPUMF_EVENT_ATTR(cf_z15, L1D_ONDRAWER_L4_SOURCED_WRITES, 0x009c);
+CPUMF_EVENT_ATTR(cf_z15, L1D_OFFDRAWER_L4_SOURCED_WRITES, 0x009d);
+CPUMF_EVENT_ATTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES_RO, 0x009e);
+CPUMF_EVENT_ATTR(cf_z15, L1I_ONCHIP_L3_SOURCED_WRITES, 0x00a2);
+CPUMF_EVENT_ATTR(cf_z15, L1I_ONCHIP_MEMORY_SOURCED_WRITES, 0x00a3);
+CPUMF_EVENT_ATTR(cf_z15, L1I_ONCHIP_L3_SOURCED_WRITES_IV, 0x00a4);
+CPUMF_EVENT_ATTR(cf_z15, L1I_ONCLUSTER_L3_SOURCED_WRITES, 0x00a5);
+CPUMF_EVENT_ATTR(cf_z15, L1I_ONCLUSTER_MEMORY_SOURCED_WRITES, 0x00a6);
+CPUMF_EVENT_ATTR(cf_z15, L1I_ONCLUSTER_L3_SOURCED_WRITES_IV, 0x00a7);
+CPUMF_EVENT_ATTR(cf_z15, L1I_OFFCLUSTER_L3_SOURCED_WRITES, 0x00a8);
+CPUMF_EVENT_ATTR(cf_z15, L1I_OFFCLUSTER_MEMORY_SOURCED_WRITES, 0x00a9);
+CPUMF_EVENT_ATTR(cf_z15, L1I_OFFCLUSTER_L3_SOURCED_WRITES_IV, 0x00aa);
+CPUMF_EVENT_ATTR(cf_z15, L1I_OFFDRAWER_L3_SOURCED_WRITES, 0x00ab);
+CPUMF_EVENT_ATTR(cf_z15, L1I_OFFDRAWER_MEMORY_SOURCED_WRITES, 0x00ac);
+CPUMF_EVENT_ATTR(cf_z15, L1I_OFFDRAWER_L3_SOURCED_WRITES_IV, 0x00ad);
+CPUMF_EVENT_ATTR(cf_z15, L1I_ONDRAWER_L4_SOURCED_WRITES, 0x00ae);
+CPUMF_EVENT_ATTR(cf_z15, L1I_OFFDRAWER_L4_SOURCED_WRITES, 0x00af);
+CPUMF_EVENT_ATTR(cf_z15, BCD_DFP_EXECUTION_SLOTS, 0x00e0);
+CPUMF_EVENT_ATTR(cf_z15, VX_BCD_EXECUTION_SLOTS, 0x00e1);
+CPUMF_EVENT_ATTR(cf_z15, DECIMAL_INSTRUCTIONS, 0x00e2);
+CPUMF_EVENT_ATTR(cf_z15, LAST_HOST_TRANSLATIONS, 0x00e8);
+CPUMF_EVENT_ATTR(cf_z15, TX_NC_TABORT, 0x00f3);
+CPUMF_EVENT_ATTR(cf_z15, TX_C_TABORT_NO_SPECIAL, 0x00f4);
+CPUMF_EVENT_ATTR(cf_z15, TX_C_TABORT_SPECIAL, 0x00f5);
+CPUMF_EVENT_ATTR(cf_z15, DFLT_ACCESS, 0x00f7);
+CPUMF_EVENT_ATTR(cf_z15, DFLT_CYCLES, 0x00fc);
+CPUMF_EVENT_ATTR(cf_z15, DFLT_CC, 0x00108);
+CPUMF_EVENT_ATTR(cf_z15, DFLT_CCFINISH, 0x00109);
+CPUMF_EVENT_ATTR(cf_z15, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0);
+CPUMF_EVENT_ATTR(cf_z15, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1);
+CPUMF_EVENT_ATTR(cf_z16, L1D_RO_EXCL_WRITES, 0x0080);
+CPUMF_EVENT_ATTR(cf_z16, DTLB2_WRITES, 0x0081);
+CPUMF_EVENT_ATTR(cf_z16, DTLB2_MISSES, 0x0082);
+CPUMF_EVENT_ATTR(cf_z16, CRSTE_1MB_WRITES, 0x0083);
+CPUMF_EVENT_ATTR(cf_z16, DTLB2_GPAGE_WRITES, 0x0084);
+CPUMF_EVENT_ATTR(cf_z16, ITLB2_WRITES, 0x0086);
+CPUMF_EVENT_ATTR(cf_z16, ITLB2_MISSES, 0x0087);
+CPUMF_EVENT_ATTR(cf_z16, TLB2_PTE_WRITES, 0x0089);
+CPUMF_EVENT_ATTR(cf_z16, TLB2_CRSTE_WRITES, 0x008a);
+CPUMF_EVENT_ATTR(cf_z16, TLB2_ENGINES_BUSY, 0x008b);
+CPUMF_EVENT_ATTR(cf_z16, TX_C_TEND, 0x008c);
+CPUMF_EVENT_ATTR(cf_z16, TX_NC_TEND, 0x008d);
+CPUMF_EVENT_ATTR(cf_z16, L1C_TLB2_MISSES, 0x008f);
+CPUMF_EVENT_ATTR(cf_z16, DCW_REQ, 0x0091);
+CPUMF_EVENT_ATTR(cf_z16, DCW_REQ_IV, 0x0092);
+CPUMF_EVENT_ATTR(cf_z16, DCW_REQ_CHIP_HIT, 0x0093);
+CPUMF_EVENT_ATTR(cf_z16, DCW_REQ_DRAWER_HIT, 0x0094);
+CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP, 0x0095);
+CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_IV, 0x0096);
+CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_CHIP_HIT, 0x0097);
+CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_DRAWER_HIT, 0x0098);
+CPUMF_EVENT_ATTR(cf_z16, DCW_ON_MODULE, 0x0099);
+CPUMF_EVENT_ATTR(cf_z16, DCW_ON_DRAWER, 0x009a);
+CPUMF_EVENT_ATTR(cf_z16, DCW_OFF_DRAWER, 0x009b);
+CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_MEMORY, 0x009c);
+CPUMF_EVENT_ATTR(cf_z16, DCW_ON_MODULE_MEMORY, 0x009d);
+CPUMF_EVENT_ATTR(cf_z16, DCW_ON_DRAWER_MEMORY, 0x009e);
+CPUMF_EVENT_ATTR(cf_z16, DCW_OFF_DRAWER_MEMORY, 0x009f);
+CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_MODULE_IV, 0x00a0);
+CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_MODULE_CHIP_HIT, 0x00a1);
+CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_MODULE_DRAWER_HIT, 0x00a2);
+CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_DRAWER_IV, 0x00a3);
+CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_DRAWER_CHIP_HIT, 0x00a4);
+CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_DRAWER_DRAWER_HIT, 0x00a5);
+CPUMF_EVENT_ATTR(cf_z16, IDCW_OFF_DRAWER_IV, 0x00a6);
+CPUMF_EVENT_ATTR(cf_z16, IDCW_OFF_DRAWER_CHIP_HIT, 0x00a7);
+CPUMF_EVENT_ATTR(cf_z16, IDCW_OFF_DRAWER_DRAWER_HIT, 0x00a8);
+CPUMF_EVENT_ATTR(cf_z16, ICW_REQ, 0x00a9);
+CPUMF_EVENT_ATTR(cf_z16, ICW_REQ_IV, 0x00aa);
+CPUMF_EVENT_ATTR(cf_z16, ICW_REQ_CHIP_HIT, 0x00ab);
+CPUMF_EVENT_ATTR(cf_z16, ICW_REQ_DRAWER_HIT, 0x00ac);
+CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP, 0x00ad);
+CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_IV, 0x00ae);
+CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_CHIP_HIT, 0x00af);
+CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_DRAWER_HIT, 0x00b0);
+CPUMF_EVENT_ATTR(cf_z16, ICW_ON_MODULE, 0x00b1);
+CPUMF_EVENT_ATTR(cf_z16, ICW_ON_DRAWER, 0x00b2);
+CPUMF_EVENT_ATTR(cf_z16, ICW_OFF_DRAWER, 0x00b3);
+CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_MEMORY, 0x00b4);
+CPUMF_EVENT_ATTR(cf_z16, ICW_ON_MODULE_MEMORY, 0x00b5);
+CPUMF_EVENT_ATTR(cf_z16, ICW_ON_DRAWER_MEMORY, 0x00b6);
+CPUMF_EVENT_ATTR(cf_z16, ICW_OFF_DRAWER_MEMORY, 0x00b7);
+CPUMF_EVENT_ATTR(cf_z16, BCD_DFP_EXECUTION_SLOTS, 0x00e0);
+CPUMF_EVENT_ATTR(cf_z16, VX_BCD_EXECUTION_SLOTS, 0x00e1);
+CPUMF_EVENT_ATTR(cf_z16, DECIMAL_INSTRUCTIONS, 0x00e2);
+CPUMF_EVENT_ATTR(cf_z16, LAST_HOST_TRANSLATIONS, 0x00e8);
+CPUMF_EVENT_ATTR(cf_z16, TX_NC_TABORT, 0x00f4);
+CPUMF_EVENT_ATTR(cf_z16, TX_C_TABORT_NO_SPECIAL, 0x00f5);
+CPUMF_EVENT_ATTR(cf_z16, TX_C_TABORT_SPECIAL, 0x00f6);
+CPUMF_EVENT_ATTR(cf_z16, DFLT_ACCESS, 0x00f8);
+CPUMF_EVENT_ATTR(cf_z16, DFLT_CYCLES, 0x00fd);
+CPUMF_EVENT_ATTR(cf_z16, SORTL, 0x0100);
+CPUMF_EVENT_ATTR(cf_z16, DFLT_CC, 0x0109);
+CPUMF_EVENT_ATTR(cf_z16, DFLT_CCFINISH, 0x010a);
+CPUMF_EVENT_ATTR(cf_z16, NNPA_INVOCATIONS, 0x010b);
+CPUMF_EVENT_ATTR(cf_z16, NNPA_COMPLETIONS, 0x010c);
+CPUMF_EVENT_ATTR(cf_z16, NNPA_WAIT_LOCK, 0x010d);
+CPUMF_EVENT_ATTR(cf_z16, NNPA_HOLD_LOCK, 0x010e);
+CPUMF_EVENT_ATTR(cf_z16, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0);
+CPUMF_EVENT_ATTR(cf_z16, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1);
+
static struct attribute *cpumcf_fvn1_pmu_event_attr[] __initdata = {
CPUMF_EVENT_PTR(cf_fvn1, CPU_CYCLES),
CPUMF_EVENT_PTR(cf_fvn1, INSTRUCTIONS),
@@ -286,7 +414,7 @@ static struct attribute *cpumcf_svn_12345_pmu_event_attr[] __initdata = {
NULL,
};
-static struct attribute *cpumcf_svn_6_pmu_event_attr[] __initdata = {
+static struct attribute *cpumcf_svn_67_pmu_event_attr[] __initdata = {
CPUMF_EVENT_PTR(cf_svn_12345, PRNG_FUNCTIONS),
CPUMF_EVENT_PTR(cf_svn_12345, PRNG_CYCLES),
CPUMF_EVENT_PTR(cf_svn_12345, PRNG_BLOCKED_FUNCTIONS),
@@ -516,6 +644,141 @@ static struct attribute *cpumcf_z14_pmu_event_attr[] __initdata = {
NULL,
};
+static struct attribute *cpumcf_z15_pmu_event_attr[] __initdata = {
+ CPUMF_EVENT_PTR(cf_z15, L1D_RO_EXCL_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, DTLB2_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, DTLB2_MISSES),
+ CPUMF_EVENT_PTR(cf_z15, DTLB2_HPAGE_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, DTLB2_GPAGE_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_L2D_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, ITLB2_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, ITLB2_MISSES),
+ CPUMF_EVENT_PTR(cf_z15, L1I_L2I_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, TLB2_PTE_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, TLB2_CRSTE_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, TLB2_ENGINES_BUSY),
+ CPUMF_EVENT_PTR(cf_z15, TX_C_TEND),
+ CPUMF_EVENT_PTR(cf_z15, TX_NC_TEND),
+ CPUMF_EVENT_PTR(cf_z15, L1C_TLB2_MISSES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_ONCHIP_MEMORY_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES_IV),
+ CPUMF_EVENT_PTR(cf_z15, L1D_ONCLUSTER_L3_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_ONCLUSTER_MEMORY_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_ONCLUSTER_L3_SOURCED_WRITES_IV),
+ CPUMF_EVENT_PTR(cf_z15, L1D_OFFCLUSTER_L3_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_OFFCLUSTER_MEMORY_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_OFFCLUSTER_L3_SOURCED_WRITES_IV),
+ CPUMF_EVENT_PTR(cf_z15, L1D_OFFDRAWER_L3_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_OFFDRAWER_MEMORY_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_OFFDRAWER_L3_SOURCED_WRITES_IV),
+ CPUMF_EVENT_PTR(cf_z15, L1D_ONDRAWER_L4_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_OFFDRAWER_L4_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES_RO),
+ CPUMF_EVENT_PTR(cf_z15, L1I_ONCHIP_L3_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1I_ONCHIP_MEMORY_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1I_ONCHIP_L3_SOURCED_WRITES_IV),
+ CPUMF_EVENT_PTR(cf_z15, L1I_ONCLUSTER_L3_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1I_ONCLUSTER_MEMORY_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1I_ONCLUSTER_L3_SOURCED_WRITES_IV),
+ CPUMF_EVENT_PTR(cf_z15, L1I_OFFCLUSTER_L3_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1I_OFFCLUSTER_MEMORY_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1I_OFFCLUSTER_L3_SOURCED_WRITES_IV),
+ CPUMF_EVENT_PTR(cf_z15, L1I_OFFDRAWER_L3_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1I_OFFDRAWER_MEMORY_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1I_OFFDRAWER_L3_SOURCED_WRITES_IV),
+ CPUMF_EVENT_PTR(cf_z15, L1I_ONDRAWER_L4_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1I_OFFDRAWER_L4_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, BCD_DFP_EXECUTION_SLOTS),
+ CPUMF_EVENT_PTR(cf_z15, VX_BCD_EXECUTION_SLOTS),
+ CPUMF_EVENT_PTR(cf_z15, DECIMAL_INSTRUCTIONS),
+ CPUMF_EVENT_PTR(cf_z15, LAST_HOST_TRANSLATIONS),
+ CPUMF_EVENT_PTR(cf_z15, TX_NC_TABORT),
+ CPUMF_EVENT_PTR(cf_z15, TX_C_TABORT_NO_SPECIAL),
+ CPUMF_EVENT_PTR(cf_z15, TX_C_TABORT_SPECIAL),
+ CPUMF_EVENT_PTR(cf_z15, DFLT_ACCESS),
+ CPUMF_EVENT_PTR(cf_z15, DFLT_CYCLES),
+ CPUMF_EVENT_PTR(cf_z15, DFLT_CC),
+ CPUMF_EVENT_PTR(cf_z15, DFLT_CCFINISH),
+ CPUMF_EVENT_PTR(cf_z15, MT_DIAG_CYCLES_ONE_THR_ACTIVE),
+ CPUMF_EVENT_PTR(cf_z15, MT_DIAG_CYCLES_TWO_THR_ACTIVE),
+ NULL,
+};
+
+static struct attribute *cpumcf_z16_pmu_event_attr[] __initdata = {
+ CPUMF_EVENT_PTR(cf_z16, L1D_RO_EXCL_WRITES),
+ CPUMF_EVENT_PTR(cf_z16, DTLB2_WRITES),
+ CPUMF_EVENT_PTR(cf_z16, DTLB2_MISSES),
+ CPUMF_EVENT_PTR(cf_z16, CRSTE_1MB_WRITES),
+ CPUMF_EVENT_PTR(cf_z16, DTLB2_GPAGE_WRITES),
+ CPUMF_EVENT_PTR(cf_z16, ITLB2_WRITES),
+ CPUMF_EVENT_PTR(cf_z16, ITLB2_MISSES),
+ CPUMF_EVENT_PTR(cf_z16, TLB2_PTE_WRITES),
+ CPUMF_EVENT_PTR(cf_z16, TLB2_CRSTE_WRITES),
+ CPUMF_EVENT_PTR(cf_z16, TLB2_ENGINES_BUSY),
+ CPUMF_EVENT_PTR(cf_z16, TX_C_TEND),
+ CPUMF_EVENT_PTR(cf_z16, TX_NC_TEND),
+ CPUMF_EVENT_PTR(cf_z16, L1C_TLB2_MISSES),
+ CPUMF_EVENT_PTR(cf_z16, DCW_REQ),
+ CPUMF_EVENT_PTR(cf_z16, DCW_REQ_IV),
+ CPUMF_EVENT_PTR(cf_z16, DCW_REQ_CHIP_HIT),
+ CPUMF_EVENT_PTR(cf_z16, DCW_REQ_DRAWER_HIT),
+ CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP),
+ CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_IV),
+ CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_CHIP_HIT),
+ CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_DRAWER_HIT),
+ CPUMF_EVENT_PTR(cf_z16, DCW_ON_MODULE),
+ CPUMF_EVENT_PTR(cf_z16, DCW_ON_DRAWER),
+ CPUMF_EVENT_PTR(cf_z16, DCW_OFF_DRAWER),
+ CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_MEMORY),
+ CPUMF_EVENT_PTR(cf_z16, DCW_ON_MODULE_MEMORY),
+ CPUMF_EVENT_PTR(cf_z16, DCW_ON_DRAWER_MEMORY),
+ CPUMF_EVENT_PTR(cf_z16, DCW_OFF_DRAWER_MEMORY),
+ CPUMF_EVENT_PTR(cf_z16, IDCW_ON_MODULE_IV),
+ CPUMF_EVENT_PTR(cf_z16, IDCW_ON_MODULE_CHIP_HIT),
+ CPUMF_EVENT_PTR(cf_z16, IDCW_ON_MODULE_DRAWER_HIT),
+ CPUMF_EVENT_PTR(cf_z16, IDCW_ON_DRAWER_IV),
+ CPUMF_EVENT_PTR(cf_z16, IDCW_ON_DRAWER_CHIP_HIT),
+ CPUMF_EVENT_PTR(cf_z16, IDCW_ON_DRAWER_DRAWER_HIT),
+ CPUMF_EVENT_PTR(cf_z16, IDCW_OFF_DRAWER_IV),
+ CPUMF_EVENT_PTR(cf_z16, IDCW_OFF_DRAWER_CHIP_HIT),
+ CPUMF_EVENT_PTR(cf_z16, IDCW_OFF_DRAWER_DRAWER_HIT),
+ CPUMF_EVENT_PTR(cf_z16, ICW_REQ),
+ CPUMF_EVENT_PTR(cf_z16, ICW_REQ_IV),
+ CPUMF_EVENT_PTR(cf_z16, ICW_REQ_CHIP_HIT),
+ CPUMF_EVENT_PTR(cf_z16, ICW_REQ_DRAWER_HIT),
+ CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP),
+ CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_IV),
+ CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_CHIP_HIT),
+ CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_DRAWER_HIT),
+ CPUMF_EVENT_PTR(cf_z16, ICW_ON_MODULE),
+ CPUMF_EVENT_PTR(cf_z16, ICW_ON_DRAWER),
+ CPUMF_EVENT_PTR(cf_z16, ICW_OFF_DRAWER),
+ CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_MEMORY),
+ CPUMF_EVENT_PTR(cf_z16, ICW_ON_MODULE_MEMORY),
+ CPUMF_EVENT_PTR(cf_z16, ICW_ON_DRAWER_MEMORY),
+ CPUMF_EVENT_PTR(cf_z16, ICW_OFF_DRAWER_MEMORY),
+ CPUMF_EVENT_PTR(cf_z16, BCD_DFP_EXECUTION_SLOTS),
+ CPUMF_EVENT_PTR(cf_z16, VX_BCD_EXECUTION_SLOTS),
+ CPUMF_EVENT_PTR(cf_z16, DECIMAL_INSTRUCTIONS),
+ CPUMF_EVENT_PTR(cf_z16, LAST_HOST_TRANSLATIONS),
+ CPUMF_EVENT_PTR(cf_z16, TX_NC_TABORT),
+ CPUMF_EVENT_PTR(cf_z16, TX_C_TABORT_NO_SPECIAL),
+ CPUMF_EVENT_PTR(cf_z16, TX_C_TABORT_SPECIAL),
+ CPUMF_EVENT_PTR(cf_z16, DFLT_ACCESS),
+ CPUMF_EVENT_PTR(cf_z16, DFLT_CYCLES),
+ CPUMF_EVENT_PTR(cf_z16, SORTL),
+ CPUMF_EVENT_PTR(cf_z16, DFLT_CC),
+ CPUMF_EVENT_PTR(cf_z16, DFLT_CCFINISH),
+ CPUMF_EVENT_PTR(cf_z16, NNPA_INVOCATIONS),
+ CPUMF_EVENT_PTR(cf_z16, NNPA_COMPLETIONS),
+ CPUMF_EVENT_PTR(cf_z16, NNPA_WAIT_LOCK),
+ CPUMF_EVENT_PTR(cf_z16, NNPA_HOLD_LOCK),
+ CPUMF_EVENT_PTR(cf_z16, MT_DIAG_CYCLES_ONE_THR_ACTIVE),
+ CPUMF_EVENT_PTR(cf_z16, MT_DIAG_CYCLES_TWO_THR_ACTIVE),
+ NULL,
+};
+
/* END: CPUM_CF COUNTER DEFINITIONS ===================================== */
static struct attribute_group cpumcf_pmu_events_group = {
@@ -596,8 +859,8 @@ __init const struct attribute_group **cpumf_cf_event_group(void)
case 1 ... 5:
csvn = cpumcf_svn_12345_pmu_event_attr;
break;
- case 6:
- csvn = cpumcf_svn_6_pmu_event_attr;
+ case 6 ... 7:
+ csvn = cpumcf_svn_67_pmu_event_attr;
break;
default:
csvn = none;
@@ -624,9 +887,15 @@ __init const struct attribute_group **cpumf_cf_event_group(void)
break;
case 0x3906:
case 0x3907:
+ model = cpumcf_z14_pmu_event_attr;
+ break;
case 0x8561:
case 0x8562:
- model = cpumcf_z14_pmu_event_attr;
+ model = cpumcf_z15_pmu_event_attr;
+ break;
+ case 0x3931:
+ case 0x3932:
+ model = cpumcf_z16_pmu_event_attr;
break;
default:
model = none;
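For orientation, the hunk above splits the former shared z14/z15 machine-type mapping and adds the z16 machine types 0x3931/0x3932. Below is a minimal user-space sketch of that selection logic; only the machine-type values and the z14/z15/z16 split are taken from the hunk, the table contents are illustrative stand-ins, not the kernel's attribute arrays:

#include <stdio.h>

/* Illustrative stand-ins for the kernel's per-model attribute tables. */
static const char *cpumcf_z14_events = "z14 counter events";
static const char *cpumcf_z15_events = "z15 counter events";
static const char *cpumcf_z16_events = "z16 counter events";

/* Mirrors the machine-type switch added to cpumf_cf_event_group(). */
static const char *model_events(unsigned int machine_type)
{
	switch (machine_type) {
	case 0x3906:
	case 0x3907:
		return cpumcf_z14_events;
	case 0x8561:
	case 0x8562:
		return cpumcf_z15_events;
	case 0x3931:
	case 0x3932:
		return cpumcf_z16_events;
	default:
		return "no model-specific events";
	}
}

int main(void)
{
	printf("0x8562 -> %s\n", model_events(0x8562));
	printf("0x3931 -> %s\n", model_events(0x3931));
	return 0;
}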
diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c
index c07fdcd73726..06efad5b4f93 100644
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -22,6 +22,7 @@
#include <asm/irq.h>
#include <asm/debug.h>
#include <asm/timex.h>
+#include <linux/io.h>
/* Minimum number of sample-data-block-tables:
* At least one table is required for the sampling buffer structure.
@@ -42,7 +43,7 @@
#define CPUM_SF_SDBT_TL_OFFSET (CPUM_SF_SDB_PER_TABLE * 8)
static inline int require_table_link(const void *sdbt)
{
- return ((unsigned long) sdbt & ~PAGE_MASK) == CPUM_SF_SDBT_TL_OFFSET;
+ return ((unsigned long)sdbt & ~PAGE_MASK) == CPUM_SF_SDBT_TL_OFFSET;
}
/* Minimum and maximum sampling buffer sizes:
@@ -99,6 +100,57 @@ static DEFINE_PER_CPU(struct cpu_hw_sf, cpu_hw_sf);
/* Debug feature */
static debug_info_t *sfdbg;
+/* Sampling control helper functions */
+static inline unsigned long freq_to_sample_rate(struct hws_qsi_info_block *qsi,
+ unsigned long freq)
+{
+ return (USEC_PER_SEC / freq) * qsi->cpu_speed;
+}
+
+static inline unsigned long sample_rate_to_freq(struct hws_qsi_info_block *qsi,
+ unsigned long rate)
+{
+ return USEC_PER_SEC * qsi->cpu_speed / rate;
+}
+
+/* Return TOD timestamp contained in a trailer entry */
+static inline unsigned long long trailer_timestamp(struct hws_trailer_entry *te)
+{
+ /* TOD in STCKE format */
+ if (te->header.t)
+ return *((unsigned long long *)&te->timestamp[1]);
+
+ /* TOD in STCK format */
+ return *((unsigned long long *)&te->timestamp[0]);
+}
+
+/* Return pointer to trailer entry of a sample data block */
+static inline struct hws_trailer_entry *trailer_entry_ptr(unsigned long v)
+{
+ void *ret;
+
+ ret = (void *)v;
+ ret += PAGE_SIZE;
+ ret -= sizeof(struct hws_trailer_entry);
+
+ return ret;
+}
+
+/*
+ * Return true if the entry in the sample data block table (sdbt)
+ * is a link to the next sdbt
+ */
+static inline int is_link_entry(unsigned long *s)
+{
+ return *s & 0x1UL ? 1 : 0;
+}
+
+/* Return pointer to the linked sdbt */
+static inline unsigned long *get_next_sdbt(unsigned long *s)
+{
+ return phys_to_virt(*s & ~0x1UL);
+}
+
/*
* sf_disable() - Switch off sampling facility
*/
@@ -140,7 +192,7 @@ static void free_sampling_buffer(struct sf_buffer *sfb)
if (is_link_entry(curr)) {
curr = get_next_sdbt(curr);
if (sdbt)
- free_page((unsigned long) sdbt);
+ free_page((unsigned long)sdbt);
/* If the origin is reached, sampling buffer is freed */
if (curr == sfb->sdbt)
@@ -150,7 +202,7 @@ static void free_sampling_buffer(struct sf_buffer *sfb)
} else {
/* Process SDB pointer */
if (*curr) {
- free_page(*curr);
+ free_page((unsigned long)phys_to_virt(*curr));
curr++;
}
}
@@ -163,17 +215,18 @@ static void free_sampling_buffer(struct sf_buffer *sfb)
static int alloc_sample_data_block(unsigned long *sdbt, gfp_t gfp_flags)
{
- unsigned long sdb, *trailer;
+ struct hws_trailer_entry *te;
+ unsigned long sdb;
/* Allocate and initialize sample-data-block */
sdb = get_zeroed_page(gfp_flags);
if (!sdb)
return -ENOMEM;
- trailer = trailer_entry_ptr(sdb);
- *trailer = SDB_TE_ALERT_REQ_MASK;
+ te = trailer_entry_ptr(sdb);
+ te->header.a = 1;
/* Link SDB into the sample-data-block-table */
- *sdbt = sdb;
+ *sdbt = virt_to_phys((void *)sdb);
return 0;
}
@@ -225,14 +278,14 @@ static int realloc_sampling_buffer(struct sf_buffer *sfb,
for (i = 0; i < num_sdb; i++) {
/* Allocate a new SDB-table if it is full. */
if (require_table_link(tail)) {
- new = (unsigned long *) get_zeroed_page(gfp_flags);
+ new = (unsigned long *)get_zeroed_page(gfp_flags);
if (!new) {
rc = -ENOMEM;
break;
}
sfb->num_sdbt++;
/* Link current page to tail of chain */
- *tail = (unsigned long)(void *) new + 1;
+ *tail = virt_to_phys((void *)new) + 1;
tail_prev = tail;
tail = new;
}
@@ -251,7 +304,7 @@ static int realloc_sampling_buffer(struct sf_buffer *sfb,
*/
if (tail_prev) {
sfb->num_sdbt--;
- free_page((unsigned long) new);
+ free_page((unsigned long)new);
tail = tail_prev;
}
break;
@@ -262,7 +315,7 @@ static int realloc_sampling_buffer(struct sf_buffer *sfb,
}
/* Link sampling buffer to its origin */
- *tail = (unsigned long) sfb->sdbt + 1;
+ *tail = virt_to_phys(sfb->sdbt) + 1;
sfb->tail = tail;
debug_sprintf_event(sfdbg, 4, "%s: new buffer"
@@ -290,7 +343,7 @@ static int alloc_sampling_buffer(struct sf_buffer *sfb, unsigned long num_sdb)
return -EINVAL;
/* Allocate the sample-data-block-table origin */
- sfb->sdbt = (unsigned long *) get_zeroed_page(GFP_KERNEL);
+ sfb->sdbt = (unsigned long *)get_zeroed_page(GFP_KERNEL);
if (!sfb->sdbt)
return -ENOMEM;
sfb->num_sdb = 0;
@@ -300,7 +353,7 @@ static int alloc_sampling_buffer(struct sf_buffer *sfb, unsigned long num_sdb)
* realloc_sampling_buffer() invocation.
*/
sfb->tail = sfb->sdbt;
- *sfb->tail = (unsigned long)(void *) sfb->sdbt + 1;
+ *sfb->tail = virt_to_phys((void *)sfb->sdbt) + 1;
/* Allocate requested number of sample-data-blocks */
rc = realloc_sampling_buffer(sfb, num_sdb, GFP_KERNEL);
@@ -372,28 +425,33 @@ static void deallocate_buffers(struct cpu_hw_sf *cpuhw)
static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc)
{
- unsigned long n_sdb, freq, factor;
+ unsigned long n_sdb, freq;
size_t sample_size;
/* Calculate sampling buffers using 4K pages
*
- * 1. Determine the sample data size which depends on the used
- * sampling functions, for example, basic-sampling or
- * basic-sampling with diagnostic-sampling.
+ * 1. The sampling size is 32 bytes for basic sampling. This size
+ * is the same for all machine types. Diagnostic
+ * sampling uses the auxiliary data buffer setup which provides the
+ * memory for SDBs using the Linux common code auxiliary trace
+ * setup.
*
- * 2. Use the sampling frequency as input. The sampling buffer is
- * designed for almost one second. This can be adjusted through
- * the "factor" variable.
- * In any case, alloc_sampling_buffer() sets the Alert Request
+ * 2. Function alloc_sampling_buffer() sets the Alert Request
* Control indicator to trigger a measurement-alert to harvest
- * sample-data-blocks (sdb).
+ * sample-data-blocks (SDB). This is done per SDB. This
+ * measurement alert interrupt fires quickly enough to handle
+ * one SDB; at very high frequencies and workloads there might
+ * be 2 to 3 SDBs available for sample processing.
+ * Currently there is no need to set up an alert request on every
+ * n-th page. That would be counterproductive, as one IRQ would
+ * then trigger a very high number of samples to be processed at once.
*
- * 3. Compute the number of sample-data-blocks and ensure a minimum
- * of CPUM_SF_MIN_SDB. Also ensure the upper limit does not
- * exceed a "calculated" maximum. The symbolic maximum is
- * designed for basic-sampling only and needs to be increased if
- * diagnostic-sampling is active.
- * See also the remarks for these symbolic constants.
+ * 3. Use the sampling frequency as input.
+ * Compute the number of SDBs and ensure a minimum
+ * of CPUM_SF_MIN_SDB. Depending on frequency add some more
+ * SDBs to handle a higher sampling rate.
+ * Use a minimum of CPUM_SF_MIN_SDB and allow for 100 samples
+ * (one SDB) for every 10000 Hz frequency increment.
*
* 4. Compute the number of sample-data-block-tables (SDBT) and
* ensure a minimum of CPUM_SF_MIN_SDBT (one table can manage up
@@ -401,10 +459,7 @@ static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc)
*/
sample_size = sizeof(struct hws_basic_entry);
freq = sample_rate_to_freq(&cpuhw->qsi, SAMPL_RATE(hwc));
- factor = 1;
- n_sdb = DIV_ROUND_UP(freq, factor * ((PAGE_SIZE-64) / sample_size));
- if (n_sdb < CPUM_SF_MIN_SDB)
- n_sdb = CPUM_SF_MIN_SDB;
+ n_sdb = CPUM_SF_MIN_SDB + DIV_ROUND_UP(freq, 10000);
/* If there is already a sampling buffer allocated, it is very likely
* that the sampling facility is enabled too. If the event to be
@@ -539,11 +594,10 @@ static DEFINE_MUTEX(pmc_reserve_mutex);
#define PMC_FAILURE 2
static void setup_pmc_cpu(void *flags)
{
- int err;
struct cpu_hw_sf *cpusf = this_cpu_ptr(&cpu_hw_sf);
+ int err = 0;
- err = 0;
- switch (*((int *) flags)) {
+ switch (*((int *)flags)) {
case PMC_INIT:
memset(cpusf, 0, sizeof(*cpusf));
err = qsi(&cpusf->qsi);
@@ -551,28 +605,18 @@ static void setup_pmc_cpu(void *flags)
break;
cpusf->flags |= PMU_F_RESERVED;
err = sf_disable();
- if (err)
- pr_err("Switching off the sampling facility failed "
- "with rc %i\n", err);
- debug_sprintf_event(sfdbg, 5,
- "%s: initialized: cpuhw %p\n", __func__,
- cpusf);
break;
case PMC_RELEASE:
cpusf->flags &= ~PMU_F_RESERVED;
err = sf_disable();
- if (err) {
- pr_err("Switching off the sampling facility failed "
- "with rc %i\n", err);
- } else
+ if (!err)
deallocate_buffers(cpusf);
- debug_sprintf_event(sfdbg, 5,
- "%s: released: cpuhw %p\n", __func__,
- cpusf);
break;
}
- if (err)
- *((int *) flags) |= PMC_FAILURE;
+ if (err) {
+ *((int *)flags) |= PMC_FAILURE;
+ pr_err("Switching off the sampling facility failed with rc %i\n", err);
+ }
}
static void release_pmc_hardware(void)
@@ -669,8 +713,9 @@ static void cpumsf_output_event_pid(struct perf_event *event,
/* Protect callchain buffers, tasks */
rcu_read_lock();
- perf_prepare_sample(&header, data, event, regs);
- if (perf_output_begin(&handle, event, header.size))
+ perf_prepare_sample(data, event, regs);
+ perf_prepare_header(&header, data, event, regs);
+ if (perf_output_begin(&handle, data, event, header.size))
goto out;
/* Update the process ID (see also kernel/events/core.c) */
@@ -832,10 +877,6 @@ static int __hw_perf_event_init(struct perf_event *event)
SAMPL_FLAGS(hwc) |= PERF_CPUM_SF_DIAG_MODE;
}
- /* Check and set other sampling flags */
- if (attr->config1 & PERF_CPUM_SF_FULL_BLOCKS)
- SAMPL_FLAGS(hwc) |= PERF_CPUM_SF_FULL_BLOCKS;
-
err = __hw_perf_event_init_rate(event, &si);
if (err)
goto out;
@@ -879,12 +920,21 @@ out:
return err;
}
+static bool is_callchain_event(struct perf_event *event)
+{
+ u64 sample_type = event->attr.sample_type;
+
+ return sample_type & (PERF_SAMPLE_CALLCHAIN | PERF_SAMPLE_REGS_USER |
+ PERF_SAMPLE_STACK_USER);
+}
+
static int cpumsf_pmu_event_init(struct perf_event *event)
{
int err;
/* No support for taken branch sampling */
- if (has_branch_stack(event))
+ /* No support for callchain, stacks and registers */
+ if (has_branch_stack(event) || is_callchain_event(event))
return -EOPNOTSUPP;
switch (event->attr.type) {
@@ -908,10 +958,6 @@ static int cpumsf_pmu_event_init(struct perf_event *event)
return -ENOENT;
}
- /* Check online status of the CPU to which the event is pinned */
- if (event->cpu >= 0 && !cpu_online(event->cpu))
- return -ENODEV;
-
/* Force reset of idle/hv excludes regardless of what the
* user requested.
*/
@@ -971,8 +1017,7 @@ static void cpumsf_pmu_enable(struct pmu *pmu)
err = lsctl(&cpuhw->lsctl);
if (err) {
cpuhw->flags &= ~PMU_F_ENABLED;
- pr_err("Loading sampling controls failed: op %i err %i\n",
- 1, err);
+ pr_err("Loading sampling controls failed: op 1 err %i\n", err);
return;
}
@@ -1006,8 +1051,7 @@ static void cpumsf_pmu_disable(struct pmu *pmu)
err = lsctl(&inactive);
if (err) {
- pr_err("Loading sampling controls failed: op %i err %i\n",
- 2, err);
+ pr_err("Loading sampling controls failed: op 2 err %i\n", err);
return;
}
@@ -1164,11 +1208,11 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt,
struct hws_trailer_entry *te;
struct hws_basic_entry *sample;
- te = (struct hws_trailer_entry *) trailer_entry_ptr(*sdbt);
- sample = (struct hws_basic_entry *) *sdbt;
- while ((unsigned long *) sample < (unsigned long *) te) {
+ te = trailer_entry_ptr((unsigned long)sdbt);
+ sample = (struct hws_basic_entry *)sdbt;
+ while ((unsigned long *)sample < (unsigned long *)te) {
/* Check for an empty sample */
- if (!sample->def)
+ if (!sample->def || sample->LS)
break;
/* Update perf event period */
@@ -1195,7 +1239,7 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt,
"%s: Found unknown"
" sampling data entry: te->f %i"
" basic.def %#4x (%p)\n", __func__,
- te->f, sample->def, sample);
+ te->header.f, sample->def, sample);
/* Sample slot is not yet written or other record.
*
* This condition can occur if the buffer was reused
@@ -1206,7 +1250,7 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt,
* that are not full. Stop processing if the first
* invalid format was detected.
*/
- if (!te->f)
+ if (!te->header.f)
break;
}
@@ -1224,18 +1268,16 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt,
* The sampling buffer position are retrieved and saved in the TEAR_REG
* register of the specified perf event.
*
- * Only full sample-data-blocks are processed. Specify the flash_all flag
- * to also walk through partially filled sample-data-blocks. It is ignored
- * if PERF_CPUM_SF_FULL_BLOCKS is set. The PERF_CPUM_SF_FULL_BLOCKS flag
- * enforces the processing of full sample-data-blocks only (trailer entries
- * with the block-full-indicator bit set).
+ * Only full sample-data-blocks are processed. Specify the flush_all flag
+ * to also walk through partially filled sample-data-blocks.
*/
static void hw_perf_event_update(struct perf_event *event, int flush_all)
{
+ unsigned long long event_overflow, sampl_overflow, num_sdb;
+ union hws_trailer_header old, prev, new;
struct hw_perf_event *hwc = &event->hw;
struct hws_trailer_entry *te;
- unsigned long *sdbt;
- unsigned long long event_overflow, sampl_overflow, num_sdb, te_flags;
+ unsigned long *sdbt, sdb;
int done;
/*
@@ -1245,50 +1287,52 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all)
if (SAMPL_DIAG_MODE(&event->hw))
return;
- if (flush_all && SDB_FULL_BLOCKS(hwc))
- flush_all = 0;
-
- sdbt = (unsigned long *) TEAR_REG(hwc);
+ sdbt = (unsigned long *)TEAR_REG(hwc);
done = event_overflow = sampl_overflow = num_sdb = 0;
while (!done) {
/* Get the trailer entry of the sample-data-block */
- te = (struct hws_trailer_entry *) trailer_entry_ptr(*sdbt);
+ sdb = (unsigned long)phys_to_virt(*sdbt);
+ te = trailer_entry_ptr(sdb);
/* Leave loop if no more work to do (block full indicator) */
- if (!te->f) {
+ if (!te->header.f) {
done = 1;
if (!flush_all)
break;
}
/* Check the sample overflow count */
- if (te->overflow)
+ if (te->header.overflow)
/* Account sample overflows and, if a particular limit
* is reached, extend the sampling buffer.
* For details, see sfb_account_overflows().
*/
- sampl_overflow += te->overflow;
+ sampl_overflow += te->header.overflow;
/* Timestamps are valid for full sample-data-blocks only */
- debug_sprintf_event(sfdbg, 6, "%s: sdbt %#lx "
+ debug_sprintf_event(sfdbg, 6, "%s: sdbt %#lx/%#lx "
"overflow %llu timestamp %#llx\n",
- __func__, (unsigned long)sdbt, te->overflow,
- (te->f) ? trailer_timestamp(te) : 0ULL);
+ __func__, sdb, (unsigned long)sdbt,
+ te->header.overflow,
+ (te->header.f) ? trailer_timestamp(te) : 0ULL);
/* Collect all samples from a single sample-data-block and
* flag if an (perf) event overflow happened. If so, the PMU
* is stopped and remaining samples will be discarded.
*/
- hw_collect_samples(event, sdbt, &event_overflow);
+ hw_collect_samples(event, (unsigned long *)sdb, &event_overflow);
num_sdb++;
/* Reset trailer (using compare-double-and-swap) */
+ prev.val = READ_ONCE_ALIGNED_128(te->header.val);
do {
- te_flags = te->flags & ~SDB_TE_BUFFER_FULL_MASK;
- te_flags |= SDB_TE_ALERT_REQ_MASK;
- } while (!cmpxchg_double(&te->flags, &te->overflow,
- te->flags, te->overflow,
- te_flags, 0ULL));
+ old.val = prev.val;
+ new.val = prev.val;
+ new.f = 0;
+ new.a = 1;
+ new.overflow = 0;
+ prev.val = cmpxchg128(&te->header.val, old.val, new.val);
+ } while (prev.val != old.val);
/* Advance to next sample-data-block */
sdbt++;
@@ -1303,18 +1347,28 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all)
*/
if (flush_all && done)
break;
-
- /* If an event overflow happened, discard samples by
- * processing any remaining sample-data-blocks.
- */
- if (event_overflow)
- flush_all = 1;
}
/* Account sample overflows in the event hardware structure */
if (sampl_overflow)
OVERFLOW_REG(hwc) = DIV_ROUND_UP(OVERFLOW_REG(hwc) +
sampl_overflow, 1 + num_sdb);
+
+ /* perf_event_overflow() and perf_event_account_interrupt() limit
+ * the interrupt rate to an upper bound of roughly 1000 samples per
+ * task tick.
+ * Hitting this limit results in a large number
+ * of throttled PERF_RECORD_THROTTLE entries and the samples
+ * are dropped.
+ * Slightly increase the interval to avoid hitting this limit.
+ */
+ if (event_overflow) {
+ SAMPL_RATE(hwc) += DIV_ROUND_UP(SAMPL_RATE(hwc), 10);
+ debug_sprintf_event(sfdbg, 1, "%s: rate adjustment %ld\n",
+ __func__,
+ DIV_ROUND_UP(SAMPL_RATE(hwc), 10));
+ }
+
if (sampl_overflow || event_overflow)
debug_sprintf_event(sfdbg, 4, "%s: "
"overflows: sample %llu event %llu"
@@ -1323,10 +1377,26 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all)
OVERFLOW_REG(hwc), num_sdb);
}
-#define AUX_SDB_INDEX(aux, i) ((i) % aux->sfb.num_sdb)
-#define AUX_SDB_NUM(aux, start, end) (end >= start ? end - start + 1 : 0)
-#define AUX_SDB_NUM_ALERT(aux) AUX_SDB_NUM(aux, aux->head, aux->alert_mark)
-#define AUX_SDB_NUM_EMPTY(aux) AUX_SDB_NUM(aux, aux->head, aux->empty_mark)
+static inline unsigned long aux_sdb_index(struct aux_buffer *aux,
+ unsigned long i)
+{
+ return i % aux->sfb.num_sdb;
+}
+
+static inline unsigned long aux_sdb_num(unsigned long start, unsigned long end)
+{
+ return end >= start ? end - start + 1 : 0;
+}
+
+static inline unsigned long aux_sdb_num_alert(struct aux_buffer *aux)
+{
+ return aux_sdb_num(aux->head, aux->alert_mark);
+}
+
+static inline unsigned long aux_sdb_num_empty(struct aux_buffer *aux)
+{
+ return aux_sdb_num(aux->head, aux->empty_mark);
+}
/*
* Get trailer entry by index of SDB.
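As an aside on the macro-to-inline conversion in the preceding hunk: the ring arithmetic is unchanged, indices grow monotonically and are reduced modulo the number of SDBs, while aux_sdb_num() counts an inclusive range. A tiny stand-alone sketch of the same arithmetic (the num_sdb value is made up for illustration):

#include <stdio.h>

static unsigned long aux_sdb_index(unsigned long num_sdb, unsigned long i)
{
	return i % num_sdb;	/* map monotonically growing index to ring slot */
}

static unsigned long aux_sdb_num(unsigned long start, unsigned long end)
{
	return end >= start ? end - start + 1 : 0;	/* inclusive count */
}

int main(void)
{
	unsigned long num_sdb = 8;	/* made-up ring size */

	printf("slot %lu\n", aux_sdb_index(num_sdb, 13));	/* 5 */
	printf("count %lu\n", aux_sdb_num(10, 13));		/* 4 */
	return 0;
}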
@@ -1336,9 +1406,9 @@ static struct hws_trailer_entry *aux_sdb_trailer(struct aux_buffer *aux,
{
unsigned long sdb;
- index = AUX_SDB_INDEX(aux, index);
+ index = aux_sdb_index(aux, index);
sdb = aux->sdb_index[index];
- return (struct hws_trailer_entry *)trailer_entry_ptr(sdb);
+ return trailer_entry_ptr(sdb);
}
/*
@@ -1360,10 +1430,10 @@ static void aux_output_end(struct perf_output_handle *handle)
if (!aux)
return;
- range_scan = AUX_SDB_NUM_ALERT(aux);
+ range_scan = aux_sdb_num_alert(aux);
for (i = 0, idx = aux->head; i < range_scan; i++, idx++) {
te = aux_sdb_trailer(aux, idx);
- if (!(te->flags & SDB_TE_BUFFER_FULL_MASK))
+ if (!te->header.f)
break;
}
/* i is num of SDBs which are full */
@@ -1371,9 +1441,10 @@ static void aux_output_end(struct perf_output_handle *handle)
/* Remove alert indicators in the buffer */
te = aux_sdb_trailer(aux, aux->alert_mark);
- te->flags &= ~SDB_TE_ALERT_REQ_MASK;
+ te->header.a = 0;
- debug_sprintf_event(sfdbg, 6, "%s: collect %#lx SDBs\n", __func__, i);
+ debug_sprintf_event(sfdbg, 6, "%s: SDBs %ld range %ld head %ld\n",
+ __func__, i, range_scan, aux->head);
}
/*
@@ -1389,9 +1460,7 @@ static int aux_output_begin(struct perf_output_handle *handle,
struct aux_buffer *aux,
struct cpu_hw_sf *cpuhw)
{
- unsigned long range;
- unsigned long i, range_scan, idx;
- unsigned long head, base, offset;
+ unsigned long range, i, range_scan, idx, head, base, offset;
struct hws_trailer_entry *te;
if (WARN_ON_ONCE(handle->head & ~PAGE_MASK))
@@ -1406,14 +1475,18 @@ static int aux_output_begin(struct perf_output_handle *handle,
* SDBs between aux->head and aux->empty_mark are already ready
* for new data. range_scan is num of SDBs not within them.
*/
- if (range > AUX_SDB_NUM_EMPTY(aux)) {
- range_scan = range - AUX_SDB_NUM_EMPTY(aux);
+ debug_sprintf_event(sfdbg, 6,
+ "%s: range %ld head %ld alert %ld empty %ld\n",
+ __func__, range, aux->head, aux->alert_mark,
+ aux->empty_mark);
+ if (range > aux_sdb_num_empty(aux)) {
+ range_scan = range - aux_sdb_num_empty(aux);
idx = aux->empty_mark + 1;
for (i = 0; i < range_scan; i++, idx++) {
te = aux_sdb_trailer(aux, idx);
- te->flags = te->flags & ~SDB_TE_BUFFER_FULL_MASK;
- te->flags = te->flags & ~SDB_TE_ALERT_REQ_MASK;
- te->overflow = 0;
+ te->header.f = 0;
+ te->header.a = 0;
+ te->header.overflow = 0;
}
/* Save the position of empty SDBs */
aux->empty_mark = aux->head + range - 1;
@@ -1422,24 +1495,20 @@ static int aux_output_begin(struct perf_output_handle *handle,
/* Set alert indicator */
aux->alert_mark = aux->head + range/2 - 1;
te = aux_sdb_trailer(aux, aux->alert_mark);
- te->flags = te->flags | SDB_TE_ALERT_REQ_MASK;
+ te->header.a = 1;
/* Reset hardware buffer head */
- head = AUX_SDB_INDEX(aux, aux->head);
+ head = aux_sdb_index(aux, aux->head);
base = aux->sdbt_index[head / CPUM_SF_SDB_PER_TABLE];
offset = head % CPUM_SF_SDB_PER_TABLE;
- cpuhw->lsctl.tear = base + offset * sizeof(unsigned long);
- cpuhw->lsctl.dear = aux->sdb_index[head];
+ cpuhw->lsctl.tear = virt_to_phys((void *)base) + offset * sizeof(unsigned long);
+ cpuhw->lsctl.dear = virt_to_phys((void *)aux->sdb_index[head]);
- debug_sprintf_event(sfdbg, 6, "%s: "
- "head->alert_mark->empty_mark (num_alert, range)"
- "[%#lx -> %#lx -> %#lx] (%#lx, %#lx) "
- "tear index %#lx, tear %#lx dear %#lx\n", __func__,
+ debug_sprintf_event(sfdbg, 6, "%s: head %ld alert %ld empty %ld "
+ "index %ld tear %#lx dear %#lx\n", __func__,
aux->head, aux->alert_mark, aux->empty_mark,
- AUX_SDB_NUM_ALERT(aux), range,
head / CPUM_SF_SDB_PER_TABLE,
- cpuhw->lsctl.tear,
- cpuhw->lsctl.dear);
+ cpuhw->lsctl.tear, cpuhw->lsctl.dear);
return 0;
}
@@ -1453,15 +1522,16 @@ static int aux_output_begin(struct perf_output_handle *handle,
static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index,
unsigned long long *overflow)
{
- unsigned long long orig_overflow, orig_flags, new_flags;
+ union hws_trailer_header old, prev, new;
struct hws_trailer_entry *te;
te = aux_sdb_trailer(aux, alert_index);
+ prev.val = READ_ONCE_ALIGNED_128(te->header.val);
do {
- orig_flags = te->flags;
- orig_overflow = te->overflow;
- *overflow = orig_overflow;
- if (orig_flags & SDB_TE_BUFFER_FULL_MASK) {
+ old.val = prev.val;
+ new.val = prev.val;
+ *overflow = old.overflow;
+ if (old.f) {
/*
* SDB is already set by hardware.
* Abort and try to set somewhere
@@ -1469,10 +1539,10 @@ static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index,
*/
return false;
}
- new_flags = orig_flags | SDB_TE_ALERT_REQ_MASK;
- } while (!cmpxchg_double(&te->flags, &te->overflow,
- orig_flags, orig_overflow,
- new_flags, 0ULL));
+ new.a = 1;
+ new.overflow = 0;
+ prev.val = cmpxchg128(&te->header.val, old.val, new.val);
+ } while (prev.val != old.val);
return true;
}
@@ -1501,11 +1571,15 @@ static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index,
static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range,
unsigned long long *overflow)
{
- unsigned long long orig_overflow, orig_flags, new_flags;
- unsigned long i, range_scan, idx;
+ unsigned long i, range_scan, idx, idx_old;
+ union hws_trailer_header old, prev, new;
+ unsigned long long orig_overflow;
struct hws_trailer_entry *te;
- if (range <= AUX_SDB_NUM_EMPTY(aux))
+ debug_sprintf_event(sfdbg, 6, "%s: range %ld head %ld alert %ld "
+ "empty %ld\n", __func__, range, aux->head,
+ aux->alert_mark, aux->empty_mark);
+ if (range <= aux_sdb_num_empty(aux))
/*
* No need to scan. All SDBs in range are marked as empty.
* Just set alert indicator. Should check race with hardware
@@ -1526,27 +1600,32 @@ static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range,
* Start scanning from one SDB behind empty_mark. If the new alert
* indicator fall into this range, set it.
*/
- range_scan = range - AUX_SDB_NUM_EMPTY(aux);
- idx = aux->empty_mark + 1;
+ range_scan = range - aux_sdb_num_empty(aux);
+ idx_old = idx = aux->empty_mark + 1;
for (i = 0; i < range_scan; i++, idx++) {
te = aux_sdb_trailer(aux, idx);
+ prev.val = READ_ONCE_ALIGNED_128(te->header.val);
do {
- orig_flags = te->flags;
- orig_overflow = te->overflow;
- new_flags = orig_flags & ~SDB_TE_BUFFER_FULL_MASK;
+ old.val = prev.val;
+ new.val = prev.val;
+ orig_overflow = old.overflow;
+ new.f = 0;
+ new.overflow = 0;
if (idx == aux->alert_mark)
- new_flags |= SDB_TE_ALERT_REQ_MASK;
+ new.a = 1;
else
- new_flags &= ~SDB_TE_ALERT_REQ_MASK;
- } while (!cmpxchg_double(&te->flags, &te->overflow,
- orig_flags, orig_overflow,
- new_flags, 0ULL));
+ new.a = 0;
+ prev.val = cmpxchg128(&te->header.val, old.val, new.val);
+ } while (prev.val != old.val);
*overflow += orig_overflow;
}
/* Update empty_mark to new position */
aux->empty_mark = aux->head + range - 1;
+ debug_sprintf_event(sfdbg, 6, "%s: range_scan %ld idx %ld..%ld "
+ "empty %ld\n", __func__, range_scan, idx_old,
+ idx - 1, aux->empty_mark);
return true;
}
@@ -1567,10 +1646,12 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw)
return;
/* Inform user space new data arrived */
- size = AUX_SDB_NUM_ALERT(aux) << PAGE_SHIFT;
+ size = aux_sdb_num_alert(aux) << PAGE_SHIFT;
+ debug_sprintf_event(sfdbg, 6, "%s: #alert %ld\n", __func__,
+ size >> PAGE_SHIFT);
perf_aux_output_end(handle, size);
- num_sdb = aux->sfb.num_sdb;
+ num_sdb = aux->sfb.num_sdb;
while (!done) {
/* Get an output handle */
aux = perf_aux_output_begin(handle, cpuhw->event);
@@ -1578,9 +1659,6 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw)
pr_err("The AUX buffer with %lu pages for the "
"diagnostic-sampling mode is full\n",
num_sdb);
- debug_sprintf_event(sfdbg, 1,
- "%s: AUX buffer used up\n",
- __func__);
break;
}
if (WARN_ON_ONCE(!aux))
@@ -1602,14 +1680,14 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw)
size = range << PAGE_SHIFT;
perf_aux_output_end(&cpuhw->handle, size);
pr_err("Sample data caused the AUX buffer with %lu "
- "pages to overflow\n", num_sdb);
- debug_sprintf_event(sfdbg, 1, "%s: head %#lx range %#lx "
- "overflow %#llx\n", __func__,
+ "pages to overflow\n", aux->sfb.num_sdb);
+ debug_sprintf_event(sfdbg, 1, "%s: head %ld range %ld "
+ "overflow %lld\n", __func__,
aux->head, range, overflow);
} else {
- size = AUX_SDB_NUM_ALERT(aux) << PAGE_SHIFT;
+ size = aux_sdb_num_alert(aux) << PAGE_SHIFT;
perf_aux_output_end(&cpuhw->handle, size);
- debug_sprintf_event(sfdbg, 6, "%s: head %#lx alert %#lx "
+ debug_sprintf_event(sfdbg, 6, "%s: head %ld alert %ld "
"already full, try another\n",
__func__,
aux->head, aux->alert_mark);
@@ -1617,11 +1695,9 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw)
}
if (done)
- debug_sprintf_event(sfdbg, 6, "%s: aux_reset_buffer "
- "[%#lx -> %#lx -> %#lx] (%#lx, %#lx)\n",
- __func__, aux->head, aux->alert_mark,
- aux->empty_mark, AUX_SDB_NUM_ALERT(aux),
- range);
+ debug_sprintf_event(sfdbg, 6, "%s: head %ld alert %ld "
+ "empty %ld\n", __func__, aux->head,
+ aux->alert_mark, aux->empty_mark);
}
/*
@@ -1644,19 +1720,18 @@ static void aux_buffer_free(void *data)
kfree(aux->sdb_index);
kfree(aux);
- debug_sprintf_event(sfdbg, 4, "%s: free "
- "%lu SDBTs\n", __func__, num_sdbt);
+ debug_sprintf_event(sfdbg, 4, "%s: SDBTs %lu\n", __func__, num_sdbt);
}
static void aux_sdb_init(unsigned long sdb)
{
struct hws_trailer_entry *te;
- te = (struct hws_trailer_entry *)trailer_entry_ptr(sdb);
+ te = trailer_entry_ptr(sdb);
/* Save clock base */
te->clock_base = 1;
- memcpy(&te->progusage2, &tod_clock_base[1], 8);
+ te->progusage2 = tod_clock_base.tod;
}
/*
@@ -1697,13 +1772,13 @@ static void *aux_buffer_setup(struct perf_event *event, void **pages,
}
/* Allocate aux_buffer struct for the event */
- aux = kmalloc(sizeof(struct aux_buffer), GFP_KERNEL);
+ aux = kzalloc(sizeof(struct aux_buffer), GFP_KERNEL);
if (!aux)
goto no_aux;
sfb = &aux->sfb;
/* Allocate sdbt_index for fast reference */
- n_sdbt = (nr_pages + CPUM_SF_SDB_PER_TABLE - 1) / CPUM_SF_SDB_PER_TABLE;
+ n_sdbt = DIV_ROUND_UP(nr_pages, CPUM_SF_SDB_PER_TABLE);
aux->sdbt_index = kmalloc_array(n_sdbt, sizeof(void *), GFP_KERNEL);
if (!aux->sdbt_index)
goto no_sdbt_index;
@@ -1715,7 +1790,7 @@ static void *aux_buffer_setup(struct perf_event *event, void **pages,
/* Allocate the first SDBT */
sfb->num_sdbt = 0;
- sfb->sdbt = (unsigned long *) get_zeroed_page(GFP_KERNEL);
+ sfb->sdbt = (unsigned long *)get_zeroed_page(GFP_KERNEL);
if (!sfb->sdbt)
goto no_sdbt;
aux->sdbt_index[sfb->num_sdbt++] = (unsigned long)sfb->sdbt;
@@ -1727,23 +1802,23 @@ static void *aux_buffer_setup(struct perf_event *event, void **pages,
*/
for (i = 0; i < nr_pages; i++, tail++) {
if (require_table_link(tail)) {
- new = (unsigned long *) get_zeroed_page(GFP_KERNEL);
+ new = (unsigned long *)get_zeroed_page(GFP_KERNEL);
if (!new)
goto no_sdbt;
aux->sdbt_index[sfb->num_sdbt++] = (unsigned long)new;
/* Link current page to tail of chain */
- *tail = (unsigned long)(void *) new + 1;
+ *tail = virt_to_phys(new) + 1;
tail = new;
}
/* Tail is the entry in a SDBT */
- *tail = (unsigned long)pages[i];
+ *tail = virt_to_phys(pages[i]);
aux->sdb_index[i] = (unsigned long)pages[i];
aux_sdb_init((unsigned long)pages[i]);
}
sfb->num_sdb = nr_pages;
/* Link the last entry in the SDBT to the first SDBT */
- *tail = (unsigned long) sfb->sdbt + 1;
+ *tail = virt_to_phys(sfb->sdbt) + 1;
sfb->tail = tail;
/*
@@ -1753,8 +1828,8 @@ static void *aux_buffer_setup(struct perf_event *event, void **pages,
*/
aux->empty_mark = sfb->num_sdb - 1;
- debug_sprintf_event(sfdbg, 4, "%s: setup %lu SDBTs and %lu SDBs\n",
- __func__, sfb->num_sdbt, sfb->num_sdb);
+ debug_sprintf_event(sfdbg, 4, "%s: SDBTs %lu SDBs %lu\n", __func__,
+ sfb->num_sdbt, sfb->num_sdb);
return aux;
@@ -1776,7 +1851,7 @@ static void cpumsf_pmu_read(struct perf_event *event)
/* Nothing to do ... updates are interrupt-driven */
}
-/* Check if the new sampling period/freqeuncy is appropriate.
+/* Check if the new sampling period/frequency is appropriate.
*
* Return non-zero on error and zero on passed checks.
*/
@@ -1883,9 +1958,9 @@ static int cpumsf_pmu_add(struct perf_event *event, int flags)
cpuhw->lsctl.h = 1;
cpuhw->lsctl.interval = SAMPL_RATE(&event->hw);
if (!SAMPL_DIAG_MODE(&event->hw)) {
- cpuhw->lsctl.tear = (unsigned long) cpuhw->sfb.sdbt;
- cpuhw->lsctl.dear = *(unsigned long *) cpuhw->sfb.sdbt;
- TEAR_REG(&event->hw) = (unsigned long) cpuhw->sfb.sdbt;
+ cpuhw->lsctl.tear = virt_to_phys(cpuhw->sfb.sdbt);
+ cpuhw->lsctl.dear = *(unsigned long *)cpuhw->sfb.sdbt;
+ TEAR_REG(&event->hw) = (unsigned long)cpuhw->sfb.sdbt;
}
/* Ensure sampling functions are in the disabled state. If disabled,
@@ -2202,4 +2277,4 @@ out:
}
arch_initcall(init_cpum_sampling_pmu);
-core_param(cpum_sfb_size, CPUM_SF_MAX_SDB, sfb_size, 0640);
+core_param(cpum_sfb_size, CPUM_SF_MAX_SDB, sfb_size, 0644);
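A side note on the allocate_buffers() change in this file: the number of sample-data-blocks is now derived directly from the sampling frequency as CPUM_SF_MIN_SDB + DIV_ROUND_UP(freq, 10000). A small stand-alone sketch of that calculation follows; the CPUM_SF_MIN_SDB and cpu_speed values are illustrative placeholders, not authoritative kernel constants:

#include <stdio.h>

#define USEC_PER_SEC		1000000UL
#define CPUM_SF_MIN_SDB		15UL	/* placeholder; see asm/perf_cpum_sf.h */
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* Mirrors sample_rate_to_freq(): convert a sampling interval to a frequency. */
static unsigned long sample_rate_to_freq(unsigned long cpu_speed, unsigned long rate)
{
	return USEC_PER_SEC * cpu_speed / rate;
}

int main(void)
{
	unsigned long cpu_speed = 5000;	/* illustrative QSI cpu_speed value */
	unsigned long rate = 50000;	/* illustrative sampling interval */
	unsigned long freq = sample_rate_to_freq(cpu_speed, rate);
	/* One extra SDB per 10000 Hz on top of the minimum, as in allocate_buffers(). */
	unsigned long n_sdb = CPUM_SF_MIN_SDB + DIV_ROUND_UP(freq, 10000);

	printf("freq %lu Hz -> n_sdb %lu\n", freq, n_sdb);
	return 0;
}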
diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c
index 1e75cc983546..dfa77da2fd2e 100644
--- a/arch/s390/kernel/perf_event.c
+++ b/arch/s390/kernel/perf_event.c
@@ -15,7 +15,10 @@
#include <linux/export.h>
#include <linux/seq_file.h>
#include <linux/spinlock.h>
+#include <linux/uaccess.h>
+#include <linux/compat.h>
#include <linux/sysfs.h>
+#include <asm/stacktrace.h>
#include <asm/irq.h>
#include <asm/cpu_mf.h>
#include <asm/lowcore.h>
@@ -23,27 +26,6 @@
#include <asm/sysinfo.h>
#include <asm/unwind.h>
-const char *perf_pmu_name(void)
-{
- if (cpum_cf_avail() || cpum_sf_avail())
- return "CPU-Measurement Facilities (CPU-MF)";
- return "pmu";
-}
-EXPORT_SYMBOL(perf_pmu_name);
-
-int perf_num_counters(void)
-{
- int num = 0;
-
- if (cpum_cf_avail())
- num += PERF_CPUM_CF_MAX_CTR;
- if (cpum_sf_avail())
- num += PERF_CPUM_SF_MAX_CTR;
-
- return num;
-}
-EXPORT_SYMBOL(perf_num_counters);
-
static struct kvm_s390_sie_block *sie_block(struct pt_regs *regs)
{
struct stack_frame *stack = (struct stack_frame *) regs->gprs[15];
@@ -51,7 +33,7 @@ static struct kvm_s390_sie_block *sie_block(struct pt_regs *regs)
if (!stack)
return NULL;
- return (struct kvm_s390_sie_block *) stack->empty1[0];
+ return (struct kvm_s390_sie_block *)stack->sie_control_block;
}
static bool is_in_guest(struct pt_regs *regs)
@@ -233,6 +215,44 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
}
}
+void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
+ struct pt_regs *regs)
+{
+ struct stack_frame_user __user *sf;
+ unsigned long ip, sp;
+ bool first = true;
+
+ if (is_compat_task())
+ return;
+ perf_callchain_store(entry, instruction_pointer(regs));
+ sf = (void __user *)user_stack_pointer(regs);
+ pagefault_disable();
+ while (entry->nr < entry->max_stack) {
+ if (__get_user(sp, &sf->back_chain))
+ break;
+ if (__get_user(ip, &sf->gprs[8]))
+ break;
+ if (ip & 0x1) {
+ /*
+ * If the instruction address is invalid, and this
+ * is the first stack frame, assume r14 has not
+ * been written to the stack yet. Otherwise exit.
+ */
+ if (first && !(regs->gprs[14] & 0x1))
+ ip = regs->gprs[14];
+ else
+ break;
+ }
+ perf_callchain_store(entry, ip);
+ /* Sanity check: ABI requires SP to be aligned to 8 bytes. */
+ if (!sp || sp & 0x7)
+ break;
+ sf = (void __user *)sp;
+ first = false;
+ }
+ pagefault_enable();
+}
+
/* Perf definitions for PMU event attributes in sysfs */
ssize_t cpumf_events_sysfs_show(struct device *dev,
struct device_attribute *attr, char *page)
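The new perf_callchain_user() above walks the s390 user-space back chain: each frame holds a pointer to the caller's frame plus the saved return address in the r14 slot, and the walk stops on a null or misaligned stack pointer. Here is a user-space sketch of the same traversal over a synthetic stack; the frame layout is a simplified stand-in, not the kernel's struct stack_frame_user:

#include <stdio.h>

/* Simplified stand-in for a back-chained stack frame. */
struct fake_frame {
	unsigned long back_chain;	/* pointer to the caller's frame, 0 terminates */
	unsigned long ret_addr;		/* saved return address (r14 slot) */
};

int main(void)
{
	struct fake_frame f2 = { 0,                  0 };
	struct fake_frame f1 = { (unsigned long)&f2, 0x10000 };
	struct fake_frame f0 = { (unsigned long)&f1, 0x20000 };
	struct fake_frame *sf = &f0;

	/* Walk until the back chain ends or the stack pointer is misaligned. */
	while (sf) {
		unsigned long sp = sf->back_chain;
		unsigned long ip = sf->ret_addr;

		if (ip)
			printf("ip %#lx\n", ip);
		if (!sp || (sp & 0x7))	/* ABI: stack pointers are 8-byte aligned */
			break;
		sf = (struct fake_frame *)sp;
	}
	return 0;
}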
diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c
new file mode 100644
index 000000000000..bf8a672b15a4
--- /dev/null
+++ b/arch/s390/kernel/perf_pai_crypto.c
@@ -0,0 +1,773 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Performance event support - Processor Activity Instrumentation Facility
+ *
+ * Copyright IBM Corp. 2022
+ * Author(s): Thomas Richter <tmricht@linux.ibm.com>
+ */
+#define KMSG_COMPONENT "pai_crypto"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/kernel_stat.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/io.h>
+#include <linux/perf_event.h>
+#include <asm/ctlreg.h>
+#include <asm/pai.h>
+#include <asm/debug.h>
+
+static debug_info_t *cfm_dbg;
+static unsigned int paicrypt_cnt; /* Size of the mapped counter sets */
+ /* extracted with QPACI instruction */
+
+DEFINE_STATIC_KEY_FALSE(pai_key);
+
+struct pai_userdata {
+ u16 num;
+ u64 value;
+} __packed;
+
+struct paicrypt_map {
+ unsigned long *page; /* Page for CPU to store counters */
+ struct pai_userdata *save; /* Page to store non-zero counters */
+ unsigned int active_events; /* # of PAI crypto users */
+ refcount_t refcnt; /* Reference count mapped buffers */
+ enum paievt_mode mode; /* Type of event */
+ struct perf_event *event; /* Perf event for sampling */
+};
+
+struct paicrypt_mapptr {
+ struct paicrypt_map *mapptr;
+};
+
+static struct paicrypt_root { /* Anchor to per CPU data */
+ refcount_t refcnt; /* Overall active events */
+ struct paicrypt_mapptr __percpu *mapptr;
+} paicrypt_root;
+
+/* Free per CPU data when the last event is removed. */
+static void paicrypt_root_free(void)
+{
+ if (refcount_dec_and_test(&paicrypt_root.refcnt)) {
+ free_percpu(paicrypt_root.mapptr);
+ paicrypt_root.mapptr = NULL;
+ }
+ debug_sprintf_event(cfm_dbg, 5, "%s root.refcount %d\n", __func__,
+ refcount_read(&paicrypt_root.refcnt));
+}
+
+/*
+ * On initialization of the first event also allocate per CPU data dynamically.
+ * Start with an array of pointers; the array size is the maximum number of
+ * CPUs possible, which might be larger than the number of CPUs currently
+ * online.
+ */
+static int paicrypt_root_alloc(void)
+{
+ if (!refcount_inc_not_zero(&paicrypt_root.refcnt)) {
+ /* The memory is already zeroed. */
+ paicrypt_root.mapptr = alloc_percpu(struct paicrypt_mapptr);
+ if (!paicrypt_root.mapptr)
+ return -ENOMEM;
+ refcount_set(&paicrypt_root.refcnt, 1);
+ }
+ return 0;
+}
+
+/* Release the PMU if event is the last perf event */
+static DEFINE_MUTEX(pai_reserve_mutex);
+
+/* Adjust usage counters and remove allocated memory when all users are
+ * gone.
+ */
+static void paicrypt_event_destroy(struct perf_event *event)
+{
+ struct paicrypt_mapptr *mp = per_cpu_ptr(paicrypt_root.mapptr,
+ event->cpu);
+ struct paicrypt_map *cpump = mp->mapptr;
+
+ cpump->event = NULL;
+ static_branch_dec(&pai_key);
+ mutex_lock(&pai_reserve_mutex);
+ debug_sprintf_event(cfm_dbg, 5, "%s event %#llx cpu %d users %d"
+ " mode %d refcnt %u\n", __func__,
+ event->attr.config, event->cpu,
+ cpump->active_events, cpump->mode,
+ refcount_read(&cpump->refcnt));
+ if (refcount_dec_and_test(&cpump->refcnt)) {
+ debug_sprintf_event(cfm_dbg, 4, "%s page %#lx save %p\n",
+ __func__, (unsigned long)cpump->page,
+ cpump->save);
+ free_page((unsigned long)cpump->page);
+ kvfree(cpump->save);
+ kfree(cpump);
+ mp->mapptr = NULL;
+ }
+ paicrypt_root_free();
+ mutex_unlock(&pai_reserve_mutex);
+}
+
+static u64 paicrypt_getctr(unsigned long *page, int nr, bool kernel)
+{
+ if (kernel)
+ nr += PAI_CRYPTO_MAXCTR;
+ return page[nr];
+}
+
+/* Read the counter values. Return value from location in CMP. For event
+ * CRYPTO_ALL sum up all events.
+ */
+static u64 paicrypt_getdata(struct perf_event *event, bool kernel)
+{
+ struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr);
+ struct paicrypt_map *cpump = mp->mapptr;
+ u64 sum = 0;
+ int i;
+
+ if (event->attr.config != PAI_CRYPTO_BASE) {
+ return paicrypt_getctr(cpump->page,
+ event->attr.config - PAI_CRYPTO_BASE,
+ kernel);
+ }
+
+ for (i = 1; i <= paicrypt_cnt; i++) {
+ u64 val = paicrypt_getctr(cpump->page, i, kernel);
+
+ if (!val)
+ continue;
+ sum += val;
+ }
+ return sum;
+}
+
+static u64 paicrypt_getall(struct perf_event *event)
+{
+ u64 sum = 0;
+
+ if (!event->attr.exclude_kernel)
+ sum += paicrypt_getdata(event, true);
+ if (!event->attr.exclude_user)
+ sum += paicrypt_getdata(event, false);
+
+ return sum;
+}
+
+/* Used to avoid races when checking concurrent access of counting and
+ * sampling for crypto events.
+ *
+ * Only one instance of event pai_crypto/CRYPTO_ALL/ for sampling is
+ * allowed and when this event is running, no counting event is allowed.
+ * Several counting events are allowed in parallel, but no sampling event
+ * is allowed while one (or more) counting events are running.
+ *
+ * This function is called in process context and it is safe to block.
+ * When the event initialization function fails, no other callback will
+ * be invoked.
+ *
+ * Allocate the memory for the event.
+ */
+static struct paicrypt_map *paicrypt_busy(struct perf_event *event)
+{
+ struct perf_event_attr *a = &event->attr;
+ struct paicrypt_map *cpump = NULL;
+ struct paicrypt_mapptr *mp;
+ int rc;
+
+ mutex_lock(&pai_reserve_mutex);
+
+ /* Allocate root node */
+ rc = paicrypt_root_alloc();
+ if (rc)
+ goto unlock;
+
+ /* Allocate node for this event */
+ mp = per_cpu_ptr(paicrypt_root.mapptr, event->cpu);
+ cpump = mp->mapptr;
+ if (!cpump) { /* Paicrypt_map allocated? */
+ cpump = kzalloc(sizeof(*cpump), GFP_KERNEL);
+ if (!cpump) {
+ rc = -ENOMEM;
+ goto free_root;
+ }
+ }
+
+ if (a->sample_period) { /* Sampling requested */
+ if (cpump->mode != PAI_MODE_NONE)
+ rc = -EBUSY; /* ... sampling/counting active */
+ } else { /* Counting requested */
+ if (cpump->mode == PAI_MODE_SAMPLING)
+ rc = -EBUSY; /* ... and sampling active */
+ }
+ /*
+ * This error case triggers when there is a conflict:
+ * Either sampling requested and counting already active, or vice
+ * versa. In that case the struct paicrypt_map for this CPU must
+ * already exist, otherwise the error could not have occurred. Only
+ * adjust the root node refcount.
+ */
+ if (rc)
+ goto free_root;
+
+ /* Allocate memory for counter page and counter extraction.
+ * Only the first counting event has to allocate a page.
+ */
+ if (cpump->page) {
+ refcount_inc(&cpump->refcnt);
+ goto unlock;
+ }
+
+ rc = -ENOMEM;
+ cpump->page = (unsigned long *)get_zeroed_page(GFP_KERNEL);
+ if (!cpump->page)
+ goto free_paicrypt_map;
+ cpump->save = kvmalloc_array(paicrypt_cnt + 1,
+ sizeof(struct pai_userdata), GFP_KERNEL);
+ if (!cpump->save) {
+ free_page((unsigned long)cpump->page);
+ cpump->page = NULL;
+ goto free_paicrypt_map;
+ }
+
+ /* Set mode and reference count */
+ rc = 0;
+ refcount_set(&cpump->refcnt, 1);
+ cpump->mode = a->sample_period ? PAI_MODE_SAMPLING : PAI_MODE_COUNTING;
+ mp->mapptr = cpump;
+ debug_sprintf_event(cfm_dbg, 5, "%s sample_period %#llx users %d"
+ " mode %d refcnt %u page %#lx save %p rc %d\n",
+ __func__, a->sample_period, cpump->active_events,
+ cpump->mode, refcount_read(&cpump->refcnt),
+ (unsigned long)cpump->page, cpump->save, rc);
+ goto unlock;
+
+free_paicrypt_map:
+ kfree(cpump);
+ mp->mapptr = NULL;
+free_root:
+ paicrypt_root_free();
+
+unlock:
+ mutex_unlock(&pai_reserve_mutex);
+ return rc ? ERR_PTR(rc) : cpump;
+}
+
+/* Might be called on a different CPU than the one the event is intended for. */
+static int paicrypt_event_init(struct perf_event *event)
+{
+ struct perf_event_attr *a = &event->attr;
+ struct paicrypt_map *cpump;
+
+ /* PAI crypto PMU registered as PERF_TYPE_RAW, check event type */
+ if (a->type != PERF_TYPE_RAW && event->pmu->type != a->type)
+ return -ENOENT;
+ /* PAI crypto event must be in valid range */
+ if (a->config < PAI_CRYPTO_BASE ||
+ a->config > PAI_CRYPTO_BASE + paicrypt_cnt)
+ return -EINVAL;
+ /* Allow only CPU wide operation, no process context for now. */
+ if ((event->attach_state & PERF_ATTACH_TASK) || event->cpu == -1)
+ return -ENOENT;
+ /* Allow only CRYPTO_ALL for sampling. */
+ if (a->sample_period && a->config != PAI_CRYPTO_BASE)
+ return -EINVAL;
+
+ cpump = paicrypt_busy(event);
+ if (IS_ERR(cpump))
+ return PTR_ERR(cpump);
+
+ event->destroy = paicrypt_event_destroy;
+
+ if (a->sample_period) {
+ a->sample_period = 1;
+ a->freq = 0;
+ /* Register for paicrypt_sched_task() to be called */
+ event->attach_state |= PERF_ATTACH_SCHED_CB;
+ /* Add raw data which contain the memory mapped counters */
+ a->sample_type |= PERF_SAMPLE_RAW;
+ /* Turn off inheritance */
+ a->inherit = 0;
+ }
+
+ static_branch_inc(&pai_key);
+ return 0;
+}
+
+static void paicrypt_read(struct perf_event *event)
+{
+ u64 prev, new, delta;
+
+ prev = local64_read(&event->hw.prev_count);
+ new = paicrypt_getall(event);
+ local64_set(&event->hw.prev_count, new);
+ delta = (prev <= new) ? new - prev
+ : (-1ULL - prev) + new + 1; /* overflow */
+ local64_add(delta, &event->count);
+}
+
+static void paicrypt_start(struct perf_event *event, int flags)
+{
+ u64 sum;
+
+ /* Event initialization sets last_tag to 0. When the events are later
+ * deleted and re-added, do not reset the event count value to zero.
+ * Events are added, deleted and re-added when 2 or more events
+ * are active at the same time.
+ */
+ if (!event->attr.sample_period) { /* Counting */
+ if (!event->hw.last_tag) {
+ event->hw.last_tag = 1;
+ sum = paicrypt_getall(event); /* Get current value */
+ local64_set(&event->hw.prev_count, sum);
+ }
+ } else { /* Sampling */
+ perf_sched_cb_inc(event->pmu);
+ }
+}
+
+static int paicrypt_add(struct perf_event *event, int flags)
+{
+ struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr);
+ struct paicrypt_map *cpump = mp->mapptr;
+ unsigned long ccd;
+
+ if (++cpump->active_events == 1) {
+ ccd = virt_to_phys(cpump->page) | PAI_CRYPTO_KERNEL_OFFSET;
+ WRITE_ONCE(S390_lowcore.ccd, ccd);
+ local_ctl_set_bit(0, CR0_CRYPTOGRAPHY_COUNTER_BIT);
+ }
+ cpump->event = event;
+ if (flags & PERF_EF_START)
+ paicrypt_start(event, PERF_EF_RELOAD);
+ event->hw.state = 0;
+ return 0;
+}
+
+static void paicrypt_stop(struct perf_event *event, int flags)
+{
+ if (!event->attr.sample_period) /* Counting */
+ paicrypt_read(event);
+ else /* Sampling */
+ perf_sched_cb_dec(event->pmu);
+ event->hw.state = PERF_HES_STOPPED;
+}
+
+static void paicrypt_del(struct perf_event *event, int flags)
+{
+ struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr);
+ struct paicrypt_map *cpump = mp->mapptr;
+
+ paicrypt_stop(event, PERF_EF_UPDATE);
+ if (--cpump->active_events == 0) {
+ local_ctl_clear_bit(0, CR0_CRYPTOGRAPHY_COUNTER_BIT);
+ WRITE_ONCE(S390_lowcore.ccd, 0);
+ }
+}
+
+/* Create raw data and save it in the buffer. Returns number of bytes copied.
+ * Saves only positive counter entries of the form
+ * 2 bytes: Counter number
+ * 8 bytes: Counter value
+ */
+static size_t paicrypt_copy(struct pai_userdata *userdata, unsigned long *page,
+ bool exclude_user, bool exclude_kernel)
+{
+ int i, outidx = 0;
+
+ for (i = 1; i <= paicrypt_cnt; i++) {
+ u64 val = 0;
+
+ if (!exclude_kernel)
+ val += paicrypt_getctr(page, i, true);
+ if (!exclude_user)
+ val += paicrypt_getctr(page, i, false);
+ if (val) {
+ userdata[outidx].num = i;
+ userdata[outidx].value = val;
+ outidx++;
+ }
+ }
+ return outidx * sizeof(struct pai_userdata);
+}
+
+static int paicrypt_push_sample(size_t rawsize, struct paicrypt_map *cpump,
+ struct perf_event *event)
+{
+ struct perf_sample_data data;
+ struct perf_raw_record raw;
+ struct pt_regs regs;
+ int overflow;
+
+ /* Setup perf sample */
+ memset(&regs, 0, sizeof(regs));
+ memset(&raw, 0, sizeof(raw));
+ memset(&data, 0, sizeof(data));
+ perf_sample_data_init(&data, 0, event->hw.last_period);
+ if (event->attr.sample_type & PERF_SAMPLE_TID) {
+ data.tid_entry.pid = task_tgid_nr(current);
+ data.tid_entry.tid = task_pid_nr(current);
+ }
+ if (event->attr.sample_type & PERF_SAMPLE_TIME)
+ data.time = event->clock();
+ if (event->attr.sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
+ data.id = event->id;
+ if (event->attr.sample_type & PERF_SAMPLE_CPU) {
+ data.cpu_entry.cpu = smp_processor_id();
+ data.cpu_entry.reserved = 0;
+ }
+ if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+ raw.frag.size = rawsize;
+ raw.frag.data = cpump->save;
+ perf_sample_save_raw_data(&data, &raw);
+ }
+
+ overflow = perf_event_overflow(event, &data, &regs);
+ perf_event_update_userpage(event);
+ /* Clear lowcore page after read */
+ memset(cpump->page, 0, PAGE_SIZE);
+ return overflow;
+}
+
+/* Check if there is data to be saved on schedule out of a task. */
+static int paicrypt_have_sample(void)
+{
+ struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr);
+ struct paicrypt_map *cpump = mp->mapptr;
+ struct perf_event *event = cpump->event;
+ size_t rawsize;
+ int rc = 0;
+
+ if (!event) /* No event active */
+ return 0;
+ rawsize = paicrypt_copy(cpump->save, cpump->page,
+ cpump->event->attr.exclude_user,
+ cpump->event->attr.exclude_kernel);
+ if (rawsize) /* Counters were incremented */
+ rc = paicrypt_push_sample(rawsize, cpump, event);
+ return rc;
+}
+
+/* Called on schedule-in and schedule-out. No access to event structure,
+ * but for sampling only event CRYPTO_ALL is allowed.
+ */
+static void paicrypt_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
+{
+ /* We started with a clean page on event installation. So read out
+ * results on schedule_out and if page was dirty, clear values.
+ */
+ if (!sched_in)
+ paicrypt_have_sample();
+}
+
+/* Attribute definitions for paicrypt interface. As with other CPU
+ * Measurement Facilities, there is one attribute per mapped counter.
+ * The number of mapped counters may vary per machine generation. Use
+ * the QUERY PROCESSOR ACTIVITY COUNTER INFORMATION (QPACI) instruction
+ * to determine the number of mapped counters. The instruction returns
+ * a positive number, which is the highest number of supported counters.
+ * All counters less than this number are also supported; there are no
+ * holes. A returned number of zero means no support for mapped counters.
+ *
+ * The identification of the counter is a unique number. The chosen range
+ * is 0x1000 + offset in mapped kernel page.
+ * All CPU Measurement Facility counter identifiers must be unique and
+ * the numbers from 0 to 496 are already used for the CPU Measurement
+ * Counter facility. Numbers 0xb0000, 0xbc000 and 0xbd000 are already
+ * used for the CPU Measurement Sampling facility.
+ */
+PMU_FORMAT_ATTR(event, "config:0-63");
+
+static struct attribute *paicrypt_format_attr[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+
+static struct attribute_group paicrypt_events_group = {
+ .name = "events",
+ .attrs = NULL /* Filled in attr_event_init() */
+};
+
+static struct attribute_group paicrypt_format_group = {
+ .name = "format",
+ .attrs = paicrypt_format_attr,
+};
+
+static const struct attribute_group *paicrypt_attr_groups[] = {
+ &paicrypt_events_group,
+ &paicrypt_format_group,
+ NULL,
+};
+
+/* Performance monitoring unit for mapped counters */
+static struct pmu paicrypt = {
+ .task_ctx_nr = perf_invalid_context,
+ .event_init = paicrypt_event_init,
+ .add = paicrypt_add,
+ .del = paicrypt_del,
+ .start = paicrypt_start,
+ .stop = paicrypt_stop,
+ .read = paicrypt_read,
+ .sched_task = paicrypt_sched_task,
+ .attr_groups = paicrypt_attr_groups
+};
+
+/* List of symbolic PAI counter names. */
+static const char * const paicrypt_ctrnames[] = {
+ [0] = "CRYPTO_ALL",
+ [1] = "KM_DEA",
+ [2] = "KM_TDEA_128",
+ [3] = "KM_TDEA_192",
+ [4] = "KM_ENCRYPTED_DEA",
+ [5] = "KM_ENCRYPTED_TDEA_128",
+ [6] = "KM_ENCRYPTED_TDEA_192",
+ [7] = "KM_AES_128",
+ [8] = "KM_AES_192",
+ [9] = "KM_AES_256",
+ [10] = "KM_ENCRYPTED_AES_128",
+ [11] = "KM_ENCRYPTED_AES_192",
+ [12] = "KM_ENCRYPTED_AES_256",
+ [13] = "KM_XTS_AES_128",
+ [14] = "KM_XTS_AES_256",
+ [15] = "KM_XTS_ENCRYPTED_AES_128",
+ [16] = "KM_XTS_ENCRYPTED_AES_256",
+ [17] = "KMC_DEA",
+ [18] = "KMC_TDEA_128",
+ [19] = "KMC_TDEA_192",
+ [20] = "KMC_ENCRYPTED_DEA",
+ [21] = "KMC_ENCRYPTED_TDEA_128",
+ [22] = "KMC_ENCRYPTED_TDEA_192",
+ [23] = "KMC_AES_128",
+ [24] = "KMC_AES_192",
+ [25] = "KMC_AES_256",
+ [26] = "KMC_ENCRYPTED_AES_128",
+ [27] = "KMC_ENCRYPTED_AES_192",
+ [28] = "KMC_ENCRYPTED_AES_256",
+ [29] = "KMC_PRNG",
+ [30] = "KMA_GCM_AES_128",
+ [31] = "KMA_GCM_AES_192",
+ [32] = "KMA_GCM_AES_256",
+ [33] = "KMA_GCM_ENCRYPTED_AES_128",
+ [34] = "KMA_GCM_ENCRYPTED_AES_192",
+ [35] = "KMA_GCM_ENCRYPTED_AES_256",
+ [36] = "KMF_DEA",
+ [37] = "KMF_TDEA_128",
+ [38] = "KMF_TDEA_192",
+ [39] = "KMF_ENCRYPTED_DEA",
+ [40] = "KMF_ENCRYPTED_TDEA_128",
+ [41] = "KMF_ENCRYPTED_TDEA_192",
+ [42] = "KMF_AES_128",
+ [43] = "KMF_AES_192",
+ [44] = "KMF_AES_256",
+ [45] = "KMF_ENCRYPTED_AES_128",
+ [46] = "KMF_ENCRYPTED_AES_192",
+ [47] = "KMF_ENCRYPTED_AES_256",
+ [48] = "KMCTR_DEA",
+ [49] = "KMCTR_TDEA_128",
+ [50] = "KMCTR_TDEA_192",
+ [51] = "KMCTR_ENCRYPTED_DEA",
+ [52] = "KMCTR_ENCRYPTED_TDEA_128",
+ [53] = "KMCTR_ENCRYPTED_TDEA_192",
+ [54] = "KMCTR_AES_128",
+ [55] = "KMCTR_AES_192",
+ [56] = "KMCTR_AES_256",
+ [57] = "KMCTR_ENCRYPTED_AES_128",
+ [58] = "KMCTR_ENCRYPTED_AES_192",
+ [59] = "KMCTR_ENCRYPTED_AES_256",
+ [60] = "KMO_DEA",
+ [61] = "KMO_TDEA_128",
+ [62] = "KMO_TDEA_192",
+ [63] = "KMO_ENCRYPTED_DEA",
+ [64] = "KMO_ENCRYPTED_TDEA_128",
+ [65] = "KMO_ENCRYPTED_TDEA_192",
+ [66] = "KMO_AES_128",
+ [67] = "KMO_AES_192",
+ [68] = "KMO_AES_256",
+ [69] = "KMO_ENCRYPTED_AES_128",
+ [70] = "KMO_ENCRYPTED_AES_192",
+ [71] = "KMO_ENCRYPTED_AES_256",
+ [72] = "KIMD_SHA_1",
+ [73] = "KIMD_SHA_256",
+ [74] = "KIMD_SHA_512",
+ [75] = "KIMD_SHA3_224",
+ [76] = "KIMD_SHA3_256",
+ [77] = "KIMD_SHA3_384",
+ [78] = "KIMD_SHA3_512",
+ [79] = "KIMD_SHAKE_128",
+ [80] = "KIMD_SHAKE_256",
+ [81] = "KIMD_GHASH",
+ [82] = "KLMD_SHA_1",
+ [83] = "KLMD_SHA_256",
+ [84] = "KLMD_SHA_512",
+ [85] = "KLMD_SHA3_224",
+ [86] = "KLMD_SHA3_256",
+ [87] = "KLMD_SHA3_384",
+ [88] = "KLMD_SHA3_512",
+ [89] = "KLMD_SHAKE_128",
+ [90] = "KLMD_SHAKE_256",
+ [91] = "KMAC_DEA",
+ [92] = "KMAC_TDEA_128",
+ [93] = "KMAC_TDEA_192",
+ [94] = "KMAC_ENCRYPTED_DEA",
+ [95] = "KMAC_ENCRYPTED_TDEA_128",
+ [96] = "KMAC_ENCRYPTED_TDEA_192",
+ [97] = "KMAC_AES_128",
+ [98] = "KMAC_AES_192",
+ [99] = "KMAC_AES_256",
+ [100] = "KMAC_ENCRYPTED_AES_128",
+ [101] = "KMAC_ENCRYPTED_AES_192",
+ [102] = "KMAC_ENCRYPTED_AES_256",
+ [103] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_DEA",
+ [104] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_TDEA_128",
+ [105] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_TDEA_192",
+ [106] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_DEA",
+ [107] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_TDEA_128",
+ [108] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_TDEA_192",
+ [109] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_128",
+ [110] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_192",
+ [111] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_256",
+ [112] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_128",
+ [113] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_192",
+ [114] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_256A",
+ [115] = "PCC_COMPUTE_XTS_PARAMETER_USING_AES_128",
+ [116] = "PCC_COMPUTE_XTS_PARAMETER_USING_AES_256",
+ [117] = "PCC_COMPUTE_XTS_PARAMETER_USING_ENCRYPTED_AES_128",
+ [118] = "PCC_COMPUTE_XTS_PARAMETER_USING_ENCRYPTED_AES_256",
+ [119] = "PCC_SCALAR_MULTIPLY_P256",
+ [120] = "PCC_SCALAR_MULTIPLY_P384",
+ [121] = "PCC_SCALAR_MULTIPLY_P521",
+ [122] = "PCC_SCALAR_MULTIPLY_ED25519",
+ [123] = "PCC_SCALAR_MULTIPLY_ED448",
+ [124] = "PCC_SCALAR_MULTIPLY_X25519",
+ [125] = "PCC_SCALAR_MULTIPLY_X448",
+ [126] = "PRNO_SHA_512_DRNG",
+ [127] = "PRNO_TRNG_QUERY_RAW_TO_CONDITIONED_RATIO",
+ [128] = "PRNO_TRNG",
+ [129] = "KDSA_ECDSA_VERIFY_P256",
+ [130] = "KDSA_ECDSA_VERIFY_P384",
+ [131] = "KDSA_ECDSA_VERIFY_P521",
+ [132] = "KDSA_ECDSA_SIGN_P256",
+ [133] = "KDSA_ECDSA_SIGN_P384",
+ [134] = "KDSA_ECDSA_SIGN_P521",
+ [135] = "KDSA_ENCRYPTED_ECDSA_SIGN_P256",
+ [136] = "KDSA_ENCRYPTED_ECDSA_SIGN_P384",
+ [137] = "KDSA_ENCRYPTED_ECDSA_SIGN_P521",
+ [138] = "KDSA_EDDSA_VERIFY_ED25519",
+ [139] = "KDSA_EDDSA_VERIFY_ED448",
+ [140] = "KDSA_EDDSA_SIGN_ED25519",
+ [141] = "KDSA_EDDSA_SIGN_ED448",
+ [142] = "KDSA_ENCRYPTED_EDDSA_SIGN_ED25519",
+ [143] = "KDSA_ENCRYPTED_EDDSA_SIGN_ED448",
+ [144] = "PCKMO_ENCRYPT_DEA_KEY",
+ [145] = "PCKMO_ENCRYPT_TDEA_128_KEY",
+ [146] = "PCKMO_ENCRYPT_TDEA_192_KEY",
+ [147] = "PCKMO_ENCRYPT_AES_128_KEY",
+ [148] = "PCKMO_ENCRYPT_AES_192_KEY",
+ [149] = "PCKMO_ENCRYPT_AES_256_KEY",
+ [150] = "PCKMO_ENCRYPT_ECC_P256_KEY",
+ [151] = "PCKMO_ENCRYPT_ECC_P384_KEY",
+ [152] = "PCKMO_ENCRYPT_ECC_P521_KEY",
+ [153] = "PCKMO_ENCRYPT_ECC_ED25519_KEY",
+ [154] = "PCKMO_ENCRYPT_ECC_ED448_KEY",
+ [155] = "IBM_RESERVED_155",
+ [156] = "IBM_RESERVED_156",
+};
+
+static void __init attr_event_free(struct attribute **attrs, int num)
+{
+ struct perf_pmu_events_attr *pa;
+ int i;
+
+ for (i = 0; i < num; i++) {
+ struct device_attribute *dap;
+
+ dap = container_of(attrs[i], struct device_attribute, attr);
+ pa = container_of(dap, struct perf_pmu_events_attr, attr);
+ kfree(pa);
+ }
+ kfree(attrs);
+}
+
+static int __init attr_event_init_one(struct attribute **attrs, int num)
+{
+ struct perf_pmu_events_attr *pa;
+
+ pa = kzalloc(sizeof(*pa), GFP_KERNEL);
+ if (!pa)
+ return -ENOMEM;
+
+ sysfs_attr_init(&pa->attr.attr);
+ pa->id = PAI_CRYPTO_BASE + num;
+ pa->attr.attr.name = paicrypt_ctrnames[num];
+ pa->attr.attr.mode = 0444;
+ pa->attr.show = cpumf_events_sysfs_show;
+ pa->attr.store = NULL;
+ attrs[num] = &pa->attr.attr;
+ return 0;
+}
+
+/* Create PMU sysfs event attributes on the fly. */
+static int __init attr_event_init(void)
+{
+ struct attribute **attrs;
+ int ret, i;
+
+ attrs = kmalloc_array(ARRAY_SIZE(paicrypt_ctrnames) + 1, sizeof(*attrs),
+ GFP_KERNEL);
+ if (!attrs)
+ return -ENOMEM;
+ for (i = 0; i < ARRAY_SIZE(paicrypt_ctrnames); i++) {
+ ret = attr_event_init_one(attrs, i);
+ if (ret) {
+ attr_event_free(attrs, i);
+ return ret;
+ }
+ }
+ attrs[i] = NULL;
+ paicrypt_events_group.attrs = attrs;
+ return 0;
+}
+
+static int __init paicrypt_init(void)
+{
+ struct qpaci_info_block ib;
+ int rc;
+
+ if (!test_facility(196))
+ return 0;
+
+ qpaci(&ib);
+ paicrypt_cnt = ib.num_cc;
+ if (paicrypt_cnt == 0)
+ return 0;
+ if (paicrypt_cnt >= PAI_CRYPTO_MAXCTR)
+ paicrypt_cnt = PAI_CRYPTO_MAXCTR - 1;
+
+ rc = attr_event_init(); /* Export known PAI crypto events */
+ if (rc) {
+ pr_err("Creation of PMU pai_crypto /sysfs failed\n");
+ return rc;
+ }
+
+ /* Setup s390dbf facility */
+ cfm_dbg = debug_register(KMSG_COMPONENT, 2, 256, 128);
+ if (!cfm_dbg) {
+ pr_err("Registration of s390dbf pai_crypto failed\n");
+ return -ENOMEM;
+ }
+ debug_register_view(cfm_dbg, &debug_sprintf_view);
+
+ rc = perf_pmu_register(&paicrypt, "pai_crypto", -1);
+ if (rc) {
+ pr_err("Registering the pai_crypto PMU failed with rc=%i\n",
+ rc);
+ debug_unregister_view(cfm_dbg, &debug_sprintf_view);
+ debug_unregister(cfm_dbg);
+ return rc;
+ }
+ return 0;
+}
+
+device_initcall(paicrypt_init);
diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c
new file mode 100644
index 000000000000..af7f2b538c8f
--- /dev/null
+++ b/arch/s390/kernel/perf_pai_ext.c
@@ -0,0 +1,671 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Performance event support - Processor Activity Instrumentation Extension
+ * Facility
+ *
+ * Copyright IBM Corp. 2022
+ * Author(s): Thomas Richter <tmricht@linux.ibm.com>
+ */
+#define KMSG_COMPONENT "pai_ext"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/kernel_stat.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/io.h>
+#include <linux/perf_event.h>
+#include <asm/ctlreg.h>
+#include <asm/pai.h>
+#include <asm/debug.h>
+
+#define PAIE1_CB_SZ 0x200 /* Size of PAIE1 control block */
+#define PAIE1_CTRBLOCK_SZ 0x400 /* Size of PAIE1 counter blocks */
+
+static debug_info_t *paiext_dbg;
+static unsigned int paiext_cnt; /* Extracted with QPACI instruction */
+
+struct pai_userdata {
+ u16 num;
+ u64 value;
+} __packed;
+
+/* Create the PAI extension 1 control block area.
+ * The PAI extension control block 1 is pointed to by lowcore
+ * address 0x1508 for each CPU. This control block is 512 bytes in size
+ * and requires a 512 byte boundary alignment.
+ */
+struct paiext_cb { /* PAI extension 1 control block */
+ u64 header; /* Not used */
+ u64 reserved1;
+ u64 acc; /* Addr to analytics counter control block */
+ u8 reserved2[488];
+} __packed;
+
+struct paiext_map {
+ unsigned long *area; /* Area for CPU to store counters */
+ struct pai_userdata *save; /* Area to store non-zero counters */
+ enum paievt_mode mode; /* Type of event */
+ unsigned int active_events; /* # of PAI Extension users */
+ refcount_t refcnt;
+ struct perf_event *event; /* Perf event for sampling */
+ struct paiext_cb *paiext_cb; /* PAI extension control block area */
+};
+
+struct paiext_mapptr {
+ struct paiext_map *mapptr;
+};
+
+static struct paiext_root { /* Anchor to per CPU data */
+ refcount_t refcnt; /* Overall active events */
+ struct paiext_mapptr __percpu *mapptr;
+} paiext_root;
+
+/* Free per CPU data when the last event is removed. */
+static void paiext_root_free(void)
+{
+ if (refcount_dec_and_test(&paiext_root.refcnt)) {
+ free_percpu(paiext_root.mapptr);
+ paiext_root.mapptr = NULL;
+ }
+}
+
+/* When the first event is initialized, also allocate the per CPU data
+ * dynamically. Start with an array of pointers; the array size is the
+ * maximum number of CPUs possible, which might be larger than the number
+ * of CPUs currently online.
+ */
+static int paiext_root_alloc(void)
+{
+ if (!refcount_inc_not_zero(&paiext_root.refcnt)) {
+ /* The memory is already zeroed. */
+ paiext_root.mapptr = alloc_percpu(struct paiext_mapptr);
+ if (!paiext_root.mapptr) {
+ /* Returning without refcnt adjustment is ok. The
+ * error code is handled by paiext_alloc() which
+ * decrements refcnt when an event cannot be
+ * created.
+ */
+ return -ENOMEM;
+ }
+ refcount_set(&paiext_root.refcnt, 1);
+ }
+ return 0;
+}
+
+/* Protects against concurrent increments of the sampler and counter
+ * reference counts and prohibits concurrent execution of counting and
+ * sampling events.
+ * Ensures that the analytics counter block is deallocated only when no
+ * sampling or counting event is active on that CPU.
+ * For details see paiext_alloc().
+ */
+static DEFINE_MUTEX(paiext_reserve_mutex);
+
+/* Free all memory allocated for event counting/sampling setup */
+static void paiext_free(struct paiext_mapptr *mp)
+{
+ kfree(mp->mapptr->area);
+ kfree(mp->mapptr->paiext_cb);
+ kvfree(mp->mapptr->save);
+ kfree(mp->mapptr);
+ mp->mapptr = NULL;
+}
+
+/* Release the PMU if event is the last perf event */
+static void paiext_event_destroy(struct perf_event *event)
+{
+ struct paiext_mapptr *mp = per_cpu_ptr(paiext_root.mapptr, event->cpu);
+ struct paiext_map *cpump = mp->mapptr;
+
+ mutex_lock(&paiext_reserve_mutex);
+ cpump->event = NULL;
+ if (refcount_dec_and_test(&cpump->refcnt)) /* Last reference gone */
+ paiext_free(mp);
+ paiext_root_free();
+ mutex_unlock(&paiext_reserve_mutex);
+ debug_sprintf_event(paiext_dbg, 4, "%s cpu %d mapptr %p\n", __func__,
+ event->cpu, mp->mapptr);
+
+}
+
+/* Used to avoid races when checking for concurrent counting and sampling
+ * access to pai_extension events.
+ *
+ * Only one instance of event pai_ext/NNPA_ALL/ for sampling is
+ * allowed and when this event is running, no counting event is allowed.
+ * Several counting events are allowed in parallel, but no sampling event
+ * is allowed while one (or more) counting events are running.
+ *
+ * This function is called in process context and it is safe to block.
+ * When the event initialization function fails, no other callback will
+ * be invoked.
+ *
+ * Allocate the memory for the event.
+ */
+static int paiext_alloc(struct perf_event_attr *a, struct perf_event *event)
+{
+ struct paiext_mapptr *mp;
+ struct paiext_map *cpump;
+ int rc;
+
+ mutex_lock(&paiext_reserve_mutex);
+
+ rc = paiext_root_alloc();
+ if (rc)
+ goto unlock;
+
+ mp = per_cpu_ptr(paiext_root.mapptr, event->cpu);
+ cpump = mp->mapptr;
+ if (!cpump) { /* Paiext_map allocated? */
+ rc = -ENOMEM;
+ cpump = kzalloc(sizeof(*cpump), GFP_KERNEL);
+ if (!cpump)
+ goto undo;
+
+ /* Allocate memory for counter area and counter extraction.
+ * These are:
+ * - a 512 byte block that requires 512 byte boundary alignment.
+ * - a 1 KB block that requires 1 KB boundary alignment.
+ * Only the first counting event has to allocate the area.
+ *
+ * Note: This works with commit 59bb47985c1d by default.
+ * Backporting this to kernels without this commit might
+ * need adjustment.
+ */
+ mp->mapptr = cpump;
+ cpump->area = kzalloc(PAIE1_CTRBLOCK_SZ, GFP_KERNEL);
+ cpump->paiext_cb = kzalloc(PAIE1_CB_SZ, GFP_KERNEL);
+ cpump->save = kvmalloc_array(paiext_cnt + 1,
+ sizeof(struct pai_userdata),
+ GFP_KERNEL);
+ if (!cpump->save || !cpump->area || !cpump->paiext_cb) {
+ paiext_free(mp);
+ goto undo;
+ }
+ refcount_set(&cpump->refcnt, 1);
+ cpump->mode = a->sample_period ? PAI_MODE_SAMPLING
+ : PAI_MODE_COUNTING;
+ } else {
+ /* Multiple invocations: check what is active.
+ * Supported are multiple counter events or only one sampling
+ * event concurrently at any one time.
+ */
+ if (cpump->mode == PAI_MODE_SAMPLING ||
+ (cpump->mode == PAI_MODE_COUNTING && a->sample_period)) {
+ rc = -EBUSY;
+ goto undo;
+ }
+ refcount_inc(&cpump->refcnt);
+ }
+
+ rc = 0;
+ cpump->event = event;
+
+undo:
+ if (rc) {
+ /* Error in allocation of event, decrement anchor. Since
+ * the event is not created, its destroy() function is never
+ * invoked. Adjust the reference counter for the anchor.
+ */
+ paiext_root_free();
+ }
+unlock:
+ mutex_unlock(&paiext_reserve_mutex);
+ /* If rc is non-zero, no increment of counter/sampler was done. */
+ return rc;
+}
+
+/* The PAI extension 1 control block supports up to 128 entries. Return
+ * the index within PAIE1_CB given the event number. Also validate event
+ * number.
+ */
+static int paiext_event_valid(struct perf_event *event)
+{
+ u64 cfg = event->attr.config;
+
+ if (cfg >= PAI_NNPA_BASE && cfg <= PAI_NNPA_BASE + paiext_cnt) {
+ /* Offset NNPA in paiext_cb */
+ event->hw.config_base = offsetof(struct paiext_cb, acc);
+ return 0;
+ }
+ return -EINVAL;
+}
+
+/* Might be called on a different CPU than the one the event is intended for. */
+static int paiext_event_init(struct perf_event *event)
+{
+ struct perf_event_attr *a = &event->attr;
+ int rc;
+
+ /* PMU pai_ext registered as PERF_TYPE_RAW, check event type */
+ if (a->type != PERF_TYPE_RAW && event->pmu->type != a->type)
+ return -ENOENT;
+ /* PAI extension event must be valid and in supported range */
+ rc = paiext_event_valid(event);
+ if (rc)
+ return rc;
+ /* Allow only CPU wide operation, no process context for now. */
+ if ((event->attach_state & PERF_ATTACH_TASK) || event->cpu == -1)
+ return -ENOENT;
+ /* Allow only event NNPA_ALL for sampling. */
+ if (a->sample_period && a->config != PAI_NNPA_BASE)
+ return -EINVAL;
+ /* Prohibit exclude_user event selection */
+ if (a->exclude_user)
+ return -EINVAL;
+
+ rc = paiext_alloc(a, event);
+ if (rc)
+ return rc;
+ event->destroy = paiext_event_destroy;
+
+ if (a->sample_period) {
+ a->sample_period = 1;
+ a->freq = 0;
+ /* Register for paiext_sched_task() to be called */
+ event->attach_state |= PERF_ATTACH_SCHED_CB;
+ /* Add raw data which are the memory mapped counters */
+ a->sample_type |= PERF_SAMPLE_RAW;
+ /* Turn off inheritance */
+ a->inherit = 0;
+ }
+
+ return 0;
+}
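
The constraints enforced above (CPU-bound events only, exclude_user rejected, sampling restricted to NNPA_ALL) determine how user space has to open a pai_ext event. Below is a minimal counting-mode sketch, assuming the dynamic PMU type is read from sysfs and that NNPA_ALL corresponds to config 0x1800 (PAI_NNPA_BASE) as described in the counter identification comment later in this file; the program is illustrative and not part of the patch.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	unsigned long long count;
	unsigned int type;
	FILE *f;
	int fd;

	/* Dynamic PMU type assigned by perf_pmu_register() */
	f = fopen("/sys/bus/event_source/devices/pai_ext/type", "r");
	if (!f)
		return 1;
	if (fscanf(f, "%u", &type) != 1) {
		fclose(f);
		return 1;
	}
	fclose(f);

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = type;
	attr.config = 0x1800;	/* assumed PAI_NNPA_BASE: event NNPA_ALL */
	/* pid == -1, cpu == 0: only CPU-wide operation is accepted */
	fd = perf_event_open(&attr, -1, 0, -1, 0);
	if (fd < 0)
		return 1;
	sleep(1);
	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("NNPA_ALL: %llu\n", count);
	close(fd);
	return 0;
}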
+
+static u64 paiext_getctr(unsigned long *area, int nr)
+{
+ return area[nr];
+}
+
+/* Read the counter values. Return the value from the counter's location in
+ * the buffer. For event NNPA_ALL sum up all counters.
+ */
+static u64 paiext_getdata(struct perf_event *event)
+{
+ struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
+ struct paiext_map *cpump = mp->mapptr;
+ u64 sum = 0;
+ int i;
+
+ if (event->attr.config != PAI_NNPA_BASE)
+ return paiext_getctr(cpump->area,
+ event->attr.config - PAI_NNPA_BASE);
+
+ for (i = 1; i <= paiext_cnt; i++)
+ sum += paiext_getctr(cpump->area, i);
+
+ return sum;
+}
+
+static u64 paiext_getall(struct perf_event *event)
+{
+ return paiext_getdata(event);
+}
+
+static void paiext_read(struct perf_event *event)
+{
+ u64 prev, new, delta;
+
+ prev = local64_read(&event->hw.prev_count);
+ new = paiext_getall(event);
+ local64_set(&event->hw.prev_count, new);
+ delta = new - prev;
+ local64_add(delta, &event->count);
+}
+
+static void paiext_start(struct perf_event *event, int flags)
+{
+ u64 sum;
+
+ if (!event->attr.sample_period) { /* Counting */
+ if (!event->hw.last_tag) {
+ event->hw.last_tag = 1;
+ sum = paiext_getall(event); /* Get current value */
+ local64_set(&event->hw.prev_count, sum);
+ }
+ } else { /* Sampling */
+ perf_sched_cb_inc(event->pmu);
+ }
+}
+
+static int paiext_add(struct perf_event *event, int flags)
+{
+ struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
+ struct paiext_map *cpump = mp->mapptr;
+ struct paiext_cb *pcb = cpump->paiext_cb;
+
+ if (++cpump->active_events == 1) {
+ S390_lowcore.aicd = virt_to_phys(cpump->paiext_cb);
+ pcb->acc = virt_to_phys(cpump->area) | 0x1;
+ /* Enable CPU instruction lookup for PAIE1 control block */
+ local_ctl_set_bit(0, CR0_PAI_EXTENSION_BIT);
+ debug_sprintf_event(paiext_dbg, 4, "%s 1508 %llx acc %llx\n",
+ __func__, S390_lowcore.aicd, pcb->acc);
+ }
+ cpump->event = event;
+ if (flags & PERF_EF_START)
+ paiext_start(event, PERF_EF_RELOAD);
+ event->hw.state = 0;
+ return 0;
+}
+
+static void paiext_stop(struct perf_event *event, int flags)
+{
+ if (!event->attr.sample_period) /* Counting */
+ paiext_read(event);
+ else /* Sampling */
+ perf_sched_cb_dec(event->pmu);
+ event->hw.state = PERF_HES_STOPPED;
+}
+
+static void paiext_del(struct perf_event *event, int flags)
+{
+ struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
+ struct paiext_map *cpump = mp->mapptr;
+ struct paiext_cb *pcb = cpump->paiext_cb;
+
+ paiext_stop(event, PERF_EF_UPDATE);
+ if (--cpump->active_events == 0) {
+ /* Disable CPU instruction lookup for PAIE1 control block */
+ local_ctl_clear_bit(0, CR0_PAI_EXTENSION_BIT);
+ pcb->acc = 0;
+ S390_lowcore.aicd = 0;
+ debug_sprintf_event(paiext_dbg, 4, "%s 1508 %llx acc %llx\n",
+ __func__, S390_lowcore.aicd, pcb->acc);
+ }
+}
+
+/* Create raw data and save it in the buffer. Returns the number of bytes
+ * copied. Saves only nonzero counter entries of the form
+ * 2 bytes: Number of the counter
+ * 8 bytes: Value of the counter
+ */
+static size_t paiext_copy(struct pai_userdata *userdata, unsigned long *area)
+{
+ int i, outidx = 0;
+
+ for (i = 1; i <= paiext_cnt; i++) {
+ u64 val = paiext_getctr(area, i);
+
+ if (val) {
+ userdata[outidx].num = i;
+ userdata[outidx].value = val;
+ outidx++;
+ }
+ }
+ return outidx * sizeof(*userdata);
+}
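
For consumers of the PERF_SAMPLE_RAW payload written by paiext_copy(), the data is simply a packed sequence of these records. A minimal decoding sketch follows; the structure and helper names are illustrative and not part of the patch.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Mirrors struct pai_userdata: 2 byte counter number, 8 byte value, packed */
struct pai_userdata_raw {
	uint16_t num;
	uint64_t value;
} __attribute__((packed));

static void decode_pai_raw(const void *buf, size_t rawsize)
{
	struct pai_userdata_raw rec;
	size_t off;

	for (off = 0; off + sizeof(rec) <= rawsize; off += sizeof(rec)) {
		memcpy(&rec, (const char *)buf + off, sizeof(rec));
		printf("counter %u = %llu\n", (unsigned int)rec.num,
		       (unsigned long long)rec.value);
	}
}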
+
+/* Write a sample when one or more counter values are nonzero.
+ *
+ * Note: The functions paiext_sched_task() and paiext_push_sample() are not
+ * invoked after function paiext_del() has been called because of function
+ * perf_sched_cb_dec().
+ * The functions paiext_sched_task() and paiext_push_sample() are only
+ * called when sampling is active. Function perf_sched_cb_inc()
+ * has been invoked to install function paiext_sched_task() as callback
+ * to run at context switch time (see paiext_add()).
+ *
+ * This causes function perf_event_context_sched_out() and
+ * perf_event_context_sched_in() to check whether the PMU has installed a
+ * sched_task() callback. That callback is not active after paiext_del()
+ * returns and has deleted the event on that CPU.
+ */
+static int paiext_push_sample(size_t rawsize, struct paiext_map *cpump,
+ struct perf_event *event)
+{
+ struct perf_sample_data data;
+ struct perf_raw_record raw;
+ struct pt_regs regs;
+ int overflow;
+
+ /* Setup perf sample */
+ memset(&regs, 0, sizeof(regs));
+ memset(&raw, 0, sizeof(raw));
+ memset(&data, 0, sizeof(data));
+ perf_sample_data_init(&data, 0, event->hw.last_period);
+ if (event->attr.sample_type & PERF_SAMPLE_TID) {
+ data.tid_entry.pid = task_tgid_nr(current);
+ data.tid_entry.tid = task_pid_nr(current);
+ }
+ if (event->attr.sample_type & PERF_SAMPLE_TIME)
+ data.time = event->clock();
+ if (event->attr.sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
+ data.id = event->id;
+ if (event->attr.sample_type & PERF_SAMPLE_CPU)
+ data.cpu_entry.cpu = smp_processor_id();
+ if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+ raw.frag.size = rawsize;
+ raw.frag.data = cpump->save;
+ perf_sample_save_raw_data(&data, &raw);
+ }
+
+ overflow = perf_event_overflow(event, &data, &regs);
+ perf_event_update_userpage(event);
+ /* Clear lowcore area after read */
+ memset(cpump->area, 0, PAIE1_CTRBLOCK_SZ);
+ return overflow;
+}
+
+/* Check if there is data to be saved on schedule out of a task. */
+static int paiext_have_sample(void)
+{
+ struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
+ struct paiext_map *cpump = mp->mapptr;
+ struct perf_event *event = cpump->event;
+ size_t rawsize;
+ int rc = 0;
+
+ if (!event)
+ return 0;
+ rawsize = paiext_copy(cpump->save, cpump->area);
+ if (rawsize) /* Incremented counters */
+ rc = paiext_push_sample(rawsize, cpump, event);
+ return rc;
+}
+
+/* Called on schedule-in and schedule-out. No access to event structure,
+ * but for sampling only event NNPA_ALL is allowed.
+ */
+static void paiext_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
+{
+ /* We started with a clean page on event installation. So read out
+ * results on schedule_out and if page was dirty, clear values.
+ */
+ if (!sched_in)
+ paiext_have_sample();
+}
+
+/* Attribute definitions for the PAI extension 1 interface. As with other CPU
+ * Measurement Facilities, there is one attribute per mapped counter.
+ * The number of mapped counters may vary per machine generation. Use
+ * the QUERY PROCESSOR ACTIVITY COUNTER INFORMATION (QPACI) instruction
+ * to determine the number of mapped counters. The instruction returns
+ * a positive number, which is the highest number of supported counters.
+ * All counters less than this number are also supported; there are no
+ * holes. A returned number of zero means no support for mapped counters.
+ *
+ * The identification of the counter is a unique number. The chosen range
+ * is 0x1800 + offset in the mapped kernel page.
+ * All CPU Measurement Facility counter identifiers must be unique and
+ * the numbers from 0 to 496 are already used for the CPU Measurement
+ * Counter facility. Numbers 0x1000 to 0x103e are used for PAI cryptography
+ * counters.
+ * Numbers 0xb0000, 0xbc000 and 0xbd000 are already
+ * used for the CPU Measurement Sampling facility.
+ */
+PMU_FORMAT_ATTR(event, "config:0-63");
+
+static struct attribute *paiext_format_attr[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+
+static struct attribute_group paiext_events_group = {
+ .name = "events",
+ .attrs = NULL, /* Filled in attr_event_init() */
+};
+
+static struct attribute_group paiext_format_group = {
+ .name = "format",
+ .attrs = paiext_format_attr,
+};
+
+static const struct attribute_group *paiext_attr_groups[] = {
+ &paiext_events_group,
+ &paiext_format_group,
+ NULL,
+};
+
+/* Performance monitoring unit for mapped counters */
+static struct pmu paiext = {
+ .task_ctx_nr = perf_invalid_context,
+ .event_init = paiext_event_init,
+ .add = paiext_add,
+ .del = paiext_del,
+ .start = paiext_start,
+ .stop = paiext_stop,
+ .read = paiext_read,
+ .sched_task = paiext_sched_task,
+ .attr_groups = paiext_attr_groups,
+};
+
+/* List of symbolic PAI extension 1 NNPA counter names. */
+static const char * const paiext_ctrnames[] = {
+ [0] = "NNPA_ALL",
+ [1] = "NNPA_ADD",
+ [2] = "NNPA_SUB",
+ [3] = "NNPA_MUL",
+ [4] = "NNPA_DIV",
+ [5] = "NNPA_MIN",
+ [6] = "NNPA_MAX",
+ [7] = "NNPA_LOG",
+ [8] = "NNPA_EXP",
+ [9] = "NNPA_IBM_RESERVED_9",
+ [10] = "NNPA_RELU",
+ [11] = "NNPA_TANH",
+ [12] = "NNPA_SIGMOID",
+ [13] = "NNPA_SOFTMAX",
+ [14] = "NNPA_BATCHNORM",
+ [15] = "NNPA_MAXPOOL2D",
+ [16] = "NNPA_AVGPOOL2D",
+ [17] = "NNPA_LSTMACT",
+ [18] = "NNPA_GRUACT",
+ [19] = "NNPA_CONVOLUTION",
+ [20] = "NNPA_MATMUL_OP",
+ [21] = "NNPA_MATMUL_OP_BCAST23",
+ [22] = "NNPA_SMALLBATCH",
+ [23] = "NNPA_LARGEDIM",
+ [24] = "NNPA_SMALLTENSOR",
+ [25] = "NNPA_1MFRAME",
+ [26] = "NNPA_2GFRAME",
+ [27] = "NNPA_ACCESSEXCEPT",
+};
+
+static void __init attr_event_free(struct attribute **attrs, int num)
+{
+ struct perf_pmu_events_attr *pa;
+ struct device_attribute *dap;
+ int i;
+
+ for (i = 0; i < num; i++) {
+ dap = container_of(attrs[i], struct device_attribute, attr);
+ pa = container_of(dap, struct perf_pmu_events_attr, attr);
+ kfree(pa);
+ }
+ kfree(attrs);
+}
+
+static int __init attr_event_init_one(struct attribute **attrs, int num)
+{
+ struct perf_pmu_events_attr *pa;
+
+ pa = kzalloc(sizeof(*pa), GFP_KERNEL);
+ if (!pa)
+ return -ENOMEM;
+
+ sysfs_attr_init(&pa->attr.attr);
+ pa->id = PAI_NNPA_BASE + num;
+ pa->attr.attr.name = paiext_ctrnames[num];
+ pa->attr.attr.mode = 0444;
+ pa->attr.show = cpumf_events_sysfs_show;
+ pa->attr.store = NULL;
+ attrs[num] = &pa->attr.attr;
+ return 0;
+}
+
+/* Create PMU sysfs event attributes on the fly. */
+static int __init attr_event_init(void)
+{
+ struct attribute **attrs;
+ int ret, i;
+
+ attrs = kmalloc_array(ARRAY_SIZE(paiext_ctrnames) + 1, sizeof(*attrs),
+ GFP_KERNEL);
+ if (!attrs)
+ return -ENOMEM;
+ for (i = 0; i < ARRAY_SIZE(paiext_ctrnames); i++) {
+ ret = attr_event_init_one(attrs, i);
+ if (ret) {
+ attr_event_free(attrs, i);
+ return ret;
+ }
+ }
+ attrs[i] = NULL;
+ paiext_events_group.attrs = attrs;
+ return 0;
+}
+
+static int __init paiext_init(void)
+{
+ struct qpaci_info_block ib;
+ int rc = -ENOMEM;
+
+ if (!test_facility(197))
+ return 0;
+
+ qpaci(&ib);
+ paiext_cnt = ib.num_nnpa;
+ if (paiext_cnt >= PAI_NNPA_MAXCTR)
+ paiext_cnt = PAI_NNPA_MAXCTR;
+ if (!paiext_cnt)
+ return 0;
+
+ rc = attr_event_init();
+ if (rc) {
+ pr_err("Creation of PMU " KMSG_COMPONENT " /sysfs failed\n");
+ return rc;
+ }
+
+ /* Setup s390dbf facility */
+ paiext_dbg = debug_register(KMSG_COMPONENT, 2, 256, 128);
+ if (!paiext_dbg) {
+ pr_err("Registration of s390dbf " KMSG_COMPONENT " failed\n");
+ rc = -ENOMEM;
+ goto out_init;
+ }
+ debug_register_view(paiext_dbg, &debug_sprintf_view);
+
+ rc = perf_pmu_register(&paiext, KMSG_COMPONENT, -1);
+ if (rc) {
+ pr_err("Registration of " KMSG_COMPONENT " PMU failed with "
+ "rc=%i\n", rc);
+ goto out_pmu;
+ }
+
+ return 0;
+
+out_pmu:
+ debug_unregister_view(paiext_dbg, &debug_sprintf_view);
+ debug_unregister(paiext_dbg);
+out_init:
+ attr_event_free(paiext_events_group.attrs,
+ ARRAY_SIZE(paiext_ctrnames) + 1);
+ return rc;
+}
+
+device_initcall(paiext_init);
diff --git a/arch/s390/kernel/perf_regs.c b/arch/s390/kernel/perf_regs.c
index 4352a504f235..3d93656bd948 100644
--- a/arch/s390/kernel/perf_regs.c
+++ b/arch/s390/kernel/perf_regs.c
@@ -20,8 +20,10 @@ u64 perf_reg_value(struct pt_regs *regs, int idx)
return 0;
idx -= PERF_REG_S390_FP0;
- fp = MACHINE_HAS_VX ? *(freg_t *)(current->thread.fpu.vxrs + idx)
- : current->thread.fpu.fprs[idx];
+ if (cpu_has_vx())
+ fp = *(freg_t *)(current->thread.fpu.vxrs + idx);
+ else
+ fp = current->thread.fpu.fprs[idx];
return fp.ui;
}
@@ -53,8 +55,7 @@ u64 perf_reg_abi(struct task_struct *task)
}
void perf_get_regs_user(struct perf_regs *regs_user,
- struct pt_regs *regs,
- struct pt_regs *regs_user_copy)
+ struct pt_regs *regs)
{
/*
* Use the regs from the first interruption and let
diff --git a/arch/s390/kernel/pgm_check.S b/arch/s390/kernel/pgm_check.S
deleted file mode 100644
index 59dee9d3bebf..000000000000
--- a/arch/s390/kernel/pgm_check.S
+++ /dev/null
@@ -1,147 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Program check table.
- *
- * Copyright IBM Corp. 2012
- */
-
-#include <linux/linkage.h>
-
-#define PGM_CHECK(handler) .quad handler
-#define PGM_CHECK_DEFAULT PGM_CHECK(default_trap_handler)
-
-/*
- * The program check table contains exactly 128 (0x00-0x7f) entries. Each
- * line defines the function to be called corresponding to the program check
- * interruption code.
- */
-.section .rodata, "a"
-ENTRY(pgm_check_table)
-PGM_CHECK_DEFAULT /* 00 */
-PGM_CHECK(illegal_op) /* 01 */
-PGM_CHECK(privileged_op) /* 02 */
-PGM_CHECK(execute_exception) /* 03 */
-PGM_CHECK(do_protection_exception) /* 04 */
-PGM_CHECK(addressing_exception) /* 05 */
-PGM_CHECK(specification_exception) /* 06 */
-PGM_CHECK(data_exception) /* 07 */
-PGM_CHECK(overflow_exception) /* 08 */
-PGM_CHECK(divide_exception) /* 09 */
-PGM_CHECK(overflow_exception) /* 0a */
-PGM_CHECK(divide_exception) /* 0b */
-PGM_CHECK(hfp_overflow_exception) /* 0c */
-PGM_CHECK(hfp_underflow_exception) /* 0d */
-PGM_CHECK(hfp_significance_exception) /* 0e */
-PGM_CHECK(hfp_divide_exception) /* 0f */
-PGM_CHECK(do_dat_exception) /* 10 */
-PGM_CHECK(do_dat_exception) /* 11 */
-PGM_CHECK(translation_exception) /* 12 */
-PGM_CHECK(special_op_exception) /* 13 */
-PGM_CHECK_DEFAULT /* 14 */
-PGM_CHECK(operand_exception) /* 15 */
-PGM_CHECK_DEFAULT /* 16 */
-PGM_CHECK_DEFAULT /* 17 */
-PGM_CHECK(transaction_exception) /* 18 */
-PGM_CHECK_DEFAULT /* 19 */
-PGM_CHECK_DEFAULT /* 1a */
-PGM_CHECK(vector_exception) /* 1b */
-PGM_CHECK(space_switch_exception) /* 1c */
-PGM_CHECK(hfp_sqrt_exception) /* 1d */
-PGM_CHECK_DEFAULT /* 1e */
-PGM_CHECK_DEFAULT /* 1f */
-PGM_CHECK_DEFAULT /* 20 */
-PGM_CHECK_DEFAULT /* 21 */
-PGM_CHECK_DEFAULT /* 22 */
-PGM_CHECK_DEFAULT /* 23 */
-PGM_CHECK_DEFAULT /* 24 */
-PGM_CHECK_DEFAULT /* 25 */
-PGM_CHECK_DEFAULT /* 26 */
-PGM_CHECK_DEFAULT /* 27 */
-PGM_CHECK_DEFAULT /* 28 */
-PGM_CHECK_DEFAULT /* 29 */
-PGM_CHECK_DEFAULT /* 2a */
-PGM_CHECK_DEFAULT /* 2b */
-PGM_CHECK_DEFAULT /* 2c */
-PGM_CHECK_DEFAULT /* 2d */
-PGM_CHECK_DEFAULT /* 2e */
-PGM_CHECK_DEFAULT /* 2f */
-PGM_CHECK_DEFAULT /* 30 */
-PGM_CHECK_DEFAULT /* 31 */
-PGM_CHECK_DEFAULT /* 32 */
-PGM_CHECK_DEFAULT /* 33 */
-PGM_CHECK_DEFAULT /* 34 */
-PGM_CHECK_DEFAULT /* 35 */
-PGM_CHECK_DEFAULT /* 36 */
-PGM_CHECK_DEFAULT /* 37 */
-PGM_CHECK(do_dat_exception) /* 38 */
-PGM_CHECK(do_dat_exception) /* 39 */
-PGM_CHECK(do_dat_exception) /* 3a */
-PGM_CHECK(do_dat_exception) /* 3b */
-PGM_CHECK_DEFAULT /* 3c */
-PGM_CHECK_DEFAULT /* 3d */
-PGM_CHECK_DEFAULT /* 3e */
-PGM_CHECK_DEFAULT /* 3f */
-PGM_CHECK_DEFAULT /* 40 */
-PGM_CHECK_DEFAULT /* 41 */
-PGM_CHECK_DEFAULT /* 42 */
-PGM_CHECK_DEFAULT /* 43 */
-PGM_CHECK_DEFAULT /* 44 */
-PGM_CHECK_DEFAULT /* 45 */
-PGM_CHECK_DEFAULT /* 46 */
-PGM_CHECK_DEFAULT /* 47 */
-PGM_CHECK_DEFAULT /* 48 */
-PGM_CHECK_DEFAULT /* 49 */
-PGM_CHECK_DEFAULT /* 4a */
-PGM_CHECK_DEFAULT /* 4b */
-PGM_CHECK_DEFAULT /* 4c */
-PGM_CHECK_DEFAULT /* 4d */
-PGM_CHECK_DEFAULT /* 4e */
-PGM_CHECK_DEFAULT /* 4f */
-PGM_CHECK_DEFAULT /* 50 */
-PGM_CHECK_DEFAULT /* 51 */
-PGM_CHECK_DEFAULT /* 52 */
-PGM_CHECK_DEFAULT /* 53 */
-PGM_CHECK_DEFAULT /* 54 */
-PGM_CHECK_DEFAULT /* 55 */
-PGM_CHECK_DEFAULT /* 56 */
-PGM_CHECK_DEFAULT /* 57 */
-PGM_CHECK_DEFAULT /* 58 */
-PGM_CHECK_DEFAULT /* 59 */
-PGM_CHECK_DEFAULT /* 5a */
-PGM_CHECK_DEFAULT /* 5b */
-PGM_CHECK_DEFAULT /* 5c */
-PGM_CHECK_DEFAULT /* 5d */
-PGM_CHECK_DEFAULT /* 5e */
-PGM_CHECK_DEFAULT /* 5f */
-PGM_CHECK_DEFAULT /* 60 */
-PGM_CHECK_DEFAULT /* 61 */
-PGM_CHECK_DEFAULT /* 62 */
-PGM_CHECK_DEFAULT /* 63 */
-PGM_CHECK_DEFAULT /* 64 */
-PGM_CHECK_DEFAULT /* 65 */
-PGM_CHECK_DEFAULT /* 66 */
-PGM_CHECK_DEFAULT /* 67 */
-PGM_CHECK_DEFAULT /* 68 */
-PGM_CHECK_DEFAULT /* 69 */
-PGM_CHECK_DEFAULT /* 6a */
-PGM_CHECK_DEFAULT /* 6b */
-PGM_CHECK_DEFAULT /* 6c */
-PGM_CHECK_DEFAULT /* 6d */
-PGM_CHECK_DEFAULT /* 6e */
-PGM_CHECK_DEFAULT /* 6f */
-PGM_CHECK_DEFAULT /* 70 */
-PGM_CHECK_DEFAULT /* 71 */
-PGM_CHECK_DEFAULT /* 72 */
-PGM_CHECK_DEFAULT /* 73 */
-PGM_CHECK_DEFAULT /* 74 */
-PGM_CHECK_DEFAULT /* 75 */
-PGM_CHECK_DEFAULT /* 76 */
-PGM_CHECK_DEFAULT /* 77 */
-PGM_CHECK_DEFAULT /* 78 */
-PGM_CHECK_DEFAULT /* 79 */
-PGM_CHECK_DEFAULT /* 7a */
-PGM_CHECK_DEFAULT /* 7b */
-PGM_CHECK_DEFAULT /* 7c */
-PGM_CHECK_DEFAULT /* 7d */
-PGM_CHECK_DEFAULT /* 7e */
-PGM_CHECK_DEFAULT /* 7f */
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 6ccef5f29761..4e3b366589fb 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -29,8 +29,9 @@
#include <linux/random.h>
#include <linux/export.h>
#include <linux/init_task.h>
+#include <linux/entry-common.h>
+#include <linux/io.h>
#include <asm/cpu_mf.h>
-#include <asm/io.h>
#include <asm/processor.h>
#include <asm/vtimer.h>
#include <asm/exec.h>
@@ -43,9 +44,22 @@
#include <asm/unwind.h>
#include "entry.h"
-asmlinkage void ret_from_fork(void) asm ("ret_from_fork");
+void ret_from_fork(void) asm("ret_from_fork");
-extern void kernel_thread_starter(void);
+void __ret_from_fork(struct task_struct *prev, struct pt_regs *regs)
+{
+ void (*func)(void *arg);
+
+ schedule_tail(prev);
+
+ if (!user_mode(regs)) {
+ /* Kernel thread */
+ func = (void *)regs->gprs[9];
+ func((void *)regs->gprs[10]);
+ }
+ clear_pt_regs_flag(regs, PIF_SYSCALL);
+ syscall_exit_to_user_mode(regs);
+}
void flush_thread(void)
{
@@ -75,14 +89,28 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
*/
save_fpu_regs();
- memcpy(dst, src, arch_task_struct_size);
+ *dst = *src;
dst->thread.fpu.regs = dst->thread.fpu.fprs;
+
+ /*
+ * Don't transfer over the runtime instrumentation or the guarded
+ * storage control block pointers. These fields are cleared here instead
+ * of in copy_thread() to avoid premature freeing of associated memory
+ * on fork() failure. Wait to clear the RI flag because ->stack still
+ * refers to the source thread.
+ */
+ dst->thread.ri_cb = NULL;
+ dst->thread.gs_cb = NULL;
+ dst->thread.gs_bc_cb = NULL;
+
return 0;
}
-int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp,
- unsigned long arg, struct task_struct *p, unsigned long tls)
+int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
+ unsigned long clone_flags = args->flags;
+ unsigned long new_stackp = args->stack;
+ unsigned long tls = args->tls;
struct fake_frame
{
struct stack_frame sf;
@@ -94,7 +122,6 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp,
/* Save access registers to new thread structure. */
save_access_regs(&p->thread.acrs[0]);
/* start new process with ar4 pointing to the correct address space */
- p->thread.mm_segment = get_fs();
/* Don't copy debug registers */
memset(&p->thread.per_user, 0, sizeof(p->thread.per_user));
memset(&p->thread.per_event, 0, sizeof(p->thread.per_event));
@@ -106,26 +133,26 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp,
p->thread.system_timer = 0;
p->thread.hardirq_timer = 0;
p->thread.softirq_timer = 0;
+ p->thread.last_break = 1;
frame->sf.back_chain = 0;
+ frame->sf.gprs[11 - 6] = (unsigned long)&frame->childregs;
+ frame->sf.gprs[12 - 6] = (unsigned long)p;
/* new return point is ret_from_fork */
- frame->sf.gprs[8] = (unsigned long) ret_from_fork;
+ frame->sf.gprs[14 - 6] = (unsigned long)ret_from_fork;
/* fake return stack for resume(), don't go back to schedule */
- frame->sf.gprs[9] = (unsigned long) frame;
+ frame->sf.gprs[15 - 6] = (unsigned long)frame;
/* Store access registers to kernel stack of new process. */
- if (unlikely(p->flags & PF_KTHREAD)) {
+ if (unlikely(args->fn)) {
/* kernel thread */
memset(&frame->childregs, 0, sizeof(struct pt_regs));
- frame->childregs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_DAT |
- PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK;
- frame->childregs.psw.addr =
- (unsigned long) kernel_thread_starter;
- frame->childregs.gprs[9] = new_stackp; /* function */
- frame->childregs.gprs[10] = arg;
- frame->childregs.gprs[11] = (unsigned long) do_exit;
+ frame->childregs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_IO |
+ PSW_MASK_EXT | PSW_MASK_MCHECK;
+ frame->childregs.gprs[9] = (unsigned long)args->fn;
+ frame->childregs.gprs[10] = (unsigned long)args->fn_arg;
frame->childregs.orig_gpr2 = -1;
-
+ frame->childregs.last_break = 1;
return 0;
}
frame->childregs = *current_pt_regs();
@@ -133,13 +160,11 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp,
frame->childregs.flags = 0;
if (new_stackp)
frame->childregs.gprs[15] = new_stackp;
-
- /* Don't copy runtime instrumentation info */
- p->thread.ri_cb = NULL;
+ /*
+ * Clear the runtime instrumentation flag after the above childregs
+ * copy. The CB pointer was already cleared in arch_dup_task_struct().
+ */
frame->childregs.psw.mask &= ~PSW_MASK_RI;
- /* Don't copy guarded storage control block */
- p->thread.gs_cb = NULL;
- p->thread.gs_bc_cb = NULL;
/* Set a new TLS ? */
if (clone_flags & CLONE_SETTLS) {
@@ -150,39 +175,27 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp,
p->thread.acrs[1] = (unsigned int)tls;
}
}
+ /*
+ * s390 stores the svc return address in arch_data when calling
+ * sigreturn()/restart_syscall() via vdso. 1 means no valid address
+ * stored.
+ */
+ p->restart_block.arch_data = 1;
return 0;
}
-asmlinkage void execve_tail(void)
+void execve_tail(void)
{
current->thread.fpu.fpc = 0;
asm volatile("sfpc %0" : : "d" (0));
}
-/*
- * fill in the FPU structure for a core dump.
- */
-int dump_fpu (struct pt_regs * regs, s390_fp_regs *fpregs)
-{
- save_fpu_regs();
- fpregs->fpc = current->thread.fpu.fpc;
- fpregs->pad = 0;
- if (MACHINE_HAS_VX)
- convert_vx_to_fp((freg_t *)&fpregs->fprs,
- current->thread.fpu.vxrs);
- else
- memcpy(&fpregs->fprs, current->thread.fpu.fprs,
- sizeof(fpregs->fprs));
- return 1;
-}
-EXPORT_SYMBOL(dump_fpu);
-
-unsigned long get_wchan(struct task_struct *p)
+unsigned long __get_wchan(struct task_struct *p)
{
struct unwind_state state;
unsigned long ip = 0;
- if (!p || p == current || p->state == TASK_RUNNING || !task_stack_page(p))
+ if (!task_stack_page(p))
return 0;
if (!try_get_task_stack(p))
@@ -209,13 +222,13 @@ unsigned long get_wchan(struct task_struct *p)
unsigned long arch_align_stack(unsigned long sp)
{
if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
- sp -= get_random_int() & ~PAGE_MASK;
+ sp -= get_random_u32_below(PAGE_SIZE);
return sp & ~0xf;
}
static inline unsigned long brk_rnd(void)
{
- return (get_random_int() & BRK_RND_MASK) << PAGE_SHIFT;
+ return (get_random_u16() & BRK_RND_MASK) << PAGE_SHIFT;
}
unsigned long arch_randomize_brk(struct mm_struct *mm)
@@ -225,16 +238,3 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
ret = PAGE_ALIGN(mm->brk + brk_rnd());
return (ret > mm->brk) ? ret : mm->brk;
}
-
-void set_fs_fixup(void)
-{
- struct pt_regs *regs = current_pt_regs();
- static bool warned;
-
- set_fs(USER_DS);
- if (warned)
- return;
- WARN(1, "Unbalanced set_fs - int code: 0x%x\n", regs->int_code);
- show_registers(regs);
- warned = true;
-}
diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c
index 6ebc2117c66c..65c1464eea4f 100644
--- a/arch/s390/kernel/processor.c
+++ b/arch/s390/kernel/processor.c
@@ -8,9 +8,9 @@
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
#include <linux/stop_machine.h>
-#include <linux/cpufeature.h>
#include <linux/bitops.h>
#include <linux/kernel.h>
+#include <linux/random.h>
#include <linux/sched/mm.h>
#include <linux/init.h>
#include <linux/seq_file.h>
@@ -23,8 +23,12 @@
#include <asm/elf.h>
#include <asm/lowcore.h>
#include <asm/param.h>
+#include <asm/sclp.h>
#include <asm/smp.h>
+unsigned long __read_mostly elf_hwcap;
+char elf_platform[ELF_PLATFORM_SIZE];
+
struct cpu_info {
unsigned int cpu_mhz_dynamic;
unsigned int cpu_mhz_static;
@@ -91,23 +95,12 @@ void cpu_init(void)
enter_lazy_tlb(&init_mm, current);
}
-/*
- * cpu_have_feature - Test CPU features on module initialization
- */
-int cpu_have_feature(unsigned int num)
-{
- return elf_hwcap & (1UL << num);
-}
-EXPORT_SYMBOL(cpu_have_feature);
-
static void show_facilities(struct seq_file *m)
{
unsigned int bit;
- long *facilities;
- facilities = (long *)&S390_lowcore.stfle_fac_list;
seq_puts(m, "facilities :");
- for_each_set_bit_inv(bit, facilities, MAX_FACILITY_BIT)
+ for_each_set_bit_inv(bit, (long *)&stfle_fac_list, MAX_FACILITY_BIT)
seq_printf(m, " %d", bit);
seq_putc(m, '\n');
}
@@ -115,15 +108,33 @@ static void show_facilities(struct seq_file *m)
static void show_cpu_summary(struct seq_file *m, void *v)
{
static const char *hwcap_str[] = {
- "esan3", "zarch", "stfle", "msa", "ldisp", "eimm", "dfp",
- "edat", "etf3eh", "highgprs", "te", "vx", "vxd", "vxe", "gs",
- "vxe2", "vxp", "sort", "dflt"
- };
- static const char * const int_hwcap_str[] = {
- "sie"
+ [HWCAP_NR_ESAN3] = "esan3",
+ [HWCAP_NR_ZARCH] = "zarch",
+ [HWCAP_NR_STFLE] = "stfle",
+ [HWCAP_NR_MSA] = "msa",
+ [HWCAP_NR_LDISP] = "ldisp",
+ [HWCAP_NR_EIMM] = "eimm",
+ [HWCAP_NR_DFP] = "dfp",
+ [HWCAP_NR_HPAGE] = "edat",
+ [HWCAP_NR_ETF3EH] = "etf3eh",
+ [HWCAP_NR_HIGH_GPRS] = "highgprs",
+ [HWCAP_NR_TE] = "te",
+ [HWCAP_NR_VXRS] = "vx",
+ [HWCAP_NR_VXRS_BCD] = "vxd",
+ [HWCAP_NR_VXRS_EXT] = "vxe",
+ [HWCAP_NR_GS] = "gs",
+ [HWCAP_NR_VXRS_EXT2] = "vxe2",
+ [HWCAP_NR_VXRS_PDE] = "vxp",
+ [HWCAP_NR_SORT] = "sort",
+ [HWCAP_NR_DFLT] = "dflt",
+ [HWCAP_NR_VXRS_PDE2] = "vxp2",
+ [HWCAP_NR_NNPA] = "nnpa",
+ [HWCAP_NR_PCI_MIO] = "pcimio",
+ [HWCAP_NR_SIE] = "sie",
};
int i, cpu;
+ BUILD_BUG_ON(ARRAY_SIZE(hwcap_str) != HWCAP_NR_MAX);
seq_printf(m, "vendor_id : IBM/S390\n"
"# processors : %i\n"
"bogomips per cpu: %lu.%02lu\n",
@@ -134,9 +145,6 @@ static void show_cpu_summary(struct seq_file *m, void *v)
for (i = 0; i < ARRAY_SIZE(hwcap_str); i++)
if (hwcap_str[i] && (elf_hwcap & (1UL << i)))
seq_printf(m, "%s ", hwcap_str[i]);
- for (i = 0; i < ARRAY_SIZE(int_hwcap_str); i++)
- if (int_hwcap_str[i] && (int_hwcap & (1UL << i)))
- seq_printf(m, "%s ", int_hwcap_str[i]);
seq_puts(m, "\n");
show_facilities(m);
show_cacheinfo(m);
@@ -151,10 +159,155 @@ static void show_cpu_summary(struct seq_file *m, void *v)
}
}
+static int __init setup_hwcaps(void)
+{
+ /* instructions named N3, "backported" to esa-mode */
+ elf_hwcap |= HWCAP_ESAN3;
+
+ /* z/Architecture mode active */
+ elf_hwcap |= HWCAP_ZARCH;
+
+ /* store-facility-list-extended */
+ if (test_facility(7))
+ elf_hwcap |= HWCAP_STFLE;
+
+ /* message-security assist */
+ if (test_facility(17))
+ elf_hwcap |= HWCAP_MSA;
+
+ /* long-displacement */
+ if (test_facility(19))
+ elf_hwcap |= HWCAP_LDISP;
+
+ /* extended-immediate */
+ elf_hwcap |= HWCAP_EIMM;
+
+ /* extended-translation facility 3 enhancement */
+ if (test_facility(22) && test_facility(30))
+ elf_hwcap |= HWCAP_ETF3EH;
+
+ /* decimal floating point & perform floating point operation */
+ if (test_facility(42) && test_facility(44))
+ elf_hwcap |= HWCAP_DFP;
+
+ /* huge page support */
+ if (MACHINE_HAS_EDAT1)
+ elf_hwcap |= HWCAP_HPAGE;
+
+ /* 64-bit register support for 31-bit processes */
+ elf_hwcap |= HWCAP_HIGH_GPRS;
+
+ /* transactional execution */
+ if (MACHINE_HAS_TE)
+ elf_hwcap |= HWCAP_TE;
+
+ /* vector */
+ if (test_facility(129)) {
+ elf_hwcap |= HWCAP_VXRS;
+ if (test_facility(134))
+ elf_hwcap |= HWCAP_VXRS_BCD;
+ if (test_facility(135))
+ elf_hwcap |= HWCAP_VXRS_EXT;
+ if (test_facility(148))
+ elf_hwcap |= HWCAP_VXRS_EXT2;
+ if (test_facility(152))
+ elf_hwcap |= HWCAP_VXRS_PDE;
+ if (test_facility(192))
+ elf_hwcap |= HWCAP_VXRS_PDE2;
+ }
+
+ if (test_facility(150))
+ elf_hwcap |= HWCAP_SORT;
+
+ if (test_facility(151))
+ elf_hwcap |= HWCAP_DFLT;
+
+ if (test_facility(165))
+ elf_hwcap |= HWCAP_NNPA;
+
+ /* guarded storage */
+ if (MACHINE_HAS_GS)
+ elf_hwcap |= HWCAP_GS;
+
+ if (MACHINE_HAS_PCI_MIO)
+ elf_hwcap |= HWCAP_PCI_MIO;
+
+ /* virtualization support */
+ if (sclp.has_sief2)
+ elf_hwcap |= HWCAP_SIE;
+
+ return 0;
+}
+arch_initcall(setup_hwcaps);
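
setup_hwcaps() publishes these bits to user space through the ELF auxiliary vector (AT_HWCAP). A minimal sketch of checking one of them from user space follows; the bit number is an assumption derived from the HWCAP_NR_VXRS position in the hwcap_str[] table above and is not defined by this patch.

#include <stdio.h>
#include <sys/auxv.h>

#define HWCAP_S390_VXRS_BIT	11	/* assumed: HWCAP_NR_VXRS ("vx") */

int main(void)
{
	unsigned long hwcap = getauxval(AT_HWCAP);

	if (hwcap & (1UL << HWCAP_S390_VXRS_BIT))
		printf("vector facility (vx) available\n");
	else
		printf("vector facility (vx) not available\n");
	return 0;
}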
+
+static int __init setup_elf_platform(void)
+{
+ struct cpuid cpu_id;
+
+ get_cpu_id(&cpu_id);
+ add_device_randomness(&cpu_id, sizeof(cpu_id));
+ switch (cpu_id.machine) {
+ default: /* Use "z10" as default. */
+ strcpy(elf_platform, "z10");
+ break;
+ case 0x2817:
+ case 0x2818:
+ strcpy(elf_platform, "z196");
+ break;
+ case 0x2827:
+ case 0x2828:
+ strcpy(elf_platform, "zEC12");
+ break;
+ case 0x2964:
+ case 0x2965:
+ strcpy(elf_platform, "z13");
+ break;
+ case 0x3906:
+ case 0x3907:
+ strcpy(elf_platform, "z14");
+ break;
+ case 0x8561:
+ case 0x8562:
+ strcpy(elf_platform, "z15");
+ break;
+ case 0x3931:
+ case 0x3932:
+ strcpy(elf_platform, "z16");
+ break;
+ }
+ return 0;
+}
+arch_initcall(setup_elf_platform);
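
The string chosen by setup_elf_platform() is handed to user space as AT_PLATFORM. A minimal sketch of reading it via the standard auxiliary vector API; not part of the patch.

#include <stdio.h>
#include <sys/auxv.h>

int main(void)
{
	const char *platform = (const char *)getauxval(AT_PLATFORM);

	/* Prints e.g. "z15" or "z16" on recent machines */
	printf("AT_PLATFORM: %s\n", platform ? platform : "(unknown)");
	return 0;
}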
+
+static void show_cpu_topology(struct seq_file *m, unsigned long n)
+{
+#ifdef CONFIG_SCHED_TOPOLOGY
+ seq_printf(m, "physical id : %d\n", topology_physical_package_id(n));
+ seq_printf(m, "core id : %d\n", topology_core_id(n));
+ seq_printf(m, "book id : %d\n", topology_book_id(n));
+ seq_printf(m, "drawer id : %d\n", topology_drawer_id(n));
+ seq_printf(m, "dedicated : %d\n", topology_cpu_dedicated(n));
+ seq_printf(m, "address : %d\n", smp_cpu_get_cpu_address(n));
+ seq_printf(m, "siblings : %d\n", cpumask_weight(topology_core_cpumask(n)));
+ seq_printf(m, "cpu cores : %d\n", topology_booted_cores(n));
+#endif /* CONFIG_SCHED_TOPOLOGY */
+}
+
+static void show_cpu_ids(struct seq_file *m, unsigned long n)
+{
+ struct cpuid *id = &per_cpu(cpu_info.cpu_id, n);
+
+ seq_printf(m, "version : %02X\n", id->version);
+ seq_printf(m, "identification : %06X\n", id->ident);
+ seq_printf(m, "machine : %04X\n", id->machine);
+}
+
static void show_cpu_mhz(struct seq_file *m, unsigned long n)
{
struct cpu_info *c = per_cpu_ptr(&cpu_info, n);
+ if (!machine_has_cpu_mhz)
+ return;
seq_printf(m, "cpu MHz dynamic : %d\n", c->cpu_mhz_dynamic);
seq_printf(m, "cpu MHz static : %d\n", c->cpu_mhz_static);
}
@@ -165,12 +318,13 @@ static void show_cpu_mhz(struct seq_file *m, unsigned long n)
static int show_cpuinfo(struct seq_file *m, void *v)
{
unsigned long n = (unsigned long) v - 1;
+ unsigned long first = cpumask_first(cpu_online_mask);
- if (!n)
+ if (n == first)
show_cpu_summary(m, v);
- if (!machine_has_cpu_mhz)
- return 0;
seq_printf(m, "\ncpu number : %ld\n", n);
+ show_cpu_topology(m, n);
+ show_cpu_ids(m, n);
show_cpu_mhz(m, n);
return 0;
}
@@ -179,12 +333,14 @@ static inline void *c_update(loff_t *pos)
{
if (*pos)
*pos = cpumask_next(*pos - 1, cpu_online_mask);
+ else
+ *pos = cpumask_first(cpu_online_mask);
return *pos < nr_cpu_ids ? (void *)*pos + 1 : NULL;
}
static void *c_start(struct seq_file *m, loff_t *pos)
{
- get_online_cpus();
+ cpus_read_lock();
return c_update(pos);
}
@@ -196,7 +352,7 @@ static void *c_next(struct seq_file *m, void *v, loff_t *pos)
static void c_stop(struct seq_file *m, void *v)
{
- put_online_cpus();
+ cpus_read_unlock();
}
const struct seq_operations cpuinfo_op = {
@@ -205,21 +361,3 @@ const struct seq_operations cpuinfo_op = {
.stop = c_stop,
.show = show_cpuinfo,
};
-
-int s390_isolate_bp(void)
-{
- if (!test_facility(82))
- return -EOPNOTSUPP;
- set_thread_flag(TIF_ISOLATE_BP);
- return 0;
-}
-EXPORT_SYMBOL(s390_isolate_bp);
-
-int s390_isolate_bp_guest(void)
-{
- if (!test_facility(82))
- return -EOPNOTSUPP;
- set_thread_flag(TIF_ISOLATE_BP_GUEST);
- return 0;
-}
-EXPORT_SYMBOL(s390_isolate_bp_guest);
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index 58faa12542a1..f1897a8bb221 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -7,6 +7,7 @@
* Martin Schwidefsky (schwidefsky@de.ibm.com)
*/
+#include "asm/ptrace.h"
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
@@ -20,18 +21,16 @@
#include <linux/signal.h>
#include <linux/elf.h>
#include <linux/regset.h>
-#include <linux/tracehook.h>
#include <linux/seccomp.h>
#include <linux/compat.h>
#include <trace/syscall.h>
#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <asm/switch_to.h>
#include <asm/runtime_instr.h>
#include <asm/facility.h>
+#include <asm/fpu/api.h>
#include "entry.h"
@@ -39,20 +38,24 @@
#include "compat_ptrace.h"
#endif
-#define CREATE_TRACE_POINTS
-#include <trace/events/syscalls.h>
-
void update_cr_regs(struct task_struct *task)
{
struct pt_regs *regs = task_pt_regs(task);
struct thread_struct *thread = &task->thread;
- struct per_regs old, new;
union ctlreg0 cr0_old, cr0_new;
union ctlreg2 cr2_old, cr2_new;
int cr0_changed, cr2_changed;
-
- __ctl_store(cr0_old.val, 0, 0);
- __ctl_store(cr2_old.val, 2, 2);
+ union {
+ struct ctlreg regs[3];
+ struct {
+ struct ctlreg control;
+ struct ctlreg start;
+ struct ctlreg end;
+ };
+ } old, new;
+
+ local_ctl_store(0, &cr0_old.reg);
+ local_ctl_store(2, &cr2_old.reg);
cr0_new = cr0_old;
cr2_new = cr2_old;
/* Take care of the enable/disable of transactional execution. */
@@ -80,38 +83,38 @@ void update_cr_regs(struct task_struct *task)
cr0_changed = cr0_new.val != cr0_old.val;
cr2_changed = cr2_new.val != cr2_old.val;
if (cr0_changed)
- __ctl_load(cr0_new.val, 0, 0);
+ local_ctl_load(0, &cr0_new.reg);
if (cr2_changed)
- __ctl_load(cr2_new.val, 2, 2);
+ local_ctl_load(2, &cr2_new.reg);
/* Copy user specified PER registers */
- new.control = thread->per_user.control;
- new.start = thread->per_user.start;
- new.end = thread->per_user.end;
+ new.control.val = thread->per_user.control;
+ new.start.val = thread->per_user.start;
+ new.end.val = thread->per_user.end;
/* merge TIF_SINGLE_STEP into user specified PER registers. */
if (test_tsk_thread_flag(task, TIF_SINGLE_STEP) ||
test_tsk_thread_flag(task, TIF_UPROBE_SINGLESTEP)) {
if (test_tsk_thread_flag(task, TIF_BLOCK_STEP))
- new.control |= PER_EVENT_BRANCH;
+ new.control.val |= PER_EVENT_BRANCH;
else
- new.control |= PER_EVENT_IFETCH;
- new.control |= PER_CONTROL_SUSPENSION;
- new.control |= PER_EVENT_TRANSACTION_END;
+ new.control.val |= PER_EVENT_IFETCH;
+ new.control.val |= PER_CONTROL_SUSPENSION;
+ new.control.val |= PER_EVENT_TRANSACTION_END;
if (test_tsk_thread_flag(task, TIF_UPROBE_SINGLESTEP))
- new.control |= PER_EVENT_IFETCH;
- new.start = 0;
- new.end = -1UL;
+ new.control.val |= PER_EVENT_IFETCH;
+ new.start.val = 0;
+ new.end.val = -1UL;
}
/* Take care of the PER enablement bit in the PSW. */
- if (!(new.control & PER_EVENT_MASK)) {
+ if (!(new.control.val & PER_EVENT_MASK)) {
regs->psw.mask &= ~PSW_MASK_PER;
return;
}
regs->psw.mask |= PSW_MASK_PER;
- __ctl_store(old, 9, 11);
+ __local_ctl_store(9, 11, old.regs);
if (memcmp(&new, &old, sizeof(struct per_regs)) != 0)
- __ctl_load(new, 9, 11);
+ __local_ctl_load(9, 11, new.regs);
}
void user_enable_single_step(struct task_struct *task)
@@ -142,7 +145,7 @@ void ptrace_disable(struct task_struct *task)
memset(&task->thread.per_user, 0, sizeof(task->thread.per_user));
memset(&task->thread.per_event, 0, sizeof(task->thread.per_event));
clear_tsk_thread_flag(task, TIF_SINGLE_STEP);
- clear_pt_regs_flag(task_pt_regs(task), PIF_PER_TRAP);
+ clear_tsk_thread_flag(task, TIF_PER_TRAP);
task->thread.per_flags = 0;
}
@@ -151,38 +154,36 @@ void ptrace_disable(struct task_struct *task)
static inline unsigned long __peek_user_per(struct task_struct *child,
addr_t addr)
{
- struct per_struct_kernel *dummy = NULL;
-
- if (addr == (addr_t) &dummy->cr9)
+ if (addr == offsetof(struct per_struct_kernel, cr9))
/* Control bits of the active per set. */
return test_thread_flag(TIF_SINGLE_STEP) ?
PER_EVENT_IFETCH : child->thread.per_user.control;
- else if (addr == (addr_t) &dummy->cr10)
+ else if (addr == offsetof(struct per_struct_kernel, cr10))
/* Start address of the active per set. */
return test_thread_flag(TIF_SINGLE_STEP) ?
0 : child->thread.per_user.start;
- else if (addr == (addr_t) &dummy->cr11)
+ else if (addr == offsetof(struct per_struct_kernel, cr11))
/* End address of the active per set. */
return test_thread_flag(TIF_SINGLE_STEP) ?
-1UL : child->thread.per_user.end;
- else if (addr == (addr_t) &dummy->bits)
+ else if (addr == offsetof(struct per_struct_kernel, bits))
/* Single-step bit. */
return test_thread_flag(TIF_SINGLE_STEP) ?
(1UL << (BITS_PER_LONG - 1)) : 0;
- else if (addr == (addr_t) &dummy->starting_addr)
+ else if (addr == offsetof(struct per_struct_kernel, starting_addr))
/* Start address of the user specified per set. */
return child->thread.per_user.start;
- else if (addr == (addr_t) &dummy->ending_addr)
+ else if (addr == offsetof(struct per_struct_kernel, ending_addr))
/* End address of the user specified per set. */
return child->thread.per_user.end;
- else if (addr == (addr_t) &dummy->perc_atmid)
+ else if (addr == offsetof(struct per_struct_kernel, perc_atmid))
/* PER code, ATMID and AI of the last PER trap */
return (unsigned long)
child->thread.per_event.cause << (BITS_PER_LONG - 16);
- else if (addr == (addr_t) &dummy->address)
+ else if (addr == offsetof(struct per_struct_kernel, address))
/* Address of the last PER trap */
return child->thread.per_event.address;
- else if (addr == (addr_t) &dummy->access_id)
+ else if (addr == offsetof(struct per_struct_kernel, access_id))
/* Access id of the last PER trap */
return (unsigned long)
child->thread.per_event.paid << (BITS_PER_LONG - 8);
@@ -200,73 +201,72 @@ static inline unsigned long __peek_user_per(struct task_struct *child,
*/
static unsigned long __peek_user(struct task_struct *child, addr_t addr)
{
- struct user *dummy = NULL;
addr_t offset, tmp;
- if (addr < (addr_t) &dummy->regs.acrs) {
+ if (addr < offsetof(struct user, regs.acrs)) {
/*
* psw and gprs are stored on the stack
*/
tmp = *(addr_t *)((addr_t) &task_pt_regs(child)->psw + addr);
- if (addr == (addr_t) &dummy->regs.psw.mask) {
+ if (addr == offsetof(struct user, regs.psw.mask)) {
/* Return a clean psw mask. */
tmp &= PSW_MASK_USER | PSW_MASK_RI;
tmp |= PSW_USER_BITS;
}
- } else if (addr < (addr_t) &dummy->regs.orig_gpr2) {
+ } else if (addr < offsetof(struct user, regs.orig_gpr2)) {
/*
* access registers are stored in the thread structure
*/
- offset = addr - (addr_t) &dummy->regs.acrs;
+ offset = addr - offsetof(struct user, regs.acrs);
/*
* Very special case: old & broken 64 bit gdb reading
* from acrs[15]. Result is a 64 bit value. Read the
* 32 bit acrs[15] value and shift it by 32. Sick...
*/
- if (addr == (addr_t) &dummy->regs.acrs[15])
+ if (addr == offsetof(struct user, regs.acrs[15]))
tmp = ((unsigned long) child->thread.acrs[15]) << 32;
else
tmp = *(addr_t *)((addr_t) &child->thread.acrs + offset);
- } else if (addr == (addr_t) &dummy->regs.orig_gpr2) {
+ } else if (addr == offsetof(struct user, regs.orig_gpr2)) {
/*
* orig_gpr2 is stored on the kernel stack
*/
tmp = (addr_t) task_pt_regs(child)->orig_gpr2;
- } else if (addr < (addr_t) &dummy->regs.fp_regs) {
+ } else if (addr < offsetof(struct user, regs.fp_regs)) {
/*
* prevent reads of padding hole between
* orig_gpr2 and fp_regs on s390.
*/
tmp = 0;
- } else if (addr == (addr_t) &dummy->regs.fp_regs.fpc) {
+ } else if (addr == offsetof(struct user, regs.fp_regs.fpc)) {
/*
* floating point control reg. is in the thread structure
*/
tmp = child->thread.fpu.fpc;
tmp <<= BITS_PER_LONG - 32;
- } else if (addr < (addr_t) (&dummy->regs.fp_regs + 1)) {
+ } else if (addr < offsetof(struct user, regs.fp_regs) + sizeof(s390_fp_regs)) {
/*
* floating point regs. are either in child->thread.fpu
* or the child->thread.fpu.vxrs array
*/
- offset = addr - (addr_t) &dummy->regs.fp_regs.fprs;
- if (MACHINE_HAS_VX)
+ offset = addr - offsetof(struct user, regs.fp_regs.fprs);
+ if (cpu_has_vx())
tmp = *(addr_t *)
((addr_t) child->thread.fpu.vxrs + 2*offset);
else
tmp = *(addr_t *)
((addr_t) child->thread.fpu.fprs + offset);
- } else if (addr < (addr_t) (&dummy->regs.per_info + 1)) {
+ } else if (addr < offsetof(struct user, regs.per_info) + sizeof(per_struct)) {
/*
* Handle access to the per_info structure.
*/
- addr -= (addr_t) &dummy->regs.per_info;
+ addr -= offsetof(struct user, regs.per_info);
tmp = __peek_user_per(child, addr);
} else
@@ -285,8 +285,8 @@ peek_user(struct task_struct *child, addr_t addr, addr_t data)
* an alignment of 4. Programmers from hell...
*/
mask = __ADDR_MASK;
- if (addr >= (addr_t) &((struct user *) NULL)->regs.acrs &&
- addr < (addr_t) &((struct user *) NULL)->regs.orig_gpr2)
+ if (addr >= offsetof(struct user, regs.acrs) &&
+ addr < offsetof(struct user, regs.orig_gpr2))
mask = 3;
if ((addr & mask) || addr > sizeof(struct user) - __ADDR_MASK)
return -EIO;
@@ -298,8 +298,6 @@ peek_user(struct task_struct *child, addr_t addr, addr_t data)
static inline void __poke_user_per(struct task_struct *child,
addr_t addr, addr_t data)
{
- struct per_struct_kernel *dummy = NULL;
-
/*
* There are only three fields in the per_info struct that the
* debugger user can write to.
@@ -312,14 +310,14 @@ static inline void __poke_user_per(struct task_struct *child,
* addresses are used only if single stepping is not in effect.
* Writes to any other field in per_info are ignored.
*/
- if (addr == (addr_t) &dummy->cr9)
+ if (addr == offsetof(struct per_struct_kernel, cr9))
/* PER event mask of the user specified per set. */
child->thread.per_user.control =
data & (PER_EVENT_MASK | PER_CONTROL_MASK);
- else if (addr == (addr_t) &dummy->starting_addr)
+ else if (addr == offsetof(struct per_struct_kernel, starting_addr))
/* Starting address of the user specified per set. */
child->thread.per_user.start = data;
- else if (addr == (addr_t) &dummy->ending_addr)
+ else if (addr == offsetof(struct per_struct_kernel, ending_addr))
/* Ending address of the user specified per set. */
child->thread.per_user.end = data;
}
@@ -332,14 +330,15 @@ static inline void __poke_user_per(struct task_struct *child,
*/
static int __poke_user(struct task_struct *child, addr_t addr, addr_t data)
{
- struct user *dummy = NULL;
addr_t offset;
- if (addr < (addr_t) &dummy->regs.acrs) {
+
+ if (addr < offsetof(struct user, regs.acrs)) {
+ struct pt_regs *regs = task_pt_regs(child);
/*
* psw and gprs are stored on the stack
*/
- if (addr == (addr_t) &dummy->regs.psw.mask) {
+ if (addr == offsetof(struct user, regs.psw.mask)) {
unsigned long mask = PSW_MASK_USER;
mask |= is_ri_task(child) ? PSW_MASK_RI : 0;
@@ -353,64 +352,69 @@ static int __poke_user(struct task_struct *child, addr_t addr, addr_t data)
/* Invalid addressing mode bits */
return -EINVAL;
}
- *(addr_t *)((addr_t) &task_pt_regs(child)->psw + addr) = data;
- } else if (addr < (addr_t) (&dummy->regs.orig_gpr2)) {
+ if (test_pt_regs_flag(regs, PIF_SYSCALL) &&
+ addr == offsetof(struct user, regs.gprs[2])) {
+ struct pt_regs *regs = task_pt_regs(child);
+
+ regs->int_code = 0x20000 | (data & 0xffff);
+ }
+ *(addr_t *)((addr_t) &regs->psw + addr) = data;
+ } else if (addr < offsetof(struct user, regs.orig_gpr2)) {
/*
* access registers are stored in the thread structure
*/
- offset = addr - (addr_t) &dummy->regs.acrs;
+ offset = addr - offsetof(struct user, regs.acrs);
/*
* Very special case: old & broken 64 bit gdb writing
* to acrs[15] with a 64 bit value. Ignore the lower
* half of the value and write the upper 32 bit to
* acrs[15]. Sick...
*/
- if (addr == (addr_t) &dummy->regs.acrs[15])
+ if (addr == offsetof(struct user, regs.acrs[15]))
child->thread.acrs[15] = (unsigned int) (data >> 32);
else
*(addr_t *)((addr_t) &child->thread.acrs + offset) = data;
- } else if (addr == (addr_t) &dummy->regs.orig_gpr2) {
+ } else if (addr == offsetof(struct user, regs.orig_gpr2)) {
/*
* orig_gpr2 is stored on the kernel stack
*/
task_pt_regs(child)->orig_gpr2 = data;
- } else if (addr < (addr_t) &dummy->regs.fp_regs) {
+ } else if (addr < offsetof(struct user, regs.fp_regs)) {
/*
* prevent writes of padding hole between
* orig_gpr2 and fp_regs on s390.
*/
return 0;
- } else if (addr == (addr_t) &dummy->regs.fp_regs.fpc) {
+ } else if (addr == offsetof(struct user, regs.fp_regs.fpc)) {
/*
* floating point control reg. is in the thread structure
*/
- if ((unsigned int) data != 0 ||
- test_fp_ctl(data >> (BITS_PER_LONG - 32)))
+ if ((unsigned int)data != 0)
return -EINVAL;
child->thread.fpu.fpc = data >> (BITS_PER_LONG - 32);
- } else if (addr < (addr_t) (&dummy->regs.fp_regs + 1)) {
+ } else if (addr < offsetof(struct user, regs.fp_regs) + sizeof(s390_fp_regs)) {
/*
* floating point regs. are either in child->thread.fpu
* or the child->thread.fpu.vxrs array
*/
- offset = addr - (addr_t) &dummy->regs.fp_regs.fprs;
- if (MACHINE_HAS_VX)
+ offset = addr - offsetof(struct user, regs.fp_regs.fprs);
+ if (cpu_has_vx())
*(addr_t *)((addr_t)
child->thread.fpu.vxrs + 2*offset) = data;
else
*(addr_t *)((addr_t)
child->thread.fpu.fprs + offset) = data;
- } else if (addr < (addr_t) (&dummy->regs.per_info + 1)) {
+ } else if (addr < offsetof(struct user, regs.per_info) + sizeof(per_struct)) {
/*
* Handle access to the per_info structure.
*/
- addr -= (addr_t) &dummy->regs.per_info;
+ addr -= offsetof(struct user, regs.per_info);
__poke_user_per(child, addr, data);
}
@@ -427,8 +431,8 @@ static int poke_user(struct task_struct *child, addr_t addr, addr_t data)
* an alignment of 4. Programmers from hell indeed...
*/
mask = __ADDR_MASK;
- if (addr >= (addr_t) &((struct user *) NULL)->regs.acrs &&
- addr < (addr_t) &((struct user *) NULL)->regs.orig_gpr2)
+ if (addr >= offsetof(struct user, regs.acrs) &&
+ addr < offsetof(struct user, regs.orig_gpr2))
mask = 3;
if ((addr & mask) || addr > sizeof(struct user) - __ADDR_MASK)
return -EIO;
@@ -477,9 +481,7 @@ long arch_ptrace(struct task_struct *child, long request,
}
return 0;
case PTRACE_GET_LAST_BREAK:
- put_user(child->thread.last_break,
- (unsigned long __user *) data);
- return 0;
+ return put_user(child->thread.last_break, (unsigned long __user *)data);
case PTRACE_ENABLE_TE:
if (!MACHINE_HAS_TE)
return -EIO;
@@ -536,37 +538,35 @@ long arch_ptrace(struct task_struct *child, long request,
static inline __u32 __peek_user_per_compat(struct task_struct *child,
addr_t addr)
{
- struct compat_per_struct_kernel *dummy32 = NULL;
-
- if (addr == (addr_t) &dummy32->cr9)
+ if (addr == offsetof(struct compat_per_struct_kernel, cr9))
/* Control bits of the active per set. */
return (__u32) test_thread_flag(TIF_SINGLE_STEP) ?
PER_EVENT_IFETCH : child->thread.per_user.control;
- else if (addr == (addr_t) &dummy32->cr10)
+ else if (addr == offsetof(struct compat_per_struct_kernel, cr10))
/* Start address of the active per set. */
return (__u32) test_thread_flag(TIF_SINGLE_STEP) ?
0 : child->thread.per_user.start;
- else if (addr == (addr_t) &dummy32->cr11)
+ else if (addr == offsetof(struct compat_per_struct_kernel, cr11))
/* End address of the active per set. */
return test_thread_flag(TIF_SINGLE_STEP) ?
PSW32_ADDR_INSN : child->thread.per_user.end;
- else if (addr == (addr_t) &dummy32->bits)
+ else if (addr == offsetof(struct compat_per_struct_kernel, bits))
/* Single-step bit. */
return (__u32) test_thread_flag(TIF_SINGLE_STEP) ?
0x80000000 : 0;
- else if (addr == (addr_t) &dummy32->starting_addr)
+ else if (addr == offsetof(struct compat_per_struct_kernel, starting_addr))
/* Start address of the user specified per set. */
return (__u32) child->thread.per_user.start;
- else if (addr == (addr_t) &dummy32->ending_addr)
+ else if (addr == offsetof(struct compat_per_struct_kernel, ending_addr))
/* End address of the user specified per set. */
return (__u32) child->thread.per_user.end;
- else if (addr == (addr_t) &dummy32->perc_atmid)
+ else if (addr == offsetof(struct compat_per_struct_kernel, perc_atmid))
/* PER code, ATMID and AI of the last PER trap */
return (__u32) child->thread.per_event.cause << 16;
- else if (addr == (addr_t) &dummy32->address)
+ else if (addr == offsetof(struct compat_per_struct_kernel, address))
/* Address of the last PER trap */
return (__u32) child->thread.per_event.address;
- else if (addr == (addr_t) &dummy32->access_id)
+ else if (addr == offsetof(struct compat_per_struct_kernel, access_id))
/* Access id of the last PER trap */
return (__u32) child->thread.per_event.paid << 24;
return 0;
@@ -577,21 +577,20 @@ static inline __u32 __peek_user_per_compat(struct task_struct *child,
*/
static u32 __peek_user_compat(struct task_struct *child, addr_t addr)
{
- struct compat_user *dummy32 = NULL;
addr_t offset;
__u32 tmp;
- if (addr < (addr_t) &dummy32->regs.acrs) {
+ if (addr < offsetof(struct compat_user, regs.acrs)) {
struct pt_regs *regs = task_pt_regs(child);
/*
* psw and gprs are stored on the stack
*/
- if (addr == (addr_t) &dummy32->regs.psw.mask) {
+ if (addr == offsetof(struct compat_user, regs.psw.mask)) {
/* Fake a 31 bit psw mask. */
tmp = (__u32)(regs->psw.mask >> 32);
tmp &= PSW32_MASK_USER | PSW32_MASK_RI;
tmp |= PSW32_USER_BITS;
- } else if (addr == (addr_t) &dummy32->regs.psw.addr) {
+ } else if (addr == offsetof(struct compat_user, regs.psw.addr)) {
/* Fake a 31 bit psw address. */
tmp = (__u32) regs->psw.addr |
(__u32)(regs->psw.mask & PSW_MASK_BA);
@@ -599,50 +598,50 @@ static u32 __peek_user_compat(struct task_struct *child, addr_t addr)
/* gpr 0-15 */
tmp = *(__u32 *)((addr_t) &regs->psw + addr*2 + 4);
}
- } else if (addr < (addr_t) (&dummy32->regs.orig_gpr2)) {
+ } else if (addr < offsetof(struct compat_user, regs.orig_gpr2)) {
/*
* access registers are stored in the thread structure
*/
- offset = addr - (addr_t) &dummy32->regs.acrs;
+ offset = addr - offsetof(struct compat_user, regs.acrs);
tmp = *(__u32*)((addr_t) &child->thread.acrs + offset);
- } else if (addr == (addr_t) (&dummy32->regs.orig_gpr2)) {
+ } else if (addr == offsetof(struct compat_user, regs.orig_gpr2)) {
/*
* orig_gpr2 is stored on the kernel stack
*/
tmp = *(__u32*)((addr_t) &task_pt_regs(child)->orig_gpr2 + 4);
- } else if (addr < (addr_t) &dummy32->regs.fp_regs) {
+ } else if (addr < offsetof(struct compat_user, regs.fp_regs)) {
/*
* prevent reads of padding hole between
* orig_gpr2 and fp_regs on s390.
*/
tmp = 0;
- } else if (addr == (addr_t) &dummy32->regs.fp_regs.fpc) {
+ } else if (addr == offsetof(struct compat_user, regs.fp_regs.fpc)) {
/*
* floating point control reg. is in the thread structure
*/
tmp = child->thread.fpu.fpc;
- } else if (addr < (addr_t) (&dummy32->regs.fp_regs + 1)) {
+ } else if (addr < offsetof(struct compat_user, regs.fp_regs) + sizeof(s390_fp_regs)) {
/*
* floating point regs. are either in child->thread.fpu
* or the child->thread.fpu.vxrs array
*/
- offset = addr - (addr_t) &dummy32->regs.fp_regs.fprs;
- if (MACHINE_HAS_VX)
+ offset = addr - offsetof(struct compat_user, regs.fp_regs.fprs);
+ if (cpu_has_vx())
tmp = *(__u32 *)
((addr_t) child->thread.fpu.vxrs + 2*offset);
else
tmp = *(__u32 *)
((addr_t) child->thread.fpu.fprs + offset);
- } else if (addr < (addr_t) (&dummy32->regs.per_info + 1)) {
+ } else if (addr < offsetof(struct compat_user, regs.per_info) + sizeof(struct compat_per_struct_kernel)) {
/*
* Handle access to the per_info structure.
*/
- addr -= (addr_t) &dummy32->regs.per_info;
+ addr -= offsetof(struct compat_user, regs.per_info);
tmp = __peek_user_per_compat(child, addr);
} else
@@ -669,16 +668,14 @@ static int peek_user_compat(struct task_struct *child,
static inline void __poke_user_per_compat(struct task_struct *child,
addr_t addr, __u32 data)
{
- struct compat_per_struct_kernel *dummy32 = NULL;
-
- if (addr == (addr_t) &dummy32->cr9)
+ if (addr == offsetof(struct compat_per_struct_kernel, cr9))
/* PER event mask of the user specified per set. */
child->thread.per_user.control =
data & (PER_EVENT_MASK | PER_CONTROL_MASK);
- else if (addr == (addr_t) &dummy32->starting_addr)
+ else if (addr == offsetof(struct compat_per_struct_kernel, starting_addr))
/* Starting address of the user specified per set. */
child->thread.per_user.start = data;
- else if (addr == (addr_t) &dummy32->ending_addr)
+ else if (addr == offsetof(struct compat_per_struct_kernel, ending_addr))
/* Ending address of the user specified per set. */
child->thread.per_user.end = data;
}
@@ -689,16 +686,15 @@ static inline void __poke_user_per_compat(struct task_struct *child,
static int __poke_user_compat(struct task_struct *child,
addr_t addr, addr_t data)
{
- struct compat_user *dummy32 = NULL;
__u32 tmp = (__u32) data;
addr_t offset;
- if (addr < (addr_t) &dummy32->regs.acrs) {
+ if (addr < offsetof(struct compat_user, regs.acrs)) {
struct pt_regs *regs = task_pt_regs(child);
/*
* psw, gprs, acrs and orig_gpr2 are stored on the stack
*/
- if (addr == (addr_t) &dummy32->regs.psw.mask) {
+ if (addr == offsetof(struct compat_user, regs.psw.mask)) {
__u32 mask = PSW32_MASK_USER;
mask |= is_ri_task(child) ? PSW32_MASK_RI : 0;
@@ -712,62 +708,66 @@ static int __poke_user_compat(struct task_struct *child,
regs->psw.mask = (regs->psw.mask & ~PSW_MASK_USER) |
(regs->psw.mask & PSW_MASK_BA) |
(__u64)(tmp & mask) << 32;
- } else if (addr == (addr_t) &dummy32->regs.psw.addr) {
+ } else if (addr == offsetof(struct compat_user, regs.psw.addr)) {
/* Build a 64 bit psw address from 31 bit address. */
regs->psw.addr = (__u64) tmp & PSW32_ADDR_INSN;
/* Transfer 31 bit amode bit to psw mask. */
regs->psw.mask = (regs->psw.mask & ~PSW_MASK_BA) |
(__u64)(tmp & PSW32_ADDR_AMODE);
} else {
+ if (test_pt_regs_flag(regs, PIF_SYSCALL) &&
+ addr == offsetof(struct compat_user, regs.gprs[2])) {
+ struct pt_regs *regs = task_pt_regs(child);
+
+ regs->int_code = 0x20000 | (data & 0xffff);
+ }
/* gpr 0-15 */
*(__u32*)((addr_t) &regs->psw + addr*2 + 4) = tmp;
}
- } else if (addr < (addr_t) (&dummy32->regs.orig_gpr2)) {
+ } else if (addr < offsetof(struct compat_user, regs.orig_gpr2)) {
/*
* access registers are stored in the thread structure
*/
- offset = addr - (addr_t) &dummy32->regs.acrs;
+ offset = addr - offsetof(struct compat_user, regs.acrs);
*(__u32*)((addr_t) &child->thread.acrs + offset) = tmp;
- } else if (addr == (addr_t) (&dummy32->regs.orig_gpr2)) {
+ } else if (addr == offsetof(struct compat_user, regs.orig_gpr2)) {
/*
* orig_gpr2 is stored on the kernel stack
*/
*(__u32*)((addr_t) &task_pt_regs(child)->orig_gpr2 + 4) = tmp;
- } else if (addr < (addr_t) &dummy32->regs.fp_regs) {
+ } else if (addr < offsetof(struct compat_user, regs.fp_regs)) {
/*
		 * prevent writes of padding hole between
* orig_gpr2 and fp_regs on s390.
*/
return 0;
- } else if (addr == (addr_t) &dummy32->regs.fp_regs.fpc) {
+ } else if (addr == offsetof(struct compat_user, regs.fp_regs.fpc)) {
/*
* floating point control reg. is in the thread structure
*/
- if (test_fp_ctl(tmp))
- return -EINVAL;
child->thread.fpu.fpc = data;
- } else if (addr < (addr_t) (&dummy32->regs.fp_regs + 1)) {
+ } else if (addr < offsetof(struct compat_user, regs.fp_regs) + sizeof(s390_fp_regs)) {
/*
* floating point regs. are either in child->thread.fpu
* or the child->thread.fpu.vxrs array
*/
- offset = addr - (addr_t) &dummy32->regs.fp_regs.fprs;
- if (MACHINE_HAS_VX)
+ offset = addr - offsetof(struct compat_user, regs.fp_regs.fprs);
+ if (cpu_has_vx())
*(__u32 *)((addr_t)
child->thread.fpu.vxrs + 2*offset) = tmp;
else
*(__u32 *)((addr_t)
child->thread.fpu.fprs + offset) = tmp;
- } else if (addr < (addr_t) (&dummy32->regs.per_info + 1)) {
+ } else if (addr < offsetof(struct compat_user, regs.per_info) + sizeof(struct compat_per_struct_kernel)) {
/*
* Handle access to the per_info structure.
*/
- addr -= (addr_t) &dummy32->regs.per_info;
+ addr -= offsetof(struct compat_user, regs.per_info);
__poke_user_per_compat(child, addr, data);
}
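
The added branch above also refreshes regs->int_code when a tracer rewrites gpr 2 while PIF_SYSCALL is set, so the stored interruption code carries the rewritten syscall number in its low 16 bits; the 0x20000 tag and the 16-bit mask are taken as given from the hunk. A tiny sketch of that packing with an invented sample value.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t poked_gpr2 = 0x16;	/* syscall number a tracer wrote into gpr 2 */
	uint32_t int_code = 0x20000 | (uint32_t)(poked_gpr2 & 0xffff);

	printf("int_code=0x%05x syscall=%u\n", int_code, int_code & 0xffff);
	return 0;
}
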
@@ -827,92 +827,26 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
}
return 0;
case PTRACE_GET_LAST_BREAK:
- put_user(child->thread.last_break,
- (unsigned int __user *) data);
- return 0;
+ return put_user(child->thread.last_break, (unsigned int __user *)data);
}
return compat_ptrace_request(child, request, addr, data);
}
#endif
-asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
-{
- unsigned long mask = -1UL;
-
- /*
- * The sysc_tracesys code in entry.S stored the system
- * call number to gprs[2].
- */
- if (test_thread_flag(TIF_SYSCALL_TRACE) &&
- (tracehook_report_syscall_entry(regs) ||
- regs->gprs[2] >= NR_syscalls)) {
- /*
- * Tracing decided this syscall should not happen or the
- * debugger stored an invalid system call number. Skip
- * the system call and the system call restart handling.
- */
- clear_pt_regs_flag(regs, PIF_SYSCALL);
- return -1;
- }
-
- /* Do the secure computing check after ptrace. */
- if (secure_computing()) {
- /* seccomp failures shouldn't expose any additional code. */
- return -1;
- }
-
- if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
- trace_sys_enter(regs, regs->gprs[2]);
-
- if (is_compat_task())
- mask = 0xffffffff;
-
- audit_syscall_entry(regs->gprs[2], regs->orig_gpr2 & mask,
- regs->gprs[3] &mask, regs->gprs[4] &mask,
- regs->gprs[5] &mask);
-
- return regs->gprs[2];
-}
-
-asmlinkage void do_syscall_trace_exit(struct pt_regs *regs)
-{
- audit_syscall_exit(regs);
-
- if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
- trace_sys_exit(regs, regs->gprs[2]);
-
- if (test_thread_flag(TIF_SYSCALL_TRACE))
- tracehook_report_syscall_exit(regs, 0);
-}
-
/*
* user_regset definitions.
*/
static int s390_regs_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
+ unsigned pos;
if (target == current)
save_access_regs(target->thread.acrs);
- if (kbuf) {
- unsigned long *k = kbuf;
- while (count > 0) {
- *k++ = __peek_user(target, pos);
- count -= sizeof(*k);
- pos += sizeof(*k);
- }
- } else {
- unsigned long __user *u = ubuf;
- while (count > 0) {
- if (__put_user(__peek_user(target, pos), u++))
- return -EFAULT;
- count -= sizeof(*u);
- pos += sizeof(*u);
- }
- }
+ for (pos = 0; pos < sizeof(s390_regs); pos += sizeof(long))
+ membuf_store(&to, __peek_user(target, pos));
return 0;
}
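
s390_regs_get() above is the first of several getters converted from the old pos/count/kbuf/ubuf interface to struct membuf: the regset core hands in a single kernel buffer and the getter just appends with membuf_store()/membuf_write(). The kernel's definitions live in include/linux/regset.h; below is only a rough userspace model of the idea, with __peek_user() replaced by a dummy value.

#include <stdio.h>
#include <string.h>

struct membuf { void *p; size_t left; };

static int membuf_write(struct membuf *s, const void *v, size_t size)
{
	if (size > s->left)
		size = s->left;
	memcpy(s->p, v, size);
	s->p = (char *)s->p + size;
	s->left -= size;
	return (int)s->left;
}

#define membuf_store(s, v) membuf_write((s), &(v), sizeof(v))

int main(void)
{
	unsigned long buf[4], val;
	struct membuf to = { buf, sizeof(buf) };

	/* Model of the new s390_regs_get() loop: one word per register slot. */
	for (unsigned long pos = 0; pos < sizeof(buf); pos += sizeof(long)) {
		val = pos * 0x1111;		/* stands in for __peek_user(target, pos) */
		membuf_store(&to, val);
	}
	printf("%zu bytes left in the buffer\n", to.left);
	return 0;
}
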
@@ -953,8 +887,8 @@ static int s390_regs_set(struct task_struct *target,
}
static int s390_fpregs_get(struct task_struct *target,
- const struct user_regset *regset, unsigned int pos,
- unsigned int count, void *kbuf, void __user *ubuf)
+ const struct user_regset *regset,
+ struct membuf to)
{
_s390_fp_regs fp_regs;
@@ -964,8 +898,7 @@ static int s390_fpregs_get(struct task_struct *target,
fp_regs.fpc = target->thread.fpu.fpc;
fpregs_store(&fp_regs, &target->thread.fpu);
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- &fp_regs, 0, -1);
+ return membuf_write(&to, &fp_regs, sizeof(fp_regs));
}
static int s390_fpregs_set(struct task_struct *target,
@@ -979,19 +912,18 @@ static int s390_fpregs_set(struct task_struct *target,
if (target == current)
save_fpu_regs();
- if (MACHINE_HAS_VX)
+ if (cpu_has_vx())
convert_vx_to_fp(fprs, target->thread.fpu.vxrs);
else
memcpy(&fprs, target->thread.fpu.fprs, sizeof(fprs));
- /* If setting FPC, must validate it first. */
if (count > 0 && pos < offsetof(s390_fp_regs, fprs)) {
u32 ufpc[2] = { target->thread.fpu.fpc, 0 };
rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ufpc,
0, offsetof(s390_fp_regs, fprs));
if (rc)
return rc;
- if (ufpc[1] != 0 || test_fp_ctl(ufpc[0]))
+ if (ufpc[1] != 0)
return -EINVAL;
target->thread.fpu.fpc = ufpc[0];
}
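
s390_fpregs_set() above pulls the first eight bytes of the user blob into ufpc[2] and requires the second word to be zero: in the exported fp_regs layout the 32-bit fpc is followed by 32 bits of padding before the 8-byte aligned fprs. A userspace sketch of that check, assuming the usual fpc-plus-padding layout; the struct and function names are invented.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct fp_regs_like {		/* stand-in for the user-visible s390_fp_regs */
	uint32_t fpc;
	uint32_t pad;
	double fprs[16];
};

static int set_fpc_from_blob(const void *blob, uint32_t *fpc_out)
{
	uint32_t ufpc[2];

	memcpy(ufpc, blob, sizeof(ufpc));	/* fpc plus the padding word */
	if (ufpc[1] != 0)			/* padding must stay zero */
		return -EINVAL;
	*fpc_out = ufpc[0];
	return 0;
}

int main(void)
{
	struct fp_regs_like regs = { .fpc = 0x00080000 };
	uint32_t fpc = 0;
	int rc = set_fpc_from_blob(&regs, &fpc);

	printf("rc=%d fpc=0x%08x\n", rc, fpc);
	return 0;
}
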
@@ -1002,7 +934,7 @@ static int s390_fpregs_set(struct task_struct *target,
if (rc)
return rc;
- if (MACHINE_HAS_VX)
+ if (cpu_has_vx())
convert_fp_to_vx(target->thread.fpu.vxrs, fprs);
else
memcpy(target->thread.fpu.fprs, &fprs, sizeof(fprs));
@@ -1012,20 +944,9 @@ static int s390_fpregs_set(struct task_struct *target,
static int s390_last_break_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
- if (count > 0) {
- if (kbuf) {
- unsigned long *k = kbuf;
- *k = target->thread.last_break;
- } else {
- unsigned long __user *u = ubuf;
- if (__put_user(target->thread.last_break, u))
- return -EFAULT;
- }
- }
- return 0;
+ return membuf_store(&to, target->thread.last_break);
}
static int s390_last_break_set(struct task_struct *target,
@@ -1038,16 +959,15 @@ static int s390_last_break_set(struct task_struct *target,
static int s390_tdb_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
struct pt_regs *regs = task_pt_regs(target);
- unsigned char *data;
+ size_t size;
if (!(regs->int_code & 0x200))
return -ENODATA;
- data = target->thread.trap_tdb;
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf, data, 0, 256);
+ size = sizeof(target->thread.trap_tdb.data);
+ return membuf_write(&to, target->thread.trap_tdb.data, size);
}
static int s390_tdb_set(struct task_struct *target,
@@ -1060,19 +980,18 @@ static int s390_tdb_set(struct task_struct *target,
static int s390_vxrs_low_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
__u64 vxrs[__NUM_VXRS_LOW];
int i;
- if (!MACHINE_HAS_VX)
+ if (!cpu_has_vx())
return -ENODEV;
if (target == current)
save_fpu_regs();
for (i = 0; i < __NUM_VXRS_LOW; i++)
- vxrs[i] = *((__u64 *)(target->thread.fpu.vxrs + i) + 1);
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf, vxrs, 0, -1);
+ vxrs[i] = target->thread.fpu.vxrs[i].low;
+ return membuf_write(&to, vxrs, sizeof(vxrs));
}
static int s390_vxrs_low_set(struct task_struct *target,
@@ -1083,36 +1002,32 @@ static int s390_vxrs_low_set(struct task_struct *target,
__u64 vxrs[__NUM_VXRS_LOW];
int i, rc;
- if (!MACHINE_HAS_VX)
+ if (!cpu_has_vx())
return -ENODEV;
if (target == current)
save_fpu_regs();
for (i = 0; i < __NUM_VXRS_LOW; i++)
- vxrs[i] = *((__u64 *)(target->thread.fpu.vxrs + i) + 1);
+ vxrs[i] = target->thread.fpu.vxrs[i].low;
rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, vxrs, 0, -1);
if (rc == 0)
for (i = 0; i < __NUM_VXRS_LOW; i++)
- *((__u64 *)(target->thread.fpu.vxrs + i) + 1) = vxrs[i];
+ target->thread.fpu.vxrs[i].low = vxrs[i];
return rc;
}
static int s390_vxrs_high_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
- __vector128 vxrs[__NUM_VXRS_HIGH];
-
- if (!MACHINE_HAS_VX)
+ if (!cpu_has_vx())
return -ENODEV;
if (target == current)
save_fpu_regs();
- memcpy(vxrs, target->thread.fpu.vxrs + __NUM_VXRS_LOW, sizeof(vxrs));
-
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf, vxrs, 0, -1);
+ return membuf_write(&to, target->thread.fpu.vxrs + __NUM_VXRS_LOW,
+ __NUM_VXRS_HIGH * sizeof(__vector128));
}
static int s390_vxrs_high_set(struct task_struct *target,
@@ -1122,7 +1037,7 @@ static int s390_vxrs_high_set(struct task_struct *target,
{
int rc;
- if (!MACHINE_HAS_VX)
+ if (!cpu_has_vx())
return -ENODEV;
if (target == current)
save_fpu_regs();
@@ -1134,12 +1049,9 @@ static int s390_vxrs_high_set(struct task_struct *target,
static int s390_system_call_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
- unsigned int *data = &target->thread.system_call;
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- data, 0, sizeof(unsigned int));
+ return membuf_store(&to, target->thread.system_call);
}
static int s390_system_call_set(struct task_struct *target,
@@ -1154,8 +1066,7 @@ static int s390_system_call_set(struct task_struct *target,
static int s390_gs_cb_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
struct gs_cb *data = target->thread.gs_cb;
@@ -1165,8 +1076,7 @@ static int s390_gs_cb_get(struct task_struct *target,
return -ENODATA;
if (target == current)
save_gs_cb(data);
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- data, 0, sizeof(struct gs_cb));
+ return membuf_write(&to, data, sizeof(struct gs_cb));
}
static int s390_gs_cb_set(struct task_struct *target,
@@ -1201,7 +1111,7 @@ static int s390_gs_cb_set(struct task_struct *target,
target->thread.gs_cb = data;
*target->thread.gs_cb = gs_cb;
if (target == current) {
- __ctl_set_bit(2, 4);
+ local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT);
restore_gs_cb(target->thread.gs_cb);
}
preempt_enable();
@@ -1210,8 +1120,7 @@ static int s390_gs_cb_set(struct task_struct *target,
static int s390_gs_bc_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
struct gs_cb *data = target->thread.gs_bc_cb;
@@ -1219,8 +1128,7 @@ static int s390_gs_bc_get(struct task_struct *target,
return -ENODEV;
if (!data)
return -ENODATA;
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- data, 0, sizeof(struct gs_cb));
+ return membuf_write(&to, data, sizeof(struct gs_cb));
}
static int s390_gs_bc_set(struct task_struct *target,
@@ -1256,7 +1164,6 @@ static bool is_ri_cb_valid(struct runtime_instr_cb *cb)
cb->pc == 1 &&
cb->qc == 0 &&
cb->reserved2 == 0 &&
- cb->key == PAGE_DEFAULT_KEY &&
cb->reserved3 == 0 &&
cb->reserved4 == 0 &&
cb->reserved5 == 0 &&
@@ -1271,8 +1178,7 @@ static bool is_ri_cb_valid(struct runtime_instr_cb *cb)
static int s390_runtime_instr_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
struct runtime_instr_cb *data = target->thread.ri_cb;
@@ -1281,8 +1187,7 @@ static int s390_runtime_instr_get(struct task_struct *target,
if (!data)
return -ENODATA;
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- data, 0, sizeof(struct runtime_instr_cb));
+ return membuf_write(&to, data, sizeof(struct runtime_instr_cb));
}
static int s390_runtime_instr_set(struct task_struct *target,
@@ -1320,7 +1225,11 @@ static int s390_runtime_instr_set(struct task_struct *target,
kfree(data);
return -EINVAL;
}
-
+ /*
+ * Override access key in any case, since user space should
+ * not be able to set it, nor should it care about it.
+ */
+ ri_cb.key = PAGE_DEFAULT_KEY >> 4;
preempt_disable();
if (!target->thread.ri_cb)
target->thread.ri_cb = data;
@@ -1338,7 +1247,7 @@ static const struct user_regset s390_regsets[] = {
.n = sizeof(s390_regs) / sizeof(long),
.size = sizeof(long),
.align = sizeof(long),
- .get = s390_regs_get,
+ .regset_get = s390_regs_get,
.set = s390_regs_set,
},
{
@@ -1346,7 +1255,7 @@ static const struct user_regset s390_regsets[] = {
.n = sizeof(s390_fp_regs) / sizeof(long),
.size = sizeof(long),
.align = sizeof(long),
- .get = s390_fpregs_get,
+ .regset_get = s390_fpregs_get,
.set = s390_fpregs_set,
},
{
@@ -1354,7 +1263,7 @@ static const struct user_regset s390_regsets[] = {
.n = 1,
.size = sizeof(unsigned int),
.align = sizeof(unsigned int),
- .get = s390_system_call_get,
+ .regset_get = s390_system_call_get,
.set = s390_system_call_set,
},
{
@@ -1362,7 +1271,7 @@ static const struct user_regset s390_regsets[] = {
.n = 1,
.size = sizeof(long),
.align = sizeof(long),
- .get = s390_last_break_get,
+ .regset_get = s390_last_break_get,
.set = s390_last_break_set,
},
{
@@ -1370,7 +1279,7 @@ static const struct user_regset s390_regsets[] = {
.n = 1,
.size = 256,
.align = 1,
- .get = s390_tdb_get,
+ .regset_get = s390_tdb_get,
.set = s390_tdb_set,
},
{
@@ -1378,7 +1287,7 @@ static const struct user_regset s390_regsets[] = {
.n = __NUM_VXRS_LOW,
.size = sizeof(__u64),
.align = sizeof(__u64),
- .get = s390_vxrs_low_get,
+ .regset_get = s390_vxrs_low_get,
.set = s390_vxrs_low_set,
},
{
@@ -1386,7 +1295,7 @@ static const struct user_regset s390_regsets[] = {
.n = __NUM_VXRS_HIGH,
.size = sizeof(__vector128),
.align = sizeof(__vector128),
- .get = s390_vxrs_high_get,
+ .regset_get = s390_vxrs_high_get,
.set = s390_vxrs_high_set,
},
{
@@ -1394,7 +1303,7 @@ static const struct user_regset s390_regsets[] = {
.n = sizeof(struct gs_cb) / sizeof(__u64),
.size = sizeof(__u64),
.align = sizeof(__u64),
- .get = s390_gs_cb_get,
+ .regset_get = s390_gs_cb_get,
.set = s390_gs_cb_set,
},
{
@@ -1402,7 +1311,7 @@ static const struct user_regset s390_regsets[] = {
.n = sizeof(struct gs_cb) / sizeof(__u64),
.size = sizeof(__u64),
.align = sizeof(__u64),
- .get = s390_gs_bc_get,
+ .regset_get = s390_gs_bc_get,
.set = s390_gs_bc_set,
},
{
@@ -1410,13 +1319,13 @@ static const struct user_regset s390_regsets[] = {
.n = sizeof(struct runtime_instr_cb) / sizeof(__u64),
.size = sizeof(__u64),
.align = sizeof(__u64),
- .get = s390_runtime_instr_get,
+ .regset_get = s390_runtime_instr_get,
.set = s390_runtime_instr_set,
},
};
static const struct user_regset_view user_s390_view = {
- .name = UTS_MACHINE,
+ .name = "s390x",
.e_machine = EM_S390,
.regsets = s390_regsets,
.n = ARRAY_SIZE(s390_regsets)
@@ -1425,28 +1334,15 @@ static const struct user_regset_view user_s390_view = {
#ifdef CONFIG_COMPAT
static int s390_compat_regs_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
+ unsigned n;
+
if (target == current)
save_access_regs(target->thread.acrs);
- if (kbuf) {
- compat_ulong_t *k = kbuf;
- while (count > 0) {
- *k++ = __peek_user_compat(target, pos);
- count -= sizeof(*k);
- pos += sizeof(*k);
- }
- } else {
- compat_ulong_t __user *u = ubuf;
- while (count > 0) {
- if (__put_user(__peek_user_compat(target, pos), u++))
- return -EFAULT;
- count -= sizeof(*u);
- pos += sizeof(*u);
- }
- }
+ for (n = 0; n < sizeof(s390_compat_regs); n += sizeof(compat_ulong_t))
+ membuf_store(&to, __peek_user_compat(target, n));
return 0;
}
@@ -1488,29 +1384,14 @@ static int s390_compat_regs_set(struct task_struct *target,
static int s390_compat_regs_high_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
compat_ulong_t *gprs_high;
+ int i;
- gprs_high = (compat_ulong_t *)
- &task_pt_regs(target)->gprs[pos / sizeof(compat_ulong_t)];
- if (kbuf) {
- compat_ulong_t *k = kbuf;
- while (count > 0) {
- *k++ = *gprs_high;
- gprs_high += 2;
- count -= sizeof(*k);
- }
- } else {
- compat_ulong_t __user *u = ubuf;
- while (count > 0) {
- if (__put_user(*gprs_high, u++))
- return -EFAULT;
- gprs_high += 2;
- count -= sizeof(*u);
- }
- }
+ gprs_high = (compat_ulong_t *)task_pt_regs(target)->gprs;
+ for (i = 0; i < NUM_GPRS; i++, gprs_high += 2)
+ membuf_store(&to, *gprs_high);
return 0;
}
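
s390_compat_regs_high_get() above exports the upper halves of the 64-bit GPRs to 31-bit debuggers. s390 is big endian, so the high word of each register is the first 32-bit word in memory, which is why the walk advances gprs_high by two compat_ulong_t per register. A userspace illustration; the pointer walk only matches the portable shift on a big-endian host, and the register values are made up.

#include <stdint.h>
#include <stdio.h>

#define NUM_GPRS 16

int main(void)
{
	uint64_t gprs[NUM_GPRS];
	uint32_t high_by_shift[NUM_GPRS];
	const uint32_t *gprs_high;
	int i;

	for (i = 0; i < NUM_GPRS; i++) {
		gprs[i] = ((uint64_t)(0xdead0000u + i) << 32) | (uint64_t)i;
		/* Endian-independent way to pick the upper halves. */
		high_by_shift[i] = (uint32_t)(gprs[i] >> 32);
	}

	/* The kernel walk: only on big endian is the high word the first one in memory. */
	gprs_high = (const uint32_t *)gprs;
	for (i = 0; i < NUM_GPRS; i++, gprs_high += 2)
		printf("gpr%-2d high: walk=%08x shift=%08x\n", i, *gprs_high, high_by_shift[i]);
	return 0;
}
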
@@ -1549,23 +1430,11 @@ static int s390_compat_regs_high_set(struct task_struct *target,
static int s390_compat_last_break_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
- compat_ulong_t last_break;
+ compat_ulong_t last_break = target->thread.last_break;
- if (count > 0) {
- last_break = target->thread.last_break;
- if (kbuf) {
- unsigned long *k = kbuf;
- *k = last_break;
- } else {
- unsigned long __user *u = ubuf;
- if (__put_user(last_break, u))
- return -EFAULT;
- }
- }
- return 0;
+ return membuf_store(&to, (unsigned long)last_break);
}
static int s390_compat_last_break_set(struct task_struct *target,
@@ -1582,7 +1451,7 @@ static const struct user_regset s390_compat_regsets[] = {
.n = sizeof(s390_compat_regs) / sizeof(compat_long_t),
.size = sizeof(compat_long_t),
.align = sizeof(compat_long_t),
- .get = s390_compat_regs_get,
+ .regset_get = s390_compat_regs_get,
.set = s390_compat_regs_set,
},
{
@@ -1590,7 +1459,7 @@ static const struct user_regset s390_compat_regsets[] = {
.n = sizeof(s390_fp_regs) / sizeof(compat_long_t),
.size = sizeof(compat_long_t),
.align = sizeof(compat_long_t),
- .get = s390_fpregs_get,
+ .regset_get = s390_fpregs_get,
.set = s390_fpregs_set,
},
{
@@ -1598,7 +1467,7 @@ static const struct user_regset s390_compat_regsets[] = {
.n = 1,
.size = sizeof(compat_uint_t),
.align = sizeof(compat_uint_t),
- .get = s390_system_call_get,
+ .regset_get = s390_system_call_get,
.set = s390_system_call_set,
},
{
@@ -1606,7 +1475,7 @@ static const struct user_regset s390_compat_regsets[] = {
.n = 1,
.size = sizeof(long),
.align = sizeof(long),
- .get = s390_compat_last_break_get,
+ .regset_get = s390_compat_last_break_get,
.set = s390_compat_last_break_set,
},
{
@@ -1614,7 +1483,7 @@ static const struct user_regset s390_compat_regsets[] = {
.n = 1,
.size = 256,
.align = 1,
- .get = s390_tdb_get,
+ .regset_get = s390_tdb_get,
.set = s390_tdb_set,
},
{
@@ -1622,7 +1491,7 @@ static const struct user_regset s390_compat_regsets[] = {
.n = __NUM_VXRS_LOW,
.size = sizeof(__u64),
.align = sizeof(__u64),
- .get = s390_vxrs_low_get,
+ .regset_get = s390_vxrs_low_get,
.set = s390_vxrs_low_set,
},
{
@@ -1630,7 +1499,7 @@ static const struct user_regset s390_compat_regsets[] = {
.n = __NUM_VXRS_HIGH,
.size = sizeof(__vector128),
.align = sizeof(__vector128),
- .get = s390_vxrs_high_get,
+ .regset_get = s390_vxrs_high_get,
.set = s390_vxrs_high_set,
},
{
@@ -1638,7 +1507,7 @@ static const struct user_regset s390_compat_regsets[] = {
.n = sizeof(s390_compat_regs_high) / sizeof(compat_long_t),
.size = sizeof(compat_long_t),
.align = sizeof(compat_long_t),
- .get = s390_compat_regs_high_get,
+ .regset_get = s390_compat_regs_high_get,
.set = s390_compat_regs_high_set,
},
{
@@ -1646,7 +1515,7 @@ static const struct user_regset s390_compat_regsets[] = {
.n = sizeof(struct gs_cb) / sizeof(__u64),
.size = sizeof(__u64),
.align = sizeof(__u64),
- .get = s390_gs_cb_get,
+ .regset_get = s390_gs_cb_get,
.set = s390_gs_cb_set,
},
{
@@ -1654,7 +1523,7 @@ static const struct user_regset s390_compat_regsets[] = {
.n = sizeof(struct gs_cb) / sizeof(__u64),
.size = sizeof(__u64),
.align = sizeof(__u64),
- .get = s390_gs_bc_get,
+ .regset_get = s390_gs_bc_get,
.set = s390_gs_bc_set,
},
{
@@ -1662,7 +1531,7 @@ static const struct user_regset s390_compat_regsets[] = {
.n = sizeof(struct runtime_instr_cb) / sizeof(__u64),
.size = sizeof(__u64),
.align = sizeof(__u64),
- .get = s390_runtime_instr_get,
+ .regset_get = s390_runtime_instr_get,
.set = s390_runtime_instr_set,
},
};
diff --git a/arch/s390/kernel/reipl.S b/arch/s390/kernel/reipl.S
index 4a22163962eb..88087a32ebc6 100644
--- a/arch/s390/kernel/reipl.S
+++ b/arch/s390/kernel/reipl.S
@@ -19,7 +19,7 @@
# r2 = Function to be called after store status
# r3 = Parameter for function
#
-ENTRY(store_status)
+SYM_CODE_START(store_status)
/* Save register one and load save area base */
stg %r1,__LC_SAVE_AREA_RESTART
/* General purpose registers */
@@ -61,7 +61,7 @@ ENTRY(store_status)
stpx 0(%r1)
/* Clock comparator - seven bytes */
lghi %r1,__LC_CLOCK_COMP_SAVE_AREA
- larl %r4,.Lclkcmp
+ larl %r4,clkcmp
stckc 0(%r4)
mvc 1(7,%r1),1(%r4)
/* Program status word */
@@ -73,9 +73,9 @@ ENTRY(store_status)
lgr %r9,%r2
lgr %r2,%r3
BR_EX %r9
-ENDPROC(store_status)
+SYM_CODE_END(store_status)
.section .bss
- .align 8
-.Lclkcmp: .quad 0x0000000000000000
+ .balign 8
+SYM_DATA_LOCAL(clkcmp, .quad 0x0000000000000000)
.previous
diff --git a/arch/s390/kernel/relocate_kernel.S b/arch/s390/kernel/relocate_kernel.S
index fe396673e8a6..0ae297c82afd 100644
--- a/arch/s390/kernel/relocate_kernel.S
+++ b/arch/s390/kernel/relocate_kernel.S
@@ -2,8 +2,7 @@
/*
* Copyright IBM Corp. 2005
*
- * Author(s): Rolf Adelsberger,
- * Heiko Carstens <heiko.carstens@de.ibm.com>
+ * Author(s): Rolf Adelsberger
*
*/
@@ -15,6 +14,7 @@
* moves the new kernel to its destination...
* %r2 = pointer to first kimage_entry_t
* %r3 = start address - where to jump to after the job is done...
+ * %r4 = subcode
*
* %r5 will be used as temp. storage
* %r6 holds the destination address
@@ -26,53 +26,51 @@
*/
.text
-ENTRY(relocate_kernel)
- basr %r13,0 # base address
- .base:
- lghi %r7,PAGE_SIZE # load PAGE_SIZE in r7
- lghi %r9,PAGE_SIZE # load PAGE_SIZE in r9
- lg %r5,0(%r2) # read another word for indirection page
- aghi %r2,8 # increment pointer
- tml %r5,0x1 # is it a destination page?
- je .indir_check # NO, goto "indir_check"
- lgr %r6,%r5 # r6 = r5
- nill %r6,0xf000 # mask it out and...
- j .base # ...next iteration
- .indir_check:
- tml %r5,0x2 # is it a indirection page?
- je .done_test # NO, goto "done_test"
- nill %r5,0xf000 # YES, mask out,
- lgr %r2,%r5 # move it into the right register,
- j .base # and read next...
- .done_test:
- tml %r5,0x4 # is it the done indicator?
- je .source_test # NO! Well, then it should be the source indicator...
- j .done # ok, lets finish it here...
- .source_test:
- tml %r5,0x8 # it should be a source indicator...
- je .base # NO, ignore it...
- lgr %r8,%r5 # r8 = r5
- nill %r8,0xf000 # masking
- 0: mvcle %r6,%r8,0x0 # copy PAGE_SIZE bytes from r8 to r6 - pad with 0
- jo 0b
- j .base
- .done:
- sgr %r0,%r0 # clear register r0
- cghi %r3,0
- je .diag
- la %r4,load_psw-.base(%r13) # load psw-address into the register
- o %r3,4(%r4) # or load address into psw
- st %r3,4(%r4)
- mvc 0(8,%r0),0(%r4) # copy psw to absolute address 0
- .diag:
- diag %r0,%r0,0x308
-ENDPROC(relocate_kernel)
+SYM_CODE_START(relocate_kernel)
+ basr %r13,0 # base address
+.base:
+ lghi %r7,PAGE_SIZE # load PAGE_SIZE in r7
+ lghi %r9,PAGE_SIZE # load PAGE_SIZE in r9
+ lg %r5,0(%r2) # read another word for indirection page
+ aghi %r2,8 # increment pointer
+ tml %r5,0x1 # is it a destination page?
+ je .indir_check # NO, goto "indir_check"
+ lgr %r6,%r5 # r6 = r5
+ nill %r6,0xf000 # mask it out and...
+ j .base # ...next iteration
+.indir_check:
+ tml %r5,0x2 # is it a indirection page?
+ je .done_test # NO, goto "done_test"
+ nill %r5,0xf000 # YES, mask out,
+ lgr %r2,%r5 # move it into the right register,
+ j .base # and read next...
+.done_test:
+ tml %r5,0x4 # is it the done indicator?
+ je .source_test # NO! Well, then it should be the source indicator...
+ j .done # ok, lets finish it here...
+.source_test:
+ tml %r5,0x8 # it should be a source indicator...
+ je .base # NO, ignore it...
+ lgr %r8,%r5 # r8 = r5
+ nill %r8,0xf000 # masking
+0: mvcle %r6,%r8,0x0 # copy PAGE_SIZE bytes from r8 to r6 - pad with 0
+ jo 0b
+ j .base
+.done:
+ lgr %r0,%r4 # subcode
+ cghi %r3,0
+ je .diag
+ la %r4,load_psw-.base(%r13) # load psw-address into the register
+ o %r3,4(%r4) # or load address into psw
+ st %r3,4(%r4)
+ mvc 0(8,%r0),0(%r4) # copy psw to absolute address 0
+.diag:
+ diag %r0,%r0,0x308
+SYM_CODE_END(relocate_kernel)
- .align 8
- load_psw:
- .long 0x00080000,0x80000000
- relocate_kernel_end:
- .align 8
- .globl relocate_kernel_len
- relocate_kernel_len:
- .quad relocate_kernel_end - relocate_kernel
+ .balign 8
+SYM_DATA_START_LOCAL(load_psw)
+ .long 0x00080000,0x80000000
+SYM_DATA_END_LABEL(load_psw, SYM_L_LOCAL, relocate_kernel_end)
+ .balign 8
+SYM_DATA(relocate_kernel_len, .quad relocate_kernel_end - relocate_kernel)
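
relocate_kernel above walks the kimage entry list, where the low bits of each 8-byte entry encode its role (the kexec IND_* flags: 0x1 destination page, 0x2 indirection page, 0x4 done, 0x8 source page) and the upper bits hold a page address. A compact C model of the same loop, assuming 4 KiB pages and using memcpy() in place of mvcle; relocate() and its casts exist only for this sketch.

#include <string.h>

#define PAGE_SIZE	4096UL
#define IND_DESTINATION	0x1UL
#define IND_INDIRECTION	0x2UL
#define IND_DONE	0x4UL
#define IND_SOURCE	0x8UL

/* Copy every source page of a kexec-style entry list to its destination. */
void relocate(const unsigned long *entry)
{
	unsigned char *dest = NULL;

	for (;;) {
		unsigned long e = *entry++;

		if (e & IND_DESTINATION) {
			dest = (unsigned char *)(e & ~(PAGE_SIZE - 1));
		} else if (e & IND_INDIRECTION) {
			entry = (const unsigned long *)(e & ~(PAGE_SIZE - 1));
		} else if (e & IND_DONE) {
			return;
		} else if (e & IND_SOURCE) {
			memcpy(dest, (const void *)(e & ~(PAGE_SIZE - 1)), PAGE_SIZE);
			dest += PAGE_SIZE;
		}
	}
}
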
diff --git a/arch/s390/kernel/rethook.c b/arch/s390/kernel/rethook.c
new file mode 100644
index 000000000000..af10e6bdd34e
--- /dev/null
+++ b/arch/s390/kernel/rethook.c
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/rethook.h>
+#include <linux/kprobes.h>
+#include "rethook.h"
+
+void arch_rethook_prepare(struct rethook_node *rh, struct pt_regs *regs, bool mcount)
+{
+ rh->ret_addr = regs->gprs[14];
+ rh->frame = regs->gprs[15];
+
+ /* Replace the return addr with trampoline addr */
+ regs->gprs[14] = (unsigned long)&arch_rethook_trampoline;
+}
+NOKPROBE_SYMBOL(arch_rethook_prepare);
+
+void arch_rethook_fixup_return(struct pt_regs *regs,
+ unsigned long correct_ret_addr)
+{
+ /* Replace fake return address with real one. */
+ regs->gprs[14] = correct_ret_addr;
+}
+NOKPROBE_SYMBOL(arch_rethook_fixup_return);
+
+/*
+ * Called from arch_rethook_trampoline
+ */
+unsigned long arch_rethook_trampoline_callback(struct pt_regs *regs)
+{
+ return rethook_trampoline_handler(regs, regs->gprs[15]);
+}
+NOKPROBE_SYMBOL(arch_rethook_trampoline_callback);
+
+/* assembler function that handles the rethook must not be probed itself */
+NOKPROBE_SYMBOL(arch_rethook_trampoline);
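
The rethook glue above saves the traced function's return address (gpr 14) and stack pointer (gpr 15), points gpr 14 at the assembly trampoline, and later puts the genuine return address back. A stripped-down userspace model of that swap; every fake_* name and trampoline() are invented for the sketch.

#include <stdint.h>
#include <stdio.h>

struct fake_regs { uint64_t gprs[16]; };
struct fake_node { uint64_t ret_addr; uint64_t frame; };

static void trampoline(void) { }	/* stands in for arch_rethook_trampoline */

/* On entry of the traced function: remember the real return target, hijack r14. */
static void prepare(struct fake_node *rh, struct fake_regs *regs)
{
	rh->ret_addr = regs->gprs[14];
	rh->frame = regs->gprs[15];
	regs->gprs[14] = (uint64_t)(uintptr_t)trampoline;
}

/* From the trampoline handler: restore the real return address. */
static void fixup_return(struct fake_regs *regs, const struct fake_node *rh)
{
	regs->gprs[14] = rh->ret_addr;
}

int main(void)
{
	struct fake_regs regs = { .gprs = { [14] = 0x1000, [15] = 0x2000 } };
	struct fake_node rh;

	prepare(&rh, &regs);
	fixup_return(&regs, &rh);
	printf("r14 restored to %#llx\n", (unsigned long long)regs.gprs[14]);
	return 0;
}
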
diff --git a/arch/s390/kernel/rethook.h b/arch/s390/kernel/rethook.h
new file mode 100644
index 000000000000..32f069eed3f3
--- /dev/null
+++ b/arch/s390/kernel/rethook.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __S390_RETHOOK_H
+#define __S390_RETHOOK_H
+
+unsigned long arch_rethook_trampoline_callback(struct pt_regs *regs);
+
+#endif
diff --git a/arch/s390/kernel/runtime_instr.c b/arch/s390/kernel/runtime_instr.c
index 125c7f6e8715..1788a5454b6f 100644
--- a/arch/s390/kernel/runtime_instr.c
+++ b/arch/s390/kernel/runtime_instr.c
@@ -57,7 +57,7 @@ static void init_runtime_instr_cb(struct runtime_instr_cb *cb)
cb->k = 1;
cb->ps = 1;
cb->pc = 1;
- cb->key = PAGE_DEFAULT_KEY;
+ cb->key = PAGE_DEFAULT_KEY >> 4;
cb->v = 1;
}
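
Both the runtime-instrumentation ptrace hunk further up and init_runtime_instr_cb() here now shift PAGE_DEFAULT_KEY right by four: the constant carries the storage access-control bits in the high nibble of a key byte, while the control block stores only the bare 4-bit key. A sketch of that conversion; the PAGE_DEFAULT_ACC value of 0 is an assumption taken from mainline.

#include <stdint.h>
#include <stdio.h>

#define PAGE_DEFAULT_ACC	0U
#define PAGE_DEFAULT_KEY	(PAGE_DEFAULT_ACC << 4)	/* access bits live in bits 4-7 */

int main(void)
{
	uint8_t cb_key = PAGE_DEFAULT_KEY >> 4;	/* the cb field wants the bare 4-bit key */

	printf("storage key byte 0x%02x -> cb->key 0x%x\n", PAGE_DEFAULT_KEY, cb_key);
	return 0;
}
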
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 9cbf490fd162..d1f3b56e7afc 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -37,7 +37,7 @@
#include <linux/root_dev.h>
#include <linux/console.h>
#include <linux/kernel_stat.h>
-#include <linux/dma-contiguous.h>
+#include <linux/dma-map-ops.h>
#include <linux/device.h>
#include <linux/notifier.h>
#include <linux/pfn.h>
@@ -49,14 +49,17 @@
#include <linux/memory.h>
#include <linux/compat.h>
#include <linux/start_kernel.h>
+#include <linux/hugetlb.h>
+#include <linux/kmemleak.h>
+#include <asm/archrandom.h>
#include <asm/boot_data.h>
#include <asm/ipl.h>
#include <asm/facility.h>
#include <asm/smp.h>
#include <asm/mmu_context.h>
#include <asm/cpcmd.h>
-#include <asm/lowcore.h>
+#include <asm/abs_lowcore.h>
#include <asm/nmi.h>
#include <asm/irq.h>
#include <asm/page.h>
@@ -71,8 +74,10 @@
#include <asm/numa.h>
#include <asm/alternative.h>
#include <asm/nospec-branch.h>
-#include <asm/mem_detect.h>
+#include <asm/physmem_info.h>
+#include <asm/maccess.h>
#include <asm/uv.h>
+#include <asm/asm-offsets.h>
#include "entry.h"
/*
@@ -87,30 +92,71 @@ EXPORT_SYMBOL(console_devno);
unsigned int console_irq = -1;
EXPORT_SYMBOL(console_irq);
-unsigned long elf_hwcap __read_mostly = 0;
-char elf_platform[ELF_PLATFORM_SIZE];
+/*
+ * Some code and data needs to stay below 2 GB, even when the kernel would be
+ * relocated above 2 GB, because it has to use 31 bit addresses.
+ * Such code and data is part of the .amode31 section.
+ */
+char __amode31_ref *__samode31 = _samode31;
+char __amode31_ref *__eamode31 = _eamode31;
+char __amode31_ref *__stext_amode31 = _stext_amode31;
+char __amode31_ref *__etext_amode31 = _etext_amode31;
+struct exception_table_entry __amode31_ref *__start_amode31_ex_table = _start_amode31_ex_table;
+struct exception_table_entry __amode31_ref *__stop_amode31_ex_table = _stop_amode31_ex_table;
-unsigned long int_hwcap = 0;
+/*
+ * Control registers CR2, CR5 and CR15 are initialized with addresses
+ * of tables that must be placed below 2G which is handled by the AMODE31
+ * sections.
+ * Because the AMODE31 sections are relocated below 2G at startup,
+ * the content of control registers CR2, CR5 and CR15 must be updated
+ * with new addresses after the relocation. The initial initialization of
+ * control registers occurs in head64.S and then gets updated again after AMODE31
+ * relocation. We must access the relevant AMODE31 tables indirectly via
+ * pointers placed in the .amode31.refs linker section. Those pointers get
+ * updated automatically during AMODE31 relocation and always contain a valid
+ * address within AMODE31 sections.
+ */
-#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
-int __bootdata_preserved(prot_virt_guest);
-#endif
+static __amode31_data u32 __ctl_duct_amode31[16] __aligned(64);
+
+static __amode31_data u64 __ctl_aste_amode31[8] __aligned(64) = {
+ [1] = 0xffffffffffffffff
+};
+
+static __amode31_data u32 __ctl_duald_amode31[32] __aligned(128) = {
+ 0x80000000, 0, 0, 0,
+ 0x80000000, 0, 0, 0,
+ 0x80000000, 0, 0, 0,
+ 0x80000000, 0, 0, 0,
+ 0x80000000, 0, 0, 0,
+ 0x80000000, 0, 0, 0,
+ 0x80000000, 0, 0, 0,
+ 0x80000000, 0, 0, 0
+};
+
+static __amode31_data u32 __ctl_linkage_stack_amode31[8] __aligned(64) = {
+ 0, 0, 0x89000000, 0,
+ 0, 0, 0x8a000000, 0
+};
+
+static u64 __amode31_ref *__ctl_aste = __ctl_aste_amode31;
+static u32 __amode31_ref *__ctl_duald = __ctl_duald_amode31;
+static u32 __amode31_ref *__ctl_linkage_stack = __ctl_linkage_stack_amode31;
+static u32 __amode31_ref *__ctl_duct = __ctl_duct_amode31;
+
+unsigned long __bootdata_preserved(max_mappable);
+unsigned long __bootdata(ident_map_size);
+struct physmem_info __bootdata(physmem_info);
-int __bootdata(noexec_disabled);
-int __bootdata(memory_end_set);
-unsigned long __bootdata(memory_end);
-unsigned long __bootdata(vmalloc_size);
-unsigned long __bootdata(max_physmem_end);
-struct mem_detect_info __bootdata(mem_detect);
-
-struct exception_table_entry *__bootdata_preserved(__start_dma_ex_table);
-struct exception_table_entry *__bootdata_preserved(__stop_dma_ex_table);
-unsigned long __bootdata_preserved(__swsusp_reset_dma);
-unsigned long __bootdata_preserved(__stext_dma);
-unsigned long __bootdata_preserved(__etext_dma);
-unsigned long __bootdata_preserved(__sdma);
-unsigned long __bootdata_preserved(__edma);
unsigned long __bootdata_preserved(__kaslr_offset);
+int __bootdata_preserved(__kaslr_enabled);
+unsigned int __bootdata_preserved(zlib_dfltcc_support);
+EXPORT_SYMBOL(zlib_dfltcc_support);
+u64 __bootdata_preserved(stfle_fac_list[16]);
+EXPORT_SYMBOL(stfle_fac_list);
+u64 __bootdata_preserved(alt_stfle_fac_list[16]);
+struct oldmem_data __bootdata_preserved(oldmem_data);
unsigned long VMALLOC_START;
EXPORT_SYMBOL(VMALLOC_START);
@@ -120,6 +166,7 @@ EXPORT_SYMBOL(VMALLOC_END);
struct page *vmemmap;
EXPORT_SYMBOL(vmemmap);
+unsigned long vmemmap_size;
unsigned long MODULES_VADDR;
unsigned long MODULES_END;
@@ -128,6 +175,14 @@ unsigned long MODULES_END;
struct lowcore *lowcore_ptr[NR_CPUS];
EXPORT_SYMBOL(lowcore_ptr);
+DEFINE_STATIC_KEY_FALSE(cpu_has_bear);
+
+/*
+ * The Write Back bit position in the physaddr is given by the SLPC PCI.
+ * Leaving the mask zero always uses write through which is safe
+ */
+unsigned long mio_wb_bit_mask __ro_after_init;
+
/*
* This is set up by the setup-routine at boot-time
* for S390 need to find out, what we have to setup
@@ -161,7 +216,7 @@ static void __init set_preferred_console(void)
else if (CONSOLE_IS_3270)
add_preferred_console("tty3270", 0, NULL);
else if (CONSOLE_IS_VT220)
- add_preferred_console("ttyS", 1, NULL);
+ add_preferred_console("ttysclp", 0, NULL);
else if (CONSOLE_IS_HVC)
add_preferred_console("hvc", 0, NULL);
}
@@ -241,18 +296,16 @@ static void __init conmode_default(void)
SET_CONSOLE_SCLP;
#endif
}
- if (IS_ENABLED(CONFIG_VT) && IS_ENABLED(CONFIG_DUMMY_CONSOLE))
- conswitchp = &dummy_con;
}
#ifdef CONFIG_CRASH_DUMP
static void __init setup_zfcpdump(void)
{
- if (ipl_info.type != IPL_TYPE_FCP_DUMP)
+ if (!is_ipl_type_dump())
return;
- if (OLDMEM_BASE)
+ if (oldmem_data.start)
return;
- strcat(boot_command_line, " cio_ignore=all,!ipldev,!condev");
+ strlcat(boot_command_line, " cio_ignore=all,!ipldev,!condev", COMMAND_LINE_SIZE);
console_loglevel = 2;
}
#else
@@ -303,17 +356,17 @@ void machine_power_off(void)
void (*pm_power_off)(void) = machine_power_off;
EXPORT_SYMBOL_GPL(pm_power_off);
-void *restart_stack __section(.data);
+void *restart_stack;
unsigned long stack_alloc(void)
{
#ifdef CONFIG_VMAP_STACK
- return (unsigned long)
- __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
- VMALLOC_START, VMALLOC_END,
- THREADINFO_GFP,
- PAGE_KERNEL, 0, NUMA_NO_NODE,
- __builtin_return_address(0));
+ void *ret;
+
+ ret = __vmalloc_node(THREAD_SIZE, THREAD_SIZE, THREADINFO_GFP,
+ NUMA_NO_NODE, __builtin_return_address(0));
+ kmemleak_not_leak(ret);
+ return (unsigned long)ret;
#else
return __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER);
#endif
@@ -328,51 +381,21 @@ void stack_free(unsigned long stack)
#endif
}
-int __init arch_early_irq_init(void)
-{
- unsigned long stack;
-
- stack = __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER);
- if (!stack)
- panic("Couldn't allocate async stack");
- S390_lowcore.async_stack = stack + STACK_INIT_OFFSET;
- return 0;
-}
-
-static int __init async_stack_realloc(void)
-{
- unsigned long old, new;
-
- old = S390_lowcore.async_stack - STACK_INIT_OFFSET;
- new = stack_alloc();
- if (!new)
- panic("Couldn't allocate async stack");
- S390_lowcore.async_stack = new + STACK_INIT_OFFSET;
- free_pages(old, THREAD_SIZE_ORDER);
- return 0;
-}
-early_initcall(async_stack_realloc);
-
-void __init arch_call_rest_init(void)
+static unsigned long __init stack_alloc_early(void)
{
unsigned long stack;
- stack = stack_alloc();
- if (!stack)
- panic("Couldn't allocate kernel stack");
- current->stack = (void *) stack;
-#ifdef CONFIG_VMAP_STACK
- current->stack_vm_area = (void *) stack;
-#endif
- set_task_stack_end_magic(current);
- stack += STACK_INIT_OFFSET;
- S390_lowcore.kernel_stack = stack;
- CALL_ON_STACK_NORETURN(rest_init, stack);
+ stack = (unsigned long)memblock_alloc(THREAD_SIZE, THREAD_SIZE);
+ if (!stack) {
+ panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
+ __func__, THREAD_SIZE, THREAD_SIZE);
+ }
+ return stack;
}
-static void __init setup_lowcore_dat_off(void)
+static void __init setup_lowcore(void)
{
- struct lowcore *lc;
+ struct lowcore *lc, *abs_lc;
/*
* Setup lowcore for boot cpu
@@ -383,52 +406,40 @@ static void __init setup_lowcore_dat_off(void)
panic("%s: Failed to allocate %zu bytes align=%zx\n",
__func__, sizeof(*lc), sizeof(*lc));
- lc->restart_psw.mask = PSW_KERNEL_BITS;
- lc->restart_psw.addr = (unsigned long) restart_int_handler;
- lc->external_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK;
+ lc->restart_psw.mask = PSW_KERNEL_BITS & ~PSW_MASK_DAT;
+ lc->restart_psw.addr = __pa(restart_int_handler);
+ lc->external_new_psw.mask = PSW_KERNEL_BITS;
lc->external_new_psw.addr = (unsigned long) ext_int_handler;
- lc->svc_new_psw.mask = PSW_KERNEL_BITS |
- PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK;
+ lc->svc_new_psw.mask = PSW_KERNEL_BITS;
lc->svc_new_psw.addr = (unsigned long) system_call;
- lc->program_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK;
+ lc->program_new_psw.mask = PSW_KERNEL_BITS;
lc->program_new_psw.addr = (unsigned long) pgm_check_handler;
lc->mcck_new_psw.mask = PSW_KERNEL_BITS;
lc->mcck_new_psw.addr = (unsigned long) mcck_int_handler;
- lc->io_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK;
+ lc->io_new_psw.mask = PSW_KERNEL_BITS;
lc->io_new_psw.addr = (unsigned long) io_int_handler;
lc->clock_comparator = clock_comparator_max;
- lc->nodat_stack = ((unsigned long) &init_thread_union)
- + THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs);
lc->current_task = (unsigned long)&init_task;
lc->lpp = LPP_MAGIC;
lc->machine_flags = S390_lowcore.machine_flags;
lc->preempt_count = S390_lowcore.preempt_count;
- lc->stfl_fac_list = S390_lowcore.stfl_fac_list;
- memcpy(lc->stfle_fac_list, S390_lowcore.stfle_fac_list,
- sizeof(lc->stfle_fac_list));
- memcpy(lc->alt_stfle_fac_list, S390_lowcore.alt_stfle_fac_list,
- sizeof(lc->alt_stfle_fac_list));
- nmi_alloc_boot_cpu(lc);
- vdso_alloc_boot_cpu(lc);
- lc->sync_enter_timer = S390_lowcore.sync_enter_timer;
- lc->async_enter_timer = S390_lowcore.async_enter_timer;
+ nmi_alloc_mcesa_early(&lc->mcesad);
+ lc->sys_enter_timer = S390_lowcore.sys_enter_timer;
lc->exit_timer = S390_lowcore.exit_timer;
lc->user_timer = S390_lowcore.user_timer;
lc->system_timer = S390_lowcore.system_timer;
lc->steal_timer = S390_lowcore.steal_timer;
lc->last_update_timer = S390_lowcore.last_update_timer;
lc->last_update_clock = S390_lowcore.last_update_clock;
-
/*
* Allocate the global restart stack which is the same for
- * all CPUs in cast *one* of them does a PSW restart.
+ * all CPUs in case *one* of them does a PSW restart.
*/
- restart_stack = memblock_alloc(THREAD_SIZE, THREAD_SIZE);
- if (!restart_stack)
- panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
- __func__, THREAD_SIZE, THREAD_SIZE);
- restart_stack += STACK_INIT_OFFSET;
-
+ restart_stack = (void *)(stack_alloc_early() + STACK_INIT_OFFSET);
+ lc->mcck_stack = stack_alloc_early() + STACK_INIT_OFFSET;
+ lc->async_stack = stack_alloc_early() + STACK_INIT_OFFSET;
+ lc->nodat_stack = stack_alloc_early() + STACK_INIT_OFFSET;
+ lc->kernel_stack = S390_lowcore.kernel_stack;
/*
* Set up PSW restart to call ipl.c:do_restart(). Copy the relevant
* restart data to the absolute zero lowcore. This is necessary if
@@ -437,32 +448,32 @@ static void __init setup_lowcore_dat_off(void)
lc->restart_stack = (unsigned long) restart_stack;
lc->restart_fn = (unsigned long) do_restart;
lc->restart_data = 0;
- lc->restart_source = -1UL;
-
- /* Setup absolute zero lowcore */
- mem_assign_absolute(S390_lowcore.restart_stack, lc->restart_stack);
- mem_assign_absolute(S390_lowcore.restart_fn, lc->restart_fn);
- mem_assign_absolute(S390_lowcore.restart_data, lc->restart_data);
- mem_assign_absolute(S390_lowcore.restart_source, lc->restart_source);
- mem_assign_absolute(S390_lowcore.restart_psw, lc->restart_psw);
-
+ lc->restart_source = -1U;
lc->spinlock_lockval = arch_spin_lockval(0);
lc->spinlock_index = 0;
arch_spin_lock_setup(0);
- lc->br_r1_trampoline = 0x07f1; /* br %r1 */
-
- set_prefix((u32)(unsigned long) lc);
+ lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW);
+ lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW);
+ lc->preempt_count = PREEMPT_DISABLED;
+ lc->kernel_asce = S390_lowcore.kernel_asce;
+ lc->user_asce = S390_lowcore.user_asce;
+
+ system_ctlreg_init_save_area(lc);
+ abs_lc = get_abs_lowcore();
+ abs_lc->restart_stack = lc->restart_stack;
+ abs_lc->restart_fn = lc->restart_fn;
+ abs_lc->restart_data = lc->restart_data;
+ abs_lc->restart_source = lc->restart_source;
+ abs_lc->restart_psw = lc->restart_psw;
+ abs_lc->restart_flags = RESTART_FLAG_CTLREGS;
+ abs_lc->program_new_psw = lc->program_new_psw;
+ abs_lc->mcesad = lc->mcesad;
+ put_abs_lowcore(abs_lc);
+
+ set_prefix(__pa(lc));
lowcore_ptr[0] = lc;
-}
-
-static void __init setup_lowcore_dat_on(void)
-{
- __ctl_clear_bit(0, 28);
- S390_lowcore.external_new_psw.mask |= PSW_MASK_DAT;
- S390_lowcore.svc_new_psw.mask |= PSW_MASK_DAT;
- S390_lowcore.program_new_psw.mask |= PSW_MASK_DAT;
- S390_lowcore.io_new_psw.mask |= PSW_MASK_DAT;
- __ctl_set_bit(0, 28);
+ if (abs_lowcore_map(0, lowcore_ptr[0], false))
+ panic("Couldn't setup absolute lowcore");
}
static struct resource code_resource = {
@@ -489,8 +500,9 @@ static struct resource __initdata *standard_resources[] = {
static void __init setup_resources(void)
{
struct resource *res, *std_res, *sub_res;
- struct memblock_region *reg;
+ phys_addr_t start, end;
int j;
+ u64 i;
code_resource.start = (unsigned long) _text;
code_resource.end = (unsigned long) _etext - 1;
@@ -499,7 +511,7 @@ static void __init setup_resources(void)
bss_resource.start = (unsigned long) __bss_start;
bss_resource.end = (unsigned long) __bss_stop - 1;
- for_each_memblock(memory, reg) {
+ for_each_mem_range(i, &start, &end) {
res = memblock_alloc(sizeof(*res), 8);
if (!res)
panic("%s: Failed to allocate %zu bytes align=0x%x\n",
@@ -507,8 +519,13 @@ static void __init setup_resources(void)
res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
res->name = "System RAM";
- res->start = reg->base;
- res->end = reg->base + reg->size - 1;
+ res->start = start;
+ /*
+ * In memblock, end points to the first byte after the
+ * range while in resources, end points to the last byte in
+ * the range.
+ */
+ res->end = end - 1;
request_resource(&iomem_resource, res);
for (j = 0; j < ARRAY_SIZE(standard_resources); j++) {
@@ -539,7 +556,8 @@ static void __init setup_resources(void)
* part of the System RAM resource.
*/
if (crashk_res.end) {
- memblock_add_node(crashk_res.start, resource_size(&crashk_res), 0);
+ memblock_add_node(crashk_res.start, resource_size(&crashk_res),
+ 0, MEMBLOCK_NONE);
memblock_reserve(crashk_res.start, resource_size(&crashk_res));
insert_resource(&iomem_resource, &crashk_res);
}
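
The comment added above calls out the off-by-one trap in setup_resources(): memblock hands back half-open ranges, [start, end), while struct resource ends are inclusive, so the conversion needs end - 1. A one-line demonstration with made-up numbers.

#include <stdio.h>

int main(void)
{
	unsigned long start = 0x100000, end = 0x200000;	/* memblock: end is one past the range */
	unsigned long res_start = start;
	unsigned long res_end = end - 1;		/* resource: end is the last byte */

	printf("resource [%#lx-%#lx], %lu bytes\n", res_start, res_end,
	       res_end - res_start + 1);
	return 0;
}
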
@@ -548,56 +566,17 @@ static void __init setup_resources(void)
static void __init setup_memory_end(void)
{
- unsigned long vmax, tmp;
-
- /* Choose kernel address space layout: 3 or 4 levels. */
- if (IS_ENABLED(CONFIG_KASAN)) {
- vmax = IS_ENABLED(CONFIG_KASAN_S390_4_LEVEL_PAGING)
- ? _REGION1_SIZE
- : _REGION2_SIZE;
- } else {
- tmp = (memory_end ?: max_physmem_end) / PAGE_SIZE;
- tmp = tmp * (sizeof(struct page) + PAGE_SIZE);
- if (tmp + vmalloc_size + MODULES_LEN <= _REGION2_SIZE)
- vmax = _REGION2_SIZE; /* 3-level kernel page table */
- else
- vmax = _REGION1_SIZE; /* 4-level kernel page table */
- }
-
- /* module area is at the end of the kernel address space. */
- MODULES_END = vmax;
- MODULES_VADDR = MODULES_END - MODULES_LEN;
- VMALLOC_END = MODULES_VADDR;
- VMALLOC_START = VMALLOC_END - vmalloc_size;
-
- /* Split remaining virtual space between 1:1 mapping & vmemmap array */
- tmp = VMALLOC_START / (PAGE_SIZE + sizeof(struct page));
- /* vmemmap contains a multiple of PAGES_PER_SECTION struct pages */
- tmp = SECTION_ALIGN_UP(tmp);
- tmp = VMALLOC_START - tmp * sizeof(struct page);
- tmp &= ~((vmax >> 11) - 1); /* align to page table level */
- tmp = min(tmp, 1UL << MAX_PHYSMEM_BITS);
- vmemmap = (struct page *) tmp;
-
- /* Take care that memory_end is set and <= vmemmap */
- memory_end = min(memory_end ?: max_physmem_end, (unsigned long)vmemmap);
-#ifdef CONFIG_KASAN
- /* fit in kasan shadow memory region between 1:1 and vmemmap */
- memory_end = min(memory_end, KASAN_SHADOW_START);
- vmemmap = max(vmemmap, (struct page *)KASAN_SHADOW_END);
-#endif
- max_pfn = max_low_pfn = PFN_DOWN(memory_end);
- memblock_remove(memory_end, ULONG_MAX);
-
- pr_notice("The maximum memory size is %luMB\n", memory_end >> 20);
+ max_pfn = max_low_pfn = PFN_DOWN(ident_map_size);
+ pr_notice("The maximum memory size is %luMB\n", ident_map_size >> 20);
}
#ifdef CONFIG_CRASH_DUMP
/*
- * When kdump is enabled, we have to ensure that no memory from
- * the area [0 - crashkernel memory size] and
- * [crashk_res.start - crashk_res.end] is set offline.
+ * When kdump is enabled, we have to ensure that no memory from the area
+ * [0 - crashkernel memory size] is set offline - it will be exchanged with
+ * the crashkernel memory region when kdump is triggered. The crashkernel
+ * memory region can never get offlined (pages are unmovable).
*/
static int kdump_mem_notifier(struct notifier_block *nb,
unsigned long action, void *data)
@@ -608,11 +587,7 @@ static int kdump_mem_notifier(struct notifier_block *nb,
return NOTIFY_OK;
if (arg->start_pfn < PFN_DOWN(resource_size(&crashk_res)))
return NOTIFY_BAD;
- if (arg->start_pfn > PFN_DOWN(crashk_res.end))
- return NOTIFY_OK;
- if (arg->start_pfn + arg->nr_pages - 1 < PFN_DOWN(crashk_res.start))
- return NOTIFY_OK;
- return NOTIFY_BAD;
+ return NOTIFY_OK;
}
static struct notifier_block kdump_mem_nb = {
@@ -622,36 +597,15 @@ static struct notifier_block kdump_mem_nb = {
#endif
/*
- * Make sure that the area behind memory_end is protected
- */
-static void reserve_memory_end(void)
-{
- if (memory_end_set)
- memblock_reserve(memory_end, ULONG_MAX);
-}
-
-/*
- * Make sure that oldmem, where the dump is stored, is protected
+ * Reserve page tables created by decompressor
*/
-static void reserve_oldmem(void)
+static void __init reserve_pgtables(void)
{
-#ifdef CONFIG_CRASH_DUMP
- if (OLDMEM_BASE)
- /* Forget all memory above the running kdump system */
- memblock_reserve(OLDMEM_SIZE, (phys_addr_t)ULONG_MAX);
-#endif
-}
+ unsigned long start, end;
+ struct reserved_range *range;
-/*
- * Make sure that oldmem, where the dump is stored, is protected
- */
-static void remove_oldmem(void)
-{
-#ifdef CONFIG_CRASH_DUMP
- if (OLDMEM_BASE)
- /* Forget all memory above the running kdump system */
- memblock_remove(OLDMEM_SIZE, (phys_addr_t)ULONG_MAX);
-#endif
+ for_each_physmem_reserved_type_range(RR_VMEM, range, &start, &end)
+ memblock_reserve(start, end - start);
}
/*
@@ -664,8 +618,8 @@ static void __init reserve_crashkernel(void)
phys_addr_t low, high;
int rc;
- rc = parse_crashkernel(boot_command_line, memory_end, &crash_size,
- &crash_base);
+ rc = parse_crashkernel(boot_command_line, ident_map_size,
+ &crash_size, &crash_base, NULL, NULL);
crash_base = ALIGN(crash_base, KEXEC_CRASH_MEM_ALIGN);
crash_size = ALIGN(crash_size, KEXEC_CRASH_MEM_ALIGN);
@@ -678,9 +632,9 @@ static void __init reserve_crashkernel(void)
return;
}
- low = crash_base ?: OLDMEM_BASE;
+ low = crash_base ?: oldmem_data.start;
high = low + crash_size;
- if (low >= OLDMEM_BASE && high <= OLDMEM_BASE + OLDMEM_SIZE) {
+ if (low >= oldmem_data.start && high <= oldmem_data.start + oldmem_data.size) {
/* The crashkernel fits into OLDMEM, reuse OLDMEM */
crash_base = low;
} else {
@@ -694,8 +648,9 @@ static void __init reserve_crashkernel(void)
return;
}
low = crash_base ?: low;
- crash_base = memblock_find_in_range(low, high, crash_size,
- KEXEC_CRASH_MEM_ALIGN);
+ crash_base = memblock_phys_alloc_range(crash_size,
+ KEXEC_CRASH_MEM_ALIGN,
+ low, high);
}
if (!crash_base) {
@@ -704,10 +659,12 @@ static void __init reserve_crashkernel(void)
return;
}
- if (register_memory_notifier(&kdump_mem_nb))
+ if (register_memory_notifier(&kdump_mem_nb)) {
+ memblock_phys_free(crash_base, crash_size);
return;
+ }
- if (!OLDMEM_BASE && MACHINE_IS_VM)
+ if (!oldmem_data.start && MACHINE_IS_VM)
diag10_range(PFN_DOWN(crash_base), PFN_DOWN(crash_size));
crashk_res.start = crash_base;
crashk_res.end = crash_base + crash_size - 1;
@@ -725,13 +682,13 @@ static void __init reserve_crashkernel(void)
*/
static void __init reserve_initrd(void)
{
-#ifdef CONFIG_BLK_DEV_INITRD
- if (!INITRD_START || !INITRD_SIZE)
+ unsigned long addr, size;
+
+ if (!IS_ENABLED(CONFIG_BLK_DEV_INITRD) || !get_physmem_reserved(RR_INITRD, &addr, &size))
return;
- initrd_start = INITRD_START;
- initrd_end = initrd_start + INITRD_SIZE;
- memblock_reserve(INITRD_START, INITRD_SIZE);
-#endif
+ initrd_start = (unsigned long)__va(addr);
+ initrd_end = initrd_start + size;
+ memblock_reserve(addr, size);
}
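
reserve_initrd() above now takes the physical range recorded by the decompressor (RR_INITRD) and converts it with __va() before publishing initrd_start/initrd_end, instead of relying on the old INITRD_START/INITRD_SIZE globals. A userspace sketch of that phys-to-virt step; the direct-map bias and the sample range are invented for illustration and do not reflect the real s390 layout.

#include <stdio.h>

#define FAKE_DIRECTMAP_BIAS	0x40000000UL	/* stand-in for whatever __va() adds */

static unsigned long fake_va(unsigned long phys)
{
	return phys + FAKE_DIRECTMAP_BIAS;
}

int main(void)
{
	unsigned long addr = 0x04000000UL, size = 0x01000000UL;	/* hypothetical initrd range */
	unsigned long initrd_start = fake_va(addr);
	unsigned long initrd_end = initrd_start + size;

	printf("initrd mapped at [%#lx-%#lx)\n", initrd_start, initrd_end);
	return 0;
}
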
/*
@@ -743,75 +700,37 @@ static void __init reserve_certificate_list(void)
memblock_reserve(ipl_cert_list_addr, ipl_cert_list_size);
}
-static void __init reserve_mem_detect_info(void)
+static void __init reserve_physmem_info(void)
{
- unsigned long start, size;
+ unsigned long addr, size;
- get_mem_detect_reserved(&start, &size);
- if (size)
- memblock_reserve(start, size);
+ if (get_physmem_reserved(RR_MEM_DETECT_EXTENDED, &addr, &size))
+ memblock_reserve(addr, size);
}
-static void __init free_mem_detect_info(void)
+static void __init free_physmem_info(void)
{
- unsigned long start, size;
+ unsigned long addr, size;
- get_mem_detect_reserved(&start, &size);
- if (size)
- memblock_free(start, size);
+ if (get_physmem_reserved(RR_MEM_DETECT_EXTENDED, &addr, &size))
+ memblock_phys_free(addr, size);
}
-static void __init memblock_physmem_add(phys_addr_t start, phys_addr_t size)
-{
- memblock_dbg("memblock_physmem_add: [%#016llx-%#016llx]\n",
- start, start + size - 1);
- memblock_add_range(&memblock.memory, start, size, 0, 0);
- memblock_add_range(&memblock.physmem, start, size, 0, 0);
-}
-
-static const char * __init get_mem_info_source(void)
-{
- switch (mem_detect.info_source) {
- case MEM_DETECT_SCLP_STOR_INFO:
- return "sclp storage info";
- case MEM_DETECT_DIAG260:
- return "diag260";
- case MEM_DETECT_SCLP_READ_INFO:
- return "sclp read info";
- case MEM_DETECT_BIN_SEARCH:
- return "binary search";
- }
- return "none";
-}
-
-static void __init memblock_add_mem_detect_info(void)
+static void __init memblock_add_physmem_info(void)
{
unsigned long start, end;
int i;
- memblock_dbg("physmem info source: %s (%hhd)\n",
- get_mem_info_source(), mem_detect.info_source);
+ pr_debug("physmem info source: %s (%hhd)\n",
+ get_physmem_info_source(), physmem_info.info_source);
/* keep memblock lists close to the kernel */
memblock_set_bottom_up(true);
- for_each_mem_detect_block(i, &start, &end)
+ for_each_physmem_usable_range(i, &start, &end)
+ memblock_add(start, end - start);
+ for_each_physmem_online_range(i, &start, &end)
memblock_physmem_add(start, end - start);
memblock_set_bottom_up(false);
- memblock_dump_all();
-}
-
-/*
- * Check for initrd being in usable memory
- */
-static void __init check_initrd(void)
-{
-#ifdef CONFIG_BLK_DEV_INITRD
- if (INITRD_START && INITRD_SIZE &&
- !memblock_is_region_memory(INITRD_START, INITRD_SIZE)) {
- pr_err("The initial RAM disk does not fit into the memory\n");
- memblock_free(INITRD_START, INITRD_SIZE);
- initrd_start = initrd_end = 0;
- }
-#endif
+ memblock_set_node(0, ULONG_MAX, &memblock.memory, 0);
}
/*
@@ -819,176 +738,68 @@ static void __init check_initrd(void)
*/
static void __init reserve_kernel(void)
{
- unsigned long start_pfn = PFN_UP(__pa(_end));
-
- memblock_reserve(0, HEAD_END);
- memblock_reserve((unsigned long)_stext, PFN_PHYS(start_pfn)
- - (unsigned long)_stext);
- memblock_reserve(__sdma, __edma - __sdma);
+ memblock_reserve(0, STARTUP_NORMAL_OFFSET);
+ memblock_reserve(OLDMEM_BASE, sizeof(unsigned long));
+ memblock_reserve(OLDMEM_SIZE, sizeof(unsigned long));
+ memblock_reserve(physmem_info.reserved[RR_AMODE31].start, __eamode31 - __samode31);
+ memblock_reserve(__pa(sclp_early_sccb), EXT_SCCB_READ_SCP);
+ memblock_reserve(__pa(_stext), _end - _stext);
}
static void __init setup_memory(void)
{
- struct memblock_region *reg;
+ phys_addr_t start, end;
+ u64 i;
/*
* Init storage key for present memory
*/
- for_each_memblock(memory, reg) {
- storage_key_init_range(reg->base, reg->base + reg->size);
- }
- psw_set_key(PAGE_DEFAULT_KEY);
+ for_each_mem_range(i, &start, &end)
+ storage_key_init_range(start, end);
- /* Only cosmetics */
- memblock_enforce_memory_limit(memblock_end_of_DRAM());
+ psw_set_key(PAGE_DEFAULT_KEY);
}
-/*
- * Setup hardware capabilities.
- */
-static int __init setup_hwcaps(void)
+static void __init relocate_amode31_section(void)
{
- static const int stfl_bits[6] = { 0, 2, 7, 17, 19, 21 };
- struct cpuid cpu_id;
- int i;
+ unsigned long amode31_size = __eamode31 - __samode31;
+ long amode31_offset, *ptr;
- /*
- * The store facility list bits numbers as found in the principles
- * of operation are numbered with bit 1UL<<31 as number 0 to
- * bit 1UL<<0 as number 31.
- * Bit 0: instructions named N3, "backported" to esa-mode
- * Bit 2: z/Architecture mode is active
- * Bit 7: the store-facility-list-extended facility is installed
- * Bit 17: the message-security assist is installed
- * Bit 19: the long-displacement facility is installed
- * Bit 21: the extended-immediate facility is installed
- * Bit 22: extended-translation facility 3 is installed
- * Bit 30: extended-translation facility 3 enhancement facility
- * These get translated to:
- * HWCAP_S390_ESAN3 bit 0, HWCAP_S390_ZARCH bit 1,
- * HWCAP_S390_STFLE bit 2, HWCAP_S390_MSA bit 3,
- * HWCAP_S390_LDISP bit 4, HWCAP_S390_EIMM bit 5 and
- * HWCAP_S390_ETF3EH bit 8 (22 && 30).
- */
- for (i = 0; i < 6; i++)
- if (test_facility(stfl_bits[i]))
- elf_hwcap |= 1UL << i;
-
- if (test_facility(22) && test_facility(30))
- elf_hwcap |= HWCAP_S390_ETF3EH;
-
- /*
- * Check for additional facilities with store-facility-list-extended.
- * stfle stores doublewords (8 byte) with bit 1ULL<<63 as bit 0
- * and 1ULL<<0 as bit 63. Bits 0-31 contain the same information
- * as stored by stfl, bits 32-xxx contain additional facilities.
- * How many facility words are stored depends on the number of
- * doublewords passed to the instruction. The additional facilities
- * are:
- * Bit 42: decimal floating point facility is installed
- * Bit 44: perform floating point operation facility is installed
- * translated to:
- * HWCAP_S390_DFP bit 6 (42 && 44).
- */
- if ((elf_hwcap & (1UL << 2)) && test_facility(42) && test_facility(44))
- elf_hwcap |= HWCAP_S390_DFP;
+ amode31_offset = physmem_info.reserved[RR_AMODE31].start - (unsigned long)__samode31;
+ pr_info("Relocating AMODE31 section of size 0x%08lx\n", amode31_size);
- /*
- * Huge page support HWCAP_S390_HPAGE is bit 7.
- */
- if (MACHINE_HAS_EDAT1)
- elf_hwcap |= HWCAP_S390_HPAGE;
+ /* Move original AMODE31 section to the new one */
+ memmove((void *)physmem_info.reserved[RR_AMODE31].start, __samode31, amode31_size);
+ /* Zero out the old AMODE31 section to catch invalid accesses within it */
+ memset(__samode31, 0, amode31_size);
- /*
- * 64-bit register support for 31-bit processes
- * HWCAP_S390_HIGH_GPRS is bit 9.
- */
- elf_hwcap |= HWCAP_S390_HIGH_GPRS;
-
- /*
- * Transactional execution support HWCAP_S390_TE is bit 10.
- */
- if (MACHINE_HAS_TE)
- elf_hwcap |= HWCAP_S390_TE;
-
- /*
- * Vector extension HWCAP_S390_VXRS is bit 11. The Vector extension
- * can be disabled with the "novx" parameter. Use MACHINE_HAS_VX
- * instead of facility bit 129.
- */
- if (MACHINE_HAS_VX) {
- elf_hwcap |= HWCAP_S390_VXRS;
- if (test_facility(134))
- elf_hwcap |= HWCAP_S390_VXRS_EXT;
- if (test_facility(135))
- elf_hwcap |= HWCAP_S390_VXRS_BCD;
- if (test_facility(148))
- elf_hwcap |= HWCAP_S390_VXRS_EXT2;
- if (test_facility(152))
- elf_hwcap |= HWCAP_S390_VXRS_PDE;
- }
- if (test_facility(150))
- elf_hwcap |= HWCAP_S390_SORT;
- if (test_facility(151))
- elf_hwcap |= HWCAP_S390_DFLT;
+ /* Update all AMODE31 region references */
+ for (ptr = _start_amode31_refs; ptr != _end_amode31_refs; ptr++)
+ *ptr += amode31_offset;
+}
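relocate_amode31_section() copies the section to its new home, poisons the old copy, and then walks the linker-generated table of references between _start_amode31_refs and _end_amode31_refs, adding the relocation offset to each entry. A self-contained sketch of that move-and-fixup pattern (data and names invented):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

static char old_section[64] = "payload that gets moved";
static char new_section[64];

int main(void)
{
	/* Pretend these are addresses recorded at link time. */
	intptr_t refs[2] = {
		(intptr_t)(old_section + 0),
		(intptr_t)(old_section + 8),
	};
	intptr_t offset = (intptr_t)new_section - (intptr_t)old_section;

	/* Move the section and poison the old copy. */
	memmove(new_section, old_section, sizeof(new_section));
	memset(old_section, 0, sizeof(old_section));

	/* Fix up every recorded reference by the relocation offset. */
	for (size_t i = 0; i < sizeof(refs) / sizeof(refs[0]); i++)
		refs[i] += offset;

	printf("offset %ld, ref[0] now points at: %.7s\n",
	       (long)offset, (const char *)refs[0]);
	return 0;
}
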
- /*
- * Guarded storage support HWCAP_S390_GS is bit 12.
- */
- if (MACHINE_HAS_GS)
- elf_hwcap |= HWCAP_S390_GS;
-
- get_cpu_id(&cpu_id);
- add_device_randomness(&cpu_id, sizeof(cpu_id));
- switch (cpu_id.machine) {
- case 0x2064:
- case 0x2066:
- default: /* Use "z900" as default for 64 bit kernels. */
- strcpy(elf_platform, "z900");
- break;
- case 0x2084:
- case 0x2086:
- strcpy(elf_platform, "z990");
- break;
- case 0x2094:
- case 0x2096:
- strcpy(elf_platform, "z9-109");
- break;
- case 0x2097:
- case 0x2098:
- strcpy(elf_platform, "z10");
- break;
- case 0x2817:
- case 0x2818:
- strcpy(elf_platform, "z196");
- break;
- case 0x2827:
- case 0x2828:
- strcpy(elf_platform, "zEC12");
- break;
- case 0x2964:
- case 0x2965:
- strcpy(elf_platform, "z13");
- break;
- case 0x3906:
- case 0x3907:
- strcpy(elf_platform, "z14");
- break;
- case 0x8561:
- case 0x8562:
- strcpy(elf_platform, "z15");
- break;
- }
+/* This must be called after AMODE31 relocation */
+static void __init setup_cr(void)
+{
+ union ctlreg2 cr2;
+ union ctlreg5 cr5;
+ union ctlreg15 cr15;
- /*
- * Virtualization support HWCAP_INT_SIE is bit 0.
- */
- if (sclp.has_sief2)
- int_hwcap |= HWCAP_INT_SIE;
+ __ctl_duct[1] = (unsigned long)__ctl_aste;
+ __ctl_duct[2] = (unsigned long)__ctl_aste;
+ __ctl_duct[4] = (unsigned long)__ctl_duald;
- return 0;
+ /* Update control registers CR2, CR5 and CR15 */
+ local_ctl_store(2, &cr2.reg);
+ local_ctl_store(5, &cr5.reg);
+ local_ctl_store(15, &cr15.reg);
+ cr2.ducto = (unsigned long)__ctl_duct >> 6;
+ cr5.pasteo = (unsigned long)__ctl_duct >> 6;
+ cr15.lsea = (unsigned long)__ctl_linkage_stack >> 3;
+ system_ctl_load(2, &cr2.reg);
+ system_ctl_load(5, &cr5.reg);
+ system_ctl_load(15, &cr15.reg);
}
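setup_cr() stores the DUCT and linkage-stack origins into CR2, CR5 and CR15; the hardware fields hold only the high-order address bits, so the code writes the addresses shifted right by 6 (and by 3 for the linkage stack) rather than the raw pointers. A tiny sketch of that pack and round-trip arithmetic (the address value is invented):

#include <stdio.h>
#include <stdint.h>
#include <assert.h>

int main(void)
{
	/* Hypothetical, 64-byte aligned DUCT address. */
	uint64_t duct = 0x0000000001234000ULL;
	assert((duct & 0x3f) == 0);

	/* Store only the high-order bits, as the control register does. */
	uint64_t ducto = duct >> 6;

	/* Recover the full address by shifting back. */
	uint64_t restored = ducto << 6;

	printf("duct 0x%llx -> field 0x%llx -> 0x%llx\n",
	       (unsigned long long)duct,
	       (unsigned long long)ducto,
	       (unsigned long long)restored);
	return 0;
}
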
-arch_initcall(setup_hwcaps);
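The removed setup_hwcaps() comments describe the store-facility-list convention in which bit 0 is the most significant bit of the first byte. A short sketch of testing a facility number under that MSB-0 numbering (the helper name is made up; the kernel's test_facility() does the equivalent lookup):

#include <stdio.h>

/* MSB-0 numbering: bit 0 is the most significant bit of byte 0. */
static int test_bit_msb0(const unsigned char *list, unsigned int nr)
{
	return (list[nr >> 3] & (0x80 >> (nr & 7))) != 0;
}

int main(void)
{
	/* Invented facility list: bits 0, 2 and 17 set. */
	unsigned char fac[4] = { 0xa0, 0x00, 0x40, 0x00 };

	printf("facility 2:  %d\n", test_bit_msb0(fac, 2));   /* 1 */
	printf("facility 17: %d\n", test_bit_msb0(fac, 17));  /* 1 */
	printf("facility 21: %d\n", test_bit_msb0(fac, 21));  /* 0 */
	return 0;
}
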
/*
* Add system information as device randomness
@@ -997,30 +808,15 @@ static void __init setup_randomness(void)
{
struct sysinfo_3_2_2 *vmms;
- vmms = (struct sysinfo_3_2_2 *) memblock_phys_alloc(PAGE_SIZE,
- PAGE_SIZE);
+ vmms = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
if (!vmms)
panic("Failed to allocate memory for sysinfo structure\n");
-
if (stsi(vmms, 3, 2, 2) == 0 && vmms->count)
add_device_randomness(&vmms->vm, sizeof(vmms->vm[0]) * vmms->count);
- memblock_free((unsigned long) vmms, PAGE_SIZE);
-}
-
-/*
- * Find the correct size for the task_struct. This depends on
- * the size of the struct fpu at the end of the thread_struct
- * which is embedded in the task_struct.
- */
-static void __init setup_task_size(void)
-{
- int task_size = sizeof(struct task_struct);
+ memblock_free(vmms, PAGE_SIZE);
- if (!MACHINE_HAS_VX) {
- task_size -= sizeof(__vector128) * __NUM_VXRS;
- task_size += sizeof(freg_t) * __NUM_FPRS;
- }
- arch_task_struct_size = task_size;
+ if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_TRNG))
+ static_branch_enable(&s390_arch_random_available);
}
/*
@@ -1031,8 +827,7 @@ static void __init setup_control_program_code(void)
{
union diag318_info diag318_info = {
.cpnc = CPNC_LINUX,
- .cpvc_linux = 0,
- .cpvc_distro = {0},
+ .cpvc = 0,
};
if (!sclp.has_diag318)
@@ -1052,11 +847,11 @@ static void __init log_component_list(void)
if (!early_ipl_comp_list_addr)
return;
- if (ipl_block.hdr.flags & IPL_PL_FLAG_IPLSR)
+ if (ipl_block.hdr.flags & IPL_PL_FLAG_SIPL)
pr_info("Linux is running with Secure-IPL enabled\n");
else
pr_info("Linux is running with Secure-IPL disabled\n");
- ptr = (void *) early_ipl_comp_list_addr;
+ ptr = __va(early_ipl_comp_list_addr);
end = (void *) ptr + early_ipl_comp_list_size;
pr_info("The IPL report contains the following components:\n");
while (ptr < end) {
@@ -1102,14 +897,12 @@ void __init setup_arch(char **cmdline_p)
ROOT_DEV = Root_RAM0;
- init_mm.start_code = (unsigned long) _text;
- init_mm.end_code = (unsigned long) _etext;
- init_mm.end_data = (unsigned long) _edata;
- init_mm.brk = (unsigned long) _end;
+ setup_initial_init_mm(_text, _etext, _edata, _end);
if (IS_ENABLED(CONFIG_EXPOLINE_AUTO))
nospec_auto_detect();
+ jump_label_init();
parse_early_param();
#ifdef CONFIG_CRASH_DUMP
/* Deactivate elfcorehdr= kernel parameter */
@@ -1118,49 +911,44 @@ void __init setup_arch(char **cmdline_p)
os_info_init();
setup_ipl();
- setup_task_size();
setup_control_program_code();
/* Do some memory reservations *before* memory is added to memblock */
- reserve_memory_end();
- reserve_oldmem();
+ reserve_pgtables();
reserve_kernel();
reserve_initrd();
reserve_certificate_list();
- reserve_mem_detect_info();
+ reserve_physmem_info();
+ memblock_set_current_limit(ident_map_size);
memblock_allow_resize();
/* Get information about *all* installed memory */
- memblock_add_mem_detect_info();
-
- free_mem_detect_info();
- remove_oldmem();
-
- /*
- * Make sure all chunks are MAX_ORDER aligned so we don't need the
- * extra checks that HOLES_IN_ZONE would require.
- *
- * Is this still required?
- */
- memblock_trim_memory(1UL << (MAX_ORDER - 1 + PAGE_SHIFT));
+ memblock_add_physmem_info();
+ free_physmem_info();
setup_memory_end();
+ memblock_dump_all();
setup_memory();
- dma_contiguous_reserve(memory_end);
+
+ relocate_amode31_section();
+ setup_cr();
+ setup_uv();
+ dma_contiguous_reserve(ident_map_size);
vmcp_cma_reserve();
+ if (MACHINE_HAS_EDAT2)
+ hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT);
- check_initrd();
reserve_crashkernel();
#ifdef CONFIG_CRASH_DUMP
/*
- * Be aware that smp_save_dump_cpus() triggers a system reset.
+ * Be aware that smp_save_dump_secondary_cpus() triggers a system reset.
* Therefore CPU and device initialization should be done afterwards.
*/
- smp_save_dump_cpus();
+ smp_save_dump_secondary_cpus();
#endif
setup_resources();
- setup_lowcore_dat_off();
+ setup_lowcore();
smp_fill_possible_mask();
cpu_detect_mhz_feature();
cpu_init();
@@ -1168,8 +956,11 @@ void __init setup_arch(char **cmdline_p)
smp_detect_cpus();
topology_init_early();
+ if (test_facility(193))
+ static_branch_enable(&cpu_has_bear);
+
/*
- * Create kernel page tables and switch to virtual addressing.
+ * Create kernel page tables.
*/
paging_init();
@@ -1177,7 +968,9 @@ void __init setup_arch(char **cmdline_p)
* After paging_init created the kernel page table, the new PSWs
* in lowcore can now run with DAT enabled.
*/
- setup_lowcore_dat_on();
+#ifdef CONFIG_CRASH_DUMP
+ smp_save_dump_ipl_cpu();
+#endif
/* Setup default console */
conmode_default();
@@ -1187,7 +980,7 @@ void __init setup_arch(char **cmdline_p)
if (IS_ENABLED(CONFIG_EXPOLINE))
nospec_init_branches();
- /* Setup zfcpdump support */
+ /* Setup zfcp/nvme dump support */
setup_zfcpdump();
/* Add system specific data to the random pool */
diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c
index e6fca5498e1f..43e9661cd715 100644
--- a/arch/s390/kernel/signal.c
+++ b/arch/s390/kernel/signal.c
@@ -12,10 +12,12 @@
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
+#include <linux/rseq.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/kernel.h>
#include <linux/signal.h>
+#include <linux/entry-common.h>
#include <linux/errno.h>
#include <linux/wait.h>
#include <linux/ptrace.h>
@@ -24,13 +26,13 @@
#include <linux/tty.h>
#include <linux/personality.h>
#include <linux/binfmts.h>
-#include <linux/tracehook.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <asm/ucontext.h>
#include <linux/uaccess.h>
#include <asm/lowcore.h>
#include <asm/switch_to.h>
+#include <asm/vdso.h>
#include "entry.h"
/*
@@ -139,7 +141,7 @@ static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs)
{
_sigregs user_sregs;
- /* Alwys make any pending restarted system call return -EINTR */
+ /* Always make any pending restarted system call return -EINTR */
current->restart_block.fn = do_no_restart_syscall;
if (__copy_from_user(&user_sregs, sregs, sizeof(user_sregs)))
@@ -148,10 +150,6 @@ static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs)
if (!is_ri_task(current) && (user_sregs.regs.psw.mask & PSW_MASK_RI))
return -EINVAL;
- /* Test the floating-point-control word. */
- if (test_fp_ctl(user_sregs.fpregs.fpc))
- return -EINVAL;
-
/* Use regs->psw.mask instead of PSW_USER_BITS to preserve PER bit. */
regs->psw.mask = (regs->psw.mask & ~(PSW_MASK_USER | PSW_MASK_RI)) |
(user_sregs.regs.psw.mask & (PSW_MASK_USER | PSW_MASK_RI));
@@ -181,9 +179,9 @@ static int save_sigregs_ext(struct pt_regs *regs,
int i;
/* Save vector registers to signal stack */
- if (MACHINE_HAS_VX) {
+ if (cpu_has_vx()) {
for (i = 0; i < __NUM_VXRS_LOW; i++)
- vxrs[i] = *((__u64 *)(current->thread.fpu.vxrs + i) + 1);
+ vxrs[i] = current->thread.fpu.vxrs[i].low;
if (__copy_to_user(&sregs_ext->vxrs_low, vxrs,
sizeof(sregs_ext->vxrs_low)) ||
__copy_to_user(&sregs_ext->vxrs_high,
@@ -201,7 +199,7 @@ static int restore_sigregs_ext(struct pt_regs *regs,
int i;
/* Restore vector registers from signal stack */
- if (MACHINE_HAS_VX) {
+ if (cpu_has_vx()) {
if (__copy_from_user(vxrs, &sregs_ext->vxrs_low,
sizeof(sregs_ext->vxrs_low)) ||
__copy_from_user(current->thread.fpu.vxrs + __NUM_VXRS_LOW,
@@ -209,7 +207,7 @@ static int restore_sigregs_ext(struct pt_regs *regs,
sizeof(sregs_ext->vxrs_high)))
return -EFAULT;
for (i = 0; i < __NUM_VXRS_LOW; i++)
- *((__u64 *)(current->thread.fpu.vxrs + i) + 1) = vxrs[i];
+ current->thread.fpu.vxrs[i].low = vxrs[i];
}
return 0;
}
@@ -299,7 +297,7 @@ static int setup_frame(int sig, struct k_sigaction *ka,
* included in the signal frame on a 31-bit system.
*/
frame_size = sizeof(*frame) - sizeof(frame->sregs_ext);
- if (MACHINE_HAS_VX)
+ if (cpu_has_vx())
frame_size += sizeof(frame->sregs_ext);
frame = get_sigframe(ka, regs, frame_size);
if (frame == (void __user *) -1UL)
@@ -332,15 +330,10 @@ static int setup_frame(int sig, struct k_sigaction *ka,
/* Set up to return from userspace. If provided, use a stub
already in userspace. */
- if (ka->sa.sa_flags & SA_RESTORER) {
+ if (ka->sa.sa_flags & SA_RESTORER)
restorer = (unsigned long) ka->sa.sa_restorer;
- } else {
- /* Signal frame without vector registers are short ! */
- __u16 __user *svc = (void __user *) frame + frame_size - 2;
- if (__put_user(S390_SYSCALL_OPCODE | __NR_sigreturn, svc))
- return -EFAULT;
- restorer = (unsigned long) svc;
- }
+ else
+ restorer = VDSO64_SYMBOL(current, sigreturn);
/* Set up registers for signal handler */
regs->gprs[14] = restorer;
@@ -381,7 +374,7 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set,
* included in the signal frame on a 31-bit system.
*/
uc_flags = 0;
- if (MACHINE_HAS_VX) {
+ if (cpu_has_vx()) {
frame_size += sizeof(_sigregs_ext);
uc_flags |= UC_VXRS;
}
@@ -395,14 +388,10 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set,
/* Set up to return from userspace. If provided, use a stub
already in userspace. */
- if (ksig->ka.sa.sa_flags & SA_RESTORER) {
+ if (ksig->ka.sa.sa_flags & SA_RESTORER)
restorer = (unsigned long) ksig->ka.sa.sa_restorer;
- } else {
- __u16 __user *svc = &frame->svc_insn;
- if (__put_user(S390_SYSCALL_OPCODE | __NR_rt_sigreturn, svc))
- return -EFAULT;
- restorer = (unsigned long) svc;
- }
+ else
+ restorer = VDSO64_SYMBOL(current, rt_sigreturn);
/* Create siginfo on the signal stack */
if (copy_siginfo_to_user(&frame->info, &ksig->info))
@@ -459,7 +448,8 @@ static void handle_signal(struct ksignal *ksig, sigset_t *oldset,
* the kernel can handle, and then we build all the user-level signal handling
* stack-frames in one go after that.
*/
-void do_signal(struct pt_regs *regs)
+
+void arch_do_signal_or_restart(struct pt_regs *regs)
{
struct ksignal ksig;
sigset_t *oldset = sigmask_to_save();
@@ -487,7 +477,7 @@ void do_signal(struct pt_regs *regs)
regs->gprs[2] = -EINTR;
break;
}
- /* fallthrough */
+ fallthrough;
case -ERESTARTNOINTR:
regs->gprs[2] = regs->orig_gpr2;
regs->psw.addr =
@@ -498,6 +488,7 @@ void do_signal(struct pt_regs *regs)
}
/* No longer in a system call */
clear_pt_regs_flag(regs, PIF_SYSCALL);
+
rseq_signal_deliver(&ksig, regs);
if (is_compat_task())
handle_signal32(&ksig, oldset, regs);
@@ -513,16 +504,22 @@ void do_signal(struct pt_regs *regs)
switch (regs->gprs[2]) {
case -ERESTART_RESTARTBLOCK:
/* Restart with sys_restart_syscall */
- regs->int_code = __NR_restart_syscall;
- /* fallthrough */
+ regs->gprs[2] = regs->orig_gpr2;
+ current->restart_block.arch_data = regs->psw.addr;
+ if (is_compat_task())
+ regs->psw.addr = VDSO32_SYMBOL(current, restart_syscall);
+ else
+ regs->psw.addr = VDSO64_SYMBOL(current, restart_syscall);
+ if (test_thread_flag(TIF_SINGLE_STEP))
+ clear_thread_flag(TIF_PER_TRAP);
+ break;
case -ERESTARTNOHAND:
case -ERESTARTSYS:
case -ERESTARTNOINTR:
- /* Restart system call with magic TIF bit. */
regs->gprs[2] = regs->orig_gpr2;
- set_pt_regs_flag(regs, PIF_SYSCALL);
+ regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16);
if (test_thread_flag(TIF_SINGLE_STEP))
- clear_pt_regs_flag(regs, PIF_PER_TRAP);
+ clear_thread_flag(TIF_PER_TRAP);
break;
}
}
@@ -532,10 +529,3 @@ void do_signal(struct pt_regs *regs)
*/
restore_saved_sigmask();
}
-
-void do_notify_resume(struct pt_regs *regs)
-{
- clear_thread_flag(TIF_NOTIFY_RESUME);
- tracehook_notify_resume(regs);
- rseq_handle_notify_resume(NULL, regs);
-}
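With the PIF_SYSCALL restart flag gone, -ERESTARTSYS and friends are now restarted by rewinding the PSW address by the instruction length taken from the upper halfword of int_code, so the svc instruction is simply executed again. A sketch of that address arithmetic (the real __rewind_psw() also handles 24- and 31-bit addressing modes; the values below are invented):

#include <stdio.h>
#include <stdint.h>

/* Simplified 64-bit variant: step back by the instruction length so
 * the interrupted instruction runs again. */
static uint64_t rewind_addr(uint64_t psw_addr, unsigned int ilc)
{
	return psw_addr - ilc;
}

int main(void)
{
	uint64_t after_svc = 0x0000000000401006ULL; /* hypothetical */
	unsigned int ilc = 2;                       /* svc is 2 bytes */

	printf("restart at 0x%llx\n",
	       (unsigned long long)rewind_addr(after_svc, ilc));
	return 0;
}
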
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 2794cad9312e..c39d9f0d4b1c 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -5,7 +5,6 @@
* Copyright IBM Corp. 1999, 2012
* Author(s): Denis Joseph Barrow,
* Martin Schwidefsky <schwidefsky@de.ibm.com>,
- * Heiko Carstens <heiko.carstens@de.ibm.com>,
*
* based on other smp stuff by
* (c) 1995 Alan Cox, CymruNET Ltd <alan@cymru.net>
@@ -30,6 +29,7 @@
#include <linux/delay.h>
#include <linux/interrupt.h>
#include <linux/irqflags.h>
+#include <linux/irq_work.h>
#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/sched/hotplug.h>
@@ -37,6 +37,8 @@
#include <linux/crash_dump.h>
#include <linux/kprobes.h>
#include <asm/asm-offsets.h>
+#include <asm/ctlreg.h>
+#include <asm/pfault.h>
#include <asm/diag.h>
#include <asm/switch_to.h>
#include <asm/facility.h>
@@ -45,9 +47,8 @@
#include <asm/irq.h>
#include <asm/tlbflush.h>
#include <asm/vtimer.h>
-#include <asm/lowcore.h>
+#include <asm/abs_lowcore.h>
#include <asm/sclp.h>
-#include <asm/vdso.h>
#include <asm/debug.h>
#include <asm/os_info.h>
#include <asm/sigp.h>
@@ -55,12 +56,16 @@
#include <asm/nmi.h>
#include <asm/stacktrace.h>
#include <asm/topology.h>
+#include <asm/vdso.h>
+#include <asm/maccess.h>
#include "entry.h"
enum {
ec_schedule = 0,
ec_call_function_single,
ec_stop_cpu,
+ ec_mcck_pending,
+ ec_irq_work,
};
enum {
@@ -71,7 +76,6 @@ enum {
static DEFINE_PER_CPU(struct cpu *, cpu_device);
struct pcpu {
- struct lowcore *lowcore; /* lowcore page(s) for the cpu */
unsigned long ec_mask; /* bit mask for ec_xxx functions */
unsigned long ec_clk; /* sigp timestamp for ec_xxx */
signed char state; /* physical cpu state */
@@ -93,6 +97,7 @@ __vector128 __initdata boot_cpu_vector_save_area[__NUM_VXRS];
#endif
static unsigned int smp_max_threads __initdata = -1U;
+cpumask_t cpu_setup_mask;
static int __init early_nosmt(char *s)
{
@@ -110,7 +115,7 @@ early_param("smt", early_smt);
/*
* The smp_cpu_state_mutex must be held when changing the state or polarization
- * member of a pcpu data structure within the pcpu_devices arreay.
+ * member of a pcpu data structure within the pcpu_devices array.
*/
DEFINE_MUTEX(smp_cpu_state_mutex);
@@ -145,7 +150,7 @@ static int pcpu_sigp_retry(struct pcpu *pcpu, u8 order, u32 parm)
static inline int pcpu_stopped(struct pcpu *pcpu)
{
- u32 uninitialized_var(status);
+ u32 status;
if (__pcpu_sigp(pcpu->address, SIGP_SENSE,
0, &status) != SIGP_CC_STATUS_STORED)
@@ -188,102 +193,100 @@ static void pcpu_ec_call(struct pcpu *pcpu, int ec_bit)
static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu)
{
- unsigned long async_stack, nodat_stack;
+ unsigned long async_stack, nodat_stack, mcck_stack;
struct lowcore *lc;
- if (pcpu != &pcpu_devices[0]) {
- pcpu->lowcore = (struct lowcore *)
- __get_free_pages(GFP_KERNEL | GFP_DMA, LC_ORDER);
- nodat_stack = __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER);
- if (!pcpu->lowcore || !nodat_stack)
- goto out;
- } else {
- nodat_stack = pcpu->lowcore->nodat_stack - STACK_INIT_OFFSET;
- }
+ lc = (struct lowcore *) __get_free_pages(GFP_KERNEL | GFP_DMA, LC_ORDER);
+ nodat_stack = __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER);
async_stack = stack_alloc();
- if (!async_stack)
+ mcck_stack = stack_alloc();
+ if (!lc || !nodat_stack || !async_stack || !mcck_stack)
goto out;
- lc = pcpu->lowcore;
memcpy(lc, &S390_lowcore, 512);
memset((char *) lc + 512, 0, sizeof(*lc) - 512);
lc->async_stack = async_stack + STACK_INIT_OFFSET;
lc->nodat_stack = nodat_stack + STACK_INIT_OFFSET;
+ lc->mcck_stack = mcck_stack + STACK_INIT_OFFSET;
lc->cpu_nr = cpu;
lc->spinlock_lockval = arch_spin_lockval(cpu);
lc->spinlock_index = 0;
- lc->br_r1_trampoline = 0x07f1; /* br %r1 */
- if (nmi_alloc_per_cpu(lc))
- goto out_async;
- if (vdso_alloc_per_cpu(lc))
+ lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW);
+ lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW);
+ lc->preempt_count = PREEMPT_DISABLED;
+ if (nmi_alloc_mcesa(&lc->mcesad))
+ goto out;
+ if (abs_lowcore_map(cpu, lc, true))
goto out_mcesa;
lowcore_ptr[cpu] = lc;
- pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, (u32)(unsigned long) lc);
+ pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, __pa(lc));
return 0;
out_mcesa:
- nmi_free_per_cpu(lc);
-out_async:
- stack_free(async_stack);
+ nmi_free_mcesa(&lc->mcesad);
out:
- if (pcpu != &pcpu_devices[0]) {
- free_pages(nodat_stack, THREAD_SIZE_ORDER);
- free_pages((unsigned long) pcpu->lowcore, LC_ORDER);
- }
+ stack_free(mcck_stack);
+ stack_free(async_stack);
+ free_pages(nodat_stack, THREAD_SIZE_ORDER);
+ free_pages((unsigned long) lc, LC_ORDER);
return -ENOMEM;
}
static void pcpu_free_lowcore(struct pcpu *pcpu)
{
- unsigned long async_stack, nodat_stack, lowcore;
-
- nodat_stack = pcpu->lowcore->nodat_stack - STACK_INIT_OFFSET;
- async_stack = pcpu->lowcore->async_stack - STACK_INIT_OFFSET;
- lowcore = (unsigned long) pcpu->lowcore;
+ unsigned long async_stack, nodat_stack, mcck_stack;
+ struct lowcore *lc;
+ int cpu;
+ cpu = pcpu - pcpu_devices;
+ lc = lowcore_ptr[cpu];
+ nodat_stack = lc->nodat_stack - STACK_INIT_OFFSET;
+ async_stack = lc->async_stack - STACK_INIT_OFFSET;
+ mcck_stack = lc->mcck_stack - STACK_INIT_OFFSET;
pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, 0);
- lowcore_ptr[pcpu - pcpu_devices] = NULL;
- vdso_free_per_cpu(pcpu->lowcore);
- nmi_free_per_cpu(pcpu->lowcore);
+ lowcore_ptr[cpu] = NULL;
+ abs_lowcore_unmap(cpu);
+ nmi_free_mcesa(&lc->mcesad);
stack_free(async_stack);
- if (pcpu == &pcpu_devices[0])
- return;
+ stack_free(mcck_stack);
free_pages(nodat_stack, THREAD_SIZE_ORDER);
- free_pages(lowcore, LC_ORDER);
+ free_pages((unsigned long) lc, LC_ORDER);
}
static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu)
{
- struct lowcore *lc = pcpu->lowcore;
+ struct lowcore *lc, *abs_lc;
+ lc = lowcore_ptr[cpu];
cpumask_set_cpu(cpu, &init_mm.context.cpu_attach_mask);
cpumask_set_cpu(cpu, mm_cpumask(&init_mm));
lc->cpu_nr = cpu;
+ lc->restart_flags = RESTART_FLAG_CTLREGS;
lc->spinlock_lockval = arch_spin_lockval(cpu);
lc->spinlock_index = 0;
lc->percpu_offset = __per_cpu_offset[cpu];
lc->kernel_asce = S390_lowcore.kernel_asce;
- lc->user_asce = S390_lowcore.kernel_asce;
+ lc->user_asce = s390_invalid_asce;
lc->machine_flags = S390_lowcore.machine_flags;
lc->user_timer = lc->system_timer =
lc->steal_timer = lc->avg_steal_timer = 0;
- __ctl_store(lc->cregs_save_area, 0, 15);
+ abs_lc = get_abs_lowcore();
+ memcpy(lc->cregs_save_area, abs_lc->cregs_save_area, sizeof(lc->cregs_save_area));
+ put_abs_lowcore(abs_lc);
lc->cregs_save_area[1] = lc->kernel_asce;
- lc->cregs_save_area[7] = lc->vdso_asce;
+ lc->cregs_save_area[7] = lc->user_asce;
save_access_regs((unsigned int *) lc->access_regs_save_area);
- memcpy(lc->stfle_fac_list, S390_lowcore.stfle_fac_list,
- sizeof(lc->stfle_fac_list));
- memcpy(lc->alt_stfle_fac_list, S390_lowcore.alt_stfle_fac_list,
- sizeof(lc->alt_stfle_fac_list));
arch_spin_lock_setup(cpu);
}
static void pcpu_attach_task(struct pcpu *pcpu, struct task_struct *tsk)
{
- struct lowcore *lc = pcpu->lowcore;
+ struct lowcore *lc;
+ int cpu;
- lc->kernel_stack = (unsigned long) task_stack_page(tsk)
- + THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs);
- lc->current_task = (unsigned long) tsk;
+ cpu = pcpu - pcpu_devices;
+ lc = lowcore_ptr[cpu];
+ lc->kernel_stack = (unsigned long)task_stack_page(tsk) + STACK_INIT_OFFSET;
+ lc->current_task = (unsigned long)tsk;
lc->lpp = LPP_MAGIC;
lc->current_pid = tsk->pid;
lc->user_timer = tsk->thread.user_timer;
@@ -296,41 +299,59 @@ static void pcpu_attach_task(struct pcpu *pcpu, struct task_struct *tsk)
static void pcpu_start_fn(struct pcpu *pcpu, void (*func)(void *), void *data)
{
- struct lowcore *lc = pcpu->lowcore;
+ struct lowcore *lc;
+ int cpu;
- lc->restart_stack = lc->nodat_stack;
+ cpu = pcpu - pcpu_devices;
+ lc = lowcore_ptr[cpu];
+ lc->restart_stack = lc->kernel_stack;
lc->restart_fn = (unsigned long) func;
lc->restart_data = (unsigned long) data;
- lc->restart_source = -1UL;
+ lc->restart_source = -1U;
pcpu_sigp_retry(pcpu, SIGP_RESTART, 0);
}
+typedef void (pcpu_delegate_fn)(void *);
+
/*
* Call function via PSW restart on pcpu and stop the current cpu.
*/
-static void __pcpu_delegate(void (*func)(void*), void *data)
+static void __pcpu_delegate(pcpu_delegate_fn *func, void *data)
{
func(data); /* should not return */
}
-static void __no_sanitize_address pcpu_delegate(struct pcpu *pcpu,
- void (*func)(void *),
- void *data, unsigned long stack)
+static void pcpu_delegate(struct pcpu *pcpu,
+ pcpu_delegate_fn *func,
+ void *data, unsigned long stack)
{
- struct lowcore *lc = lowcore_ptr[pcpu - pcpu_devices];
- unsigned long source_cpu = stap();
+ struct lowcore *lc, *abs_lc;
+ unsigned int source_cpu;
- __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT);
- if (pcpu->address == source_cpu)
- CALL_ON_STACK(__pcpu_delegate, stack, 2, func, data);
+ lc = lowcore_ptr[pcpu - pcpu_devices];
+ source_cpu = stap();
+
+ if (pcpu->address == source_cpu) {
+ call_on_stack(2, stack, void, __pcpu_delegate,
+ pcpu_delegate_fn *, func, void *, data);
+ }
/* Stop target cpu (if func returns this stops the current cpu). */
pcpu_sigp_retry(pcpu, SIGP_STOP, 0);
+ pcpu_sigp_retry(pcpu, SIGP_CPU_RESET, 0);
/* Restart func on the target cpu and stop the current cpu. */
- mem_assign_absolute(lc->restart_stack, stack);
- mem_assign_absolute(lc->restart_fn, (unsigned long) func);
- mem_assign_absolute(lc->restart_data, (unsigned long) data);
- mem_assign_absolute(lc->restart_source, source_cpu);
- __bpon();
+ if (lc) {
+ lc->restart_stack = stack;
+ lc->restart_fn = (unsigned long)func;
+ lc->restart_data = (unsigned long)data;
+ lc->restart_source = source_cpu;
+ } else {
+ abs_lc = get_abs_lowcore();
+ abs_lc->restart_stack = stack;
+ abs_lc->restart_fn = (unsigned long)func;
+ abs_lc->restart_data = (unsigned long)data;
+ abs_lc->restart_source = source_cpu;
+ put_abs_lowcore(abs_lc);
+ }
asm volatile(
"0: sigp 0,%0,%2 # sigp restart to target cpu\n"
" brc 2,0b # busy, try again\n"
@@ -382,7 +403,7 @@ void smp_call_online_cpu(void (*func)(void *), void *data)
*/
void smp_call_ipl_cpu(void (*func)(void *), void *data)
{
- struct lowcore *lc = pcpu_devices->lowcore;
+ struct lowcore *lc = lowcore_ptr[0];
if (pcpu_devices[0].address == stap())
lc = &S390_lowcore;
@@ -401,7 +422,12 @@ int smp_find_processor_id(u16 address)
return -1;
}
-bool arch_vcpu_is_preempted(int cpu)
+void schedule_mcck_handler(void)
+{
+ pcpu_ec_call(pcpu_devices + smp_processor_id(), ec_mcck_pending);
+}
+
+bool notrace arch_vcpu_is_preempted(int cpu)
{
if (test_cpu_flag_of(CIF_ENABLED_WAIT, cpu))
return false;
@@ -411,17 +437,15 @@ bool arch_vcpu_is_preempted(int cpu)
}
EXPORT_SYMBOL(arch_vcpu_is_preempted);
-void smp_yield_cpu(int cpu)
+void notrace smp_yield_cpu(int cpu)
{
- if (MACHINE_HAS_DIAG9C) {
- diag_stat_inc_norecursion(DIAG_STAT_X09C);
- asm volatile("diag %0,0,0x9c"
- : : "d" (pcpu_devices[cpu].address));
- } else if (MACHINE_HAS_DIAG44 && !smp_cpu_mtid) {
- diag_stat_inc_norecursion(DIAG_STAT_X044);
- asm volatile("diag 0,0,0x44");
- }
+ if (!MACHINE_HAS_DIAG9C)
+ return;
+ diag_stat_inc_norecursion(DIAG_STAT_X09C);
+ asm volatile("diag %0,0,0x9c"
+ : : "d" (pcpu_devices[cpu].address));
}
+EXPORT_SYMBOL_GPL(smp_yield_cpu);
/*
* Send cpus emergency shutdown signal. This gives the cpus the
@@ -429,10 +453,12 @@ void smp_yield_cpu(int cpu)
*/
void notrace smp_emergency_stop(void)
{
- cpumask_t cpumask;
+ static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED;
+ static cpumask_t cpumask;
u64 end;
int cpu;
+ arch_spin_lock(&lock);
cpumask_copy(&cpumask, cpu_online_mask);
cpumask_clear_cpu(smp_processor_id(), &cpumask);
@@ -453,6 +479,7 @@ void notrace smp_emergency_stop(void)
break;
cpu_relax();
}
+ arch_spin_unlock(&lock);
}
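smp_emergency_stop() now keeps its cpumask in a static variable guarded by a private arch spinlock, presumably because cpumask_t can be large and this path may run concurrently and on a very small stack during a panic. A generic userspace sketch of the same pattern, with a pthread mutex standing in for the arch spinlock (compile with -pthread):

#include <stdio.h>
#include <pthread.h>

/* Large scratch buffer kept off the stack; serialized by a lock so
 * concurrent callers do not trample each other. */
static char scratch[4096];
static pthread_mutex_t scratch_lock = PTHREAD_MUTEX_INITIALIZER;

static void use_scratch(const char *tag)
{
	pthread_mutex_lock(&scratch_lock);
	snprintf(scratch, sizeof(scratch), "owned by %s", tag);
	printf("%s\n", scratch);
	pthread_mutex_unlock(&scratch_lock);
}

static void *worker(void *arg)
{
	use_scratch(arg);
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, worker, "thread 1");
	pthread_create(&t2, NULL, worker, "thread 2");
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	return 0;
}
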
NOKPROBE_SYMBOL(smp_emergency_stop);
@@ -464,7 +491,7 @@ void smp_send_stop(void)
int cpu;
/* Disable all interrupts/machine checks */
- __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT);
+ __load_psw_mask(PSW_KERNEL_BITS);
trace_hardirqs_off();
debug_set_critical();
@@ -498,6 +525,10 @@ static void smp_handle_ext_call(void)
scheduler_ipi();
if (test_bit(ec_call_function_single, &bits))
generic_smp_call_function_single_interrupt();
+ if (test_bit(ec_mcck_pending, &bits))
+ s390_handle_mcck();
+ if (test_bit(ec_irq_work, &bits))
+ irq_work_run();
}
static void do_ext_call_interrupt(struct ext_code ext_code,
@@ -525,71 +556,37 @@ void arch_send_call_function_single_ipi(int cpu)
* it goes straight through and wastes no time serializing
* anything. Worst case is that we lose a reschedule ...
*/
-void smp_send_reschedule(int cpu)
+void arch_smp_send_reschedule(int cpu)
{
pcpu_ec_call(pcpu_devices + cpu, ec_schedule);
}
-/*
- * parameter area for the set/clear control bit callbacks
- */
-struct ec_creg_mask_parms {
- unsigned long orval;
- unsigned long andval;
- int cr;
-};
-
-/*
- * callback for setting/clearing control bits
- */
-static void smp_ctl_bit_callback(void *info)
-{
- struct ec_creg_mask_parms *pp = info;
- unsigned long cregs[16];
-
- __ctl_store(cregs, 0, 15);
- cregs[pp->cr] = (cregs[pp->cr] & pp->andval) | pp->orval;
- __ctl_load(cregs, 0, 15);
-}
-
-/*
- * Set a bit in a control register of all cpus
- */
-void smp_ctl_set_bit(int cr, int bit)
-{
- struct ec_creg_mask_parms parms = { 1UL << bit, -1UL, cr };
-
- on_each_cpu(smp_ctl_bit_callback, &parms, 1);
-}
-EXPORT_SYMBOL(smp_ctl_set_bit);
-
-/*
- * Clear a bit in a control register of all cpus
- */
-void smp_ctl_clear_bit(int cr, int bit)
+#ifdef CONFIG_IRQ_WORK
+void arch_irq_work_raise(void)
{
- struct ec_creg_mask_parms parms = { 0, ~(1UL << bit), cr };
-
- on_each_cpu(smp_ctl_bit_callback, &parms, 1);
+ pcpu_ec_call(pcpu_devices + smp_processor_id(), ec_irq_work);
}
-EXPORT_SYMBOL(smp_ctl_clear_bit);
+#endif
#ifdef CONFIG_CRASH_DUMP
int smp_store_status(int cpu)
{
- struct pcpu *pcpu = pcpu_devices + cpu;
+ struct lowcore *lc;
+ struct pcpu *pcpu;
unsigned long pa;
- pa = __pa(&pcpu->lowcore->floating_pt_save_area);
+ pcpu = pcpu_devices + cpu;
+ lc = lowcore_ptr[cpu];
+ pa = __pa(&lc->floating_pt_save_area);
if (__pcpu_sigp_relax(pcpu->address, SIGP_STORE_STATUS_AT_ADDRESS,
pa) != SIGP_CC_ORDER_CODE_ACCEPTED)
return -EIO;
- if (!MACHINE_HAS_VX && !MACHINE_HAS_GS)
+ if (!cpu_has_vx() && !MACHINE_HAS_GS)
return 0;
- pa = __pa(pcpu->lowcore->mcesad & MCESA_ORIGIN_MASK);
+ pa = lc->mcesad & MCESA_ORIGIN_MASK;
if (MACHINE_HAS_GS)
- pa |= pcpu->lowcore->mcesad & MCESA_LC_MASK;
+ pa |= lc->mcesad & MCESA_LC_MASK;
if (__pcpu_sigp_relax(pcpu->address, SIGP_STORE_ADDITIONAL_STATUS,
pa) != SIGP_CC_ORDER_CODE_ACCEPTED)
return -EIO;
@@ -599,14 +596,14 @@ int smp_store_status(int cpu)
/*
* Collect CPU state of the previous, crashed system.
* There are four cases:
- * 1) standard zfcp dump
- * condition: OLDMEM_BASE == NULL && ipl_info.type == IPL_TYPE_FCP_DUMP
+ * 1) standard zfcp/nvme dump
+ * condition: OLDMEM_BASE == NULL && is_ipl_type_dump() == true
* The state for all CPUs except the boot CPU needs to be collected
* with sigp stop-and-store-status. The boot CPU state is located in
* the absolute lowcore of the memory stored in the HSA. The zcore code
* will copy the boot CPU state from the HSA.
- * 2) stand-alone kdump for SCSI (zfcp dump with swapped memory)
- * condition: OLDMEM_BASE != NULL && ipl_info.type == IPL_TYPE_FCP_DUMP
+ * 2) stand-alone kdump for SCSI/NVMe (zfcp/nvme dump with swapped memory)
+ * condition: OLDMEM_BASE != NULL && is_ipl_type_dump() == true
* The state for all CPUs except the boot CPU needs to be collected
* with sigp stop-and-store-status. The firmware or the boot-loader
* stored the registers of the boot CPU in the absolute lowcore in the
@@ -622,42 +619,39 @@ int smp_store_status(int cpu)
* This case does not exist for s390 anymore, setup_arch explicitly
* deactivates the elfcorehdr= kernel parameter
*/
-static __init void smp_save_cpu_vxrs(struct save_area *sa, u16 addr,
- bool is_boot_cpu, unsigned long page)
+static bool dump_available(void)
{
- __vector128 *vxrs = (__vector128 *) page;
-
- if (is_boot_cpu)
- vxrs = boot_cpu_vector_save_area;
- else
- __pcpu_sigp_relax(addr, SIGP_STORE_ADDITIONAL_STATUS, page);
- save_area_add_vxrs(sa, vxrs);
+ return oldmem_data.start || is_ipl_type_dump();
}
-static __init void smp_save_cpu_regs(struct save_area *sa, u16 addr,
- bool is_boot_cpu, unsigned long page)
+void __init smp_save_dump_ipl_cpu(void)
{
- void *regs = (void *) page;
+ struct save_area *sa;
+ void *regs;
- if (is_boot_cpu)
- copy_oldmem_kernel(regs, (void *) __LC_FPREGS_SAVE_AREA, 512);
- else
- __pcpu_sigp_relax(addr, SIGP_STORE_STATUS_AT_ADDRESS, page);
+ if (!dump_available())
+ return;
+ sa = save_area_alloc(true);
+ regs = memblock_alloc(512, 8);
+ if (!sa || !regs)
+ panic("could not allocate memory for boot CPU save area\n");
+ copy_oldmem_kernel(regs, __LC_FPREGS_SAVE_AREA, 512);
save_area_add_regs(sa, regs);
+ memblock_free(regs, 512);
+ if (cpu_has_vx())
+ save_area_add_vxrs(sa, boot_cpu_vector_save_area);
}
-void __init smp_save_dump_cpus(void)
+void __init smp_save_dump_secondary_cpus(void)
{
int addr, boot_cpu_addr, max_cpu_addr;
struct save_area *sa;
- unsigned long page;
- bool is_boot_cpu;
+ void *page;
- if (!(OLDMEM_BASE || ipl_info.type == IPL_TYPE_FCP_DUMP))
- /* No previous system present, normal boot. */
+ if (!dump_available())
return;
/* Allocate a page as dumping area for the store status sigps */
- page = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0, 1UL << 31);
+ page = memblock_alloc_low(PAGE_SIZE, PAGE_SIZE);
if (!page)
panic("ERROR: Failed to allocate %lx bytes below %lx\n",
PAGE_SIZE, 1UL << 31);
@@ -667,29 +661,23 @@ void __init smp_save_dump_cpus(void)
boot_cpu_addr = stap();
max_cpu_addr = SCLP_MAX_CORES << sclp.mtid_prev;
for (addr = 0; addr <= max_cpu_addr; addr++) {
+ if (addr == boot_cpu_addr)
+ continue;
if (__pcpu_sigp_relax(addr, SIGP_SENSE, 0) ==
SIGP_CC_NOT_OPERATIONAL)
continue;
- is_boot_cpu = (addr == boot_cpu_addr);
- /* Allocate save area */
- sa = save_area_alloc(is_boot_cpu);
+ sa = save_area_alloc(false);
if (!sa)
panic("could not allocate memory for save area\n");
- if (MACHINE_HAS_VX)
- /* Get the vector registers */
- smp_save_cpu_vxrs(sa, addr, is_boot_cpu, page);
- /*
- * For a zfcp dump OLDMEM_BASE == NULL and the registers
- * of the boot CPU are stored in the HSA. To retrieve
- * these registers an SCLP request is required which is
- * done by drivers/s390/char/zcore.c:init_cpu_info()
- */
- if (!is_boot_cpu || OLDMEM_BASE)
- /* Get the CPU registers */
- smp_save_cpu_regs(sa, addr, is_boot_cpu, page);
+ __pcpu_sigp_relax(addr, SIGP_STORE_STATUS_AT_ADDRESS, __pa(page));
+ save_area_add_regs(sa, page);
+ if (cpu_has_vx()) {
+ __pcpu_sigp_relax(addr, SIGP_STORE_ADDITIONAL_STATUS, __pa(page));
+ save_area_add_vxrs(sa, page);
+ }
}
memblock_free(page, PAGE_SIZE);
- diag_dma_ops.diag308_reset();
+ diag_amode31_ops.diag308_reset();
pcpu_set_smt(0);
}
#endif /* CONFIG_CRASH_DUMP */
@@ -704,6 +692,11 @@ int smp_cpu_get_polarization(int cpu)
return pcpu_devices[cpu].polarization;
}
+int smp_cpu_get_cpu_address(int cpu)
+{
+ return pcpu_devices[cpu].address;
+}
+
static void __ref smp_get_core_info(struct sclp_core_info *info, int early)
{
static int use_sigp_detection;
@@ -763,11 +756,13 @@ static int smp_add_core(struct sclp_core_entry *core, cpumask_t *avail,
static int __smp_rescan_cpus(struct sclp_core_info *info, bool early)
{
struct sclp_core_entry *core;
- cpumask_t avail;
+ static cpumask_t avail;
bool configured;
u16 core_id;
int nr, i;
+ cpus_read_lock();
+ mutex_lock(&smp_cpu_state_mutex);
nr = 0;
cpumask_xor(&avail, cpu_possible_mask, cpu_present_mask);
/*
@@ -788,6 +783,8 @@ static int __smp_rescan_cpus(struct sclp_core_info *info, bool early)
configured = i < info->configured;
nr += smp_add_core(&info->core[i], &avail, configured, early);
}
+ mutex_unlock(&smp_cpu_state_mutex);
+ cpus_read_unlock();
return nr;
}
@@ -835,83 +832,70 @@ void __init smp_detect_cpus(void)
pr_info("%d configured CPUs, %d standby CPUs\n", c_cpus, s_cpus);
/* Add CPUs present at boot */
- get_online_cpus();
__smp_rescan_cpus(info, true);
- put_online_cpus();
- memblock_free_early((unsigned long)info, sizeof(*info));
+ memblock_free(info, sizeof(*info));
}
-static void smp_init_secondary(void)
+/*
+ * Activate a secondary processor.
+ */
+static void smp_start_secondary(void *cpuvoid)
{
- int cpu = smp_processor_id();
+ int cpu = raw_smp_processor_id();
S390_lowcore.last_update_clock = get_tod_clock();
+ S390_lowcore.restart_stack = (unsigned long)restart_stack;
+ S390_lowcore.restart_fn = (unsigned long)do_restart;
+ S390_lowcore.restart_data = 0;
+ S390_lowcore.restart_source = -1U;
+ S390_lowcore.restart_flags = 0;
restore_access_regs(S390_lowcore.access_regs_save_area);
- set_cpu_flag(CIF_ASCE_PRIMARY);
- set_cpu_flag(CIF_ASCE_SECONDARY);
cpu_init();
- preempt_disable();
+ rcutree_report_cpu_starting(cpu);
init_cpu_timer();
vtime_init();
+ vdso_getcpu_init();
pfault_init();
- notify_cpu_starting(smp_processor_id());
+ cpumask_set_cpu(cpu, &cpu_setup_mask);
+ update_cpu_masks();
+ notify_cpu_starting(cpu);
if (topology_cpu_dedicated(cpu))
set_cpu_flag(CIF_DEDICATED_CPU);
else
clear_cpu_flag(CIF_DEDICATED_CPU);
- set_cpu_online(smp_processor_id(), true);
+ set_cpu_online(cpu, true);
inc_irq_stat(CPU_RST);
local_irq_enable();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}
-/*
- * Activate a secondary processor.
- */
-static void __no_sanitize_address smp_start_secondary(void *cpuvoid)
-{
- S390_lowcore.restart_stack = (unsigned long) restart_stack;
- S390_lowcore.restart_fn = (unsigned long) do_restart;
- S390_lowcore.restart_data = 0;
- S390_lowcore.restart_source = -1UL;
- __ctl_load(S390_lowcore.cregs_save_area, 0, 15);
- __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT);
- CALL_ON_STACK_NORETURN(smp_init_secondary, S390_lowcore.kernel_stack);
-}
-
/* Upping and downing of CPUs */
int __cpu_up(unsigned int cpu, struct task_struct *tidle)
{
- struct pcpu *pcpu;
- int base, i, rc;
+ struct pcpu *pcpu = pcpu_devices + cpu;
+ int rc;
- pcpu = pcpu_devices + cpu;
if (pcpu->state != CPU_STATE_CONFIGURED)
return -EIO;
- base = smp_get_base_cpu(cpu);
- for (i = 0; i <= smp_cpu_mtid; i++) {
- if (base + i < nr_cpu_ids)
- if (cpu_online(base + i))
- break;
- }
- /*
- * If this is the first CPU of the core to get online
- * do an initial CPU reset.
- */
- if (i > smp_cpu_mtid &&
- pcpu_sigp_retry(pcpu_devices + base, SIGP_INITIAL_CPU_RESET, 0) !=
+ if (pcpu_sigp_retry(pcpu, SIGP_INITIAL_CPU_RESET, 0) !=
SIGP_CC_ORDER_CODE_ACCEPTED)
return -EIO;
rc = pcpu_alloc_lowcore(pcpu, cpu);
if (rc)
return rc;
+ /*
+ * Make sure global control register contents do not change
+ * until new CPU has initialized control registers.
+ */
+ system_ctlreg_lock();
pcpu_prepare_secondary(pcpu, cpu);
pcpu_attach_task(pcpu, tidle);
pcpu_start_fn(pcpu, smp_start_secondary, NULL);
/* Wait until cpu puts itself in the online & active maps */
while (!cpu_online(cpu))
cpu_relax();
+ system_ctlreg_unlock();
return 0;
}
@@ -926,19 +910,23 @@ early_param("possible_cpus", _setup_possible_cpus);
int __cpu_disable(void)
{
- unsigned long cregs[16];
+ struct ctlreg cregs[16];
+ int cpu;
/* Handle possible pending IPIs */
smp_handle_ext_call();
- set_cpu_online(smp_processor_id(), false);
+ cpu = smp_processor_id();
+ set_cpu_online(cpu, false);
+ cpumask_clear_cpu(cpu, &cpu_setup_mask);
+ update_cpu_masks();
/* Disable pseudo page faults on this cpu. */
pfault_fini();
/* Disable interrupt sources via control register. */
- __ctl_store(cregs, 0, 15);
- cregs[0] &= ~0x0000ee70UL; /* disable all external interrupts */
- cregs[6] &= ~0xff000000UL; /* disable all I/O interrupts */
- cregs[14] &= ~0x1f000000UL; /* disable most machine checks */
- __ctl_load(cregs, 0, 15);
+ __local_ctl_store(0, 15, cregs);
+ cregs[0].val &= ~0x0000ee70UL; /* disable all external interrupts */
+ cregs[6].val &= ~0xff000000UL; /* disable all I/O interrupts */
+ cregs[14].val &= ~0x1f000000UL; /* disable most machine checks */
+ __local_ctl_load(0, 15, cregs);
clear_cpu_flag(CIF_NOHZ_DELAY);
return 0;
}
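__cpu_disable() now works on struct ctlreg images and clears the external, I/O and machine-check subclass bits with explicit masks. A small sketch of the wrapper-struct plus read-modify-write idea (the single-member layout of struct ctlreg is assumed here, and the register contents are invented):

#include <stdio.h>

/* Assumed, simplified shape of the wrapper: the distinct type keeps raw
 * longs and control-register images from being mixed up. */
struct ctlreg {
	unsigned long val;
};

static void clear_bits(struct ctlreg *reg, unsigned long mask)
{
	reg->val &= ~mask;
}

int main(void)
{
	struct ctlreg cr0 = { .val = 0xff7fUL };  /* invented contents */

	clear_bits(&cr0, 0x0000ee70UL);           /* external interrupts off */
	printf("cr0 image: 0x%016lx\n", cr0.val);
	return 0;
}
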
@@ -959,7 +947,6 @@ void __cpu_die(unsigned int cpu)
void __noreturn cpu_die(void)
{
idle_task_exit();
- __bpon();
pcpu_sigp_retry(pcpu_devices + smp_processor_id(), SIGP_STOP, 0);
for (;;) ;
}
@@ -979,12 +966,12 @@ void __init smp_fill_possible_mask(void)
void __init smp_prepare_cpus(unsigned int max_cpus)
{
- /* request the 0x1201 emergency signal external interrupt */
if (register_external_irq(EXT_IRQ_EMERGENCY_SIG, do_ext_call_interrupt))
panic("Couldn't request external interrupt 0x1201");
- /* request the 0x1202 external call external interrupt */
+ system_ctl_set_bit(0, 14);
if (register_external_irq(EXT_IRQ_EXTERNAL_CALL, do_ext_call_interrupt))
panic("Couldn't request external interrupt 0x1202");
+ system_ctl_set_bit(0, 13);
}
void __init smp_prepare_boot_cpu(void)
@@ -993,15 +980,10 @@ void __init smp_prepare_boot_cpu(void)
WARN_ON(!cpu_present(0) || !cpu_online(0));
pcpu->state = CPU_STATE_CONFIGURED;
- pcpu->lowcore = (struct lowcore *)(unsigned long) store_prefix();
S390_lowcore.percpu_offset = __per_cpu_offset[0];
smp_cpu_set_polarization(0, POLARIZATION_UNKNOWN);
}
-void __init smp_cpus_done(unsigned int max_cpus)
-{
-}
-
void __init smp_setup_processor_id(void)
{
pcpu_devices[0].address = stap();
@@ -1044,14 +1026,12 @@ static ssize_t cpu_configure_store(struct device *dev,
return -EINVAL;
if (val != 0 && val != 1)
return -EINVAL;
- get_online_cpus();
+ cpus_read_lock();
mutex_lock(&smp_cpu_state_mutex);
rc = -EBUSY;
- /* disallow configuration changes of online cpus and cpu 0 */
+ /* disallow configuration changes of online cpus */
cpu = dev->id;
cpu = smp_get_base_cpu(cpu);
- if (cpu == 0)
- goto out;
for (i = 0; i <= smp_cpu_mtid; i++)
if (cpu_online(cpu + i))
goto out;
@@ -1093,7 +1073,7 @@ static ssize_t cpu_configure_store(struct device *dev,
}
out:
mutex_unlock(&smp_cpu_state_mutex);
- put_online_cpus();
+ cpus_read_unlock();
return rc ? rc : count;
}
static DEVICE_ATTR(configure, 0644, cpu_configure_show, cpu_configure_store);
@@ -1131,6 +1111,7 @@ static int smp_cpu_online(unsigned int cpu)
return sysfs_create_group(&s->kobj, &cpu_online_attr_group);
}
+
static int smp_cpu_pre_down(unsigned int cpu)
{
struct device *s = &per_cpu(cpu_device, cpu)->dev;
@@ -1150,7 +1131,7 @@ static int smp_add_present_cpu(int cpu)
return -ENOMEM;
per_cpu(cpu_device, cpu) = c;
s = &c->dev;
- c->hotpluggable = 1;
+ c->hotpluggable = !!cpu;
rc = register_cpu(c, cpu);
if (rc)
goto out;
@@ -1179,11 +1160,7 @@ int __ref smp_rescan_cpus(void)
if (!info)
return -ENOMEM;
smp_get_core_info(info, 0);
- get_online_cpus();
- mutex_lock(&smp_cpu_state_mutex);
nr = __smp_rescan_cpus(info, false);
- mutex_unlock(&smp_cpu_state_mutex);
- put_online_cpus();
kfree(info);
if (nr)
topology_schedule_update();
@@ -1208,11 +1185,17 @@ static DEVICE_ATTR_WO(rescan);
static int __init s390_smp_init(void)
{
+ struct device *dev_root;
int cpu, rc = 0;
- rc = device_create_file(cpu_subsys.dev_root, &dev_attr_rescan);
- if (rc)
- return rc;
+ dev_root = bus_get_dev_root(&cpu_subsys);
+ if (dev_root) {
+ rc = device_create_file(dev_root, &dev_attr_rescan);
+ put_device(dev_root);
+ if (rc)
+ return rc;
+ }
+
for_each_present_cpu(cpu) {
rc = smp_add_present_cpu(cpu);
if (rc)
diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c
index fc5419ac64c8..94f440e38303 100644
--- a/arch/s390/kernel/stacktrace.c
+++ b/arch/s390/kernel/stacktrace.c
@@ -3,13 +3,15 @@
* Stack trace management functions
*
* Copyright IBM Corp. 2006
- * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>
*/
#include <linux/stacktrace.h>
+#include <linux/uaccess.h>
+#include <linux/compat.h>
#include <asm/stacktrace.h>
#include <asm/unwind.h>
#include <asm/kprobes.h>
+#include <asm/ptrace.h>
void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie,
struct task_struct *task, struct pt_regs *regs)
@@ -19,17 +21,11 @@ void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie,
unwind_for_each_frame(&state, task, regs, 0) {
addr = unwind_get_return_address(&state);
- if (!addr || !consume_entry(cookie, addr, false))
+ if (!addr || !consume_entry(cookie, addr))
break;
}
}
-/*
- * This function returns an error if it detects any unreliable features of the
- * stack. Otherwise it guarantees that the stack trace is reliable.
- *
- * If the task is not 'current', the caller *must* ensure the task is inactive.
- */
int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry,
void *cookie, struct task_struct *task)
{
@@ -47,16 +43,16 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry,
if (!addr)
return -EINVAL;
-#ifdef CONFIG_KPROBES
+#ifdef CONFIG_RETHOOK
/*
- * Mark stacktraces with kretprobed functions on them
+ * Mark stacktraces with krethook functions on them
* as unreliable.
*/
- if (state.ip == (unsigned long)kretprobe_trampoline)
+ if (state.ip == (unsigned long)arch_rethook_trampoline)
return -EINVAL;
#endif
- if (!consume_entry(cookie, addr, false))
+ if (!consume_entry(cookie, addr))
return -EINVAL;
}
@@ -65,3 +61,43 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry,
return -EINVAL;
return 0;
}
+
+void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie,
+ const struct pt_regs *regs)
+{
+ struct stack_frame_user __user *sf;
+ unsigned long ip, sp;
+ bool first = true;
+
+ if (is_compat_task())
+ return;
+ if (!consume_entry(cookie, instruction_pointer(regs)))
+ return;
+ sf = (void __user *)user_stack_pointer(regs);
+ pagefault_disable();
+ while (1) {
+ if (__get_user(sp, &sf->back_chain))
+ break;
+ if (__get_user(ip, &sf->gprs[8]))
+ break;
+ if (ip & 0x1) {
+ /*
+ * If the instruction address is invalid, and this
+ * is the first stack frame, assume r14 has not
+ * been written to the stack yet. Otherwise exit.
+ */
+ if (first && !(regs->gprs[14] & 0x1))
+ ip = regs->gprs[14];
+ else
+ break;
+ }
+ if (!consume_entry(cookie, ip))
+ break;
+ /* Sanity check: ABI requires SP to be aligned 8 bytes. */
+ if (!sp || sp & 0x7)
+ break;
+ sf = (void __user *)sp;
+ first = false;
+ }
+ pagefault_enable();
+}
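The new arch_stack_walk_user() follows the user stack by loading each frame's back chain and the saved r14 slot, stopping on a zero or misaligned stack pointer. A self-contained simulation of that walk over a hand-built chain of frames (the struct layout below is a simplification, not the real ABI offsets):

#include <stdio.h>
#include <stdint.h>

/* Simplified model of the frame walked above: a back chain pointer to
 * the caller's frame plus one slot standing in for the saved r14. */
struct frame {
	uint64_t back_chain;   /* 0 terminates the chain */
	uint64_t ret_addr;
};

int main(void)
{
	struct frame frames[3];

	/* Build a fake call chain main -> f1 -> f2, innermost frame first. */
	frames[0] = (struct frame){ (uint64_t)(uintptr_t)&frames[1], 0x401100 };
	frames[1] = (struct frame){ (uint64_t)(uintptr_t)&frames[2], 0x401040 };
	frames[2] = (struct frame){ 0, 0x401000 };

	const struct frame *sf = &frames[0];   /* would be the user SP */
	while (sf) {
		printf("return address 0x%llx\n",
		       (unsigned long long)sf->ret_addr);
		uint64_t sp = sf->back_chain;
		/* ABI sanity check mirrored from the code above: SP must be
		 * non-zero and 8-byte aligned, otherwise stop. */
		if (!sp || (sp & 0x7))
			break;
		sf = (const struct frame *)(uintptr_t)sp;
	}
	return 0;
}
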
diff --git a/arch/s390/kernel/sthyi.c b/arch/s390/kernel/sthyi.c
index 888cc2f166db..30bb20461db4 100644
--- a/arch/s390/kernel/sthyi.c
+++ b/arch/s390/kernel/sthyi.c
@@ -317,7 +317,9 @@ static void fill_diag(struct sthyi_sctns *sctns)
if (pages <= 0)
return;
- diag204_buf = vmalloc(array_size(pages, PAGE_SIZE));
+ diag204_buf = __vmalloc_node(array_size(pages, PAGE_SIZE),
+ PAGE_SIZE, GFP_KERNEL, NUMA_NO_NODE,
+ __builtin_return_address(0));
if (!diag204_buf)
return;
@@ -395,19 +397,18 @@ out:
static int sthyi(u64 vaddr, u64 *rc)
{
- register u64 code asm("0") = 0;
- register u64 addr asm("2") = vaddr;
- register u64 rcode asm("3");
+ union register_pair r1 = { .even = 0, }; /* subcode */
+ union register_pair r2 = { .even = vaddr, };
int cc;
asm volatile(
- ".insn rre,0xB2560000,%[code],%[addr]\n"
+ ".insn rre,0xB2560000,%[r1],%[r2]\n"
"ipm %[cc]\n"
"srl %[cc],28\n"
- : [cc] "=d" (cc), "=d" (rcode)
- : [code] "d" (code), [addr] "a" (addr)
+ : [cc] "=&d" (cc), [r2] "+&d" (r2.pair)
+ : [r1] "d" (r1.pair)
: "memory", "cc");
- *rc = rcode;
+ *rc = r2.odd;
return cc;
}
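The rewritten sthyi() wrapper passes its operands through union register_pair instead of explicit register asm variables; presumably the union overlays two 64-bit halves so a single operand can be allocated to an even/odd register pair. A standalone sketch of that layout (field names assumed, simplified; needs a 64-bit compiler with __int128 support):

#include <stdio.h>

/* Sketch of the even/odd pair overlay (assumed layout). */
union register_pair {
	unsigned __int128 pair;
	struct {
		unsigned long even;
		unsigned long odd;
	};
};

int main(void)
{
	union register_pair r2 = { .even = 0x1000 };

	r2.odd = 0x2a;  /* e.g. a return code handed back in R2 + 1 */
	printf("even 0x%lx odd 0x%lx\n", r2.even, r2.odd);
	return 0;
}
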
@@ -460,9 +461,9 @@ static int sthyi_update_cache(u64 *rc)
*
* Fills the destination with system information returned by the STHYI
* instruction. The data is generated by emulation or execution of STHYI,
- * if available. The return value is the condition code that would be
- * returned, the rc parameter is the return code which is passed in
- * register R2 + 1.
+ * if available. The return value is either a negative error value or
+ * the condition code that would be returned, the rc parameter is the
+ * return code which is passed in register R2 + 1.
*/
int sthyi_fill(void *dst, u64 *rc)
{
diff --git a/arch/s390/kernel/suspend.c b/arch/s390/kernel/suspend.c
deleted file mode 100644
index 75b7b307946e..000000000000
--- a/arch/s390/kernel/suspend.c
+++ /dev/null
@@ -1,240 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Suspend support specific for s390.
- *
- * Copyright IBM Corp. 2009
- *
- * Author(s): Hans-Joachim Picht <hans@linux.vnet.ibm.com>
- */
-
-#include <linux/pfn.h>
-#include <linux/suspend.h>
-#include <linux/mm.h>
-#include <linux/pci.h>
-#include <asm/ctl_reg.h>
-#include <asm/ipl.h>
-#include <asm/cio.h>
-#include <asm/sections.h>
-#include "entry.h"
-
-/*
- * The restore of the saved pages in an hibernation image will set
- * the change and referenced bits in the storage key for each page.
- * Overindication of the referenced bits after an hibernation cycle
- * does not cause any harm but the overindication of the change bits
- * would cause trouble.
- * Use the ARCH_SAVE_PAGE_KEYS hooks to save the storage key of each
- * page to the most significant byte of the associated page frame
- * number in the hibernation image.
- */
-
-/*
- * Key storage is allocated as a linked list of pages.
- * The size of the keys array is (PAGE_SIZE - sizeof(long))
- */
-struct page_key_data {
- struct page_key_data *next;
- unsigned char data[];
-};
-
-#define PAGE_KEY_DATA_SIZE (PAGE_SIZE - sizeof(struct page_key_data *))
-
-static struct page_key_data *page_key_data;
-static struct page_key_data *page_key_rp, *page_key_wp;
-static unsigned long page_key_rx, page_key_wx;
-unsigned long suspend_zero_pages;
-
-/*
- * For each page in the hibernation image one additional byte is
- * stored in the most significant byte of the page frame number.
- * On suspend no additional memory is required but on resume the
- * keys need to be memorized until the page data has been restored.
- * Only then can the storage keys be set to their old state.
- */
-unsigned long page_key_additional_pages(unsigned long pages)
-{
- return DIV_ROUND_UP(pages, PAGE_KEY_DATA_SIZE);
-}
-
-/*
- * Free page_key_data list of arrays.
- */
-void page_key_free(void)
-{
- struct page_key_data *pkd;
-
- while (page_key_data) {
- pkd = page_key_data;
- page_key_data = pkd->next;
- free_page((unsigned long) pkd);
- }
-}
-
-/*
- * Allocate page_key_data list of arrays with enough room to store
- * one byte for each page in the hibernation image.
- */
-int page_key_alloc(unsigned long pages)
-{
- struct page_key_data *pk;
- unsigned long size;
-
- size = DIV_ROUND_UP(pages, PAGE_KEY_DATA_SIZE);
- while (size--) {
- pk = (struct page_key_data *) get_zeroed_page(GFP_KERNEL);
- if (!pk) {
- page_key_free();
- return -ENOMEM;
- }
- pk->next = page_key_data;
- page_key_data = pk;
- }
- page_key_rp = page_key_wp = page_key_data;
- page_key_rx = page_key_wx = 0;
- return 0;
-}
-
-/*
- * Save the storage key into the upper 8 bits of the page frame number.
- */
-void page_key_read(unsigned long *pfn)
-{
- struct page *page;
- unsigned long addr;
- unsigned char key;
-
- page = pfn_to_page(*pfn);
- addr = (unsigned long) page_address(page);
- key = (unsigned char) page_get_storage_key(addr) & 0x7f;
- if (arch_test_page_nodat(page))
- key |= 0x80;
- *(unsigned char *) pfn = key;
-}
-
-/*
- * Extract the storage key from the upper 8 bits of the page frame number
- * and store it in the page_key_data list of arrays.
- */
-void page_key_memorize(unsigned long *pfn)
-{
- page_key_wp->data[page_key_wx] = *(unsigned char *) pfn;
- *(unsigned char *) pfn = 0;
- if (++page_key_wx < PAGE_KEY_DATA_SIZE)
- return;
- page_key_wp = page_key_wp->next;
- page_key_wx = 0;
-}
-
-/*
- * Get the next key from the page_key_data list of arrays and set the
- * storage key of the page referred by @address. If @address refers to
- * a "safe" page the swsusp_arch_resume code will transfer the storage
- * key from the buffer page to the original page.
- */
-void page_key_write(void *address)
-{
- struct page *page;
- unsigned char key;
-
- key = page_key_rp->data[page_key_rx];
- page_set_storage_key((unsigned long) address, key & 0x7f, 0);
- page = virt_to_page(address);
- if (key & 0x80)
- arch_set_page_nodat(page, 0);
- else
- arch_set_page_dat(page, 0);
- if (++page_key_rx >= PAGE_KEY_DATA_SIZE)
- return;
- page_key_rp = page_key_rp->next;
- page_key_rx = 0;
-}
-
-int pfn_is_nosave(unsigned long pfn)
-{
- unsigned long nosave_begin_pfn = PFN_DOWN(__pa(&__nosave_begin));
- unsigned long nosave_end_pfn = PFN_DOWN(__pa(&__nosave_end));
- unsigned long end_rodata_pfn = PFN_DOWN(__pa(__end_rodata)) - 1;
- unsigned long stext_pfn = PFN_DOWN(__pa(_stext));
-
- /* Always save lowcore pages (LC protection might be enabled). */
- if (pfn <= LC_PAGES)
- return 0;
- if (pfn >= nosave_begin_pfn && pfn < nosave_end_pfn)
- return 1;
- /* Skip memory holes and read-only pages (DCSS, ...). */
- if (pfn >= stext_pfn && pfn <= end_rodata_pfn)
- return 0;
- if (tprot(PFN_PHYS(pfn)))
- return 1;
- return 0;
-}
-
-/*
- * PM notifier callback for suspend
- */
-static int suspend_pm_cb(struct notifier_block *nb, unsigned long action,
- void *ptr)
-{
- switch (action) {
- case PM_SUSPEND_PREPARE:
- case PM_HIBERNATION_PREPARE:
- suspend_zero_pages = __get_free_pages(GFP_KERNEL, LC_ORDER);
- if (!suspend_zero_pages)
- return NOTIFY_BAD;
- break;
- case PM_POST_SUSPEND:
- case PM_POST_HIBERNATION:
- free_pages(suspend_zero_pages, LC_ORDER);
- break;
- default:
- return NOTIFY_DONE;
- }
- return NOTIFY_OK;
-}
-
-static int __init suspend_pm_init(void)
-{
- pm_notifier(suspend_pm_cb, 0);
- return 0;
-}
-arch_initcall(suspend_pm_init);
-
-void save_processor_state(void)
-{
- /* swsusp_arch_suspend() actually saves all cpu register contents.
- * Machine checks must be disabled since swsusp_arch_suspend() stores
- * register contents to their lowcore save areas. That's the same
- * place where register contents on machine checks would be saved.
- * To avoid register corruption disable machine checks.
- * We must also disable machine checks in the new psw mask for
- * program checks, since swsusp_arch_suspend() may generate program
- * checks. Disabling machine checks for all other new psw masks is
- * just paranoia.
- */
- local_mcck_disable();
- /* Disable lowcore protection */
- __ctl_clear_bit(0,28);
- S390_lowcore.external_new_psw.mask &= ~PSW_MASK_MCHECK;
- S390_lowcore.svc_new_psw.mask &= ~PSW_MASK_MCHECK;
- S390_lowcore.io_new_psw.mask &= ~PSW_MASK_MCHECK;
- S390_lowcore.program_new_psw.mask &= ~PSW_MASK_MCHECK;
-}
-
-void restore_processor_state(void)
-{
- S390_lowcore.external_new_psw.mask |= PSW_MASK_MCHECK;
- S390_lowcore.svc_new_psw.mask |= PSW_MASK_MCHECK;
- S390_lowcore.io_new_psw.mask |= PSW_MASK_MCHECK;
- S390_lowcore.program_new_psw.mask |= PSW_MASK_MCHECK;
- /* Enable lowcore protection */
- __ctl_set_bit(0,28);
- local_mcck_enable();
-}
-
-/* Called at the end of swsusp_arch_resume */
-void s390_early_resume(void)
-{
- lgr_info_log();
- channel_subsystem_reinit();
- zpci_rescan();
-}
diff --git a/arch/s390/kernel/swsusp.S b/arch/s390/kernel/swsusp.S
deleted file mode 100644
index a7baf0b5f818..000000000000
--- a/arch/s390/kernel/swsusp.S
+++ /dev/null
@@ -1,276 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * S390 64-bit swsusp implementation
- *
- * Copyright IBM Corp. 2009
- *
- * Author(s): Hans-Joachim Picht <hans@linux.vnet.ibm.com>
- * Michael Holzheu <holzheu@linux.vnet.ibm.com>
- */
-
-#include <linux/linkage.h>
-#include <asm/page.h>
-#include <asm/ptrace.h>
-#include <asm/thread_info.h>
-#include <asm/asm-offsets.h>
-#include <asm/nospec-insn.h>
-#include <asm/sigp.h>
-
-/*
- * Save register context in absolute 0 lowcore and call swsusp_save() to
- * create in-memory kernel image. The context is saved in the designated
- * "store status" memory locations (see POP).
- * We return from this function twice. The first time during the suspend to
- * disk process. The second time via the swsusp_arch_resume() function
- * (see below) in the resume process.
- * This function runs with disabled interrupts.
- */
- GEN_BR_THUNK %r14
-
- .section .text
-ENTRY(swsusp_arch_suspend)
- lg %r1,__LC_NODAT_STACK
- stmg %r6,%r15,__SF_GPRS(%r1)
- aghi %r1,-STACK_FRAME_OVERHEAD
- stg %r15,__SF_BACKCHAIN(%r1)
- lgr %r15,%r1
-
- /* Store FPU registers */
- brasl %r14,save_fpu_regs
-
- /* Deactivate DAT */
- stnsm __SF_EMPTY(%r15),0xfb
-
- /* Store prefix register on stack */
- stpx __SF_EMPTY(%r15)
-
- /* Save prefix register contents for lowcore copy */
- llgf %r10,__SF_EMPTY(%r15)
-
- /* Get pointer to save area */
- lghi %r1,0x1000
-
- /* Save CPU address */
- stap __LC_EXT_CPU_ADDR(%r0)
-
- /* Store registers */
- mvc 0x318(4,%r1),__SF_EMPTY(%r15) /* move prefix to lowcore */
- stam %a0,%a15,0x340(%r1) /* store access registers */
- stctg %c0,%c15,0x380(%r1) /* store control registers */
- stmg %r0,%r15,0x280(%r1) /* store general registers */
-
- stpt 0x328(%r1) /* store timer */
- stck __SF_EMPTY(%r15) /* store clock */
- stckc 0x330(%r1) /* store clock comparator */
-
- /* Update cputime accounting before going to sleep */
- lg %r0,__LC_LAST_UPDATE_TIMER
- slg %r0,0x328(%r1)
- alg %r0,__LC_SYSTEM_TIMER
- stg %r0,__LC_SYSTEM_TIMER
- mvc __LC_LAST_UPDATE_TIMER(8),0x328(%r1)
- lg %r0,__LC_LAST_UPDATE_CLOCK
- slg %r0,__SF_EMPTY(%r15)
- alg %r0,__LC_STEAL_TIMER
- stg %r0,__LC_STEAL_TIMER
- mvc __LC_LAST_UPDATE_CLOCK(8),__SF_EMPTY(%r15)
-
- /* Activate DAT */
- stosm __SF_EMPTY(%r15),0x04
-
- /* Set prefix page to zero */
- xc __SF_EMPTY(4,%r15),__SF_EMPTY(%r15)
- spx __SF_EMPTY(%r15)
-
- /* Save absolute zero pages */
- larl %r2,suspend_zero_pages
- lg %r2,0(%r2)
- lghi %r4,0
- lghi %r3,2*PAGE_SIZE
- lghi %r5,2*PAGE_SIZE
-1: mvcle %r2,%r4,0
- jo 1b
-
- /* Copy lowcore to absolute zero lowcore */
- lghi %r2,0
- lgr %r4,%r10
- lghi %r3,2*PAGE_SIZE
- lghi %r5,2*PAGE_SIZE
-1: mvcle %r2,%r4,0
- jo 1b
-
- /* Save image */
- brasl %r14,swsusp_save
-
- /* Restore prefix register and return */
- lghi %r1,0x1000
- spx 0x318(%r1)
- lmg %r6,%r15,STACK_FRAME_OVERHEAD + __SF_GPRS(%r15)
- lghi %r2,0
- BR_EX %r14
-ENDPROC(swsusp_arch_suspend)
-
-/*
- * Restore saved memory image to correct place and restore register context.
- * Then we return to the function that called swsusp_arch_suspend().
- * swsusp_arch_resume() runs with disabled interrupts.
- */
-ENTRY(swsusp_arch_resume)
- stmg %r6,%r15,__SF_GPRS(%r15)
- lgr %r1,%r15
- aghi %r15,-STACK_FRAME_OVERHEAD
- stg %r1,__SF_BACKCHAIN(%r15)
-
- /* Make all free pages stable */
- lghi %r2,1
- brasl %r14,arch_set_page_states
-
- /* Set prefix page to zero */
- xc __SF_EMPTY(4,%r15),__SF_EMPTY(%r15)
- spx __SF_EMPTY(%r15)
-
- /* Deactivate DAT */
- stnsm __SF_EMPTY(%r15),0xfb
-
- /* Restore saved image */
- larl %r1,restore_pblist
- lg %r1,0(%r1)
- ltgr %r1,%r1
- jz 2f
-0:
- lg %r2,8(%r1)
- lg %r4,0(%r1)
- iske %r0,%r4
- lghi %r3,PAGE_SIZE
- lghi %r5,PAGE_SIZE
-1:
- mvcle %r2,%r4,0
- jo 1b
- lg %r2,8(%r1)
- sske %r0,%r2
- lg %r1,16(%r1)
- ltgr %r1,%r1
- jnz 0b
-2:
- ptlb /* flush tlb */
-
- /* Reset System */
- larl %r1,.Lnew_pgm_check_psw
- epsw %r2,%r3
- stm %r2,%r3,0(%r1)
- mvc __LC_PGM_NEW_PSW(16,%r0),0(%r1)
- larl %r1,__swsusp_reset_dma
- lg %r1,0(%r1)
- BASR_EX %r14,%r1
- larl %r1,smp_cpu_mt_shift
- icm %r1,15,0(%r1)
- jz smt_done
- llgfr %r1,%r1
-smt_loop:
- sigp %r1,%r0,SIGP_SET_MULTI_THREADING
- brc 8,smt_done /* accepted */
- brc 2,smt_loop /* busy, try again */
-smt_done:
- larl %r1,.Lnew_pgm_check_psw
- lpswe 0(%r1)
-pgm_check_entry:
-
- /* Switch to original suspend CPU */
- larl %r1,.Lresume_cpu /* Resume CPU address: r2 */
- stap 0(%r1)
- llgh %r2,0(%r1)
- llgh %r1,__LC_EXT_CPU_ADDR(%r0) /* Suspend CPU address: r1 */
- cgr %r1,%r2
- je restore_registers /* r1 = r2 -> nothing to do */
- larl %r4,.Lrestart_suspend_psw /* Set new restart PSW */
- mvc __LC_RST_NEW_PSW(16,%r0),0(%r4)
-3:
- sigp %r9,%r1,SIGP_INITIAL_CPU_RESET /* sigp initial cpu reset */
- brc 8,4f /* accepted */
- brc 2,3b /* busy, try again */
-
- /* Suspend CPU not available -> panic */
- larl %r15,init_thread_union+THREAD_SIZE-STACK_FRAME_OVERHEAD
- larl %r2,.Lpanic_string
- brasl %r14,sclp_early_printk_force
- larl %r3,.Ldisabled_wait_31
- lpsw 0(%r3)
-4:
- /* Switch to suspend CPU */
- sigp %r9,%r1,SIGP_RESTART /* sigp restart to suspend CPU */
- brc 2,4b /* busy, try again */
-5:
- sigp %r9,%r2,SIGP_STOP /* sigp stop to current resume CPU */
- brc 2,5b /* busy, try again */
-6: j 6b
-
-restart_suspend:
- larl %r1,.Lresume_cpu
- llgh %r2,0(%r1)
-7:
- sigp %r9,%r2,SIGP_SENSE /* sigp sense, wait for resume CPU */
- brc 8,7b /* accepted, status 0, still running */
- brc 2,7b /* busy, try again */
- tmll %r9,0x40 /* Test if resume CPU is stopped */
- jz 7b
-
-restore_registers:
- /* Restore registers */
- lghi %r13,0x1000 /* %r1 = pointer to save area */
-
- /* Ignore time spent in suspended state. */
- llgf %r1,0x318(%r13)
- stck __LC_LAST_UPDATE_CLOCK(%r1)
- spt 0x328(%r13) /* reprogram timer */
- //sckc 0x330(%r13) /* set clock comparator */
-
- lctlg %c0,%c15,0x380(%r13) /* load control registers */
- lam %a0,%a15,0x340(%r13) /* load access registers */
-
- /* Load old stack */
- lg %r15,0x2f8(%r13)
-
- /* Save prefix register */
- mvc __SF_EMPTY(4,%r15),0x318(%r13)
-
- /* Restore absolute zero pages */
- lghi %r2,0
- larl %r4,suspend_zero_pages
- lg %r4,0(%r4)
- lghi %r3,2*PAGE_SIZE
- lghi %r5,2*PAGE_SIZE
-1: mvcle %r2,%r4,0
- jo 1b
-
- /* Restore prefix register */
- spx __SF_EMPTY(%r15)
-
- /* Activate DAT */
- stosm __SF_EMPTY(%r15),0x04
-
- /* Make all free pages unstable */
- lghi %r2,0
- brasl %r14,arch_set_page_states
-
- /* Call arch specific early resume code */
- brasl %r14,s390_early_resume
-
- /* Return 0 */
- lmg %r6,%r15,STACK_FRAME_OVERHEAD + __SF_GPRS(%r15)
- lghi %r2,0
- BR_EX %r14
-ENDPROC(swsusp_arch_resume)
-
- .section .data..nosave,"aw",@progbits
- .align 8
-.Ldisabled_wait_31:
- .long 0x000a0000,0x00000000
-.Lpanic_string:
- .asciz "Resume not possible because suspend CPU is no longer available\n"
- .align 8
-.Lrestart_suspend_psw:
- .quad 0x0000000180000000,restart_suspend
-.Lnew_pgm_check_psw:
- .quad 0,pgm_check_entry
-.Lresume_cpu:
- .byte 0,0
diff --git a/arch/s390/kernel/sys_s390.c b/arch/s390/kernel/syscall.c
index 202fa73ac167..dc2355c623d6 100644
--- a/arch/s390/kernel/sys_s390.c
+++ b/arch/s390/kernel/syscall.c
@@ -29,6 +29,13 @@
#include <linux/unistd.h>
#include <linux/ipc.h>
#include <linux/uaccess.h>
+#include <linux/string.h>
+#include <linux/thread_info.h>
+#include <linux/entry-common.h>
+
+#include <asm/ptrace.h>
+#include <asm/vtime.h>
+
#include "entry.h"
/*
@@ -100,3 +107,64 @@ SYSCALL_DEFINE0(ni_syscall)
{
return -ENOSYS;
}
+
+static void do_syscall(struct pt_regs *regs)
+{
+ unsigned long nr;
+
+ nr = regs->int_code & 0xffff;
+ if (!nr) {
+ nr = regs->gprs[1] & 0xffff;
+ regs->int_code &= ~0xffffUL;
+ regs->int_code |= nr;
+ }
+
+ regs->gprs[2] = nr;
+
+ if (nr == __NR_restart_syscall && !(current->restart_block.arch_data & 1)) {
+ regs->psw.addr = current->restart_block.arch_data;
+ current->restart_block.arch_data = 1;
+ }
+ nr = syscall_enter_from_user_mode_work(regs, nr);
+
+ /*
+ * In the s390 ptrace ABI, both the syscall number and the return value
+ * use gpr2. However, userspace puts the syscall number either in the
+ * svc instruction itself, or uses gpr1. To make at least skipping syscalls
+ * work, the ptrace code sets PIF_SYSCALL_RET_SET, which is checked here
+ * and if set, the syscall will be skipped.
+ */
+
+ if (unlikely(test_and_clear_pt_regs_flag(regs, PIF_SYSCALL_RET_SET)))
+ goto out;
+ regs->gprs[2] = -ENOSYS;
+ if (likely(nr >= NR_syscalls))
+ goto out;
+ do {
+ regs->gprs[2] = current->thread.sys_call_table[nr](regs);
+ } while (test_and_clear_pt_regs_flag(regs, PIF_EXECVE_PGSTE_RESTART));
+out:
+ syscall_exit_to_user_mode_work(regs);
+}
+
+void noinstr __do_syscall(struct pt_regs *regs, int per_trap)
+{
+ add_random_kstack_offset();
+ enter_from_user_mode(regs);
+ regs->psw = S390_lowcore.svc_old_psw;
+ regs->int_code = S390_lowcore.svc_int_code;
+ update_timer_sys();
+ if (static_branch_likely(&cpu_has_bear))
+ current->thread.last_break = regs->last_break;
+
+ local_irq_enable();
+ regs->orig_gpr2 = regs->gprs[2];
+
+ if (per_trap)
+ set_thread_flag(TIF_PER_TRAP);
+
+ regs->flags = 0;
+ set_pt_regs_flag(regs, PIF_SYSCALL);
+ do_syscall(regs);
+ exit_to_user_mode();
+}
diff --git a/arch/s390/kernel/syscalls/Makefile b/arch/s390/kernel/syscalls/Makefile
index b98f25029b8e..fb85e797946d 100644
--- a/arch/s390/kernel/syscalls/Makefile
+++ b/arch/s390/kernel/syscalls/Makefile
@@ -21,8 +21,7 @@ uapi: $(uapi-hdrs-y)
# Create output directory if not already present
-_dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') \
- $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)')
+$(shell mkdir -p $(uapi) $(kapi))
filechk_syshdr = $(CONFIG_SHELL) '$(systbl)' -H -a $(syshdr_abi_$(basetarget)) -f "$2" < $<
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 3054e9c035a3..095bb86339a7 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -26,7 +26,7 @@
16 32 lchown - sys_lchown16
19 common lseek sys_lseek compat_sys_lseek
20 common getpid sys_getpid sys_getpid
-21 common mount sys_mount compat_sys_mount
+21 common mount sys_mount sys_mount
22 common umount sys_oldumount sys_oldumount
23 32 setuid - sys_setuid16
24 32 getuid - sys_getuid16
@@ -100,7 +100,7 @@
106 common stat sys_newstat compat_sys_newstat
107 common lstat sys_newlstat compat_sys_newlstat
108 common fstat sys_newfstat compat_sys_newfstat
-110 common lookup_dcookie sys_lookup_dcookie compat_sys_lookup_dcookie
+110 common lookup_dcookie - -
111 common vhangup sys_vhangup sys_vhangup
112 common idle - -
114 common wait4 sys_wait4 compat_sys_wait4
@@ -122,7 +122,7 @@
131 common quotactl sys_quotactl sys_quotactl
132 common getpgid sys_getpgid sys_getpgid
133 common fchdir sys_fchdir sys_fchdir
-134 common bdflush sys_bdflush sys_bdflush
+134 common bdflush sys_ni_syscall sys_ni_syscall
135 common sysfs sys_sysfs sys_sysfs
136 common personality sys_s390_personality sys_s390_personality
137 common afs_syscall - -
@@ -134,11 +134,11 @@
142 64 select sys_select -
143 common flock sys_flock sys_flock
144 common msync sys_msync sys_msync
-145 common readv sys_readv compat_sys_readv
-146 common writev sys_writev compat_sys_writev
+145 common readv sys_readv sys_readv
+146 common writev sys_writev sys_writev
147 common getsid sys_getsid sys_getsid
148 common fdatasync sys_fdatasync sys_fdatasync
-149 common _sysctl sys_sysctl compat_sys_sysctl
+149 common _sysctl - -
150 common mlock sys_mlock sys_mlock
151 common munlock sys_munlock sys_munlock
152 common mlockall sys_mlockall sys_mlockall
@@ -274,9 +274,9 @@
265 common statfs64 sys_statfs64 compat_sys_statfs64
266 common fstatfs64 sys_fstatfs64 compat_sys_fstatfs64
267 common remap_file_pages sys_remap_file_pages sys_remap_file_pages
-268 common mbind sys_mbind compat_sys_mbind
-269 common get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy
-270 common set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy
+268 common mbind sys_mbind sys_mbind
+269 common get_mempolicy sys_get_mempolicy sys_get_mempolicy
+270 common set_mempolicy sys_set_mempolicy sys_set_mempolicy
271 common mq_open sys_mq_open compat_sys_mq_open
272 common mq_unlink sys_mq_unlink sys_mq_unlink
273 common mq_timedsend sys_mq_timedsend sys_mq_timedsend_time32
@@ -293,7 +293,7 @@
284 common inotify_init sys_inotify_init sys_inotify_init
285 common inotify_add_watch sys_inotify_add_watch sys_inotify_add_watch
286 common inotify_rm_watch sys_inotify_rm_watch sys_inotify_rm_watch
-287 common migrate_pages sys_migrate_pages compat_sys_migrate_pages
+287 common migrate_pages sys_migrate_pages sys_migrate_pages
288 common openat sys_openat compat_sys_openat
289 common mkdirat sys_mkdirat sys_mkdirat
290 common mknodat sys_mknodat sys_mknodat
@@ -316,8 +316,8 @@
306 common splice sys_splice sys_splice
307 common sync_file_range sys_sync_file_range compat_sys_s390_sync_file_range
308 common tee sys_tee sys_tee
-309 common vmsplice sys_vmsplice compat_sys_vmsplice
-310 common move_pages sys_move_pages compat_sys_move_pages
+309 common vmsplice sys_vmsplice sys_vmsplice
+310 common move_pages sys_move_pages sys_move_pages
311 common getcpu sys_getcpu sys_getcpu
312 common epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait
313 common utimes sys_utimes sys_utimes_time32
@@ -347,8 +347,8 @@
337 common clock_adjtime sys_clock_adjtime sys_clock_adjtime32
338 common syncfs sys_syncfs sys_syncfs
339 common setns sys_setns sys_setns
-340 common process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv
-341 common process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev
+340 common process_vm_readv sys_process_vm_readv sys_process_vm_readv
+341 common process_vm_writev sys_process_vm_writev sys_process_vm_writev
342 common s390_runtime_instr sys_s390_runtime_instr sys_s390_runtime_instr
343 common kcmp sys_kcmp sys_kcmp
344 common finit_module sys_finit_module sys_finit_module
@@ -372,8 +372,8 @@
362 common connect sys_connect sys_connect
363 common listen sys_listen sys_listen
364 common accept4 sys_accept4 sys_accept4
-365 common getsockopt sys_getsockopt compat_sys_getsockopt
-366 common setsockopt sys_setsockopt compat_sys_setsockopt
+365 common getsockopt sys_getsockopt sys_getsockopt
+366 common setsockopt sys_setsockopt sys_setsockopt
367 common getsockname sys_getsockname sys_getsockname
368 common getpeername sys_getpeername sys_getpeername
369 common sendto sys_sendto sys_sendto
@@ -438,3 +438,29 @@
433 common fspick sys_fspick sys_fspick
434 common pidfd_open sys_pidfd_open sys_pidfd_open
435 common clone3 sys_clone3 sys_clone3
+436 common close_range sys_close_range sys_close_range
+437 common openat2 sys_openat2 sys_openat2
+438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd
+439 common faccessat2 sys_faccessat2 sys_faccessat2
+440 common process_madvise sys_process_madvise sys_process_madvise
+441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2
+442 common mount_setattr sys_mount_setattr sys_mount_setattr
+443 common quotactl_fd sys_quotactl_fd sys_quotactl_fd
+444 common landlock_create_ruleset sys_landlock_create_ruleset sys_landlock_create_ruleset
+445 common landlock_add_rule sys_landlock_add_rule sys_landlock_add_rule
+446 common landlock_restrict_self sys_landlock_restrict_self sys_landlock_restrict_self
+447 common memfd_secret sys_memfd_secret sys_memfd_secret
+448 common process_mrelease sys_process_mrelease sys_process_mrelease
+449 common futex_waitv sys_futex_waitv sys_futex_waitv
+450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node
+451 common cachestat sys_cachestat sys_cachestat
+452 common fchmodat2 sys_fchmodat2 sys_fchmodat2
+453 common map_shadow_stack sys_map_shadow_stack sys_map_shadow_stack
+454 common futex_wake sys_futex_wake sys_futex_wake
+455 common futex_wait sys_futex_wait sys_futex_wait
+456 common futex_requeue sys_futex_requeue sys_futex_requeue
+457 common statmount sys_statmount sys_statmount
+458 common listmount sys_listmount sys_listmount
+459 common lsm_get_self_attr sys_lsm_get_self_attr sys_lsm_get_self_attr
+460 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr
+461 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules
diff --git a/arch/s390/kernel/sysinfo.c b/arch/s390/kernel/sysinfo.c
index 2ac3c9b56a13..f6f8f498c9be 100644
--- a/arch/s390/kernel/sysinfo.c
+++ b/arch/s390/kernel/sysinfo.c
@@ -14,6 +14,7 @@
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/slab.h>
+#include <asm/asm-extable.h>
#include <asm/ebcdic.h>
#include <asm/debug.h>
#include <asm/sysinfo.h>
@@ -25,19 +26,22 @@ int topology_max_mnest;
static inline int __stsi(void *sysinfo, int fc, int sel1, int sel2, int *lvl)
{
- register int r0 asm("0") = (fc << 28) | sel1;
- register int r1 asm("1") = sel2;
+ int r0 = (fc << 28) | sel1;
int rc = 0;
asm volatile(
- " stsi 0(%3)\n"
+ " lr 0,%[r0]\n"
+ " lr 1,%[r1]\n"
+ " stsi 0(%[sysinfo])\n"
"0: jz 2f\n"
- "1: lhi %1,%4\n"
- "2:\n"
+ "1: lhi %[rc],%[retval]\n"
+ "2: lr %[r0],0\n"
EX_TABLE(0b, 1b)
- : "+d" (r0), "+d" (rc)
- : "d" (r1), "a" (sysinfo), "K" (-EOPNOTSUPP)
- : "cc", "memory");
+ : [r0] "+d" (r0), [rc] "+d" (rc)
+ : [r1] "d" (sel2),
+ [sysinfo] "a" (sysinfo),
+ [retval] "K" (-EOPNOTSUPP)
+ : "cc", "0", "1", "memory");
*lvl = ((unsigned int) r0) >> 28;
return rc;
}
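
For reference, the rewritten __stsi() packs its request the same way the removed register-asm version did: the function code goes into the top nibble of r0 with selector 1 in the low bits, selector 2 travels in r1, and the machine reports the current configuration level back in the top nibble of r0. A small sketch of just that encoding arithmetic, as ordinary C independent of the inline assembly (the helper names are made up):

    #include <assert.h>

    /* Request word: function code in the top four bits, selector 1 in the low bits. */
    static unsigned int stsi_req(unsigned int fc, unsigned int sel1)
    {
        return (fc << 28) | sel1;
    }

    /* The current configuration level is reported back in the same high nibble. */
    static unsigned int stsi_level(unsigned int r0)
    {
        return r0 >> 28;
    }

    int main(void)
    {
        assert(stsi_req(1, 1) == 0x10000001);   /* STSI 1.1.1: basic machine data */
        assert(stsi_level(0x20000000) == 2);    /* level 2 would mean running in an LPAR */
        return 0;
    }
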
@@ -77,10 +81,12 @@ static bool convert_ext_name(unsigned char encoding, char *name, size_t len)
static void stsi_1_1_1(struct seq_file *m, struct sysinfo_1_1_1 *info)
{
+ bool has_var_cap;
int i;
if (stsi(info, 1, 1, 1))
return;
+ has_var_cap = !!info->model_var_cap[0];
EBCASC(info->manufacturer, sizeof(info->manufacturer));
EBCASC(info->type, sizeof(info->type));
EBCASC(info->model, sizeof(info->model));
@@ -89,6 +95,8 @@ static void stsi_1_1_1(struct seq_file *m, struct sysinfo_1_1_1 *info)
EBCASC(info->model_capacity, sizeof(info->model_capacity));
EBCASC(info->model_perm_cap, sizeof(info->model_perm_cap));
EBCASC(info->model_temp_cap, sizeof(info->model_temp_cap));
+ if (has_var_cap)
+ EBCASC(info->model_var_cap, sizeof(info->model_var_cap));
seq_printf(m, "Manufacturer: %-16.16s\n", info->manufacturer);
seq_printf(m, "Type: %-4.4s\n", info->type);
if (info->lic)
@@ -116,12 +124,18 @@ static void stsi_1_1_1(struct seq_file *m, struct sysinfo_1_1_1 *info)
seq_printf(m, "Model Temp. Capacity: %-16.16s %08u\n",
info->model_temp_cap,
info->model_temp_cap_rating);
+ if (has_var_cap && info->model_var_cap_rating)
+ seq_printf(m, "Model Var. Capacity: %-16.16s %08u\n",
+ info->model_var_cap,
+ info->model_var_cap_rating);
if (info->ncr)
seq_printf(m, "Nominal Cap. Rating: %08u\n", info->ncr);
if (info->npr)
seq_printf(m, "Nominal Perm. Rating: %08u\n", info->npr);
if (info->ntr)
seq_printf(m, "Nominal Temp. Rating: %08u\n", info->ntr);
+ if (has_var_cap && info->nvr)
+ seq_printf(m, "Nominal Var. Rating: %08u\n", info->nvr);
if (info->cai) {
seq_printf(m, "Capacity Adj. Ind.: %d\n", info->cai);
seq_printf(m, "Capacity Ch. Reason: %d\n", info->ccr);
diff --git a/arch/s390/kernel/text_amode31.S b/arch/s390/kernel/text_amode31.S
new file mode 100644
index 000000000000..14c6d25c035f
--- /dev/null
+++ b/arch/s390/kernel/text_amode31.S
@@ -0,0 +1,159 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Code that needs to run below 2 GB.
+ *
+ * Copyright IBM Corp. 2019
+ */
+
+#include <linux/linkage.h>
+#include <asm/asm-extable.h>
+#include <asm/errno.h>
+#include <asm/sigp.h>
+
+ .section .amode31.text,"ax"
+/*
+ * Simplified version of expoline thunk. The normal thunks can not be used here,
+ * because they might be more than 2 GB away, and not reachable by the relative
+ * branch. No comdat, exrl, etc. optimizations used here, because it only
+ * affects a few functions that are not performance-relevant.
+ */
+ .macro BR_EX_AMODE31_r14
+ larl %r1,0f
+ ex 0,0(%r1)
+ j .
+0: br %r14
+ .endm
+
+/*
+ * int _diag14_amode31(unsigned long rx, unsigned long ry1, unsigned long subcode)
+ */
+SYM_FUNC_START(_diag14_amode31)
+ lgr %r1,%r2
+ lgr %r2,%r3
+ lgr %r3,%r4
+ lhi %r5,-EIO
+ sam31
+ diag %r1,%r2,0x14
+.Ldiag14_ex:
+ ipm %r5
+ srl %r5,28
+.Ldiag14_fault:
+ sam64
+ lgfr %r2,%r5
+ BR_EX_AMODE31_r14
+ EX_TABLE_AMODE31(.Ldiag14_ex, .Ldiag14_fault)
+SYM_FUNC_END(_diag14_amode31)
+
+/*
+ * int _diag210_amode31(struct diag210 *addr)
+ */
+SYM_FUNC_START(_diag210_amode31)
+ lgr %r1,%r2
+ lhi %r2,-1
+ sam31
+ diag %r1,%r0,0x210
+.Ldiag210_ex:
+ ipm %r2
+ srl %r2,28
+.Ldiag210_fault:
+ sam64
+ lgfr %r2,%r2
+ BR_EX_AMODE31_r14
+ EX_TABLE_AMODE31(.Ldiag210_ex, .Ldiag210_fault)
+SYM_FUNC_END(_diag210_amode31)
+
+/*
+ * int _diag8c_amode31(struct diag8c *addr, struct ccw_dev_id *devno, size_t len)
+ */

+SYM_FUNC_START(_diag8c_amode31)
+ llgf %r3,0(%r3)
+ sam31
+ diag %r2,%r4,0x8c
+.Ldiag8c_ex:
+ sam64
+ lgfr %r2,%r3
+ BR_EX_AMODE31_r14
+ EX_TABLE_AMODE31(.Ldiag8c_ex, .Ldiag8c_ex)
+SYM_FUNC_END(_diag8c_amode31)
+/*
+ * int _diag26c_amode31(void *req, void *resp, enum diag26c_sc subcode)
+ */
+SYM_FUNC_START(_diag26c_amode31)
+ lghi %r5,-EOPNOTSUPP
+ sam31
+ diag %r2,%r4,0x26c
+.Ldiag26c_ex:
+ sam64
+ lgfr %r2,%r5
+ BR_EX_AMODE31_r14
+ EX_TABLE_AMODE31(.Ldiag26c_ex, .Ldiag26c_ex)
+SYM_FUNC_END(_diag26c_amode31)
+
+/*
+ * void _diag0c_amode31(struct hypfs_diag0c_entry *entry)
+ */
+SYM_FUNC_START(_diag0c_amode31)
+ sam31
+ diag %r2,%r2,0x0c
+ sam64
+ BR_EX_AMODE31_r14
+SYM_FUNC_END(_diag0c_amode31)
+
+/*
+ * void _diag308_reset_amode31(void)
+ *
+ * Calls diag 308 subcode 1 and continues execution
+ */
+SYM_FUNC_START(_diag308_reset_amode31)
+ larl %r4,ctlregs # Save control registers
+ stctg %c0,%c15,0(%r4)
+ lg %r2,0(%r4) # Disable lowcore protection
+ nilh %r2,0xefff
+ larl %r4,ctlreg0
+ stg %r2,0(%r4)
+ lctlg %c0,%c0,0(%r4)
+ larl %r4,fpctl # Floating point control register
+ stfpc 0(%r4)
+ larl %r4,prefix # Save prefix register
+ stpx 0(%r4)
+ larl %r4,prefix_zero # Set prefix register to 0
+ spx 0(%r4)
+ larl %r4,continue_psw # Save PSW flags
+ epsw %r2,%r3
+ stm %r2,%r3,0(%r4)
+ larl %r4,.Lrestart_part2 # Setup restart PSW at absolute 0
+ larl %r3,restart_diag308_psw
+ og %r4,0(%r3) # Save PSW
+ lghi %r3,0
+ sturg %r4,%r3 # Use sturg, because of large pages
+ lghi %r1,1
+ lghi %r0,0
+ diag %r0,%r1,0x308
+.Lrestart_part2:
+ lhi %r0,0 # Load r0 with zero
+ lhi %r1,2 # Use mode 2 = ESAME (dump)
+ sigp %r1,%r0,SIGP_SET_ARCHITECTURE # Switch to ESAME mode
+ sam64 # Switch to 64 bit addressing mode
+ larl %r4,ctlregs # Restore control registers
+ lctlg %c0,%c15,0(%r4)
+ larl %r4,fpctl # Restore floating point ctl register
+ lfpc 0(%r4)
+ larl %r4,prefix # Restore prefix register
+ spx 0(%r4)
+ larl %r4,continue_psw # Restore PSW flags
+ larl %r2,.Lcontinue
+ stg %r2,8(%r4)
+ lpswe 0(%r4)
+.Lcontinue:
+ BR_EX_AMODE31_r14
+SYM_FUNC_END(_diag308_reset_amode31)
+
+ .section .amode31.data,"aw",@progbits
+ .balign 8
+SYM_DATA_LOCAL(restart_diag308_psw, .long 0x00080000,0x80000000)
+SYM_DATA_LOCAL(continue_psw, .quad 0,0)
+SYM_DATA_LOCAL(ctlreg0, .quad 0)
+SYM_DATA_LOCAL(ctlregs, .fill 16,8,0)
+SYM_DATA_LOCAL(fpctl, .long 0)
+SYM_DATA_LOCAL(prefix, .long 0)
+SYM_DATA_LOCAL(prefix_zero, .long 0)
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index f9d070d016e3..14abad953c02 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -41,6 +41,9 @@
#include <linux/gfp.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
+#include <vdso/vsyscall.h>
+#include <vdso/clocksource.h>
+#include <vdso/helpers.h>
#include <asm/facility.h>
#include <asm/delay.h>
#include <asm/div64.h>
@@ -52,11 +55,7 @@
#include <asm/cio.h>
#include "entry.h"
-unsigned char tod_clock_base[16] __aligned(8) = {
- /* Force to data section. */
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
-};
+union tod_clock tod_clock_base __section(".data");
EXPORT_SYMBOL_GPL(tod_clock_base);
u64 clock_comparator_max = -1ULL;
@@ -69,10 +68,10 @@ EXPORT_SYMBOL(s390_epoch_delta_notifier);
unsigned char ptff_function_mask[16];
-static unsigned long long lpar_offset;
-static unsigned long long initial_leap_seconds;
-static unsigned long long tod_steering_end;
-static long long tod_steering_delta;
+static unsigned long lpar_offset;
+static unsigned long initial_leap_seconds;
+static unsigned long tod_steering_end;
+static long tod_steering_delta;
/*
* Get time offsets with PTFF
@@ -81,10 +80,12 @@ void __init time_early_init(void)
{
struct ptff_qto qto;
struct ptff_qui qui;
+ int cs;
/* Initialize TOD steering parameters */
- tod_steering_end = *(unsigned long long *) &tod_clock_base[1];
- vdso_data->ts_end = tod_steering_end;
+ tod_steering_end = tod_clock_base.tod;
+ for (cs = 0; cs < CS_BASES; cs++)
+ vdso_data[cs].arch_data.tod_steering_end = tod_steering_end;
if (!test_facility(28))
return;
@@ -97,10 +98,15 @@ void __init time_early_init(void)
/* get initial leap seconds */
if (ptff_query(PTFF_QUI) && ptff(&qui, sizeof(qui), PTFF_QUI) == 0)
- initial_leap_seconds = (unsigned long long)
+ initial_leap_seconds = (unsigned long)
((long) qui.old_leap * 4096000000L);
}
+unsigned long long noinstr sched_clock_noinstr(void)
+{
+ return tod_to_ns(__get_tod_clock_monotonic());
+}
+
/*
* Scheduler clock - returns current time in nanosec units.
*/
@@ -110,18 +116,13 @@ unsigned long long notrace sched_clock(void)
}
NOKPROBE_SYMBOL(sched_clock);
-static void ext_to_timespec64(unsigned char *clk, struct timespec64 *xt)
+static void ext_to_timespec64(union tod_clock *clk, struct timespec64 *xt)
{
- unsigned long long high, low, rem, sec, nsec;
+ unsigned long rem, sec, nsec;
- /* Split extendnd TOD clock to micro-seconds and sub-micro-seconds */
- high = (*(unsigned long long *) clk) >> 4;
- low = (*(unsigned long long *)&clk[7]) << 4;
- /* Calculate seconds and nano-seconds */
- sec = high;
+ sec = clk->us;
rem = do_div(sec, 1000000);
- nsec = (((low >> 32) + (rem << 32)) * 1000) >> 32;
-
+ nsec = ((clk->sus + (rem << 12)) * 125) >> 9;
xt->tv_sec = sec;
xt->tv_nsec = nsec;
}
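
The rewritten ext_to_timespec64() relies on the split done by union tod_clock: clk->us holds whole microseconds and clk->sus the remaining 1/4096ths of a microsecond, so the sub-second part converts to nanoseconds as (rem * 4096 + sus) * 1000 / 4096. Since 1000/4096 reduces to 125/512, that is exactly the ((sus + (rem << 12)) * 125) >> 9 used above. A stand-alone sketch that checks the equivalence of the two forms:

    #include <assert.h>
    #include <stdint.h>

    /* Reference conversion: units of 1/4096 microsecond -> nanoseconds. */
    static uint64_t frac_to_ns_ref(uint64_t rem_us, uint64_t sus)
    {
        return ((rem_us * 4096 + sus) * 1000) / 4096;
    }

    /* Shift/multiply form used by the patch: 1000/4096 == 125/512 == (* 125) >> 9. */
    static uint64_t frac_to_ns_fast(uint64_t rem_us, uint64_t sus)
    {
        return ((sus + (rem_us << 12)) * 125) >> 9;
    }

    int main(void)
    {
        for (uint64_t rem = 0; rem < 1000000; rem += 12345)
            for (uint64_t sus = 0; sus < 4096; sus += 97)
                assert(frac_to_ns_ref(rem, sus) == frac_to_ns_fast(rem, sus));
        return 0;
    }
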
@@ -172,10 +173,10 @@ void init_cpu_timer(void)
clockevents_register_device(cd);
/* Enable clock comparator timer interrupt. */
- __ctl_set_bit(0,11);
+ local_ctl_set_bit(0, CR0_CLOCK_COMPARATOR_SUBMASK_BIT);
/* Always allow the timing alert external interrupt. */
- __ctl_set_bit(0, 4);
+ local_ctl_set_bit(0, CR0_ETR_SUBMASK_BIT);
}
static void clock_comparator_interrupt(struct ext_code ext_code,
@@ -201,30 +202,26 @@ static void stp_reset(void);
void read_persistent_clock64(struct timespec64 *ts)
{
- unsigned char clk[STORE_CLOCK_EXT_SIZE];
- __u64 delta;
+ union tod_clock clk;
+ u64 delta;
delta = initial_leap_seconds + TOD_UNIX_EPOCH;
- get_tod_clock_ext(clk);
- *(__u64 *) &clk[1] -= delta;
- if (*(__u64 *) &clk[1] > delta)
- clk[0]--;
- ext_to_timespec64(clk, ts);
+ store_tod_clock_ext(&clk);
+ clk.eitod -= delta;
+ ext_to_timespec64(&clk, ts);
}
void __init read_persistent_wall_and_boot_offset(struct timespec64 *wall_time,
struct timespec64 *boot_offset)
{
- unsigned char clk[STORE_CLOCK_EXT_SIZE];
struct timespec64 boot_time;
- __u64 delta;
+ union tod_clock clk;
+ u64 delta;
delta = initial_leap_seconds + TOD_UNIX_EPOCH;
- memcpy(clk, tod_clock_base, STORE_CLOCK_EXT_SIZE);
- *(__u64 *)&clk[1] -= delta;
- if (*(__u64 *)&clk[1] > delta)
- clk[0]--;
- ext_to_timespec64(clk, &boot_time);
+ clk = tod_clock_base;
+ clk.eitod -= delta;
+ ext_to_timespec64(&clk, &boot_time);
read_persistent_clock64(wall_time);
*boot_offset = timespec64_sub(*wall_time, boot_time);
@@ -232,12 +229,12 @@ void __init read_persistent_wall_and_boot_offset(struct timespec64 *wall_time,
static u64 read_tod_clock(struct clocksource *cs)
{
- unsigned long long now, adj;
+ unsigned long now, adj;
preempt_disable(); /* protect from changes to steering parameters */
now = get_tod_clock();
adj = tod_steering_end - now;
- if (unlikely((s64) adj >= 0))
+ if (unlikely((s64) adj > 0))
/*
* manually steer by 1 cycle every 2^16 cycles. This
* corresponds to shifting the tod delta by 15. 1s is
@@ -253,10 +250,11 @@ static struct clocksource clocksource_tod = {
.name = "tod",
.rating = 400,
.read = read_tod_clock,
- .mask = -1ULL,
+ .mask = CLOCKSOURCE_MASK(64),
.mult = 1000,
.shift = 12,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
+ .vdso_clock_mode = VDSO_CLOCKMODE_TOD,
};
struct clocksource * __init clocksource_default_clock(void)
@@ -264,55 +262,6 @@ struct clocksource * __init clocksource_default_clock(void)
return &clocksource_tod;
}
-void update_vsyscall(struct timekeeper *tk)
-{
- u64 nsecps;
-
- if (tk->tkr_mono.clock != &clocksource_tod)
- return;
-
- /* Make userspace gettimeofday spin until we're done. */
- ++vdso_data->tb_update_count;
- smp_wmb();
- vdso_data->xtime_tod_stamp = tk->tkr_mono.cycle_last;
- vdso_data->xtime_clock_sec = tk->xtime_sec;
- vdso_data->xtime_clock_nsec = tk->tkr_mono.xtime_nsec;
- vdso_data->wtom_clock_sec =
- tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
- vdso_data->wtom_clock_nsec = tk->tkr_mono.xtime_nsec +
- + ((u64) tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift);
- nsecps = (u64) NSEC_PER_SEC << tk->tkr_mono.shift;
- while (vdso_data->wtom_clock_nsec >= nsecps) {
- vdso_data->wtom_clock_nsec -= nsecps;
- vdso_data->wtom_clock_sec++;
- }
-
- vdso_data->xtime_coarse_sec = tk->xtime_sec;
- vdso_data->xtime_coarse_nsec =
- (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
- vdso_data->wtom_coarse_sec =
- vdso_data->xtime_coarse_sec + tk->wall_to_monotonic.tv_sec;
- vdso_data->wtom_coarse_nsec =
- vdso_data->xtime_coarse_nsec + tk->wall_to_monotonic.tv_nsec;
- while (vdso_data->wtom_coarse_nsec >= NSEC_PER_SEC) {
- vdso_data->wtom_coarse_nsec -= NSEC_PER_SEC;
- vdso_data->wtom_coarse_sec++;
- }
-
- vdso_data->tk_mult = tk->tkr_mono.mult;
- vdso_data->tk_shift = tk->tkr_mono.shift;
- smp_wmb();
- ++vdso_data->tb_update_count;
-}
-
-extern struct timezone sys_tz;
-
-void update_vsyscall_tz(void)
-{
- vdso_data->tz_minuteswest = sys_tz.tz_minuteswest;
- vdso_data->tz_dsttime = sys_tz.tz_dsttime;
-}
-
/*
* Initialize the TOD clock and the CPU timer of
* the boot cpu.
@@ -341,11 +290,12 @@ void __init time_init(void)
}
static DEFINE_PER_CPU(atomic_t, clock_sync_word);
-static DEFINE_MUTEX(clock_sync_mutex);
+static DEFINE_MUTEX(stp_mutex);
static unsigned long clock_sync_flags;
-#define CLOCK_SYNC_HAS_STP 0
-#define CLOCK_SYNC_STP 1
+#define CLOCK_SYNC_HAS_STP 0
+#define CLOCK_SYNC_STP 1
+#define CLOCK_SYNC_STPINFO_VALID 2
/*
* The get_clock function for the physical clock. It will get the current
@@ -419,18 +369,15 @@ static inline int check_sync_clock(void)
* Apply clock delta to the global data structures.
* This is called once on the CPU that performed the clock sync.
*/
-static void clock_sync_global(unsigned long long delta)
+static void clock_sync_global(long delta)
{
unsigned long now, adj;
struct ptff_qto qto;
+ int cs;
/* Fixup the monotonic sched clock. */
- *(unsigned long long *) &tod_clock_base[1] += delta;
- if (*(unsigned long long *) &tod_clock_base[1] < delta)
- /* Epoch overflow */
- tod_clock_base[0]++;
+ tod_clock_base.eitod += delta;
/* Adjust TOD steering parameters. */
- vdso_data->tb_update_count++;
now = get_tod_clock();
adj = tod_steering_end - now;
if (unlikely((s64) adj >= 0))
@@ -439,12 +386,14 @@ static void clock_sync_global(unsigned long long delta)
-(adj >> 15) : (adj >> 15);
tod_steering_delta += delta;
if ((abs(tod_steering_delta) >> 48) != 0)
- panic("TOD clock sync offset %lli is too large to drift\n",
+ panic("TOD clock sync offset %li is too large to drift\n",
tod_steering_delta);
tod_steering_end = now + (abs(tod_steering_delta) << 15);
- vdso_data->ts_dir = (tod_steering_delta < 0) ? 0 : 1;
- vdso_data->ts_end = tod_steering_end;
- vdso_data->tb_update_count++;
+ for (cs = 0; cs < CS_BASES; cs++) {
+ vdso_data[cs].arch_data.tod_steering_end = tod_steering_end;
+ vdso_data[cs].arch_data.tod_steering_delta = tod_steering_delta;
+ }
+
/* Update LPAR offset. */
if (ptff_query(PTFF_QTO) && ptff(&qto, sizeof(qto), PTFF_QTO) == 0)
lpar_offset = qto.tod_epoch_difference;
@@ -456,7 +405,7 @@ static void clock_sync_global(unsigned long long delta)
* Apply clock delta to the per-CPU data structures of this CPU.
* This is called for each online CPU after the call to clock_sync_global.
*/
-static void clock_sync_local(unsigned long long delta)
+static void clock_sync_local(long delta)
{
/* Add the delta to the clock comparator. */
if (S390_lowcore.clock_comparator != clock_comparator_max) {
@@ -480,7 +429,7 @@ static void __init time_init_wq(void)
struct clock_sync_data {
atomic_t cpus;
int in_sync;
- unsigned long long clock_delta;
+ long clock_delta;
};
/*
@@ -491,7 +440,6 @@ static struct stp_sstpi stp_info;
static void *stp_page;
static void stp_work_fn(struct work_struct *work);
-static DEFINE_MUTEX(stp_work_mutex);
static DECLARE_WORK(stp_work, stp_work_fn);
static struct timer_list stp_timer;
@@ -582,10 +530,26 @@ void stp_queue_work(void)
queue_work(time_sync_wq, &stp_work);
}
+static int __store_stpinfo(void)
+{
+ int rc = chsc_sstpi(stp_page, &stp_info, sizeof(struct stp_sstpi));
+
+ if (rc)
+ clear_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags);
+ else
+ set_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags);
+ return rc;
+}
+
+static int stpinfo_valid(void)
+{
+ return stp_online && test_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags);
+}
+
static int stp_sync_clock(void *data)
{
struct clock_sync_data *sync = data;
- unsigned long long clock_delta;
+ long clock_delta, flags;
static int first;
int rc;
@@ -595,19 +559,18 @@ static int stp_sync_clock(void *data)
while (atomic_read(&sync->cpus) != 0)
cpu_relax();
rc = 0;
- if (stp_info.todoff[0] || stp_info.todoff[1] ||
- stp_info.todoff[2] || stp_info.todoff[3] ||
- stp_info.tmd != 2) {
+ if (stp_info.todoff || stp_info.tmd != 2) {
+ flags = vdso_update_begin();
rc = chsc_sstpc(stp_page, STP_OP_SYNC, 0,
&clock_delta);
if (rc == 0) {
sync->clock_delta = clock_delta;
clock_sync_global(clock_delta);
- rc = chsc_sstpi(stp_page, &stp_info,
- sizeof(struct stp_sstpi));
+ rc = __store_stpinfo();
if (rc == 0 && stp_info.tmd != 2)
rc = -EAGAIN;
}
+ vdso_update_end(flags);
}
sync->in_sync = rc ? -EAGAIN : 1;
xchg(&first, 0);
@@ -627,6 +590,81 @@ static int stp_sync_clock(void *data)
return 0;
}
+static int stp_clear_leap(void)
+{
+ struct __kernel_timex txc;
+ int ret;
+
+ memset(&txc, 0, sizeof(txc));
+
+ ret = do_adjtimex(&txc);
+ if (ret < 0)
+ return ret;
+
+ txc.modes = ADJ_STATUS;
+ txc.status &= ~(STA_INS|STA_DEL);
+ return do_adjtimex(&txc);
+}
+
+static void stp_check_leap(void)
+{
+ struct stp_stzi stzi;
+ struct stp_lsoib *lsoib = &stzi.lsoib;
+ struct __kernel_timex txc;
+ int64_t timediff;
+ int leapdiff, ret;
+
+ if (!stp_info.lu || !check_sync_clock()) {
+ /*
+ * Either a scheduled leap second was removed by the operator,
+ * or STP is out of sync. In both cases, clear the leap second
+ * kernel flags.
+ */
+ if (stp_clear_leap() < 0)
+ pr_err("failed to clear leap second flags\n");
+ return;
+ }
+
+ if (chsc_stzi(stp_page, &stzi, sizeof(stzi))) {
+ pr_err("stzi failed\n");
+ return;
+ }
+
+ timediff = tod_to_ns(lsoib->nlsout - get_tod_clock()) / NSEC_PER_SEC;
+ leapdiff = lsoib->nlso - lsoib->also;
+
+ if (leapdiff != 1 && leapdiff != -1) {
+ pr_err("Cannot schedule %d leap seconds\n", leapdiff);
+ return;
+ }
+
+ if (timediff < 0) {
+ if (stp_clear_leap() < 0)
+ pr_err("failed to clear leap second flags\n");
+ } else if (timediff < 7200) {
+ memset(&txc, 0, sizeof(txc));
+ ret = do_adjtimex(&txc);
+ if (ret < 0)
+ return;
+
+ txc.modes = ADJ_STATUS;
+ if (leapdiff > 0)
+ txc.status |= STA_INS;
+ else
+ txc.status |= STA_DEL;
+ ret = do_adjtimex(&txc);
+ if (ret < 0)
+ pr_err("failed to set leap second flags\n");
+ /* arm Timer to clear leap second flags */
+ mod_timer(&stp_timer, jiffies + msecs_to_jiffies(14400 * MSEC_PER_SEC));
+ } else {
+ /* The day the leap second is scheduled for hasn't been reached. Retry
+ * in one hour.
+ */
+ mod_timer(&stp_timer, jiffies + msecs_to_jiffies(3600 * MSEC_PER_SEC));
+ }
+}
+
/*
* STP work. Check for the STP state and take over the clock
* synchronization if the STP clock source is usable.
@@ -637,7 +675,7 @@ static void stp_work_fn(struct work_struct *work)
int rc;
/* prevent multiple execution. */
- mutex_lock(&stp_work_mutex);
+ mutex_lock(&stp_mutex);
if (!stp_online) {
chsc_sstpc(stp_page, STP_OP_CTRL, 0x0000, NULL);
@@ -645,33 +683,34 @@ static void stp_work_fn(struct work_struct *work)
goto out_unlock;
}
- rc = chsc_sstpc(stp_page, STP_OP_CTRL, 0xb0e0, NULL);
+ rc = chsc_sstpc(stp_page, STP_OP_CTRL, 0xf0e0, NULL);
if (rc)
goto out_unlock;
- rc = chsc_sstpi(stp_page, &stp_info, sizeof(struct stp_sstpi));
+ rc = __store_stpinfo();
if (rc || stp_info.c == 0)
goto out_unlock;
/* Skip synchronization if the clock is already in sync. */
- if (check_sync_clock())
- goto out_unlock;
-
- memset(&stp_sync, 0, sizeof(stp_sync));
- cpus_read_lock();
- atomic_set(&stp_sync.cpus, num_online_cpus() - 1);
- stop_machine_cpuslocked(stp_sync_clock, &stp_sync, cpu_online_mask);
- cpus_read_unlock();
+ if (!check_sync_clock()) {
+ memset(&stp_sync, 0, sizeof(stp_sync));
+ cpus_read_lock();
+ atomic_set(&stp_sync.cpus, num_online_cpus() - 1);
+ stop_machine_cpuslocked(stp_sync_clock, &stp_sync, cpu_online_mask);
+ cpus_read_unlock();
+ }
if (!check_sync_clock())
/*
- * There is a usable clock but the synchonization failed.
+ * There is a usable clock but the synchronization failed.
* Retry after a second.
*/
- mod_timer(&stp_timer, jiffies + HZ);
+ mod_timer(&stp_timer, jiffies + msecs_to_jiffies(MSEC_PER_SEC));
+ else if (stp_info.lu)
+ stp_check_leap();
out_unlock:
- mutex_unlock(&stp_work_mutex);
+ mutex_unlock(&stp_mutex);
}
/*
@@ -682,115 +721,178 @@ static struct bus_type stp_subsys = {
.dev_name = "stp",
};
-static ssize_t stp_ctn_id_show(struct device *dev,
+static ssize_t ctn_id_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- if (!stp_online)
- return -ENODATA;
- return sprintf(buf, "%016llx\n",
- *(unsigned long long *) stp_info.ctnid);
+ ssize_t ret = -ENODATA;
+
+ mutex_lock(&stp_mutex);
+ if (stpinfo_valid())
+ ret = sprintf(buf, "%016lx\n",
+ *(unsigned long *) stp_info.ctnid);
+ mutex_unlock(&stp_mutex);
+ return ret;
}
-static DEVICE_ATTR(ctn_id, 0400, stp_ctn_id_show, NULL);
+static DEVICE_ATTR_RO(ctn_id);
-static ssize_t stp_ctn_type_show(struct device *dev,
+static ssize_t ctn_type_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- if (!stp_online)
- return -ENODATA;
- return sprintf(buf, "%i\n", stp_info.ctn);
+ ssize_t ret = -ENODATA;
+
+ mutex_lock(&stp_mutex);
+ if (stpinfo_valid())
+ ret = sprintf(buf, "%i\n", stp_info.ctn);
+ mutex_unlock(&stp_mutex);
+ return ret;
}
-static DEVICE_ATTR(ctn_type, 0400, stp_ctn_type_show, NULL);
+static DEVICE_ATTR_RO(ctn_type);
-static ssize_t stp_dst_offset_show(struct device *dev,
+static ssize_t dst_offset_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- if (!stp_online || !(stp_info.vbits & 0x2000))
- return -ENODATA;
- return sprintf(buf, "%i\n", (int)(s16) stp_info.dsto);
+ ssize_t ret = -ENODATA;
+
+ mutex_lock(&stp_mutex);
+ if (stpinfo_valid() && (stp_info.vbits & 0x2000))
+ ret = sprintf(buf, "%i\n", (int)(s16) stp_info.dsto);
+ mutex_unlock(&stp_mutex);
+ return ret;
}
-static DEVICE_ATTR(dst_offset, 0400, stp_dst_offset_show, NULL);
+static DEVICE_ATTR_RO(dst_offset);
-static ssize_t stp_leap_seconds_show(struct device *dev,
+static ssize_t leap_seconds_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- if (!stp_online || !(stp_info.vbits & 0x8000))
+ ssize_t ret = -ENODATA;
+
+ mutex_lock(&stp_mutex);
+ if (stpinfo_valid() && (stp_info.vbits & 0x8000))
+ ret = sprintf(buf, "%i\n", (int)(s16) stp_info.leaps);
+ mutex_unlock(&stp_mutex);
+ return ret;
+}
+
+static DEVICE_ATTR_RO(leap_seconds);
+
+static ssize_t leap_seconds_scheduled_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct stp_stzi stzi;
+ ssize_t ret;
+
+ mutex_lock(&stp_mutex);
+ if (!stpinfo_valid() || !(stp_info.vbits & 0x8000) || !stp_info.lu) {
+ mutex_unlock(&stp_mutex);
return -ENODATA;
- return sprintf(buf, "%i\n", (int)(s16) stp_info.leaps);
+ }
+
+ ret = chsc_stzi(stp_page, &stzi, sizeof(stzi));
+ mutex_unlock(&stp_mutex);
+ if (ret < 0)
+ return ret;
+
+ if (!stzi.lsoib.p)
+ return sprintf(buf, "0,0\n");
+
+ return sprintf(buf, "%lu,%d\n",
+ tod_to_ns(stzi.lsoib.nlsout - TOD_UNIX_EPOCH) / NSEC_PER_SEC,
+ stzi.lsoib.nlso - stzi.lsoib.also);
}
-static DEVICE_ATTR(leap_seconds, 0400, stp_leap_seconds_show, NULL);
+static DEVICE_ATTR_RO(leap_seconds_scheduled);
-static ssize_t stp_stratum_show(struct device *dev,
+static ssize_t stratum_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- if (!stp_online)
- return -ENODATA;
- return sprintf(buf, "%i\n", (int)(s16) stp_info.stratum);
+ ssize_t ret = -ENODATA;
+
+ mutex_lock(&stp_mutex);
+ if (stpinfo_valid())
+ ret = sprintf(buf, "%i\n", (int)(s16) stp_info.stratum);
+ mutex_unlock(&stp_mutex);
+ return ret;
}
-static DEVICE_ATTR(stratum, 0400, stp_stratum_show, NULL);
+static DEVICE_ATTR_RO(stratum);
-static ssize_t stp_time_offset_show(struct device *dev,
+static ssize_t time_offset_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- if (!stp_online || !(stp_info.vbits & 0x0800))
- return -ENODATA;
- return sprintf(buf, "%i\n", (int) stp_info.tto);
+ ssize_t ret = -ENODATA;
+
+ mutex_lock(&stp_mutex);
+ if (stpinfo_valid() && (stp_info.vbits & 0x0800))
+ ret = sprintf(buf, "%i\n", (int) stp_info.tto);
+ mutex_unlock(&stp_mutex);
+ return ret;
}
-static DEVICE_ATTR(time_offset, 0400, stp_time_offset_show, NULL);
+static DEVICE_ATTR_RO(time_offset);
-static ssize_t stp_time_zone_offset_show(struct device *dev,
+static ssize_t time_zone_offset_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- if (!stp_online || !(stp_info.vbits & 0x4000))
- return -ENODATA;
- return sprintf(buf, "%i\n", (int)(s16) stp_info.tzo);
+ ssize_t ret = -ENODATA;
+
+ mutex_lock(&stp_mutex);
+ if (stpinfo_valid() && (stp_info.vbits & 0x4000))
+ ret = sprintf(buf, "%i\n", (int)(s16) stp_info.tzo);
+ mutex_unlock(&stp_mutex);
+ return ret;
}
-static DEVICE_ATTR(time_zone_offset, 0400,
- stp_time_zone_offset_show, NULL);
+static DEVICE_ATTR_RO(time_zone_offset);
-static ssize_t stp_timing_mode_show(struct device *dev,
+static ssize_t timing_mode_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- if (!stp_online)
- return -ENODATA;
- return sprintf(buf, "%i\n", stp_info.tmd);
+ ssize_t ret = -ENODATA;
+
+ mutex_lock(&stp_mutex);
+ if (stpinfo_valid())
+ ret = sprintf(buf, "%i\n", stp_info.tmd);
+ mutex_unlock(&stp_mutex);
+ return ret;
}
-static DEVICE_ATTR(timing_mode, 0400, stp_timing_mode_show, NULL);
+static DEVICE_ATTR_RO(timing_mode);
-static ssize_t stp_timing_state_show(struct device *dev,
+static ssize_t timing_state_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- if (!stp_online)
- return -ENODATA;
- return sprintf(buf, "%i\n", stp_info.tst);
+ ssize_t ret = -ENODATA;
+
+ mutex_lock(&stp_mutex);
+ if (stpinfo_valid())
+ ret = sprintf(buf, "%i\n", stp_info.tst);
+ mutex_unlock(&stp_mutex);
+ return ret;
}
-static DEVICE_ATTR(timing_state, 0400, stp_timing_state_show, NULL);
+static DEVICE_ATTR_RO(timing_state);
-static ssize_t stp_online_show(struct device *dev,
+static ssize_t online_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
return sprintf(buf, "%i\n", stp_online);
}
-static ssize_t stp_online_store(struct device *dev,
+static ssize_t online_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
@@ -801,14 +903,14 @@ static ssize_t stp_online_store(struct device *dev,
return -EINVAL;
if (!test_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags))
return -EOPNOTSUPP;
- mutex_lock(&clock_sync_mutex);
+ mutex_lock(&stp_mutex);
stp_online = value;
if (stp_online)
set_bit(CLOCK_SYNC_STP, &clock_sync_flags);
else
clear_bit(CLOCK_SYNC_STP, &clock_sync_flags);
queue_work(time_sync_wq, &stp_work);
- mutex_unlock(&clock_sync_mutex);
+ mutex_unlock(&stp_mutex);
return count;
}
@@ -816,46 +918,27 @@ static ssize_t stp_online_store(struct device *dev,
* Can't use DEVICE_ATTR because the attribute should be named
* stp/online but dev_attr_online already exists in this file ..
*/
-static struct device_attribute dev_attr_stp_online = {
- .attr = { .name = "online", .mode = 0600 },
- .show = stp_online_show,
- .store = stp_online_store,
-};
-
-static struct device_attribute *stp_attributes[] = {
- &dev_attr_ctn_id,
- &dev_attr_ctn_type,
- &dev_attr_dst_offset,
- &dev_attr_leap_seconds,
- &dev_attr_stp_online,
- &dev_attr_stratum,
- &dev_attr_time_offset,
- &dev_attr_time_zone_offset,
- &dev_attr_timing_mode,
- &dev_attr_timing_state,
+static DEVICE_ATTR_RW(online);
+
+static struct attribute *stp_dev_attrs[] = {
+ &dev_attr_ctn_id.attr,
+ &dev_attr_ctn_type.attr,
+ &dev_attr_dst_offset.attr,
+ &dev_attr_leap_seconds.attr,
+ &dev_attr_online.attr,
+ &dev_attr_leap_seconds_scheduled.attr,
+ &dev_attr_stratum.attr,
+ &dev_attr_time_offset.attr,
+ &dev_attr_time_zone_offset.attr,
+ &dev_attr_timing_mode.attr,
+ &dev_attr_timing_state.attr,
NULL
};
+ATTRIBUTE_GROUPS(stp_dev);
static int __init stp_init_sysfs(void)
{
- struct device_attribute **attr;
- int rc;
-
- rc = subsys_system_register(&stp_subsys, NULL);
- if (rc)
- goto out;
- for (attr = stp_attributes; *attr; attr++) {
- rc = device_create_file(stp_subsys.dev_root, *attr);
- if (rc)
- goto out_unreg;
- }
- return 0;
-out_unreg:
- for (; attr >= stp_attributes; attr--)
- device_remove_file(stp_subsys.dev_root, *attr);
- bus_unregister(&stp_subsys);
-out:
- return rc;
+ return subsys_system_register(&stp_subsys, stp_dev_groups);
}
device_initcall(stp_init_sysfs);
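
The sysfs rework above follows the usual conversion from an open-coded device_create_file() loop to a named attribute array plus ATTRIBUTE_GROUPS(), so registration and error handling collapse into the single subsys_system_register() call. A condensed kernel-flavoured sketch of that pattern, with a made-up "demo" subsystem and attribute rather than the real STP ones, and not tied to any particular kernel version:

    #include <linux/device.h>
    #include <linux/init.h>
    #include <linux/kernel.h>
    #include <linux/sysfs.h>

    /* Hypothetical subsystem, used only to illustrate the conversion pattern. */
    static struct bus_type demo_subsys = {
        .name     = "demo",
        .dev_name = "demo",
    };

    static ssize_t value_show(struct device *dev, struct device_attribute *attr,
                              char *buf)
    {
        return sprintf(buf, "%d\n", 42);
    }
    static DEVICE_ATTR_RO(value);           /* generates dev_attr_value */

    static struct attribute *demo_dev_attrs[] = {
        &dev_attr_value.attr,
        NULL
    };
    ATTRIBUTE_GROUPS(demo_dev);             /* generates demo_dev_groups */

    static int __init demo_init(void)
    {
        /* One call registers the subsystem and all attributes of its root device. */
        return subsys_system_register(&demo_subsys, demo_dev_groups);
    }
    device_initcall(demo_init);
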
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index 3627953007ed..89e91b8ce842 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -1,7 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright IBM Corp. 2007, 2011
- * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>
*/
#define KMSG_COMPONENT "cpu"
@@ -26,7 +25,6 @@
#include <linux/nodemask.h>
#include <linux/node.h>
#include <asm/sysinfo.h>
-#include <asm/numa.h>
#define PTF_HORIZONTAL (0UL)
#define PTF_VERTICAL (1UL)
@@ -63,50 +61,56 @@ static struct mask_info drawer_info;
struct cpu_topology_s390 cpu_topology[NR_CPUS];
EXPORT_SYMBOL_GPL(cpu_topology);
-cpumask_t cpus_with_topology;
-
-static cpumask_t cpu_group_map(struct mask_info *info, unsigned int cpu)
+static void cpu_group_map(cpumask_t *dst, struct mask_info *info, unsigned int cpu)
{
- cpumask_t mask;
+ static cpumask_t mask;
- cpumask_copy(&mask, cpumask_of(cpu));
+ cpumask_clear(&mask);
+ if (!cpumask_test_cpu(cpu, &cpu_setup_mask))
+ goto out;
+ cpumask_set_cpu(cpu, &mask);
switch (topology_mode) {
case TOPOLOGY_MODE_HW:
while (info) {
if (cpumask_test_cpu(cpu, &info->mask)) {
- mask = info->mask;
+ cpumask_copy(&mask, &info->mask);
break;
}
info = info->next;
}
- if (cpumask_empty(&mask))
- cpumask_copy(&mask, cpumask_of(cpu));
break;
case TOPOLOGY_MODE_PACKAGE:
cpumask_copy(&mask, cpu_present_mask);
break;
default:
- /* fallthrough */
+ fallthrough;
case TOPOLOGY_MODE_SINGLE:
- cpumask_copy(&mask, cpumask_of(cpu));
break;
}
- return mask;
+ cpumask_and(&mask, &mask, &cpu_setup_mask);
+out:
+ cpumask_copy(dst, &mask);
}
-static cpumask_t cpu_thread_map(unsigned int cpu)
+static void cpu_thread_map(cpumask_t *dst, unsigned int cpu)
{
- cpumask_t mask;
- int i;
+ static cpumask_t mask;
+ unsigned int max_cpu;
- cpumask_copy(&mask, cpumask_of(cpu));
+ cpumask_clear(&mask);
+ if (!cpumask_test_cpu(cpu, &cpu_setup_mask))
+ goto out;
+ cpumask_set_cpu(cpu, &mask);
if (topology_mode != TOPOLOGY_MODE_HW)
- return mask;
+ goto out;
cpu -= cpu % (smp_cpu_mtid + 1);
- for (i = 0; i <= smp_cpu_mtid; i++)
- if (cpu_present(cpu + i))
- cpumask_set_cpu(cpu + i, &mask);
- return mask;
+ max_cpu = min(cpu + smp_cpu_mtid, nr_cpu_ids - 1);
+ for (; cpu <= max_cpu; cpu++) {
+ if (cpumask_test_cpu(cpu, &cpu_setup_mask))
+ cpumask_set_cpu(cpu, &mask);
+ }
+out:
+ cpumask_copy(dst, &mask);
}
#define TOPOLOGY_CORE_BITS 64
@@ -120,26 +124,26 @@ static void add_cpus_to_mask(struct topology_core *tl_core,
unsigned int core;
for_each_set_bit(core, &tl_core->mask, TOPOLOGY_CORE_BITS) {
- unsigned int rcore;
- int lcpu, i;
+ unsigned int max_cpu, rcore;
+ int cpu;
rcore = TOPOLOGY_CORE_BITS - 1 - core + tl_core->origin;
- lcpu = smp_find_processor_id(rcore << smp_cpu_mt_shift);
- if (lcpu < 0)
+ cpu = smp_find_processor_id(rcore << smp_cpu_mt_shift);
+ if (cpu < 0)
continue;
- for (i = 0; i <= smp_cpu_mtid; i++) {
- topo = &cpu_topology[lcpu + i];
+ max_cpu = min(cpu + smp_cpu_mtid, nr_cpu_ids - 1);
+ for (; cpu <= max_cpu; cpu++) {
+ topo = &cpu_topology[cpu];
topo->drawer_id = drawer->id;
topo->book_id = book->id;
topo->socket_id = socket->id;
topo->core_id = rcore;
- topo->thread_id = lcpu + i;
+ topo->thread_id = cpu;
topo->dedicated = tl_core->d;
- cpumask_set_cpu(lcpu + i, &drawer->mask);
- cpumask_set_cpu(lcpu + i, &book->mask);
- cpumask_set_cpu(lcpu + i, &socket->mask);
- cpumask_set_cpu(lcpu + i, &cpus_with_topology);
- smp_cpu_set_polarization(lcpu + i, tl_core->pp);
+ cpumask_set_cpu(cpu, &drawer->mask);
+ cpumask_set_cpu(cpu, &book->mask);
+ cpumask_set_cpu(cpu, &socket->mask);
+ smp_cpu_set_polarization(cpu, tl_core->pp);
}
}
}
@@ -245,17 +249,18 @@ int topology_set_cpu_management(int fc)
return rc;
}
-static void update_cpu_masks(void)
+void update_cpu_masks(void)
{
- struct cpu_topology_s390 *topo;
- int cpu, id;
+ struct cpu_topology_s390 *topo, *topo_package, *topo_sibling;
+ int cpu, sibling, pkg_first, smt_first, id;
for_each_possible_cpu(cpu) {
topo = &cpu_topology[cpu];
- topo->thread_mask = cpu_thread_map(cpu);
- topo->core_mask = cpu_group_map(&socket_info, cpu);
- topo->book_mask = cpu_group_map(&book_info, cpu);
- topo->drawer_mask = cpu_group_map(&drawer_info, cpu);
+ cpu_thread_map(&topo->thread_mask, cpu);
+ cpu_group_map(&topo->core_mask, &socket_info, cpu);
+ cpu_group_map(&topo->book_mask, &book_info, cpu);
+ cpu_group_map(&topo->drawer_mask, &drawer_info, cpu);
+ topo->booted_cores = 0;
if (topology_mode != TOPOLOGY_MODE_HW) {
id = topology_mode == TOPOLOGY_MODE_PACKAGE ? 0 : cpu;
topo->thread_id = cpu;
@@ -263,11 +268,23 @@ static void update_cpu_masks(void)
topo->socket_id = id;
topo->book_id = id;
topo->drawer_id = id;
- if (cpu_present(cpu))
- cpumask_set_cpu(cpu, &cpus_with_topology);
}
}
- numa_update_cpu_topology();
+ for_each_online_cpu(cpu) {
+ topo = &cpu_topology[cpu];
+ pkg_first = cpumask_first(&topo->core_mask);
+ topo_package = &cpu_topology[pkg_first];
+ if (cpu == pkg_first) {
+ for_each_cpu(sibling, &topo->core_mask) {
+ topo_sibling = &cpu_topology[sibling];
+ smt_first = cpumask_first(&topo_sibling->thread_mask);
+ if (sibling == smt_first)
+ topo_package->booted_cores++;
+ }
+ } else {
+ topo->booted_cores = topo_package->booted_cores;
+ }
+ }
}
void store_topology(struct sysinfo_15_1_x *info)
@@ -289,7 +306,6 @@ static int __arch_update_cpu_topology(void)
int rc = 0;
mutex_lock(&smp_cpu_state_mutex);
- cpumask_clear(&cpus_with_topology);
if (MACHINE_HAS_TOPOLOGY) {
rc = 1;
store_topology(info);
@@ -346,9 +362,9 @@ static atomic_t topology_poll = ATOMIC_INIT(0);
static void set_topology_timer(void)
{
if (atomic_add_unless(&topology_poll, -1, 0))
- mod_timer(&topology_timer, jiffies + HZ / 10);
+ mod_timer(&topology_timer, jiffies + msecs_to_jiffies(100));
else
- mod_timer(&topology_timer, jiffies + HZ * 60);
+ mod_timer(&topology_timer, jiffies + msecs_to_jiffies(60 * MSEC_PER_SEC));
}
void topology_expect_change(void)
@@ -391,7 +407,7 @@ static ssize_t dispatching_store(struct device *dev,
if (val != 0 && val != 1)
return -EINVAL;
rc = 0;
- get_online_cpus();
+ cpus_read_lock();
mutex_lock(&smp_cpu_state_mutex);
if (cpu_management == val)
goto out;
@@ -402,7 +418,7 @@ static ssize_t dispatching_store(struct device *dev,
topology_expect_change();
out:
mutex_unlock(&smp_cpu_state_mutex);
- put_online_cpus();
+ cpus_read_unlock();
return rc ? rc : count;
}
static DEVICE_ATTR_RW(dispatching);
@@ -506,7 +522,7 @@ static struct sched_domain_topology_level s390_topology[] = {
{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
{ cpu_book_mask, SD_INIT_NAME(BOOK) },
{ cpu_drawer_mask, SD_INIT_NAME(DRAWER) },
- { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ { cpu_cpu_mask, SD_INIT_NAME(PKG) },
{ NULL, },
};
@@ -554,6 +570,7 @@ void __init topology_init_early(void)
alloc_masks(info, &book_info, 2);
alloc_masks(info, &drawer_info, 3);
out:
+ cpumask_set_cpu(0, &cpu_setup_mask);
__arch_update_cpu_topology();
__arch_update_dedicated_flag(NULL);
}
@@ -584,7 +601,7 @@ static int __init topology_setup(char *str)
early_param("topology", topology_setup);
static int topology_ctl_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int enabled = topology_is_enabled();
int new_mode;
@@ -619,27 +636,25 @@ static struct ctl_table topology_ctl_table[] = {
.mode = 0644,
.proc_handler = topology_ctl_handler,
},
- { },
-};
-
-static struct ctl_table topology_dir_table[] = {
- {
- .procname = "s390",
- .maxlen = 0,
- .mode = 0555,
- .child = topology_ctl_table,
- },
- { },
};
static int __init topology_init(void)
{
+ struct device *dev_root;
+ int rc = 0;
+
timer_setup(&topology_timer, topology_timer_fn, TIMER_DEFERRABLE);
if (MACHINE_HAS_TOPOLOGY)
set_topology_timer();
else
topology_update_polarization_simple();
- register_sysctl_table(topology_dir_table);
- return device_create_file(cpu_subsys.dev_root, &dev_attr_dispatching);
+ register_sysctl("s390", topology_ctl_table);
+
+ dev_root = bus_get_dev_root(&cpu_subsys);
+ if (dev_root) {
+ rc = device_create_file(dev_root, &dev_attr_dispatching);
+ put_device(dev_root);
+ }
+ return rc;
}
device_initcall(topology_init);
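
update_cpu_masks() now also derives a booted_cores count per package: for the first CPU of each package it counts every sibling that is the first thread of its core, and all other CPUs in the package simply copy that value. The same counting idea in a small stand-alone sketch over hypothetical bitmask arrays (NR_DEMO_CPUS, core_mask and thread_mask are invented for the example):

    #include <assert.h>

    #define NR_DEMO_CPUS 8

    /*
     * Hypothetical topology: core_mask[cpu] is the set of online CPUs in the
     * same package, thread_mask[cpu] the SMT siblings of cpu (as bitmasks).
     */
    static unsigned int core_mask[NR_DEMO_CPUS];
    static unsigned int thread_mask[NR_DEMO_CPUS];

    /* A core is counted once, via its lowest-numbered (first) thread. */
    static int booted_cores(unsigned int package_mask)
    {
        int cpu, cores = 0;

        for (cpu = 0; cpu < NR_DEMO_CPUS; cpu++) {
            if (!(package_mask & (1u << cpu)))
                continue;
            if (cpu == __builtin_ctz(thread_mask[cpu]))
                cores++;
        }
        return cores;
    }

    int main(void)
    {
        /* One package with CPUs 0-3: two cores with two threads each. */
        for (int cpu = 0; cpu < 4; cpu++) {
            core_mask[cpu] = 0x0f;
            thread_mask[cpu] = cpu < 2 ? 0x03 : 0x0c;
        }
        assert(booted_cores(core_mask[0]) == 2);
        return 0;
    }
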
diff --git a/arch/s390/kernel/trace.c b/arch/s390/kernel/trace.c
index 490b52e85014..11a669f3cc93 100644
--- a/arch/s390/kernel/trace.c
+++ b/arch/s390/kernel/trace.c
@@ -14,7 +14,7 @@ EXPORT_TRACEPOINT_SYMBOL(s390_diagnose);
static DEFINE_PER_CPU(unsigned int, diagnose_trace_depth);
-void trace_s390_diagnose_norecursion(int diag_nr)
+void notrace trace_s390_diagnose_norecursion(int diag_nr)
{
unsigned long flags;
unsigned int *depth;
diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c
index 164c0282b41a..46dac4540ca8 100644
--- a/arch/s390/kernel/traps.c
+++ b/arch/s390/kernel/traps.c
@@ -13,8 +13,11 @@
* 'Traps.c' handles hardware traps and faults after we have saved some
* state in 'asm.s'.
*/
+#include "asm/irqflags.h"
+#include "asm/ptrace.h"
#include <linux/kprobes.h>
#include <linux/kdebug.h>
+#include <linux/randomize_kstack.h>
#include <linux/extable.h>
#include <linux/ptrace.h>
#include <linux/sched.h>
@@ -23,7 +26,10 @@
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/cpu.h>
+#include <linux/entry-common.h>
+#include <asm/asm-extable.h>
#include <asm/fpu/api.h>
+#include <asm/vtime.h>
#include "entry.h"
static inline void __user *get_trap_ip(struct pt_regs *regs)
@@ -31,16 +37,18 @@ static inline void __user *get_trap_ip(struct pt_regs *regs)
unsigned long address;
if (regs->int_code & 0x200)
- address = *(unsigned long *)(current->thread.trap_tdb + 24);
+ address = current->thread.trap_tdb.data[3];
else
address = regs->psw.addr;
return (void __user *) (address - (regs->int_code >> 16));
}
+#ifdef CONFIG_GENERIC_BUG
int is_valid_bugaddr(unsigned long addr)
{
return 1;
}
+#endif
void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str)
{
@@ -48,18 +56,8 @@ void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str)
force_sig_fault(si_signo, si_code, get_trap_ip(regs));
report_user_fault(regs, si_signo, 0);
} else {
- const struct exception_table_entry *fixup;
- fixup = s390_search_extables(regs->psw.addr);
- if (fixup)
- regs->psw.addr = extable_fixup(fixup);
- else {
- enum bug_trap_type btt;
-
- btt = report_bug(regs->psw.addr, regs);
- if (btt == BUG_TRAP_TYPE_WARN)
- return;
+ if (!fixup_exception(regs))
die(regs, str);
- }
}
}
@@ -83,17 +81,17 @@ void do_per_trap(struct pt_regs *regs)
}
NOKPROBE_SYMBOL(do_per_trap);
-void default_trap_handler(struct pt_regs *regs)
+static void default_trap_handler(struct pt_regs *regs)
{
if (user_mode(regs)) {
report_user_fault(regs, SIGSEGV, 0);
- do_exit(SIGSEGV);
+ force_exit_sig(SIGSEGV);
} else
die(regs, "Unknown program exception");
}
#define DO_ERROR_INFO(name, signr, sicode, str) \
-void name(struct pt_regs *regs) \
+static void name(struct pt_regs *regs) \
{ \
do_trap(regs, signr, sicode, str); \
}
@@ -145,13 +143,13 @@ static inline void do_fp_trap(struct pt_regs *regs, __u32 fpc)
do_trap(regs, SIGFPE, si_code, "floating point exception");
}
-void translation_exception(struct pt_regs *regs)
+static void translation_specification_exception(struct pt_regs *regs)
{
/* May never happen. */
- panic("Translation exception");
+ panic("Translation-Specification Exception");
}
-void illegal_op(struct pt_regs *regs)
+static void illegal_op(struct pt_regs *regs)
{
__u8 opcode[6];
__u16 __user *location;
@@ -193,11 +191,11 @@ NOKPROBE_SYMBOL(illegal_op);
DO_ERROR_INFO(specification_exception, SIGILL, ILL_ILLOPN,
"specification exception");
-void vector_exception(struct pt_regs *regs)
+static void vector_exception(struct pt_regs *regs)
{
int si_code, vic;
- if (!MACHINE_HAS_VX) {
+ if (!cpu_has_vx()) {
do_trap(regs, SIGILL, ILL_ILLOPN, "illegal operation");
return;
}
@@ -227,7 +225,7 @@ void vector_exception(struct pt_regs *regs)
do_trap(regs, SIGFPE, si_code, "vector exception");
}
-void data_exception(struct pt_regs *regs)
+static void data_exception(struct pt_regs *regs)
{
save_fpu_regs();
if (current->thread.fpu.fpc & FPC_DXC_MASK)
@@ -236,7 +234,7 @@ void data_exception(struct pt_regs *regs)
do_trap(regs, SIGILL, ILL_ILLOPN, "data exception");
}
-void space_switch_exception(struct pt_regs *regs)
+static void space_switch_exception(struct pt_regs *regs)
{
/* Set user psw back to home space mode. */
if (user_mode(regs))
@@ -245,6 +243,23 @@ void space_switch_exception(struct pt_regs *regs)
do_trap(regs, SIGILL, ILL_PRVOPC, "space switch event");
}
+static void monitor_event_exception(struct pt_regs *regs)
+{
+ if (user_mode(regs))
+ return;
+
+ switch (report_bug(regs->psw.addr - (regs->int_code >> 16), regs)) {
+ case BUG_TRAP_TYPE_NONE:
+ fixup_exception(regs);
+ break;
+ case BUG_TRAP_TYPE_WARN:
+ break;
+ case BUG_TRAP_TYPE_BUG:
+ die(regs, "monitor event");
+ break;
+ }
+}
+
void kernel_stack_overflow(struct pt_regs *regs)
{
bust_spinlocks(1);
@@ -255,8 +270,147 @@ void kernel_stack_overflow(struct pt_regs *regs)
}
NOKPROBE_SYMBOL(kernel_stack_overflow);
+static void __init test_monitor_call(void)
+{
+ int val = 1;
+
+ if (!IS_ENABLED(CONFIG_BUG))
+ return;
+ asm volatile(
+ " mc 0,0\n"
+ "0: xgr %0,%0\n"
+ "1:\n"
+ EX_TABLE(0b,1b)
+ : "+d" (val));
+ if (!val)
+ panic("Monitor call doesn't work!\n");
+}
+
void __init trap_init(void)
{
- sort_extable(__start_dma_ex_table, __stop_dma_ex_table);
+ unsigned long flags;
+ struct ctlreg cr0;
+
+ local_irq_save(flags);
+ cr0 = local_ctl_clear_bit(0, CR0_LOW_ADDRESS_PROTECTION_BIT);
+ psw_bits(S390_lowcore.external_new_psw).mcheck = 1;
+ psw_bits(S390_lowcore.program_new_psw).mcheck = 1;
+ psw_bits(S390_lowcore.svc_new_psw).mcheck = 1;
+ psw_bits(S390_lowcore.io_new_psw).mcheck = 1;
+ local_ctl_load(0, &cr0);
+ local_irq_restore(flags);
local_mcck_enable();
+ test_monitor_call();
}
+
+static void (*pgm_check_table[128])(struct pt_regs *regs);
+
+void noinstr __do_pgm_check(struct pt_regs *regs)
+{
+ unsigned int trapnr;
+ irqentry_state_t state;
+
+ regs->int_code = S390_lowcore.pgm_int_code;
+ regs->int_parm_long = S390_lowcore.trans_exc_code;
+
+ state = irqentry_enter(regs);
+
+ if (user_mode(regs)) {
+ update_timer_sys();
+ if (!static_branch_likely(&cpu_has_bear)) {
+ if (regs->last_break < 4096)
+ regs->last_break = 1;
+ }
+ current->thread.last_break = regs->last_break;
+ }
+
+ if (S390_lowcore.pgm_code & 0x0200) {
+ /* transaction abort */
+ current->thread.trap_tdb = S390_lowcore.pgm_tdb;
+ }
+
+ if (S390_lowcore.pgm_code & PGM_INT_CODE_PER) {
+ if (user_mode(regs)) {
+ struct per_event *ev = &current->thread.per_event;
+
+ set_thread_flag(TIF_PER_TRAP);
+ ev->address = S390_lowcore.per_address;
+ ev->cause = S390_lowcore.per_code_combined;
+ ev->paid = S390_lowcore.per_access_id;
+ } else {
+ /* PER event in kernel is kprobes */
+ __arch_local_irq_ssm(regs->psw.mask & ~PSW_MASK_PER);
+ do_per_trap(regs);
+ goto out;
+ }
+ }
+
+ if (!irqs_disabled_flags(regs->psw.mask))
+ trace_hardirqs_on();
+ __arch_local_irq_ssm(regs->psw.mask & ~PSW_MASK_PER);
+
+ trapnr = regs->int_code & PGM_INT_CODE_MASK;
+ if (trapnr)
+ pgm_check_table[trapnr](regs);
+out:
+ local_irq_disable();
+ irqentry_exit(regs, state);
+}
+
+/*
+ * The program check table contains exactly 128 (0x00-0x7f) entries. Each
+ * entry defines the function to be called for the corresponding program
+ * check interruption code.
+ */
+static void (*pgm_check_table[128])(struct pt_regs *regs) = {
+ [0x00] = default_trap_handler,
+ [0x01] = illegal_op,
+ [0x02] = privileged_op,
+ [0x03] = execute_exception,
+ [0x04] = do_protection_exception,
+ [0x05] = addressing_exception,
+ [0x06] = specification_exception,
+ [0x07] = data_exception,
+ [0x08] = overflow_exception,
+ [0x09] = divide_exception,
+ [0x0a] = overflow_exception,
+ [0x0b] = divide_exception,
+ [0x0c] = hfp_overflow_exception,
+ [0x0d] = hfp_underflow_exception,
+ [0x0e] = hfp_significance_exception,
+ [0x0f] = hfp_divide_exception,
+ [0x10] = do_dat_exception,
+ [0x11] = do_dat_exception,
+ [0x12] = translation_specification_exception,
+ [0x13] = special_op_exception,
+ [0x14] = default_trap_handler,
+ [0x15] = operand_exception,
+ [0x16] = default_trap_handler,
+ [0x17] = default_trap_handler,
+ [0x18] = transaction_exception,
+ [0x19] = default_trap_handler,
+ [0x1a] = default_trap_handler,
+ [0x1b] = vector_exception,
+ [0x1c] = space_switch_exception,
+ [0x1d] = hfp_sqrt_exception,
+ [0x1e ... 0x37] = default_trap_handler,
+ [0x38] = do_dat_exception,
+ [0x39] = do_dat_exception,
+ [0x3a] = do_dat_exception,
+ [0x3b] = do_dat_exception,
+ [0x3c] = default_trap_handler,
+ [0x3d] = do_secure_storage_access,
+ [0x3e] = do_non_secure_storage_access,
+ [0x3f] = do_secure_storage_violation,
+ [0x40] = monitor_event_exception,
+ [0x41 ... 0x7f] = default_trap_handler,
+};
+
+#define COND_TRAP(x) asm( \
+ ".weak " __stringify(x) "\n\t" \
+ ".set " __stringify(x) "," \
+ __stringify(default_trap_handler))
+
+COND_TRAP(do_secure_storage_access);
+COND_TRAP(do_non_secure_storage_access);
+COND_TRAP(do_secure_storage_violation);
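
The COND_TRAP() macro above relies on weak-symbol aliasing: when no strong definition of a handler is linked in (for example because the corresponding config option is off), the weak alias makes the name resolve to default_trap_handler. A minimal user-space sketch of the same trick (illustrative only, GCC/GNU as assumed, not part of the patch):

/* weak_alias_demo.c - the .weak/.set aliasing used by COND_TRAP() above.
 * With no strong definition of maybe_handler() present anywhere, the weak
 * alias makes the call resolve to fallback_handler(). */
#include <stdio.h>

void fallback_handler(void)
{
	printf("fallback handler called\n");
}

/* Same shape as COND_TRAP(): declare a weak symbol aliasing the fallback. */
asm(".weak maybe_handler\n\t"
    ".set maybe_handler,fallback_handler");

void maybe_handler(void);

int main(void)
{
	maybe_handler();	/* prints "fallback handler called" */
	return 0;
}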
diff --git a/arch/s390/kernel/unwind_bc.c b/arch/s390/kernel/unwind_bc.c
index da2d4d4c5b0e..0ece156fdd7c 100644
--- a/arch/s390/kernel/unwind_bc.c
+++ b/arch/s390/kernel/unwind_bc.c
@@ -36,10 +36,17 @@ static bool update_stack_info(struct unwind_state *state, unsigned long sp)
return true;
}
-static inline bool is_task_pt_regs(struct unwind_state *state,
- struct pt_regs *regs)
+static inline bool is_final_pt_regs(struct unwind_state *state,
+ struct pt_regs *regs)
{
- return task_pt_regs(state->task) == regs;
+ /* user mode or kernel thread pt_regs at the bottom of task stack */
+ if (task_pt_regs(state->task) == regs)
+ return true;
+
+ /* user mode pt_regs at the bottom of irq stack */
+ return state->stack_info.type == STACK_TYPE_IRQ &&
+ state->stack_info.end - sizeof(struct pt_regs) == (unsigned long)regs &&
+ READ_ONCE_NOCHECK(regs->psw.mask) & PSW_MASK_PSTATE;
}
bool unwind_next_frame(struct unwind_state *state)
@@ -57,8 +64,8 @@ bool unwind_next_frame(struct unwind_state *state)
ip = READ_ONCE_NOCHECK(sf->gprs[8]);
reliable = false;
regs = NULL;
- if (!__kernel_text_address(ip)) {
- /* skip bogus %r14 */
+ /* skip bogus %r14 or if it is the same as regs->psw.addr */
+ if (!__kernel_text_address(ip) || state->ip == unwind_recover_ret_addr(state, ip)) {
state->regs = NULL;
return unwind_next_frame(state);
}
@@ -80,7 +87,7 @@ bool unwind_next_frame(struct unwind_state *state)
if (!on_stack(info, sp, sizeof(struct pt_regs)))
goto out_err;
regs = (struct pt_regs *) sp;
- if (is_task_pt_regs(state, regs))
+ if (is_final_pt_regs(state, regs))
goto out_stop;
ip = READ_ONCE_NOCHECK(regs->psw.addr);
sp = READ_ONCE_NOCHECK(regs->gprs[15]);
@@ -96,13 +103,11 @@ bool unwind_next_frame(struct unwind_state *state)
if (sp & 0x7)
goto out_err;
- ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, ip, (void *) sp);
-
/* Update unwind state */
state->sp = sp;
- state->ip = ip;
state->regs = regs;
state->reliable = reliable;
+ state->ip = unwind_recover_ret_addr(state, ip);
return true;
out_err:
@@ -154,12 +159,10 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
ip = READ_ONCE_NOCHECK(sf->gprs[8]);
}
- ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, ip, NULL);
-
/* Update unwind state */
state->sp = sp;
- state->ip = ip;
state->reliable = true;
+ state->ip = unwind_recover_ret_addr(state, ip);
if (!first_frame)
return;
diff --git a/arch/s390/kernel/uprobes.c b/arch/s390/kernel/uprobes.c
index 5007fac01bb5..b88345ef8bd9 100644
--- a/arch/s390/kernel/uprobes.c
+++ b/arch/s390/kernel/uprobes.c
@@ -32,7 +32,7 @@ int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
return -EINVAL;
if (!is_compat_task() && psw_bits(regs->psw).eaba == PSW_BITS_AMODE_31BIT)
return -EINVAL;
- clear_pt_regs_flag(regs, PIF_PER_TRAP);
+ clear_thread_flag(TIF_PER_TRAP);
auprobe->saved_per = psw_bits(regs->psw).per;
auprobe->saved_int_code = regs->int_code;
regs->int_code = UPROBE_TRAP_NR;
@@ -103,7 +103,7 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
/* fix per address */
current->thread.per_event.address = utask->vaddr;
/* trigger per event */
- set_pt_regs_flag(regs, PIF_PER_TRAP);
+ set_thread_flag(TIF_PER_TRAP);
}
return 0;
}
@@ -126,6 +126,7 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val,
case DIE_SSTEP:
if (uprobe_post_sstep_notifier(regs))
return NOTIFY_STOP;
+ break;
default:
break;
}
@@ -176,9 +177,7 @@ static void adjust_psw_addr(psw_t *psw, unsigned long len)
__typeof__(*(ptr)) input; \
int __rc = 0; \
\
- if (!test_facility(34)) \
- __rc = EMU_ILLEGAL_OP; \
- else if ((u64 __force)ptr & mask) \
+ if ((u64 __force)ptr & mask) \
__rc = EMU_SPECIFICATION; \
else if (get_user(input, ptr)) \
__rc = EMU_ADDRESSING; \
@@ -193,9 +192,7 @@ static void adjust_psw_addr(psw_t *psw, unsigned long len)
__typeof__(ptr) __ptr = (ptr); \
int __rc = 0; \
\
- if (!test_facility(34)) \
- __rc = EMU_ILLEGAL_OP; \
- else if ((u64 __force)__ptr & mask) \
+ if ((u64 __force)__ptr & mask) \
__rc = EMU_SPECIFICATION; \
else if (put_user(*(input), __ptr)) \
__rc = EMU_ADDRESSING; \
@@ -212,9 +209,7 @@ static void adjust_psw_addr(psw_t *psw, unsigned long len)
__typeof__(*(ptr)) input; \
int __rc = 0; \
\
- if (!test_facility(34)) \
- __rc = EMU_ILLEGAL_OP; \
- else if ((u64 __force)ptr & mask) \
+ if ((u64 __force)ptr & mask) \
__rc = EMU_SPECIFICATION; \
else if (get_user(input, ptr)) \
__rc = EMU_ADDRESSING; \
@@ -259,7 +254,7 @@ static void sim_stor_event(struct pt_regs *regs, void *addr, int len)
return;
current->thread.per_event.address = regs->psw.addr;
current->thread.per_event.cause = PER_EVENT_STORE >> 16;
- set_pt_regs_flag(regs, PIF_PER_TRAP);
+ set_thread_flag(TIF_PER_TRAP);
}
/*
@@ -326,10 +321,6 @@ static void handle_insn_ril(struct arch_uprobe *auprobe, struct pt_regs *regs)
break;
case 0xc6:
switch (insn->opc1) {
- case 0x02: /* pfdrl */
- if (!test_facility(34))
- rc = EMU_ILLEGAL_OP;
- break;
case 0x04: /* cghrl */
rc = emu_cmp_ril(regs, (s16 __user *)uptr, &rx->s64);
break;
diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c
new file mode 100644
index 000000000000..fc07bc39e698
--- /dev/null
+++ b/arch/s390/kernel/uv.c
@@ -0,0 +1,717 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Common Ultravisor functions and initialization
+ *
+ * Copyright IBM Corp. 2019, 2020
+ */
+#define KMSG_COMPONENT "prot_virt"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/sizes.h>
+#include <linux/bitmap.h>
+#include <linux/memblock.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <asm/facility.h>
+#include <asm/sections.h>
+#include <asm/uv.h>
+
+/* the bootdata_preserved fields come from ones in arch/s390/boot/uv.c */
+#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
+int __bootdata_preserved(prot_virt_guest);
+#endif
+
+/*
+ * uv_info contains both host and guest information, but it is currently
+ * only expected to be used from modules, namely the KVM module or any
+ * PV guest module.
+ *
+ * The kernel itself will write these values once in uv_query_info()
+ * and then make some of them readable via a sysfs interface.
+ */
+struct uv_info __bootdata_preserved(uv_info);
+EXPORT_SYMBOL(uv_info);
+
+#if IS_ENABLED(CONFIG_KVM)
+int __bootdata_preserved(prot_virt_host);
+EXPORT_SYMBOL(prot_virt_host);
+
+static int __init uv_init(phys_addr_t stor_base, unsigned long stor_len)
+{
+ struct uv_cb_init uvcb = {
+ .header.cmd = UVC_CMD_INIT_UV,
+ .header.len = sizeof(uvcb),
+ .stor_origin = stor_base,
+ .stor_len = stor_len,
+ };
+
+ if (uv_call(0, (uint64_t)&uvcb)) {
+ pr_err("Ultravisor init failed with rc: 0x%x rrc: 0x%x\n",
+ uvcb.header.rc, uvcb.header.rrc);
+ return -1;
+ }
+ return 0;
+}
+
+void __init setup_uv(void)
+{
+ void *uv_stor_base;
+
+ if (!is_prot_virt_host())
+ return;
+
+ uv_stor_base = memblock_alloc_try_nid(
+ uv_info.uv_base_stor_len, SZ_1M, SZ_2G,
+ MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE);
+ if (!uv_stor_base) {
+ pr_warn("Failed to reserve %lu bytes for ultravisor base storage\n",
+ uv_info.uv_base_stor_len);
+ goto fail;
+ }
+
+ if (uv_init(__pa(uv_stor_base), uv_info.uv_base_stor_len)) {
+ memblock_free(uv_stor_base, uv_info.uv_base_stor_len);
+ goto fail;
+ }
+
+ pr_info("Reserving %luMB as ultravisor base storage\n",
+ uv_info.uv_base_stor_len >> 20);
+ return;
+fail:
+ pr_info("Disabling support for protected virtualization\n");
+ prot_virt_host = 0;
+}
+
+/*
+ * Requests the Ultravisor to pin the page in the shared state. This will
+ * cause an intercept when the guest attempts to unshare the pinned page.
+ */
+int uv_pin_shared(unsigned long paddr)
+{
+ struct uv_cb_cfs uvcb = {
+ .header.cmd = UVC_CMD_PIN_PAGE_SHARED,
+ .header.len = sizeof(uvcb),
+ .paddr = paddr,
+ };
+
+ if (uv_call(0, (u64)&uvcb))
+ return -EINVAL;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(uv_pin_shared);
+
+/*
+ * Requests the Ultravisor to destroy a guest page and make it
+ * accessible to the host. The destroy clears the page instead of
+ * exporting.
+ *
+ * @paddr: Absolute host address of page to be destroyed
+ */
+static int uv_destroy_page(unsigned long paddr)
+{
+ struct uv_cb_cfs uvcb = {
+ .header.cmd = UVC_CMD_DESTR_SEC_STOR,
+ .header.len = sizeof(uvcb),
+ .paddr = paddr
+ };
+
+ if (uv_call(0, (u64)&uvcb)) {
+ /*
+ * Older firmware uses 107/d as an indication of a non-secure
+ * page. Let us emulate the newer variant (no-op).
+ */
+ if (uvcb.header.rc == 0x107 && uvcb.header.rrc == 0xd)
+ return 0;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/*
+ * The caller must already hold a reference to the page
+ */
+int uv_destroy_owned_page(unsigned long paddr)
+{
+ struct page *page = phys_to_page(paddr);
+ int rc;
+
+ get_page(page);
+ rc = uv_destroy_page(paddr);
+ if (!rc)
+ clear_bit(PG_arch_1, &page->flags);
+ put_page(page);
+ return rc;
+}
+
+/*
+ * Requests the Ultravisor to encrypt a guest page and make it
+ * accessible to the host for paging (export).
+ *
+ * @paddr: Absolute host address of page to be exported
+ */
+int uv_convert_from_secure(unsigned long paddr)
+{
+ struct uv_cb_cfs uvcb = {
+ .header.cmd = UVC_CMD_CONV_FROM_SEC_STOR,
+ .header.len = sizeof(uvcb),
+ .paddr = paddr
+ };
+
+ if (uv_call(0, (u64)&uvcb))
+ return -EINVAL;
+ return 0;
+}
+
+/*
+ * The caller must already hold a reference to the page
+ */
+int uv_convert_owned_from_secure(unsigned long paddr)
+{
+ struct page *page = phys_to_page(paddr);
+ int rc;
+
+ get_page(page);
+ rc = uv_convert_from_secure(paddr);
+ if (!rc)
+ clear_bit(PG_arch_1, &page->flags);
+ put_page(page);
+ return rc;
+}
+
+/*
+ * Calculate the expected ref_count for a page that would otherwise have no
+ * further pins. This was cribbed from similar functions in other places in
+ * the kernel, but with some slight modifications. We know, for example,
+ * that a secure page cannot be a huge page.
+ */
+static int expected_page_refs(struct page *page)
+{
+ int res;
+
+ res = page_mapcount(page);
+ if (PageSwapCache(page)) {
+ res++;
+ } else if (page_mapping(page)) {
+ res++;
+ if (page_has_private(page))
+ res++;
+ }
+ return res;
+}
+
+static int make_page_secure(struct page *page, struct uv_cb_header *uvcb)
+{
+ int expected, cc = 0;
+
+ if (PageWriteback(page))
+ return -EAGAIN;
+ expected = expected_page_refs(page);
+ if (!page_ref_freeze(page, expected))
+ return -EBUSY;
+ set_bit(PG_arch_1, &page->flags);
+ /*
+ * If the UVC does not succeed or fail immediately, we don't want to
+ * loop for long, or we might get stall notifications.
+ * On the other hand, this is a complex scenario and we are holding a lot of
+ * locks, so we can't easily sleep and reschedule. We try only once,
+ * and if the UVC returned busy or partial completion, we return
+ * -EAGAIN and we let the callers deal with it.
+ */
+ cc = __uv_call(0, (u64)uvcb);
+ page_ref_unfreeze(page, expected);
+ /*
+ * Return -ENXIO if the page was not mapped, -EINVAL for other errors.
+ * If busy or partially completed, return -EAGAIN.
+ */
+ if (cc == UVC_CC_OK)
+ return 0;
+ else if (cc == UVC_CC_BUSY || cc == UVC_CC_PARTIAL)
+ return -EAGAIN;
+ return uvcb->rc == 0x10a ? -ENXIO : -EINVAL;
+}
+
+/**
+ * should_export_before_import - Determine whether an export is needed
+ * before an import-like operation
+ * @uvcb: the Ultravisor control block of the UVC to be performed
+ * @mm: the mm of the process
+ *
+ * Returns whether an export is needed before every import-like operation.
+ * This is needed for shared pages, which don't trigger a secure storage
+ * exception when accessed from a different guest.
+ *
+ * Although considered as one, the Unpin Page UVC is not an actual import,
+ * so it is not affected.
+ *
+ * Likewise, no export is needed when there is only one protected VM, because the
+ * page cannot belong to the wrong VM in that case (there is no "other VM"
+ * it can belong to).
+ *
+ * Return: true if an export is needed before every import, otherwise false.
+ */
+static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm)
+{
+ /*
+ * The misc feature indicates, among other things, that importing a
+ * shared page from a different protected VM will automatically also
+ * transfer its ownership.
+ */
+ if (uv_has_feature(BIT_UV_FEAT_MISC))
+ return false;
+ if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED)
+ return false;
+ return atomic_read(&mm->context.protected_count) > 1;
+}
+
+/*
+ * Requests the Ultravisor to make a page accessible to a guest.
+ * If it is brought in for the first time, it will be cleared. If
+ * it has been exported before, it will be decrypted and integrity
+ * checked.
+ */
+int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb)
+{
+ struct vm_area_struct *vma;
+ bool local_drain = false;
+ spinlock_t *ptelock;
+ unsigned long uaddr;
+ struct page *page;
+ pte_t *ptep;
+ int rc;
+
+again:
+ rc = -EFAULT;
+ mmap_read_lock(gmap->mm);
+
+ uaddr = __gmap_translate(gmap, gaddr);
+ if (IS_ERR_VALUE(uaddr))
+ goto out;
+ vma = vma_lookup(gmap->mm, uaddr);
+ if (!vma)
+ goto out;
+ /*
+ * Secure pages cannot be huge and userspace should not combine both.
+ * In case userspace does it anyway, this will result in an -EFAULT for
+ * the unpack. The guest will thus never reach secure mode. If
+ * userspace plays dirty tricks and maps huge pages later on, this
+ * will result in a segmentation fault.
+ */
+ if (is_vm_hugetlb_page(vma))
+ goto out;
+
+ rc = -ENXIO;
+ ptep = get_locked_pte(gmap->mm, uaddr, &ptelock);
+ if (!ptep)
+ goto out;
+ if (pte_present(*ptep) && !(pte_val(*ptep) & _PAGE_INVALID) && pte_write(*ptep)) {
+ page = pte_page(*ptep);
+ rc = -EAGAIN;
+ if (trylock_page(page)) {
+ if (should_export_before_import(uvcb, gmap->mm))
+ uv_convert_from_secure(page_to_phys(page));
+ rc = make_page_secure(page, uvcb);
+ unlock_page(page);
+ }
+ }
+ pte_unmap_unlock(ptep, ptelock);
+out:
+ mmap_read_unlock(gmap->mm);
+
+ if (rc == -EAGAIN) {
+ /*
+ * If we are here because the UVC returned busy or partial
+ * completion, this is just a useless check, but it is safe.
+ */
+ wait_on_page_writeback(page);
+ } else if (rc == -EBUSY) {
+ /*
+ * If we have tried a local drain and the page refcount
+ * still does not match our expected safe value, try with a
+ * system-wide drain. This is needed if the pagevecs holding
+ * the page are on a different CPU.
+ */
+ if (local_drain) {
+ lru_add_drain_all();
+ /* We give up here, and let the caller try again */
+ return -EAGAIN;
+ }
+ /*
+ * We are here if the page refcount does not match the
+ * expected safe value. The main culprits are usually
+ * pagevecs. With lru_add_drain() we drain the pagevecs
+ * on the local CPU so that hopefully the refcount will
+ * reach the expected safe value.
+ */
+ lru_add_drain();
+ local_drain = true;
+ /* And now we try again immediately after draining */
+ goto again;
+ } else if (rc == -ENXIO) {
+ if (gmap_fault(gmap, gaddr, FAULT_FLAG_WRITE))
+ return -EFAULT;
+ return -EAGAIN;
+ }
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_make_secure);
+
+int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr)
+{
+ struct uv_cb_cts uvcb = {
+ .header.cmd = UVC_CMD_CONV_TO_SEC_STOR,
+ .header.len = sizeof(uvcb),
+ .guest_handle = gmap->guest_handle,
+ .gaddr = gaddr,
+ };
+
+ return gmap_make_secure(gmap, gaddr, &uvcb);
+}
+EXPORT_SYMBOL_GPL(gmap_convert_to_secure);
+
+/**
+ * gmap_destroy_page - Destroy a guest page.
+ * @gmap: the gmap of the guest
+ * @gaddr: the guest address to destroy
+ *
+ * An attempt will be made to destroy the given guest page. If the attempt
+ * fails, an attempt is made to export the page. If both attempts fail, an
+ * appropriate error is returned.
+ */
+int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr)
+{
+ struct vm_area_struct *vma;
+ unsigned long uaddr;
+ struct page *page;
+ int rc;
+
+ rc = -EFAULT;
+ mmap_read_lock(gmap->mm);
+
+ uaddr = __gmap_translate(gmap, gaddr);
+ if (IS_ERR_VALUE(uaddr))
+ goto out;
+ vma = vma_lookup(gmap->mm, uaddr);
+ if (!vma)
+ goto out;
+ /*
+ * Huge pages should not be able to become secure
+ */
+ if (is_vm_hugetlb_page(vma))
+ goto out;
+
+ rc = 0;
+ /* we take an extra reference here */
+ page = follow_page(vma, uaddr, FOLL_WRITE | FOLL_GET);
+ if (IS_ERR_OR_NULL(page))
+ goto out;
+ rc = uv_destroy_owned_page(page_to_phys(page));
+ /*
+ * Fault handlers can race; it is possible that two CPUs will fault
+ * on the same secure page. One CPU can destroy the page, reboot,
+ * re-enter secure mode and import it, while the second CPU was
+ * stuck at the beginning of the handler. At some point the second
+ * CPU will be able to progress, and it will not be able to destroy
+ * the page. In that case we do not want to terminate the process,
+ * we instead try to export the page.
+ */
+ if (rc)
+ rc = uv_convert_owned_from_secure(page_to_phys(page));
+ put_page(page);
+out:
+ mmap_read_unlock(gmap->mm);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_destroy_page);
+
+/*
+ * To be called with the page locked or with an extra reference! This will
+ * prevent gmap_make_secure from touching the page concurrently. Having two
+ * parallel make_page_accessible calls is fine, as the UV calls will become
+ * no-ops if the page is already exported.
+ */
+int arch_make_page_accessible(struct page *page)
+{
+ int rc = 0;
+
+ /* Hugepage cannot be protected, so nothing to do */
+ if (PageHuge(page))
+ return 0;
+
+ /*
+ * PG_arch_1 is used in 3 places:
+ * 1. for kernel page tables during early boot
+ * 2. for storage keys of huge pages and KVM
+ * 3. As an indication that this page might be secure. This can
+ * overindicate, e.g. we set the bit before calling
+ * convert_to_secure.
+ * As secure pages are never huge, all 3 variants can co-exist.
+ */
+ if (!test_bit(PG_arch_1, &page->flags))
+ return 0;
+
+ rc = uv_pin_shared(page_to_phys(page));
+ if (!rc) {
+ clear_bit(PG_arch_1, &page->flags);
+ return 0;
+ }
+
+ rc = uv_convert_from_secure(page_to_phys(page));
+ if (!rc) {
+ clear_bit(PG_arch_1, &page->flags);
+ return 0;
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(arch_make_page_accessible);
+
+#endif
+
+#if defined(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) || IS_ENABLED(CONFIG_KVM)
+static ssize_t uv_query_facilities(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lx\n%lx\n%lx\n%lx\n",
+ uv_info.inst_calls_list[0],
+ uv_info.inst_calls_list[1],
+ uv_info.inst_calls_list[2],
+ uv_info.inst_calls_list[3]);
+}
+
+static struct kobj_attribute uv_query_facilities_attr =
+ __ATTR(facilities, 0444, uv_query_facilities, NULL);
+
+static ssize_t uv_query_supp_se_hdr_ver(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lx\n", uv_info.supp_se_hdr_ver);
+}
+
+static struct kobj_attribute uv_query_supp_se_hdr_ver_attr =
+ __ATTR(supp_se_hdr_ver, 0444, uv_query_supp_se_hdr_ver, NULL);
+
+static ssize_t uv_query_supp_se_hdr_pcf(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lx\n", uv_info.supp_se_hdr_pcf);
+}
+
+static struct kobj_attribute uv_query_supp_se_hdr_pcf_attr =
+ __ATTR(supp_se_hdr_pcf, 0444, uv_query_supp_se_hdr_pcf, NULL);
+
+static ssize_t uv_query_dump_cpu_len(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lx\n", uv_info.guest_cpu_stor_len);
+}
+
+static struct kobj_attribute uv_query_dump_cpu_len_attr =
+ __ATTR(uv_query_dump_cpu_len, 0444, uv_query_dump_cpu_len, NULL);
+
+static ssize_t uv_query_dump_storage_state_len(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lx\n", uv_info.conf_dump_storage_state_len);
+}
+
+static struct kobj_attribute uv_query_dump_storage_state_len_attr =
+ __ATTR(dump_storage_state_len, 0444, uv_query_dump_storage_state_len, NULL);
+
+static ssize_t uv_query_dump_finalize_len(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lx\n", uv_info.conf_dump_finalize_len);
+}
+
+static struct kobj_attribute uv_query_dump_finalize_len_attr =
+ __ATTR(dump_finalize_len, 0444, uv_query_dump_finalize_len, NULL);
+
+static ssize_t uv_query_feature_indications(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lx\n", uv_info.uv_feature_indications);
+}
+
+static struct kobj_attribute uv_query_feature_indications_attr =
+ __ATTR(feature_indications, 0444, uv_query_feature_indications, NULL);
+
+static ssize_t uv_query_max_guest_cpus(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", uv_info.max_guest_cpu_id + 1);
+}
+
+static struct kobj_attribute uv_query_max_guest_cpus_attr =
+ __ATTR(max_cpus, 0444, uv_query_max_guest_cpus, NULL);
+
+static ssize_t uv_query_max_guest_vms(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", uv_info.max_num_sec_conf);
+}
+
+static struct kobj_attribute uv_query_max_guest_vms_attr =
+ __ATTR(max_guests, 0444, uv_query_max_guest_vms, NULL);
+
+static ssize_t uv_query_max_guest_addr(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lx\n", uv_info.max_sec_stor_addr);
+}
+
+static struct kobj_attribute uv_query_max_guest_addr_attr =
+ __ATTR(max_address, 0444, uv_query_max_guest_addr, NULL);
+
+static ssize_t uv_query_supp_att_req_hdr_ver(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lx\n", uv_info.supp_att_req_hdr_ver);
+}
+
+static struct kobj_attribute uv_query_supp_att_req_hdr_ver_attr =
+ __ATTR(supp_att_req_hdr_ver, 0444, uv_query_supp_att_req_hdr_ver, NULL);
+
+static ssize_t uv_query_supp_att_pflags(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lx\n", uv_info.supp_att_pflags);
+}
+
+static struct kobj_attribute uv_query_supp_att_pflags_attr =
+ __ATTR(supp_att_pflags, 0444, uv_query_supp_att_pflags, NULL);
+
+static ssize_t uv_query_supp_add_secret_req_ver(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lx\n", uv_info.supp_add_secret_req_ver);
+}
+
+static struct kobj_attribute uv_query_supp_add_secret_req_ver_attr =
+ __ATTR(supp_add_secret_req_ver, 0444, uv_query_supp_add_secret_req_ver, NULL);
+
+static ssize_t uv_query_supp_add_secret_pcf(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lx\n", uv_info.supp_add_secret_pcf);
+}
+
+static struct kobj_attribute uv_query_supp_add_secret_pcf_attr =
+ __ATTR(supp_add_secret_pcf, 0444, uv_query_supp_add_secret_pcf, NULL);
+
+static ssize_t uv_query_supp_secret_types(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lx\n", uv_info.supp_secret_types);
+}
+
+static struct kobj_attribute uv_query_supp_secret_types_attr =
+ __ATTR(supp_secret_types, 0444, uv_query_supp_secret_types, NULL);
+
+static ssize_t uv_query_max_secrets(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", uv_info.max_secrets);
+}
+
+static struct kobj_attribute uv_query_max_secrets_attr =
+ __ATTR(max_secrets, 0444, uv_query_max_secrets, NULL);
+
+static struct attribute *uv_query_attrs[] = {
+ &uv_query_facilities_attr.attr,
+ &uv_query_feature_indications_attr.attr,
+ &uv_query_max_guest_cpus_attr.attr,
+ &uv_query_max_guest_vms_attr.attr,
+ &uv_query_max_guest_addr_attr.attr,
+ &uv_query_supp_se_hdr_ver_attr.attr,
+ &uv_query_supp_se_hdr_pcf_attr.attr,
+ &uv_query_dump_storage_state_len_attr.attr,
+ &uv_query_dump_finalize_len_attr.attr,
+ &uv_query_dump_cpu_len_attr.attr,
+ &uv_query_supp_att_req_hdr_ver_attr.attr,
+ &uv_query_supp_att_pflags_attr.attr,
+ &uv_query_supp_add_secret_req_ver_attr.attr,
+ &uv_query_supp_add_secret_pcf_attr.attr,
+ &uv_query_supp_secret_types_attr.attr,
+ &uv_query_max_secrets_attr.attr,
+ NULL,
+};
+
+static struct attribute_group uv_query_attr_group = {
+ .attrs = uv_query_attrs,
+};
+
+static ssize_t uv_is_prot_virt_guest(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ int val = 0;
+
+#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
+ val = prot_virt_guest;
+#endif
+ return sysfs_emit(buf, "%d\n", val);
+}
+
+static ssize_t uv_is_prot_virt_host(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ int val = 0;
+
+#if IS_ENABLED(CONFIG_KVM)
+ val = prot_virt_host;
+#endif
+
+ return sysfs_emit(buf, "%d\n", val);
+}
+
+static struct kobj_attribute uv_prot_virt_guest =
+ __ATTR(prot_virt_guest, 0444, uv_is_prot_virt_guest, NULL);
+
+static struct kobj_attribute uv_prot_virt_host =
+ __ATTR(prot_virt_host, 0444, uv_is_prot_virt_host, NULL);
+
+static const struct attribute *uv_prot_virt_attrs[] = {
+ &uv_prot_virt_guest.attr,
+ &uv_prot_virt_host.attr,
+ NULL,
+};
+
+static struct kset *uv_query_kset;
+static struct kobject *uv_kobj;
+
+static int __init uv_info_init(void)
+{
+ int rc = -ENOMEM;
+
+ if (!test_facility(158))
+ return 0;
+
+ uv_kobj = kobject_create_and_add("uv", firmware_kobj);
+ if (!uv_kobj)
+ return -ENOMEM;
+
+ rc = sysfs_create_files(uv_kobj, uv_prot_virt_attrs);
+ if (rc)
+ goto out_kobj;
+
+ uv_query_kset = kset_create_and_add("query", NULL, uv_kobj);
+ if (!uv_query_kset) {
+ rc = -ENOMEM;
+ goto out_ind_files;
+ }
+
+ rc = sysfs_create_group(&uv_query_kset->kobj, &uv_query_attr_group);
+ if (!rc)
+ return 0;
+
+ kset_unregister(uv_query_kset);
+out_ind_files:
+ sysfs_remove_files(uv_kobj, uv_prot_virt_attrs);
+out_kobj:
+ kobject_del(uv_kobj);
+ kobject_put(uv_kobj);
+ return rc;
+}
+device_initcall(uv_info_init);
+#endif
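
The sysfs hierarchy created by uv_info_init() above lands under /sys/firmware/uv (the prot_virt_* files) and /sys/firmware/uv/query (the query attributes). A minimal sketch for reading one of them (not part of the patch; the files only exist when facility 158 is available):

/* uv_query_read.c - read a UV query attribute exported by uv_info_init(). */
#include <stdio.h>

int main(void)
{
	char buf[128];
	FILE *f = fopen("/sys/firmware/uv/query/max_cpus", "r");

	if (!f) {
		perror("fopen");	/* absent without Ultravisor support */
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("max guest cpus: %s", buf);
	fclose(f);
	return 0;
}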
diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c
index bcc9bdb39ba2..bbaefd84f15e 100644
--- a/arch/s390/kernel/vdso.c
+++ b/arch/s390/kernel/vdso.c
@@ -6,269 +6,253 @@
* Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com)
*/
-#include <linux/init.h>
+#include <linux/binfmts.h>
+#include <linux/compat.h>
+#include <linux/elf.h>
#include <linux/errno.h>
-#include <linux/sched.h>
+#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/stddef.h>
-#include <linux/unistd.h>
#include <linux/slab.h>
-#include <linux/user.h>
-#include <linux/elf.h>
-#include <linux/security.h>
-#include <linux/memblock.h>
-#include <linux/compat.h>
-#include <asm/asm-offsets.h>
-#include <asm/pgtable.h>
-#include <asm/processor.h>
-#include <asm/mmu.h>
-#include <asm/mmu_context.h>
-#include <asm/sections.h>
+#include <linux/smp.h>
+#include <linux/time_namespace.h>
+#include <linux/random.h>
+#include <vdso/datapage.h>
#include <asm/vdso.h>
-#include <asm/facility.h>
-extern char vdso64_start, vdso64_end;
-static void *vdso64_kbase = &vdso64_start;
-static unsigned int vdso64_pages;
-static struct page **vdso64_pagelist;
+extern char vdso64_start[], vdso64_end[];
+extern char vdso32_start[], vdso32_end[];
-/*
- * Should the kernel map a VDSO page into processes and pass its
- * address down to glibc upon exec()?
- */
-unsigned int __read_mostly vdso_enabled = 1;
-
-static vm_fault_t vdso_fault(const struct vm_special_mapping *sm,
- struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- struct page **vdso_pagelist;
- unsigned long vdso_pages;
-
- vdso_pagelist = vdso64_pagelist;
- vdso_pages = vdso64_pages;
-
- if (vmf->pgoff >= vdso_pages)
- return VM_FAULT_SIGBUS;
+static struct vm_special_mapping vvar_mapping;
- vmf->page = vdso_pagelist[vmf->pgoff];
- get_page(vmf->page);
- return 0;
-}
-
-static int vdso_mremap(const struct vm_special_mapping *sm,
- struct vm_area_struct *vma)
-{
- unsigned long vdso_pages;
-
- vdso_pages = vdso64_pages;
-
- if ((vdso_pages << PAGE_SHIFT) != vma->vm_end - vma->vm_start)
- return -EINVAL;
+static union {
+ struct vdso_data data[CS_BASES];
+ u8 page[PAGE_SIZE];
+} vdso_data_store __page_aligned_data;
- if (WARN_ON_ONCE(current->mm != vma->vm_mm))
- return -EFAULT;
+struct vdso_data *vdso_data = vdso_data_store.data;
- current->mm->context.vdso_base = vma->vm_start;
- return 0;
-}
-
-static const struct vm_special_mapping vdso_mapping = {
- .name = "[vdso]",
- .fault = vdso_fault,
- .mremap = vdso_mremap,
+enum vvar_pages {
+ VVAR_DATA_PAGE_OFFSET,
+ VVAR_TIMENS_PAGE_OFFSET,
+ VVAR_NR_PAGES,
};
-static int __init vdso_setup(char *str)
+#ifdef CONFIG_TIME_NS
+struct vdso_data *arch_get_vdso_data(void *vvar_page)
{
- bool enabled;
-
- if (!kstrtobool(str, &enabled))
- vdso_enabled = enabled;
- return 1;
+ return (struct vdso_data *)(vvar_page);
}
-__setup("vdso=", vdso_setup);
-
-/*
- * The vdso data page
- */
-static union {
- struct vdso_data data;
- u8 page[PAGE_SIZE];
-} vdso_data_store __page_aligned_data;
-struct vdso_data *vdso_data = &vdso_data_store.data;
/*
- * Setup vdso data page.
+ * The VVAR page layout depends on whether a task belongs to the root or
+ * non-root time namespace. Whenever a task changes its namespace, the VVAR
+ * page tables are cleared and then they will be re-faulted with a
+ * corresponding layout.
+ * See also the comment near timens_setup_vdso_data() for details.
*/
-static void __init vdso_init_data(struct vdso_data *vd)
+int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
{
- vd->ectg_available = test_facility(31);
-}
-
-/*
- * Allocate/free per cpu vdso data.
- */
-#define SEGMENT_ORDER 2
+ struct mm_struct *mm = task->mm;
+ VMA_ITERATOR(vmi, mm, 0);
+ struct vm_area_struct *vma;
-/*
- * The initial vdso_data structure for the boot CPU. Eventually
- * it is replaced with a properly allocated structure in vdso_init.
- * This is necessary because a valid S390_lowcore.vdso_per_cpu_data
- * pointer is required to be able to return from an interrupt or
- * program check. See the exit paths in entry.S.
- */
-struct vdso_data boot_vdso_data __initdata;
+ mmap_read_lock(mm);
+ for_each_vma(vmi, vma) {
+ if (!vma_is_special_mapping(vma, &vvar_mapping))
+ continue;
+ zap_vma_pages(vma);
+ break;
+ }
+ mmap_read_unlock(mm);
+ return 0;
+}
+#endif
-void __init vdso_alloc_boot_cpu(struct lowcore *lowcore)
+static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
+ struct vm_area_struct *vma, struct vm_fault *vmf)
{
- lowcore->vdso_per_cpu_data = (unsigned long) &boot_vdso_data;
+ struct page *timens_page = find_timens_vvar_page(vma);
+ unsigned long addr, pfn;
+ vm_fault_t err;
+
+ switch (vmf->pgoff) {
+ case VVAR_DATA_PAGE_OFFSET:
+ pfn = virt_to_pfn(vdso_data);
+ if (timens_page) {
+ /*
+ * Fault in VVAR page too, since it will be accessed
+ * to get clock data anyway.
+ */
+ addr = vmf->address + VVAR_TIMENS_PAGE_OFFSET * PAGE_SIZE;
+ err = vmf_insert_pfn(vma, addr, pfn);
+ if (unlikely(err & VM_FAULT_ERROR))
+ return err;
+ pfn = page_to_pfn(timens_page);
+ }
+ break;
+#ifdef CONFIG_TIME_NS
+ case VVAR_TIMENS_PAGE_OFFSET:
+ /*
+ * If a task belongs to a time namespace then a namespace
+ * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and
+ * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET
+ * offset.
+ * See also the comment near timens_setup_vdso_data().
+ */
+ if (!timens_page)
+ return VM_FAULT_SIGBUS;
+ pfn = virt_to_pfn(vdso_data);
+ break;
+#endif /* CONFIG_TIME_NS */
+ default:
+ return VM_FAULT_SIGBUS;
+ }
+ return vmf_insert_pfn(vma, vmf->address, pfn);
}
-int vdso_alloc_per_cpu(struct lowcore *lowcore)
+static int vdso_mremap(const struct vm_special_mapping *sm,
+ struct vm_area_struct *vma)
{
- unsigned long segment_table, page_table, page_frame;
- struct vdso_per_cpu_data *vd;
-
- segment_table = __get_free_pages(GFP_KERNEL, SEGMENT_ORDER);
- page_table = get_zeroed_page(GFP_KERNEL);
- page_frame = get_zeroed_page(GFP_KERNEL);
- if (!segment_table || !page_table || !page_frame)
- goto out;
- arch_set_page_dat(virt_to_page(segment_table), SEGMENT_ORDER);
- arch_set_page_dat(virt_to_page(page_table), 0);
-
- /* Initialize per-cpu vdso data page */
- vd = (struct vdso_per_cpu_data *) page_frame;
- vd->cpu_nr = lowcore->cpu_nr;
- vd->node_id = cpu_to_node(vd->cpu_nr);
-
- /* Set up page table for the vdso address space */
- memset64((u64 *)segment_table, _SEGMENT_ENTRY_EMPTY, _CRST_ENTRIES);
- memset64((u64 *)page_table, _PAGE_INVALID, PTRS_PER_PTE);
-
- *(unsigned long *) segment_table = _SEGMENT_ENTRY + page_table;
- *(unsigned long *) page_table = _PAGE_PROTECT + page_frame;
-
- lowcore->vdso_asce = segment_table +
- _ASCE_TABLE_LENGTH + _ASCE_USER_BITS + _ASCE_TYPE_SEGMENT;
- lowcore->vdso_per_cpu_data = page_frame;
-
+ current->mm->context.vdso_base = vma->vm_start;
return 0;
-
-out:
- free_page(page_frame);
- free_page(page_table);
- free_pages(segment_table, SEGMENT_ORDER);
- return -ENOMEM;
}
-void vdso_free_per_cpu(struct lowcore *lowcore)
-{
- unsigned long segment_table, page_table, page_frame;
+static struct vm_special_mapping vvar_mapping = {
+ .name = "[vvar]",
+ .fault = vvar_fault,
+};
- segment_table = lowcore->vdso_asce & PAGE_MASK;
- page_table = *(unsigned long *) segment_table;
- page_frame = *(unsigned long *) page_table;
+static struct vm_special_mapping vdso64_mapping = {
+ .name = "[vdso]",
+ .mremap = vdso_mremap,
+};
+
+static struct vm_special_mapping vdso32_mapping = {
+ .name = "[vdso]",
+ .mremap = vdso_mremap,
+};
- free_page(page_frame);
- free_page(page_table);
- free_pages(segment_table, SEGMENT_ORDER);
+int vdso_getcpu_init(void)
+{
+ set_tod_programmable_field(smp_processor_id());
+ return 0;
}
+early_initcall(vdso_getcpu_init); /* Must be called before SMP init */
-/*
- * This is called from binfmt_elf, we create the special vma for the
- * vDSO and insert it into the mm struct tree
- */
-int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+static int map_vdso(unsigned long addr, unsigned long vdso_mapping_len)
{
+ unsigned long vvar_start, vdso_text_start, vdso_text_len;
+ struct vm_special_mapping *vdso_mapping;
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
- unsigned long vdso_pages;
- unsigned long vdso_base;
int rc;
- if (!vdso_enabled)
- return 0;
-
- if (is_compat_task())
- return 0;
-
- vdso_pages = vdso64_pages;
- /*
- * vDSO has a problem and was disabled, just don't "enable" it for
- * the process
- */
- if (vdso_pages == 0)
- return 0;
-
- /*
- * pick a base address for the vDSO in process space. We try to put
- * it at vdso_base which is the "natural" base for it, but we might
- * fail and end up putting it elsewhere.
- */
- if (down_write_killable(&mm->mmap_sem))
+ BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES);
+ if (mmap_write_lock_killable(mm))
return -EINTR;
- vdso_base = get_unmapped_area(NULL, 0, vdso_pages << PAGE_SHIFT, 0, 0);
- if (IS_ERR_VALUE(vdso_base)) {
- rc = vdso_base;
- goto out_up;
- }
- /*
- * our vma flags don't have VM_WRITE so by default, the process
- * isn't allowed to write those pages.
- * gdb can break that with ptrace interface, and thus trigger COW
- * on those pages but it's then your responsibility to never do that
- * on the "data" page of the vDSO or you'll stop getting kernel
- * updates and your nice userland gettimeofday will be totally dead.
- * It's fine to use that for setting breakpoints in the vDSO code
- * pages though.
- */
- vma = _install_special_mapping(mm, vdso_base, vdso_pages << PAGE_SHIFT,
+ if (is_compat_task()) {
+ vdso_text_len = vdso32_end - vdso32_start;
+ vdso_mapping = &vdso32_mapping;
+ } else {
+ vdso_text_len = vdso64_end - vdso64_start;
+ vdso_mapping = &vdso64_mapping;
+ }
+ vvar_start = get_unmapped_area(NULL, addr, vdso_mapping_len, 0, 0);
+ rc = vvar_start;
+ if (IS_ERR_VALUE(vvar_start))
+ goto out;
+ vma = _install_special_mapping(mm, vvar_start, VVAR_NR_PAGES*PAGE_SIZE,
+ VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP|
+ VM_PFNMAP,
+ &vvar_mapping);
+ rc = PTR_ERR(vma);
+ if (IS_ERR(vma))
+ goto out;
+ vdso_text_start = vvar_start + VVAR_NR_PAGES * PAGE_SIZE;
+ /* VM_MAYWRITE for COW so gdb can set breakpoints */
+ vma = _install_special_mapping(mm, vdso_text_start, vdso_text_len,
VM_READ|VM_EXEC|
VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
- &vdso_mapping);
+ vdso_mapping);
if (IS_ERR(vma)) {
+ do_munmap(mm, vvar_start, PAGE_SIZE, NULL);
rc = PTR_ERR(vma);
- goto out_up;
+ } else {
+ current->mm->context.vdso_base = vdso_text_start;
+ rc = 0;
}
+out:
+ mmap_write_unlock(mm);
+ return rc;
+}
- current->mm->context.vdso_base = vdso_base;
- rc = 0;
+static unsigned long vdso_addr(unsigned long start, unsigned long len)
+{
+ unsigned long addr, end, offset;
-out_up:
- up_write(&mm->mmap_sem);
- return rc;
+ /*
+ * Round up the start address. It can start out unaligned as a result
+ * of stack start randomization.
+ */
+ start = PAGE_ALIGN(start);
+
+ /* Round the lowest possible end address up to a PMD boundary. */
+ end = (start + len + PMD_SIZE - 1) & PMD_MASK;
+ if (end >= VDSO_BASE)
+ end = VDSO_BASE;
+ end -= len;
+
+ if (end > start) {
+ offset = get_random_u32_below(((end - start) >> PAGE_SHIFT) + 1);
+ addr = start + (offset << PAGE_SHIFT);
+ } else {
+ addr = start;
+ }
+ return addr;
}
-static int __init vdso_init(void)
+unsigned long vdso_size(void)
{
- int i;
+ unsigned long size = VVAR_NR_PAGES * PAGE_SIZE;
- vdso_init_data(vdso_data);
+ if (is_compat_task())
+ size += vdso32_end - vdso32_start;
+ else
+ size += vdso64_end - vdso64_start;
+ return PAGE_ALIGN(size);
+}
- /* Calculate the size of the 64 bit vDSO */
- vdso64_pages = ((&vdso64_end - &vdso64_start
- + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1;
+int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+{
+ unsigned long addr = VDSO_BASE;
+ unsigned long size = vdso_size();
- /* Make sure pages are in the correct state */
- vdso64_pagelist = kcalloc(vdso64_pages + 1, sizeof(struct page *),
- GFP_KERNEL);
- BUG_ON(vdso64_pagelist == NULL);
- for (i = 0; i < vdso64_pages - 1; i++) {
- struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE);
- get_page(pg);
- vdso64_pagelist[i] = pg;
- }
- vdso64_pagelist[vdso64_pages - 1] = virt_to_page(vdso_data);
- vdso64_pagelist[vdso64_pages] = NULL;
- if (vdso_alloc_per_cpu(&S390_lowcore))
- BUG();
+ if (current->flags & PF_RANDOMIZE)
+ addr = vdso_addr(current->mm->start_stack + PAGE_SIZE, size);
+ return map_vdso(addr, size);
+}
- get_page(virt_to_page(vdso_data));
+static struct page ** __init vdso_setup_pages(void *start, void *end)
+{
+ int pages = (end - start) >> PAGE_SHIFT;
+ struct page **pagelist;
+ int i;
+ pagelist = kcalloc(pages + 1, sizeof(struct page *), GFP_KERNEL);
+ if (!pagelist)
+ panic("%s: Cannot allocate page list for VDSO", __func__);
+ for (i = 0; i < pages; i++)
+ pagelist[i] = virt_to_page(start + i * PAGE_SIZE);
+ return pagelist;
+}
+
+static int __init vdso_init(void)
+{
+ vdso64_mapping.pages = vdso_setup_pages(vdso64_start, vdso64_end);
+ if (IS_ENABLED(CONFIG_COMPAT))
+ vdso32_mapping.pages = vdso_setup_pages(vdso32_start, vdso32_end);
return 0;
}
-early_initcall(vdso_init);
+arch_initcall(vdso_init);
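
The slot picking in vdso_addr() above chooses a page-aligned address between the rounded-up stack-derived start and VDSO_BASE. A standalone sketch of the same arithmetic (not part of the patch; PAGE/PMD sizes and VDSO_BASE are stand-in values and rand() replaces get_random_u32_below()):

/* vdso_addr_sketch.c - mimic the slot-picking arithmetic of vdso_addr(). */
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PMD_SIZE	(1UL << 20)		/* 1 MiB segments, as on s390 */
#define PMD_MASK	(~(PMD_SIZE - 1))
#define VDSO_BASE	(1UL << 42)		/* stand-in for the real limit */

static unsigned long pick_vdso_addr(unsigned long start, unsigned long len)
{
	unsigned long end, offset;

	/* Round up the possibly unaligned, randomized stack start. */
	start = (start + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
	/* Round the lowest possible end address up to a PMD boundary. */
	end = (start + len + PMD_SIZE - 1) & PMD_MASK;
	if (end >= VDSO_BASE)
		end = VDSO_BASE;
	end -= len;
	if (end <= start)
		return start;
	/* Random page offset in [0, (end - start) >> PAGE_SHIFT]. */
	offset = (unsigned long)rand() % (((end - start) >> PAGE_SHIFT) + 1);
	return start + (offset << PAGE_SHIFT);
}

int main(void)
{
	printf("%#lx\n", pick_vdso_addr(0x3ffffffd000UL, 5 * PAGE_SIZE));
	return 0;
}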
diff --git a/arch/s390/kernel/vdso32/.gitignore b/arch/s390/kernel/vdso32/.gitignore
new file mode 100644
index 000000000000..5167384843b9
--- /dev/null
+++ b/arch/s390/kernel/vdso32/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+vdso32.lds
diff --git a/arch/s390/kernel/vdso32/Makefile b/arch/s390/kernel/vdso32/Makefile
new file mode 100644
index 000000000000..caec7db6f966
--- /dev/null
+++ b/arch/s390/kernel/vdso32/Makefile
@@ -0,0 +1,70 @@
+# SPDX-License-Identifier: GPL-2.0
+# List of files in the vdso
+
+KCOV_INSTRUMENT := n
+
+# Include the generic Makefile to check the built vdso.
+include $(srctree)/lib/vdso/Makefile
+obj-vdso32 = vdso_user_wrapper-32.o note-32.o
+
+# Build rules
+
+targets := $(obj-vdso32) vdso32.so vdso32.so.dbg
+obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32))
+
+KBUILD_AFLAGS += -DBUILD_VDSO
+KBUILD_CFLAGS += -DBUILD_VDSO -DDISABLE_BRANCH_PROFILING
+
+KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS))
+KBUILD_AFLAGS_32 += -m31 -s
+
+KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS))
+KBUILD_CFLAGS_32 := $(filter-out -mno-pic-data-is-text-relative,$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 += -m31 -fPIC -shared -fno-common -fno-builtin
+
+LDFLAGS_vdso32.so.dbg += -fPIC -shared -soname=linux-vdso32.so.1 \
+ --hash-style=both --build-id=sha1 -melf_s390 -T
+
+$(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
+$(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32)
+
+obj-y += vdso32_wrapper.o
+targets += vdso32.lds
+CPPFLAGS_vdso32.lds += -P -C -U$(ARCH)
+
+# Disable gcov profiling, ubsan and kasan for VDSO code
+GCOV_PROFILE := n
+UBSAN_SANITIZE := n
+KASAN_SANITIZE := n
+KCSAN_SANITIZE := n
+
+# Force dependency (incbin is bad)
+$(obj)/vdso32_wrapper.o : $(obj)/vdso32.so
+
+quiet_cmd_vdso_and_check = VDSO $@
+ cmd_vdso_and_check = $(cmd_ld); $(cmd_vdso_check)
+
+$(obj)/vdso32.so.dbg: $(src)/vdso32.lds $(obj-vdso32) FORCE
+ $(call if_changed,vdso_and_check)
+
+# strip rule for the .so file
+$(obj)/%.so: OBJCOPYFLAGS := -S
+$(obj)/%.so: $(obj)/%.so.dbg FORCE
+ $(call if_changed,objcopy)
+
+$(obj-vdso32): %-32.o: %.S FORCE
+ $(call if_changed_dep,vdso32as)
+
+# actual build commands
+quiet_cmd_vdso32as = VDSO32A $@
+ cmd_vdso32as = $(CC) $(a_flags) -c -o $@ $<
+quiet_cmd_vdso32cc = VDSO32C $@
+ cmd_vdso32cc = $(CC) $(c_flags) -c -o $@ $<
+
+# Generate VDSO offsets using helper script
+gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh
+quiet_cmd_vdsosym = VDSOSYM $@
+ cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@
+
+include/generated/vdso32-offsets.h: $(obj)/vdso32.so.dbg FORCE
+ $(call if_changed,vdsosym)
diff --git a/arch/s390/kernel/vdso32/gen_vdso_offsets.sh b/arch/s390/kernel/vdso32/gen_vdso_offsets.sh
new file mode 100755
index 000000000000..9c4f951e227d
--- /dev/null
+++ b/arch/s390/kernel/vdso32/gen_vdso_offsets.sh
@@ -0,0 +1,15 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+#
+# Match symbols in the DSO that look like __kernel_compat_*; produce a header
+# file of constant offsets into the shared object.
+#
+# Doing this inside the Makefile will break the $(filter-out) function,
+# causing Kbuild to rebuild the vdso-offsets header file every time.
+#
+# Inspired by arm64 version.
+#
+
+LC_ALL=C
+sed -n 's/\([0-9a-f]*\) . __kernel_compat_\(.*\)/\#define vdso32_offset_\2\t0x\1/p'
diff --git a/arch/s390/kernel/vdso32/note.S b/arch/s390/kernel/vdso32/note.S
new file mode 100644
index 000000000000..db19d0680a0a
--- /dev/null
+++ b/arch/s390/kernel/vdso32/note.S
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
+ * Here we can supply some information useful to userland.
+ */
+
+#include <linux/uts.h>
+#include <linux/version.h>
+#include <linux/elfnote.h>
+
+ELFNOTE_START(Linux, 0, "a")
+ .long LINUX_VERSION_CODE
+ELFNOTE_END
diff --git a/arch/s390/kernel/vdso32/vdso32.lds.S b/arch/s390/kernel/vdso32/vdso32.lds.S
new file mode 100644
index 000000000000..edf5ff1debe1
--- /dev/null
+++ b/arch/s390/kernel/vdso32/vdso32.lds.S
@@ -0,0 +1,142 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This is the infamous ld script for the 32-bit compat vdso
+ * library
+ */
+
+#include <asm/page.h>
+#include <asm/vdso.h>
+
+OUTPUT_FORMAT("elf32-s390", "elf32-s390", "elf32-s390")
+OUTPUT_ARCH(s390:31-bit)
+ENTRY(_start)
+
+SECTIONS
+{
+ PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE);
+#ifdef CONFIG_TIME_NS
+ PROVIDE(_timens_data = _vdso_data + PAGE_SIZE);
+#endif
+ . = VDSO_LBASE + SIZEOF_HEADERS;
+
+ .hash : { *(.hash) } :text
+ .gnu.hash : { *(.gnu.hash) }
+ .dynsym : { *(.dynsym) }
+ .dynstr : { *(.dynstr) }
+ .gnu.version : { *(.gnu.version) }
+ .gnu.version_d : { *(.gnu.version_d) }
+ .gnu.version_r : { *(.gnu.version_r) }
+
+ .note : { *(.note.*) } :text :note
+
+ . = ALIGN(16);
+ .text : {
+ *(.text .stub .text.* .gnu.linkonce.t.*)
+ } :text
+ PROVIDE(__etext = .);
+ PROVIDE(_etext = .);
+ PROVIDE(etext = .);
+
+ /*
+ * Other stuff is appended to the text segment:
+ */
+ .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) }
+ .rodata1 : { *(.rodata1) }
+
+ .dynamic : { *(.dynamic) } :text :dynamic
+
+ .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
+ .eh_frame : { KEEP (*(.eh_frame)) } :text
+ .gcc_except_table : { *(.gcc_except_table .gcc_except_table.*) }
+
+ .rela.dyn ALIGN(8) : { *(.rela.dyn) }
+ .got ALIGN(8) : { *(.got .toc) }
+ .got.plt ALIGN(8) : { *(.got.plt) }
+
+ _end = .;
+ PROVIDE(end = .);
+
+ /*
+ * Stabs debugging sections are here too.
+ */
+ .stab 0 : { *(.stab) }
+ .stabstr 0 : { *(.stabstr) }
+ .stab.excl 0 : { *(.stab.excl) }
+ .stab.exclstr 0 : { *(.stab.exclstr) }
+ .stab.index 0 : { *(.stab.index) }
+ .stab.indexstr 0 : { *(.stab.indexstr) }
+ .comment 0 : { *(.comment) }
+
+ /*
+ * DWARF debug sections.
+ * Symbols in the DWARF debugging sections are relative to the
+ * beginning of the section so we begin them at 0.
+ */
+ /* DWARF 1 */
+ .debug 0 : { *(.debug) }
+ .line 0 : { *(.line) }
+ /* GNU DWARF 1 extensions */
+ .debug_srcinfo 0 : { *(.debug_srcinfo) }
+ .debug_sfnames 0 : { *(.debug_sfnames) }
+ /* DWARF 1.1 and DWARF 2 */
+ .debug_aranges 0 : { *(.debug_aranges) }
+ .debug_pubnames 0 : { *(.debug_pubnames) }
+ /* DWARF 2 */
+ .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) }
+ .debug_abbrev 0 : { *(.debug_abbrev) }
+ .debug_line 0 : { *(.debug_line) }
+ .debug_frame 0 : { *(.debug_frame) }
+ .debug_str 0 : { *(.debug_str) }
+ .debug_loc 0 : { *(.debug_loc) }
+ .debug_macinfo 0 : { *(.debug_macinfo) }
+ /* SGI/MIPS DWARF 2 extensions */
+ .debug_weaknames 0 : { *(.debug_weaknames) }
+ .debug_funcnames 0 : { *(.debug_funcnames) }
+ .debug_typenames 0 : { *(.debug_typenames) }
+ .debug_varnames 0 : { *(.debug_varnames) }
+ /* DWARF 3 */
+ .debug_pubtypes 0 : { *(.debug_pubtypes) }
+ .debug_ranges 0 : { *(.debug_ranges) }
+ .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) }
+
+ /DISCARD/ : {
+ *(.note.GNU-stack)
+ *(.branch_lt)
+ *(.data .data.* .gnu.linkonce.d.* .sdata*)
+ *(.bss .sbss .dynbss .dynsbss)
+ }
+}
+
+/*
+ * Very old versions of ld do not recognize this name token; use the constant.
+ */
+#define PT_GNU_EH_FRAME 0x6474e550
+
+/*
+ * We must supply the ELF program headers explicitly to get just one
+ * PT_LOAD segment, and set the flags explicitly to make segments read-only.
+ */
+PHDRS
+{
+ text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
+ dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
+ note PT_NOTE FLAGS(4); /* PF_R */
+ eh_frame_hdr PT_GNU_EH_FRAME;
+}
+
+/*
+ * This controls what symbols we export from the DSO.
+ */
+VERSION
+{
+ VDSO_VERSION_STRING {
+ global:
+ /*
+ * Has to be there for the kernel to find
+ */
+ __kernel_compat_restart_syscall;
+ __kernel_compat_rt_sigreturn;
+ __kernel_compat_sigreturn;
+ local: *;
+ };
+}
diff --git a/arch/s390/kernel/vdso32/vdso32_wrapper.S b/arch/s390/kernel/vdso32/vdso32_wrapper.S
new file mode 100644
index 000000000000..de2fb930471a
--- /dev/null
+++ b/arch/s390/kernel/vdso32/vdso32_wrapper.S
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <asm/page.h>
+
+ __PAGE_ALIGNED_DATA
+
+ .globl vdso32_start, vdso32_end
+ .balign PAGE_SIZE
+vdso32_start:
+ .incbin "arch/s390/kernel/vdso32/vdso32.so"
+ .balign PAGE_SIZE
+vdso32_end:
+
+ .previous
diff --git a/arch/s390/kernel/vdso32/vdso_user_wrapper.S b/arch/s390/kernel/vdso32/vdso_user_wrapper.S
new file mode 100644
index 000000000000..2e645003fdaf
--- /dev/null
+++ b/arch/s390/kernel/vdso32/vdso_user_wrapper.S
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/linkage.h>
+#include <asm/unistd.h>
+#include <asm/dwarf.h>
+
+.macro vdso_syscall func,syscall
+ .globl __kernel_compat_\func
+ .type __kernel_compat_\func,@function
+ __ALIGN
+__kernel_compat_\func:
+ CFI_STARTPROC
+ svc \syscall
+ /* Make sure we notice when a syscall returns, which shouldn't happen */
+ .word 0
+ CFI_ENDPROC
+ .size __kernel_compat_\func,.-__kernel_compat_\func
+.endm
+
+vdso_syscall restart_syscall,__NR_restart_syscall
+vdso_syscall sigreturn,__NR_sigreturn
+vdso_syscall rt_sigreturn,__NR_rt_sigreturn
diff --git a/arch/s390/kernel/vdso64/.gitignore b/arch/s390/kernel/vdso64/.gitignore
index 3fd18cf9fec2..4ec80685fecc 100644
--- a/arch/s390/kernel/vdso64/.gitignore
+++ b/arch/s390/kernel/vdso64/.gitignore
@@ -1 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
vdso64.lds
diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile
index bec19e7e6e1c..e3c9085f8fa7 100644
--- a/arch/s390/kernel/vdso64/Makefile
+++ b/arch/s390/kernel/vdso64/Makefile
@@ -1,44 +1,56 @@
# SPDX-License-Identifier: GPL-2.0
-# List of files in the vdso, has to be asm only for now
+# List of files in the vdso
KCOV_INSTRUMENT := n
-obj-vdso64 = gettimeofday.o clock_getres.o clock_gettime.o note.o getcpu.o
+# Include the generic Makefile to check the built vdso.
+include $(srctree)/lib/vdso/Makefile
+obj-vdso64 = vdso_user_wrapper.o note.o
+obj-cvdso64 = vdso64_generic.o getcpu.o
+VDSO_CFLAGS_REMOVE := -pg $(CC_FLAGS_FTRACE) $(CC_FLAGS_EXPOLINE) $(CC_FLAGS_CHECK_STACK)
+CFLAGS_REMOVE_getcpu.o = $(VDSO_CFLAGS_REMOVE)
+CFLAGS_REMOVE_vdso64_generic.o = $(VDSO_CFLAGS_REMOVE)
# Build rules
-targets := $(obj-vdso64) vdso64.so vdso64.so.dbg
+targets := $(obj-vdso64) $(obj-cvdso64) vdso64.so vdso64.so.dbg
obj-vdso64 := $(addprefix $(obj)/, $(obj-vdso64))
+obj-cvdso64 := $(addprefix $(obj)/, $(obj-cvdso64))
KBUILD_AFLAGS += -DBUILD_VDSO
-KBUILD_CFLAGS += -DBUILD_VDSO
+KBUILD_CFLAGS += -DBUILD_VDSO -DDISABLE_BRANCH_PROFILING
KBUILD_AFLAGS_64 := $(filter-out -m64,$(KBUILD_AFLAGS))
-KBUILD_AFLAGS_64 += -m64 -s
+KBUILD_AFLAGS_64 += -m64
KBUILD_CFLAGS_64 := $(filter-out -m64,$(KBUILD_CFLAGS))
-KBUILD_CFLAGS_64 += -m64 -fPIC -shared -fno-common -fno-builtin
-KBUILD_CFLAGS_64 += -nostdlib -Wl,-soname=linux-vdso64.so.1 \
- -Wl,--hash-style=both
+KBUILD_CFLAGS_64 := $(filter-out -mno-pic-data-is-text-relative,$(KBUILD_CFLAGS_64))
+KBUILD_CFLAGS_64 += -m64 -fPIC -fno-common -fno-builtin
+ldflags-y := -fPIC -shared -soname=linux-vdso64.so.1 \
+ --hash-style=both --build-id=sha1 -T
$(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_64)
$(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_64)
obj-y += vdso64_wrapper.o
-extra-y += vdso64.lds
+targets += vdso64.lds
CPPFLAGS_vdso64.lds += -P -C -U$(ARCH)
# Disable gcov profiling, ubsan and kasan for VDSO code
GCOV_PROFILE := n
UBSAN_SANITIZE := n
KASAN_SANITIZE := n
+KCSAN_SANITIZE := n
# Force dependency (incbin is bad)
$(obj)/vdso64_wrapper.o : $(obj)/vdso64.so
+quiet_cmd_vdso_and_check = VDSO $@
+ cmd_vdso_and_check = $(cmd_ld); $(cmd_vdso_check)
+
# link rule for the .so file, .lds has to be first
-$(obj)/vdso64.so.dbg: $(src)/vdso64.lds $(obj-vdso64) FORCE
- $(call if_changed,vdso64ld)
+$(obj)/vdso64.so.dbg: $(src)/vdso64.lds $(obj-vdso64) $(obj-cvdso64) FORCE
+ $(call if_changed,vdso_and_check)
# strip rule for the .so file
$(obj)/%.so: OBJCOPYFLAGS := -S
@@ -49,18 +61,19 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE
$(obj-vdso64): %.o: %.S FORCE
$(call if_changed_dep,vdso64as)
+$(obj-cvdso64): %.o: %.c FORCE
+ $(call if_changed_dep,vdso64cc)
+
# actual build commands
-quiet_cmd_vdso64ld = VDSO64L $@
- cmd_vdso64ld = $(CC) $(c_flags) -Wl,-T $(filter %.lds %.o,$^) -o $@
quiet_cmd_vdso64as = VDSO64A $@
cmd_vdso64as = $(CC) $(a_flags) -c -o $@ $<
+quiet_cmd_vdso64cc = VDSO64C $@
+ cmd_vdso64cc = $(CC) $(c_flags) -c -o $@ $<
-# install commands for the unstripped file
-quiet_cmd_vdso_install = INSTALL $@
- cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@
-
-vdso64.so: $(obj)/vdso64.so.dbg
- @mkdir -p $(MODLIB)/vdso
- $(call cmd,vdso_install)
+# Generate VDSO offsets using helper script
+gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh
+quiet_cmd_vdsosym = VDSOSYM $@
+ cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@
-vdso_install: vdso64.so
+include/generated/vdso64-offsets.h: $(obj)/vdso64.so.dbg FORCE
+ $(call if_changed,vdsosym)
diff --git a/arch/s390/kernel/vdso64/clock_getres.S b/arch/s390/kernel/vdso64/clock_getres.S
deleted file mode 100644
index 081435398e0a..000000000000
--- a/arch/s390/kernel/vdso64/clock_getres.S
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Userland implementation of clock_getres() for 64 bits processes in a
- * s390 kernel for use in the vDSO
- *
- * Copyright IBM Corp. 2008
- * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com)
- */
-#include <asm/vdso.h>
-#include <asm/asm-offsets.h>
-#include <asm/unistd.h>
-#include <asm/dwarf.h>
-
- .text
- .align 4
- .globl __kernel_clock_getres
- .type __kernel_clock_getres,@function
-__kernel_clock_getres:
- CFI_STARTPROC
- larl %r1,4f
- cghi %r2,__CLOCK_REALTIME_COARSE
- je 0f
- cghi %r2,__CLOCK_MONOTONIC_COARSE
- je 0f
- larl %r1,3f
- cghi %r2,__CLOCK_REALTIME
- je 0f
- cghi %r2,__CLOCK_MONOTONIC
- je 0f
- cghi %r2,__CLOCK_THREAD_CPUTIME_ID
- je 0f
- cghi %r2,-2 /* Per-thread CPUCLOCK with PID=0, VIRT=1 */
- jne 2f
- larl %r5,_vdso_data
- icm %r0,15,__LC_ECTG_OK(%r5)
- jz 2f
-0: ltgr %r3,%r3
- jz 1f /* res == NULL */
- lg %r0,0(%r1)
- xc 0(8,%r3),0(%r3) /* set tp->tv_sec to zero */
- stg %r0,8(%r3) /* store tp->tv_usec */
-1: lghi %r2,0
- br %r14
-2: lghi %r1,__NR_clock_getres /* fallback to svc */
- svc 0
- br %r14
- CFI_ENDPROC
-3: .quad __CLOCK_REALTIME_RES
-4: .quad __CLOCK_COARSE_RES
- .size __kernel_clock_getres,.-__kernel_clock_getres
diff --git a/arch/s390/kernel/vdso64/clock_gettime.S b/arch/s390/kernel/vdso64/clock_gettime.S
deleted file mode 100644
index 9d2ee79b90f2..000000000000
--- a/arch/s390/kernel/vdso64/clock_gettime.S
+++ /dev/null
@@ -1,163 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Userland implementation of clock_gettime() for 64 bits processes in a
- * s390 kernel for use in the vDSO
- *
- * Copyright IBM Corp. 2008
- * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com)
- */
-#include <asm/vdso.h>
-#include <asm/asm-offsets.h>
-#include <asm/unistd.h>
-#include <asm/dwarf.h>
-#include <asm/ptrace.h>
-
- .text
- .align 4
- .globl __kernel_clock_gettime
- .type __kernel_clock_gettime,@function
-__kernel_clock_gettime:
- CFI_STARTPROC
- aghi %r15,-16
- CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD+16
- CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD
- larl %r5,_vdso_data
- cghi %r2,__CLOCK_REALTIME_COARSE
- je 4f
- cghi %r2,__CLOCK_REALTIME
- je 5f
- cghi %r2,-3 /* Per-thread CPUCLOCK with PID=0, VIRT=1 */
- je 9f
- cghi %r2,__CLOCK_MONOTONIC_COARSE
- je 3f
- cghi %r2,__CLOCK_MONOTONIC
- jne 12f
-
- /* CLOCK_MONOTONIC */
-0: lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */
- tmll %r4,0x0001 /* pending update ? loop */
- jnz 0b
- stcke 0(%r15) /* Store TOD clock */
- lgf %r2,__VDSO_TK_SHIFT(%r5) /* Timekeeper shift */
- lg %r0,__VDSO_WTOM_SEC(%r5)
- lg %r1,1(%r15)
- sg %r1,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */
- msgf %r1,__VDSO_TK_MULT(%r5) /* * tk->mult */
- alg %r1,__VDSO_WTOM_NSEC(%r5)
- srlg %r1,%r1,0(%r2) /* >> tk->shift */
- clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */
- jne 0b
- larl %r5,13f
-1: clg %r1,0(%r5)
- jl 2f
- slg %r1,0(%r5)
- aghi %r0,1
- j 1b
-2: stg %r0,0(%r3) /* store tp->tv_sec */
- stg %r1,8(%r3) /* store tp->tv_nsec */
- lghi %r2,0
- aghi %r15,16
- CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD
- CFI_RESTORE 15
- br %r14
-
- /* CLOCK_MONOTONIC_COARSE */
- CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD+16
- CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD
-3: lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */
- tmll %r4,0x0001 /* pending update ? loop */
- jnz 3b
- lg %r0,__VDSO_WTOM_CRS_SEC(%r5)
- lg %r1,__VDSO_WTOM_CRS_NSEC(%r5)
- clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */
- jne 3b
- j 2b
-
- /* CLOCK_REALTIME_COARSE */
-4: lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */
- tmll %r4,0x0001 /* pending update ? loop */
- jnz 4b
- lg %r0,__VDSO_XTIME_CRS_SEC(%r5)
- lg %r1,__VDSO_XTIME_CRS_NSEC(%r5)
- clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */
- jne 4b
- j 7f
-
- /* CLOCK_REALTIME */
-5: lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */
- tmll %r4,0x0001 /* pending update ? loop */
- jnz 5b
- stcke 0(%r15) /* Store TOD clock */
- lg %r1,1(%r15)
- lg %r0,__VDSO_TS_END(%r5) /* TOD steering end time */
- slgr %r0,%r1 /* now - ts_steering_end */
- ltgr %r0,%r0 /* past end of steering ? */
- jm 17f
- srlg %r0,%r0,15 /* 1 per 2^16 */
- tm __VDSO_TS_DIR+3(%r5),0x01 /* steering direction? */
- jz 18f
- lcgr %r0,%r0 /* negative TOD offset */
-18: algr %r1,%r0 /* add steering offset */
-17: lgf %r2,__VDSO_TK_SHIFT(%r5) /* Timekeeper shift */
- sg %r1,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */
- msgf %r1,__VDSO_TK_MULT(%r5) /* * tk->mult */
- alg %r1,__VDSO_XTIME_NSEC(%r5) /* + tk->xtime_nsec */
- srlg %r1,%r1,0(%r2) /* >> tk->shift */
- lg %r0,__VDSO_XTIME_SEC(%r5) /* tk->xtime_sec */
- clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */
- jne 5b
- larl %r5,13f
-6: clg %r1,0(%r5)
- jl 7f
- slg %r1,0(%r5)
- aghi %r0,1
- j 6b
-7: stg %r0,0(%r3) /* store tp->tv_sec */
- stg %r1,8(%r3) /* store tp->tv_nsec */
- lghi %r2,0
- aghi %r15,16
- CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD
- CFI_RESTORE 15
- br %r14
-
- /* CPUCLOCK_VIRT for this thread */
- CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD+16
- CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD
-9: lghi %r4,0
- icm %r0,15,__VDSO_ECTG_OK(%r5)
- jz 12f
- sacf 256 /* Magic ectg instruction */
- .insn ssf,0xc80100000000,__VDSO_ECTG_BASE(4),__VDSO_ECTG_USER(4),4
- sacf 0
- algr %r1,%r0 /* r1 = cputime as TOD value */
- mghi %r1,1000 /* convert to nanoseconds */
- srlg %r1,%r1,12 /* r1 = cputime in nanosec */
- lgr %r4,%r1
- larl %r5,13f
- srlg %r1,%r1,9 /* divide by 1000000000 */
- mlg %r0,8(%r5)
- srlg %r0,%r0,11 /* r0 = tv_sec */
- stg %r0,0(%r3)
- msg %r0,0(%r5) /* calculate tv_nsec */
- slgr %r4,%r0 /* r4 = tv_nsec */
- stg %r4,8(%r3)
- lghi %r2,0
- aghi %r15,16
- CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD
- CFI_RESTORE 15
- br %r14
-
- /* Fallback to system call */
- CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD+16
- CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD
-12: lghi %r1,__NR_clock_gettime
- svc 0
- aghi %r15,16
- CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD
- CFI_RESTORE 15
- br %r14
- CFI_ENDPROC
-
-13: .quad 1000000000
-14: .quad 19342813113834067
- .size __kernel_clock_gettime,.-__kernel_clock_gettime
diff --git a/arch/s390/kernel/vdso64/gen_vdso_offsets.sh b/arch/s390/kernel/vdso64/gen_vdso_offsets.sh
new file mode 100755
index 000000000000..37f05cb38dad
--- /dev/null
+++ b/arch/s390/kernel/vdso64/gen_vdso_offsets.sh
@@ -0,0 +1,15 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+#
+# Match symbols in the DSO that look like __kernel_*; produce a header file
+# of constant offsets into the shared object.
+#
+# Doing this inside the Makefile will break the $(filter-out) function,
+# causing Kbuild to rebuild the vdso-offsets header file every time.
+#
+# Inspired by arm64 version.
+#
+
+LC_ALL=C
+sed -n 's/\([0-9a-f]*\) . __kernel_\(.*\)/\#define vdso64_offset_\2\t0x\1/p'
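For illustration only: a hypothetical nm(1) output line and the line the sed expression above turns it into in include/generated/vdso64-offsets.h (the symbol name and offset are invented):

	/* input:  0000000000000a10 T __kernel_clock_gettime */
	#define vdso64_offset_clock_gettime	0x0000000000000a10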
diff --git a/arch/s390/kernel/vdso64/getcpu.S b/arch/s390/kernel/vdso64/getcpu.S
deleted file mode 100644
index 3c04f7328500..000000000000
--- a/arch/s390/kernel/vdso64/getcpu.S
+++ /dev/null
@@ -1,31 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Userland implementation of getcpu() for 64 bits processes in a
- * s390 kernel for use in the vDSO
- *
- * Copyright IBM Corp. 2016
- * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
- */
-#include <asm/vdso.h>
-#include <asm/asm-offsets.h>
-#include <asm/dwarf.h>
-
- .text
- .align 4
- .globl __kernel_getcpu
- .type __kernel_getcpu,@function
-__kernel_getcpu:
- CFI_STARTPROC
- sacf 256
- lm %r4,%r5,__VDSO_GETCPU_VAL(%r0)
- sacf 0
- ltgr %r2,%r2
- jz 2f
- st %r5,0(%r2)
-2: ltgr %r3,%r3
- jz 3f
- st %r4,0(%r3)
-3: lghi %r2,0
- br %r14
- CFI_ENDPROC
- .size __kernel_getcpu,.-__kernel_getcpu
diff --git a/arch/s390/kernel/vdso64/getcpu.c b/arch/s390/kernel/vdso64/getcpu.c
new file mode 100644
index 000000000000..5c5d4a848b76
--- /dev/null
+++ b/arch/s390/kernel/vdso64/getcpu.c
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright IBM Corp. 2020 */
+
+#include <linux/compiler.h>
+#include <linux/getcpu.h>
+#include <asm/timex.h>
+#include "vdso.h"
+
+int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
+{
+ union tod_clock clk;
+
+ /* CPU number is stored in the programmable field of the TOD clock */
+ store_tod_clock_ext(&clk);
+ if (cpu)
+ *cpu = clk.pf;
+ /* NUMA node is always zero */
+ if (node)
+ *node = 0;
+ return 0;
+}
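The getcpu implementation above depends on the TOD-clock programmable field having been programmed with the CPU number at CPU bring-up. A minimal sketch of the same extraction, assuming a z/Architecture machine and using illustrative names rather than the kernel's store_tod_clock_ext()/union tod_clock definitions:

	#include <stdint.h>

	struct stcke_buf {
		unsigned char b[16];	/* STORE CLOCK EXTENDED output */
	};

	static inline unsigned int tod_programmable_field_cpu(void)
	{
		struct stcke_buf clk;

		/* bytes 14-15 of the STCKE result hold the programmable field */
		asm volatile("stcke %0" : "=Q" (clk) : : "cc");
		return (clk.b[14] << 8) | clk.b[15];
	}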
diff --git a/arch/s390/kernel/vdso64/gettimeofday.S b/arch/s390/kernel/vdso64/gettimeofday.S
deleted file mode 100644
index aebe10dc7c99..000000000000
--- a/arch/s390/kernel/vdso64/gettimeofday.S
+++ /dev/null
@@ -1,71 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Userland implementation of gettimeofday() for 64 bits processes in a
- * s390 kernel for use in the vDSO
- *
- * Copyright IBM Corp. 2008
- * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com)
- */
-#include <asm/vdso.h>
-#include <asm/asm-offsets.h>
-#include <asm/unistd.h>
-#include <asm/dwarf.h>
-#include <asm/ptrace.h>
-
- .text
- .align 4
- .globl __kernel_gettimeofday
- .type __kernel_gettimeofday,@function
-__kernel_gettimeofday:
- CFI_STARTPROC
- aghi %r15,-16
- CFI_ADJUST_CFA_OFFSET 16
- CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD
- larl %r5,_vdso_data
-0: ltgr %r3,%r3 /* check if tz is NULL */
- je 1f
- mvc 0(8,%r3),__VDSO_TIMEZONE(%r5)
-1: ltgr %r2,%r2 /* check if tv is NULL */
- je 4f
- lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */
- tmll %r4,0x0001 /* pending update ? loop */
- jnz 0b
- stcke 0(%r15) /* Store TOD clock */
- lg %r1,1(%r15)
- lg %r0,__VDSO_TS_END(%r5) /* TOD steering end time */
- slgr %r0,%r1 /* now - ts_steering_end */
- ltgr %r0,%r0 /* past end of steering ? */
- jm 6f
- srlg %r0,%r0,15 /* 1 per 2^16 */
- tm __VDSO_TS_DIR+3(%r5),0x01 /* steering direction? */
- jz 7f
- lcgr %r0,%r0 /* negative TOD offset */
-7: algr %r1,%r0 /* add steering offset */
-6: sg %r1,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */
- msgf %r1,__VDSO_TK_MULT(%r5) /* * tk->mult */
- alg %r1,__VDSO_XTIME_NSEC(%r5) /* + tk->xtime_nsec */
- lg %r0,__VDSO_XTIME_SEC(%r5) /* tk->xtime_sec */
- clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */
- jne 0b
- lgf %r5,__VDSO_TK_SHIFT(%r5) /* Timekeeper shift */
- srlg %r1,%r1,0(%r5) /* >> tk->shift */
- larl %r5,5f
-2: clg %r1,0(%r5)
- jl 3f
- slg %r1,0(%r5)
- aghi %r0,1
- j 2b
-3: stg %r0,0(%r2) /* store tv->tv_sec */
- slgr %r0,%r0 /* tv_nsec -> tv_usec */
- ml %r0,8(%r5)
- srlg %r0,%r0,6
- stg %r0,8(%r2) /* store tv->tv_usec */
-4: lghi %r2,0
- aghi %r15,16
- CFI_ADJUST_CFA_OFFSET -16
- CFI_RESTORE 15
- br %r14
- CFI_ENDPROC
-5: .quad 1000000000
- .long 274877907
- .size __kernel_gettimeofday,.-__kernel_gettimeofday
diff --git a/arch/s390/kernel/vdso64/vdso.h b/arch/s390/kernel/vdso64/vdso.h
new file mode 100644
index 000000000000..34c7a2312f9d
--- /dev/null
+++ b/arch/s390/kernel/vdso64/vdso.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ARCH_S390_KERNEL_VDSO64_VDSO_H
+#define __ARCH_S390_KERNEL_VDSO64_VDSO_H
+
+#include <vdso/datapage.h>
+
+struct getcpu_cache;
+
+int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused);
+int __s390_vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz);
+int __s390_vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts);
+int __s390_vdso_clock_getres(clockid_t clock, struct __kernel_timespec *ts);
+
+#endif /* __ARCH_S390_KERNEL_VDSO64_VDSO_H */
diff --git a/arch/s390/kernel/vdso64/vdso64.lds.S b/arch/s390/kernel/vdso64/vdso64.lds.S
index 7ddb116b5e2e..4461ea151e49 100644
--- a/arch/s390/kernel/vdso64/vdso64.lds.S
+++ b/arch/s390/kernel/vdso64/vdso64.lds.S
@@ -13,7 +13,11 @@ ENTRY(_start)
SECTIONS
{
- . = VDSO64_LBASE + SIZEOF_HEADERS;
+ PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE);
+#ifdef CONFIG_TIME_NS
+ PROVIDE(_timens_data = _vdso_data + PAGE_SIZE);
+#endif
+ . = VDSO_LBASE + SIZEOF_HEADERS;
.hash : { *(.hash) } :text
.gnu.hash : { *(.gnu.hash) }
@@ -47,6 +51,7 @@ SECTIONS
.rela.dyn ALIGN(8) : { *(.rela.dyn) }
.got ALIGN(8) : { *(.got .toc) }
+ .got.plt ALIGN(8) : { *(.got.plt) }
_end = .;
PROVIDE(end = .);
@@ -94,9 +99,6 @@ SECTIONS
.debug_ranges 0 : { *(.debug_ranges) }
.gnu.attributes 0 : { KEEP (*(.gnu.attributes)) }
- . = ALIGN(PAGE_SIZE);
- PROVIDE(_vdso_data = .);
-
/DISCARD/ : {
*(.note.GNU-stack)
*(.branch_lt)
@@ -136,7 +138,9 @@ VERSION
__kernel_clock_gettime;
__kernel_clock_getres;
__kernel_getcpu;
-
+ __kernel_restart_syscall;
+ __kernel_rt_sigreturn;
+ __kernel_sigreturn;
local: *;
};
}
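The relocated data-page symbols above assume the vvar pages are mapped immediately below the vDSO code. Roughly, taking __VVAR_PAGES as 2 for illustration:

	/*
	 * vdso_base - 2 * PAGE_SIZE : vdso data page   (_vdso_data)
	 * vdso_base - 1 * PAGE_SIZE : timens data page (_timens_data, CONFIG_TIME_NS)
	 * vdso_base                 : vdso code, linked at VDSO_LBASE + SIZEOF_HEADERS
	 */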
diff --git a/arch/s390/kernel/vdso64/vdso64_generic.c b/arch/s390/kernel/vdso64/vdso64_generic.c
new file mode 100644
index 000000000000..a9aa75643c08
--- /dev/null
+++ b/arch/s390/kernel/vdso64/vdso64_generic.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "../../../../lib/vdso/gettimeofday.c"
+#include "vdso.h"
+
+int __s390_vdso_gettimeofday(struct __kernel_old_timeval *tv,
+ struct timezone *tz)
+{
+ return __cvdso_gettimeofday(tv, tz);
+}
+
+int __s390_vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts)
+{
+ return __cvdso_clock_gettime(clock, ts);
+}
+
+int __s390_vdso_clock_getres(clockid_t clock, struct __kernel_timespec *ts)
+{
+ return __cvdso_clock_getres(clock, ts);
+}
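The three functions above only forward to the generic vDSO implementation pulled in from lib/vdso/gettimeofday.c. Userspace is unaffected; an ordinary call such as the sketch below is routed to __kernel_clock_gettime by glibc whenever the vDSO symbol is available (illustrative example, not part of the patch):

	#include <stdio.h>
	#include <time.h>

	int main(void)
	{
		struct timespec ts;

		/* normally served from the vDSO, without an svc */
		clock_gettime(CLOCK_MONOTONIC, &ts);
		printf("%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
		return 0;
	}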
diff --git a/arch/s390/kernel/vdso64/vdso_user_wrapper.S b/arch/s390/kernel/vdso64/vdso_user_wrapper.S
new file mode 100644
index 000000000000..57f62596e53b
--- /dev/null
+++ b/arch/s390/kernel/vdso64/vdso_user_wrapper.S
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/vdso.h>
+#include <asm/unistd.h>
+#include <asm/asm-offsets.h>
+#include <asm/dwarf.h>
+#include <asm/ptrace.h>
+
+#define WRAPPER_FRAME_SIZE (STACK_FRAME_OVERHEAD+8)
+
+/*
+ * Older glibc versions called the vdso without allocating a stack frame. This
+ * wrapper is just used to allocate a stack frame. See
+ * https://sourceware.org/git/?p=glibc.git;a=commit;h=478593e6374f3818da39332260dc453cb19cfa1e
+ * for details.
+ */
+.macro vdso_func func
+ .globl __kernel_\func
+ .type __kernel_\func,@function
+ __ALIGN
+__kernel_\func:
+ CFI_STARTPROC
+ aghi %r15,-WRAPPER_FRAME_SIZE
+ CFI_DEF_CFA_OFFSET (STACK_FRAME_OVERHEAD + WRAPPER_FRAME_SIZE)
+ CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD
+ stg %r14,STACK_FRAME_OVERHEAD(%r15)
+ brasl %r14,__s390_vdso_\func
+ lg %r14,STACK_FRAME_OVERHEAD(%r15)
+ aghi %r15,WRAPPER_FRAME_SIZE
+ CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD
+ CFI_RESTORE 15
+ br %r14
+ CFI_ENDPROC
+ .size __kernel_\func,.-__kernel_\func
+.endm
+
+vdso_func gettimeofday
+vdso_func clock_getres
+vdso_func clock_gettime
+vdso_func getcpu
+
+.macro vdso_syscall func,syscall
+ .globl __kernel_\func
+ .type __kernel_\func,@function
+ __ALIGN
+__kernel_\func:
+ CFI_STARTPROC
+ svc \syscall
+ /* Make sure we notice when a syscall returns, which shouldn't happen */
+ .word 0
+ CFI_ENDPROC
+ .size __kernel_\func,.-__kernel_\func
+.endm
+
+vdso_syscall restart_syscall,__NR_restart_syscall
+vdso_syscall sigreturn,__NR_sigreturn
+vdso_syscall rt_sigreturn,__NR_rt_sigreturn
diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index 37695499717d..e32ef446f451 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -5,16 +5,22 @@
#include <asm/thread_info.h>
#include <asm/page.h>
+#include <asm/ftrace.lds.h>
/*
* Put .bss..swapper_pg_dir as the first thing in .bss. This will
* make sure it has 16k alignment.
*/
-#define BSS_FIRST_SECTIONS *(.bss..swapper_pg_dir)
+#define BSS_FIRST_SECTIONS *(.bss..swapper_pg_dir) \
+ *(.bss..invalid_pg_dir)
+
+#define RO_EXCEPTION_TABLE_ALIGN 16
/* Handle ro_after_init data on our own. */
#define RO_AFTER_INIT_DATA
+#define RUNTIME_DISCARD_EXIT
+
#define EMITS_PT_NOTE
#include <asm-generic/vmlinux.lds.h>
@@ -40,11 +46,11 @@ SECTIONS
HEAD_TEXT
TEXT_TEXT
SCHED_TEXT
- CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
IRQENTRY_TEXT
SOFTIRQENTRY_TEXT
+ FTRACE_HOTPATCH_TRAMPOLINES_TEXT
*(.text.*_indirect_*)
*(.fixup)
*(.gnu.warning)
@@ -63,13 +69,20 @@ SECTIONS
*(.data..ro_after_init)
JUMP_TABLE_DATA
} :data
- EXCEPTION_TABLE(16)
. = ALIGN(PAGE_SIZE);
__end_ro_after_init = .;
RW_DATA(0x100, PAGE_SIZE, THREAD_SIZE)
BOOT_DATA_PRESERVED
+ . = ALIGN(8);
+ .amode31.refs : {
+ _start_amode31_refs = .;
+ *(.amode31.refs)
+ _end_amode31_refs = .;
+ }
+
+ . = ALIGN(PAGE_SIZE);
_edata = .; /* End of data section */
/* will be freed after init */
@@ -122,6 +135,7 @@ SECTIONS
/*
* Table with the patch locations to undo expolines
*/
+ . = ALIGN(4);
.nospec_call_table : {
__nospec_call_start = . ;
*(.s390_indirect*)
@@ -135,6 +149,32 @@ SECTIONS
BOOT_DATA
+ /*
+ * .amode31 section for code, data, ex_table that need to stay
+ * below 2 GB, even when the kernel is relocated above 2 GB.
+ */
+ . = ALIGN(PAGE_SIZE);
+ _samode31 = .;
+ .amode31.text : {
+ _stext_amode31 = .;
+ *(.amode31.text)
+ *(.amode31.text.*_indirect_*)
+ . = ALIGN(PAGE_SIZE);
+ _etext_amode31 = .;
+ }
+ . = ALIGN(16);
+ .amode31.ex_table : {
+ _start_amode31_ex_table = .;
+ KEEP(*(.amode31.ex_table))
+ _stop_amode31_ex_table = .;
+ }
+ . = ALIGN(PAGE_SIZE);
+ .amode31.data : {
+ *(.amode31.data)
+ }
+ . = ALIGN(PAGE_SIZE);
+ _eamode31 = .;
+
/* early.c uses stsi, which requires page aligned data. */
. = ALIGN(PAGE_SIZE);
INIT_DATA_SECTION(0x100)
@@ -157,6 +197,7 @@ SECTIONS
BSS_SECTION(PAGE_SIZE, 4 * PAGE_SIZE, PAGE_SIZE)
+ . = ALIGN(PAGE_SIZE);
_end = . ;
/*
@@ -176,15 +217,28 @@ SECTIONS
QUAD(__dynsym_start) /* dynsym_start */
QUAD(__rela_dyn_start) /* rela_dyn_start */
QUAD(__rela_dyn_end) /* rela_dyn_end */
+ QUAD(_eamode31 - _samode31) /* amode31_size */
+ QUAD(init_mm)
+ QUAD(swapper_pg_dir)
+ QUAD(invalid_pg_dir)
+#ifdef CONFIG_KASAN
+ QUAD(kasan_early_shadow_page)
+ QUAD(kasan_early_shadow_pte)
+ QUAD(kasan_early_shadow_pmd)
+ QUAD(kasan_early_shadow_pud)
+ QUAD(kasan_early_shadow_p4d)
+#endif
} :NONE
/* Debugging sections. */
STABS_DEBUG
DWARF_DEBUG
+ ELF_DETAILS
/* Sections to be discarded */
DISCARDS
/DISCARD/ : {
*(.eh_frame)
+ *(.interp)
}
}
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index 8df10d3c8f6c..e0a88dcaf5cb 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -7,13 +7,13 @@
*/
#include <linux/kernel_stat.h>
-#include <linux/sched/cputime.h>
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/timex.h>
#include <linux/types.h>
#include <linux/time.h>
-
+#include <asm/alternative.h>
+#include <asm/cputime.h>
#include <asm/vtimer.h>
#include <asm/vtime.h>
#include <asm/cpu_mf.h>
@@ -130,13 +130,10 @@ static int do_account_vtime(struct task_struct *tsk)
clock = S390_lowcore.last_update_clock;
asm volatile(
" stpt %0\n" /* Store current cpu timer value */
-#ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES
" stckf %1" /* Store current tod clock value */
-#else
- " stck %1" /* Store current tod clock value */
-#endif
: "=Q" (S390_lowcore.last_update_timer),
- "=Q" (S390_lowcore.last_update_clock));
+ "=Q" (S390_lowcore.last_update_clock)
+ : : "cc");
clock = S390_lowcore.last_update_clock - clock;
timer -= S390_lowcore.last_update_timer;
@@ -216,41 +213,56 @@ void vtime_flush(struct task_struct *tsk)
avg_steal = S390_lowcore.avg_steal_timer / 2;
if ((s64) steal > 0) {
S390_lowcore.steal_timer = 0;
- account_steal_time(steal);
+ account_steal_time(cputime_to_nsecs(steal));
avg_steal += steal;
}
S390_lowcore.avg_steal_timer = avg_steal;
}
+static u64 vtime_delta(void)
+{
+ u64 timer = S390_lowcore.last_update_timer;
+
+ S390_lowcore.last_update_timer = get_vtimer();
+
+ return timer - S390_lowcore.last_update_timer;
+}
+
/*
* Update process times based on virtual cpu times stored by entry.S
* to the lowcore fields user_timer, system_timer & steal_clock.
*/
-void vtime_account_irq_enter(struct task_struct *tsk)
+void vtime_account_kernel(struct task_struct *tsk)
{
- u64 timer;
+ u64 delta = vtime_delta();
- timer = S390_lowcore.last_update_timer;
- S390_lowcore.last_update_timer = get_vtimer();
- timer -= S390_lowcore.last_update_timer;
-
- if ((tsk->flags & PF_VCPU) && (irq_count() == 0))
- S390_lowcore.guest_timer += timer;
- else if (hardirq_count())
- S390_lowcore.hardirq_timer += timer;
- else if (in_serving_softirq())
- S390_lowcore.softirq_timer += timer;
+ if (tsk->flags & PF_VCPU)
+ S390_lowcore.guest_timer += delta;
else
- S390_lowcore.system_timer += timer;
+ S390_lowcore.system_timer += delta;
- virt_timer_forward(timer);
+ virt_timer_forward(delta);
}
-EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
-
-void vtime_account_kernel(struct task_struct *tsk)
-__attribute__((alias("vtime_account_irq_enter")));
EXPORT_SYMBOL_GPL(vtime_account_kernel);
+void vtime_account_softirq(struct task_struct *tsk)
+{
+ u64 delta = vtime_delta();
+
+ S390_lowcore.softirq_timer += delta;
+
+ virt_timer_forward(delta);
+}
+
+void vtime_account_hardirq(struct task_struct *tsk)
+{
+ u64 delta = vtime_delta();
+
+ S390_lowcore.hardirq_timer += delta;
+
+ virt_timer_forward(delta);
+}
+
/*
* Sorted add to a list. List is linear searched until first bigger
* element is found.
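The steal time kept in the lowcore is in CPU-timer units while account_steal_time() expects nanoseconds, hence the cputime_to_nsecs() conversion added above. A rough sketch of the conversion, assuming the z/Architecture CPU-timer resolution of 4096 units per microsecond (the helper name is illustrative, not the kernel's implementation):

	static inline unsigned long long cputime_units_to_ns(unsigned long long cputime)
	{
		/* 4096 CPU-timer units per microsecond -> 1000/4096 ns per unit */
		return cputime * 1000 / 4096;
	}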
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index d3db3d7ed077..72e9b7dcdf7d 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -7,7 +7,7 @@ source "virt/kvm/Kconfig"
menuconfig VIRTUALIZATION
def_bool y
prompt "KVM"
- ---help---
+ help
Say Y here to get to see options for using your Linux host to run other
operating systems inside virtual machines (guests).
This option alone does not add any kernel code.
@@ -20,20 +20,18 @@ config KVM
def_tristate y
prompt "Kernel-based Virtual Machine (KVM) support"
depends on HAVE_KVM
- select PREEMPT_NOTIFIERS
select HAVE_KVM_CPU_RELAX_INTERCEPT
select HAVE_KVM_VCPU_ASYNC_IOCTL
- select HAVE_KVM_EVENTFD
select KVM_ASYNC_PF
select KVM_ASYNC_PF_SYNC
+ select KVM_COMMON
select HAVE_KVM_IRQCHIP
- select HAVE_KVM_IRQFD
select HAVE_KVM_IRQ_ROUTING
select HAVE_KVM_INVALID_WAKEUPS
select HAVE_KVM_NO_POLL
- select SRCU
select KVM_VFIO
- ---help---
+ select MMU_NOTIFIER
+ help
Support hosting paravirtualized guest machines using the SIE
virtualization capability on the mainframe. This should work
on any 64bit machine.
@@ -49,14 +47,10 @@ config KVM
config KVM_S390_UCONTROL
bool "Userspace controlled virtual machines"
depends on KVM
- ---help---
+ help
Allow CAP_SYS_ADMIN users to create KVM virtual machines that are
controlled by userspace.
If unsure, say N.
-# OK, it's a little counter-intuitive to do this, but it puts it neatly under
-# the virtualization menu.
-source "drivers/vhost/Kconfig"
-
endif # VIRTUALIZATION
diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile
index 05ee90a5ea08..02217fb4ae10 100644
--- a/arch/s390/kvm/Makefile
+++ b/arch/s390/kvm/Makefile
@@ -3,12 +3,12 @@
#
# Copyright IBM Corp. 2008
-KVM := ../../../virt/kvm
-common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o $(KVM)/irqchip.o $(KVM)/vfio.o
+include $(srctree)/virt/kvm/Makefile.kvm
ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
-kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o
-kvm-objs += diag.o gaccess.o guestdbg.o vsie.o
+kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o
+kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o
+kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o
obj-$(CONFIG_KVM) += kvm.o
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index 3fb54ec2cf3e..3c65b8258ae6 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -2,7 +2,7 @@
/*
* handling diagnose instructions
*
- * Copyright IBM Corp. 2008, 2011
+ * Copyright IBM Corp. 2008, 2020
*
* Author(s): Carsten Otte <cotte@de.ibm.com>
* Christian Borntraeger <borntraeger@de.ibm.com>
@@ -10,7 +10,6 @@
#include <linux/kvm.h>
#include <linux/kvm_host.h>
-#include <asm/pgalloc.h>
#include <asm/gmap.h>
#include <asm/virtio-ccw.h>
#include "kvm-s390.h"
@@ -25,7 +24,7 @@ static int diag_release_pages(struct kvm_vcpu *vcpu)
start = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4];
end = vcpu->run->s.regs.gprs[vcpu->arch.sie_block->ipa & 0xf] + PAGE_SIZE;
- vcpu->stat.diagnose_10++;
+ vcpu->stat.instruction_diagnose_10++;
if (start & ~PAGE_MASK || end & ~PAGE_MASK || start >= end
|| start < 2 * PAGE_SIZE)
@@ -75,7 +74,7 @@ static int __diag_page_ref_service(struct kvm_vcpu *vcpu)
VCPU_EVENT(vcpu, 3, "diag page reference parameter block at 0x%llx",
vcpu->run->s.regs.gprs[rx]);
- vcpu->stat.diagnose_258++;
+ vcpu->stat.instruction_diagnose_258++;
if (vcpu->run->s.regs.gprs[rx] & 7)
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
rc = read_guest(vcpu, vcpu->run->s.regs.gprs[rx], rx, &parm, sizeof(parm));
@@ -146,18 +145,32 @@ static int __diag_page_ref_service(struct kvm_vcpu *vcpu)
static int __diag_time_slice_end(struct kvm_vcpu *vcpu)
{
VCPU_EVENT(vcpu, 5, "%s", "diag time slice end");
- vcpu->stat.diagnose_44++;
+ vcpu->stat.instruction_diagnose_44++;
kvm_vcpu_on_spin(vcpu, true);
return 0;
}
+static int forward_cnt;
+static unsigned long cur_slice;
+
+static int diag9c_forwarding_overrun(void)
+{
+ /* Reset the count on a new slice */
+ if (time_after(jiffies, cur_slice)) {
+ cur_slice = jiffies;
+ forward_cnt = diag9c_forwarding_hz / HZ;
+ }
+ return forward_cnt-- <= 0 ? 1 : 0;
+}
+
static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu)
{
struct kvm_vcpu *tcpu;
+ int tcpu_cpu;
int tid;
tid = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4];
- vcpu->stat.diagnose_9c++;
+ vcpu->stat.instruction_diagnose_9c++;
/* yield to self */
if (tid == vcpu->vcpu_id)
@@ -168,9 +181,22 @@ static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu)
if (!tcpu)
goto no_yield;
- /* target already running */
- if (READ_ONCE(tcpu->cpu) >= 0)
- goto no_yield;
+ /* target guest VCPU already running */
+ tcpu_cpu = READ_ONCE(tcpu->cpu);
+ if (tcpu_cpu >= 0) {
+ if (!diag9c_forwarding_hz || diag9c_forwarding_overrun())
+ goto no_yield;
+
+ /* target host CPU already running */
+ if (!vcpu_is_preempted(tcpu_cpu))
+ goto no_yield;
+ smp_yield_cpu(tcpu_cpu);
+ VCPU_EVENT(vcpu, 5,
+ "diag time slice end directed to %d: yield forwarded",
+ tid);
+ vcpu->stat.diag_9c_forward++;
+ return 0;
+ }
if (kvm_vcpu_yield_to(tcpu) <= 0)
goto no_yield;
@@ -179,7 +205,7 @@ static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu)
return 0;
no_yield:
VCPU_EVENT(vcpu, 5, "diag time slice end directed to %d: ignored", tid);
- vcpu->stat.diagnose_9c_ignored++;
+ vcpu->stat.diag_9c_ignored++;
return 0;
}
@@ -189,7 +215,7 @@ static int __diag_ipl_functions(struct kvm_vcpu *vcpu)
unsigned long subcode = vcpu->run->s.regs.gprs[reg] & 0xffff;
VCPU_EVENT(vcpu, 3, "diag ipl functions, subcode %lx", subcode);
- vcpu->stat.diagnose_308++;
+ vcpu->stat.instruction_diagnose_308++;
switch (subcode) {
case 3:
vcpu->run->s390_reset_flags = KVM_S390_RESET_CLEAR;
@@ -201,6 +227,10 @@ static int __diag_ipl_functions(struct kvm_vcpu *vcpu)
return -EOPNOTSUPP;
}
+ /*
+ * no need to check the return value of vcpu_stop as it can only have
+ * an error for protvirt, but protvirt means user cpu state
+ */
if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm))
kvm_s390_vcpu_stop(vcpu);
vcpu->run->s390_reset_flags |= KVM_S390_RESET_SUBSYSTEM;
@@ -217,7 +247,7 @@ static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu)
{
int ret;
- vcpu->stat.diagnose_500++;
+ vcpu->stat.instruction_diagnose_500++;
/* No virtio-ccw notification? Get out quickly. */
if (!vcpu->kvm->arch.css_support ||
(vcpu->run->s.regs.gprs[1] != KVM_S390_VIRTIO_CCW_NOTIFY))
@@ -271,7 +301,7 @@ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu)
case 0x500:
return __diag_virtio_hypercall(vcpu);
default:
- vcpu->stat.diagnose_other++;
+ vcpu->stat.instruction_diagnose_other++;
return -EOPNOTSUPP;
}
}
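A worked example for the diag 9c forwarding rate limit introduced above (values are illustrative): with diag9c_forwarding_hz = 400 and HZ = 100, forward_cnt is reset to 400 / 100 = 4 at the start of each jiffy, so at most four directed yields per jiffy are forwarded to the host via smp_yield_cpu(); later requests in the same jiffy take the "ignored" path. With diag9c_forwarding_hz == 0, forwarding is disabled before diag9c_forwarding_overrun() is ever consulted.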
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 07d30ffcfa41..5bfcc50c1a68 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -9,8 +9,9 @@
#include <linux/vmalloc.h>
#include <linux/mm_types.h>
#include <linux/err.h>
-
-#include <asm/pgtable.h>
+#include <linux/pgtable.h>
+#include <linux/bitfield.h>
+#include <asm/fault.h>
#include <asm/gmap.h>
#include "kvm-s390.h"
#include "gaccess.h"
@@ -261,77 +262,77 @@ struct aste {
/* .. more fields there */
};
-int ipte_lock_held(struct kvm_vcpu *vcpu)
+int ipte_lock_held(struct kvm *kvm)
{
- if (vcpu->arch.sie_block->eca & ECA_SII) {
+ if (sclp.has_siif) {
int rc;
- read_lock(&vcpu->kvm->arch.sca_lock);
- rc = kvm_s390_get_ipte_control(vcpu->kvm)->kh != 0;
- read_unlock(&vcpu->kvm->arch.sca_lock);
+ read_lock(&kvm->arch.sca_lock);
+ rc = kvm_s390_get_ipte_control(kvm)->kh != 0;
+ read_unlock(&kvm->arch.sca_lock);
return rc;
}
- return vcpu->kvm->arch.ipte_lock_count != 0;
+ return kvm->arch.ipte_lock_count != 0;
}
-static void ipte_lock_simple(struct kvm_vcpu *vcpu)
+static void ipte_lock_simple(struct kvm *kvm)
{
union ipte_control old, new, *ic;
- mutex_lock(&vcpu->kvm->arch.ipte_mutex);
- vcpu->kvm->arch.ipte_lock_count++;
- if (vcpu->kvm->arch.ipte_lock_count > 1)
+ mutex_lock(&kvm->arch.ipte_mutex);
+ kvm->arch.ipte_lock_count++;
+ if (kvm->arch.ipte_lock_count > 1)
goto out;
retry:
- read_lock(&vcpu->kvm->arch.sca_lock);
- ic = kvm_s390_get_ipte_control(vcpu->kvm);
+ read_lock(&kvm->arch.sca_lock);
+ ic = kvm_s390_get_ipte_control(kvm);
do {
old = READ_ONCE(*ic);
if (old.k) {
- read_unlock(&vcpu->kvm->arch.sca_lock);
+ read_unlock(&kvm->arch.sca_lock);
cond_resched();
goto retry;
}
new = old;
new.k = 1;
} while (cmpxchg(&ic->val, old.val, new.val) != old.val);
- read_unlock(&vcpu->kvm->arch.sca_lock);
+ read_unlock(&kvm->arch.sca_lock);
out:
- mutex_unlock(&vcpu->kvm->arch.ipte_mutex);
+ mutex_unlock(&kvm->arch.ipte_mutex);
}
-static void ipte_unlock_simple(struct kvm_vcpu *vcpu)
+static void ipte_unlock_simple(struct kvm *kvm)
{
union ipte_control old, new, *ic;
- mutex_lock(&vcpu->kvm->arch.ipte_mutex);
- vcpu->kvm->arch.ipte_lock_count--;
- if (vcpu->kvm->arch.ipte_lock_count)
+ mutex_lock(&kvm->arch.ipte_mutex);
+ kvm->arch.ipte_lock_count--;
+ if (kvm->arch.ipte_lock_count)
goto out;
- read_lock(&vcpu->kvm->arch.sca_lock);
- ic = kvm_s390_get_ipte_control(vcpu->kvm);
+ read_lock(&kvm->arch.sca_lock);
+ ic = kvm_s390_get_ipte_control(kvm);
do {
old = READ_ONCE(*ic);
new = old;
new.k = 0;
} while (cmpxchg(&ic->val, old.val, new.val) != old.val);
- read_unlock(&vcpu->kvm->arch.sca_lock);
- wake_up(&vcpu->kvm->arch.ipte_wq);
+ read_unlock(&kvm->arch.sca_lock);
+ wake_up(&kvm->arch.ipte_wq);
out:
- mutex_unlock(&vcpu->kvm->arch.ipte_mutex);
+ mutex_unlock(&kvm->arch.ipte_mutex);
}
-static void ipte_lock_siif(struct kvm_vcpu *vcpu)
+static void ipte_lock_siif(struct kvm *kvm)
{
union ipte_control old, new, *ic;
retry:
- read_lock(&vcpu->kvm->arch.sca_lock);
- ic = kvm_s390_get_ipte_control(vcpu->kvm);
+ read_lock(&kvm->arch.sca_lock);
+ ic = kvm_s390_get_ipte_control(kvm);
do {
old = READ_ONCE(*ic);
if (old.kg) {
- read_unlock(&vcpu->kvm->arch.sca_lock);
+ read_unlock(&kvm->arch.sca_lock);
cond_resched();
goto retry;
}
@@ -339,15 +340,15 @@ retry:
new.k = 1;
new.kh++;
} while (cmpxchg(&ic->val, old.val, new.val) != old.val);
- read_unlock(&vcpu->kvm->arch.sca_lock);
+ read_unlock(&kvm->arch.sca_lock);
}
-static void ipte_unlock_siif(struct kvm_vcpu *vcpu)
+static void ipte_unlock_siif(struct kvm *kvm)
{
union ipte_control old, new, *ic;
- read_lock(&vcpu->kvm->arch.sca_lock);
- ic = kvm_s390_get_ipte_control(vcpu->kvm);
+ read_lock(&kvm->arch.sca_lock);
+ ic = kvm_s390_get_ipte_control(kvm);
do {
old = READ_ONCE(*ic);
new = old;
@@ -355,25 +356,25 @@ static void ipte_unlock_siif(struct kvm_vcpu *vcpu)
if (!new.kh)
new.k = 0;
} while (cmpxchg(&ic->val, old.val, new.val) != old.val);
- read_unlock(&vcpu->kvm->arch.sca_lock);
+ read_unlock(&kvm->arch.sca_lock);
if (!new.kh)
- wake_up(&vcpu->kvm->arch.ipte_wq);
+ wake_up(&kvm->arch.ipte_wq);
}
-void ipte_lock(struct kvm_vcpu *vcpu)
+void ipte_lock(struct kvm *kvm)
{
- if (vcpu->arch.sie_block->eca & ECA_SII)
- ipte_lock_siif(vcpu);
+ if (sclp.has_siif)
+ ipte_lock_siif(kvm);
else
- ipte_lock_simple(vcpu);
+ ipte_lock_simple(kvm);
}
-void ipte_unlock(struct kvm_vcpu *vcpu)
+void ipte_unlock(struct kvm *kvm)
{
- if (vcpu->arch.sie_block->eca & ECA_SII)
- ipte_unlock_siif(vcpu);
+ if (sclp.has_siif)
+ ipte_unlock_siif(kvm);
else
- ipte_unlock_simple(vcpu);
+ ipte_unlock_simple(kvm);
}
static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, u8 ar,
@@ -465,61 +466,55 @@ static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, u8 ar,
return 0;
}
-struct trans_exc_code_bits {
- unsigned long addr : 52; /* Translation-exception Address */
- unsigned long fsi : 2; /* Access Exception Fetch/Store Indication */
- unsigned long : 2;
- unsigned long b56 : 1;
- unsigned long : 3;
- unsigned long b60 : 1;
- unsigned long b61 : 1;
- unsigned long as : 2; /* ASCE Identifier */
-};
-
-enum {
- FSI_UNKNOWN = 0, /* Unknown wether fetch or store */
- FSI_STORE = 1, /* Exception was due to store operation */
- FSI_FETCH = 2 /* Exception was due to fetch operation */
-};
-
enum prot_type {
PROT_TYPE_LA = 0,
PROT_TYPE_KEYC = 1,
PROT_TYPE_ALC = 2,
PROT_TYPE_DAT = 3,
PROT_TYPE_IEP = 4,
+ /* Dummy value for passing an initialized value when code != PGM_PROTECTION */
+ PROT_NONE,
};
-static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
- u8 ar, enum gacc_mode mode, enum prot_type prot)
+static int trans_exc_ending(struct kvm_vcpu *vcpu, int code, unsigned long gva, u8 ar,
+ enum gacc_mode mode, enum prot_type prot, bool terminate)
{
struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
- struct trans_exc_code_bits *tec;
+ union teid *teid;
memset(pgm, 0, sizeof(*pgm));
pgm->code = code;
- tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
+ teid = (union teid *)&pgm->trans_exc_code;
switch (code) {
case PGM_PROTECTION:
switch (prot) {
+ case PROT_NONE:
+ /* We should never get here, acts like termination */
+ WARN_ON_ONCE(1);
+ break;
case PROT_TYPE_IEP:
- tec->b61 = 1;
- /* FALL THROUGH */
+ teid->b61 = 1;
+ fallthrough;
case PROT_TYPE_LA:
- tec->b56 = 1;
+ teid->b56 = 1;
break;
case PROT_TYPE_KEYC:
- tec->b60 = 1;
+ teid->b60 = 1;
break;
case PROT_TYPE_ALC:
- tec->b60 = 1;
- /* FALL THROUGH */
+ teid->b60 = 1;
+ fallthrough;
case PROT_TYPE_DAT:
- tec->b61 = 1;
+ teid->b61 = 1;
break;
}
- /* FALL THROUGH */
+ if (terminate) {
+ teid->b56 = 0;
+ teid->b60 = 0;
+ teid->b61 = 0;
+ }
+ fallthrough;
case PGM_ASCE_TYPE:
case PGM_PAGE_TRANSLATION:
case PGM_REGION_FIRST_TRANS:
@@ -531,10 +526,10 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
* exc_access_id has to be set to 0 for some instructions. Both
* cases have to be handled by the caller.
*/
- tec->addr = gva >> PAGE_SHIFT;
- tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
- tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as;
- /* FALL THROUGH */
+ teid->addr = gva >> PAGE_SHIFT;
+ teid->fsi = mode == GACC_STORE ? TEID_FSI_STORE : TEID_FSI_FETCH;
+ teid->as = psw_bits(vcpu->arch.sie_block->gpsw).as;
+ fallthrough;
case PGM_ALEN_TRANSLATION:
case PGM_ALE_SEQUENCE:
case PGM_ASTE_VALIDITY:
@@ -551,6 +546,12 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
return code;
}
+static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, u8 ar,
+ enum gacc_mode mode, enum prot_type prot)
+{
+ return trans_exc_ending(vcpu, code, gva, ar, mode, prot, false);
+}
+
static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
unsigned long ga, u8 ar, enum gacc_mode mode)
{
@@ -607,7 +608,7 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val)
* Returns: - zero on success; @gpa contains the resulting absolute address
* - a negative value if guest access failed due to e.g. broken
* guest mapping
- * - a positve value if an access exception happened. In this case
+ * - a positive value if an access exception happened. In this case
* the returned value is the program interruption code as defined
* by the architecture
*/
@@ -677,7 +678,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
dat_protection |= rfte.p;
ptr = rfte.rto * PAGE_SIZE + vaddr.rsx * 8;
}
- /* fallthrough */
+ fallthrough;
case ASCE_TYPE_REGION2: {
union region2_table_entry rste;
@@ -695,7 +696,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
dat_protection |= rste.p;
ptr = rste.rto * PAGE_SIZE + vaddr.rtx * 8;
}
- /* fallthrough */
+ fallthrough;
case ASCE_TYPE_REGION3: {
union region3_table_entry rtte;
@@ -723,7 +724,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
dat_protection |= rtte.fc0.p;
ptr = rtte.fc0.sto * PAGE_SIZE + vaddr.sx * 8;
}
- /* fallthrough */
+ fallthrough;
case ASCE_TYPE_SEGMENT: {
union segment_table_entry ste;
@@ -794,48 +795,270 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu,
return 1;
}
-static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
- unsigned long *pages, unsigned long nr_pages,
- const union asce asce, enum gacc_mode mode)
+static int vm_check_access_key(struct kvm *kvm, u8 access_key,
+ enum gacc_mode mode, gpa_t gpa)
+{
+ u8 storage_key, access_control;
+ bool fetch_protected;
+ unsigned long hva;
+ int r;
+
+ if (access_key == 0)
+ return 0;
+
+ hva = gfn_to_hva(kvm, gpa_to_gfn(gpa));
+ if (kvm_is_error_hva(hva))
+ return PGM_ADDRESSING;
+
+ mmap_read_lock(current->mm);
+ r = get_guest_storage_key(current->mm, hva, &storage_key);
+ mmap_read_unlock(current->mm);
+ if (r)
+ return r;
+ access_control = FIELD_GET(_PAGE_ACC_BITS, storage_key);
+ if (access_control == access_key)
+ return 0;
+ fetch_protected = storage_key & _PAGE_FP_BIT;
+ if ((mode == GACC_FETCH || mode == GACC_IFETCH) && !fetch_protected)
+ return 0;
+ return PGM_PROTECTION;
+}
+
+static bool fetch_prot_override_applicable(struct kvm_vcpu *vcpu, enum gacc_mode mode,
+ union asce asce)
+{
+ psw_t *psw = &vcpu->arch.sie_block->gpsw;
+ unsigned long override;
+
+ if (mode == GACC_FETCH || mode == GACC_IFETCH) {
+ /* check if fetch protection override enabled */
+ override = vcpu->arch.sie_block->gcr[0];
+ override &= CR0_FETCH_PROTECTION_OVERRIDE;
+ /* not applicable if subject to DAT && private space */
+ override = override && !(psw_bits(*psw).dat && asce.p);
+ return override;
+ }
+ return false;
+}
+
+static bool fetch_prot_override_applies(unsigned long ga, unsigned int len)
+{
+ return ga < 2048 && ga + len <= 2048;
+}
+
+static bool storage_prot_override_applicable(struct kvm_vcpu *vcpu)
+{
+ /* check if storage protection override enabled */
+ return vcpu->arch.sie_block->gcr[0] & CR0_STORAGE_PROTECTION_OVERRIDE;
+}
+
+static bool storage_prot_override_applies(u8 access_control)
+{
+ /* matches special storage protection override key (9) -> allow */
+ return access_control == PAGE_SPO_ACC;
+}
+
+static int vcpu_check_access_key(struct kvm_vcpu *vcpu, u8 access_key,
+ enum gacc_mode mode, union asce asce, gpa_t gpa,
+ unsigned long ga, unsigned int len)
+{
+ u8 storage_key, access_control;
+ unsigned long hva;
+ int r;
+
+ /* access key 0 matches any storage key -> allow */
+ if (access_key == 0)
+ return 0;
+ /*
+ * caller needs to ensure that gfn is accessible, so we can
+ * assume that this cannot fail
+ */
+ hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gpa));
+ mmap_read_lock(current->mm);
+ r = get_guest_storage_key(current->mm, hva, &storage_key);
+ mmap_read_unlock(current->mm);
+ if (r)
+ return r;
+ access_control = FIELD_GET(_PAGE_ACC_BITS, storage_key);
+ /* access key matches storage key -> allow */
+ if (access_control == access_key)
+ return 0;
+ if (mode == GACC_FETCH || mode == GACC_IFETCH) {
+ /* it is a fetch and fetch protection is off -> allow */
+ if (!(storage_key & _PAGE_FP_BIT))
+ return 0;
+ if (fetch_prot_override_applicable(vcpu, mode, asce) &&
+ fetch_prot_override_applies(ga, len))
+ return 0;
+ }
+ if (storage_prot_override_applicable(vcpu) &&
+ storage_prot_override_applies(access_control))
+ return 0;
+ return PGM_PROTECTION;
+}
+
+/**
+ * guest_range_to_gpas() - Calculate guest physical addresses of page fragments
+ * covering a logical range
+ * @vcpu: virtual cpu
+ * @ga: guest address, start of range
+ * @ar: access register
+ * @gpas: output argument, may be NULL
+ * @len: length of range in bytes
+ * @asce: address-space-control element to use for translation
+ * @mode: access mode
+ * @access_key: access key to match the range's storage keys against
+ *
+ * Translate a logical range to a series of guest absolute addresses,
+ * such that the concatenation of page fragments starting at each gpa make up
+ * the whole range.
+ * The translation is performed as if done by the cpu for the given @asce, @ar,
+ * @mode and state of the @vcpu.
+ * If the translation causes an exception, its program interruption code is
+ * returned and the &struct kvm_s390_pgm_info pgm member of @vcpu is modified
+ * such that a subsequent call to kvm_s390_inject_prog_vcpu() will inject
+ * a correct exception into the guest.
+ * The resulting gpas are stored into @gpas, unless it is NULL.
+ *
+ * Note: All fragments except the first one start at the beginning of a page.
+ * When deriving the boundaries of a fragment from a gpa, all but the last
+ * fragment end at the end of the page.
+ *
+ * Return:
+ * * 0 - success
+ * * <0 - translation could not be performed, for example if guest
+ * memory could not be accessed
+ * * >0 - an access exception occurred. In this case the returned value
+ * is the program interruption code and the contents of pgm may
+ * be used to inject an exception into the guest.
+ */
+static int guest_range_to_gpas(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
+ unsigned long *gpas, unsigned long len,
+ const union asce asce, enum gacc_mode mode,
+ u8 access_key)
{
psw_t *psw = &vcpu->arch.sie_block->gpsw;
+ unsigned int offset = offset_in_page(ga);
+ unsigned int fragment_len;
int lap_enabled, rc = 0;
enum prot_type prot;
+ unsigned long gpa;
lap_enabled = low_address_protection_enabled(vcpu, asce);
- while (nr_pages) {
+ while (min(PAGE_SIZE - offset, len) > 0) {
+ fragment_len = min(PAGE_SIZE - offset, len);
ga = kvm_s390_logical_to_effective(vcpu, ga);
if (mode == GACC_STORE && lap_enabled && is_low_address(ga))
return trans_exc(vcpu, PGM_PROTECTION, ga, ar, mode,
PROT_TYPE_LA);
- ga &= PAGE_MASK;
if (psw_bits(*psw).dat) {
- rc = guest_translate(vcpu, ga, pages, asce, mode, &prot);
+ rc = guest_translate(vcpu, ga, &gpa, asce, mode, &prot);
if (rc < 0)
return rc;
} else {
- *pages = kvm_s390_real_to_abs(vcpu, ga);
- if (kvm_is_error_gpa(vcpu->kvm, *pages))
+ gpa = kvm_s390_real_to_abs(vcpu, ga);
+ if (kvm_is_error_gpa(vcpu->kvm, gpa)) {
rc = PGM_ADDRESSING;
+ prot = PROT_NONE;
+ }
}
if (rc)
return trans_exc(vcpu, rc, ga, ar, mode, prot);
- ga += PAGE_SIZE;
- pages++;
- nr_pages--;
+ rc = vcpu_check_access_key(vcpu, access_key, mode, asce, gpa, ga,
+ fragment_len);
+ if (rc)
+ return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_KEYC);
+ if (gpas)
+ *gpas++ = gpa;
+ offset = 0;
+ ga += fragment_len;
+ len -= fragment_len;
+ }
+ return 0;
+}
+
+static int access_guest_page(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa,
+ void *data, unsigned int len)
+{
+ const unsigned int offset = offset_in_page(gpa);
+ const gfn_t gfn = gpa_to_gfn(gpa);
+ int rc;
+
+ if (mode == GACC_STORE)
+ rc = kvm_write_guest_page(kvm, gfn, data, offset, len);
+ else
+ rc = kvm_read_guest_page(kvm, gfn, data, offset, len);
+ return rc;
+}
+
+static int
+access_guest_page_with_key(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa,
+ void *data, unsigned int len, u8 access_key)
+{
+ struct kvm_memory_slot *slot;
+ bool writable;
+ gfn_t gfn;
+ hva_t hva;
+ int rc;
+
+ gfn = gpa >> PAGE_SHIFT;
+ slot = gfn_to_memslot(kvm, gfn);
+ hva = gfn_to_hva_memslot_prot(slot, gfn, &writable);
+
+ if (kvm_is_error_hva(hva))
+ return PGM_ADDRESSING;
+ /*
+ * Check if it's a read-only memslot, even though that can't occur (they're unsupported).
+ * Don't try to actually handle that case.
+ */
+ if (!writable && mode == GACC_STORE)
+ return -EOPNOTSUPP;
+ hva += offset_in_page(gpa);
+ if (mode == GACC_STORE)
+ rc = copy_to_user_key((void __user *)hva, data, len, access_key);
+ else
+ rc = copy_from_user_key(data, (void __user *)hva, len, access_key);
+ if (rc)
+ return PGM_PROTECTION;
+ if (mode == GACC_STORE)
+ mark_page_dirty_in_slot(kvm, slot, gfn);
+ return 0;
+}
+
+int access_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, void *data,
+ unsigned long len, enum gacc_mode mode, u8 access_key)
+{
+ int offset = offset_in_page(gpa);
+ int fragment_len;
+ int rc;
+
+ while (min(PAGE_SIZE - offset, len) > 0) {
+ fragment_len = min(PAGE_SIZE - offset, len);
+ rc = access_guest_page_with_key(kvm, mode, gpa, data, fragment_len, access_key);
+ if (rc)
+ return rc;
+ offset = 0;
+ len -= fragment_len;
+ data += fragment_len;
+ gpa += fragment_len;
}
return 0;
}
-int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data,
- unsigned long len, enum gacc_mode mode)
+int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
+ void *data, unsigned long len, enum gacc_mode mode,
+ u8 access_key)
{
psw_t *psw = &vcpu->arch.sie_block->gpsw;
- unsigned long _len, nr_pages, gpa, idx;
- unsigned long pages_array[2];
- unsigned long *pages;
+ unsigned long nr_pages, idx;
+ unsigned long gpa_array[2];
+ unsigned int fragment_len;
+ unsigned long *gpas;
+ enum prot_type prot;
int need_ipte_lock;
union asce asce;
+ bool try_storage_prot_override;
+ bool try_fetch_prot_override;
int rc;
if (!len)
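A worked example of the fragment loop in guest_range_to_gpas() above (illustrative numbers, PAGE_SIZE = 0x1000): a logical range with ga = 0x1ff8 and len = 0x20 crosses a page boundary and is split into two fragments,

	fragment 0: offset = 0xff8, fragment_len = min(0x1000 - 0xff8, 0x20) = 0x8
	fragment 1: offset = 0x0,   fragment_len = min(0x1000, 0x18)         = 0x18

so gpas[] may hold two non-contiguous guest absolute addresses for a single logical range.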
@@ -845,55 +1068,199 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data,
if (rc)
return rc;
nr_pages = (((ga & ~PAGE_MASK) + len - 1) >> PAGE_SHIFT) + 1;
- pages = pages_array;
- if (nr_pages > ARRAY_SIZE(pages_array))
- pages = vmalloc(array_size(nr_pages, sizeof(unsigned long)));
- if (!pages)
+ gpas = gpa_array;
+ if (nr_pages > ARRAY_SIZE(gpa_array))
+ gpas = vmalloc(array_size(nr_pages, sizeof(unsigned long)));
+ if (!gpas)
return -ENOMEM;
+ try_fetch_prot_override = fetch_prot_override_applicable(vcpu, mode, asce);
+ try_storage_prot_override = storage_prot_override_applicable(vcpu);
need_ipte_lock = psw_bits(*psw).dat && !asce.r;
if (need_ipte_lock)
- ipte_lock(vcpu);
- rc = guest_page_range(vcpu, ga, ar, pages, nr_pages, asce, mode);
- for (idx = 0; idx < nr_pages && !rc; idx++) {
- gpa = *(pages + idx) + (ga & ~PAGE_MASK);
- _len = min(PAGE_SIZE - (gpa & ~PAGE_MASK), len);
- if (mode == GACC_STORE)
- rc = kvm_write_guest(vcpu->kvm, gpa, data, _len);
+ ipte_lock(vcpu->kvm);
+ /*
+ * Since we do the access further down ultimately via a move instruction
+ * that does key checking and returns an error in case of a protection
+ * violation, we don't need to do the check during address translation.
+ * Skip it by passing access key 0, which matches any storage key,
+ * obviating the need for any further checks. As a result the check is
+ * handled entirely in hardware on access; we only need to take care to
+ * forego key protection checking if fetch protection override applies,
+ * or to retry with the special key 9 in case of storage protection override.
+ */
+ rc = guest_range_to_gpas(vcpu, ga, ar, gpas, len, asce, mode, 0);
+ if (rc)
+ goto out_unlock;
+ for (idx = 0; idx < nr_pages; idx++) {
+ fragment_len = min(PAGE_SIZE - offset_in_page(gpas[idx]), len);
+ if (try_fetch_prot_override && fetch_prot_override_applies(ga, fragment_len)) {
+ rc = access_guest_page(vcpu->kvm, mode, gpas[idx],
+ data, fragment_len);
+ } else {
+ rc = access_guest_page_with_key(vcpu->kvm, mode, gpas[idx],
+ data, fragment_len, access_key);
+ }
+ if (rc == PGM_PROTECTION && try_storage_prot_override)
+ rc = access_guest_page_with_key(vcpu->kvm, mode, gpas[idx],
+ data, fragment_len, PAGE_SPO_ACC);
+ if (rc)
+ break;
+ len -= fragment_len;
+ data += fragment_len;
+ ga = kvm_s390_logical_to_effective(vcpu, ga + fragment_len);
+ }
+ if (rc > 0) {
+ bool terminate = (mode == GACC_STORE) && (idx > 0);
+
+ if (rc == PGM_PROTECTION)
+ prot = PROT_TYPE_KEYC;
else
- rc = kvm_read_guest(vcpu->kvm, gpa, data, _len);
- len -= _len;
- ga += _len;
- data += _len;
+ prot = PROT_NONE;
+ rc = trans_exc_ending(vcpu, rc, ga, ar, mode, prot, terminate);
}
+out_unlock:
if (need_ipte_lock)
- ipte_unlock(vcpu);
- if (nr_pages > ARRAY_SIZE(pages_array))
- vfree(pages);
+ ipte_unlock(vcpu->kvm);
+ if (nr_pages > ARRAY_SIZE(gpa_array))
+ vfree(gpas);
return rc;
}
int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
void *data, unsigned long len, enum gacc_mode mode)
{
- unsigned long _len, gpa;
+ unsigned int fragment_len;
+ unsigned long gpa;
int rc = 0;
while (len && !rc) {
gpa = kvm_s390_real_to_abs(vcpu, gra);
- _len = min(PAGE_SIZE - (gpa & ~PAGE_MASK), len);
- if (mode)
- rc = write_guest_abs(vcpu, gpa, data, _len);
- else
- rc = read_guest_abs(vcpu, gpa, data, _len);
- len -= _len;
- gra += _len;
- data += _len;
+ fragment_len = min(PAGE_SIZE - offset_in_page(gpa), len);
+ rc = access_guest_page(vcpu->kvm, mode, gpa, data, fragment_len);
+ len -= fragment_len;
+ gra += fragment_len;
+ data += fragment_len;
}
return rc;
}
/**
- * guest_translate_address - translate guest logical into guest absolute address
+ * cmpxchg_guest_abs_with_key() - Perform cmpxchg on guest absolute address.
+ * @kvm: Virtual machine instance.
+ * @gpa: Absolute guest address of the location to be changed.
+ * @len: Operand length of the cmpxchg, required: 1 <= len <= 16. Providing a
+ * non power of two will result in failure.
+ * @old_addr: Pointer to old value. If the location at @gpa contains this value,
+ * the exchange will succeed. After calling cmpxchg_guest_abs_with_key()
+ * *@old_addr contains the value at @gpa before the attempt to
+ * exchange the value.
+ * @new: The value to place at @gpa.
+ * @access_key: The access key to use for the guest access.
+ * @success: output value indicating if an exchange occurred.
+ *
+ * Atomically exchange the value at @gpa with @new, if it contains *@old_addr.
+ * Honors storage keys.
+ *
+ * Return: * 0: successful exchange
+ * * >0: a program interruption code indicating the reason cmpxchg could
+ * not be attempted
+ * * -EINVAL: address misaligned or len not power of two
+ * * -EAGAIN: transient failure (len 1 or 2)
+ * * -EOPNOTSUPP: read-only memslot (should never occur)
+ */
+int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len,
+ __uint128_t *old_addr, __uint128_t new,
+ u8 access_key, bool *success)
+{
+ gfn_t gfn = gpa_to_gfn(gpa);
+ struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+ bool writable;
+ hva_t hva;
+ int ret;
+
+ if (!IS_ALIGNED(gpa, len))
+ return -EINVAL;
+
+ hva = gfn_to_hva_memslot_prot(slot, gfn, &writable);
+ if (kvm_is_error_hva(hva))
+ return PGM_ADDRESSING;
+ /*
+ * Check if it's a read-only memslot, even though that cannot occur
+ * since those are unsupported.
+ * Don't try to actually handle that case.
+ */
+ if (!writable)
+ return -EOPNOTSUPP;
+
+ hva += offset_in_page(gpa);
+ /*
+ * The cmpxchg_user_key macro depends on the type of "old", so we need
+ * a case for each valid length and get some code duplication as long
+ * as we don't introduce a new macro.
+ */
+ switch (len) {
+ case 1: {
+ u8 old;
+
+ ret = cmpxchg_user_key((u8 __user *)hva, &old, *old_addr, new, access_key);
+ *success = !ret && old == *old_addr;
+ *old_addr = old;
+ break;
+ }
+ case 2: {
+ u16 old;
+
+ ret = cmpxchg_user_key((u16 __user *)hva, &old, *old_addr, new, access_key);
+ *success = !ret && old == *old_addr;
+ *old_addr = old;
+ break;
+ }
+ case 4: {
+ u32 old;
+
+ ret = cmpxchg_user_key((u32 __user *)hva, &old, *old_addr, new, access_key);
+ *success = !ret && old == *old_addr;
+ *old_addr = old;
+ break;
+ }
+ case 8: {
+ u64 old;
+
+ ret = cmpxchg_user_key((u64 __user *)hva, &old, *old_addr, new, access_key);
+ *success = !ret && old == *old_addr;
+ *old_addr = old;
+ break;
+ }
+ case 16: {
+ __uint128_t old;
+
+ ret = cmpxchg_user_key((__uint128_t __user *)hva, &old, *old_addr, new, access_key);
+ *success = !ret && old == *old_addr;
+ *old_addr = old;
+ break;
+ }
+ default:
+ return -EINVAL;
+ }
+ if (*success)
+ mark_page_dirty_in_slot(kvm, slot, gfn);
+ /*
+ * Assume that the fault is caused by protection, either key protection
+ * or user page write protection.
+ */
+ if (ret == -EFAULT)
+ ret = PGM_PROTECTION;
+ return ret;
+}
+
+/**
+ * guest_translate_address_with_key - translate guest logical into guest absolute address
+ * @vcpu: virtual cpu
+ * @gva: Guest virtual address
+ * @ar: Access register
+ * @gpa: Guest physical address
+ * @mode: Translation access mode
+ * @access_key: access key to match the storage key with
*
* Parameter semantics are the same as the ones from guest_translate.
* The memory contents at the guest address are not changed.
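A hypothetical caller of cmpxchg_guest_abs_with_key(), shown only to illustrate the in/out semantics of @old_addr and @success; the helper below is invented and not part of the patch:

	/* Atomically increment an 8-byte guest counter, honouring storage keys. */
	static int bump_guest_counter(struct kvm *kvm, gpa_t gpa, u8 access_key)
	{
		__uint128_t old = 0;
		bool success;
		int rc;

		do {
			__uint128_t new = old + 1;

			rc = cmpxchg_guest_abs_with_key(kvm, gpa, 8, &old, new,
							access_key, &success);
		} while (!rc && !success);	/* *old_addr was refreshed, retry */
		return rc;	/* 0, a PGM_* code, or a negative errno */
	}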
@@ -901,11 +1268,10 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
* Note: The IPTE lock is not taken during this function, so the caller
* has to take care of this.
*/
-int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
- unsigned long *gpa, enum gacc_mode mode)
+int guest_translate_address_with_key(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
+ unsigned long *gpa, enum gacc_mode mode,
+ u8 access_key)
{
- psw_t *psw = &vcpu->arch.sie_block->gpsw;
- enum prot_type prot;
union asce asce;
int rc;
@@ -913,49 +1279,62 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
rc = get_vcpu_asce(vcpu, &asce, gva, ar, mode);
if (rc)
return rc;
- if (is_low_address(gva) && low_address_protection_enabled(vcpu, asce)) {
- if (mode == GACC_STORE)
- return trans_exc(vcpu, PGM_PROTECTION, gva, 0,
- mode, PROT_TYPE_LA);
- }
+ return guest_range_to_gpas(vcpu, gva, ar, gpa, 1, asce, mode,
+ access_key);
+}
- if (psw_bits(*psw).dat && !asce.r) { /* Use DAT? */
- rc = guest_translate(vcpu, gva, gpa, asce, mode, &prot);
- if (rc > 0)
- return trans_exc(vcpu, rc, gva, 0, mode, prot);
- } else {
- *gpa = kvm_s390_real_to_abs(vcpu, gva);
- if (kvm_is_error_gpa(vcpu->kvm, *gpa))
- return trans_exc(vcpu, rc, gva, PGM_ADDRESSING, mode, 0);
- }
+/**
+ * check_gva_range - test a range of guest virtual addresses for accessibility
+ * @vcpu: virtual cpu
+ * @gva: Guest virtual address
+ * @ar: Access register
+ * @length: Length of test range
+ * @mode: Translation access mode
+ * @access_key: access key to match the storage keys with
+ */
+int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
+ unsigned long length, enum gacc_mode mode, u8 access_key)
+{
+ union asce asce;
+ int rc = 0;
+
+ rc = get_vcpu_asce(vcpu, &asce, gva, ar, mode);
+ if (rc)
+ return rc;
+ ipte_lock(vcpu->kvm);
+ rc = guest_range_to_gpas(vcpu, gva, ar, NULL, length, asce, mode,
+ access_key);
+ ipte_unlock(vcpu->kvm);
return rc;
}
/**
- * check_gva_range - test a range of guest virtual addresses for accessibility
+ * check_gpa_range - test a range of guest physical addresses for accessibility
+ * @kvm: virtual machine instance
+ * @gpa: guest physical address
+ * @length: length of test range
+ * @mode: access mode to test, relevant for storage keys
+ * @access_key: access key to match the storage keys with
*/
-int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
- unsigned long length, enum gacc_mode mode)
+int check_gpa_range(struct kvm *kvm, unsigned long gpa, unsigned long length,
+ enum gacc_mode mode, u8 access_key)
{
- unsigned long gpa;
- unsigned long currlen;
+ unsigned int fragment_len;
int rc = 0;
- ipte_lock(vcpu);
- while (length > 0 && !rc) {
- currlen = min(length, PAGE_SIZE - (gva % PAGE_SIZE));
- rc = guest_translate_address(vcpu, gva, ar, &gpa, mode);
- gva += currlen;
- length -= currlen;
+ while (length && !rc) {
+ fragment_len = min(PAGE_SIZE - offset_in_page(gpa), length);
+ rc = vm_check_access_key(kvm, access_key, mode, gpa);
+ length -= fragment_len;
+ gpa += fragment_len;
}
- ipte_unlock(vcpu);
-
return rc;
}
/**
* kvm_s390_check_low_addr_prot_real - check for low-address protection
+ * @vcpu: virtual cpu
* @gra: Guest real address
*
* Checks whether an address is subject to low-address protection and set
@@ -976,13 +1355,17 @@ int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra)
* kvm_s390_shadow_tables - walk the guest page table and create shadow tables
* @sg: pointer to the shadow guest address space structure
* @saddr: faulting address in the shadow gmap
- * @pgt: pointer to the page table address result
+ * @pgt: pointer to the beginning of the page table for the given address if
+ * successful (return value 0), or to the first invalid DAT entry in
+ * case of exceptions (return value > 0)
+ * @dat_protection: referenced memory is write protected
* @fake: pgt references contiguous guest memory block, not a pgtable
*/
static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
unsigned long *pgt, int *dat_protection,
int *fake)
{
+ struct kvm *kvm;
struct gmap *parent;
union asce asce;
union vaddress vaddr;
@@ -991,6 +1374,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
*fake = 0;
*dat_protection = 0;
+ kvm = sg->private;
parent = sg->parent;
vaddr.addr = saddr;
asce.val = sg->orig_asce;
@@ -1034,6 +1418,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
rfte.val = ptr;
goto shadow_r2t;
}
+ *pgt = ptr + vaddr.rfx * 8;
rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val);
if (rc)
return rc;
@@ -1050,7 +1435,9 @@ shadow_r2t:
rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake);
if (rc)
return rc;
- } /* fallthrough */
+ kvm->stat.gmap_shadow_r1_entry++;
+ }
+ fallthrough;
case ASCE_TYPE_REGION2: {
union region2_table_entry rste;
@@ -1059,6 +1446,7 @@ shadow_r2t:
rste.val = ptr;
goto shadow_r3t;
}
+ *pgt = ptr + vaddr.rsx * 8;
rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val);
if (rc)
return rc;
@@ -1076,7 +1464,9 @@ shadow_r3t:
rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake);
if (rc)
return rc;
- } /* fallthrough */
+ kvm->stat.gmap_shadow_r2_entry++;
+ }
+ fallthrough;
case ASCE_TYPE_REGION3: {
union region3_table_entry rtte;
@@ -1085,6 +1475,7 @@ shadow_r3t:
rtte.val = ptr;
goto shadow_sgt;
}
+ *pgt = ptr + vaddr.rtx * 8;
rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val);
if (rc)
return rc;
@@ -1111,7 +1502,9 @@ shadow_sgt:
rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake);
if (rc)
return rc;
- } /* fallthrough */
+ kvm->stat.gmap_shadow_r3_entry++;
+ }
+ fallthrough;
case ASCE_TYPE_SEGMENT: {
union segment_table_entry ste;
@@ -1120,6 +1513,7 @@ shadow_sgt:
ste.val = ptr;
goto shadow_pgt;
}
+ *pgt = ptr + vaddr.sx * 8;
rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val);
if (rc)
return rc;
@@ -1142,6 +1536,7 @@ shadow_pgt:
rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake);
if (rc)
return rc;
+ kvm->stat.gmap_shadow_sg_entry++;
}
}
/* Return the parent address of the page table */
@@ -1154,6 +1549,8 @@ shadow_pgt:
* @vcpu: virtual cpu
* @sg: pointer to the shadow guest address space structure
* @saddr: faulting address in the shadow gmap
+ * @datptr: will contain the address of the faulting DAT table entry, or of
+ * the valid leaf, plus some flags
*
* Returns: - 0 if the shadow fault was successfully resolved
* - > 0 (pgm exception code) on exceptions while faulting
@@ -1162,21 +1559,21 @@ shadow_pgt:
* - -ENOMEM if out of memory
*/
int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
- unsigned long saddr)
+ unsigned long saddr, unsigned long *datptr)
{
union vaddress vaddr;
union page_table_entry pte;
- unsigned long pgt;
+ unsigned long pgt = 0;
int dat_protection, fake;
int rc;
- down_read(&sg->mm->mmap_sem);
+ mmap_read_lock(sg->mm);
/*
* We don't want any guest-2 tables to change - so the parent
* tables/pointers we read stay valid - unshadowing is however
* always possible - only guest_table_lock protects us.
*/
- ipte_lock(vcpu);
+ ipte_lock(vcpu->kvm);
rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake);
if (rc)
@@ -1188,8 +1585,20 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
pte.val = pgt + vaddr.px * PAGE_SIZE;
goto shadow_page;
}
- if (!rc)
- rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val);
+
+ switch (rc) {
+ case PGM_SEGMENT_TRANSLATION:
+ case PGM_REGION_THIRD_TRANS:
+ case PGM_REGION_SECOND_TRANS:
+ case PGM_REGION_FIRST_TRANS:
+ pgt |= PEI_NOT_PTE;
+ break;
+ case 0:
+ pgt += vaddr.px * 8;
+ rc = gmap_read_table(sg->parent, pgt, &pte.val);
+ }
+ if (datptr)
+ *datptr = pgt | dat_protection * PEI_DAT_PROT;
if (!rc && pte.i)
rc = PGM_PAGE_TRANSLATION;
if (!rc && pte.z)
@@ -1198,7 +1607,8 @@ shadow_page:
pte.p |= dat_protection;
if (!rc)
rc = gmap_shadow_page(sg, saddr, __pte(pte.val));
- ipte_unlock(vcpu);
- up_read(&sg->mm->mmap_sem);
+ vcpu->kvm->stat.gmap_shadow_pg_entry++;
+ ipte_unlock(vcpu->kvm);
+ mmap_read_unlock(sg->mm);
return rc;
}
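For orientation on the new *datptr out-parameter returned by kvm_s390_shadow_fault(): the DAT entry address and the two flag bits share one value, which works because the table entries are 8-byte aligned. A consumer could unpack it along the lines of the sketch below (illustrative only, built on the PEI_* constants added to gaccess.h further down, not code from this patch):

static void decode_datptr(unsigned long datptr, unsigned long *entry,
			  bool *dat_prot, bool *not_pte)
{
	*dat_prot = datptr & PEI_DAT_PROT;	/* referenced memory is write protected */
	*not_pte  = datptr & PEI_NOT_PTE;	/* entry is not a page table entry */
	*entry    = datptr & ~(unsigned long)(PEI_DAT_PROT | PEI_NOT_PTE);
}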
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index f4c51756c462..b320d12aa049 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -18,17 +18,14 @@
/**
* kvm_s390_real_to_abs - convert guest real address to guest absolute address
- * @vcpu - guest virtual cpu
+ * @prefix - guest prefix
* @gra - guest real address
*
* Returns the guest absolute address that corresponds to the passed guest real
- * address @gra of a virtual guest cpu by applying its prefix.
+ * address @gra by applying the given prefix.
*/
-static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
- unsigned long gra)
+static inline unsigned long _kvm_s390_real_to_abs(u32 prefix, unsigned long gra)
{
- unsigned long prefix = kvm_s390_get_prefix(vcpu);
-
if (gra < 2 * PAGE_SIZE)
gra += prefix;
else if (gra >= prefix && gra < prefix + 2 * PAGE_SIZE)
@@ -37,6 +34,43 @@ static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
}
/**
+ * kvm_s390_real_to_abs - convert guest real address to guest absolute address
+ * @vcpu - guest virtual cpu
+ * @gra - guest real address
+ *
+ * Returns the guest absolute address that corresponds to the passed guest real
+ * address @gra of a virtual guest cpu by applying its prefix.
+ */
+static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
+ unsigned long gra)
+{
+ return _kvm_s390_real_to_abs(kvm_s390_get_prefix(vcpu), gra);
+}
+
+/**
+ * _kvm_s390_logical_to_effective - convert guest logical to effective address
+ * @psw: psw of the guest
+ * @ga: guest logical address
+ *
+ * Convert a guest logical address to an effective address by applying the
+ * rules of the addressing mode defined by bits 31 and 32 of the given PSW
+ * (extended/basic addressing mode).
+ *
+ * Depending on the addressing mode, the upper 40 bits (24 bit addressing
+ * mode), 33 bits (31 bit addressing mode) or no bits (64 bit addressing
+ * mode) of @ga will be zeroed and the remaining bits will be returned.
+ */
+static inline unsigned long _kvm_s390_logical_to_effective(psw_t *psw,
+ unsigned long ga)
+{
+ if (psw_bits(*psw).eaba == PSW_BITS_AMODE_64BIT)
+ return ga;
+ if (psw_bits(*psw).eaba == PSW_BITS_AMODE_31BIT)
+ return ga & ((1UL << 31) - 1);
+ return ga & ((1UL << 24) - 1);
+}
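A few worked values for the masking above (illustrative only):

/* 64-bit mode: 0xdeadbeef00000001 -> 0xdeadbeef00000001 (unchanged)             */
/* 31-bit mode: 0x00000000fedcba98 -> 0x000000007edcba98 (bit 31 cleared)        */
/* 24-bit mode: 0x0000000012345678 -> 0x0000000000345678 (upper 40 bits cleared) */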
+
+/**
* kvm_s390_logical_to_effective - convert guest logical to effective address
* @vcpu: guest virtual cpu
* @ga: guest logical address
@@ -52,13 +86,7 @@ static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
static inline unsigned long kvm_s390_logical_to_effective(struct kvm_vcpu *vcpu,
unsigned long ga)
{
- psw_t *psw = &vcpu->arch.sie_block->gpsw;
-
- if (psw_bits(*psw).eaba == PSW_BITS_AMODE_64BIT)
- return ga;
- if (psw_bits(*psw).eaba == PSW_BITS_AMODE_31BIT)
- return ga & ((1UL << 31) - 1);
- return ga & ((1UL << 24) - 1);
+ return _kvm_s390_logical_to_effective(&vcpu->arch.sie_block->gpsw, ga);
}
/*
@@ -158,24 +186,37 @@ enum gacc_mode {
GACC_IFETCH,
};
-int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva,
- u8 ar, unsigned long *gpa, enum gacc_mode mode);
+int guest_translate_address_with_key(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
+ unsigned long *gpa, enum gacc_mode mode,
+ u8 access_key);
+
int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
- unsigned long length, enum gacc_mode mode);
+ unsigned long length, enum gacc_mode mode, u8 access_key);
+
+int check_gpa_range(struct kvm *kvm, unsigned long gpa, unsigned long length,
+ enum gacc_mode mode, u8 access_key);
+
+int access_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, void *data,
+ unsigned long len, enum gacc_mode mode, u8 access_key);
-int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data,
- unsigned long len, enum gacc_mode mode);
+int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
+ void *data, unsigned long len, enum gacc_mode mode,
+ u8 access_key);
int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
void *data, unsigned long len, enum gacc_mode mode);
+int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, __uint128_t *old,
+ __uint128_t new, u8 access_key, bool *success);
+
/**
- * write_guest - copy data from kernel space to guest space
+ * write_guest_with_key - copy data from kernel space to guest space
* @vcpu: virtual cpu
* @ga: guest address
* @ar: access register
* @data: source address in kernel space
* @len: number of bytes to copy
+ * @access_key: access key the storage key needs to match
*
* Copy @len bytes from @data (kernel space) to @ga (guest address).
* In order to copy data to guest space the PSW of the vcpu is inspected:
@@ -186,8 +227,8 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
* The addressing mode of the PSW is also inspected, so that address wrap
* around is taken into account for 24-, 31- and 64-bit addressing mode,
* if the to be copied data crosses page boundaries in guest address space.
- * In addition also low address and DAT protection are inspected before
- * copying any data (key protection is currently not implemented).
+ * In addition low address, DAT and key protection checks are performed before
+ * copying any data.
*
* This function modifies the 'struct kvm_s390_pgm_info pgm' member of @vcpu.
* In case of an access exception (e.g. protection exception) pgm will contain
@@ -215,10 +256,53 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
* if data has been changed in guest space in case of an exception.
*/
static inline __must_check
+int write_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
+ void *data, unsigned long len, u8 access_key)
+{
+ return access_guest_with_key(vcpu, ga, ar, data, len, GACC_STORE,
+ access_key);
+}
+
+/**
+ * write_guest - copy data from kernel space to guest space
+ * @vcpu: virtual cpu
+ * @ga: guest address
+ * @ar: access register
+ * @data: source address in kernel space
+ * @len: number of bytes to copy
+ *
+ * The behaviour of write_guest is identical to write_guest_with_key, except
+ * that the PSW access key is used instead of an explicit argument.
+ */
+static inline __must_check
int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data,
unsigned long len)
{
- return access_guest(vcpu, ga, ar, data, len, GACC_STORE);
+ u8 access_key = psw_bits(vcpu->arch.sie_block->gpsw).key;
+
+ return write_guest_with_key(vcpu, ga, ar, data, len, access_key);
+}
+
+/**
+ * read_guest_with_key - copy data from guest space to kernel space
+ * @vcpu: virtual cpu
+ * @ga: guest address
+ * @ar: access register
+ * @data: destination address in kernel space
+ * @len: number of bytes to copy
+ * @access_key: access key the storage key needs to match
+ *
+ * Copy @len bytes from @ga (guest address) to @data (kernel space).
+ *
+ * The behaviour of read_guest_with_key is identical to write_guest_with_key,
+ * except that data will be copied from guest space to kernel space.
+ */
+static inline __must_check
+int read_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
+ void *data, unsigned long len, u8 access_key)
+{
+ return access_guest_with_key(vcpu, ga, ar, data, len, GACC_FETCH,
+ access_key);
}
/**
@@ -231,14 +315,16 @@ int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data,
*
* Copy @len bytes from @ga (guest address) to @data (kernel space).
*
- * The behaviour of read_guest is identical to write_guest, except that
- * data will be copied from guest space to kernel space.
+ * The behaviour of read_guest is identical to read_guest_with_key, except
+ * that the PSW access key is used instead of an explicit argument.
*/
static inline __must_check
int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data,
unsigned long len)
{
- return access_guest(vcpu, ga, ar, data, len, GACC_FETCH);
+ u8 access_key = psw_bits(vcpu->arch.sie_block->gpsw).key;
+
+ return read_guest_with_key(vcpu, ga, ar, data, len, access_key);
}
/**
@@ -259,7 +345,10 @@ static inline __must_check
int read_guest_instr(struct kvm_vcpu *vcpu, unsigned long ga, void *data,
unsigned long len)
{
- return access_guest(vcpu, ga, 0, data, len, GACC_IFETCH);
+ u8 access_key = psw_bits(vcpu->arch.sie_block->gpsw).key;
+
+ return access_guest_with_key(vcpu, ga, 0, data, len, GACC_IFETCH,
+ access_key);
}
/**
@@ -354,12 +443,16 @@ int read_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data,
return access_guest_real(vcpu, gra, data, len, 0);
}
-void ipte_lock(struct kvm_vcpu *vcpu);
-void ipte_unlock(struct kvm_vcpu *vcpu);
-int ipte_lock_held(struct kvm_vcpu *vcpu);
+void ipte_lock(struct kvm *kvm);
+void ipte_unlock(struct kvm *kvm);
+int ipte_lock_held(struct kvm *kvm);
int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra);
+/* MVPG PEI indication bits */
+#define PEI_DAT_PROT 2
+#define PEI_NOT_PTE 4
+
int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow,
- unsigned long saddr);
+ unsigned long saddr, unsigned long *datptr);
#endif /* __KVM_S390_GACCESS_H */
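To illustrate the split between the keyed accessors and the PSW-key wrappers declared above: a handler that carries its own access key (for instance taken from an instruction operand) passes it explicitly, while everything else keeps using read_guest()/write_guest(). The following is only a sketch — the function name and operand handling are hypothetical — using the kvm_s390_inject_prog_cond() error pattern seen in the handlers below:

static int fetch_with_operand_key(struct kvm_vcpu *vcpu, unsigned long ga,
				  u8 ar, u8 key_operand, u64 *val)
{
	u8 access_key = key_operand & 0xf;	/* access keys are 4 bits wide */
	int rc;

	rc = read_guest_with_key(vcpu, ga, ar, val, sizeof(*val), access_key);
	if (rc)	/* rc > 0 is a program interruption code */
		return kvm_s390_inject_prog_cond(vcpu, rc);
	return 0;
}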
diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c
index 394a5f53805b..80879fc73c90 100644
--- a/arch/s390/kvm/guestdbg.c
+++ b/arch/s390/kvm/guestdbg.c
@@ -184,7 +184,7 @@ static int __import_wp_info(struct kvm_vcpu *vcpu,
if (wp_info->len < 0 || wp_info->len > MAX_WP_SIZE)
return -EINVAL;
- wp_info->old_data = kmalloc(bp_data->len, GFP_KERNEL);
+ wp_info->old_data = kmalloc(bp_data->len, GFP_KERNEL_ACCOUNT);
if (!wp_info->old_data)
return -ENOMEM;
/* try to backup the original value */
@@ -213,8 +213,8 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu,
else if (dbg->arch.nr_hw_bp > MAX_BP_COUNT)
return -EINVAL;
- bp_data = memdup_user(dbg->arch.hw_bp,
- sizeof(*bp_data) * dbg->arch.nr_hw_bp);
+ bp_data = memdup_array_user(dbg->arch.hw_bp, dbg->arch.nr_hw_bp,
+ sizeof(*bp_data));
if (IS_ERR(bp_data))
return PTR_ERR(bp_data);
@@ -234,7 +234,7 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu,
if (nr_wp > 0) {
wp_info = kmalloc_array(nr_wp,
sizeof(*wp_info),
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (!wp_info) {
ret = -ENOMEM;
goto error;
@@ -243,7 +243,7 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu,
if (nr_bp > 0) {
bp_info = kmalloc_array(nr_bp,
sizeof(*bp_info),
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (!bp_info) {
ret = -ENOMEM;
goto error;
@@ -349,7 +349,7 @@ static struct kvm_hw_wp_info_arch *any_wp_changed(struct kvm_vcpu *vcpu)
if (!wp_info || !wp_info->old_data || wp_info->len <= 0)
continue;
- temp = kmalloc(wp_info->len, GFP_KERNEL);
+ temp = kmalloc(wp_info->len, GFP_KERNEL_ACCOUNT);
if (!temp)
continue;
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index a389fa85cca2..b16352083ff9 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -2,7 +2,7 @@
/*
* in-kernel handling for sie intercepts
*
- * Copyright IBM Corp. 2008, 2014
+ * Copyright IBM Corp. 2008, 2020
*
* Author(s): Carsten Otte <cotte@de.ibm.com>
* Christian Borntraeger <borntraeger@de.ibm.com>
@@ -12,10 +12,10 @@
#include <linux/errno.h>
#include <linux/pagemap.h>
-#include <asm/kvm_host.h>
#include <asm/asm-offsets.h>
#include <asm/irq.h>
#include <asm/sysinfo.h>
+#include <asm/uv.h>
#include "kvm-s390.h"
#include "gaccess.h"
@@ -79,6 +79,10 @@ static int handle_stop(struct kvm_vcpu *vcpu)
return rc;
}
+ /*
+ * No need to check the return value of vcpu_stop as it can only return
+ * an error for protvirt, but protvirt implies user controlled cpu state,
+ * so kvm_s390_vcpu_stop() is not reached in that case.
+ */
if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm))
kvm_s390_vcpu_stop(vcpu);
return -EOPNOTSUPP;
@@ -213,7 +217,7 @@ static int handle_itdb(struct kvm_vcpu *vcpu)
return 0;
if (current->thread.per_flags & PER_FLAG_NO_TE)
return 0;
- itdb = (struct kvm_s390_itdb *)vcpu->arch.sie_block->itdba;
+ itdb = phys_to_virt(vcpu->arch.sie_block->itdba);
rc = write_guest_lc(vcpu, __LC_PGM_TDB, itdb, sizeof(*itdb));
if (rc)
return rc;
@@ -224,6 +228,21 @@ static int handle_itdb(struct kvm_vcpu *vcpu)
#define per_event(vcpu) (vcpu->arch.sie_block->iprcc & PGM_PER)
+static bool should_handle_per_event(const struct kvm_vcpu *vcpu)
+{
+ if (!guestdbg_enabled(vcpu) || !per_event(vcpu))
+ return false;
+ if (guestdbg_sstep_enabled(vcpu) &&
+ vcpu->arch.sie_block->iprcc != PGM_PER) {
+ /*
+ * __vcpu_run() will exit after delivering the concurrently
+ * indicated condition.
+ */
+ return false;
+ }
+ return true;
+}
+
static int handle_prog(struct kvm_vcpu *vcpu)
{
psw_t psw;
@@ -231,7 +250,14 @@ static int handle_prog(struct kvm_vcpu *vcpu)
vcpu->stat.exit_program_interruption++;
- if (guestdbg_enabled(vcpu) && per_event(vcpu)) {
+ /*
+ * Intercept 8 indicates a loop of specification exceptions
+ * for protected guests.
+ */
+ if (kvm_s390_pv_cpu_is_protected(vcpu))
+ return -EOPNOTSUPP;
+
+ if (should_handle_per_event(vcpu)) {
rc = kvm_s390_handle_per_event(vcpu);
if (rc)
return rc;
@@ -258,11 +284,20 @@ static int handle_prog(struct kvm_vcpu *vcpu)
/**
* handle_external_interrupt - used for external interruption interceptions
+ * @vcpu: virtual cpu
+ *
+ * This interception occurs if:
+ * - the CPUSTAT_EXT_INT bit was already set when the external interrupt
+ * occurred. In this case, the interrupt needs to be injected manually to
+ * preserve interrupt priority.
+ * - the external new PSW has external interrupts enabled, which will cause an
+ * interruption loop. We drop to userspace in this case.
+ *
+ * The latter case can be detected by inspecting the external mask bit in the
+ * external new psw.
*
- * This interception only occurs if the CPUSTAT_EXT_INT bit was set, or if
- * the new PSW does not have external interrupts disabled. In the first case,
- * we've got to deliver the interrupt manually, and in the second case, we
- * drop to userspace to handle the situation there.
+ * Under PV, only the latter case can occur, since interrupt priorities are
+ * handled in the ultravisor.
*/
static int handle_external_interrupt(struct kvm_vcpu *vcpu)
{
@@ -273,10 +308,18 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu)
vcpu->stat.exit_external_interrupt++;
- rc = read_guest_lc(vcpu, __LC_EXT_NEW_PSW, &newpsw, sizeof(psw_t));
- if (rc)
- return rc;
- /* We can not handle clock comparator or timer interrupt with bad PSW */
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ newpsw = vcpu->arch.sie_block->gpsw;
+ } else {
+ rc = read_guest_lc(vcpu, __LC_EXT_NEW_PSW, &newpsw, sizeof(psw_t));
+ if (rc)
+ return rc;
+ }
+
+ /*
+ * Clock comparator or timer interrupt with external interrupt enabled
+ * will cause interrupt loop. Drop to userspace.
+ */
if ((eic == EXT_IRQ_CLK_COMP || eic == EXT_IRQ_CPU_TIMER) &&
(newpsw.mask & PSW_MASK_EXT))
return -EOPNOTSUPP;
@@ -304,7 +347,8 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu)
}
/**
- * Handle MOVE PAGE partial execution interception.
+ * handle_mvpg_pei - Handle MOVE PAGE partial execution interception.
+ * @vcpu: virtual cpu
*
* This interception can only happen for guests with DAT disabled and
* addresses that are currently not mapped in the host. Thus we try to
@@ -318,18 +362,18 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu)
kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
- /* Make sure that the source is paged-in */
- rc = guest_translate_address(vcpu, vcpu->run->s.regs.gprs[reg2],
- reg2, &srcaddr, GACC_FETCH);
+ /* Ensure that the source is paged-in, no actual access -> no key checking */
+ rc = guest_translate_address_with_key(vcpu, vcpu->run->s.regs.gprs[reg2],
+ reg2, &srcaddr, GACC_FETCH, 0);
if (rc)
return kvm_s390_inject_prog_cond(vcpu, rc);
rc = kvm_arch_fault_in_page(vcpu, srcaddr, 0);
if (rc != 0)
return rc;
- /* Make sure that the destination is paged-in */
- rc = guest_translate_address(vcpu, vcpu->run->s.regs.gprs[reg1],
- reg1, &dstaddr, GACC_STORE);
+ /* Ensure that the destination is paged-in, no actual access -> no key checking */
+ rc = guest_translate_address_with_key(vcpu, vcpu->run->s.regs.gprs[reg1],
+ reg1, &dstaddr, GACC_STORE, 0);
if (rc)
return kvm_s390_inject_prog_cond(vcpu, rc);
rc = kvm_arch_fault_in_page(vcpu, dstaddr, 1);
@@ -360,8 +404,8 @@ static int handle_partial_execution(struct kvm_vcpu *vcpu)
*/
int handle_sthyi(struct kvm_vcpu *vcpu)
{
- int reg1, reg2, r = 0;
- u64 code, addr, cc = 0, rc = 0;
+ int reg1, reg2, cc = 0, r = 0;
+ u64 code, addr, rc = 0;
struct sthyi_sctns *sctns = NULL;
if (!test_kvm_facility(vcpu->kvm, 74))
@@ -384,21 +428,28 @@ int handle_sthyi(struct kvm_vcpu *vcpu)
goto out;
}
- if (addr & ~PAGE_MASK)
+ if (!kvm_s390_pv_cpu_is_protected(vcpu) && (addr & ~PAGE_MASK))
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
- sctns = (void *)get_zeroed_page(GFP_KERNEL);
+ sctns = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!sctns)
return -ENOMEM;
cc = sthyi_fill(sctns, &rc);
-
+ if (cc < 0) {
+ free_page((unsigned long)sctns);
+ return cc;
+ }
out:
if (!cc) {
- r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE);
- if (r) {
- free_page((unsigned long)sctns);
- return kvm_s390_inject_prog_cond(vcpu, r);
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ memcpy(sida_addr(vcpu->arch.sie_block), sctns, PAGE_SIZE);
+ } else {
+ r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE);
+ if (r) {
+ free_page((unsigned long)sctns);
+ return kvm_s390_inject_prog_cond(vcpu, r);
+ }
}
}
@@ -444,6 +495,110 @@ static int handle_operexc(struct kvm_vcpu *vcpu)
return kvm_s390_inject_program_int(vcpu, PGM_OPERATION);
}
+static int handle_pv_spx(struct kvm_vcpu *vcpu)
+{
+ u32 pref = *(u32 *)sida_addr(vcpu->arch.sie_block);
+
+ kvm_s390_set_prefix(vcpu, pref);
+ trace_kvm_s390_handle_prefix(vcpu, 1, pref);
+ return 0;
+}
+
+static int handle_pv_sclp(struct kvm_vcpu *vcpu)
+{
+ struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
+
+ spin_lock(&fi->lock);
+ /*
+ * 2 cases:
+ * a: an sccb answering interrupt was already pending or in flight.
+ * As the sccb value is not known we can simply set some value to
+ * trigger delivery of a saved SCCB. UV will then use its saved
+ * copy of the SCCB value.
+ * b: an error SCCB interrupt needs to be injected so we also inject
+ * a fake SCCB address. Firmware will use the proper one.
+ * This makes sure that both errors and real sccb returns will only
+ * be delivered after a notification intercept (instruction has
+ * finished) but not after others.
+ */
+ fi->srv_signal.ext_params |= 0x43000;
+ set_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs);
+ clear_bit(IRQ_PEND_EXT_SERVICE, &fi->masked_irqs);
+ spin_unlock(&fi->lock);
+ return 0;
+}
+
+static int handle_pv_uvc(struct kvm_vcpu *vcpu)
+{
+ struct uv_cb_share *guest_uvcb = sida_addr(vcpu->arch.sie_block);
+ struct uv_cb_cts uvcb = {
+ .header.cmd = UVC_CMD_UNPIN_PAGE_SHARED,
+ .header.len = sizeof(uvcb),
+ .guest_handle = kvm_s390_pv_get_handle(vcpu->kvm),
+ .gaddr = guest_uvcb->paddr,
+ };
+ int rc;
+
+ if (guest_uvcb->header.cmd != UVC_CMD_REMOVE_SHARED_ACCESS) {
+ WARN_ONCE(1, "Unexpected notification intercept for UVC 0x%x\n",
+ guest_uvcb->header.cmd);
+ return 0;
+ }
+ rc = gmap_make_secure(vcpu->arch.gmap, uvcb.gaddr, &uvcb);
+ /*
+ * If the unpin did not succeed, the guest will exit again for the UVC
+ * and we will retry the unpin.
+ */
+ if (rc == -EINVAL)
+ return 0;
+ /*
+ * If we got -EAGAIN here, we simply return it. It will eventually
+ * get propagated all the way to userspace, which should then try
+ * again.
+ */
+ return rc;
+}
+
+static int handle_pv_notification(struct kvm_vcpu *vcpu)
+{
+ int ret;
+
+ if (vcpu->arch.sie_block->ipa == 0xb210)
+ return handle_pv_spx(vcpu);
+ if (vcpu->arch.sie_block->ipa == 0xb220)
+ return handle_pv_sclp(vcpu);
+ if (vcpu->arch.sie_block->ipa == 0xb9a4)
+ return handle_pv_uvc(vcpu);
+ if (vcpu->arch.sie_block->ipa >> 8 == 0xae) {
+ /*
+ * Besides external call, other SIGP orders also cause a
+ * 108 (pv notify) intercept. In contrast to external call,
+ * these orders need to be emulated and hence the appropriate
+ * place to handle them is in handle_instruction().
+ * So first try kvm_s390_handle_sigp_pei() and if that isn't
+ * successful, go on with handle_instruction().
+ */
+ ret = kvm_s390_handle_sigp_pei(vcpu);
+ if (!ret)
+ return ret;
+ }
+
+ return handle_instruction(vcpu);
+}
+
+static bool should_handle_per_ifetch(const struct kvm_vcpu *vcpu, int rc)
+{
+ /* Process PER, also if the instruction is processed in user space. */
+ if (!(vcpu->arch.sie_block->icptstatus & 0x02))
+ return false;
+ if (rc != 0 && rc != -EOPNOTSUPP)
+ return false;
+ if (guestdbg_sstep_enabled(vcpu) && vcpu->arch.local_int.pending_irqs)
+ /* __vcpu_run() will exit after delivering the interrupt. */
+ return false;
+ return true;
+}
+
int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
{
int rc, per_rc = 0;
@@ -478,15 +633,35 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
rc = handle_partial_execution(vcpu);
break;
case ICPT_KSS:
- rc = kvm_s390_skey_check_enable(vcpu);
+ /* Instruction will be redriven, skip the PER check. */
+ return kvm_s390_skey_check_enable(vcpu);
+ case ICPT_MCHKREQ:
+ case ICPT_INT_ENABLE:
+ /*
+ * PSW bit 13 or a CR (0, 6, 14) changed and we might
+ * now be able to deliver interrupts. The pre-run code
+ * will take care of this.
+ */
+ rc = 0;
+ break;
+ case ICPT_PV_INSTR:
+ rc = handle_instruction(vcpu);
+ break;
+ case ICPT_PV_NOTIFY:
+ rc = handle_pv_notification(vcpu);
+ break;
+ case ICPT_PV_PREF:
+ rc = 0;
+ gmap_convert_to_secure(vcpu->arch.gmap,
+ kvm_s390_get_prefix(vcpu));
+ gmap_convert_to_secure(vcpu->arch.gmap,
+ kvm_s390_get_prefix(vcpu) + PAGE_SIZE);
break;
default:
return -EOPNOTSUPP;
}
- /* process PER, also if the instrution is processed in user space */
- if (vcpu->arch.sie_block->icptstatus & 0x02 &&
- (!rc || rc == -EOPNOTSUPP))
+ if (should_handle_per_ifetch(vcpu, rc))
per_rc = kvm_s390_handle_per_ifetch_icpt(vcpu);
return per_rc ? per_rc : rc;
}
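The ipa values compared in handle_pv_notification() are the first halfword of the intercepted instruction, i.e. its opcode: 0xb210 SET PREFIX, 0xb220 SERVICE CALL, 0xb9a4 the UV call handled by handle_pv_uvc(), and 0xae.. the SIGP orders. Purely as an illustration (not part of this patch), a tracing helper built on that observation could look like:

static const char *pv_notify_ipa_name(u16 ipa)
{
	if (ipa == 0xb210)
		return "SET PREFIX";
	if (ipa == 0xb220)
		return "SERVICE CALL";
	if (ipa == 0xb9a4)
		return "UV call";
	if ((ipa >> 8) == 0xae)
		return "SIGP";
	return "other (falls through to handle_instruction)";
}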
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 165dea4c7f19..fc4007cc067a 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -2,7 +2,7 @@
/*
* handling kvm guest interrupts
*
- * Copyright IBM Corp. 2008, 2015
+ * Copyright IBM Corp. 2008, 2020
*
* Author(s): Carsten Otte <cotte@de.ibm.com>
*/
@@ -28,9 +28,11 @@
#include <asm/switch_to.h>
#include <asm/nmi.h>
#include <asm/airq.h>
+#include <asm/tpi.h>
#include "kvm-s390.h"
#include "gaccess.h"
#include "trace-s390.h"
+#include "pci.h"
#define PFAULT_INIT 0x0600
#define PFAULT_DONE 0x0680
@@ -81,8 +83,9 @@ static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id)
struct esca_block *sca = vcpu->kvm->arch.sca;
union esca_sigp_ctrl *sigp_ctrl =
&(sca->cpu[vcpu->vcpu_id].sigp_ctrl);
- union esca_sigp_ctrl new_val = {0}, old_val = *sigp_ctrl;
+ union esca_sigp_ctrl new_val = {0}, old_val;
+ old_val = READ_ONCE(*sigp_ctrl);
new_val.scn = src_id;
new_val.c = 1;
old_val.c = 0;
@@ -93,8 +96,9 @@ static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id)
struct bsca_block *sca = vcpu->kvm->arch.sca;
union bsca_sigp_ctrl *sigp_ctrl =
&(sca->cpu[vcpu->vcpu_id].sigp_ctrl);
- union bsca_sigp_ctrl new_val = {0}, old_val = *sigp_ctrl;
+ union bsca_sigp_ctrl new_val = {0}, old_val;
+ old_val = READ_ONCE(*sigp_ctrl);
new_val.scn = src_id;
new_val.c = 1;
old_val.c = 0;
@@ -124,16 +128,18 @@ static void sca_clear_ext_call(struct kvm_vcpu *vcpu)
struct esca_block *sca = vcpu->kvm->arch.sca;
union esca_sigp_ctrl *sigp_ctrl =
&(sca->cpu[vcpu->vcpu_id].sigp_ctrl);
- union esca_sigp_ctrl old = *sigp_ctrl;
+ union esca_sigp_ctrl old;
+ old = READ_ONCE(*sigp_ctrl);
expect = old.value;
rc = cmpxchg(&sigp_ctrl->value, old.value, 0);
} else {
struct bsca_block *sca = vcpu->kvm->arch.sca;
union bsca_sigp_ctrl *sigp_ctrl =
&(sca->cpu[vcpu->vcpu_id].sigp_ctrl);
- union bsca_sigp_ctrl old = *sigp_ctrl;
+ union bsca_sigp_ctrl old;
+ old = READ_ONCE(*sigp_ctrl);
expect = old.value;
rc = cmpxchg(&sigp_ctrl->value, old.value, 0);
}
@@ -297,11 +303,6 @@ static inline u8 gisa_get_ipm_or_restore_iam(struct kvm_s390_gisa_interrupt *gi)
return 0;
}
-static inline int gisa_in_alert_list(struct kvm_s390_gisa *gisa)
-{
- return READ_ONCE(gisa->next_alert) != (u32)(u64)gisa;
-}
-
static inline void gisa_set_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc)
{
set_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa);
@@ -312,11 +313,6 @@ static inline u8 gisa_get_ipm(struct kvm_s390_gisa *gisa)
return READ_ONCE(gisa->ipm);
}
-static inline void gisa_clear_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc)
-{
- clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa);
-}
-
static inline int gisa_tac_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc)
{
return test_and_clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa);
@@ -324,8 +320,11 @@ static inline int gisa_tac_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc)
static inline unsigned long pending_irqs_no_gisa(struct kvm_vcpu *vcpu)
{
- return vcpu->kvm->arch.float_int.pending_irqs |
- vcpu->arch.local_int.pending_irqs;
+ unsigned long pending = vcpu->kvm->arch.float_int.pending_irqs |
+ vcpu->arch.local_int.pending_irqs;
+
+ pending &= ~vcpu->kvm->arch.float_int.masked_irqs;
+ return pending;
}
static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu)
@@ -383,10 +382,18 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu)
__clear_bit(IRQ_PEND_EXT_CLOCK_COMP, &active_mask);
if (!(vcpu->arch.sie_block->gcr[0] & CR0_CPU_TIMER_SUBMASK))
__clear_bit(IRQ_PEND_EXT_CPU_TIMER, &active_mask);
- if (!(vcpu->arch.sie_block->gcr[0] & CR0_SERVICE_SIGNAL_SUBMASK))
+ if (!(vcpu->arch.sie_block->gcr[0] & CR0_SERVICE_SIGNAL_SUBMASK)) {
__clear_bit(IRQ_PEND_EXT_SERVICE, &active_mask);
+ __clear_bit(IRQ_PEND_EXT_SERVICE_EV, &active_mask);
+ }
if (psw_mchk_disabled(vcpu))
active_mask &= ~IRQ_PEND_MCHK_MASK;
+ /* PV guest cpus can have a single interruption injected at a time. */
+ if (kvm_s390_pv_cpu_get_handle(vcpu) &&
+ vcpu->arch.sie_block->iictl != IICTL_CODE_NONE)
+ active_mask &= ~(IRQ_PEND_EXT_II_MASK |
+ IRQ_PEND_IO_MASK |
+ IRQ_PEND_MCHK_MASK);
/*
* Check both floating and local interrupt's cr14 because
* bit IRQ_PEND_MCHK_REP could be set in both cases.
@@ -408,13 +415,13 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu)
static void __set_cpu_idle(struct kvm_vcpu *vcpu)
{
kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT);
- set_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask);
+ set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.idle_mask);
}
static void __unset_cpu_idle(struct kvm_vcpu *vcpu)
{
kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT);
- clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask);
+ clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.idle_mask);
}
static void __reset_intercept_indicators(struct kvm_vcpu *vcpu)
@@ -479,19 +486,23 @@ static void set_intercept_indicators(struct kvm_vcpu *vcpu)
static int __must_check __deliver_cpu_timer(struct kvm_vcpu *vcpu)
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
- int rc;
+ int rc = 0;
vcpu->stat.deliver_cputm++;
trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_CPU_TIMER,
0, 0);
-
- rc = put_guest_lc(vcpu, EXT_IRQ_CPU_TIMER,
- (u16 *)__LC_EXT_INT_CODE);
- rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR);
- rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
- &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
- rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
- &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ vcpu->arch.sie_block->iictl = IICTL_CODE_EXT;
+ vcpu->arch.sie_block->eic = EXT_IRQ_CPU_TIMER;
+ } else {
+ rc = put_guest_lc(vcpu, EXT_IRQ_CPU_TIMER,
+ (u16 *)__LC_EXT_INT_CODE);
+ rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR);
+ rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
+ &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+ rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
+ &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+ }
clear_bit(IRQ_PEND_EXT_CPU_TIMER, &li->pending_irqs);
return rc ? -EFAULT : 0;
}
@@ -499,19 +510,23 @@ static int __must_check __deliver_cpu_timer(struct kvm_vcpu *vcpu)
static int __must_check __deliver_ckc(struct kvm_vcpu *vcpu)
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
- int rc;
+ int rc = 0;
vcpu->stat.deliver_ckc++;
trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_CLOCK_COMP,
0, 0);
-
- rc = put_guest_lc(vcpu, EXT_IRQ_CLK_COMP,
- (u16 __user *)__LC_EXT_INT_CODE);
- rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR);
- rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
- &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
- rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
- &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ vcpu->arch.sie_block->iictl = IICTL_CODE_EXT;
+ vcpu->arch.sie_block->eic = EXT_IRQ_CLK_COMP;
+ } else {
+ rc = put_guest_lc(vcpu, EXT_IRQ_CLK_COMP,
+ (u16 __user *)__LC_EXT_INT_CODE);
+ rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR);
+ rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
+ &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+ rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
+ &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+ }
clear_bit(IRQ_PEND_EXT_CLOCK_COMP, &li->pending_irqs);
return rc ? -EFAULT : 0;
}
@@ -553,6 +568,20 @@ static int __write_machine_check(struct kvm_vcpu *vcpu,
union mci mci;
int rc;
+ /*
+ * All other possible payload for a machine check (e.g. the register
+ * contents in the save area) will be handled by the ultravisor, as
+ * the hypervisor does not have the needed information for
+ * protected guests.
+ */
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ vcpu->arch.sie_block->iictl = IICTL_CODE_MCHK;
+ vcpu->arch.sie_block->mcic = mchk->mcic;
+ vcpu->arch.sie_block->faddr = mchk->failing_storage_address;
+ vcpu->arch.sie_block->edc = mchk->ext_damage_code;
+ return 0;
+ }
+
mci.val = mchk->mcic;
/* take care of lazy register loading */
save_fpu_regs();
@@ -610,7 +639,7 @@ static int __write_machine_check(struct kvm_vcpu *vcpu,
rc |= put_guest_lc(vcpu, mci.val, (u64 __user *) __LC_MCCK_CODE);
/* Register-save areas */
- if (MACHINE_HAS_VX) {
+ if (cpu_has_vx()) {
convert_vx_to_fp(fprs, (__vector128 *) vcpu->run->s.regs.vrs);
rc |= write_guest_lc(vcpu, __LC_FPREGS_SAVE_AREA, fprs, 128);
} else {
@@ -669,7 +698,7 @@ static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu)
/*
* We indicate floating repressible conditions along with
* other pending conditions. Channel Report Pending and Channel
- * Subsystem damage are the only two and and are indicated by
+ * Subsystem damage are the only two and are indicated by
* bits in mcic and masked in cr14.
*/
if (test_and_clear_bit(IRQ_PEND_MCHK_REP, &fi->pending_irqs)) {
@@ -696,17 +725,21 @@ static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu)
static int __must_check __deliver_restart(struct kvm_vcpu *vcpu)
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
- int rc;
+ int rc = 0;
VCPU_EVENT(vcpu, 3, "%s", "deliver: cpu restart");
vcpu->stat.deliver_restart_signal++;
trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_RESTART, 0, 0);
- rc = write_guest_lc(vcpu,
- offsetof(struct lowcore, restart_old_psw),
- &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
- rc |= read_guest_lc(vcpu, offsetof(struct lowcore, restart_psw),
- &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ vcpu->arch.sie_block->iictl = IICTL_CODE_RESTART;
+ } else {
+ rc = write_guest_lc(vcpu,
+ offsetof(struct lowcore, restart_old_psw),
+ &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+ rc |= read_guest_lc(vcpu, offsetof(struct lowcore, restart_psw),
+ &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+ }
clear_bit(IRQ_PEND_RESTART, &li->pending_irqs);
return rc ? -EFAULT : 0;
}
@@ -748,6 +781,12 @@ static int __must_check __deliver_emergency_signal(struct kvm_vcpu *vcpu)
vcpu->stat.deliver_emergency_signal++;
trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_EMERGENCY,
cpu_addr, 0);
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ vcpu->arch.sie_block->iictl = IICTL_CODE_EXT;
+ vcpu->arch.sie_block->eic = EXT_IRQ_EMERGENCY_SIG;
+ vcpu->arch.sie_block->extcpuaddr = cpu_addr;
+ return 0;
+ }
rc = put_guest_lc(vcpu, EXT_IRQ_EMERGENCY_SIG,
(u16 *)__LC_EXT_INT_CODE);
@@ -776,6 +815,12 @@ static int __must_check __deliver_external_call(struct kvm_vcpu *vcpu)
trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id,
KVM_S390_INT_EXTERNAL_CALL,
extcall.code, 0);
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ vcpu->arch.sie_block->iictl = IICTL_CODE_EXT;
+ vcpu->arch.sie_block->eic = EXT_IRQ_EXTERNAL_CALL;
+ vcpu->arch.sie_block->extcpuaddr = extcall.code;
+ return 0;
+ }
rc = put_guest_lc(vcpu, EXT_IRQ_EXTERNAL_CALL,
(u16 *)__LC_EXT_INT_CODE);
@@ -787,6 +832,21 @@ static int __must_check __deliver_external_call(struct kvm_vcpu *vcpu)
return rc ? -EFAULT : 0;
}
+static int __deliver_prog_pv(struct kvm_vcpu *vcpu, u16 code)
+{
+ switch (code) {
+ case PGM_SPECIFICATION:
+ vcpu->arch.sie_block->iictl = IICTL_CODE_SPECIFICATION;
+ break;
+ case PGM_OPERAND:
+ vcpu->arch.sie_block->iictl = IICTL_CODE_OPERAND;
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
@@ -807,6 +867,10 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_PROGRAM_INT,
pgm_info.code, 0);
+ /* PER is handled by the ultravisor */
+ if (kvm_s390_pv_cpu_is_protected(vcpu))
+ return __deliver_prog_pv(vcpu, pgm_info.code & ~PGM_PER);
+
switch (pgm_info.code & ~PGM_PER) {
case PGM_AFX_TRANSLATION:
case PGM_ASX_TRANSLATION:
@@ -818,7 +882,7 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
case PGM_PRIMARY_AUTHORITY:
case PGM_SECONDARY_AUTHORITY:
nullifying = true;
- /* fall through */
+ fallthrough;
case PGM_SPACE_SWITCH:
rc = put_guest_lc(vcpu, pgm_info.trans_exc_code,
(u64 *)__LC_TRANS_EXC_CODE);
@@ -892,7 +956,7 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
/* bit 1+2 of the target are the ilc, so we can directly use ilen */
rc |= put_guest_lc(vcpu, ilen, (u16 *) __LC_PGM_ILC);
rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->gbea,
- (u64 *) __LC_LAST_BREAK);
+ (u64 *) __LC_PGM_LAST_BREAK);
rc |= put_guest_lc(vcpu, pgm_info.code,
(u16 *)__LC_PGM_INT_CODE);
rc |= write_guest_lc(vcpu, __LC_PGM_OLD_PSW,
@@ -902,20 +966,49 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
return rc ? -EFAULT : 0;
}
+#define SCCB_MASK 0xFFFFFFF8
+#define SCCB_EVENT_PENDING 0x3
+
+static int write_sclp(struct kvm_vcpu *vcpu, u32 parm)
+{
+ int rc;
+
+ if (kvm_s390_pv_cpu_get_handle(vcpu)) {
+ vcpu->arch.sie_block->iictl = IICTL_CODE_EXT;
+ vcpu->arch.sie_block->eic = EXT_IRQ_SERVICE_SIG;
+ vcpu->arch.sie_block->eiparams = parm;
+ return 0;
+ }
+
+ rc = put_guest_lc(vcpu, EXT_IRQ_SERVICE_SIG, (u16 *)__LC_EXT_INT_CODE);
+ rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR);
+ rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
+ &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+ rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
+ &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+ rc |= put_guest_lc(vcpu, parm,
+ (u32 *)__LC_EXT_PARAMS);
+
+ return rc ? -EFAULT : 0;
+}
+
static int __must_check __deliver_service(struct kvm_vcpu *vcpu)
{
struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
struct kvm_s390_ext_info ext;
- int rc = 0;
spin_lock(&fi->lock);
- if (!(test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs))) {
+ if (test_bit(IRQ_PEND_EXT_SERVICE, &fi->masked_irqs) ||
+ !(test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs))) {
spin_unlock(&fi->lock);
return 0;
}
ext = fi->srv_signal;
memset(&fi->srv_signal, 0, sizeof(ext));
clear_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs);
+ clear_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs);
+ if (kvm_s390_pv_cpu_is_protected(vcpu))
+ set_bit(IRQ_PEND_EXT_SERVICE, &fi->masked_irqs);
spin_unlock(&fi->lock);
VCPU_EVENT(vcpu, 4, "deliver: sclp parameter 0x%x",
@@ -924,16 +1017,31 @@ static int __must_check __deliver_service(struct kvm_vcpu *vcpu)
trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_SERVICE,
ext.ext_params, 0);
- rc = put_guest_lc(vcpu, EXT_IRQ_SERVICE_SIG, (u16 *)__LC_EXT_INT_CODE);
- rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR);
- rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
- &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
- rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
- &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
- rc |= put_guest_lc(vcpu, ext.ext_params,
- (u32 *)__LC_EXT_PARAMS);
+ return write_sclp(vcpu, ext.ext_params);
+}
- return rc ? -EFAULT : 0;
+static int __must_check __deliver_service_ev(struct kvm_vcpu *vcpu)
+{
+ struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
+ struct kvm_s390_ext_info ext;
+
+ spin_lock(&fi->lock);
+ if (!(test_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs))) {
+ spin_unlock(&fi->lock);
+ return 0;
+ }
+ ext = fi->srv_signal;
+ /* only clear the event bit */
+ fi->srv_signal.ext_params &= ~SCCB_EVENT_PENDING;
+ clear_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs);
+ spin_unlock(&fi->lock);
+
+ VCPU_EVENT(vcpu, 4, "%s", "deliver: sclp parameter event");
+ vcpu->stat.deliver_service_signal++;
+ trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_SERVICE,
+ ext.ext_params, 0);
+
+ return write_sclp(vcpu, SCCB_EVENT_PENDING);
}
static int __must_check __deliver_pfault_done(struct kvm_vcpu *vcpu)
@@ -1028,6 +1136,15 @@ static int __do_deliver_io(struct kvm_vcpu *vcpu, struct kvm_s390_io_info *io)
{
int rc;
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ vcpu->arch.sie_block->iictl = IICTL_CODE_IO;
+ vcpu->arch.sie_block->subchannel_id = io->subchannel_id;
+ vcpu->arch.sie_block->subchannel_nr = io->subchannel_nr;
+ vcpu->arch.sie_block->io_int_parm = io->io_int_parm;
+ vcpu->arch.sie_block->io_int_word = io->io_int_word;
+ return 0;
+ }
+
rc = put_guest_lc(vcpu, io->subchannel_id, (u16 *)__LC_SUBCHANNEL_ID);
rc |= put_guest_lc(vcpu, io->subchannel_nr, (u16 *)__LC_SUBCHANNEL_NR);
rc |= put_guest_lc(vcpu, io->io_int_parm, (u32 *)__LC_IO_INT_PARM);
@@ -1166,7 +1283,7 @@ static u64 __calculate_sltime(struct kvm_vcpu *vcpu)
/* already expired? */
if (cputm >> 63)
return 0;
- return min(sltime, tod_to_ns(cputm));
+ return min_t(u64, sltime, tod_to_ns(cputm));
}
} else if (cpu_timer_interrupts_enabled(vcpu)) {
sltime = kvm_s390_get_cpu_timer(vcpu);
@@ -1213,10 +1330,11 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
hrtimer_start(&vcpu->arch.ckc_timer, sltime, HRTIMER_MODE_REL);
VCPU_EVENT(vcpu, 4, "enabled wait: %llu ns", sltime);
no_timer:
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
- kvm_vcpu_block(vcpu);
+ kvm_vcpu_srcu_read_unlock(vcpu);
+ kvm_vcpu_halt(vcpu);
+ vcpu->valid_wakeup = false;
__unset_cpu_idle(vcpu);
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_vcpu_srcu_read_lock(vcpu);
hrtimer_cancel(&vcpu->arch.ckc_timer);
return 0;
@@ -1269,6 +1387,7 @@ int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
int rc = 0;
+ bool delivered = false;
unsigned long irq_type;
unsigned long irqs;
@@ -1329,6 +1448,9 @@ int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
case IRQ_PEND_EXT_SERVICE:
rc = __deliver_service(vcpu);
break;
+ case IRQ_PEND_EXT_SERVICE_EV:
+ rc = __deliver_service_ev(vcpu);
+ break;
case IRQ_PEND_PFAULT_DONE:
rc = __deliver_pfault_done(vcpu);
break;
@@ -1339,6 +1461,19 @@ int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
WARN_ONCE(1, "Unknown pending irq type %ld", irq_type);
clear_bit(irq_type, &li->pending_irqs);
}
+ delivered |= !rc;
+ }
+
+ /*
+ * We delivered at least one interrupt and modified the PC. Force a
+ * singlestep event now.
+ */
+ if (delivered && guestdbg_sstep_enabled(vcpu)) {
+ struct kvm_debug_exit_arch *debug_exit = &vcpu->run->debug.arch;
+
+ debug_exit->addr = vcpu->arch.sie_block->gpsw.addr;
+ debug_exit->type = KVM_SINGLESTEP;
+ vcpu->guest_debug |= KVM_GUESTDBG_EXIT_PENDING;
}
set_intercept_indicators(vcpu);
@@ -1421,7 +1556,7 @@ static int __inject_extcall(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
if (kvm_get_vcpu_by_id(vcpu->kvm, src_id) == NULL)
return -EINVAL;
- if (sclp.has_sigpif)
+ if (sclp.has_sigpif && !kvm_s390_pv_cpu_get_handle(vcpu))
return sca_inject_ext_call(vcpu, src_id);
if (test_and_set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs))
@@ -1668,7 +1803,7 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
goto out;
}
gisa_out:
- tmp_inti = kzalloc(sizeof(*inti), GFP_KERNEL);
+ tmp_inti = kzalloc(sizeof(*inti), GFP_KERNEL_ACCOUNT);
if (tmp_inti) {
tmp_inti->type = KVM_S390_INT_IO(1, 0, 0, 0);
tmp_inti->io.io_int_word = isc_to_int_word(isc);
@@ -1681,9 +1816,6 @@ out:
return inti;
}
-#define SCCB_MASK 0xFFFFFFF8
-#define SCCB_EVENT_PENDING 0x3
-
static int __inject_service(struct kvm *kvm,
struct kvm_s390_interrupt_info *inti)
{
@@ -1692,6 +1824,11 @@ static int __inject_service(struct kvm *kvm,
kvm->stat.inject_service_signal++;
spin_lock(&fi->lock);
fi->srv_signal.ext_params |= inti->ext.ext_params & SCCB_EVENT_PENDING;
+
+ /* We always allow events, track them separately from the sccb ints */
+ if (fi->srv_signal.ext_params & SCCB_EVENT_PENDING)
+ set_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs);
+
/*
* Early versions of the QEMU s390 bios will inject several
* service interrupts after another without handling a
@@ -1773,6 +1910,12 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
kvm->stat.inject_io++;
isc = int_word_to_isc(inti->io.io_int_word);
+ /*
+ * We do not use the lock checking variant as this is just a
+ * performance optimization and we do not hold the lock here.
+ * This is ok as the code will pick interrupts from both "lists"
+ * for delivery.
+ */
if (gi->origin && inti->type & KVM_S390_INT_IO_AI_MASK) {
VM_EVENT(kvm, 4, "%s isc %1u", "inject: I/O (AI/gisa)", isc);
gisa_set_ipm_gisc(gi->origin, isc);
@@ -1834,7 +1977,8 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type)
break;
case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
if (!(type & KVM_S390_INT_IO_AI_MASK &&
- kvm->arch.gisa_int.origin))
+ kvm->arch.gisa_int.origin) ||
+ kvm_s390_pv_cpu_get_handle(dst_vcpu))
kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_IO_INT);
break;
default:
@@ -1881,7 +2025,7 @@ int kvm_s390_inject_vm(struct kvm *kvm,
struct kvm_s390_interrupt_info *inti;
int rc;
- inti = kzalloc(sizeof(*inti), GFP_KERNEL);
+ inti = kzalloc(sizeof(*inti), GFP_KERNEL_ACCOUNT);
if (!inti)
return -ENOMEM;
@@ -1981,6 +2125,13 @@ int kvm_s390_is_stop_irq_pending(struct kvm_vcpu *vcpu)
return test_bit(IRQ_PEND_SIGP_STOP, &li->pending_irqs);
}
+int kvm_s390_is_restart_irq_pending(struct kvm_vcpu *vcpu)
+{
+ struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
+
+ return test_bit(IRQ_PEND_RESTART, &li->pending_irqs);
+}
+
void kvm_s390_clear_stop_irq(struct kvm_vcpu *vcpu)
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
@@ -2080,6 +2231,10 @@ void kvm_s390_clear_float_irqs(struct kvm *kvm)
struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
int i;
+ mutex_lock(&kvm->lock);
+ if (!kvm_s390_pv_is_protected(kvm))
+ fi->masked_irqs = 0;
+ mutex_unlock(&kvm->lock);
spin_lock(&fi->lock);
fi->pending_irqs = 0;
memset(&fi->srv_signal, 0, sizeof(fi->srv_signal));
@@ -2146,7 +2301,8 @@ static int get_all_floating_irqs(struct kvm *kvm, u8 __user *usrbuf, u64 len)
n++;
}
}
- if (test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs)) {
+ if (test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs) ||
+ test_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs)) {
if (n == max_irqs) {
/* signal userspace to try again */
ret = -ENOMEM;
@@ -2190,7 +2346,7 @@ static int flic_ais_mode_get_all(struct kvm *kvm, struct kvm_device_attr *attr)
return -EINVAL;
if (!test_kvm_facility(kvm, 72))
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
mutex_lock(&fi->ais_lock);
ais.simm = fi->simm;
@@ -2275,7 +2431,7 @@ static int enqueue_floating_irq(struct kvm_device *dev,
return -EINVAL;
while (len >= sizeof(struct kvm_s390_irq)) {
- inti = kzalloc(sizeof(*inti), GFP_KERNEL);
+ inti = kzalloc(sizeof(*inti), GFP_KERNEL_ACCOUNT);
if (!inti)
return -ENOMEM;
@@ -2323,13 +2479,10 @@ static int register_io_adapter(struct kvm_device *dev,
if (dev->kvm->arch.adapters[adapter_info.id] != NULL)
return -EINVAL;
- adapter = kzalloc(sizeof(*adapter), GFP_KERNEL);
+ adapter = kzalloc(sizeof(*adapter), GFP_KERNEL_ACCOUNT);
if (!adapter)
return -ENOMEM;
- INIT_LIST_HEAD(&adapter->maps);
- init_rwsem(&adapter->maps_lock);
- atomic_set(&adapter->nr_maps, 0);
adapter->id = adapter_info.id;
adapter->isc = adapter_info.isc;
adapter->maskable = adapter_info.maskable;
@@ -2354,87 +2507,12 @@ int kvm_s390_mask_adapter(struct kvm *kvm, unsigned int id, bool masked)
return ret;
}
-static int kvm_s390_adapter_map(struct kvm *kvm, unsigned int id, __u64 addr)
-{
- struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
- struct s390_map_info *map;
- int ret;
-
- if (!adapter || !addr)
- return -EINVAL;
-
- map = kzalloc(sizeof(*map), GFP_KERNEL);
- if (!map) {
- ret = -ENOMEM;
- goto out;
- }
- INIT_LIST_HEAD(&map->list);
- map->guest_addr = addr;
- map->addr = gmap_translate(kvm->arch.gmap, addr);
- if (map->addr == -EFAULT) {
- ret = -EFAULT;
- goto out;
- }
- ret = get_user_pages_fast(map->addr, 1, FOLL_WRITE, &map->page);
- if (ret < 0)
- goto out;
- BUG_ON(ret != 1);
- down_write(&adapter->maps_lock);
- if (atomic_inc_return(&adapter->nr_maps) < MAX_S390_ADAPTER_MAPS) {
- list_add_tail(&map->list, &adapter->maps);
- ret = 0;
- } else {
- put_page(map->page);
- ret = -EINVAL;
- }
- up_write(&adapter->maps_lock);
-out:
- if (ret)
- kfree(map);
- return ret;
-}
-
-static int kvm_s390_adapter_unmap(struct kvm *kvm, unsigned int id, __u64 addr)
-{
- struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
- struct s390_map_info *map, *tmp;
- int found = 0;
-
- if (!adapter || !addr)
- return -EINVAL;
-
- down_write(&adapter->maps_lock);
- list_for_each_entry_safe(map, tmp, &adapter->maps, list) {
- if (map->guest_addr == addr) {
- found = 1;
- atomic_dec(&adapter->nr_maps);
- list_del(&map->list);
- put_page(map->page);
- kfree(map);
- break;
- }
- }
- up_write(&adapter->maps_lock);
-
- return found ? 0 : -EINVAL;
-}
-
void kvm_s390_destroy_adapters(struct kvm *kvm)
{
int i;
- struct s390_map_info *map, *tmp;
- for (i = 0; i < MAX_S390_IO_ADAPTERS; i++) {
- if (!kvm->arch.adapters[i])
- continue;
- list_for_each_entry_safe(map, tmp,
- &kvm->arch.adapters[i]->maps, list) {
- list_del(&map->list);
- put_page(map->page);
- kfree(map);
- }
+ for (i = 0; i < MAX_S390_IO_ADAPTERS; i++)
kfree(kvm->arch.adapters[i]);
- }
}
static int modify_io_adapter(struct kvm_device *dev,
@@ -2456,11 +2534,14 @@ static int modify_io_adapter(struct kvm_device *dev,
if (ret > 0)
ret = 0;
break;
+ /*
+ * The following operations are no longer needed and therefore no-ops.
+ * The gpa to hva translation is done when an IRQ route is set up. The
+ * set_irq code uses get_user_pages_remote() to do the actual write.
+ */
case KVM_S390_IO_ADAPTER_MAP:
- ret = kvm_s390_adapter_map(dev->kvm, req.id, req.addr);
- break;
case KVM_S390_IO_ADAPTER_UNMAP:
- ret = kvm_s390_adapter_unmap(dev->kvm, req.id, req.addr);
+ ret = 0;
break;
default:
ret = -EINVAL;
@@ -2499,7 +2580,7 @@ static int modify_ais_mode(struct kvm *kvm, struct kvm_device_attr *attr)
int ret = 0;
if (!test_kvm_facility(kvm, 72))
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
if (copy_from_user(&req, (void __user *)attr->addr, sizeof(req)))
return -EFAULT;
@@ -2579,7 +2660,7 @@ static int flic_ais_mode_set_all(struct kvm *kvm, struct kvm_device_attr *attr)
struct kvm_s390_ais_all ais;
if (!test_kvm_facility(kvm, 72))
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
if (copy_from_user(&ais, (void __user *)attr->addr, sizeof(ais)))
return -EFAULT;
@@ -2595,7 +2676,7 @@ static int flic_ais_mode_set_all(struct kvm *kvm, struct kvm_device_attr *attr)
static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
{
int r = 0;
- unsigned int i;
+ unsigned long i;
struct kvm_vcpu *vcpu;
switch (attr->group) {
@@ -2699,19 +2780,15 @@ static unsigned long get_ind_bit(__u64 addr, unsigned long bit_nr, bool swap)
return swap ? (bit ^ (BITS_PER_LONG - 1)) : bit;
}
-static struct s390_map_info *get_map_info(struct s390_io_adapter *adapter,
- u64 addr)
+static struct page *get_map_page(struct kvm *kvm, u64 uaddr)
{
- struct s390_map_info *map;
+ struct page *page = NULL;
- if (!adapter)
- return NULL;
-
- list_for_each_entry(map, &adapter->maps, list) {
- if (map->guest_addr == addr)
- return map;
- }
- return NULL;
+ mmap_read_lock(kvm->mm);
+ get_user_pages_remote(kvm->mm, uaddr, 1, FOLL_WRITE,
+ &page, NULL);
+ mmap_read_unlock(kvm->mm);
+ return page;
}
static int adapter_indicators_set(struct kvm *kvm,
@@ -2720,30 +2797,35 @@ static int adapter_indicators_set(struct kvm *kvm,
{
unsigned long bit;
int summary_set, idx;
- struct s390_map_info *info;
+ struct page *ind_page, *summary_page;
void *map;
- info = get_map_info(adapter, adapter_int->ind_addr);
- if (!info)
+ ind_page = get_map_page(kvm, adapter_int->ind_addr);
+ if (!ind_page)
return -1;
- map = page_address(info->page);
- bit = get_ind_bit(info->addr, adapter_int->ind_offset, adapter->swap);
- set_bit(bit, map);
- idx = srcu_read_lock(&kvm->srcu);
- mark_page_dirty(kvm, info->guest_addr >> PAGE_SHIFT);
- set_page_dirty_lock(info->page);
- info = get_map_info(adapter, adapter_int->summary_addr);
- if (!info) {
- srcu_read_unlock(&kvm->srcu, idx);
+ summary_page = get_map_page(kvm, adapter_int->summary_addr);
+ if (!summary_page) {
+ put_page(ind_page);
return -1;
}
- map = page_address(info->page);
- bit = get_ind_bit(info->addr, adapter_int->summary_offset,
- adapter->swap);
+
+ idx = srcu_read_lock(&kvm->srcu);
+ map = page_address(ind_page);
+ bit = get_ind_bit(adapter_int->ind_addr,
+ adapter_int->ind_offset, adapter->swap);
+ set_bit(bit, map);
+ mark_page_dirty(kvm, adapter_int->ind_addr >> PAGE_SHIFT);
+ set_page_dirty_lock(ind_page);
+ map = page_address(summary_page);
+ bit = get_ind_bit(adapter_int->summary_addr,
+ adapter_int->summary_offset, adapter->swap);
summary_set = test_and_set_bit(bit, map);
- mark_page_dirty(kvm, info->guest_addr >> PAGE_SHIFT);
- set_page_dirty_lock(info->page);
+ mark_page_dirty(kvm, adapter_int->summary_addr >> PAGE_SHIFT);
+ set_page_dirty_lock(summary_page);
srcu_read_unlock(&kvm->srcu, idx);
+
+ put_page(ind_page);
+ put_page(summary_page);
return summary_set ? 0 : 1;
}
@@ -2765,9 +2847,7 @@ static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e,
adapter = get_io_adapter(kvm, e->adapter.adapter_id);
if (!adapter)
return -1;
- down_read(&adapter->maps_lock);
ret = adapter_indicators_set(kvm, adapter, &e->adapter);
- up_read(&adapter->maps_lock);
if ((ret > 0) && !adapter->masked) {
ret = kvm_s390_inject_airq(kvm, adapter);
if (ret == 0)
@@ -2818,23 +2898,27 @@ int kvm_set_routing_entry(struct kvm *kvm,
struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue)
{
- int ret;
+ u64 uaddr;
switch (ue->type) {
+ /* we store the userspace addresses instead of the guest addresses */
case KVM_IRQ_ROUTING_S390_ADAPTER:
e->set = set_adapter_int;
- e->adapter.summary_addr = ue->u.adapter.summary_addr;
- e->adapter.ind_addr = ue->u.adapter.ind_addr;
+ uaddr = gmap_translate(kvm->arch.gmap, ue->u.adapter.summary_addr);
+ if (uaddr == -EFAULT)
+ return -EFAULT;
+ e->adapter.summary_addr = uaddr;
+ uaddr = gmap_translate(kvm->arch.gmap, ue->u.adapter.ind_addr);
+ if (uaddr == -EFAULT)
+ return -EFAULT;
+ e->adapter.ind_addr = uaddr;
e->adapter.summary_offset = ue->u.adapter.summary_offset;
e->adapter.ind_offset = ue->u.adapter.ind_offset;
e->adapter.adapter_id = ue->u.adapter.adapter_id;
- ret = 0;
- break;
+ return 0;
default:
- ret = -EINVAL;
+ return -EINVAL;
}
-
- return ret;
}
int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
@@ -2983,18 +3067,19 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len)
static void __airqs_kick_single_vcpu(struct kvm *kvm, u8 deliverable_mask)
{
- int vcpu_id, online_vcpus = atomic_read(&kvm->online_vcpus);
+ int vcpu_idx, online_vcpus = atomic_read(&kvm->online_vcpus);
struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int;
struct kvm_vcpu *vcpu;
+ u8 vcpu_isc_mask;
- for_each_set_bit(vcpu_id, kvm->arch.idle_mask, online_vcpus) {
- vcpu = kvm_get_vcpu(kvm, vcpu_id);
+ for_each_set_bit(vcpu_idx, kvm->arch.idle_mask, online_vcpus) {
+ vcpu = kvm_get_vcpu(kvm, vcpu_idx);
if (psw_ioint_disabled(vcpu))
continue;
- deliverable_mask &= (u8)(vcpu->arch.sie_block->gcr[6] >> 24);
- if (deliverable_mask) {
+ vcpu_isc_mask = (u8)(vcpu->arch.sie_block->gcr[6] >> 24);
+ if (deliverable_mask & vcpu_isc_mask) {
/* lately kicked but not yet running */
- if (test_and_set_bit(vcpu_id, gi->kicked_mask))
+ if (test_and_set_bit(vcpu_idx, gi->kicked_mask))
return;
kvm_s390_vcpu_wakeup(vcpu);
return;
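
Note on the hunk above: the old code and-ed the per-vCPU ISC mask into deliverable_mask itself, so one idle vCPU with a non-matching mask could zero the pending set for every vCPU checked after it. Worked example with the 8-bit ISC masks taken from (u8)(gcr[6] >> 24), ISC 0 in the 0x80 position: deliverable_mask = 0x80 (ISC 0 pending) and the first idle vCPU enabled only for ISC 7 (mask 0x01). Old code: deliverable_mask &= 0x01 leaves 0x00, so no later vCPU can ever be woken for this alert. New code: 0x80 & 0x01 == 0 merely skips this vCPU and keeps 0x80 for the next candidate.
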
@@ -3015,7 +3100,7 @@ static enum hrtimer_restart gisa_vcpu_kicker(struct hrtimer *timer)
__airqs_kick_single_vcpu(kvm, pending_mask);
hrtimer_forward_now(timer, ns_to_ktime(gi->expires));
return HRTIMER_RESTART;
- };
+ }
return HRTIMER_NORESTART;
}
@@ -3027,9 +3112,9 @@ static enum hrtimer_restart gisa_vcpu_kicker(struct hrtimer *timer)
static void process_gib_alert_list(void)
{
struct kvm_s390_gisa_interrupt *gi;
+ u32 final, gisa_phys, origin = 0UL;
struct kvm_s390_gisa *gisa;
struct kvm *kvm;
- u32 final, origin = 0UL;
do {
/*
@@ -3055,9 +3140,10 @@ static void process_gib_alert_list(void)
* interruptions asap.
*/
while (origin & GISA_ADDR_MASK) {
- gisa = (struct kvm_s390_gisa *)(u64)origin;
+ gisa_phys = origin;
+ gisa = phys_to_virt(gisa_phys);
origin = gisa->next_alert;
- gisa->next_alert = (u32)(u64)gisa;
+ gisa->next_alert = gisa_phys;
kvm = container_of(gisa, struct sie_page2, gisa)->kvm;
gi = &kvm->arch.gisa_int;
if (hrtimer_active(&gi->timer))
@@ -3091,23 +3177,67 @@ void kvm_s390_gisa_init(struct kvm *kvm)
hrtimer_init(&gi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
gi->timer.function = gisa_vcpu_kicker;
memset(gi->origin, 0, sizeof(struct kvm_s390_gisa));
- gi->origin->next_alert = (u32)(u64)gi->origin;
+ gi->origin->next_alert = (u32)virt_to_phys(gi->origin);
VM_EVENT(kvm, 3, "gisa 0x%pK initialized", gi->origin);
}
+void kvm_s390_gisa_enable(struct kvm *kvm)
+{
+ struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int;
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+ u32 gisa_desc;
+
+ if (gi->origin)
+ return;
+ kvm_s390_gisa_init(kvm);
+ gisa_desc = kvm_s390_get_gisa_desc(kvm);
+ if (!gisa_desc)
+ return;
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ mutex_lock(&vcpu->mutex);
+ vcpu->arch.sie_block->gd = gisa_desc;
+ vcpu->arch.sie_block->eca |= ECA_AIV;
+ VCPU_EVENT(vcpu, 3, "AIV gisa format-%u enabled for cpu %03u",
+ vcpu->arch.sie_block->gd & 0x3, vcpu->vcpu_id);
+ mutex_unlock(&vcpu->mutex);
+ }
+}
+
void kvm_s390_gisa_destroy(struct kvm *kvm)
{
struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int;
+ struct kvm_s390_gisa *gisa = gi->origin;
if (!gi->origin)
return;
- if (gi->alert.mask)
- KVM_EVENT(3, "vm 0x%pK has unexpected iam 0x%02x",
- kvm, gi->alert.mask);
- while (gisa_in_alert_list(gi->origin))
- cpu_relax();
+ WARN(gi->alert.mask != 0x00,
+ "unexpected non zero alert.mask 0x%02x",
+ gi->alert.mask);
+ gi->alert.mask = 0x00;
+ if (gisa_set_iam(gi->origin, gi->alert.mask))
+ process_gib_alert_list();
hrtimer_cancel(&gi->timer);
gi->origin = NULL;
+ VM_EVENT(kvm, 3, "gisa 0x%pK destroyed", gisa);
+}
+
+void kvm_s390_gisa_disable(struct kvm *kvm)
+{
+ struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int;
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+
+ if (!gi->origin)
+ return;
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ mutex_lock(&vcpu->mutex);
+ vcpu->arch.sie_block->eca &= ~ECA_AIV;
+ vcpu->arch.sie_block->gd = 0U;
+ mutex_unlock(&vcpu->mutex);
+ VCPU_EVENT(vcpu, 3, "AIV disabled for cpu %03u", vcpu->vcpu_id);
+ }
+ kvm_s390_gisa_destroy(kvm);
}
/**
@@ -3193,29 +3323,111 @@ out:
}
EXPORT_SYMBOL_GPL(kvm_s390_gisc_unregister);
-static void gib_alert_irq_handler(struct airq_struct *airq, bool floating)
+static void aen_host_forward(unsigned long si)
+{
+ struct kvm_s390_gisa_interrupt *gi;
+ struct zpci_gaite *gaite;
+ struct kvm *kvm;
+
+ gaite = (struct zpci_gaite *)aift->gait +
+ (si * sizeof(struct zpci_gaite));
+ if (gaite->count == 0)
+ return;
+ if (gaite->aisb != 0)
+ set_bit_inv(gaite->aisbo, phys_to_virt(gaite->aisb));
+
+ kvm = kvm_s390_pci_si_to_kvm(aift, si);
+ if (!kvm)
+ return;
+ gi = &kvm->arch.gisa_int;
+
+ if (!(gi->origin->g1.simm & AIS_MODE_MASK(gaite->gisc)) ||
+ !(gi->origin->g1.nimm & AIS_MODE_MASK(gaite->gisc))) {
+ gisa_set_ipm_gisc(gi->origin, gaite->gisc);
+ if (hrtimer_active(&gi->timer))
+ hrtimer_cancel(&gi->timer);
+ hrtimer_start(&gi->timer, 0, HRTIMER_MODE_REL);
+ kvm->stat.aen_forward++;
+ }
+}
+
+static void aen_process_gait(u8 isc)
{
+ bool found = false, first = true;
+ union zpci_sic_iib iib = {{0}};
+ unsigned long si, flags;
+
+ spin_lock_irqsave(&aift->gait_lock, flags);
+
+ if (!aift->gait) {
+ spin_unlock_irqrestore(&aift->gait_lock, flags);
+ return;
+ }
+
+ for (si = 0;;) {
+ /* Scan adapter summary indicator bit vector */
+ si = airq_iv_scan(aift->sbv, si, airq_iv_end(aift->sbv));
+ if (si == -1UL) {
+ if (first || found) {
+ /* Re-enable interrupts. */
+ zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, isc,
+ &iib);
+ first = found = false;
+ } else {
+ /* Interrupts on and all bits processed */
+ break;
+ }
+ found = false;
+ si = 0;
+ /* Scan again after re-enabling interrupts */
+ continue;
+ }
+ found = true;
+ aen_host_forward(si);
+ }
+
+ spin_unlock_irqrestore(&aift->gait_lock, flags);
+}
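+
+Note on aen_process_gait() above: the first/found bookkeeping closes the race window between the last scan of the summary-bit vector and re-enabling the interrupt. A stand-alone user-space analogue of the same control flow (all helpers are made up; only the scan, re-enable, rescan, stop-when-a-post-re-enable-pass-finds-nothing shape mirrors the kernel loop):
+
+	#include <stdbool.h>
+	#include <stdio.h>
+
+	static unsigned int pending = 0x14;	/* pretend summary bits 2 and 4 are set */
+
+	static long next_pending(long si)	/* stand-in for airq_iv_scan() */
+	{
+		for (; si < 32; si++)
+			if (pending & (1u << si))
+				return si;
+		return -1;
+	}
+
+	static void reenable_irq(void) { }	/* stand-in for zpci_set_irq_ctrl() */
+
+	static void handle(long si)		/* stand-in for aen_host_forward() */
+	{
+		pending &= ~(1u << si);		/* in the kernel, airq_iv_scan() clears the bit */
+		printf("forwarded summary bit %ld\n", si);
+	}
+
+	int main(void)
+	{
+		bool found = false, first = true;
+		long si = 0;
+
+		for (;;) {
+			si = next_pending(si);
+			if (si == -1) {
+				if (first || found) {
+					reenable_irq();	/* new bits may race in from here on */
+					first = found = false;
+					si = 0;
+					continue;	/* one more pass after re-enabling */
+				}
+				break;		/* IRQ enabled and nothing left over */
+			}
+			found = true;
+			handle(si);
+		}
+		return 0;
+	}
+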
+
+static void gib_alert_irq_handler(struct airq_struct *airq,
+ struct tpi_info *tpi_info)
+{
+ struct tpi_adapter_info *info = (struct tpi_adapter_info *)tpi_info;
+
inc_irq_stat(IRQIO_GAL);
- process_gib_alert_list();
+
+ if ((info->forward || info->error) &&
+ IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) {
+ aen_process_gait(info->isc);
+ if (info->aism != 0)
+ process_gib_alert_list();
+ } else {
+ process_gib_alert_list();
+ }
}
static struct airq_struct gib_alert_irq = {
.handler = gib_alert_irq_handler,
- .lsi_ptr = &gib_alert_irq.lsi_mask,
};
void kvm_s390_gib_destroy(void)
{
if (!gib)
return;
+ if (kvm_s390_pci_interp_allowed() && aift) {
+ mutex_lock(&aift->aift_lock);
+ kvm_s390_pci_aen_exit();
+ mutex_unlock(&aift->aift_lock);
+ }
chsc_sgib(0);
unregister_adapter_interrupt(&gib_alert_irq);
free_page((unsigned long)gib);
gib = NULL;
}
-int kvm_s390_gib_init(u8 nisc)
+int __init kvm_s390_gib_init(u8 nisc)
{
+ u32 gib_origin;
int rc = 0;
if (!css_general_characteristics.aiv) {
@@ -3223,7 +3435,7 @@ int kvm_s390_gib_init(u8 nisc)
goto out;
}
- gib = (struct kvm_s390_gib *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
+ gib = (struct kvm_s390_gib *)get_zeroed_page(GFP_KERNEL_ACCOUNT | GFP_DMA);
if (!gib) {
rc = -ENOMEM;
goto out;
@@ -3235,9 +3447,12 @@ int kvm_s390_gib_init(u8 nisc)
rc = -EIO;
goto out_free_gib;
}
+ /* adapter interrupts used for AP (applicable here) don't use the LSI */
+ *gib_alert_irq.lsi_ptr = 0xff;
gib->nisc = nisc;
- if (chsc_sgib((u32)(u64)gib)) {
+ gib_origin = virt_to_phys(gib);
+ if (chsc_sgib(gib_origin)) {
pr_err("Associating the GIB with the AIV facility failed\n");
free_page((unsigned long)gib);
gib = NULL;
@@ -3245,6 +3460,14 @@ int kvm_s390_gib_init(u8 nisc)
goto out_unreg_gal;
}
+ if (kvm_s390_pci_interp_allowed()) {
+ if (kvm_s390_pci_aen_init(nisc)) {
+ pr_err("Initializing AEN for PCI failed\n");
+ rc = -EIO;
+ goto out_unreg_gal;
+ }
+ }
+
KVM_EVENT(3, "gib 0x%pK (nisc=%d) initialized", gib, gib->nisc);
goto out;
diff --git a/arch/s390/kvm/irq.h b/arch/s390/kvm/irq.h
deleted file mode 100644
index 484608c71dd0..000000000000
--- a/arch/s390/kvm/irq.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * s390 irqchip routines
- *
- * Copyright IBM Corp. 2014
- *
- * Author(s): Cornelia Huck <cornelia.huck@de.ibm.com>
- */
-#ifndef __KVM_IRQ_H
-#define __KVM_IRQ_H
-
-#include <linux/kvm_host.h>
-
-static inline int irqchip_in_kernel(struct kvm *kvm)
-{
- return 1;
-}
-
-#endif
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index d9e6bf3d54f0..ea63ac769889 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2,11 +2,10 @@
/*
* hosting IBM Z kernel virtual machines (s390x)
*
- * Copyright IBM Corp. 2008, 2018
+ * Copyright IBM Corp. 2008, 2020
*
* Author(s): Carsten Otte <cotte@de.ibm.com>
* Christian Borntraeger <borntraeger@de.ibm.com>
- * Heiko Carstens <heiko.carstens@de.ibm.com>
* Christian Ehrhardt <ehrhardt@de.ibm.com>
* Jason J. Herne <jjherne@us.ibm.com>
*/
@@ -31,11 +30,12 @@
#include <linux/bitmap.h>
#include <linux/sched/signal.h>
#include <linux/string.h>
+#include <linux/pgtable.h>
+#include <linux/mmu_notifier.h>
#include <asm/asm-offsets.h>
#include <asm/lowcore.h>
#include <asm/stp.h>
-#include <asm/pgtable.h>
#include <asm/gmap.h>
#include <asm/nmi.h>
#include <asm/switch_to.h>
@@ -44,8 +44,11 @@
#include <asm/cpacf.h>
#include <asm/timex.h>
#include <asm/ap.h>
+#include <asm/uv.h>
+#include <asm/fpu/api.h>
#include "kvm-s390.h"
#include "gaccess.h"
+#include "pci.h"
#define CREATE_TRACE_POINTS
#include "trace.h"
@@ -56,118 +59,137 @@
#define VCPU_IRQS_MAX_BUF (sizeof(struct kvm_s390_irq) * \
(KVM_MAX_VCPUS + LOCAL_IRQS))
-#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
-#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
-
-struct kvm_stats_debugfs_item debugfs_entries[] = {
- { "userspace_handled", VCPU_STAT(exit_userspace) },
- { "exit_null", VCPU_STAT(exit_null) },
- { "exit_validity", VCPU_STAT(exit_validity) },
- { "exit_stop_request", VCPU_STAT(exit_stop_request) },
- { "exit_external_request", VCPU_STAT(exit_external_request) },
- { "exit_io_request", VCPU_STAT(exit_io_request) },
- { "exit_external_interrupt", VCPU_STAT(exit_external_interrupt) },
- { "exit_instruction", VCPU_STAT(exit_instruction) },
- { "exit_pei", VCPU_STAT(exit_pei) },
- { "exit_program_interruption", VCPU_STAT(exit_program_interruption) },
- { "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) },
- { "exit_operation_exception", VCPU_STAT(exit_operation_exception) },
- { "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
- { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
- { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
- { "halt_no_poll_steal", VCPU_STAT(halt_no_poll_steal) },
- { "halt_wakeup", VCPU_STAT(halt_wakeup) },
- { "instruction_lctlg", VCPU_STAT(instruction_lctlg) },
- { "instruction_lctl", VCPU_STAT(instruction_lctl) },
- { "instruction_stctl", VCPU_STAT(instruction_stctl) },
- { "instruction_stctg", VCPU_STAT(instruction_stctg) },
- { "deliver_ckc", VCPU_STAT(deliver_ckc) },
- { "deliver_cputm", VCPU_STAT(deliver_cputm) },
- { "deliver_emergency_signal", VCPU_STAT(deliver_emergency_signal) },
- { "deliver_external_call", VCPU_STAT(deliver_external_call) },
- { "deliver_service_signal", VCPU_STAT(deliver_service_signal) },
- { "deliver_virtio", VCPU_STAT(deliver_virtio) },
- { "deliver_stop_signal", VCPU_STAT(deliver_stop_signal) },
- { "deliver_prefix_signal", VCPU_STAT(deliver_prefix_signal) },
- { "deliver_restart_signal", VCPU_STAT(deliver_restart_signal) },
- { "deliver_program", VCPU_STAT(deliver_program) },
- { "deliver_io", VCPU_STAT(deliver_io) },
- { "deliver_machine_check", VCPU_STAT(deliver_machine_check) },
- { "exit_wait_state", VCPU_STAT(exit_wait_state) },
- { "inject_ckc", VCPU_STAT(inject_ckc) },
- { "inject_cputm", VCPU_STAT(inject_cputm) },
- { "inject_external_call", VCPU_STAT(inject_external_call) },
- { "inject_float_mchk", VM_STAT(inject_float_mchk) },
- { "inject_emergency_signal", VCPU_STAT(inject_emergency_signal) },
- { "inject_io", VM_STAT(inject_io) },
- { "inject_mchk", VCPU_STAT(inject_mchk) },
- { "inject_pfault_done", VM_STAT(inject_pfault_done) },
- { "inject_program", VCPU_STAT(inject_program) },
- { "inject_restart", VCPU_STAT(inject_restart) },
- { "inject_service_signal", VM_STAT(inject_service_signal) },
- { "inject_set_prefix", VCPU_STAT(inject_set_prefix) },
- { "inject_stop_signal", VCPU_STAT(inject_stop_signal) },
- { "inject_pfault_init", VCPU_STAT(inject_pfault_init) },
- { "inject_virtio", VM_STAT(inject_virtio) },
- { "instruction_epsw", VCPU_STAT(instruction_epsw) },
- { "instruction_gs", VCPU_STAT(instruction_gs) },
- { "instruction_io_other", VCPU_STAT(instruction_io_other) },
- { "instruction_lpsw", VCPU_STAT(instruction_lpsw) },
- { "instruction_lpswe", VCPU_STAT(instruction_lpswe) },
- { "instruction_pfmf", VCPU_STAT(instruction_pfmf) },
- { "instruction_ptff", VCPU_STAT(instruction_ptff) },
- { "instruction_stidp", VCPU_STAT(instruction_stidp) },
- { "instruction_sck", VCPU_STAT(instruction_sck) },
- { "instruction_sckpf", VCPU_STAT(instruction_sckpf) },
- { "instruction_spx", VCPU_STAT(instruction_spx) },
- { "instruction_stpx", VCPU_STAT(instruction_stpx) },
- { "instruction_stap", VCPU_STAT(instruction_stap) },
- { "instruction_iske", VCPU_STAT(instruction_iske) },
- { "instruction_ri", VCPU_STAT(instruction_ri) },
- { "instruction_rrbe", VCPU_STAT(instruction_rrbe) },
- { "instruction_sske", VCPU_STAT(instruction_sske) },
- { "instruction_ipte_interlock", VCPU_STAT(instruction_ipte_interlock) },
- { "instruction_essa", VCPU_STAT(instruction_essa) },
- { "instruction_stsi", VCPU_STAT(instruction_stsi) },
- { "instruction_stfl", VCPU_STAT(instruction_stfl) },
- { "instruction_tb", VCPU_STAT(instruction_tb) },
- { "instruction_tpi", VCPU_STAT(instruction_tpi) },
- { "instruction_tprot", VCPU_STAT(instruction_tprot) },
- { "instruction_tsch", VCPU_STAT(instruction_tsch) },
- { "instruction_sthyi", VCPU_STAT(instruction_sthyi) },
- { "instruction_sie", VCPU_STAT(instruction_sie) },
- { "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) },
- { "instruction_sigp_sense_running", VCPU_STAT(instruction_sigp_sense_running) },
- { "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) },
- { "instruction_sigp_emergency", VCPU_STAT(instruction_sigp_emergency) },
- { "instruction_sigp_cond_emergency", VCPU_STAT(instruction_sigp_cond_emergency) },
- { "instruction_sigp_start", VCPU_STAT(instruction_sigp_start) },
- { "instruction_sigp_stop", VCPU_STAT(instruction_sigp_stop) },
- { "instruction_sigp_stop_store_status", VCPU_STAT(instruction_sigp_stop_store_status) },
- { "instruction_sigp_store_status", VCPU_STAT(instruction_sigp_store_status) },
- { "instruction_sigp_store_adtl_status", VCPU_STAT(instruction_sigp_store_adtl_status) },
- { "instruction_sigp_set_arch", VCPU_STAT(instruction_sigp_arch) },
- { "instruction_sigp_set_prefix", VCPU_STAT(instruction_sigp_prefix) },
- { "instruction_sigp_restart", VCPU_STAT(instruction_sigp_restart) },
- { "instruction_sigp_cpu_reset", VCPU_STAT(instruction_sigp_cpu_reset) },
- { "instruction_sigp_init_cpu_reset", VCPU_STAT(instruction_sigp_init_cpu_reset) },
- { "instruction_sigp_unknown", VCPU_STAT(instruction_sigp_unknown) },
- { "instruction_diag_10", VCPU_STAT(diagnose_10) },
- { "instruction_diag_44", VCPU_STAT(diagnose_44) },
- { "instruction_diag_9c", VCPU_STAT(diagnose_9c) },
- { "diag_9c_ignored", VCPU_STAT(diagnose_9c_ignored) },
- { "instruction_diag_258", VCPU_STAT(diagnose_258) },
- { "instruction_diag_308", VCPU_STAT(diagnose_308) },
- { "instruction_diag_500", VCPU_STAT(diagnose_500) },
- { "instruction_diag_other", VCPU_STAT(diagnose_other) },
- { NULL }
+const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
+ KVM_GENERIC_VM_STATS(),
+ STATS_DESC_COUNTER(VM, inject_io),
+ STATS_DESC_COUNTER(VM, inject_float_mchk),
+ STATS_DESC_COUNTER(VM, inject_pfault_done),
+ STATS_DESC_COUNTER(VM, inject_service_signal),
+ STATS_DESC_COUNTER(VM, inject_virtio),
+ STATS_DESC_COUNTER(VM, aen_forward),
+ STATS_DESC_COUNTER(VM, gmap_shadow_reuse),
+ STATS_DESC_COUNTER(VM, gmap_shadow_create),
+ STATS_DESC_COUNTER(VM, gmap_shadow_r1_entry),
+ STATS_DESC_COUNTER(VM, gmap_shadow_r2_entry),
+ STATS_DESC_COUNTER(VM, gmap_shadow_r3_entry),
+ STATS_DESC_COUNTER(VM, gmap_shadow_sg_entry),
+ STATS_DESC_COUNTER(VM, gmap_shadow_pg_entry),
};
-struct kvm_s390_tod_clock_ext {
- __u8 epoch_idx;
- __u64 tod;
- __u8 reserved[7];
-} __packed;
+const struct kvm_stats_header kvm_vm_stats_header = {
+ .name_size = KVM_STATS_NAME_SIZE,
+ .num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
+ .id_offset = sizeof(struct kvm_stats_header),
+ .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+ .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+ sizeof(kvm_vm_stats_desc),
+};
+
+const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
+ KVM_GENERIC_VCPU_STATS(),
+ STATS_DESC_COUNTER(VCPU, exit_userspace),
+ STATS_DESC_COUNTER(VCPU, exit_null),
+ STATS_DESC_COUNTER(VCPU, exit_external_request),
+ STATS_DESC_COUNTER(VCPU, exit_io_request),
+ STATS_DESC_COUNTER(VCPU, exit_external_interrupt),
+ STATS_DESC_COUNTER(VCPU, exit_stop_request),
+ STATS_DESC_COUNTER(VCPU, exit_validity),
+ STATS_DESC_COUNTER(VCPU, exit_instruction),
+ STATS_DESC_COUNTER(VCPU, exit_pei),
+ STATS_DESC_COUNTER(VCPU, halt_no_poll_steal),
+ STATS_DESC_COUNTER(VCPU, instruction_lctl),
+ STATS_DESC_COUNTER(VCPU, instruction_lctlg),
+ STATS_DESC_COUNTER(VCPU, instruction_stctl),
+ STATS_DESC_COUNTER(VCPU, instruction_stctg),
+ STATS_DESC_COUNTER(VCPU, exit_program_interruption),
+ STATS_DESC_COUNTER(VCPU, exit_instr_and_program),
+ STATS_DESC_COUNTER(VCPU, exit_operation_exception),
+ STATS_DESC_COUNTER(VCPU, deliver_ckc),
+ STATS_DESC_COUNTER(VCPU, deliver_cputm),
+ STATS_DESC_COUNTER(VCPU, deliver_external_call),
+ STATS_DESC_COUNTER(VCPU, deliver_emergency_signal),
+ STATS_DESC_COUNTER(VCPU, deliver_service_signal),
+ STATS_DESC_COUNTER(VCPU, deliver_virtio),
+ STATS_DESC_COUNTER(VCPU, deliver_stop_signal),
+ STATS_DESC_COUNTER(VCPU, deliver_prefix_signal),
+ STATS_DESC_COUNTER(VCPU, deliver_restart_signal),
+ STATS_DESC_COUNTER(VCPU, deliver_program),
+ STATS_DESC_COUNTER(VCPU, deliver_io),
+ STATS_DESC_COUNTER(VCPU, deliver_machine_check),
+ STATS_DESC_COUNTER(VCPU, exit_wait_state),
+ STATS_DESC_COUNTER(VCPU, inject_ckc),
+ STATS_DESC_COUNTER(VCPU, inject_cputm),
+ STATS_DESC_COUNTER(VCPU, inject_external_call),
+ STATS_DESC_COUNTER(VCPU, inject_emergency_signal),
+ STATS_DESC_COUNTER(VCPU, inject_mchk),
+ STATS_DESC_COUNTER(VCPU, inject_pfault_init),
+ STATS_DESC_COUNTER(VCPU, inject_program),
+ STATS_DESC_COUNTER(VCPU, inject_restart),
+ STATS_DESC_COUNTER(VCPU, inject_set_prefix),
+ STATS_DESC_COUNTER(VCPU, inject_stop_signal),
+ STATS_DESC_COUNTER(VCPU, instruction_epsw),
+ STATS_DESC_COUNTER(VCPU, instruction_gs),
+ STATS_DESC_COUNTER(VCPU, instruction_io_other),
+ STATS_DESC_COUNTER(VCPU, instruction_lpsw),
+ STATS_DESC_COUNTER(VCPU, instruction_lpswe),
+ STATS_DESC_COUNTER(VCPU, instruction_pfmf),
+ STATS_DESC_COUNTER(VCPU, instruction_ptff),
+ STATS_DESC_COUNTER(VCPU, instruction_sck),
+ STATS_DESC_COUNTER(VCPU, instruction_sckpf),
+ STATS_DESC_COUNTER(VCPU, instruction_stidp),
+ STATS_DESC_COUNTER(VCPU, instruction_spx),
+ STATS_DESC_COUNTER(VCPU, instruction_stpx),
+ STATS_DESC_COUNTER(VCPU, instruction_stap),
+ STATS_DESC_COUNTER(VCPU, instruction_iske),
+ STATS_DESC_COUNTER(VCPU, instruction_ri),
+ STATS_DESC_COUNTER(VCPU, instruction_rrbe),
+ STATS_DESC_COUNTER(VCPU, instruction_sske),
+ STATS_DESC_COUNTER(VCPU, instruction_ipte_interlock),
+ STATS_DESC_COUNTER(VCPU, instruction_stsi),
+ STATS_DESC_COUNTER(VCPU, instruction_stfl),
+ STATS_DESC_COUNTER(VCPU, instruction_tb),
+ STATS_DESC_COUNTER(VCPU, instruction_tpi),
+ STATS_DESC_COUNTER(VCPU, instruction_tprot),
+ STATS_DESC_COUNTER(VCPU, instruction_tsch),
+ STATS_DESC_COUNTER(VCPU, instruction_sie),
+ STATS_DESC_COUNTER(VCPU, instruction_essa),
+ STATS_DESC_COUNTER(VCPU, instruction_sthyi),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_sense),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_sense_running),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_external_call),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_emergency),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_cond_emergency),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_start),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_stop),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_stop_store_status),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_store_status),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_store_adtl_status),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_arch),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_prefix),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_restart),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_init_cpu_reset),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_cpu_reset),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_unknown),
+ STATS_DESC_COUNTER(VCPU, instruction_diagnose_10),
+ STATS_DESC_COUNTER(VCPU, instruction_diagnose_44),
+ STATS_DESC_COUNTER(VCPU, instruction_diagnose_9c),
+ STATS_DESC_COUNTER(VCPU, diag_9c_ignored),
+ STATS_DESC_COUNTER(VCPU, diag_9c_forward),
+ STATS_DESC_COUNTER(VCPU, instruction_diagnose_258),
+ STATS_DESC_COUNTER(VCPU, instruction_diagnose_308),
+ STATS_DESC_COUNTER(VCPU, instruction_diagnose_500),
+ STATS_DESC_COUNTER(VCPU, instruction_diagnose_other),
+ STATS_DESC_COUNTER(VCPU, pfault_sync)
+};
+
+const struct kvm_stats_header kvm_vcpu_stats_header = {
+ .name_size = KVM_STATS_NAME_SIZE,
+ .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
+ .id_offset = sizeof(struct kvm_stats_header),
+ .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+ .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+ sizeof(kvm_vcpu_stats_desc),
+};
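+
+Note on the two headers above: they describe the self-contained binary layout exposed by the generic KVM stats file descriptor, so the offsets follow directly from the initializers:
+
+	offset 0      : struct kvm_stats_header
+	id_offset     = sizeof(struct kvm_stats_header)
+	desc_offset   = id_offset + KVM_STATS_NAME_SIZE
+	data_offset   = desc_offset + sizeof(kvm_vm_stats_desc)   (or kvm_vcpu_stats_desc)
+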
/* allow nested virtualization in KVM (if enabled by user space) */
static int nested;
@@ -184,6 +206,24 @@ static u8 halt_poll_max_steal = 10;
module_param(halt_poll_max_steal, byte, 0644);
MODULE_PARM_DESC(halt_poll_max_steal, "Maximum percentage of steal time to allow polling");
+/* if set to true, the GISA will be initialized and used if available */
+static bool use_gisa = true;
+module_param(use_gisa, bool, 0644);
+MODULE_PARM_DESC(use_gisa, "Use the GISA if the host supports it.");
+
+/* maximum diag9c forwarding per second */
+unsigned int diag9c_forwarding_hz;
+module_param(diag9c_forwarding_hz, uint, 0644);
+MODULE_PARM_DESC(diag9c_forwarding_hz, "Maximum diag9c forwarding per second, 0 to turn off");
+
+/*
+ * allow asynchronous deinit for protected guests; enable by default since
+ * the feature is opt-in anyway
+ */
+static int async_destroy = 1;
+module_param(async_destroy, int, 0444);
+MODULE_PARM_DESC(async_destroy, "Asynchronous destroy for protected guests");
+
/*
* For now we handle at most 16 double words as this is what the s390 base
* kernel handles and stores in the prefix page. If we ever need to go beyond
@@ -207,7 +247,7 @@ static unsigned long kvm_s390_fac_size(void)
BUILD_BUG_ON(SIZE_INTERNAL > S390_ARCH_FAC_MASK_SIZE_U64);
BUILD_BUG_ON(SIZE_INTERNAL > S390_ARCH_FAC_LIST_SIZE_U64);
BUILD_BUG_ON(SIZE_INTERNAL * sizeof(unsigned long) >
- sizeof(S390_lowcore.stfle_fac_list));
+ sizeof(stfle_fac_list));
return SIZE_INTERNAL;
}
@@ -220,21 +260,13 @@ static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc;
static struct gmap_notifier gmap_notifier;
static struct gmap_notifier vsie_gmap_notifier;
debug_info_t *kvm_s390_dbf;
+debug_info_t *kvm_s390_dbf_uv;
/* Section: not file related */
-int kvm_arch_hardware_enable(void)
-{
- /* every s390 is virtualization enabled ;-) */
- return 0;
-}
-
-int kvm_arch_check_processor_compat(void)
-{
- return 0;
-}
-
+/* forward declarations */
static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
unsigned long end);
+static int sca_switch_to_extended(struct kvm *kvm);
static void kvm_clock_sync_scb(struct kvm_s390_sie_block *scb, u64 delta)
{
@@ -269,7 +301,7 @@ static int kvm_clock_sync(struct notifier_block *notifier, unsigned long val,
{
struct kvm *kvm;
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
unsigned long long *delta = v;
list_for_each_entry(kvm, &vm_list, vm_list) {
@@ -293,25 +325,6 @@ static struct notifier_block kvm_clock_notifier = {
.notifier_call = kvm_clock_sync,
};
-int kvm_arch_hardware_setup(void)
-{
- gmap_notifier.notifier_call = kvm_gmap_notifier;
- gmap_register_pte_notifier(&gmap_notifier);
- vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier;
- gmap_register_pte_notifier(&vsie_gmap_notifier);
- atomic_notifier_chain_register(&s390_epoch_delta_notifier,
- &kvm_clock_notifier);
- return 0;
-}
-
-void kvm_arch_hardware_unsetup(void)
-{
- gmap_unregister_pte_notifier(&gmap_notifier);
- gmap_unregister_pte_notifier(&vsie_gmap_notifier);
- atomic_notifier_chain_unregister(&s390_epoch_delta_notifier,
- &kvm_clock_notifier);
-}
-
static void allow_cpu_feat(unsigned long nr)
{
set_bit_inv(nr, kvm_s390_available_cpu_feat);
@@ -319,37 +332,37 @@ static void allow_cpu_feat(unsigned long nr)
static inline int plo_test_bit(unsigned char nr)
{
- register unsigned long r0 asm("0") = (unsigned long) nr | 0x100;
+ unsigned long function = (unsigned long)nr | 0x100;
int cc;
asm volatile(
+ " lgr 0,%[function]\n"
/* Parameter registers are ignored for "test bit" */
" plo 0,0,0,0(0)\n"
" ipm %0\n"
" srl %0,28\n"
: "=d" (cc)
- : "d" (r0)
- : "cc");
+ : [function] "d" (function)
+ : "cc", "0");
return cc == 0;
}
static __always_inline void __insn32_query(unsigned int opcode, u8 *query)
{
- register unsigned long r0 asm("0") = 0; /* query function */
- register unsigned long r1 asm("1") = (unsigned long) query;
-
asm volatile(
- /* Parameter regs are ignored */
+ " lghi 0,0\n"
+ " lgr 1,%[query]\n"
+ /* Parameter registers are ignored */
" .insn rrf,%[opc] << 16,2,4,6,0\n"
:
- : "d" (r0), "a" (r1), [opc] "i" (opcode)
- : "cc", "memory");
+ : [query] "d" ((unsigned long)query), [opc] "i" (opcode)
+ : "cc", "memory", "0", "1");
}
#define INSN_SORTL 0xb938
#define INSN_DFLTCC 0xb939
-static void kvm_s390_cpu_feat_init(void)
+static void __init kvm_s390_cpu_feat_init(void)
{
int i;
@@ -452,7 +465,7 @@ static void kvm_s390_cpu_feat_init(void)
*/
}
-int kvm_arch_init(void *opaque)
+static int __init __kvm_s390_init(void)
{
int rc = -ENOMEM;
@@ -460,8 +473,13 @@ int kvm_arch_init(void *opaque)
if (!kvm_s390_dbf)
return -ENOMEM;
- if (debug_register_view(kvm_s390_dbf, &debug_sprintf_view))
- goto out;
+ kvm_s390_dbf_uv = debug_register("kvm-uv", 32, 1, 7 * sizeof(long));
+ if (!kvm_s390_dbf_uv)
+ goto err_kvm_uv;
+
+ if (debug_register_view(kvm_s390_dbf, &debug_sprintf_view) ||
+ debug_register_view(kvm_s390_dbf_uv, &debug_sprintf_view))
+ goto err_debug_view;
kvm_s390_cpu_feat_init();
@@ -469,24 +487,54 @@ int kvm_arch_init(void *opaque)
rc = kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC);
if (rc) {
pr_err("A FLIC registration call failed with rc=%d\n", rc);
- goto out;
+ goto err_flic;
+ }
+
+ if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) {
+ rc = kvm_s390_pci_init();
+ if (rc) {
+ pr_err("Unable to allocate AIFT for PCI\n");
+ goto err_pci;
+ }
}
rc = kvm_s390_gib_init(GAL_ISC);
if (rc)
- goto out;
+ goto err_gib;
+
+ gmap_notifier.notifier_call = kvm_gmap_notifier;
+ gmap_register_pte_notifier(&gmap_notifier);
+ vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier;
+ gmap_register_pte_notifier(&vsie_gmap_notifier);
+ atomic_notifier_chain_register(&s390_epoch_delta_notifier,
+ &kvm_clock_notifier);
return 0;
-out:
- kvm_arch_exit();
+err_gib:
+ if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM))
+ kvm_s390_pci_exit();
+err_pci:
+err_flic:
+err_debug_view:
+ debug_unregister(kvm_s390_dbf_uv);
+err_kvm_uv:
+ debug_unregister(kvm_s390_dbf);
return rc;
}
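
Note on the hunk above: the single "goto out; kvm_arch_exit();" error path is replaced by labels that unwind only what was already set up; err_pci and err_flic are intentionally empty fall-throughs because the step that failed there has nothing of its own to release, so control simply continues into the cleanup of the earlier steps. A minimal sketch of the pattern, with hypothetical steps:

	int setup(void)
	{
		int rc;

		rc = step_a();
		if (rc)
			return rc;	/* nothing to undo yet */
		rc = step_b();
		if (rc)
			goto undo_a;
		rc = step_c();
		if (rc)
			goto undo_b;
		return 0;

	undo_b:
		undo_step_b();
	undo_a:
		undo_step_a();
		return rc;
	}
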
-void kvm_arch_exit(void)
+static void __kvm_s390_exit(void)
{
+ gmap_unregister_pte_notifier(&gmap_notifier);
+ gmap_unregister_pte_notifier(&vsie_gmap_notifier);
+ atomic_notifier_chain_unregister(&s390_epoch_delta_notifier,
+ &kvm_clock_notifier);
+
kvm_s390_gib_destroy();
+ if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM))
+ kvm_s390_pci_exit();
debug_unregister(kvm_s390_dbf);
+ debug_unregister(kvm_s390_dbf_uv);
}
/* Section: device related */
@@ -515,7 +563,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ENABLE_CAP:
case KVM_CAP_S390_CSS_SUPPORT:
case KVM_CAP_IOEVENTFD:
- case KVM_CAP_DEVICE_CTRL:
case KVM_CAP_S390_IRQCHIP:
case KVM_CAP_VM_ATTRIBUTES:
case KVM_CAP_MP_STATE:
@@ -529,8 +576,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_S390_CMMA_MIGRATION:
case KVM_CAP_S390_AIS:
case KVM_CAP_S390_AIS_MIGRATION:
+ case KVM_CAP_S390_VCPU_RESETS:
+ case KVM_CAP_SET_GUEST_DEBUG:
+ case KVM_CAP_S390_DIAG318:
+ case KVM_CAP_IRQFD_RESAMPLE:
r = 1;
break;
+ case KVM_CAP_SET_GUEST_DEBUG2:
+ r = KVM_GUESTDBG_VALID_MASK;
+ break;
case KVM_CAP_S390_HPAGE_1M:
r = 0;
if (hpage && !kvm_is_ucontrol(kvm))
@@ -539,6 +593,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_S390_MEM_OP:
r = MEM_OP_MAX_SIZE;
break;
+ case KVM_CAP_S390_MEM_OP_EXTENSION:
+ /*
+ * Flag bits indicating which extensions are supported.
+ * If r > 0, the base extension must also be supported/indicated,
+ * in order to maintain backwards compatibility.
+ */
+ r = KVM_S390_MEMOP_EXTENSION_CAP_BASE |
+ KVM_S390_MEMOP_EXTENSION_CAP_CMPXCHG;
+ break;
case KVM_CAP_NR_VCPUS:
case KVM_CAP_MAX_VCPUS:
case KVM_CAP_MAX_VCPU_ID:
@@ -547,12 +610,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = KVM_MAX_VCPUS;
else if (sclp.has_esca && sclp.has_64bscao)
r = KVM_S390_ESCA_CPU_SLOTS;
+ if (ext == KVM_CAP_NR_VCPUS)
+ r = min_t(unsigned int, num_online_cpus(), r);
break;
case KVM_CAP_S390_COW:
r = MACHINE_HAS_ESOP;
break;
case KVM_CAP_S390_VECTOR_REGISTERS:
- r = MACHINE_HAS_VX;
+ r = test_facility(129);
break;
case KVM_CAP_S390_RI:
r = test_facility(64);
@@ -563,14 +628,45 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_S390_BPB:
r = test_facility(82);
break;
+ case KVM_CAP_S390_PROTECTED_ASYNC_DISABLE:
+ r = async_destroy && is_prot_virt_host();
+ break;
+ case KVM_CAP_S390_PROTECTED:
+ r = is_prot_virt_host();
+ break;
+ case KVM_CAP_S390_PROTECTED_DUMP: {
+ u64 pv_cmds_dump[] = {
+ BIT_UVC_CMD_DUMP_INIT,
+ BIT_UVC_CMD_DUMP_CONFIG_STOR_STATE,
+ BIT_UVC_CMD_DUMP_CPU,
+ BIT_UVC_CMD_DUMP_COMPLETE,
+ };
+ int i;
+
+ r = is_prot_virt_host();
+
+ for (i = 0; i < ARRAY_SIZE(pv_cmds_dump); i++) {
+ if (!test_bit_inv(pv_cmds_dump[i],
+ (unsigned long *)&uv_info.inst_calls_list)) {
+ r = 0;
+ break;
+ }
+ }
+ break;
+ }
+ case KVM_CAP_S390_ZPCI_OP:
+ r = kvm_s390_pci_interp_allowed();
+ break;
+ case KVM_CAP_S390_CPU_TOPOLOGY:
+ r = test_facility(11);
+ break;
default:
r = 0;
}
return r;
}
-static void kvm_s390_sync_dirty_log(struct kvm *kvm,
- struct kvm_memory_slot *memslot)
+void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
int i;
gfn_t cur_gfn, last_gfn;
@@ -611,9 +707,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
{
int r;
unsigned long n;
- struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
- int is_dirty = 0;
+ int is_dirty;
if (kvm_is_ucontrol(kvm))
return -EINVAL;
@@ -624,14 +719,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
if (log->slot >= KVM_USER_MEM_SLOTS)
goto out;
- slots = kvm_memslots(kvm);
- memslot = id_to_memslot(slots, log->slot);
- r = -ENOENT;
- if (!memslot->dirty_bitmap)
- goto out;
-
- kvm_s390_sync_dirty_log(kvm, memslot);
- r = kvm_get_dirty_log(kvm, log, &is_dirty);
+ r = kvm_get_dirty_log(kvm, log, &is_dirty, &memslot);
if (r)
goto out;
@@ -648,7 +736,7 @@ out:
static void icpt_operexc_on_all_vcpus(struct kvm *kvm)
{
- unsigned int i;
+ unsigned long i;
struct kvm_vcpu *vcpu;
kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -678,7 +766,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
mutex_lock(&kvm->lock);
if (kvm->created_vcpus) {
r = -EBUSY;
- } else if (MACHINE_HAS_VX) {
+ } else if (cpu_has_vx()) {
set_kvm_facility(kvm->arch.model.fac_mask, 129);
set_kvm_facility(kvm->arch.model.fac_list, 129);
if (test_facility(134)) {
@@ -697,6 +785,10 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
set_kvm_facility(kvm->arch.model.fac_mask, 152);
set_kvm_facility(kvm->arch.model.fac_list, 152);
}
+ if (test_facility(192)) {
+ set_kvm_facility(kvm->arch.model.fac_mask, 192);
+ set_kvm_facility(kvm->arch.model.fac_list, 192);
+ }
r = 0;
} else
r = -EINVAL;
@@ -753,9 +845,9 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
r = -EINVAL;
else {
r = 0;
- down_write(&kvm->mm->mmap_sem);
+ mmap_write_lock(kvm->mm);
kvm->mm->context.allow_gmap_hpage_1m = 1;
- up_write(&kvm->mm->mmap_sem);
+ mmap_write_unlock(kvm->mm);
/*
* We might have to create fake 4k page
* tables. To avoid that the hardware works on
@@ -779,6 +871,20 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
icpt_operexc_on_all_vcpus(kvm);
r = 0;
break;
+ case KVM_CAP_S390_CPU_TOPOLOGY:
+ r = -EINVAL;
+ mutex_lock(&kvm->lock);
+ if (kvm->created_vcpus) {
+ r = -EBUSY;
+ } else if (test_facility(11)) {
+ set_kvm_facility(kvm->arch.model.fac_mask, 11);
+ set_kvm_facility(kvm->arch.model.fac_list, 11);
+ r = 0;
+ }
+ mutex_unlock(&kvm->lock);
+ VM_EVENT(kvm, 3, "ENABLE: CAP_S390_CPU_TOPOLOGY %s",
+ r ? "(not available)" : "(success)");
+ break;
default:
r = -EINVAL;
break;
@@ -898,7 +1004,7 @@ static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu);
void kvm_s390_vcpu_crypto_reset_all(struct kvm *kvm)
{
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
kvm_s390_vcpu_block_all(kvm);
@@ -981,9 +1087,45 @@ static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr)
return 0;
}
+static void kvm_s390_vcpu_pci_setup(struct kvm_vcpu *vcpu)
+{
+ /* Only set the ECB bits after guest requests zPCI interpretation */
+ if (!vcpu->kvm->arch.use_zpci_interp)
+ return;
+
+ vcpu->arch.sie_block->ecb2 |= ECB2_ZPCI_LSI;
+ vcpu->arch.sie_block->ecb3 |= ECB3_AISII + ECB3_AISI;
+}
+
+void kvm_s390_vcpu_pci_enable_interp(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+
+ lockdep_assert_held(&kvm->lock);
+
+ if (!kvm_s390_pci_interp_allowed())
+ return;
+
+ /*
+ * If host is configured for PCI and the necessary facilities are
+ * available, turn on interpretation for the life of this guest
+ */
+ kvm->arch.use_zpci_interp = 1;
+
+ kvm_s390_vcpu_block_all(kvm);
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ kvm_s390_vcpu_pci_setup(vcpu);
+ kvm_s390_sync_request(KVM_REQ_VSIE_RESTART, vcpu);
+ }
+
+ kvm_s390_vcpu_unblock_all(kvm);
+}
+
static void kvm_s390_sync_request_broadcast(struct kvm *kvm, int req)
{
- int cx;
+ unsigned long cx;
struct kvm_vcpu *vcpu;
kvm_for_each_vcpu(cx, vcpu, kvm)
@@ -999,13 +1141,13 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm)
struct kvm_memory_slot *ms;
struct kvm_memslots *slots;
unsigned long ram_pages = 0;
- int slotnr;
+ int bkt;
/* migration mode already enabled */
if (kvm->arch.migration_mode)
return 0;
slots = kvm_memslots(kvm);
- if (!slots || !slots->used_slots)
+ if (!slots || kvm_memslots_empty(slots))
return -EINVAL;
if (!kvm->arch.use_cmma) {
@@ -1013,8 +1155,7 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm)
return 0;
}
/* mark all the pages in active slots as dirty */
- for (slotnr = 0; slotnr < slots->used_slots; slotnr++) {
- ms = slots->memslots + slotnr;
+ kvm_for_each_memslot(ms, bkt, slots) {
if (!ms->dirty_bitmap)
return -EINVAL;
/*
@@ -1081,6 +1222,8 @@ static int kvm_s390_vm_get_migration(struct kvm *kvm,
return 0;
}
+static void __kvm_s390_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod);
+
static int kvm_s390_set_tod_ext(struct kvm *kvm, struct kvm_device_attr *attr)
{
struct kvm_s390_vm_tod_clock gtod;
@@ -1090,7 +1233,7 @@ static int kvm_s390_set_tod_ext(struct kvm *kvm, struct kvm_device_attr *attr)
if (!test_kvm_facility(kvm, 139) && gtod.epoch_idx)
return -EINVAL;
- kvm_s390_set_tod_clock(kvm, &gtod);
+ __kvm_s390_set_tod_clock(kvm, &gtod);
VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x, TOD base: 0x%llx",
gtod.epoch_idx, gtod.tod);
@@ -1121,7 +1264,7 @@ static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr)
sizeof(gtod.tod)))
return -EFAULT;
- kvm_s390_set_tod_clock(kvm, &gtod);
+ __kvm_s390_set_tod_clock(kvm, &gtod);
VM_EVENT(kvm, 3, "SET: TOD base: 0x%llx", gtod.tod);
return 0;
}
@@ -1133,6 +1276,16 @@ static int kvm_s390_set_tod(struct kvm *kvm, struct kvm_device_attr *attr)
if (attr->flags)
return -EINVAL;
+ mutex_lock(&kvm->lock);
+ /*
+ * For protected guests, the TOD is managed by the ultravisor, so trying
+ * to change it will never bring the expected results.
+ */
+ if (kvm_s390_pv_is_protected(kvm)) {
+ ret = -EOPNOTSUPP;
+ goto out_unlock;
+ }
+
switch (attr->attr) {
case KVM_S390_VM_TOD_EXT:
ret = kvm_s390_set_tod_ext(kvm, attr);
@@ -1147,23 +1300,26 @@ static int kvm_s390_set_tod(struct kvm *kvm, struct kvm_device_attr *attr)
ret = -ENXIO;
break;
}
+
+out_unlock:
+ mutex_unlock(&kvm->lock);
return ret;
}
static void kvm_s390_get_tod_clock(struct kvm *kvm,
struct kvm_s390_vm_tod_clock *gtod)
{
- struct kvm_s390_tod_clock_ext htod;
+ union tod_clock clk;
preempt_disable();
- get_tod_clock_ext((char *)&htod);
+ store_tod_clock_ext(&clk);
- gtod->tod = htod.tod + kvm->arch.epoch;
+ gtod->tod = clk.tod + kvm->arch.epoch;
gtod->epoch_idx = 0;
if (test_kvm_facility(kvm, 139)) {
- gtod->epoch_idx = htod.epoch_idx + kvm->arch.epdx;
- if (gtod->tod < htod.tod)
+ gtod->epoch_idx = clk.ei + kvm->arch.epdx;
+ if (gtod->tod < clk.tod)
gtod->epoch_idx += 1;
}
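
Note on the hunk above: union tod_clock replaces the ad-hoc kvm_s390_tod_clock_ext layout, and the guest epoch is added modulo 2^64; when the addition wraps, the carry has to be propagated into the 8-bit epoch index. Stand-alone illustration with made-up numbers:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t host_tod  = 0xfffffffffff00000ULL;	/* pretend host clock */
		uint64_t epoch     = 0x0000000000200000ULL;	/* guest offset */
		uint8_t  epoch_idx = 0;

		uint64_t guest_tod = host_tod + epoch;		/* wraps past 2^64 */
		if (guest_tod < host_tod)			/* wrap means carry out */
			epoch_idx += 1;

		printf("guest tod %#llx, epoch index %u\n",
		       (unsigned long long)guest_tod, epoch_idx);
		return 0;
	}
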
@@ -1243,7 +1399,7 @@ static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr)
ret = -EBUSY;
goto out;
}
- proc = kzalloc(sizeof(*proc), GFP_KERNEL);
+ proc = kzalloc(sizeof(*proc), GFP_KERNEL_ACCOUNT);
if (!proc) {
ret = -ENOMEM;
goto out;
@@ -1295,8 +1451,7 @@ static int kvm_s390_set_processor_feat(struct kvm *kvm,
mutex_unlock(&kvm->lock);
return -EBUSY;
}
- bitmap_copy(kvm->arch.cpu_feat, (unsigned long *) data.feat,
- KVM_S390_VM_CPU_FEAT_NR_BITS);
+ bitmap_from_arr64(kvm->arch.cpu_feat, data.feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
mutex_unlock(&kvm->lock);
VM_EVENT(kvm, 3, "SET: guest feat: 0x%16.16llx.0x%16.16llx.0x%16.16llx",
data.feat[0],
@@ -1382,6 +1537,39 @@ static int kvm_s390_set_processor_subfunc(struct kvm *kvm,
return 0;
}
+#define KVM_S390_VM_CPU_UV_FEAT_GUEST_MASK \
+( \
+ ((struct kvm_s390_vm_cpu_uv_feat){ \
+ .ap = 1, \
+ .ap_intr = 1, \
+ }) \
+ .feat \
+)
+
+static int kvm_s390_set_uv_feat(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+ struct kvm_s390_vm_cpu_uv_feat __user *ptr = (void __user *)attr->addr;
+ unsigned long data, filter;
+
+ filter = uv_info.uv_feature_indications & KVM_S390_VM_CPU_UV_FEAT_GUEST_MASK;
+ if (get_user(data, &ptr->feat))
+ return -EFAULT;
+ if (!bitmap_subset(&data, &filter, KVM_S390_VM_CPU_UV_FEAT_NR_BITS))
+ return -EINVAL;
+
+ mutex_lock(&kvm->lock);
+ if (kvm->created_vcpus) {
+ mutex_unlock(&kvm->lock);
+ return -EBUSY;
+ }
+ kvm->arch.model.uv_feat_guest.feat = data;
+ mutex_unlock(&kvm->lock);
+
+ VM_EVENT(kvm, 3, "SET: guest UV-feat: 0x%16.16lx", data);
+
+ return 0;
+}
+
static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
{
int ret = -ENXIO;
@@ -1396,6 +1584,9 @@ static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
ret = kvm_s390_set_processor_subfunc(kvm, attr);
break;
+ case KVM_S390_VM_CPU_PROCESSOR_UV_FEAT_GUEST:
+ ret = kvm_s390_set_uv_feat(kvm, attr);
+ break;
}
return ret;
}
@@ -1405,7 +1596,7 @@ static int kvm_s390_get_processor(struct kvm *kvm, struct kvm_device_attr *attr)
struct kvm_s390_vm_cpu_processor *proc;
int ret = 0;
- proc = kzalloc(sizeof(*proc), GFP_KERNEL);
+ proc = kzalloc(sizeof(*proc), GFP_KERNEL_ACCOUNT);
if (!proc) {
ret = -ENOMEM;
goto out;
@@ -1433,7 +1624,7 @@ static int kvm_s390_get_machine(struct kvm *kvm, struct kvm_device_attr *attr)
struct kvm_s390_vm_cpu_machine *mach;
int ret = 0;
- mach = kzalloc(sizeof(*mach), GFP_KERNEL);
+ mach = kzalloc(sizeof(*mach), GFP_KERNEL_ACCOUNT);
if (!mach) {
ret = -ENOMEM;
goto out;
@@ -1442,8 +1633,8 @@ static int kvm_s390_get_machine(struct kvm *kvm, struct kvm_device_attr *attr)
mach->ibc = sclp.ibc;
memcpy(&mach->fac_mask, kvm->arch.model.fac_mask,
S390_ARCH_FAC_LIST_SIZE_BYTE);
- memcpy((unsigned long *)&mach->fac_list, S390_lowcore.stfle_fac_list,
- sizeof(S390_lowcore.stfle_fac_list));
+ memcpy((unsigned long *)&mach->fac_list, stfle_fac_list,
+ sizeof(stfle_fac_list));
VM_EVENT(kvm, 3, "GET: host ibc: 0x%4.4x, host cpuid: 0x%16.16llx",
kvm->arch.model.ibc,
kvm->arch.model.cpuid);
@@ -1467,8 +1658,7 @@ static int kvm_s390_get_processor_feat(struct kvm *kvm,
{
struct kvm_s390_vm_cpu_feat data;
- bitmap_copy((unsigned long *) data.feat, kvm->arch.cpu_feat,
- KVM_S390_VM_CPU_FEAT_NR_BITS);
+ bitmap_to_arr64(data.feat, kvm->arch.cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
if (copy_to_user((void __user *)attr->addr, &data, sizeof(data)))
return -EFAULT;
VM_EVENT(kvm, 3, "GET: guest feat: 0x%16.16llx.0x%16.16llx.0x%16.16llx",
@@ -1483,9 +1673,7 @@ static int kvm_s390_get_machine_feat(struct kvm *kvm,
{
struct kvm_s390_vm_cpu_feat data;
- bitmap_copy((unsigned long *) data.feat,
- kvm_s390_available_cpu_feat,
- KVM_S390_VM_CPU_FEAT_NR_BITS);
+ bitmap_to_arr64(data.feat, kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
if (copy_to_user((void __user *)attr->addr, &data, sizeof(data)))
return -EFAULT;
VM_EVENT(kvm, 3, "GET: host feat: 0x%16.16llx.0x%16.16llx.0x%16.16llx",
@@ -1631,6 +1819,33 @@ static int kvm_s390_get_machine_subfunc(struct kvm *kvm,
return 0;
}
+static int kvm_s390_get_processor_uv_feat(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+ struct kvm_s390_vm_cpu_uv_feat __user *dst = (void __user *)attr->addr;
+ unsigned long feat = kvm->arch.model.uv_feat_guest.feat;
+
+ if (put_user(feat, &dst->feat))
+ return -EFAULT;
+ VM_EVENT(kvm, 3, "GET: guest UV-feat: 0x%16.16lx", feat);
+
+ return 0;
+}
+
+static int kvm_s390_get_machine_uv_feat(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+ struct kvm_s390_vm_cpu_uv_feat __user *dst = (void __user *)attr->addr;
+ unsigned long feat;
+
+ BUILD_BUG_ON(sizeof(*dst) != sizeof(uv_info.uv_feature_indications));
+
+ feat = uv_info.uv_feature_indications & KVM_S390_VM_CPU_UV_FEAT_GUEST_MASK;
+ if (put_user(feat, &dst->feat))
+ return -EFAULT;
+ VM_EVENT(kvm, 3, "GET: guest UV-feat: 0x%16.16lx", feat);
+
+ return 0;
+}
+
static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
{
int ret = -ENXIO;
@@ -1654,10 +1869,67 @@ static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
case KVM_S390_VM_CPU_MACHINE_SUBFUNC:
ret = kvm_s390_get_machine_subfunc(kvm, attr);
break;
+ case KVM_S390_VM_CPU_PROCESSOR_UV_FEAT_GUEST:
+ ret = kvm_s390_get_processor_uv_feat(kvm, attr);
+ break;
+ case KVM_S390_VM_CPU_MACHINE_UV_FEAT_GUEST:
+ ret = kvm_s390_get_machine_uv_feat(kvm, attr);
+ break;
}
return ret;
}
+/**
+ * kvm_s390_update_topology_change_report - update CPU topology change report
+ * @kvm: guest KVM description
+ * @val: set or clear the MTCR bit
+ *
+ * Updates the Multiprocessor Topology-Change-Report bit to signal
+ * the guest with a topology change.
+ * This is only relevant if the topology facility is present.
+ *
+ * The SCA version, bsca or esca, doesn't matter as offset is the same.
+ */
+static void kvm_s390_update_topology_change_report(struct kvm *kvm, bool val)
+{
+ union sca_utility new, old;
+ struct bsca_block *sca;
+
+ read_lock(&kvm->arch.sca_lock);
+ sca = kvm->arch.sca;
+ do {
+ old = READ_ONCE(sca->utility);
+ new = old;
+ new.mtcr = val;
+ } while (cmpxchg(&sca->utility.val, old.val, new.val) != old.val);
+ read_unlock(&kvm->arch.sca_lock);
+}
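+
+Note on the cmpxchg loop above: it lets the MTCR bit be flipped while other utility bits may change concurrently; the SCA read lock is only there so the block is not switched out underneath (see the kernel-doc above). A user-space analogue of the retry shape using C11 atomics (the bit position is arbitrary here, only the shape matters):
+
+	#include <stdatomic.h>
+	#include <stdint.h>
+	#include <stdio.h>
+
+	static void set_flag(_Atomic uint16_t *word, int val)
+	{
+		uint16_t old = atomic_load(word), want;
+
+		do {
+			want = val ? (old | 0x8000u) : (old & 0x7fffu);
+			/* weak CAS refreshes 'old' on failure, so just retry */
+		} while (!atomic_compare_exchange_weak(word, &old, want));
+	}
+
+	int main(void)
+	{
+		_Atomic uint16_t utility = 0x0042;
+
+		set_flag(&utility, 1);
+		printf("utility = %#06x\n", (unsigned)atomic_load(&utility));
+		set_flag(&utility, 0);
+		printf("utility = %#06x\n", (unsigned)atomic_load(&utility));
+		return 0;
+	}
+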
+
+static int kvm_s390_set_topo_change_indication(struct kvm *kvm,
+ struct kvm_device_attr *attr)
+{
+ if (!test_kvm_facility(kvm, 11))
+ return -ENXIO;
+
+ kvm_s390_update_topology_change_report(kvm, !!attr->attr);
+ return 0;
+}
+
+static int kvm_s390_get_topo_change_indication(struct kvm *kvm,
+ struct kvm_device_attr *attr)
+{
+ u8 topo;
+
+ if (!test_kvm_facility(kvm, 11))
+ return -ENXIO;
+
+ read_lock(&kvm->arch.sca_lock);
+ topo = ((struct bsca_block *)kvm->arch.sca)->utility.mtcr;
+ read_unlock(&kvm->arch.sca_lock);
+
+ return put_user(topo, (u8 __user *)attr->addr);
+}
+
static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr)
{
int ret;
@@ -1678,6 +1950,9 @@ static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr)
case KVM_S390_VM_MIGRATION:
ret = kvm_s390_vm_set_migration(kvm, attr);
break;
+ case KVM_S390_VM_CPU_TOPOLOGY:
+ ret = kvm_s390_set_topo_change_indication(kvm, attr);
+ break;
default:
ret = -ENXIO;
break;
@@ -1703,6 +1978,9 @@ static int kvm_s390_vm_get_attr(struct kvm *kvm, struct kvm_device_attr *attr)
case KVM_S390_VM_MIGRATION:
ret = kvm_s390_vm_get_migration(kvm, attr);
break;
+ case KVM_S390_VM_CPU_TOPOLOGY:
+ ret = kvm_s390_get_topo_change_indication(kvm, attr);
+ break;
default:
ret = -ENXIO;
break;
@@ -1749,6 +2027,8 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
case KVM_S390_VM_CPU_MACHINE_FEAT:
case KVM_S390_VM_CPU_MACHINE_SUBFUNC:
case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
+ case KVM_S390_VM_CPU_MACHINE_UV_FEAT_GUEST:
+ case KVM_S390_VM_CPU_PROCESSOR_UV_FEAT_GUEST:
ret = 0;
break;
default:
@@ -1776,6 +2056,9 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
case KVM_S390_VM_MIGRATION:
ret = 0;
break;
+ case KVM_S390_VM_CPU_TOPOLOGY:
+ ret = test_kvm_facility(kvm, 11) ? 0 : -ENXIO;
+ break;
default:
ret = -ENXIO;
break;
@@ -1784,7 +2067,7 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
return ret;
}
-static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
+static int kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
{
uint8_t *keys;
uint64_t hva;
@@ -1801,11 +2084,11 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX)
return -EINVAL;
- keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL);
+ keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL_ACCOUNT);
if (!keys)
return -ENOMEM;
- down_read(&current->mm->mmap_sem);
+ mmap_read_lock(current->mm);
srcu_idx = srcu_read_lock(&kvm->srcu);
for (i = 0; i < args->count; i++) {
hva = gfn_to_hva(kvm, args->start_gfn + i);
@@ -1819,7 +2102,7 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
break;
}
srcu_read_unlock(&kvm->srcu, srcu_idx);
- up_read(&current->mm->mmap_sem);
+ mmap_read_unlock(current->mm);
if (!r) {
r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys,
@@ -1832,7 +2115,7 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
return r;
}
-static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
+static int kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
{
uint8_t *keys;
uint64_t hva;
@@ -1846,7 +2129,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX)
return -EINVAL;
- keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL);
+ keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL_ACCOUNT);
if (!keys)
return -ENOMEM;
@@ -1863,7 +2146,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
goto out;
i = 0;
- down_read(&current->mm->mmap_sem);
+ mmap_read_lock(current->mm);
srcu_idx = srcu_read_lock(&kvm->srcu);
while (i < args->count) {
unlocked = false;
@@ -1881,7 +2164,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
r = set_guest_storage_key(current->mm, hva, keys[i], 0);
if (r) {
- r = fixup_user_fault(current, current->mm, hva,
+ r = fixup_user_fault(current->mm, hva,
FAULT_FLAG_WRITE, &unlocked);
if (r)
break;
@@ -1890,7 +2173,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
i++;
}
srcu_read_unlock(&kvm->srcu, srcu_idx);
- up_read(&current->mm->mmap_sem);
+ mmap_read_unlock(current->mm);
out:
kvfree(keys);
return r;
@@ -1905,38 +2188,6 @@ out:
/* for consistency */
#define KVM_S390_CMMA_SIZE_MAX ((u32)KVM_S390_SKEYS_MAX)
-/*
- * Similar to gfn_to_memslot, but returns the index of a memslot also when the
- * address falls in a hole. In that case the index of one of the memslots
- * bordering the hole is returned.
- */
-static int gfn_to_memslot_approx(struct kvm_memslots *slots, gfn_t gfn)
-{
- int start = 0, end = slots->used_slots;
- int slot = atomic_read(&slots->lru_slot);
- struct kvm_memory_slot *memslots = slots->memslots;
-
- if (gfn >= memslots[slot].base_gfn &&
- gfn < memslots[slot].base_gfn + memslots[slot].npages)
- return slot;
-
- while (start < end) {
- slot = start + (end - start) / 2;
-
- if (gfn >= memslots[slot].base_gfn)
- end = slot;
- else
- start = slot + 1;
- }
-
- if (gfn >= memslots[start].base_gfn &&
- gfn < memslots[start].base_gfn + memslots[start].npages) {
- atomic_set(&slots->lru_slot, start);
- }
-
- return start;
-}
-
static int kvm_s390_peek_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args,
u8 *res, unsigned long bufsize)
{
@@ -1960,27 +2211,36 @@ static int kvm_s390_peek_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args,
return 0;
}
+static struct kvm_memory_slot *gfn_to_memslot_approx(struct kvm_memslots *slots,
+ gfn_t gfn)
+{
+ return ____gfn_to_memslot(slots, gfn, true);
+}
+
static unsigned long kvm_s390_next_dirty_cmma(struct kvm_memslots *slots,
unsigned long cur_gfn)
{
- int slotidx = gfn_to_memslot_approx(slots, cur_gfn);
- struct kvm_memory_slot *ms = slots->memslots + slotidx;
+ struct kvm_memory_slot *ms = gfn_to_memslot_approx(slots, cur_gfn);
unsigned long ofs = cur_gfn - ms->base_gfn;
+ struct rb_node *mnode = &ms->gfn_node[slots->node_idx];
if (ms->base_gfn + ms->npages <= cur_gfn) {
- slotidx--;
+ mnode = rb_next(mnode);
/* If we are above the highest slot, wrap around */
- if (slotidx < 0)
- slotidx = slots->used_slots - 1;
+ if (!mnode)
+ mnode = rb_first(&slots->gfn_tree);
- ms = slots->memslots + slotidx;
+ ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]);
ofs = 0;
}
+
+ if (cur_gfn < ms->base_gfn)
+ ofs = 0;
+
ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, ofs);
- while ((slotidx > 0) && (ofs >= ms->npages)) {
- slotidx--;
- ms = slots->memslots + slotidx;
- ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, 0);
+ while (ofs >= ms->npages && (mnode = rb_next(mnode))) {
+ ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]);
+ ofs = find_first_bit(kvm_second_dirty_bitmap(ms), ms->npages);
}
return ms->base_gfn + ofs;
}
@@ -1992,6 +2252,9 @@ static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args,
struct kvm_memslots *slots = kvm_memslots(kvm);
struct kvm_memory_slot *ms;
+ if (unlikely(kvm_memslots_empty(slots)))
+ return 0;
+
cur_gfn = kvm_s390_next_dirty_cmma(slots, args->start_gfn);
ms = gfn_to_memslot(kvm, cur_gfn);
args->count = 0;
@@ -1999,7 +2262,7 @@ static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args,
if (!ms)
return 0;
next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1);
- mem_end = slots->memslots[0].base_gfn + slots->memslots[0].npages;
+ mem_end = kvm_s390_get_gfn_end(slots);
while (args->count < bufsize) {
hva = gfn_to_hva(kvm, cur_gfn);
@@ -2073,14 +2336,14 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm,
if (!values)
return -ENOMEM;
- down_read(&kvm->mm->mmap_sem);
+ mmap_read_lock(kvm->mm);
srcu_idx = srcu_read_lock(&kvm->srcu);
if (peek)
ret = kvm_s390_peek_cmma(kvm, args, values, bufsize);
else
ret = kvm_s390_get_cmma(kvm, args, values, bufsize);
srcu_read_unlock(&kvm->srcu, srcu_idx);
- up_read(&kvm->mm->mmap_sem);
+ mmap_read_unlock(kvm->mm);
if (kvm->arch.migration_mode)
args->remaining = atomic64_read(&kvm->arch.cmma_dirty_pages);
@@ -2130,7 +2393,7 @@ static int kvm_s390_set_cmma_bits(struct kvm *kvm,
goto out;
}
- down_read(&kvm->mm->mmap_sem);
+ mmap_read_lock(kvm->mm);
srcu_idx = srcu_read_lock(&kvm->srcu);
for (i = 0; i < args->count; i++) {
hva = gfn_to_hva(kvm, args->start_gfn + i);
@@ -2145,20 +2408,579 @@ static int kvm_s390_set_cmma_bits(struct kvm *kvm,
set_pgste_bits(kvm->mm, hva, mask, pgstev);
}
srcu_read_unlock(&kvm->srcu, srcu_idx);
- up_read(&kvm->mm->mmap_sem);
+ mmap_read_unlock(kvm->mm);
if (!kvm->mm->context.uses_cmm) {
- down_write(&kvm->mm->mmap_sem);
+ mmap_write_lock(kvm->mm);
kvm->mm->context.uses_cmm = 1;
- up_write(&kvm->mm->mmap_sem);
+ mmap_write_unlock(kvm->mm);
}
out:
vfree(bits);
return r;
}
-long kvm_arch_vm_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
+/**
+ * kvm_s390_cpus_from_pv - Convert all protected vCPUs in a protected VM to
+ * non protected.
+ * @kvm: the VM whose protected vCPUs are to be converted
+ * @rc: return value for the RC field of the UVC (in case of error)
+ * @rrc: return value for the RRC field of the UVC (in case of error)
+ *
+ * Does not stop in case of error, tries to convert as many
+ * CPUs as possible. In case of error, the RC and RRC of the last error are
+ * returned.
+ *
+ * Return: 0 in case of success, otherwise -EIO
+ */
+int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc)
+{
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+ u16 _rc, _rrc;
+ int ret = 0;
+
+ /*
+ * We ignore failures and try to destroy as many CPUs as possible.
+ * At the same time we must not free the assigned resources when
+ * this fails, as the ultravisor has still access to that memory.
+ * So kvm_s390_pv_destroy_cpu can leave a "wanted" memory leak
+ * behind.
+ * We want to return the first failure rc and rrc, though.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ mutex_lock(&vcpu->mutex);
+ if (kvm_s390_pv_destroy_cpu(vcpu, &_rc, &_rrc) && !ret) {
+ *rc = _rc;
+ *rrc = _rrc;
+ ret = -EIO;
+ }
+ mutex_unlock(&vcpu->mutex);
+ }
+ /* Ensure that we re-enable gisa if the non-PV guest used it but the PV guest did not. */
+ if (use_gisa)
+ kvm_s390_gisa_enable(kvm);
+ return ret;
+}
+
+/**
+ * kvm_s390_cpus_to_pv - Convert all non-protected vCPUs in a protected VM
+ * to protected.
+ * @kvm: the VM whose protected vCPUs are to be converted
+ * @rc: return value for the RC field of the UVC (in case of error)
+ * @rrc: return value for the RRC field of the UVC (in case of error)
+ *
+ * Tries to undo the conversion in case of error.
+ *
+ * Return: 0 in case of success, otherwise -EIO
+ */
+static int kvm_s390_cpus_to_pv(struct kvm *kvm, u16 *rc, u16 *rrc)
+{
+ unsigned long i;
+ int r = 0;
+ u16 dummy;
+
+ struct kvm_vcpu *vcpu;
+
+ /* Disable the GISA if the ultravisor does not support AIV. */
+ if (!uv_has_feature(BIT_UV_FEAT_AIV))
+ kvm_s390_gisa_disable(kvm);
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ mutex_lock(&vcpu->mutex);
+ r = kvm_s390_pv_create_cpu(vcpu, rc, rrc);
+ mutex_unlock(&vcpu->mutex);
+ if (r)
+ break;
+ }
+ if (r)
+ kvm_s390_cpus_from_pv(kvm, &dummy, &dummy);
+ return r;
+}
+
+/*
+ * Here we provide user space with a direct interface to query UV
+ * related data like UV maxima and available features as well as
+ * feature specific data.
+ *
+ * To facilitate future extension of the data structures we'll try to
+ * write data up to the maximum requested length.
+ */
+static ssize_t kvm_s390_handle_pv_info(struct kvm_s390_pv_info *info)
+{
+ ssize_t len_min;
+
+ switch (info->header.id) {
+ case KVM_PV_INFO_VM: {
+ len_min = sizeof(info->header) + sizeof(info->vm);
+
+ if (info->header.len_max < len_min)
+ return -EINVAL;
+
+ memcpy(info->vm.inst_calls_list,
+ uv_info.inst_calls_list,
+ sizeof(uv_info.inst_calls_list));
+
+ /* It's max cpuid not max cpus, so it's off by one */
+ info->vm.max_cpus = uv_info.max_guest_cpu_id + 1;
+ info->vm.max_guests = uv_info.max_num_sec_conf;
+ info->vm.max_guest_addr = uv_info.max_sec_stor_addr;
+ info->vm.feature_indication = uv_info.uv_feature_indications;
+
+ return len_min;
+ }
+ case KVM_PV_INFO_DUMP: {
+ len_min = sizeof(info->header) + sizeof(info->dump);
+
+ if (info->header.len_max < len_min)
+ return -EINVAL;
+
+ info->dump.dump_cpu_buffer_len = uv_info.guest_cpu_stor_len;
+ info->dump.dump_config_mem_buffer_per_1m = uv_info.conf_dump_storage_state_len;
+ info->dump.dump_config_finalize_len = uv_info.conf_dump_finalize_len;
+ return len_min;
+ }
+ default:
+ return -EINVAL;
+ }
+}
+
+static int kvm_s390_pv_dmp(struct kvm *kvm, struct kvm_pv_cmd *cmd,
+ struct kvm_s390_pv_dmp dmp)
+{
+ int r = -EINVAL;
+ void __user *result_buff = (void __user *)dmp.buff_addr;
+
+ switch (dmp.subcmd) {
+ case KVM_PV_DUMP_INIT: {
+ if (kvm->arch.pv.dumping)
+ break;
+
+ /*
+		 * Block SIE entry as concurrent dump UVCs could lead
+		 * to validity intercepts.
+ */
+ kvm_s390_vcpu_block_all(kvm);
+
+ r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
+ UVC_CMD_DUMP_INIT, &cmd->rc, &cmd->rrc);
+ KVM_UV_EVENT(kvm, 3, "PROTVIRT DUMP INIT: rc %x rrc %x",
+ cmd->rc, cmd->rrc);
+ if (!r) {
+ kvm->arch.pv.dumping = true;
+ } else {
+ kvm_s390_vcpu_unblock_all(kvm);
+ r = -EINVAL;
+ }
+ break;
+ }
+ case KVM_PV_DUMP_CONFIG_STOR_STATE: {
+ if (!kvm->arch.pv.dumping)
+ break;
+
+ /*
+ * gaddr is an output parameter since we might stop
+ * early. As dmp will be copied back in our caller, we
+ * don't need to do it ourselves.
+ */
+ r = kvm_s390_pv_dump_stor_state(kvm, result_buff, &dmp.gaddr, dmp.buff_len,
+ &cmd->rc, &cmd->rrc);
+ break;
+ }
+ case KVM_PV_DUMP_COMPLETE: {
+ if (!kvm->arch.pv.dumping)
+ break;
+
+ r = -EINVAL;
+ if (dmp.buff_len < uv_info.conf_dump_finalize_len)
+ break;
+
+ r = kvm_s390_pv_dump_complete(kvm, result_buff,
+ &cmd->rc, &cmd->rrc);
+ break;
+ }
+ default:
+ r = -ENOTTY;
+ break;
+ }
+
+ return r;
+}
+
+static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
+{
+ const bool need_lock = (cmd->cmd != KVM_PV_ASYNC_CLEANUP_PERFORM);
+ void __user *argp = (void __user *)cmd->data;
+ int r = 0;
+ u16 dummy;
+
+ if (need_lock)
+ mutex_lock(&kvm->lock);
+
+ switch (cmd->cmd) {
+ case KVM_PV_ENABLE: {
+ r = -EINVAL;
+ if (kvm_s390_pv_is_protected(kvm))
+ break;
+
+ /*
+ * FMT 4 SIE needs esca. As we never switch back to bsca from
+ * esca, we need no cleanup in the error cases below
+ */
+ r = sca_switch_to_extended(kvm);
+ if (r)
+ break;
+
+ mmap_write_lock(current->mm);
+ r = gmap_mark_unmergeable();
+ mmap_write_unlock(current->mm);
+ if (r)
+ break;
+
+ r = kvm_s390_pv_init_vm(kvm, &cmd->rc, &cmd->rrc);
+ if (r)
+ break;
+
+ r = kvm_s390_cpus_to_pv(kvm, &cmd->rc, &cmd->rrc);
+ if (r)
+ kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy);
+
+ /* we need to block service interrupts from now on */
+ set_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs);
+ break;
+ }
+ case KVM_PV_ASYNC_CLEANUP_PREPARE:
+ r = -EINVAL;
+ if (!kvm_s390_pv_is_protected(kvm) || !async_destroy)
+ break;
+
+ r = kvm_s390_cpus_from_pv(kvm, &cmd->rc, &cmd->rrc);
+ /*
+ * If a CPU could not be destroyed, destroy VM will also fail.
+ * There is no point in trying to destroy it. Instead return
+ * the rc and rrc from the first CPU that failed destroying.
+ */
+ if (r)
+ break;
+ r = kvm_s390_pv_set_aside(kvm, &cmd->rc, &cmd->rrc);
+
+ /* no need to block service interrupts any more */
+ clear_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs);
+ break;
+ case KVM_PV_ASYNC_CLEANUP_PERFORM:
+ r = -EINVAL;
+ if (!async_destroy)
+ break;
+ /* kvm->lock must not be held; this is asserted inside the function. */
+ r = kvm_s390_pv_deinit_aside_vm(kvm, &cmd->rc, &cmd->rrc);
+ break;
+ case KVM_PV_DISABLE: {
+ r = -EINVAL;
+ if (!kvm_s390_pv_is_protected(kvm))
+ break;
+
+ r = kvm_s390_cpus_from_pv(kvm, &cmd->rc, &cmd->rrc);
+ /*
+ * If a CPU could not be destroyed, destroy VM will also fail.
+ * There is no point in trying to destroy it. Instead return
+ * the rc and rrc from the first CPU that failed destroying.
+ */
+ if (r)
+ break;
+ r = kvm_s390_pv_deinit_cleanup_all(kvm, &cmd->rc, &cmd->rrc);
+
+ /* no need to block service interrupts any more */
+ clear_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs);
+ break;
+ }
+ case KVM_PV_SET_SEC_PARMS: {
+ struct kvm_s390_pv_sec_parm parms = {};
+ void *hdr;
+
+ r = -EINVAL;
+ if (!kvm_s390_pv_is_protected(kvm))
+ break;
+
+ r = -EFAULT;
+ if (copy_from_user(&parms, argp, sizeof(parms)))
+ break;
+
+ /* Currently restricted to 8KB */
+ r = -EINVAL;
+ if (parms.length > PAGE_SIZE * 2)
+ break;
+
+ r = -ENOMEM;
+ hdr = vmalloc(parms.length);
+ if (!hdr)
+ break;
+
+ r = -EFAULT;
+ if (!copy_from_user(hdr, (void __user *)parms.origin,
+ parms.length))
+ r = kvm_s390_pv_set_sec_parms(kvm, hdr, parms.length,
+ &cmd->rc, &cmd->rrc);
+
+ vfree(hdr);
+ break;
+ }
+ case KVM_PV_UNPACK: {
+ struct kvm_s390_pv_unp unp = {};
+
+ r = -EINVAL;
+ if (!kvm_s390_pv_is_protected(kvm) || !mm_is_protected(kvm->mm))
+ break;
+
+ r = -EFAULT;
+ if (copy_from_user(&unp, argp, sizeof(unp)))
+ break;
+
+ r = kvm_s390_pv_unpack(kvm, unp.addr, unp.size, unp.tweak,
+ &cmd->rc, &cmd->rrc);
+ break;
+ }
+ case KVM_PV_VERIFY: {
+ r = -EINVAL;
+ if (!kvm_s390_pv_is_protected(kvm))
+ break;
+
+ r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
+ UVC_CMD_VERIFY_IMG, &cmd->rc, &cmd->rrc);
+ KVM_UV_EVENT(kvm, 3, "PROTVIRT VERIFY: rc %x rrc %x", cmd->rc,
+ cmd->rrc);
+ break;
+ }
+ case KVM_PV_PREP_RESET: {
+ r = -EINVAL;
+ if (!kvm_s390_pv_is_protected(kvm))
+ break;
+
+ r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
+ UVC_CMD_PREPARE_RESET, &cmd->rc, &cmd->rrc);
+ KVM_UV_EVENT(kvm, 3, "PROTVIRT PREP RESET: rc %x rrc %x",
+ cmd->rc, cmd->rrc);
+ break;
+ }
+ case KVM_PV_UNSHARE_ALL: {
+ r = -EINVAL;
+ if (!kvm_s390_pv_is_protected(kvm))
+ break;
+
+ r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
+ UVC_CMD_SET_UNSHARE_ALL, &cmd->rc, &cmd->rrc);
+ KVM_UV_EVENT(kvm, 3, "PROTVIRT UNSHARE: rc %x rrc %x",
+ cmd->rc, cmd->rrc);
+ break;
+ }
+ case KVM_PV_INFO: {
+ struct kvm_s390_pv_info info = {};
+ ssize_t data_len;
+
+ /*
+ * No need to check the VM protection here.
+ *
+ * Maybe user space wants to query some of the data
+ * when the VM is still unprotected. If we see the
+ * need to fence a new data command we can still
+ * return an error in the info handler.
+ */
+
+ r = -EFAULT;
+ if (copy_from_user(&info, argp, sizeof(info.header)))
+ break;
+
+ r = -EINVAL;
+ if (info.header.len_max < sizeof(info.header))
+ break;
+
+ data_len = kvm_s390_handle_pv_info(&info);
+ if (data_len < 0) {
+ r = data_len;
+ break;
+ }
+ /*
+ * If a data command struct is extended (multiple
+ * times) this can be used to determine how much of it
+ * is valid.
+ */
+ info.header.len_written = data_len;
+
+ r = -EFAULT;
+ if (copy_to_user(argp, &info, data_len))
+ break;
+
+ r = 0;
+ break;
+ }
+ case KVM_PV_DUMP: {
+ struct kvm_s390_pv_dmp dmp;
+
+ r = -EINVAL;
+ if (!kvm_s390_pv_is_protected(kvm))
+ break;
+
+ r = -EFAULT;
+ if (copy_from_user(&dmp, argp, sizeof(dmp)))
+ break;
+
+ r = kvm_s390_pv_dmp(kvm, cmd, dmp);
+ if (r)
+ break;
+
+ if (copy_to_user(argp, &dmp, sizeof(dmp))) {
+ r = -EFAULT;
+ break;
+ }
+
+ break;
+ }
+ default:
+ r = -ENOTTY;
+ }
+ if (need_lock)
+ mutex_unlock(&kvm->lock);
+
+ return r;
+}
+
+static int mem_op_validate_common(struct kvm_s390_mem_op *mop, u64 supported_flags)
+{
+ if (mop->flags & ~supported_flags || !mop->size)
+ return -EINVAL;
+ if (mop->size > MEM_OP_MAX_SIZE)
+ return -E2BIG;
+ if (mop->flags & KVM_S390_MEMOP_F_SKEY_PROTECTION) {
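+		/* the access key used for storage-key protection is only 4 bits wide */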
+ if (mop->key > 0xf)
+ return -EINVAL;
+ } else {
+ mop->key = 0;
+ }
+ return 0;
+}
+
+static int kvm_s390_vm_mem_op_abs(struct kvm *kvm, struct kvm_s390_mem_op *mop)
+{
+ void __user *uaddr = (void __user *)mop->buf;
+ enum gacc_mode acc_mode;
+ void *tmpbuf = NULL;
+ int r, srcu_idx;
+
+ r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_SKEY_PROTECTION |
+ KVM_S390_MEMOP_F_CHECK_ONLY);
+ if (r)
+ return r;
+
+ if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) {
+ tmpbuf = vmalloc(mop->size);
+ if (!tmpbuf)
+ return -ENOMEM;
+ }
+
+ srcu_idx = srcu_read_lock(&kvm->srcu);
+
+ if (kvm_is_error_gpa(kvm, mop->gaddr)) {
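+		/* positive program interruption codes are reported back to userspace */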
+ r = PGM_ADDRESSING;
+ goto out_unlock;
+ }
+
+ acc_mode = mop->op == KVM_S390_MEMOP_ABSOLUTE_READ ? GACC_FETCH : GACC_STORE;
+ if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
+ r = check_gpa_range(kvm, mop->gaddr, mop->size, acc_mode, mop->key);
+ goto out_unlock;
+ }
+ if (acc_mode == GACC_FETCH) {
+ r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf,
+ mop->size, GACC_FETCH, mop->key);
+ if (r)
+ goto out_unlock;
+ if (copy_to_user(uaddr, tmpbuf, mop->size))
+ r = -EFAULT;
+ } else {
+ if (copy_from_user(tmpbuf, uaddr, mop->size)) {
+ r = -EFAULT;
+ goto out_unlock;
+ }
+ r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf,
+ mop->size, GACC_STORE, mop->key);
+ }
+
+out_unlock:
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
+
+ vfree(tmpbuf);
+ return r;
+}
+
+static int kvm_s390_vm_mem_op_cmpxchg(struct kvm *kvm, struct kvm_s390_mem_op *mop)
+{
+ void __user *uaddr = (void __user *)mop->buf;
+ void __user *old_addr = (void __user *)mop->old_addr;
+ union {
+ __uint128_t quad;
+ char raw[sizeof(__uint128_t)];
+ } old = { .quad = 0}, new = { .quad = 0 };
+ unsigned int off_in_quad = sizeof(new) - mop->size;
+ int r, srcu_idx;
+ bool success;
+
+ r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_SKEY_PROTECTION);
+ if (r)
+ return r;
+ /*
+ * This validates off_in_quad. Checking that size is a power
+ * of two is not necessary, as cmpxchg_guest_abs_with_key
+ * takes care of that
+ */
+ if (mop->size > sizeof(new))
+ return -EINVAL;
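+	/*
+	 * The mop->size bytes are right-aligned within the 16 byte value;
+	 * on big-endian s390 this makes them the low-order bytes of the
+	 * __uint128_t handed to cmpxchg_guest_abs_with_key().
+	 */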
+ if (copy_from_user(&new.raw[off_in_quad], uaddr, mop->size))
+ return -EFAULT;
+ if (copy_from_user(&old.raw[off_in_quad], old_addr, mop->size))
+ return -EFAULT;
+
+ srcu_idx = srcu_read_lock(&kvm->srcu);
+
+ if (kvm_is_error_gpa(kvm, mop->gaddr)) {
+ r = PGM_ADDRESSING;
+ goto out_unlock;
+ }
+
+ r = cmpxchg_guest_abs_with_key(kvm, mop->gaddr, mop->size, &old.quad,
+ new.quad, mop->key, &success);
+ if (!success && copy_to_user(old_addr, &old.raw[off_in_quad], mop->size))
+ r = -EFAULT;
+
+out_unlock:
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
+ return r;
+}
+
+static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop)
+{
+ /*
+	 * This is technically only a heuristic: if kvm->lock is not
+	 * taken, it is not guaranteed that the vm is/remains non-protected.
+	 * That is fine from a kernel perspective; wrongdoing is detected
+	 * on the access, -EFAULT is returned, and the vm may crash the
+	 * next time it accesses the memory in question.
+	 * There is no sane use case for doing such a switch and a memop
+	 * on two different CPUs at the same time.
+ */
+ if (kvm_s390_pv_get_handle(kvm))
+ return -EINVAL;
+
+ switch (mop->op) {
+ case KVM_S390_MEMOP_ABSOLUTE_READ:
+ case KVM_S390_MEMOP_ABSOLUTE_WRITE:
+ return kvm_s390_vm_mem_op_abs(kvm, mop);
+ case KVM_S390_MEMOP_ABSOLUTE_CMPXCHG:
+ return kvm_s390_vm_mem_op_cmpxchg(kvm, mop);
+ default:
+ return -EINVAL;
+ }
+}
+
+int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
struct kvm *kvm = filp->private_data;
void __user *argp = (void __user *)arg;
@@ -2254,6 +3076,54 @@ long kvm_arch_vm_ioctl(struct file *filp,
mutex_unlock(&kvm->slots_lock);
break;
}
+ case KVM_S390_PV_COMMAND: {
+ struct kvm_pv_cmd args;
+
+ /* protvirt means user cpu state */
+ kvm_s390_set_user_cpu_state_ctrl(kvm);
+ r = 0;
+ if (!is_prot_virt_host()) {
+ r = -EINVAL;
+ break;
+ }
+ if (copy_from_user(&args, argp, sizeof(args))) {
+ r = -EFAULT;
+ break;
+ }
+ if (args.flags) {
+ r = -EINVAL;
+ break;
+ }
+ /* must be called without kvm->lock */
+ r = kvm_s390_handle_pv(kvm, &args);
+ if (copy_to_user(argp, &args, sizeof(args))) {
+ r = -EFAULT;
+ break;
+ }
+ break;
+ }
+ case KVM_S390_MEM_OP: {
+ struct kvm_s390_mem_op mem_op;
+
+ if (copy_from_user(&mem_op, argp, sizeof(mem_op)) == 0)
+ r = kvm_s390_vm_mem_op(kvm, &mem_op);
+ else
+ r = -EFAULT;
+ break;
+ }
+ case KVM_S390_ZPCI_OP: {
+ struct kvm_s390_zpci_op args;
+
+ r = -EINVAL;
+ if (!IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM))
+ break;
+ if (copy_from_user(&args, argp, sizeof(args))) {
+ r = -EFAULT;
+ break;
+ }
+ r = kvm_s390_pci_zpci_op(kvm, &args);
+ break;
+ }
default:
r = -ENOTTY;
}
@@ -2298,12 +3168,26 @@ static void kvm_s390_set_crycb_format(struct kvm *kvm)
kvm->arch.crypto.crycbd |= CRYCB_FORMAT1;
}
+/*
+ * kvm_arch_crypto_set_masks
+ *
+ * @kvm: pointer to the target guest's KVM struct containing the crypto masks
+ * to be set.
+ * @apm: the mask identifying the accessible AP adapters
+ * @aqm: the mask identifying the accessible AP domains
+ * @adm: the mask identifying the accessible AP control domains
+ *
+ * Set the masks that identify the adapters, domains and control domains to
+ * which the KVM guest is granted access.
+ *
+ * Note: The kvm->lock mutex must be locked by the caller before invoking this
+ * function.
+ */
void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm,
unsigned long *aqm, unsigned long *adm)
{
struct kvm_s390_crypto_cb *crycb = kvm->arch.crypto.crycb;
- mutex_lock(&kvm->lock);
kvm_s390_vcpu_block_all(kvm);
switch (kvm->arch.crypto.crycbd & CRYCB_FORMAT_MASK) {
@@ -2334,13 +3218,23 @@ void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm,
/* recreate the shadow crycb for each vcpu */
kvm_s390_sync_request_broadcast(kvm, KVM_REQ_VSIE_RESTART);
kvm_s390_vcpu_unblock_all(kvm);
- mutex_unlock(&kvm->lock);
}
EXPORT_SYMBOL_GPL(kvm_arch_crypto_set_masks);
+/*
+ * kvm_arch_crypto_clear_masks
+ *
+ * @kvm: pointer to the target guest's KVM struct containing the crypto masks
+ * to be cleared.
+ *
+ * Clear the masks that identify the adapters, domains and control domains to
+ * which the KVM guest is granted access.
+ *
+ * Note: The kvm->lock mutex must be locked by the caller before invoking this
+ * function.
+ */
void kvm_arch_crypto_clear_masks(struct kvm *kvm)
{
- mutex_lock(&kvm->lock);
kvm_s390_vcpu_block_all(kvm);
memset(&kvm->arch.crypto.crycb->apcb0, 0,
@@ -2352,7 +3246,6 @@ void kvm_arch_crypto_clear_masks(struct kvm *kvm)
/* recreate the shadow crycb for each vcpu */
kvm_s390_sync_request_broadcast(kvm, KVM_REQ_VSIE_RESTART);
kvm_s390_vcpu_unblock_all(kvm);
- mutex_unlock(&kvm->lock);
}
EXPORT_SYMBOL_GPL(kvm_arch_crypto_clear_masks);
@@ -2369,6 +3262,7 @@ static void kvm_s390_crypto_init(struct kvm *kvm)
{
kvm->arch.crypto.crycb = &kvm->arch.sie_page2->crycb;
kvm_s390_set_crycb_format(kvm);
+ init_rwsem(&kvm->arch.crypto.pqap_hook_rwsem);
if (!test_kvm_facility(kvm, 76))
return;
@@ -2391,9 +3285,17 @@ static void sca_dispose(struct kvm *kvm)
kvm->arch.sca = NULL;
}
+void kvm_arch_free_vm(struct kvm *kvm)
+{
+ if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM))
+ kvm_s390_pci_clear_list(kvm);
+
+ __kvm_arch_free_vm(kvm);
+}
+
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
- gfp_t alloc_flags = GFP_KERNEL;
+ gfp_t alloc_flags = GFP_KERNEL_ACCOUNT;
int i, rc;
char debug_name[16];
static unsigned long sca_offset;
@@ -2438,7 +3340,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
BUILD_BUG_ON(sizeof(struct sie_page2) != 4096);
kvm->arch.sie_page2 =
- (struct sie_page2 *) get_zeroed_page(GFP_KERNEL | GFP_DMA);
+ (struct sie_page2 *) get_zeroed_page(GFP_KERNEL_ACCOUNT | GFP_DMA);
if (!kvm->arch.sie_page2)
goto out_err;
@@ -2446,10 +3348,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm->arch.model.fac_list = kvm->arch.sie_page2->fac_list;
for (i = 0; i < kvm_s390_fac_size(); i++) {
- kvm->arch.model.fac_mask[i] = S390_lowcore.stfle_fac_list[i] &
+ kvm->arch.model.fac_mask[i] = stfle_fac_list[i] &
(kvm_s390_fac_base[i] |
kvm_s390_fac_ext[i]);
- kvm->arch.model.fac_list[i] = S390_lowcore.stfle_fac_list[i] &
+ kvm->arch.model.fac_list[i] = stfle_fac_list[i] &
kvm_s390_fac_base[i];
}
kvm->arch.model.subfuncs = kvm_s390_available_subfunc;
@@ -2471,8 +3373,17 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm->arch.model.cpuid = kvm_s390_get_initial_cpuid();
kvm->arch.model.ibc = sclp.ibc & 0x0fff;
+ kvm->arch.model.uv_feat_guest.feat = 0;
+
kvm_s390_crypto_init(kvm);
+ if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) {
+ mutex_lock(&kvm->lock);
+ kvm_s390_pci_init_list(kvm);
+ kvm_s390_vcpu_pci_enable_interp(kvm);
+ mutex_unlock(&kvm->lock);
+ }
+
mutex_init(&kvm->arch.float_int.ais_lock);
spin_lock_init(&kvm->arch.float_int.lock);
for (i = 0; i < FIRQ_LIST_COUNT; i++)
@@ -2503,7 +3414,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm->arch.use_skf = sclp.has_skey;
spin_lock_init(&kvm->arch.start_stop_lock);
kvm_s390_vsie_init(kvm);
- kvm_s390_gisa_init(kvm);
+ if (use_gisa)
+ kvm_s390_gisa_init(kvm);
+ INIT_LIST_HEAD(&kvm->arch.pv.need_cleanup);
+ kvm->arch.pv.set_aside = NULL;
KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid);
return 0;
@@ -2517,46 +3431,50 @@ out_err:
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
+ u16 rc, rrc;
+
VCPU_EVENT(vcpu, 3, "%s", "free cpu");
trace_kvm_s390_destroy_vcpu(vcpu->vcpu_id);
kvm_s390_clear_local_irqs(vcpu);
kvm_clear_async_pf_completion_queue(vcpu);
if (!kvm_is_ucontrol(vcpu->kvm))
sca_del_vcpu(vcpu);
+ kvm_s390_update_topology_change_report(vcpu->kvm, 1);
if (kvm_is_ucontrol(vcpu->kvm))
gmap_remove(vcpu->arch.gmap);
if (vcpu->kvm->arch.use_cmma)
kvm_s390_vcpu_unsetup_cmma(vcpu);
+	/* We cannot hold the vcpu mutex here; we are already dying */
+ if (kvm_s390_pv_cpu_get_handle(vcpu))
+ kvm_s390_pv_destroy_cpu(vcpu, &rc, &rrc);
free_page((unsigned long)(vcpu->arch.sie_block));
-
- kvm_vcpu_uninit(vcpu);
- kmem_cache_free(kvm_vcpu_cache, vcpu);
-}
-
-static void kvm_free_vcpus(struct kvm *kvm)
-{
- unsigned int i;
- struct kvm_vcpu *vcpu;
-
- kvm_for_each_vcpu(i, vcpu, kvm)
- kvm_arch_vcpu_destroy(vcpu);
-
- mutex_lock(&kvm->lock);
- for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
- kvm->vcpus[i] = NULL;
-
- atomic_set(&kvm->online_vcpus, 0);
- mutex_unlock(&kvm->lock);
}
void kvm_arch_destroy_vm(struct kvm *kvm)
{
- kvm_free_vcpus(kvm);
+ u16 rc, rrc;
+
+ kvm_destroy_vcpus(kvm);
sca_dispose(kvm);
- debug_unregister(kvm->arch.dbf);
kvm_s390_gisa_destroy(kvm);
+ /*
+ * We are already at the end of life and kvm->lock is not taken.
+ * This is ok as the file descriptor is closed by now and nobody
+ * can mess with the pv state.
+ */
+ kvm_s390_pv_deinit_cleanup_all(kvm, &rc, &rrc);
+ /*
+ * Remove the mmu notifier only when the whole KVM VM is torn down,
+ * and only if one was registered to begin with. If the VM is
+	 * currently not protected, but has previously been protected,
+ * then it's possible that the notifier is still registered.
+ */
+ if (kvm->arch.pv.mmu_notifier.ops)
+ mmu_notifier_unregister(&kvm->arch.pv.mmu_notifier, kvm->mm);
+
+ debug_unregister(kvm->arch.dbf);
free_page((unsigned long)kvm->arch.sie_page2);
if (!kvm_is_ucontrol(kvm))
gmap_remove(kvm->arch.gmap);
@@ -2599,28 +3517,30 @@ static void sca_del_vcpu(struct kvm_vcpu *vcpu)
static void sca_add_vcpu(struct kvm_vcpu *vcpu)
{
if (!kvm_s390_use_sca_entries()) {
- struct bsca_block *sca = vcpu->kvm->arch.sca;
+ phys_addr_t sca_phys = virt_to_phys(vcpu->kvm->arch.sca);
/* we still need the basic sca for the ipte control */
- vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32);
- vcpu->arch.sie_block->scaol = (__u32)(__u64)sca;
+ vcpu->arch.sie_block->scaoh = sca_phys >> 32;
+ vcpu->arch.sie_block->scaol = sca_phys;
return;
}
read_lock(&vcpu->kvm->arch.sca_lock);
if (vcpu->kvm->arch.use_esca) {
struct esca_block *sca = vcpu->kvm->arch.sca;
+ phys_addr_t sca_phys = virt_to_phys(sca);
- sca->cpu[vcpu->vcpu_id].sda = (__u64) vcpu->arch.sie_block;
- vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32);
- vcpu->arch.sie_block->scaol = (__u32)(__u64)sca & ~0x3fU;
+ sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block);
+ vcpu->arch.sie_block->scaoh = sca_phys >> 32;
+ vcpu->arch.sie_block->scaol = sca_phys & ESCA_SCAOL_MASK;
vcpu->arch.sie_block->ecb2 |= ECB2_ESCA;
set_bit_inv(vcpu->vcpu_id, (unsigned long *) sca->mcn);
} else {
struct bsca_block *sca = vcpu->kvm->arch.sca;
+ phys_addr_t sca_phys = virt_to_phys(sca);
- sca->cpu[vcpu->vcpu_id].sda = (__u64) vcpu->arch.sie_block;
- vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32);
- vcpu->arch.sie_block->scaol = (__u32)(__u64)sca;
+ sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block);
+ vcpu->arch.sie_block->scaoh = sca_phys >> 32;
+ vcpu->arch.sie_block->scaol = sca_phys;
set_bit_inv(vcpu->vcpu_id, (unsigned long *) &sca->mcn);
}
read_unlock(&vcpu->kvm->arch.sca_lock);
@@ -2649,15 +3569,20 @@ static int sca_switch_to_extended(struct kvm *kvm)
struct bsca_block *old_sca = kvm->arch.sca;
struct esca_block *new_sca;
struct kvm_vcpu *vcpu;
- unsigned int vcpu_idx;
+ unsigned long vcpu_idx;
u32 scaol, scaoh;
+ phys_addr_t new_sca_phys;
- new_sca = alloc_pages_exact(sizeof(*new_sca), GFP_KERNEL|__GFP_ZERO);
+ if (kvm->arch.use_esca)
+ return 0;
+
+ new_sca = alloc_pages_exact(sizeof(*new_sca), GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!new_sca)
return -ENOMEM;
- scaoh = (u32)((u64)(new_sca) >> 32);
- scaol = (u32)(u64)(new_sca) & ~0x3fU;
+ new_sca_phys = virt_to_phys(new_sca);
+ scaoh = new_sca_phys >> 32;
+ scaol = new_sca_phys & ESCA_SCAOL_MASK;
kvm_s390_vcpu_block_all(kvm);
write_lock(&kvm->arch.sca_lock);
@@ -2696,46 +3621,11 @@ static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id)
if (!sclp.has_esca || !sclp.has_64bscao)
return false;
- mutex_lock(&kvm->lock);
rc = kvm->arch.use_esca ? 0 : sca_switch_to_extended(kvm);
- mutex_unlock(&kvm->lock);
return rc == 0 && id < KVM_S390_ESCA_CPU_SLOTS;
}
-int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID;
- kvm_clear_async_pf_completion_queue(vcpu);
- vcpu->run->kvm_valid_regs = KVM_SYNC_PREFIX |
- KVM_SYNC_GPRS |
- KVM_SYNC_ACRS |
- KVM_SYNC_CRS |
- KVM_SYNC_ARCH0 |
- KVM_SYNC_PFAULT;
- kvm_s390_set_prefix(vcpu, 0);
- if (test_kvm_facility(vcpu->kvm, 64))
- vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB;
- if (test_kvm_facility(vcpu->kvm, 82))
- vcpu->run->kvm_valid_regs |= KVM_SYNC_BPBC;
- if (test_kvm_facility(vcpu->kvm, 133))
- vcpu->run->kvm_valid_regs |= KVM_SYNC_GSCB;
- if (test_kvm_facility(vcpu->kvm, 156))
- vcpu->run->kvm_valid_regs |= KVM_SYNC_ETOKEN;
- /* fprs can be synchronized via vrs, even if the guest has no vx. With
- * MACHINE_HAS_VX, (load|store)_fpu_regs() will work with vrs format.
- */
- if (MACHINE_HAS_VX)
- vcpu->run->kvm_valid_regs |= KVM_SYNC_VRS;
- else
- vcpu->run->kvm_valid_regs |= KVM_SYNC_FPRS;
-
- if (kvm_is_ucontrol(vcpu->kvm))
- return __kvm_ucontrol_vcpu_init(vcpu);
-
- return 0;
-}
-
/* needs disabled preemption to protect from TOD sync and vcpu_load/put */
static void __start_cpu_timer_accounting(struct kvm_vcpu *vcpu)
{
@@ -2844,35 +3734,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
}
-static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu)
-{
- /* this equals initial cpu reset in pop, but we don't switch to ESA */
- vcpu->arch.sie_block->gpsw.mask = 0UL;
- vcpu->arch.sie_block->gpsw.addr = 0UL;
- kvm_s390_set_prefix(vcpu, 0);
- kvm_s390_set_cpu_timer(vcpu, 0);
- vcpu->arch.sie_block->ckc = 0UL;
- vcpu->arch.sie_block->todpr = 0;
- memset(vcpu->arch.sie_block->gcr, 0, 16 * sizeof(__u64));
- vcpu->arch.sie_block->gcr[0] = CR0_UNUSED_56 |
- CR0_INTERRUPT_KEY_SUBMASK |
- CR0_MEASUREMENT_ALERT_SUBMASK;
- vcpu->arch.sie_block->gcr[14] = CR14_UNUSED_32 |
- CR14_UNUSED_33 |
- CR14_EXTERNAL_DAMAGE_SUBMASK;
- /* make sure the new fpc will be lazily loaded */
- save_fpu_regs();
- current->thread.fpu.fpc = 0;
- vcpu->arch.sie_block->gbea = 1;
- vcpu->arch.sie_block->pp = 0;
- vcpu->arch.sie_block->fpf &= ~FPF_BPBC;
- vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID;
- kvm_clear_async_pf_completion_queue(vcpu);
- if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm))
- kvm_s390_vcpu_stop(vcpu);
- kvm_s390_clear_local_irqs(vcpu);
-}
-
void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
{
mutex_lock(&vcpu->kvm->lock);
@@ -2941,15 +3802,18 @@ static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu)
void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu)
{
- free_page(vcpu->arch.sie_block->cbrlo);
+ free_page((unsigned long)phys_to_virt(vcpu->arch.sie_block->cbrlo));
vcpu->arch.sie_block->cbrlo = 0;
}
int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu)
{
- vcpu->arch.sie_block->cbrlo = get_zeroed_page(GFP_KERNEL);
- if (!vcpu->arch.sie_block->cbrlo)
+ void *cbrlo_page = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+
+ if (!cbrlo_page)
return -ENOMEM;
+
+ vcpu->arch.sie_block->cbrlo = virt_to_phys(cbrlo_page);
return 0;
}
@@ -2959,12 +3823,13 @@ static void kvm_s390_vcpu_setup_model(struct kvm_vcpu *vcpu)
vcpu->arch.sie_block->ibc = model->ibc;
if (test_kvm_facility(vcpu->kvm, 7))
- vcpu->arch.sie_block->fac = (u32)(u64) model->fac_list;
+ vcpu->arch.sie_block->fac = virt_to_phys(model->fac_list);
}
-int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
+static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu)
{
int rc = 0;
+ u16 uvrc, uvrrc;
atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH |
CPUSTAT_SM |
@@ -2982,8 +3847,12 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
vcpu->arch.sie_block->ecb |= ECB_HOSTPROTINT;
if (test_kvm_facility(vcpu->kvm, 9))
vcpu->arch.sie_block->ecb |= ECB_SRSI;
+ if (test_kvm_facility(vcpu->kvm, 11))
+ vcpu->arch.sie_block->ecb |= ECB_PTF;
if (test_kvm_facility(vcpu->kvm, 73))
vcpu->arch.sie_block->ecb |= ECB_TE;
+ if (!kvm_is_ucontrol(vcpu->kvm))
+ vcpu->arch.sie_block->ecb |= ECB_SPECI;
if (test_kvm_facility(vcpu->kvm, 8) && vcpu->kvm->arch.use_pfmfi)
vcpu->arch.sie_block->ecb2 |= ECB2_PFMFI;
@@ -3011,9 +3880,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
VCPU_EVENT(vcpu, 3, "AIV gisa format-%u enabled for cpu %03u",
vcpu->arch.sie_block->gd & 0x3, vcpu->vcpu_id);
}
- vcpu->arch.sie_block->sdnxo = ((unsigned long) &vcpu->run->s.regs.sdnx)
- | SDNXC;
- vcpu->arch.sie_block->riccbd = (unsigned long) &vcpu->run->s.regs.riccb;
+ vcpu->arch.sie_block->sdnxo = virt_to_phys(&vcpu->run->s.regs.sdnx) | SDNXC;
+ vcpu->arch.sie_block->riccbd = virt_to_phys(&vcpu->run->s.regs.riccb);
if (sclp.has_kss)
kvm_s390_set_cpuflags(vcpu, CPUSTAT_KSS);
@@ -3032,62 +3900,102 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
kvm_s390_vcpu_crypto_setup(vcpu);
+ kvm_s390_vcpu_pci_setup(vcpu);
+
+ mutex_lock(&vcpu->kvm->lock);
+ if (kvm_s390_pv_is_protected(vcpu->kvm)) {
+ rc = kvm_s390_pv_create_cpu(vcpu, &uvrc, &uvrrc);
+ if (rc)
+ kvm_s390_vcpu_unsetup_cmma(vcpu);
+ }
+ mutex_unlock(&vcpu->kvm->lock);
+
return rc;
}
-struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
- unsigned int id)
+int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
{
- struct kvm_vcpu *vcpu;
- struct sie_page *sie_page;
- int rc = -EINVAL;
-
if (!kvm_is_ucontrol(kvm) && !sca_can_add_vcpu(kvm, id))
- goto out;
-
- rc = -ENOMEM;
+ return -EINVAL;
+ return 0;
+}
- vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
- if (!vcpu)
- goto out;
+int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
+{
+ struct sie_page *sie_page;
+ int rc;
BUILD_BUG_ON(sizeof(struct sie_page) != 4096);
- sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL);
+ sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!sie_page)
- goto out_free_cpu;
+ return -ENOMEM;
vcpu->arch.sie_block = &sie_page->sie_block;
- vcpu->arch.sie_block->itdba = (unsigned long) &sie_page->itdb;
+ vcpu->arch.sie_block->itdba = virt_to_phys(&sie_page->itdb);
/* the real guest size will always be smaller than msl */
vcpu->arch.sie_block->mso = 0;
vcpu->arch.sie_block->msl = sclp.hamax;
- vcpu->arch.sie_block->icpua = id;
+ vcpu->arch.sie_block->icpua = vcpu->vcpu_id;
spin_lock_init(&vcpu->arch.local_int.lock);
- vcpu->arch.sie_block->gd = (u32)(u64)kvm->arch.gisa_int.origin;
- if (vcpu->arch.sie_block->gd && sclp.has_gisaf)
- vcpu->arch.sie_block->gd |= GISA_FORMAT1;
+ vcpu->arch.sie_block->gd = kvm_s390_get_gisa_desc(vcpu->kvm);
seqcount_init(&vcpu->arch.cputm_seqcount);
- rc = kvm_vcpu_init(vcpu, kvm, id);
+ vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID;
+ kvm_clear_async_pf_completion_queue(vcpu);
+ vcpu->run->kvm_valid_regs = KVM_SYNC_PREFIX |
+ KVM_SYNC_GPRS |
+ KVM_SYNC_ACRS |
+ KVM_SYNC_CRS |
+ KVM_SYNC_ARCH0 |
+ KVM_SYNC_PFAULT |
+ KVM_SYNC_DIAG318;
+ kvm_s390_set_prefix(vcpu, 0);
+ if (test_kvm_facility(vcpu->kvm, 64))
+ vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB;
+ if (test_kvm_facility(vcpu->kvm, 82))
+ vcpu->run->kvm_valid_regs |= KVM_SYNC_BPBC;
+ if (test_kvm_facility(vcpu->kvm, 133))
+ vcpu->run->kvm_valid_regs |= KVM_SYNC_GSCB;
+ if (test_kvm_facility(vcpu->kvm, 156))
+ vcpu->run->kvm_valid_regs |= KVM_SYNC_ETOKEN;
+ /* fprs can be synchronized via vrs, even if the guest has no vx. With
+ * cpu_has_vx(), (load|store)_fpu_regs() will work with vrs format.
+ */
+ if (cpu_has_vx())
+ vcpu->run->kvm_valid_regs |= KVM_SYNC_VRS;
+ else
+ vcpu->run->kvm_valid_regs |= KVM_SYNC_FPRS;
+
+ if (kvm_is_ucontrol(vcpu->kvm)) {
+ rc = __kvm_ucontrol_vcpu_init(vcpu);
+ if (rc)
+ goto out_free_sie_block;
+ }
+
+ VM_EVENT(vcpu->kvm, 3, "create cpu %d at 0x%pK, sie block at 0x%pK",
+ vcpu->vcpu_id, vcpu, vcpu->arch.sie_block);
+ trace_kvm_s390_create_vcpu(vcpu->vcpu_id, vcpu, vcpu->arch.sie_block);
+
+ rc = kvm_s390_vcpu_setup(vcpu);
if (rc)
- goto out_free_sie_block;
- VM_EVENT(kvm, 3, "create cpu %d at 0x%pK, sie block at 0x%pK", id, vcpu,
- vcpu->arch.sie_block);
- trace_kvm_s390_create_vcpu(id, vcpu, vcpu->arch.sie_block);
+ goto out_ucontrol_uninit;
- return vcpu;
+ kvm_s390_update_topology_change_report(vcpu->kvm, 1);
+ return 0;
+
+out_ucontrol_uninit:
+ if (kvm_is_ucontrol(vcpu->kvm))
+ gmap_remove(vcpu->arch.gmap);
out_free_sie_block:
free_page((unsigned long)(vcpu->arch.sie_block));
-out_free_cpu:
- kmem_cache_free(kvm_vcpu_cache, vcpu);
-out:
- return ERR_PTR(rc);
+ return rc;
}
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
+ clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.gisa_int.kicked_mask);
return kvm_s390_vcpu_has_irq(vcpu, 0);
}
@@ -3139,7 +4047,7 @@ void exit_sie(struct kvm_vcpu *vcpu)
/* Kick a guest cpu out of SIE to process a request synchronously */
void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu)
{
- kvm_make_request(req, vcpu);
+ __kvm_make_request(req, vcpu);
kvm_s390_vcpu_request(vcpu);
}
@@ -3149,7 +4057,9 @@ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
struct kvm *kvm = gmap->private;
struct kvm_vcpu *vcpu;
unsigned long prefix;
- int i;
+ unsigned long i;
+
+ trace_kvm_s390_gmap_notifier(start, end, gmap_is_shadow(gmap));
if (gmap_is_shadow(gmap))
return;
@@ -3162,7 +4072,7 @@ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
if (prefix <= end && start <= prefix + 2*PAGE_SIZE - 1) {
VCPU_EVENT(vcpu, 2, "gmap notifier for %lx-%lx",
start, end);
- kvm_s390_sync_request(KVM_REQ_MMU_RELOAD, vcpu);
+ kvm_s390_sync_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu);
}
}
}
@@ -3171,7 +4081,7 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
{
/* do not poll with more than halt_poll_max_steal percent of steal time */
if (S390_lowcore.avg_steal_timer * 100 / (TICK_USEC << 12) >=
- halt_poll_max_steal) {
+ READ_ONCE(halt_poll_max_steal)) {
vcpu->stat.halt_no_poll_steal++;
return true;
}
@@ -3287,10 +4197,76 @@ static int kvm_arch_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu,
return r;
}
-static int kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu)
+static void kvm_arch_vcpu_ioctl_normal_reset(struct kvm_vcpu *vcpu)
{
- kvm_s390_vcpu_initial_reset(vcpu);
- return 0;
+ vcpu->arch.sie_block->gpsw.mask &= ~PSW_MASK_RI;
+ vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID;
+ memset(vcpu->run->s.regs.riccb, 0, sizeof(vcpu->run->s.regs.riccb));
+
+ kvm_clear_async_pf_completion_queue(vcpu);
+ if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm))
+ kvm_s390_vcpu_stop(vcpu);
+ kvm_s390_clear_local_irqs(vcpu);
+}
+
+static void kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu)
+{
+ /* Initial reset is a superset of the normal reset */
+ kvm_arch_vcpu_ioctl_normal_reset(vcpu);
+
+ /*
+ * This equals initial cpu reset in pop, but we don't switch to ESA.
+ * We do not only reset the internal data, but also ...
+ */
+ vcpu->arch.sie_block->gpsw.mask = 0;
+ vcpu->arch.sie_block->gpsw.addr = 0;
+ kvm_s390_set_prefix(vcpu, 0);
+ kvm_s390_set_cpu_timer(vcpu, 0);
+ vcpu->arch.sie_block->ckc = 0;
+ memset(vcpu->arch.sie_block->gcr, 0, sizeof(vcpu->arch.sie_block->gcr));
+ vcpu->arch.sie_block->gcr[0] = CR0_INITIAL_MASK;
+ vcpu->arch.sie_block->gcr[14] = CR14_INITIAL_MASK;
+
+ /* ... the data in sync regs */
+ memset(vcpu->run->s.regs.crs, 0, sizeof(vcpu->run->s.regs.crs));
+ vcpu->run->s.regs.ckc = 0;
+ vcpu->run->s.regs.crs[0] = CR0_INITIAL_MASK;
+ vcpu->run->s.regs.crs[14] = CR14_INITIAL_MASK;
+ vcpu->run->psw_addr = 0;
+ vcpu->run->psw_mask = 0;
+ vcpu->run->s.regs.todpr = 0;
+ vcpu->run->s.regs.cputm = 0;
+ vcpu->run->s.regs.ckc = 0;
+ vcpu->run->s.regs.pp = 0;
+ vcpu->run->s.regs.gbea = 1;
+ vcpu->run->s.regs.fpc = 0;
+ /*
+ * Do not reset these registers in the protected case, as some of
+ * them are overlaid and they are not accessible in this case
+ * anyway.
+ */
+ if (!kvm_s390_pv_cpu_is_protected(vcpu)) {
+ vcpu->arch.sie_block->gbea = 1;
+ vcpu->arch.sie_block->pp = 0;
+ vcpu->arch.sie_block->fpf &= ~FPF_BPBC;
+ vcpu->arch.sie_block->todpr = 0;
+ }
+}
+
+static void kvm_arch_vcpu_ioctl_clear_reset(struct kvm_vcpu *vcpu)
+{
+ struct kvm_sync_regs *regs = &vcpu->run->s.regs;
+
+ /* Clear reset is a superset of the initial reset */
+ kvm_arch_vcpu_ioctl_initial_reset(vcpu);
+
+ memset(&regs->gprs, 0, sizeof(regs->gprs));
+ memset(&regs->vrs, 0, sizeof(regs->vrs));
+ memset(&regs->acrs, 0, sizeof(regs->acrs));
+ memset(&regs->gscb, 0, sizeof(regs->gscb));
+
+ regs->etoken = 0;
+ regs->etoken_extension = 0;
}
int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
@@ -3339,18 +4315,13 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
vcpu_load(vcpu);
- if (test_fp_ctl(fpu->fpc)) {
- ret = -EINVAL;
- goto out;
- }
vcpu->run->s.regs.fpc = fpu->fpc;
- if (MACHINE_HAS_VX)
+ if (cpu_has_vx())
convert_fp_to_vx((__vector128 *) vcpu->run->s.regs.vrs,
(freg_t *) fpu->fprs);
else
memcpy(vcpu->run->s.regs.fprs, &fpu->fprs, sizeof(fpu->fprs));
-out:
vcpu_put(vcpu);
return ret;
}
@@ -3359,9 +4330,7 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
vcpu_load(vcpu);
- /* make sure we have the latest values */
- save_fpu_regs();
- if (MACHINE_HAS_VX)
+ if (cpu_has_vx())
convert_vx_to_fp((freg_t *) fpu->fprs,
(__vector128 *) vcpu->run->s.regs.vrs);
else
@@ -3460,18 +4429,24 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
vcpu_load(vcpu);
/* user space knows about this interface - let it control the state */
- vcpu->kvm->arch.user_cpu_state_ctrl = 1;
+ kvm_s390_set_user_cpu_state_ctrl(vcpu->kvm);
switch (mp_state->mp_state) {
case KVM_MP_STATE_STOPPED:
- kvm_s390_vcpu_stop(vcpu);
+ rc = kvm_s390_vcpu_stop(vcpu);
break;
case KVM_MP_STATE_OPERATING:
- kvm_s390_vcpu_start(vcpu);
+ rc = kvm_s390_vcpu_start(vcpu);
break;
case KVM_MP_STATE_LOAD:
+ if (!kvm_s390_pv_cpu_is_protected(vcpu)) {
+ rc = -ENXIO;
+ break;
+ }
+ rc = kvm_s390_pv_set_cpu_state(vcpu, PV_CPU_STATE_OPR_LOAD);
+ break;
case KVM_MP_STATE_CHECK_STOP:
- /* fall through - CHECK_STOP and LOAD are not supported yet */
+ fallthrough; /* CHECK_STOP and LOAD are not supported yet */
default:
rc = -ENXIO;
}
@@ -3492,19 +4467,19 @@ retry:
if (!kvm_request_pending(vcpu))
return 0;
/*
- * We use MMU_RELOAD just to re-arm the ipte notifier for the
+ * If the guest prefix changed, re-arm the ipte notifier for the
* guest prefix page. gmap_mprotect_notify will wait on the ptl lock.
* This ensures that the ipte instruction for this request has
* already finished. We might race against a second unmapper that
	 * wants to set the blocking bit. Let's just retry the request loop.
*/
- if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) {
+ if (kvm_check_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu)) {
int rc;
rc = gmap_mprotect_notify(vcpu->arch.gmap,
kvm_s390_get_prefix(vcpu),
PAGE_SIZE * 2, PROT_WRITE);
if (rc) {
- kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
+ kvm_make_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu);
return rc;
}
goto retry;
@@ -3557,30 +4532,26 @@ retry:
goto retry;
}
- /* nothing to do, just clear the request */
- kvm_clear_request(KVM_REQ_UNHALT, vcpu);
/* we left the vsie handler, nothing to do, just clear the request */
kvm_clear_request(KVM_REQ_VSIE_RESTART, vcpu);
return 0;
}
-void kvm_s390_set_tod_clock(struct kvm *kvm,
- const struct kvm_s390_vm_tod_clock *gtod)
+static void __kvm_s390_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod)
{
struct kvm_vcpu *vcpu;
- struct kvm_s390_tod_clock_ext htod;
- int i;
+ union tod_clock clk;
+ unsigned long i;
- mutex_lock(&kvm->lock);
preempt_disable();
- get_tod_clock_ext((char *)&htod);
+ store_tod_clock_ext(&clk);
- kvm->arch.epoch = gtod->tod - htod.tod;
+ kvm->arch.epoch = gtod->tod - clk.tod;
kvm->arch.epdx = 0;
if (test_kvm_facility(kvm, 139)) {
- kvm->arch.epdx = gtod->epoch_idx - htod.epoch_idx;
+ kvm->arch.epdx = gtod->epoch_idx - clk.ei;
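+		/* the 64-bit TOD subtraction wrapped around, borrow one from the epoch index */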
if (kvm->arch.epoch > gtod->tod)
kvm->arch.epdx -= 1;
}
@@ -3593,7 +4564,15 @@ void kvm_s390_set_tod_clock(struct kvm *kvm,
kvm_s390_vcpu_unblock_all(kvm);
preempt_enable();
+}
+
+int kvm_s390_try_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod)
+{
+ if (!mutex_trylock(&kvm->lock))
+ return 0;
+ __kvm_s390_set_tod_clock(kvm, gtod);
mutex_unlock(&kvm->lock);
+ return 1;
}
/**
@@ -3629,11 +4608,13 @@ static void __kvm_inject_pfault_token(struct kvm_vcpu *vcpu, bool start_token,
}
}
-void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
+bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work)
{
trace_kvm_s390_pfault_init(vcpu, work->arch.pfault_token);
__kvm_inject_pfault_token(vcpu, true, work->arch.pfault_token);
+
+ return true;
}
void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
@@ -3649,7 +4630,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
/* s390 will always inject the page directly */
}
-bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
+bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
{
/*
* s390 will always inject the page directly,
@@ -3658,33 +4639,31 @@ bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
return true;
}
-static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu)
+static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu)
{
hva_t hva;
struct kvm_arch_async_pf arch;
- int rc;
if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID)
- return 0;
+ return false;
if ((vcpu->arch.sie_block->gpsw.mask & vcpu->arch.pfault_select) !=
vcpu->arch.pfault_compare)
- return 0;
+ return false;
if (psw_extint_disabled(vcpu))
- return 0;
+ return false;
if (kvm_s390_vcpu_has_irq(vcpu, 0))
- return 0;
+ return false;
if (!(vcpu->arch.sie_block->gcr[0] & CR0_SERVICE_SIGNAL_SUBMASK))
- return 0;
+ return false;
if (!vcpu->arch.gmap->pfault_enabled)
- return 0;
+ return false;
hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(current->thread.gmap_addr));
hva += current->thread.gmap_addr & ~PAGE_MASK;
if (read_guest_real(vcpu, vcpu->arch.pfault_token, &arch.pfault_token, 8))
- return 0;
+ return false;
- rc = kvm_setup_async_pf(vcpu, current->thread.gmap_addr, hva, &arch);
- return rc;
+ return kvm_setup_async_pf(vcpu, current->thread.gmap_addr, hva, &arch);
}
static int vcpu_pre_run(struct kvm_vcpu *vcpu)
@@ -3704,12 +4683,9 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu)
if (need_resched())
schedule();
- if (test_cpu_flag(CIF_MCCK_PENDING))
- s390_handle_mcck();
-
if (!kvm_is_ucontrol(vcpu->kvm)) {
rc = kvm_s390_deliver_pending_interrupts(vcpu);
- if (rc)
+ if (rc || guestdbg_exit_pending(vcpu))
return rc;
}
@@ -3722,7 +4698,7 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu)
kvm_s390_patch_guest_per_regs(vcpu);
}
- clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.gisa_int.kicked_mask);
+ clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.gisa_int.kicked_mask);
vcpu->arch.sie_block->icptcode = 0;
cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags);
@@ -3816,27 +4792,30 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
current->thread.gmap_pfault = 0;
if (kvm_arch_setup_async_pf(vcpu))
return 0;
+ vcpu->stat.pfault_sync++;
return kvm_arch_fault_in_page(vcpu, current->thread.gmap_addr, 1);
}
return vcpu_post_run_fault_in_sie(vcpu);
}
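+/* PSW bits that mask external, I/O and machine check interruptions */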
+#define PSW_INT_MASK (PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_MCHECK)
static int __vcpu_run(struct kvm_vcpu *vcpu)
{
int rc, exit_reason;
+ struct sie_page *sie_page = (struct sie_page *)vcpu->arch.sie_block;
/*
* We try to hold kvm->srcu during most of vcpu_run (except when run-
* ning the guest), so that memslots (and other stuff) are protected
*/
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_vcpu_srcu_read_lock(vcpu);
do {
rc = vcpu_pre_run(vcpu);
- if (rc)
+ if (rc || guestdbg_exit_pending(vcpu))
break;
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+ kvm_vcpu_srcu_read_unlock(vcpu);
/*
* As PF_VCPU will be used in fault handler, between
* guest_enter and guest_exit should be no uaccess.
@@ -3845,23 +4824,46 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
guest_enter_irqoff();
__disable_cpu_timer_accounting(vcpu);
local_irq_enable();
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ memcpy(sie_page->pv_grregs,
+ vcpu->run->s.regs.gprs,
+ sizeof(sie_page->pv_grregs));
+ }
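+		/* make sure the guest FP/VX state hooked in by sync_regs() is loaded before entering SIE */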
+ if (test_cpu_flag(CIF_FPU))
+ load_fpu_regs();
exit_reason = sie64a(vcpu->arch.sie_block,
vcpu->run->s.regs.gprs);
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ memcpy(vcpu->run->s.regs.gprs,
+ sie_page->pv_grregs,
+ sizeof(sie_page->pv_grregs));
+ /*
+ * We're not allowed to inject interrupts on intercepts
+ * that leave the guest state in an "in-between" state
+ * where the next SIE entry will do a continuation.
+ * Fence interrupts in our "internal" PSW.
+ */
+ if (vcpu->arch.sie_block->icptcode == ICPT_PV_INSTR ||
+ vcpu->arch.sie_block->icptcode == ICPT_PV_PREF) {
+ vcpu->arch.sie_block->gpsw.mask &= ~PSW_INT_MASK;
+ }
+ }
local_irq_disable();
__enable_cpu_timer_accounting(vcpu);
guest_exit_irqoff();
local_irq_enable();
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_vcpu_srcu_read_lock(vcpu);
rc = vcpu_post_run(vcpu, exit_reason);
} while (!signal_pending(current) && !guestdbg_exit_pending(vcpu) && !rc);
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+ kvm_vcpu_srcu_read_unlock(vcpu);
return rc;
}
-static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static void sync_regs_fmt2(struct kvm_vcpu *vcpu)
{
+ struct kvm_run *kvm_run = vcpu->run;
struct runtime_instr_cb *riccb;
struct gs_cb *gscb;
@@ -3869,16 +4871,7 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
gscb = (struct gs_cb *) &kvm_run->s.regs.gscb;
vcpu->arch.sie_block->gpsw.mask = kvm_run->psw_mask;
vcpu->arch.sie_block->gpsw.addr = kvm_run->psw_addr;
- if (kvm_run->kvm_dirty_regs & KVM_SYNC_PREFIX)
- kvm_s390_set_prefix(vcpu, kvm_run->s.regs.prefix);
- if (kvm_run->kvm_dirty_regs & KVM_SYNC_CRS) {
- memcpy(&vcpu->arch.sie_block->gcr, &kvm_run->s.regs.crs, 128);
- /* some control register changes require a tlb flush */
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
- }
if (kvm_run->kvm_dirty_regs & KVM_SYNC_ARCH0) {
- kvm_s390_set_cpu_timer(vcpu, kvm_run->s.regs.cputm);
- vcpu->arch.sie_block->ckc = kvm_run->s.regs.ckc;
vcpu->arch.sie_block->todpr = kvm_run->s.regs.todpr;
vcpu->arch.sie_block->pp = kvm_run->s.regs.pp;
vcpu->arch.sie_block->gbea = kvm_run->s.regs.gbea;
@@ -3890,6 +4883,11 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID)
kvm_clear_async_pf_completion_queue(vcpu);
}
+ if (kvm_run->kvm_dirty_regs & KVM_SYNC_DIAG318) {
+ vcpu->arch.diag318_info.val = kvm_run->s.regs.diag318;
+ vcpu->arch.sie_block->cpnc = vcpu->arch.diag318_info.cpnc;
+ VCPU_EVENT(vcpu, 3, "setting cpnc to %d", vcpu->arch.diag318_info.cpnc);
+ }
/*
* If userspace sets the riccb (e.g. after migration) to a valid state,
* we should enable RI here instead of doing the lazy enablement.
@@ -3919,23 +4917,9 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
vcpu->arch.sie_block->fpf &= ~FPF_BPBC;
vcpu->arch.sie_block->fpf |= kvm_run->s.regs.bpbc ? FPF_BPBC : 0;
}
- save_access_regs(vcpu->arch.host_acrs);
- restore_access_regs(vcpu->run->s.regs.acrs);
- /* save host (userspace) fprs/vrs */
- save_fpu_regs();
- vcpu->arch.host_fpregs.fpc = current->thread.fpu.fpc;
- vcpu->arch.host_fpregs.regs = current->thread.fpu.regs;
- if (MACHINE_HAS_VX)
- current->thread.fpu.regs = vcpu->run->s.regs.vrs;
- else
- current->thread.fpu.regs = vcpu->run->s.regs.fprs;
- current->thread.fpu.fpc = vcpu->run->s.regs.fpc;
- if (test_fp_ctl(current->thread.fpu.fpc))
- /* User space provided an invalid FPC, let's clear it */
- current->thread.fpu.fpc = 0;
if (MACHINE_HAS_GS) {
preempt_disable();
- __ctl_set_bit(2, 4);
+ local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT);
if (current->thread.gs_cb) {
vcpu->arch.host_gscb = current->thread.gs_cb;
save_gs_cb(vcpu->arch.host_gscb);
@@ -3948,25 +4932,93 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
preempt_enable();
}
/* SIE will load etoken directly from SDNX and therefore kvm_run */
+}
+
+static void sync_regs(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *kvm_run = vcpu->run;
+
+ if (kvm_run->kvm_dirty_regs & KVM_SYNC_PREFIX)
+ kvm_s390_set_prefix(vcpu, kvm_run->s.regs.prefix);
+ if (kvm_run->kvm_dirty_regs & KVM_SYNC_CRS) {
+ memcpy(&vcpu->arch.sie_block->gcr, &kvm_run->s.regs.crs, 128);
+ /* some control register changes require a tlb flush */
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ }
+ if (kvm_run->kvm_dirty_regs & KVM_SYNC_ARCH0) {
+ kvm_s390_set_cpu_timer(vcpu, kvm_run->s.regs.cputm);
+ vcpu->arch.sie_block->ckc = kvm_run->s.regs.ckc;
+ }
+ save_access_regs(vcpu->arch.host_acrs);
+ restore_access_regs(vcpu->run->s.regs.acrs);
+ /* save host (userspace) fprs/vrs */
+ save_fpu_regs();
+ vcpu->arch.host_fpregs.fpc = current->thread.fpu.fpc;
+ vcpu->arch.host_fpregs.regs = current->thread.fpu.regs;
+ if (cpu_has_vx())
+ current->thread.fpu.regs = vcpu->run->s.regs.vrs;
+ else
+ current->thread.fpu.regs = vcpu->run->s.regs.fprs;
+ current->thread.fpu.fpc = vcpu->run->s.regs.fpc;
+
+ /* Sync fmt2 only data */
+ if (likely(!kvm_s390_pv_cpu_is_protected(vcpu))) {
+ sync_regs_fmt2(vcpu);
+ } else {
+ /*
+ * In several places we have to modify our internal view to
+ * not do things that are disallowed by the ultravisor. For
+ * example we must not inject interrupts after specific exits
+ * (e.g. 112 prefix page not secure). We do this by turning
+ * off the machine check, external and I/O interrupt bits
+ * of our PSW copy. To avoid getting validity intercepts, we
+	 * only accept the condition code from userspace.
+ */
+ vcpu->arch.sie_block->gpsw.mask &= ~PSW_MASK_CC;
+ vcpu->arch.sie_block->gpsw.mask |= kvm_run->psw_mask &
+ PSW_MASK_CC;
+ }
kvm_run->kvm_dirty_regs = 0;
}
-static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static void store_regs_fmt2(struct kvm_vcpu *vcpu)
{
+ struct kvm_run *kvm_run = vcpu->run;
+
+ kvm_run->s.regs.todpr = vcpu->arch.sie_block->todpr;
+ kvm_run->s.regs.pp = vcpu->arch.sie_block->pp;
+ kvm_run->s.regs.gbea = vcpu->arch.sie_block->gbea;
+ kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC;
+ kvm_run->s.regs.diag318 = vcpu->arch.diag318_info.val;
+ if (MACHINE_HAS_GS) {
+ preempt_disable();
+ local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT);
+ if (vcpu->arch.gs_enabled)
+ save_gs_cb(current->thread.gs_cb);
+ current->thread.gs_cb = vcpu->arch.host_gscb;
+ restore_gs_cb(vcpu->arch.host_gscb);
+ if (!vcpu->arch.host_gscb)
+ local_ctl_clear_bit(2, CR2_GUARDED_STORAGE_BIT);
+ vcpu->arch.host_gscb = NULL;
+ preempt_enable();
+ }
+ /* SIE will save etoken directly into SDNX and therefore kvm_run */
+}
+
+static void store_regs(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *kvm_run = vcpu->run;
+
kvm_run->psw_mask = vcpu->arch.sie_block->gpsw.mask;
kvm_run->psw_addr = vcpu->arch.sie_block->gpsw.addr;
kvm_run->s.regs.prefix = kvm_s390_get_prefix(vcpu);
memcpy(&kvm_run->s.regs.crs, &vcpu->arch.sie_block->gcr, 128);
kvm_run->s.regs.cputm = kvm_s390_get_cpu_timer(vcpu);
kvm_run->s.regs.ckc = vcpu->arch.sie_block->ckc;
- kvm_run->s.regs.todpr = vcpu->arch.sie_block->todpr;
- kvm_run->s.regs.pp = vcpu->arch.sie_block->pp;
- kvm_run->s.regs.gbea = vcpu->arch.sie_block->gbea;
kvm_run->s.regs.pft = vcpu->arch.pfault_token;
kvm_run->s.regs.pfs = vcpu->arch.pfault_select;
kvm_run->s.regs.pfc = vcpu->arch.pfault_compare;
- kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC;
save_access_regs(vcpu->run->s.regs.acrs);
restore_access_regs(vcpu->arch.host_acrs);
/* Save guest register state */
@@ -3975,25 +5027,24 @@ static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
/* Restore will be done lazily at return */
current->thread.fpu.fpc = vcpu->arch.host_fpregs.fpc;
current->thread.fpu.regs = vcpu->arch.host_fpregs.regs;
- if (MACHINE_HAS_GS) {
- __ctl_set_bit(2, 4);
- if (vcpu->arch.gs_enabled)
- save_gs_cb(current->thread.gs_cb);
- preempt_disable();
- current->thread.gs_cb = vcpu->arch.host_gscb;
- restore_gs_cb(vcpu->arch.host_gscb);
- preempt_enable();
- if (!vcpu->arch.host_gscb)
- __ctl_clear_bit(2, 4);
- vcpu->arch.host_gscb = NULL;
- }
- /* SIE will save etoken directly into SDNX and therefore kvm_run */
+ if (likely(!kvm_s390_pv_cpu_is_protected(vcpu)))
+ store_regs_fmt2(vcpu);
}
-int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
+ struct kvm_run *kvm_run = vcpu->run;
int rc;
+ /*
+ * Running a VM while dumping always has the potential to
+ * produce inconsistent dump data. But for PV vcpus a SIE
+ * entry while dumping could also lead to a fatal validity
+ * intercept which we absolutely want to avoid.
+ */
+ if (vcpu->kvm->arch.pv.dumping)
+ return -EINVAL;
+
if (kvm_run->immediate_exit)
return -EINTR;
@@ -4011,6 +5062,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
kvm_sigset_activate(vcpu);
+ /*
+ * no need to check the return value of vcpu_start as it can only have
+ * an error for protvirt, but protvirt means user cpu state
+ */
if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) {
kvm_s390_vcpu_start(vcpu);
} else if (is_vcpu_stopped(vcpu)) {
@@ -4020,7 +5075,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
goto out;
}
- sync_regs(vcpu, kvm_run);
+ sync_regs(vcpu);
enable_cpu_timer_accounting(vcpu);
might_fault();
@@ -4042,7 +5097,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
disable_cpu_timer_accounting(vcpu);
- store_regs(vcpu, kvm_run);
+ store_regs(vcpu);
kvm_sigset_deactivate(vcpu);
@@ -4079,7 +5134,7 @@ int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long gpa)
gpa -= __LC_FPREGS_SAVE_AREA;
/* manually convert vector registers if necessary */
- if (MACHINE_HAS_VX) {
+ if (cpu_has_vx()) {
convert_vx_to_fp(fprs, (__vector128 *) vcpu->run->s.regs.vrs);
rc = write_guest_abs(vcpu, gpa + __LC_FPREGS_SAVE_AREA,
fprs, 128);
@@ -4132,7 +5187,7 @@ static void __disable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
static void __disable_ibs_on_all_vcpus(struct kvm *kvm)
{
- unsigned int i;
+ unsigned long i;
struct kvm_vcpu *vcpu;
kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -4148,20 +5203,29 @@ static void __enable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
kvm_s390_sync_request(KVM_REQ_ENABLE_IBS, vcpu);
}
-void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu)
+int kvm_s390_vcpu_start(struct kvm_vcpu *vcpu)
{
- int i, online_vcpus, started_vcpus = 0;
+ int i, online_vcpus, r = 0, started_vcpus = 0;
if (!is_vcpu_stopped(vcpu))
- return;
+ return 0;
trace_kvm_s390_vcpu_start_stop(vcpu->vcpu_id, 1);
/* Only one cpu at a time may enter/leave the STOPPED state. */
spin_lock(&vcpu->kvm->arch.start_stop_lock);
online_vcpus = atomic_read(&vcpu->kvm->online_vcpus);
+ /* Let's tell the UV that we want to change into the operating state */
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ r = kvm_s390_pv_set_cpu_state(vcpu, PV_CPU_STATE_OPR);
+ if (r) {
+ spin_unlock(&vcpu->kvm->arch.start_stop_lock);
+ return r;
+ }
+ }
+
for (i = 0; i < online_vcpus; i++) {
- if (!is_vcpu_stopped(vcpu->kvm->vcpus[i]))
+ if (!is_vcpu_stopped(kvm_get_vcpu(vcpu->kvm, i)))
started_vcpus++;
}
@@ -4172,44 +5236,67 @@ void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu)
/*
* As we are starting a second VCPU, we have to disable
* the IBS facility on all VCPUs to remove potentially
- * oustanding ENABLE requests.
+ * outstanding ENABLE requests.
*/
__disable_ibs_on_all_vcpus(vcpu->kvm);
}
kvm_s390_clear_cpuflags(vcpu, CPUSTAT_STOPPED);
/*
+ * The real PSW might have changed due to a RESTART interpreted by the
+ * ultravisor. We block all interrupts and let the next sie exit
+ * refresh our view.
+ */
+ if (kvm_s390_pv_cpu_is_protected(vcpu))
+ vcpu->arch.sie_block->gpsw.mask &= ~PSW_INT_MASK;
+ /*
* Another VCPU might have used IBS while we were offline.
* Let's play safe and flush the VCPU at startup.
*/
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
spin_unlock(&vcpu->kvm->arch.start_stop_lock);
- return;
+ return 0;
}
-void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu)
+int kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu)
{
- int i, online_vcpus, started_vcpus = 0;
+ int i, online_vcpus, r = 0, started_vcpus = 0;
struct kvm_vcpu *started_vcpu = NULL;
if (is_vcpu_stopped(vcpu))
- return;
+ return 0;
trace_kvm_s390_vcpu_start_stop(vcpu->vcpu_id, 0);
/* Only one cpu at a time may enter/leave the STOPPED state. */
spin_lock(&vcpu->kvm->arch.start_stop_lock);
online_vcpus = atomic_read(&vcpu->kvm->online_vcpus);
- /* SIGP STOP and SIGP STOP AND STORE STATUS has been fully processed */
- kvm_s390_clear_stop_irq(vcpu);
+ /* Let's tell the UV that we want to change into the stopped state */
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ r = kvm_s390_pv_set_cpu_state(vcpu, PV_CPU_STATE_STP);
+ if (r) {
+ spin_unlock(&vcpu->kvm->arch.start_stop_lock);
+ return r;
+ }
+ }
+ /*
+ * Set the VCPU to STOPPED and THEN clear the interrupt flag,
+ * now that the SIGP STOP and SIGP STOP AND STORE STATUS orders
+ * have been fully processed. This will ensure that the VCPU
+ * is kept BUSY if another VCPU is inquiring with SIGP SENSE.
+ */
kvm_s390_set_cpuflags(vcpu, CPUSTAT_STOPPED);
+ kvm_s390_clear_stop_irq(vcpu);
+
__disable_ibs_on_vcpu(vcpu);
for (i = 0; i < online_vcpus; i++) {
- if (!is_vcpu_stopped(vcpu->kvm->vcpus[i])) {
+ struct kvm_vcpu *tmp = kvm_get_vcpu(vcpu->kvm, i);
+
+ if (!is_vcpu_stopped(tmp)) {
started_vcpus++;
- started_vcpu = vcpu->kvm->vcpus[i];
+ started_vcpu = tmp;
}
}
@@ -4222,7 +5309,7 @@ void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu)
}
spin_unlock(&vcpu->kvm->arch.start_stop_lock);
- return;
+ return 0;
}
static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
@@ -4249,64 +5336,116 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
return r;
}
-static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu,
+static long kvm_s390_vcpu_sida_op(struct kvm_vcpu *vcpu,
struct kvm_s390_mem_op *mop)
{
void __user *uaddr = (void __user *)mop->buf;
- void *tmpbuf = NULL;
- int r, srcu_idx;
- const u64 supported_flags = KVM_S390_MEMOP_F_INJECT_EXCEPTION
- | KVM_S390_MEMOP_F_CHECK_ONLY;
+ void *sida_addr;
+ int r = 0;
- if (mop->flags & ~supported_flags || mop->ar >= NUM_ACRS || !mop->size)
+ if (mop->flags || !mop->size)
return -EINVAL;
-
- if (mop->size > MEM_OP_MAX_SIZE)
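+	/* size + sida_offset must not wrap around */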
+ if (mop->size + mop->sida_offset < mop->size)
+ return -EINVAL;
+ if (mop->size + mop->sida_offset > sida_size(vcpu->arch.sie_block))
return -E2BIG;
+ if (!kvm_s390_pv_cpu_is_protected(vcpu))
+ return -EINVAL;
+
+ sida_addr = (char *)sida_addr(vcpu->arch.sie_block) + mop->sida_offset;
+ switch (mop->op) {
+ case KVM_S390_MEMOP_SIDA_READ:
+ if (copy_to_user(uaddr, sida_addr, mop->size))
+ r = -EFAULT;
+
+ break;
+ case KVM_S390_MEMOP_SIDA_WRITE:
+ if (copy_from_user(sida_addr, uaddr, mop->size))
+ r = -EFAULT;
+ break;
+ }
+ return r;
+}
+
+static long kvm_s390_vcpu_mem_op(struct kvm_vcpu *vcpu,
+ struct kvm_s390_mem_op *mop)
+{
+ void __user *uaddr = (void __user *)mop->buf;
+ enum gacc_mode acc_mode;
+ void *tmpbuf = NULL;
+ int r;
+
+ r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_INJECT_EXCEPTION |
+ KVM_S390_MEMOP_F_CHECK_ONLY |
+ KVM_S390_MEMOP_F_SKEY_PROTECTION);
+ if (r)
+ return r;
+ if (mop->ar >= NUM_ACRS)
+ return -EINVAL;
+ if (kvm_s390_pv_cpu_is_protected(vcpu))
+ return -EINVAL;
if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) {
tmpbuf = vmalloc(mop->size);
if (!tmpbuf)
return -ENOMEM;
}
+ acc_mode = mop->op == KVM_S390_MEMOP_LOGICAL_READ ? GACC_FETCH : GACC_STORE;
+ if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
+ r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size,
+ acc_mode, mop->key);
+ goto out_inject;
+ }
+ if (acc_mode == GACC_FETCH) {
+ r = read_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf,
+ mop->size, mop->key);
+ if (r)
+ goto out_inject;
+ if (copy_to_user(uaddr, tmpbuf, mop->size)) {
+ r = -EFAULT;
+ goto out_free;
+ }
+ } else {
+ if (copy_from_user(tmpbuf, uaddr, mop->size)) {
+ r = -EFAULT;
+ goto out_free;
+ }
+ r = write_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf,
+ mop->size, mop->key);
+ }
+
+out_inject:
+ if (r > 0 && (mop->flags & KVM_S390_MEMOP_F_INJECT_EXCEPTION) != 0)
+ kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
+
+out_free:
+ vfree(tmpbuf);
+ return r;
+}
+
+static long kvm_s390_vcpu_memsida_op(struct kvm_vcpu *vcpu,
+ struct kvm_s390_mem_op *mop)
+{
+ int r, srcu_idx;
+
srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
switch (mop->op) {
case KVM_S390_MEMOP_LOGICAL_READ:
- if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
- r = check_gva_range(vcpu, mop->gaddr, mop->ar,
- mop->size, GACC_FETCH);
- break;
- }
- r = read_guest(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size);
- if (r == 0) {
- if (copy_to_user(uaddr, tmpbuf, mop->size))
- r = -EFAULT;
- }
- break;
case KVM_S390_MEMOP_LOGICAL_WRITE:
- if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
- r = check_gva_range(vcpu, mop->gaddr, mop->ar,
- mop->size, GACC_STORE);
- break;
- }
- if (copy_from_user(tmpbuf, uaddr, mop->size)) {
- r = -EFAULT;
- break;
- }
- r = write_guest(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size);
+ r = kvm_s390_vcpu_mem_op(vcpu, mop);
+ break;
+ case KVM_S390_MEMOP_SIDA_READ:
+ case KVM_S390_MEMOP_SIDA_WRITE:
+ /* we are locked against sida going away by the vcpu->mutex */
+ r = kvm_s390_vcpu_sida_op(vcpu, mop);
break;
default:
r = -EINVAL;
}
srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
-
- if (r > 0 && (mop->flags & KVM_S390_MEMOP_F_INJECT_EXCEPTION) != 0)
- kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
-
- vfree(tmpbuf);
return r;
}
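For context on how this path is driven, a minimal userspace sketch of the KVM_S390_MEM_OP vcpu ioctl performing a logical read (guest address, size and access register are illustrative; error handling omitted):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Read 256 bytes from guest logical address 0x1000 via access register 0.
 * vcpu_fd is assumed to be an open KVM vcpu file descriptor.
 */
static int read_guest_mem(int vcpu_fd, void *buf)
{
	struct kvm_s390_mem_op mop = {
		.gaddr = 0x1000,
		.size  = 256,
		.op    = KVM_S390_MEMOP_LOGICAL_READ,
		.buf   = (uintptr_t)buf,
		.ar    = 0,
	};

	return ioctl(vcpu_fd, KVM_S390_MEM_OP, &mop);
}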
@@ -4315,6 +5454,7 @@ long kvm_arch_vcpu_async_ioctl(struct file *filp,
{
struct kvm_vcpu *vcpu = filp->private_data;
void __user *argp = (void __user *)arg;
+ int rc;
switch (ioctl) {
case KVM_S390_IRQ: {
@@ -4322,7 +5462,8 @@ long kvm_arch_vcpu_async_ioctl(struct file *filp,
if (copy_from_user(&s390irq, argp, sizeof(s390irq)))
return -EFAULT;
- return kvm_s390_inject_vcpu(vcpu, &s390irq);
+ rc = kvm_s390_inject_vcpu(vcpu, &s390irq);
+ break;
}
case KVM_S390_INTERRUPT: {
struct kvm_s390_interrupt s390int;
@@ -4332,10 +5473,67 @@ long kvm_arch_vcpu_async_ioctl(struct file *filp,
return -EFAULT;
if (s390int_to_s390irq(&s390int, &s390irq))
return -EINVAL;
- return kvm_s390_inject_vcpu(vcpu, &s390irq);
+ rc = kvm_s390_inject_vcpu(vcpu, &s390irq);
+ break;
}
+ default:
+ rc = -ENOIOCTLCMD;
+ break;
}
- return -ENOIOCTLCMD;
+
+ /*
+ * To simplify single stepping of userspace-emulated instructions,
+ * KVM_EXIT_S390_SIEIC exit sets KVM_GUESTDBG_EXIT_PENDING (see
+ * should_handle_per_ifetch()). However, if userspace emulation injects
+ * an interrupt, it needs to be cleared, so that KVM_EXIT_DEBUG happens
+ * after (and not before) the interrupt delivery.
+ */
+ if (!rc)
+ vcpu->guest_debug &= ~KVM_GUESTDBG_EXIT_PENDING;
+
+ return rc;
+}
+
+static int kvm_s390_handle_pv_vcpu_dump(struct kvm_vcpu *vcpu,
+ struct kvm_pv_cmd *cmd)
+{
+ struct kvm_s390_pv_dmp dmp;
+ void *data;
+ int ret;
+
+ /* Dump initialization is a prerequisite */
+ if (!vcpu->kvm->arch.pv.dumping)
+ return -EINVAL;
+
+ if (copy_from_user(&dmp, (__u8 __user *)cmd->data, sizeof(dmp)))
+ return -EFAULT;
+
+ /* We only handle this subcmd right now */
+ if (dmp.subcmd != KVM_PV_DUMP_CPU)
+ return -EINVAL;
+
+ /* CPU dump length is the same as create cpu storage donation. */
+ if (dmp.buff_len != uv_info.guest_cpu_stor_len)
+ return -EINVAL;
+
+ data = kvzalloc(uv_info.guest_cpu_stor_len, GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ ret = kvm_s390_pv_dump_cpu(vcpu, data, &cmd->rc, &cmd->rrc);
+
+ VCPU_EVENT(vcpu, 3, "PROTVIRT DUMP CPU %d rc %x rrc %x",
+ vcpu->vcpu_id, cmd->rc, cmd->rrc);
+
+ if (ret)
+ ret = -EINVAL;
+
+ /* On success copy over the dump data */
+ if (!ret && copy_to_user((__u8 __user *)dmp.buff_addr, data, uv_info.guest_cpu_stor_len))
+ ret = -EFAULT;
+
+ kvfree(data);
+ return ret;
}
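The handler above is reached via the KVM_S390_PV_CPU_COMMAND case added further down. A hedged userspace sketch of requesting a CPU dump (how buf_len is obtained beforehand, e.g. from the UV query information, is an assumption here; error handling omitted):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int pv_dump_cpu(int vcpu_fd, void *buf, uint64_t buf_len)
{
	struct kvm_s390_pv_dmp dmp = {
		.subcmd    = KVM_PV_DUMP_CPU,
		.buff_addr = (uintptr_t)buf,
		.buff_len  = buf_len,	/* must equal guest_cpu_stor_len */
	};
	struct kvm_pv_cmd cmd = {
		.cmd  = KVM_PV_DUMP,
		.data = (uintptr_t)&dmp,
		/* flags must remain 0, see the checks in the ioctl handler */
	};

	return ioctl(vcpu_fd, KVM_S390_PV_CPU_COMMAND, &cmd);
}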
long kvm_arch_vcpu_ioctl(struct file *filp,
@@ -4345,13 +5543,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
void __user *argp = (void __user *)arg;
int idx;
long r;
+ u16 rc, rrc;
vcpu_load(vcpu);
switch (ioctl) {
case KVM_S390_STORE_STATUS:
idx = srcu_read_lock(&vcpu->kvm->srcu);
- r = kvm_s390_vcpu_store_status(vcpu, arg);
+ r = kvm_s390_store_status_unloaded(vcpu, arg);
srcu_read_unlock(&vcpu->kvm->srcu, idx);
break;
case KVM_S390_SET_INITIAL_PSW: {
@@ -4363,12 +5562,43 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = kvm_arch_vcpu_ioctl_set_initial_psw(vcpu, psw);
break;
}
+ case KVM_S390_CLEAR_RESET:
+ r = 0;
+ kvm_arch_vcpu_ioctl_clear_reset(vcpu);
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ r = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu),
+ UVC_CMD_CPU_RESET_CLEAR, &rc, &rrc);
+ VCPU_EVENT(vcpu, 3, "PROTVIRT RESET CLEAR VCPU: rc %x rrc %x",
+ rc, rrc);
+ }
+ break;
case KVM_S390_INITIAL_RESET:
- r = kvm_arch_vcpu_ioctl_initial_reset(vcpu);
+ r = 0;
+ kvm_arch_vcpu_ioctl_initial_reset(vcpu);
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ r = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu),
+ UVC_CMD_CPU_RESET_INITIAL,
+ &rc, &rrc);
+ VCPU_EVENT(vcpu, 3, "PROTVIRT RESET INITIAL VCPU: rc %x rrc %x",
+ rc, rrc);
+ }
+ break;
+ case KVM_S390_NORMAL_RESET:
+ r = 0;
+ kvm_arch_vcpu_ioctl_normal_reset(vcpu);
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ r = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu),
+ UVC_CMD_CPU_RESET, &rc, &rrc);
+ VCPU_EVENT(vcpu, 3, "PROTVIRT RESET NORMAL VCPU: rc %x rrc %x",
+ rc, rrc);
+ }
break;
case KVM_SET_ONE_REG:
case KVM_GET_ONE_REG: {
struct kvm_one_reg reg;
+ r = -EINVAL;
+ if (kvm_s390_pv_cpu_is_protected(vcpu))
+ break;
r = -EFAULT;
if (copy_from_user(&reg, argp, sizeof(reg)))
break;
@@ -4431,7 +5661,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
struct kvm_s390_mem_op mem_op;
if (copy_from_user(&mem_op, argp, sizeof(mem_op)) == 0)
- r = kvm_s390_guest_mem_op(vcpu, &mem_op);
+ r = kvm_s390_vcpu_memsida_op(vcpu, &mem_op);
else
r = -EFAULT;
break;
@@ -4470,6 +5700,33 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
irq_state.len);
break;
}
+ case KVM_S390_PV_CPU_COMMAND: {
+ struct kvm_pv_cmd cmd;
+
+ r = -EINVAL;
+ if (!is_prot_virt_host())
+ break;
+
+ r = -EFAULT;
+ if (copy_from_user(&cmd, argp, sizeof(cmd)))
+ break;
+
+ r = -EINVAL;
+ if (cmd.flags)
+ break;
+
+ /* We only handle this cmd right now */
+ if (cmd.cmd != KVM_PV_DUMP)
+ break;
+
+ r = kvm_s390_handle_pv_vcpu_dump(vcpu, &cmd);
+
+ /* Always copy over UV rc / rrc data */
+ if (copy_to_user((__u8 __user *)argp, &cmd.rc,
+ sizeof(cmd.rc) + sizeof(cmd.rrc)))
+ r = -EFAULT;
+ break;
+ }
default:
r = -ENOTTY;
}
@@ -4491,38 +5748,63 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
return VM_FAULT_SIGBUS;
}
-int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
- unsigned long npages)
+bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
{
- return 0;
+ return true;
}
/* Section: memory related */
int kvm_arch_prepare_memory_region(struct kvm *kvm,
- struct kvm_memory_slot *memslot,
- const struct kvm_userspace_memory_region *mem,
+ const struct kvm_memory_slot *old,
+ struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
- /* A few sanity checks. We can have memory slots which have to be
- located/ended at a segment boundary (1MB). The memory in userland is
- ok to be fragmented into various different vmas. It is okay to mmap()
- and munmap() stuff in this slot after doing this call at any time */
+ gpa_t size;
- if (mem->userspace_addr & 0xffffful)
+ /* When we are protected, we should not change the memory slots */
+ if (kvm_s390_pv_get_handle(kvm))
return -EINVAL;
- if (mem->memory_size & 0xffffful)
- return -EINVAL;
+ if (change != KVM_MR_DELETE && change != KVM_MR_FLAGS_ONLY) {
+ /*
+ * A few sanity checks. We can have memory slots which have to be
+ * located/ended at a segment boundary (1MB). The memory in userland is
+ * ok to be fragmented into various different vmas. It is okay to mmap()
+ * and munmap() stuff in this slot after doing this call at any time
+ */
- if (mem->guest_phys_addr + mem->memory_size > kvm->arch.mem_limit)
- return -EINVAL;
+ if (new->userspace_addr & 0xffffful)
+ return -EINVAL;
+
+ size = new->npages * PAGE_SIZE;
+ if (size & 0xffffful)
+ return -EINVAL;
+
+ if ((new->base_gfn * PAGE_SIZE) + size > kvm->arch.mem_limit)
+ return -EINVAL;
+ }
+
+ if (!kvm->arch.migration_mode)
+ return 0;
+
+ /*
+ * Turn off migration mode when:
+ * - userspace creates a new memslot with dirty logging off,
+ * - userspace modifies an existing memslot (MOVE or FLAGS_ONLY) and
+ * dirty logging is turned off.
+ * Migration mode expects dirty page logging to be enabled so it can
+ * store its dirty bitmap.
+ */
+ if (change != KVM_MR_DELETE &&
+ !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
+ WARN(kvm_s390_vm_stop_migration(kvm),
+ "Failed to stop migration mode");
return 0;
}
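To make the segment-boundary rule above concrete: the 0xffffful mask is 1 MB - 1, so a slot passes only if its userspace address and size are both 1 MB aligned and it fits below the memory limit. A hedged standalone restatement of the same checks:

#define SEG_SIZE 0x100000UL	/* 1 MB segment */

/* Illustration only; mirrors the checks in kvm_arch_prepare_memory_region(). */
static int slot_layout_ok(unsigned long userspace_addr, unsigned long base_gfn,
			  unsigned long npages, unsigned long mem_limit)
{
	unsigned long size = npages * 4096UL;	/* PAGE_SIZE on s390 */

	if (userspace_addr & (SEG_SIZE - 1))
		return 0;
	if (size & (SEG_SIZE - 1))
		return 0;
	return base_gfn * 4096UL + size <= mem_limit;
}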
void kvm_arch_commit_memory_region(struct kvm *kvm,
- const struct kvm_userspace_memory_region *mem,
- const struct kvm_memory_slot *old,
+ struct kvm_memory_slot *old,
const struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
@@ -4538,10 +5820,11 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
old->npages * PAGE_SIZE);
if (rc)
break;
- /* FALLTHROUGH */
+ fallthrough;
case KVM_MR_CREATE:
- rc = gmap_map_segment(kvm->arch.gmap, mem->userspace_addr,
- mem->guest_phys_addr, mem->memory_size);
+ rc = gmap_map_segment(kvm->arch.gmap, new->userspace_addr,
+ new->base_gfn * PAGE_SIZE,
+ new->npages * PAGE_SIZE);
break;
case KVM_MR_FLAGS_ONLY:
break;
@@ -4560,14 +5843,9 @@ static inline unsigned long nonhyp_mask(int i)
return 0x0000ffffffffffffUL >> (nonhyp_fai << 4);
}
-void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu)
-{
- vcpu->valid_wakeup = false;
-}
-
static int __init kvm_s390_init(void)
{
- int i;
+ int i, r;
if (!sclp.has_sief2) {
pr_info("SIE is not available\n");
@@ -4581,14 +5859,25 @@ static int __init kvm_s390_init(void)
for (i = 0; i < 16; i++)
kvm_s390_fac_base[i] |=
- S390_lowcore.stfle_fac_list[i] & nonhyp_mask(i);
+ stfle_fac_list[i] & nonhyp_mask(i);
+
+ r = __kvm_s390_init();
+ if (r)
+ return r;
- return kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
+ r = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE);
+ if (r) {
+ __kvm_s390_exit();
+ return r;
+ }
+ return 0;
}
static void __exit kvm_s390_exit(void)
{
kvm_exit();
+
+ __kvm_s390_exit();
}
module_init(kvm_s390_init);
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 6d9448dbd052..a7ea80cfa445 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -2,7 +2,7 @@
/*
* definition for kvm on s390
*
- * Copyright IBM Corp. 2008, 2009
+ * Copyright IBM Corp. 2008, 2020
*
* Author(s): Carsten Otte <cotte@de.ibm.com>
* Christian Borntraeger <borntraeger@de.ibm.com>
@@ -15,6 +15,7 @@
#include <linux/hrtimer.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
+#include <linux/lockdep.h>
#include <asm/facility.h>
#include <asm/processor.h>
#include <asm/sclp.h>
@@ -22,9 +23,21 @@
/* Transactional Memory Execution related macros */
#define IS_TE_ENABLED(vcpu) ((vcpu->arch.sie_block->ecb & ECB_TE))
#define TDB_FORMAT1 1
-#define IS_ITDB_VALID(vcpu) ((*(char *)vcpu->arch.sie_block->itdba == TDB_FORMAT1))
+#define IS_ITDB_VALID(vcpu) \
+ ((*(char *)phys_to_virt((vcpu)->arch.sie_block->itdba) == TDB_FORMAT1))
extern debug_info_t *kvm_s390_dbf;
+extern debug_info_t *kvm_s390_dbf_uv;
+
+#define KVM_UV_EVENT(d_kvm, d_loglevel, d_string, d_args...)\
+do { \
+ debug_sprintf_event((d_kvm)->arch.dbf, d_loglevel, d_string "\n", \
+ d_args); \
+ debug_sprintf_event(kvm_s390_dbf_uv, d_loglevel, \
+ "%d: " d_string "\n", (d_kvm)->userspace_pid, \
+ d_args); \
+} while (0)
+
#define KVM_EVENT(d_loglevel, d_string, d_args...)\
do { \
debug_sprintf_event(kvm_s390_dbf, d_loglevel, d_string "\n", \
@@ -67,7 +80,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu)
static inline int is_vcpu_idle(struct kvm_vcpu *vcpu)
{
- return test_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask);
+ return test_bit(vcpu->vcpu_idx, vcpu->kvm->arch.idle_mask);
}
static inline int kvm_is_ucontrol(struct kvm *kvm)
@@ -93,7 +106,7 @@ static inline void kvm_s390_set_prefix(struct kvm_vcpu *vcpu, u32 prefix)
prefix);
vcpu->arch.sie_block->prefix = prefix >> GUEST_PREFIX_SHIFT;
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
- kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
+ kvm_make_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu);
}
static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu, u8 *ar)
@@ -196,6 +209,67 @@ static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm)
return kvm->arch.user_cpu_state_ctrl != 0;
}
+static inline void kvm_s390_set_user_cpu_state_ctrl(struct kvm *kvm)
+{
+ if (kvm->arch.user_cpu_state_ctrl)
+ return;
+
+ VM_EVENT(kvm, 3, "%s", "ENABLE: Userspace CPU state control");
+ kvm->arch.user_cpu_state_ctrl = 1;
+}
+
+/* get the end gfn of the last (highest gfn) memslot */
+static inline unsigned long kvm_s390_get_gfn_end(struct kvm_memslots *slots)
+{
+ struct rb_node *node;
+ struct kvm_memory_slot *ms;
+
+ if (WARN_ON(kvm_memslots_empty(slots)))
+ return 0;
+
+ node = rb_last(&slots->gfn_tree);
+ ms = container_of(node, struct kvm_memory_slot, gfn_node[slots->node_idx]);
+ return ms->base_gfn + ms->npages;
+}
+
+static inline u32 kvm_s390_get_gisa_desc(struct kvm *kvm)
+{
+ u32 gd = virt_to_phys(kvm->arch.gisa_int.origin);
+
+ if (gd && sclp.has_gisaf)
+ gd |= GISA_FORMAT1;
+ return gd;
+}
+
+/* implemented in pv.c */
+int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc);
+int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc);
+int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc);
+int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc);
+int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc);
+int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc);
+int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc);
+int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc,
+ u16 *rrc);
+int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size,
+ unsigned long tweak, u16 *rc, u16 *rrc);
+int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state);
+int kvm_s390_pv_dump_cpu(struct kvm_vcpu *vcpu, void *buff, u16 *rc, u16 *rrc);
+int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user,
+ u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc);
+int kvm_s390_pv_dump_complete(struct kvm *kvm, void __user *buff_user,
+ u16 *rc, u16 *rrc);
+
+static inline u64 kvm_s390_pv_get_handle(struct kvm *kvm)
+{
+ return kvm->arch.pv.handle;
+}
+
+static inline u64 kvm_s390_pv_cpu_get_handle(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.pv.handle;
+}
+
/* implemented in interrupt.c */
int kvm_s390_handle_wait(struct kvm_vcpu *vcpu);
void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu);
@@ -281,13 +355,12 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu);
/* implemented in kvm-s390.c */
-void kvm_s390_set_tod_clock(struct kvm *kvm,
- const struct kvm_s390_vm_tod_clock *gtod);
+int kvm_s390_try_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod);
long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable);
int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr);
int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr);
-void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu);
-void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu);
+int kvm_s390_vcpu_start(struct kvm_vcpu *vcpu);
+int kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu);
void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu);
void kvm_s390_vcpu_unblock(struct kvm_vcpu *vcpu);
bool kvm_s390_vcpu_sie_inhibited(struct kvm_vcpu *vcpu);
@@ -297,13 +370,14 @@ int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu);
void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu);
void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm);
__u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu);
+int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc);
/* implemented in diag.c */
int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
static inline void kvm_s390_vcpu_block_all(struct kvm *kvm)
{
- int i;
+ unsigned long i;
struct kvm_vcpu *vcpu;
WARN_ON(!mutex_is_locked(&kvm->lock));
@@ -313,7 +387,7 @@ static inline void kvm_s390_vcpu_block_all(struct kvm *kvm)
static inline void kvm_s390_vcpu_unblock_all(struct kvm *kvm)
{
- int i;
+ unsigned long i;
struct kvm_vcpu *vcpu;
kvm_for_each_vcpu(i, vcpu, kvm)
@@ -373,6 +447,7 @@ void kvm_s390_destroy_adapters(struct kvm *kvm);
int kvm_s390_ext_call_pending(struct kvm_vcpu *vcpu);
extern struct kvm_device_ops kvm_flic_ops;
int kvm_s390_is_stop_irq_pending(struct kvm_vcpu *vcpu);
+int kvm_s390_is_restart_irq_pending(struct kvm_vcpu *vcpu);
void kvm_s390_clear_stop_irq(struct kvm_vcpu *vcpu);
int kvm_s390_set_irq_state(struct kvm_vcpu *vcpu,
void __user *buf, int len);
@@ -381,7 +456,9 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu,
void kvm_s390_gisa_init(struct kvm *kvm);
void kvm_s390_gisa_clear(struct kvm *kvm);
void kvm_s390_gisa_destroy(struct kvm *kvm);
-int kvm_s390_gib_init(u8 nisc);
+void kvm_s390_gisa_disable(struct kvm *kvm);
+void kvm_s390_gisa_enable(struct kvm *kvm);
+int __init kvm_s390_gib_init(u8 nisc);
void kvm_s390_gib_destroy(void);
/* implemented in guestdbg.c */
@@ -426,4 +503,22 @@ void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu,
* @kvm: the KVM guest
*/
void kvm_s390_vcpu_crypto_reset_all(struct kvm *kvm);
+
+/**
+ * kvm_s390_vcpu_pci_enable_interp
+ *
+ * Set the associated PCI attributes for each vcpu to allow for zPCI Load/Store
+ * interpretation as well as adapter interruption forwarding.
+ *
+ * @kvm: the KVM guest
+ */
+void kvm_s390_vcpu_pci_enable_interp(struct kvm *kvm);
+
+/**
+ * diag9c_forwarding_hz
+ *
+ * Set the maximum number of diag9c forwarding per second
+ */
+extern unsigned int diag9c_forwarding_hz;
+
#endif
diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c
new file mode 100644
index 000000000000..ffa7739c7a28
--- /dev/null
+++ b/arch/s390/kvm/pci.c
@@ -0,0 +1,704 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * s390 kvm PCI passthrough support
+ *
+ * Copyright IBM Corp. 2022
+ *
+ * Author(s): Matthew Rosato <mjrosato@linux.ibm.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/pci.h>
+#include <asm/pci.h>
+#include <asm/pci_insn.h>
+#include <asm/pci_io.h>
+#include <asm/sclp.h>
+#include "pci.h"
+#include "kvm-s390.h"
+
+struct zpci_aift *aift;
+
+static inline int __set_irq_noiib(u16 ctl, u8 isc)
+{
+ union zpci_sic_iib iib = {{0}};
+
+ return zpci_set_irq_ctrl(ctl, isc, &iib);
+}
+
+void kvm_s390_pci_aen_exit(void)
+{
+ unsigned long flags;
+ struct kvm_zdev **gait_kzdev;
+
+ lockdep_assert_held(&aift->aift_lock);
+
+ /*
+ * Contents of the aipb remain registered for the life of the host
+ * kernel; the information is preserved in zpci_aipb and zpci_aif_sbv
+ * in case we insert the KVM module again later. Clear the AIFT
+ * information and free anything not registered with underlying
+ * firmware.
+ */
+ spin_lock_irqsave(&aift->gait_lock, flags);
+ gait_kzdev = aift->kzdev;
+ aift->gait = NULL;
+ aift->sbv = NULL;
+ aift->kzdev = NULL;
+ spin_unlock_irqrestore(&aift->gait_lock, flags);
+
+ kfree(gait_kzdev);
+}
+
+static int zpci_setup_aipb(u8 nisc)
+{
+ struct page *page;
+ int size, rc;
+
+ zpci_aipb = kzalloc(sizeof(union zpci_sic_iib), GFP_KERNEL);
+ if (!zpci_aipb)
+ return -ENOMEM;
+
+ aift->sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC, NULL);
+ if (!aift->sbv) {
+ rc = -ENOMEM;
+ goto free_aipb;
+ }
+ zpci_aif_sbv = aift->sbv;
+ size = get_order(PAGE_ALIGN(ZPCI_NR_DEVICES *
+ sizeof(struct zpci_gaite)));
+ page = alloc_pages(GFP_KERNEL | __GFP_ZERO, size);
+ if (!page) {
+ rc = -ENOMEM;
+ goto free_sbv;
+ }
+ aift->gait = (struct zpci_gaite *)page_to_virt(page);
+
+ zpci_aipb->aipb.faisb = virt_to_phys(aift->sbv->vector);
+ zpci_aipb->aipb.gait = virt_to_phys(aift->gait);
+ zpci_aipb->aipb.afi = nisc;
+ zpci_aipb->aipb.faal = ZPCI_NR_DEVICES;
+
+ /* Setup Adapter Event Notification Interpretation */
+ if (zpci_set_irq_ctrl(SIC_SET_AENI_CONTROLS, 0, zpci_aipb)) {
+ rc = -EIO;
+ goto free_gait;
+ }
+
+ return 0;
+
+free_gait:
+ free_pages((unsigned long)aift->gait, size);
+free_sbv:
+ airq_iv_release(aift->sbv);
+ zpci_aif_sbv = NULL;
+free_aipb:
+ kfree(zpci_aipb);
+ zpci_aipb = NULL;
+
+ return rc;
+}
+
+static int zpci_reset_aipb(u8 nisc)
+{
+ /*
+ * AEN registration can only happen once per system boot. If
+ * an aipb already exists then AEN was already registered and
+ * we can re-use the aipb contents. This can only happen if
+ * the KVM module was removed and re-inserted. However, we must
+ * ensure that the same forwarding ISC is used as this is assigned
+ * during KVM module load.
+ */
+ if (zpci_aipb->aipb.afi != nisc)
+ return -EINVAL;
+
+ aift->sbv = zpci_aif_sbv;
+ aift->gait = phys_to_virt(zpci_aipb->aipb.gait);
+
+ return 0;
+}
+
+int kvm_s390_pci_aen_init(u8 nisc)
+{
+ int rc = 0;
+
+ /* If already enabled for AEN, bail out now */
+ if (aift->gait || aift->sbv)
+ return -EPERM;
+
+ mutex_lock(&aift->aift_lock);
+ aift->kzdev = kcalloc(ZPCI_NR_DEVICES, sizeof(struct kvm_zdev *),
+ GFP_KERNEL);
+ if (!aift->kzdev) {
+ rc = -ENOMEM;
+ goto unlock;
+ }
+
+ if (!zpci_aipb)
+ rc = zpci_setup_aipb(nisc);
+ else
+ rc = zpci_reset_aipb(nisc);
+ if (rc)
+ goto free_zdev;
+
+ /* Enable floating IRQs */
+ if (__set_irq_noiib(SIC_IRQ_MODE_SINGLE, nisc)) {
+ rc = -EIO;
+ kvm_s390_pci_aen_exit();
+ }
+
+ goto unlock;
+
+free_zdev:
+ kfree(aift->kzdev);
+unlock:
+ mutex_unlock(&aift->aift_lock);
+ return rc;
+}
+
+/* Modify PCI: Register floating adapter interruption forwarding */
+static int kvm_zpci_set_airq(struct zpci_dev *zdev)
+{
+ u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_REG_INT);
+ struct zpci_fib fib = {};
+ u8 status;
+
+ fib.fmt0.isc = zdev->kzdev->fib.fmt0.isc;
+ fib.fmt0.sum = 1; /* enable summary notifications */
+ fib.fmt0.noi = airq_iv_end(zdev->aibv);
+ fib.fmt0.aibv = virt_to_phys(zdev->aibv->vector);
+ fib.fmt0.aibvo = 0;
+ fib.fmt0.aisb = virt_to_phys(aift->sbv->vector + (zdev->aisb / 64) * 8);
+ fib.fmt0.aisbo = zdev->aisb & 63;
+ fib.gd = zdev->gisa;
+
+ return zpci_mod_fc(req, &fib, &status) ? -EIO : 0;
+}
+
+/* Modify PCI: Unregister floating adapter interruption forwarding */
+static int kvm_zpci_clear_airq(struct zpci_dev *zdev)
+{
+ u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_DEREG_INT);
+ struct zpci_fib fib = {};
+ u8 cc, status;
+
+ fib.gd = zdev->gisa;
+
+ cc = zpci_mod_fc(req, &fib, &status);
+ if (cc == 3 || (cc == 1 && status == 24))
+ /* Function already gone or IRQs already deregistered. */
+ cc = 0;
+
+ return cc ? -EIO : 0;
+}
+
+static inline void unaccount_mem(unsigned long nr_pages)
+{
+ struct user_struct *user = get_uid(current_user());
+
+ if (user)
+ atomic_long_sub(nr_pages, &user->locked_vm);
+ if (current->mm)
+ atomic64_sub(nr_pages, &current->mm->pinned_vm);
+}
+
+static inline int account_mem(unsigned long nr_pages)
+{
+ struct user_struct *user = get_uid(current_user());
+ unsigned long page_limit, cur_pages, new_pages;
+
+ page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+ do {
+ cur_pages = atomic_long_read(&user->locked_vm);
+ new_pages = cur_pages + nr_pages;
+ if (new_pages > page_limit)
+ return -ENOMEM;
+ } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
+ new_pages) != cur_pages);
+
+ atomic64_add(nr_pages, &current->mm->pinned_vm);
+
+ return 0;
+}
+
+static int kvm_s390_pci_aif_enable(struct zpci_dev *zdev, struct zpci_fib *fib,
+ bool assist)
+{
+ struct page *pages[1], *aibv_page, *aisb_page = NULL;
+ unsigned int msi_vecs, idx;
+ struct zpci_gaite *gaite;
+ unsigned long hva, bit;
+ struct kvm *kvm;
+ phys_addr_t gaddr;
+ int rc = 0, gisc, npages, pcount = 0;
+
+ /*
+ * Interrupt forwarding is only applicable if the device is already
+ * enabled for interpretation
+ */
+ if (zdev->gisa == 0)
+ return -EINVAL;
+
+ kvm = zdev->kzdev->kvm;
+ msi_vecs = min_t(unsigned int, fib->fmt0.noi, zdev->max_msi);
+
+ /* Get the associated forwarding ISC - if invalid, return the error */
+ gisc = kvm_s390_gisc_register(kvm, fib->fmt0.isc);
+ if (gisc < 0)
+ return gisc;
+
+ /* Replace AIBV address */
+ idx = srcu_read_lock(&kvm->srcu);
+ hva = gfn_to_hva(kvm, gpa_to_gfn((gpa_t)fib->fmt0.aibv));
+ npages = pin_user_pages_fast(hva, 1, FOLL_WRITE | FOLL_LONGTERM, pages);
+ srcu_read_unlock(&kvm->srcu, idx);
+ if (npages < 1) {
+ rc = -EIO;
+ goto out;
+ }
+ aibv_page = pages[0];
+ pcount++;
+ gaddr = page_to_phys(aibv_page) + (fib->fmt0.aibv & ~PAGE_MASK);
+ fib->fmt0.aibv = gaddr;
+
+ /* Pin the guest AISB if one was specified */
+ if (fib->fmt0.sum == 1) {
+ idx = srcu_read_lock(&kvm->srcu);
+ hva = gfn_to_hva(kvm, gpa_to_gfn((gpa_t)fib->fmt0.aisb));
+ npages = pin_user_pages_fast(hva, 1, FOLL_WRITE | FOLL_LONGTERM,
+ pages);
+ srcu_read_unlock(&kvm->srcu, idx);
+ if (npages < 1) {
+ rc = -EIO;
+ goto unpin1;
+ }
+ aisb_page = pages[0];
+ pcount++;
+ }
+
+ /* Account for pinned pages, roll back on failure */
+ if (account_mem(pcount))
+ goto unpin2;
+
+ /* AISB must be allocated before we can fill in GAITE */
+ mutex_lock(&aift->aift_lock);
+ bit = airq_iv_alloc_bit(aift->sbv);
+ if (bit == -1UL)
+ goto unlock;
+ zdev->aisb = bit; /* store the summary bit number */
+ zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA |
+ AIRQ_IV_BITLOCK |
+ AIRQ_IV_GUESTVEC,
+ phys_to_virt(fib->fmt0.aibv));
+
+ spin_lock_irq(&aift->gait_lock);
+ gaite = (struct zpci_gaite *)aift->gait + (zdev->aisb *
+ sizeof(struct zpci_gaite));
+
+ /* If assist not requested, host will get all alerts */
+ if (assist)
+ gaite->gisa = (u32)virt_to_phys(&kvm->arch.sie_page2->gisa);
+ else
+ gaite->gisa = 0;
+
+ gaite->gisc = fib->fmt0.isc;
+ gaite->count++;
+ gaite->aisbo = fib->fmt0.aisbo;
+ gaite->aisb = virt_to_phys(page_address(aisb_page) + (fib->fmt0.aisb &
+ ~PAGE_MASK));
+ aift->kzdev[zdev->aisb] = zdev->kzdev;
+ spin_unlock_irq(&aift->gait_lock);
+
+ /* Update guest FIB for re-issue */
+ fib->fmt0.aisbo = zdev->aisb & 63;
+ fib->fmt0.aisb = virt_to_phys(aift->sbv->vector + (zdev->aisb / 64) * 8);
+ fib->fmt0.isc = gisc;
+
+ /* Save some guest fib values in the host for later use */
+ zdev->kzdev->fib.fmt0.isc = fib->fmt0.isc;
+ zdev->kzdev->fib.fmt0.aibv = fib->fmt0.aibv;
+ mutex_unlock(&aift->aift_lock);
+
+ /* Issue the clp to setup the irq now */
+ rc = kvm_zpci_set_airq(zdev);
+ return rc;
+
+unlock:
+ mutex_unlock(&aift->aift_lock);
+unpin2:
+ if (fib->fmt0.sum == 1)
+ unpin_user_page(aisb_page);
+unpin1:
+ unpin_user_page(aibv_page);
+out:
+ return rc;
+}
+
+static int kvm_s390_pci_aif_disable(struct zpci_dev *zdev, bool force)
+{
+ struct kvm_zdev *kzdev = zdev->kzdev;
+ struct zpci_gaite *gaite;
+ struct page *vpage = NULL, *spage = NULL;
+ int rc, pcount = 0;
+ u8 isc;
+
+ if (zdev->gisa == 0)
+ return -EINVAL;
+
+ mutex_lock(&aift->aift_lock);
+
+ /*
+ * If the clear fails due to an error, leave now unless we know this
+ * device is about to go away (force) -- In that case clear the GAITE
+ * regardless.
+ */
+ rc = kvm_zpci_clear_airq(zdev);
+ if (rc && !force)
+ goto out;
+
+ if (zdev->kzdev->fib.fmt0.aibv == 0)
+ goto out;
+ spin_lock_irq(&aift->gait_lock);
+ gaite = (struct zpci_gaite *)aift->gait + (zdev->aisb *
+ sizeof(struct zpci_gaite));
+ isc = gaite->gisc;
+ gaite->count--;
+ if (gaite->count == 0) {
+ /* Release guest AIBV and AISB */
+ vpage = phys_to_page(kzdev->fib.fmt0.aibv);
+ if (gaite->aisb != 0)
+ spage = phys_to_page(gaite->aisb);
+ /* Clear the GAIT entry */
+ gaite->aisb = 0;
+ gaite->gisc = 0;
+ gaite->aisbo = 0;
+ gaite->gisa = 0;
+ aift->kzdev[zdev->aisb] = NULL;
+ /* Clear zdev info */
+ airq_iv_free_bit(aift->sbv, zdev->aisb);
+ airq_iv_release(zdev->aibv);
+ zdev->aisb = 0;
+ zdev->aibv = NULL;
+ }
+ spin_unlock_irq(&aift->gait_lock);
+ kvm_s390_gisc_unregister(kzdev->kvm, isc);
+ kzdev->fib.fmt0.isc = 0;
+ kzdev->fib.fmt0.aibv = 0;
+
+ if (vpage) {
+ unpin_user_page(vpage);
+ pcount++;
+ }
+ if (spage) {
+ unpin_user_page(spage);
+ pcount++;
+ }
+ if (pcount > 0)
+ unaccount_mem(pcount);
+out:
+ mutex_unlock(&aift->aift_lock);
+
+ return rc;
+}
+
+static int kvm_s390_pci_dev_open(struct zpci_dev *zdev)
+{
+ struct kvm_zdev *kzdev;
+
+ kzdev = kzalloc(sizeof(struct kvm_zdev), GFP_KERNEL);
+ if (!kzdev)
+ return -ENOMEM;
+
+ kzdev->zdev = zdev;
+ zdev->kzdev = kzdev;
+
+ return 0;
+}
+
+static void kvm_s390_pci_dev_release(struct zpci_dev *zdev)
+{
+ struct kvm_zdev *kzdev;
+
+ kzdev = zdev->kzdev;
+ WARN_ON(kzdev->zdev != zdev);
+ zdev->kzdev = NULL;
+ kfree(kzdev);
+}
+
+/*
+ * Register device with the specified KVM. If interpretation facilities are
+ * available, enable them and let userspace indicate whether or not they will
+ * be used (specify SHM bit to disable).
+ */
+static int kvm_s390_pci_register_kvm(void *opaque, struct kvm *kvm)
+{
+ struct zpci_dev *zdev = opaque;
+ u8 status;
+ int rc;
+
+ if (!zdev)
+ return -EINVAL;
+
+ mutex_lock(&zdev->kzdev_lock);
+
+ if (zdev->kzdev || zdev->gisa != 0 || !kvm) {
+ mutex_unlock(&zdev->kzdev_lock);
+ return -EINVAL;
+ }
+
+ kvm_get_kvm(kvm);
+
+ mutex_lock(&kvm->lock);
+
+ rc = kvm_s390_pci_dev_open(zdev);
+ if (rc)
+ goto err;
+
+ /*
+ * If interpretation facilities aren't available, add the device to
+ * the kzdev list but don't enable for interpretation.
+ */
+ if (!kvm_s390_pci_interp_allowed())
+ goto out;
+
+ /*
+ * If this is the first request to use an interpreted device, make the
+ * necessary vcpu changes
+ */
+ if (!kvm->arch.use_zpci_interp)
+ kvm_s390_vcpu_pci_enable_interp(kvm);
+
+ if (zdev_enabled(zdev)) {
+ rc = zpci_disable_device(zdev);
+ if (rc)
+ goto err;
+ }
+
+ /*
+ * Store information about the identity of the kvm guest allowed to
+ * access this device via interpretation to be used by host CLP
+ */
+ zdev->gisa = (u32)virt_to_phys(&kvm->arch.sie_page2->gisa);
+
+ rc = zpci_enable_device(zdev);
+ if (rc)
+ goto clear_gisa;
+
+ /* Re-register the IOMMU that was already created */
+ rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
+ virt_to_phys(zdev->dma_table), &status);
+ if (rc)
+ goto clear_gisa;
+
+out:
+ zdev->kzdev->kvm = kvm;
+
+ spin_lock(&kvm->arch.kzdev_list_lock);
+ list_add_tail(&zdev->kzdev->entry, &kvm->arch.kzdev_list);
+ spin_unlock(&kvm->arch.kzdev_list_lock);
+
+ mutex_unlock(&kvm->lock);
+ mutex_unlock(&zdev->kzdev_lock);
+ return 0;
+
+clear_gisa:
+ zdev->gisa = 0;
+err:
+ if (zdev->kzdev)
+ kvm_s390_pci_dev_release(zdev);
+ mutex_unlock(&kvm->lock);
+ mutex_unlock(&zdev->kzdev_lock);
+ kvm_put_kvm(kvm);
+ return rc;
+}
+
+static void kvm_s390_pci_unregister_kvm(void *opaque)
+{
+ struct zpci_dev *zdev = opaque;
+ struct kvm *kvm;
+ u8 status;
+
+ if (!zdev)
+ return;
+
+ mutex_lock(&zdev->kzdev_lock);
+
+ if (WARN_ON(!zdev->kzdev)) {
+ mutex_unlock(&zdev->kzdev_lock);
+ return;
+ }
+
+ kvm = zdev->kzdev->kvm;
+ mutex_lock(&kvm->lock);
+
+ /*
+ * A gisa of 0 means interpretation was never enabled; just remove the
+ * device from the list.
+ */
+ if (zdev->gisa == 0)
+ goto out;
+
+ /* Forwarding must be turned off before interpretation */
+ if (zdev->kzdev->fib.fmt0.aibv != 0)
+ kvm_s390_pci_aif_disable(zdev, true);
+
+ /* Remove the host CLP guest designation */
+ zdev->gisa = 0;
+
+ if (zdev_enabled(zdev)) {
+ if (zpci_disable_device(zdev))
+ goto out;
+ }
+
+ if (zpci_enable_device(zdev))
+ goto out;
+
+ /* Re-register the IOMMU that was already created */
+ zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
+ virt_to_phys(zdev->dma_table), &status);
+
+out:
+ spin_lock(&kvm->arch.kzdev_list_lock);
+ list_del(&zdev->kzdev->entry);
+ spin_unlock(&kvm->arch.kzdev_list_lock);
+ kvm_s390_pci_dev_release(zdev);
+
+ mutex_unlock(&kvm->lock);
+ mutex_unlock(&zdev->kzdev_lock);
+
+ kvm_put_kvm(kvm);
+}
+
+void kvm_s390_pci_init_list(struct kvm *kvm)
+{
+ spin_lock_init(&kvm->arch.kzdev_list_lock);
+ INIT_LIST_HEAD(&kvm->arch.kzdev_list);
+}
+
+void kvm_s390_pci_clear_list(struct kvm *kvm)
+{
+ /*
+ * This list should already be empty, either via vfio device closures
+ * or kvm fd cleanup.
+ */
+ spin_lock(&kvm->arch.kzdev_list_lock);
+ WARN_ON_ONCE(!list_empty(&kvm->arch.kzdev_list));
+ spin_unlock(&kvm->arch.kzdev_list_lock);
+}
+
+static struct zpci_dev *get_zdev_from_kvm_by_fh(struct kvm *kvm, u32 fh)
+{
+ struct zpci_dev *zdev = NULL;
+ struct kvm_zdev *kzdev;
+
+ spin_lock(&kvm->arch.kzdev_list_lock);
+ list_for_each_entry(kzdev, &kvm->arch.kzdev_list, entry) {
+ if (kzdev->zdev->fh == fh) {
+ zdev = kzdev->zdev;
+ break;
+ }
+ }
+ spin_unlock(&kvm->arch.kzdev_list_lock);
+
+ return zdev;
+}
+
+static int kvm_s390_pci_zpci_reg_aen(struct zpci_dev *zdev,
+ struct kvm_s390_zpci_op *args)
+{
+ struct zpci_fib fib = {};
+ bool hostflag;
+
+ fib.fmt0.aibv = args->u.reg_aen.ibv;
+ fib.fmt0.isc = args->u.reg_aen.isc;
+ fib.fmt0.noi = args->u.reg_aen.noi;
+ if (args->u.reg_aen.sb != 0) {
+ fib.fmt0.aisb = args->u.reg_aen.sb;
+ fib.fmt0.aisbo = args->u.reg_aen.sbo;
+ fib.fmt0.sum = 1;
+ } else {
+ fib.fmt0.aisb = 0;
+ fib.fmt0.aisbo = 0;
+ fib.fmt0.sum = 0;
+ }
+
+ hostflag = !(args->u.reg_aen.flags & KVM_S390_ZPCIOP_REGAEN_HOST);
+ return kvm_s390_pci_aif_enable(zdev, &fib, hostflag);
+}
+
+int kvm_s390_pci_zpci_op(struct kvm *kvm, struct kvm_s390_zpci_op *args)
+{
+ struct kvm_zdev *kzdev;
+ struct zpci_dev *zdev;
+ int r;
+
+ zdev = get_zdev_from_kvm_by_fh(kvm, args->fh);
+ if (!zdev)
+ return -ENODEV;
+
+ mutex_lock(&zdev->kzdev_lock);
+ mutex_lock(&kvm->lock);
+
+ kzdev = zdev->kzdev;
+ if (!kzdev) {
+ r = -ENODEV;
+ goto out;
+ }
+ if (kzdev->kvm != kvm) {
+ r = -EPERM;
+ goto out;
+ }
+
+ switch (args->op) {
+ case KVM_S390_ZPCIOP_REG_AEN:
+ /* Fail on unknown flags */
+ if (args->u.reg_aen.flags & ~KVM_S390_ZPCIOP_REGAEN_HOST) {
+ r = -EINVAL;
+ break;
+ }
+ r = kvm_s390_pci_zpci_reg_aen(zdev, args);
+ break;
+ case KVM_S390_ZPCIOP_DEREG_AEN:
+ r = kvm_s390_pci_aif_disable(zdev, false);
+ break;
+ default:
+ r = -EINVAL;
+ }
+
+out:
+ mutex_unlock(&kvm->lock);
+ mutex_unlock(&zdev->kzdev_lock);
+ return r;
+}
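As a usage sketch of the ioctl handled above, userspace (typically the VMM driving a vfio-pci zPCI device) fills a struct kvm_s390_zpci_op and issues it on the VM file descriptor; the ioctl name KVM_S390_ZPCI_OP is assumed here:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Deregister adapter event notification forwarding for the device
 * identified by function handle fh (illustration only).
 */
static int zpci_dereg_aen(int vm_fd, uint32_t fh)
{
	struct kvm_s390_zpci_op args = {
		.fh = fh,
		.op = KVM_S390_ZPCIOP_DEREG_AEN,
	};

	return ioctl(vm_fd, KVM_S390_ZPCI_OP, &args);
}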
+
+int __init kvm_s390_pci_init(void)
+{
+ zpci_kvm_hook.kvm_register = kvm_s390_pci_register_kvm;
+ zpci_kvm_hook.kvm_unregister = kvm_s390_pci_unregister_kvm;
+
+ if (!kvm_s390_pci_interp_allowed())
+ return 0;
+
+ aift = kzalloc(sizeof(struct zpci_aift), GFP_KERNEL);
+ if (!aift)
+ return -ENOMEM;
+
+ spin_lock_init(&aift->gait_lock);
+ mutex_init(&aift->aift_lock);
+
+ return 0;
+}
+
+void kvm_s390_pci_exit(void)
+{
+ zpci_kvm_hook.kvm_register = NULL;
+ zpci_kvm_hook.kvm_unregister = NULL;
+
+ if (!kvm_s390_pci_interp_allowed())
+ return;
+
+ mutex_destroy(&aift->aift_lock);
+
+ kfree(aift);
+}
diff --git a/arch/s390/kvm/pci.h b/arch/s390/kvm/pci.h
new file mode 100644
index 000000000000..ff0972dd5e71
--- /dev/null
+++ b/arch/s390/kvm/pci.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * s390 kvm PCI passthrough support
+ *
+ * Copyright IBM Corp. 2022
+ *
+ * Author(s): Matthew Rosato <mjrosato@linux.ibm.com>
+ */
+
+#ifndef __KVM_S390_PCI_H
+#define __KVM_S390_PCI_H
+
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+#include <asm/airq.h>
+#include <asm/cpu.h>
+
+struct kvm_zdev {
+ struct zpci_dev *zdev;
+ struct kvm *kvm;
+ struct zpci_fib fib;
+ struct list_head entry;
+};
+
+struct zpci_gaite {
+ u32 gisa;
+ u8 gisc;
+ u8 count;
+ u8 reserved;
+ u8 aisbo;
+ u64 aisb;
+};
+
+struct zpci_aift {
+ struct zpci_gaite *gait;
+ struct airq_iv *sbv;
+ struct kvm_zdev **kzdev;
+ spinlock_t gait_lock; /* Protects the gait, used during AEN forward */
+ struct mutex aift_lock; /* Protects the other structures in aift */
+};
+
+extern struct zpci_aift *aift;
+
+static inline struct kvm *kvm_s390_pci_si_to_kvm(struct zpci_aift *aift,
+ unsigned long si)
+{
+ if (!IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM) || !aift->kzdev ||
+ !aift->kzdev[si])
+ return NULL;
+ return aift->kzdev[si]->kvm;
+};
+
+int kvm_s390_pci_aen_init(u8 nisc);
+void kvm_s390_pci_aen_exit(void);
+
+void kvm_s390_pci_init_list(struct kvm *kvm);
+void kvm_s390_pci_clear_list(struct kvm *kvm);
+
+int kvm_s390_pci_zpci_op(struct kvm *kvm, struct kvm_s390_zpci_op *args);
+
+int __init kvm_s390_pci_init(void);
+void kvm_s390_pci_exit(void);
+
+static inline bool kvm_s390_pci_interp_allowed(void)
+{
+ struct cpuid cpu_id;
+
+ get_cpu_id(&cpu_id);
+ switch (cpu_id.machine) {
+ case 0x2817:
+ case 0x2818:
+ case 0x2827:
+ case 0x2828:
+ case 0x2964:
+ case 0x2965:
+ /* No SHM on certain machines */
+ return false;
+ default:
+ return (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM) &&
+ sclp.has_zpci_lsi && sclp.has_aeni && sclp.has_aisi &&
+ sclp.has_aisii);
+ }
+}
+
+#endif /* __KVM_S390_PCI_H */
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index ed52ffa8d5d4..621a17fd1a1b 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -2,7 +2,7 @@
/*
* handling privileged instructions
*
- * Copyright IBM Corp. 2008, 2018
+ * Copyright IBM Corp. 2008, 2020
*
* Author(s): Carsten Otte <cotte@de.ibm.com>
* Christian Borntraeger <borntraeger@de.ibm.com>
@@ -11,20 +11,17 @@
#include <linux/kvm.h>
#include <linux/gfp.h>
#include <linux/errno.h>
-#include <linux/compat.h>
#include <linux/mm_types.h>
-
+#include <linux/pgtable.h>
+#include <linux/io.h>
#include <asm/asm-offsets.h>
#include <asm/facility.h>
#include <asm/current.h>
#include <asm/debug.h>
#include <asm/ebcdic.h>
#include <asm/sysinfo.h>
-#include <asm/pgtable.h>
#include <asm/page-states.h>
-#include <asm/pgalloc.h>
#include <asm/gmap.h>
-#include <asm/io.h>
#include <asm/ptrace.h>
#include <asm/sclp.h>
#include <asm/ap.h>
@@ -60,7 +57,7 @@ static int handle_gs(struct kvm_vcpu *vcpu)
if (test_kvm_facility(vcpu->kvm, 133)) {
VCPU_EVENT(vcpu, 3, "%s", "ENABLE: GS (lazy)");
preempt_disable();
- __ctl_set_bit(2, 4);
+ local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT);
current->thread.gs_cb = (struct gs_cb *)&vcpu->run->s.regs.gscb;
restore_gs_cb(current->thread.gs_cb);
preempt_enable();
@@ -103,7 +100,20 @@ static int handle_set_clock(struct kvm_vcpu *vcpu)
return kvm_s390_inject_prog_cond(vcpu, rc);
VCPU_EVENT(vcpu, 3, "SCK: setting guest TOD to 0x%llx", gtod.tod);
- kvm_s390_set_tod_clock(vcpu->kvm, &gtod);
+ /*
+ * To set the TOD clock the kvm lock must be taken, but the vcpu lock
+ * is already held in handle_set_clock. The usual lock order is the
+ * opposite. As SCK is deprecated and should not be used in several
+ * cases, for example when the multiple epoch facility or TOD clock
+ * steering facility is installed (see Principles of Operation), a
+ * slow path can be used. If the lock can not be taken via try_lock,
+ * the instruction will be retried via -EAGAIN at a later point in
+ * time.
+ */
+ if (!kvm_s390_try_set_tod_clock(vcpu->kvm, &gtod)) {
+ kvm_s390_retry_instr(vcpu);
+ return -EAGAIN;
+ }
kvm_s390_set_psw_cc(vcpu, 0);
return 0;
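The body of kvm_s390_try_set_tod_clock() is not part of this hunk; as a sketch of the try-lock pattern the comment describes (the helper __kvm_s390_set_tod_clock() carrying the actual TOD update is assumed), it boils down to:

int kvm_s390_try_set_tod_clock(struct kvm *kvm,
			       const struct kvm_s390_vm_tod_clock *gtod)
{
	if (!mutex_trylock(&kvm->lock))
		return 0;	/* contended: caller retries the instruction */
	__kvm_s390_set_tod_clock(kvm, gtod);
	mutex_unlock(&kvm->lock);
	return 1;
}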
@@ -270,18 +280,18 @@ static int handle_iske(struct kvm_vcpu *vcpu)
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
retry:
unlocked = false;
- down_read(&current->mm->mmap_sem);
+ mmap_read_lock(current->mm);
rc = get_guest_storage_key(current->mm, vmaddr, &key);
if (rc) {
- rc = fixup_user_fault(current, current->mm, vmaddr,
+ rc = fixup_user_fault(current->mm, vmaddr,
FAULT_FLAG_WRITE, &unlocked);
if (!rc) {
- up_read(&current->mm->mmap_sem);
+ mmap_read_unlock(current->mm);
goto retry;
}
}
- up_read(&current->mm->mmap_sem);
+ mmap_read_unlock(current->mm);
if (rc == -EFAULT)
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
if (rc < 0)
@@ -317,17 +327,17 @@ static int handle_rrbe(struct kvm_vcpu *vcpu)
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
retry:
unlocked = false;
- down_read(&current->mm->mmap_sem);
+ mmap_read_lock(current->mm);
rc = reset_guest_reference_bit(current->mm, vmaddr);
if (rc < 0) {
- rc = fixup_user_fault(current, current->mm, vmaddr,
+ rc = fixup_user_fault(current->mm, vmaddr,
FAULT_FLAG_WRITE, &unlocked);
if (!rc) {
- up_read(&current->mm->mmap_sem);
+ mmap_read_unlock(current->mm);
goto retry;
}
}
- up_read(&current->mm->mmap_sem);
+ mmap_read_unlock(current->mm);
if (rc == -EFAULT)
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
if (rc < 0)
@@ -385,19 +395,21 @@ static int handle_sske(struct kvm_vcpu *vcpu)
if (kvm_is_error_hva(vmaddr))
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
- down_read(&current->mm->mmap_sem);
+ mmap_read_lock(current->mm);
rc = cond_set_guest_storage_key(current->mm, vmaddr, key, &oldkey,
m3 & SSKE_NQ, m3 & SSKE_MR,
m3 & SSKE_MC);
if (rc < 0) {
- rc = fixup_user_fault(current, current->mm, vmaddr,
+ rc = fixup_user_fault(current->mm, vmaddr,
FAULT_FLAG_WRITE, &unlocked);
rc = !rc ? -EAGAIN : rc;
}
- up_read(&current->mm->mmap_sem);
+ mmap_read_unlock(current->mm);
if (rc == -EFAULT)
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+ if (rc == -EAGAIN)
+ continue;
if (rc < 0)
return rc;
start += PAGE_SIZE;
@@ -429,7 +441,7 @@ static int handle_ipte_interlock(struct kvm_vcpu *vcpu)
vcpu->stat.instruction_ipte_interlock++;
if (psw_bits(vcpu->arch.sie_block->gpsw).pstate)
return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
- wait_event(vcpu->kvm->arch.ipte_wq, !ipte_lock_held(vcpu));
+ wait_event(vcpu->kvm->arch.ipte_wq, !ipte_lock_held(vcpu->kvm));
kvm_s390_retry_instr(vcpu);
VCPU_EVENT(vcpu, 4, "%s", "retrying ipte interlock operation");
return 0;
@@ -611,6 +623,7 @@ static int handle_io_inst(struct kvm_vcpu *vcpu)
static int handle_pqap(struct kvm_vcpu *vcpu)
{
struct ap_queue_status status = {};
+ crypto_hook pqap_hook;
unsigned long reg0;
int ret;
uint8_t fc;
@@ -626,10 +639,12 @@ static int handle_pqap(struct kvm_vcpu *vcpu)
* available for the guest are AQIC and TAPQ with the t bit set
* since we do not set IC.3 (FIII) we currently will only intercept
* the AQIC function code.
+ * Note: running nested under z/VM can result in intercepts for other
+ * function codes, e.g. PQAP(QCI). We do not support this and bail out.
*/
reg0 = vcpu->run->s.regs.gprs[0];
fc = (reg0 >> 24) & 0xff;
- if (WARN_ON_ONCE(fc != 0x03))
+ if (fc != 0x03)
return -EOPNOTSUPP;
/* PQAP instruction is allowed for guest kernel only */
@@ -653,18 +668,20 @@ static int handle_pqap(struct kvm_vcpu *vcpu)
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
/*
- * Verify that the hook callback is registered, lock the owner
- * and call the hook.
+ * If the hook callback is registered, there will be a pointer to the
+ * hook function pointer in the kvm_s390_crypto structure. Lock the
+ * owner, retrieve the hook function pointer and call the hook.
*/
+ down_read(&vcpu->kvm->arch.crypto.pqap_hook_rwsem);
if (vcpu->kvm->arch.crypto.pqap_hook) {
- if (!try_module_get(vcpu->kvm->arch.crypto.pqap_hook->owner))
- return -EOPNOTSUPP;
- ret = vcpu->kvm->arch.crypto.pqap_hook->hook(vcpu);
- module_put(vcpu->kvm->arch.crypto.pqap_hook->owner);
+ pqap_hook = *vcpu->kvm->arch.crypto.pqap_hook;
+ ret = pqap_hook(vcpu);
if (!ret && vcpu->run->s.regs.gprs[1] & 0x00ff0000)
kvm_s390_set_psw_cc(vcpu, 3);
+ up_read(&vcpu->kvm->arch.crypto.pqap_hook_rwsem);
return ret;
}
+ up_read(&vcpu->kvm->arch.crypto.pqap_hook_rwsem);
/*
* A vfio_driver must register a hook.
* No hook means no driver to enable the SIE CRYCB and no queues.
@@ -855,10 +872,18 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
- if (fc > 3) {
- kvm_s390_set_psw_cc(vcpu, 3);
- return 0;
- }
+ /* Bail out on forbidden function codes */
+ if (fc > 3 && fc != 15)
+ goto out_no_data;
+
+ /*
+ * fc 15 is provided only with
+ * - PTF/CPU topology support through facility 15
+ * - KVM_CAP_S390_USER_STSI
+ */
+ if (fc == 15 && (!test_kvm_facility(vcpu->kvm, 11) ||
+ !vcpu->kvm->arch.user_stsi))
+ goto out_no_data;
if (vcpu->run->s.regs.gprs[0] & 0x0fffff00
|| vcpu->run->s.regs.gprs[1] & 0xffff0000)
@@ -872,13 +897,13 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
operand2 = kvm_s390_get_base_disp_s(vcpu, &ar);
- if (operand2 & 0xfff)
+ if (!kvm_s390_pv_cpu_is_protected(vcpu) && (operand2 & 0xfff))
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
switch (fc) {
case 1: /* same handling for 1 and 2 */
case 2:
- mem = get_zeroed_page(GFP_KERNEL);
+ mem = get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!mem)
goto out_no_data;
if (stsi((void *) mem, fc, sel1, sel2))
@@ -887,14 +912,22 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
case 3:
if (sel1 != 2 || sel2 != 2)
goto out_no_data;
- mem = get_zeroed_page(GFP_KERNEL);
+ mem = get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!mem)
goto out_no_data;
handle_stsi_3_2_2(vcpu, (void *) mem);
break;
+ case 15: /* fc 15 is fully handled in userspace */
+ insert_stsi_usr_data(vcpu, operand2, ar, fc, sel1, sel2);
+ trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2);
+ return -EREMOTE;
+ }
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ memcpy(sida_addr(vcpu->arch.sie_block), (void *)mem, PAGE_SIZE);
+ rc = 0;
+ } else {
+ rc = write_guest(vcpu, operand2, ar, (void *)mem, PAGE_SIZE);
}
-
- rc = write_guest(vcpu, operand2, ar, (void *)mem, PAGE_SIZE);
if (rc) {
rc = kvm_s390_inject_prog_cond(vcpu, rc);
goto out;
@@ -1084,15 +1117,15 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
if (rc)
return rc;
- down_read(&current->mm->mmap_sem);
+ mmap_read_lock(current->mm);
rc = cond_set_guest_storage_key(current->mm, vmaddr,
key, NULL, nq, mr, mc);
if (rc < 0) {
- rc = fixup_user_fault(current, current->mm, vmaddr,
+ rc = fixup_user_fault(current->mm, vmaddr,
FAULT_FLAG_WRITE, &unlocked);
rc = !rc ? -EAGAIN : rc;
}
- up_read(&current->mm->mmap_sem);
+ mmap_read_unlock(current->mm);
if (rc == -EFAULT)
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
if (rc == -EAGAIN)
@@ -1115,7 +1148,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
}
/*
- * Must be called with relevant read locks held (kvm->mm->mmap_sem, kvm->srcu)
+ * Must be called with relevant read locks held (kvm->mm->mmap_lock, kvm->srcu)
*/
static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc)
{
@@ -1213,9 +1246,9 @@ static int handle_essa(struct kvm_vcpu *vcpu)
* already correct, we do nothing and avoid the lock.
*/
if (vcpu->kvm->mm->context.uses_cmm == 0) {
- down_write(&vcpu->kvm->mm->mmap_sem);
+ mmap_write_lock(vcpu->kvm->mm);
vcpu->kvm->mm->context.uses_cmm = 1;
- up_write(&vcpu->kvm->mm->mmap_sem);
+ mmap_write_unlock(vcpu->kvm->mm);
}
/*
* If we are here, we are supposed to have CMMA enabled in
@@ -1232,11 +1265,11 @@ static int handle_essa(struct kvm_vcpu *vcpu)
} else {
int srcu_idx;
- down_read(&vcpu->kvm->mm->mmap_sem);
+ mmap_read_lock(vcpu->kvm->mm);
srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
i = __do_essa(vcpu, orc);
srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
- up_read(&vcpu->kvm->mm->mmap_sem);
+ mmap_read_unlock(vcpu->kvm->mm);
if (i < 0)
return i;
/* Account for the possible extra cbrl entry */
@@ -1244,10 +1277,10 @@ static int handle_essa(struct kvm_vcpu *vcpu)
}
vcpu->arch.sie_block->cbrlo &= PAGE_MASK; /* reset nceo */
cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo);
- down_read(&gmap->mm->mmap_sem);
+ mmap_read_lock(gmap->mm);
for (i = 0; i < entries; ++i)
__gmap_zap(gmap, cbrlo[i]);
- up_read(&gmap->mm->mmap_sem);
+ mmap_read_unlock(gmap->mm);
return 0;
}
@@ -1432,10 +1465,11 @@ int kvm_s390_handle_eb(struct kvm_vcpu *vcpu)
static int handle_tprot(struct kvm_vcpu *vcpu)
{
- u64 address1, address2;
- unsigned long hva, gpa;
- int ret = 0, cc = 0;
+ u64 address, operand2;
+ unsigned long gpa;
+ u8 access_key;
bool writable;
+ int ret, cc;
u8 ar;
vcpu->stat.instruction_tprot++;
@@ -1443,45 +1477,48 @@ static int handle_tprot(struct kvm_vcpu *vcpu)
if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
- kvm_s390_get_base_disp_sse(vcpu, &address1, &address2, &ar, NULL);
+ kvm_s390_get_base_disp_sse(vcpu, &address, &operand2, &ar, NULL);
+ access_key = (operand2 & 0xf0) >> 4;
- /* we only handle the Linux memory detection case:
- * access key == 0
- * everything else goes to userspace. */
- if (address2 & 0xf0)
- return -EOPNOTSUPP;
if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT)
- ipte_lock(vcpu);
- ret = guest_translate_address(vcpu, address1, ar, &gpa, GACC_STORE);
- if (ret == PGM_PROTECTION) {
+ ipte_lock(vcpu->kvm);
+
+ ret = guest_translate_address_with_key(vcpu, address, ar, &gpa,
+ GACC_STORE, access_key);
+ if (ret == 0) {
+ gfn_to_hva_prot(vcpu->kvm, gpa_to_gfn(gpa), &writable);
+ } else if (ret == PGM_PROTECTION) {
+ writable = false;
/* Write protected? Try again with read-only... */
- cc = 1;
- ret = guest_translate_address(vcpu, address1, ar, &gpa,
- GACC_FETCH);
+ ret = guest_translate_address_with_key(vcpu, address, ar, &gpa,
+ GACC_FETCH, access_key);
}
- if (ret) {
- if (ret == PGM_ADDRESSING || ret == PGM_TRANSLATION_SPEC) {
- ret = kvm_s390_inject_program_int(vcpu, ret);
- } else if (ret > 0) {
- /* Translation not available */
- kvm_s390_set_psw_cc(vcpu, 3);
+ if (ret >= 0) {
+ cc = -1;
+
+ /* Fetching permitted; storing permitted */
+ if (ret == 0 && writable)
+ cc = 0;
+ /* Fetching permitted; storing not permitted */
+ else if (ret == 0 && !writable)
+ cc = 1;
+ /* Fetching not permitted; storing not permitted */
+ else if (ret == PGM_PROTECTION)
+ cc = 2;
+ /* Translation not available */
+ else if (ret != PGM_ADDRESSING && ret != PGM_TRANSLATION_SPEC)
+ cc = 3;
+
+ if (cc != -1) {
+ kvm_s390_set_psw_cc(vcpu, cc);
ret = 0;
+ } else {
+ ret = kvm_s390_inject_program_int(vcpu, ret);
}
- goto out_unlock;
}
- hva = gfn_to_hva_prot(vcpu->kvm, gpa_to_gfn(gpa), &writable);
- if (kvm_is_error_hva(hva)) {
- ret = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
- } else {
- if (!writable)
- cc = 1; /* Write not permitted ==> read-only */
- kvm_s390_set_psw_cc(vcpu, cc);
- /* Note: CC2 only occurs for storage keys (not supported yet) */
- }
-out_unlock:
if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT)
- ipte_unlock(vcpu);
+ ipte_unlock(vcpu->kvm);
return ret;
}
diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
new file mode 100644
index 000000000000..75e81ba26d04
--- /dev/null
+++ b/arch/s390/kvm/pv.c
@@ -0,0 +1,894 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Hosting Protected Virtual Machines
+ *
+ * Copyright IBM Corp. 2019, 2020
+ * Author(s): Janosch Frank <frankja@linux.ibm.com>
+ */
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/minmax.h>
+#include <linux/pagemap.h>
+#include <linux/sched/signal.h>
+#include <asm/gmap.h>
+#include <asm/uv.h>
+#include <asm/mman.h>
+#include <linux/pagewalk.h>
+#include <linux/sched/mm.h>
+#include <linux/mmu_notifier.h>
+#include "kvm-s390.h"
+
+bool kvm_s390_pv_is_protected(struct kvm *kvm)
+{
+ lockdep_assert_held(&kvm->lock);
+ return !!kvm_s390_pv_get_handle(kvm);
+}
+EXPORT_SYMBOL_GPL(kvm_s390_pv_is_protected);
+
+bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu)
+{
+ lockdep_assert_held(&vcpu->mutex);
+ return !!kvm_s390_pv_cpu_get_handle(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_s390_pv_cpu_is_protected);
+
+/**
+ * struct pv_vm_to_be_destroyed - Represents a protected VM that needs to
+ * be destroyed
+ *
+ * @list: list head for the list of leftover VMs
+ * @old_gmap_table: the gmap table of the leftover protected VM
+ * @handle: the handle of the leftover protected VM
+ * @stor_var: pointer to the variable storage of the leftover protected VM
+ * @stor_base: address of the base storage of the leftover protected VM
+ *
+ * Represents a protected VM that is still registered with the Ultravisor,
+ * but which does not correspond any longer to an active KVM VM. It should
+ * be destroyed at some point later, either asynchronously or when the
+ * process terminates.
+ */
+struct pv_vm_to_be_destroyed {
+ struct list_head list;
+ unsigned long old_gmap_table;
+ u64 handle;
+ void *stor_var;
+ unsigned long stor_base;
+};
+
+static void kvm_s390_clear_pv_state(struct kvm *kvm)
+{
+ kvm->arch.pv.handle = 0;
+ kvm->arch.pv.guest_len = 0;
+ kvm->arch.pv.stor_base = 0;
+ kvm->arch.pv.stor_var = NULL;
+}
+
+int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc)
+{
+ int cc;
+
+ if (!kvm_s390_pv_cpu_get_handle(vcpu))
+ return 0;
+
+ cc = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu), UVC_CMD_DESTROY_SEC_CPU, rc, rrc);
+
+ KVM_UV_EVENT(vcpu->kvm, 3, "PROTVIRT DESTROY VCPU %d: rc %x rrc %x",
+ vcpu->vcpu_id, *rc, *rrc);
+ WARN_ONCE(cc, "protvirt destroy cpu failed rc %x rrc %x", *rc, *rrc);
+
+ /* Intended memory leak for something that should never happen. */
+ if (!cc)
+ free_pages(vcpu->arch.pv.stor_base,
+ get_order(uv_info.guest_cpu_stor_len));
+
+ free_page((unsigned long)sida_addr(vcpu->arch.sie_block));
+ vcpu->arch.sie_block->pv_handle_cpu = 0;
+ vcpu->arch.sie_block->pv_handle_config = 0;
+ memset(&vcpu->arch.pv, 0, sizeof(vcpu->arch.pv));
+ vcpu->arch.sie_block->sdf = 0;
+ /*
+ * The sidad field (for sdf == 2) is now the gbea field (for sdf == 0).
+ * Use the reset value of gbea to avoid leaking the kernel pointer of
+ * the just freed sida.
+ */
+ vcpu->arch.sie_block->gbea = 1;
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+
+ return cc ? EIO : 0;
+}
+
+int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc)
+{
+ struct uv_cb_csc uvcb = {
+ .header.cmd = UVC_CMD_CREATE_SEC_CPU,
+ .header.len = sizeof(uvcb),
+ };
+ void *sida_addr;
+ int cc;
+
+ if (kvm_s390_pv_cpu_get_handle(vcpu))
+ return -EINVAL;
+
+ vcpu->arch.pv.stor_base = __get_free_pages(GFP_KERNEL_ACCOUNT,
+ get_order(uv_info.guest_cpu_stor_len));
+ if (!vcpu->arch.pv.stor_base)
+ return -ENOMEM;
+
+ /* Input */
+ uvcb.guest_handle = kvm_s390_pv_get_handle(vcpu->kvm);
+ uvcb.num = vcpu->arch.sie_block->icpua;
+ uvcb.state_origin = virt_to_phys(vcpu->arch.sie_block);
+ uvcb.stor_origin = virt_to_phys((void *)vcpu->arch.pv.stor_base);
+
+ /* Alloc Secure Instruction Data Area Designation */
+ sida_addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!sida_addr) {
+ free_pages(vcpu->arch.pv.stor_base,
+ get_order(uv_info.guest_cpu_stor_len));
+ return -ENOMEM;
+ }
+ vcpu->arch.sie_block->sidad = virt_to_phys(sida_addr);
+
+ cc = uv_call(0, (u64)&uvcb);
+ *rc = uvcb.header.rc;
+ *rrc = uvcb.header.rrc;
+ KVM_UV_EVENT(vcpu->kvm, 3,
+ "PROTVIRT CREATE VCPU: cpu %d handle %llx rc %x rrc %x",
+ vcpu->vcpu_id, uvcb.cpu_handle, uvcb.header.rc,
+ uvcb.header.rrc);
+
+ if (cc) {
+ u16 dummy;
+
+ kvm_s390_pv_destroy_cpu(vcpu, &dummy, &dummy);
+ return -EIO;
+ }
+
+ /* Output */
+ vcpu->arch.pv.handle = uvcb.cpu_handle;
+ vcpu->arch.sie_block->pv_handle_cpu = uvcb.cpu_handle;
+ vcpu->arch.sie_block->pv_handle_config = kvm_s390_pv_get_handle(vcpu->kvm);
+ vcpu->arch.sie_block->sdf = 2;
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ return 0;
+}
+
+/* only free resources when the destroy was successful */
+static void kvm_s390_pv_dealloc_vm(struct kvm *kvm)
+{
+ vfree(kvm->arch.pv.stor_var);
+ free_pages(kvm->arch.pv.stor_base,
+ get_order(uv_info.guest_base_stor_len));
+ kvm_s390_clear_pv_state(kvm);
+}
+
+static int kvm_s390_pv_alloc_vm(struct kvm *kvm)
+{
+ unsigned long base = uv_info.guest_base_stor_len;
+ unsigned long virt = uv_info.guest_virt_var_stor_len;
+ unsigned long npages = 0, vlen = 0;
+
+ kvm->arch.pv.stor_var = NULL;
+ kvm->arch.pv.stor_base = __get_free_pages(GFP_KERNEL_ACCOUNT, get_order(base));
+ if (!kvm->arch.pv.stor_base)
+ return -ENOMEM;
+
+ /*
+ * Calculate current guest storage for allocation of the
+ * variable storage, which is based on the length in MB.
+ *
+ * Slots are sorted by GFN
+ */
+ mutex_lock(&kvm->slots_lock);
+ npages = kvm_s390_get_gfn_end(kvm_memslots(kvm));
+ mutex_unlock(&kvm->slots_lock);
+
+ kvm->arch.pv.guest_len = npages * PAGE_SIZE;
+
+ /* Allocate variable storage */
+ vlen = ALIGN(virt * ((npages * PAGE_SIZE) / HPAGE_SIZE), PAGE_SIZE);
+ vlen += uv_info.guest_virt_base_stor_len;
+ kvm->arch.pv.stor_var = vzalloc(vlen);
+ if (!kvm->arch.pv.stor_var)
+ goto out_err;
+ return 0;
+
+out_err:
+ kvm_s390_pv_dealloc_vm(kvm);
+ return -ENOMEM;
+}
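A worked example of the variable-storage sizing above, assuming the 1 MB HPAGE_SIZE used on s390 and illustrative UV query values:

/* guest of 4 GiB -> npages = 1048576, guest_len = 4 GiB
 * guest_len / HPAGE_SIZE   = 4096 one-megabyte segments
 * guest_virt_var_stor_len  = 512  (assumed, per-segment cost)
 * guest_virt_base_stor_len = 8192 (assumed, fixed cost)
 *
 * vlen = ALIGN(512 * 4096, PAGE_SIZE) + 8192
 *      = 2097152 + 8192 = 2105344 bytes (~2 MiB) handed to vzalloc()
 */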
+
+/**
+ * kvm_s390_pv_dispose_one_leftover - Clean up one leftover protected VM.
+ * @kvm: the KVM that was associated with this leftover protected VM
+ * @leftover: details about the leftover protected VM that needs a clean up
+ * @rc: the RC code of the Destroy Secure Configuration UVC
+ * @rrc: the RRC code of the Destroy Secure Configuration UVC
+ *
+ * Destroy one leftover protected VM.
+ * On success, kvm->mm->context.protected_count will be decremented atomically
+ * and all other resources used by the VM will be freed.
+ *
+ * Return: 0 in case of success, otherwise 1
+ */
+static int kvm_s390_pv_dispose_one_leftover(struct kvm *kvm,
+ struct pv_vm_to_be_destroyed *leftover,
+ u16 *rc, u16 *rrc)
+{
+ int cc;
+
+ /* It used the destroy-fast UVC, nothing left to do here */
+ if (!leftover->handle)
+ goto done_fast;
+ cc = uv_cmd_nodata(leftover->handle, UVC_CMD_DESTROY_SEC_CONF, rc, rrc);
+ KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY LEFTOVER VM: rc %x rrc %x", *rc, *rrc);
+ WARN_ONCE(cc, "protvirt destroy leftover vm failed rc %x rrc %x", *rc, *rrc);
+ if (cc)
+ return cc;
+ /*
+ * Intentionally leak unusable memory. If the UVC fails, the memory
+ * used for the VM and its metadata is permanently unusable.
+ * This can only happen in case of a serious KVM or hardware bug; it
+ * is not expected to happen in normal operation.
+ */
+ free_pages(leftover->stor_base, get_order(uv_info.guest_base_stor_len));
+ free_pages(leftover->old_gmap_table, CRST_ALLOC_ORDER);
+ vfree(leftover->stor_var);
+done_fast:
+ atomic_dec(&kvm->mm->context.protected_count);
+ return 0;
+}
+
+/**
+ * kvm_s390_destroy_lower_2g - Destroy the first 2GB of protected guest memory.
+ * @kvm: the VM whose memory is to be cleared.
+ *
+ * Destroy the first 2GB of guest memory, to avoid prefix issues after reboot.
+ * The CPUs of the protected VM need to be destroyed beforehand.
+ */
+static void kvm_s390_destroy_lower_2g(struct kvm *kvm)
+{
+ const unsigned long pages_2g = SZ_2G / PAGE_SIZE;
+ struct kvm_memory_slot *slot;
+ unsigned long len;
+ int srcu_idx;
+
+ srcu_idx = srcu_read_lock(&kvm->srcu);
+
+ /* Take the memslot containing guest absolute address 0 */
+ slot = gfn_to_memslot(kvm, 0);
+ /* Clear all slots or parts thereof that are below 2GB */
+ while (slot && slot->base_gfn < pages_2g) {
+ len = min_t(u64, slot->npages, pages_2g - slot->base_gfn) * PAGE_SIZE;
+ s390_uv_destroy_range(kvm->mm, slot->userspace_addr, slot->userspace_addr + len);
+ /* Take the next memslot */
+ slot = gfn_to_memslot(kvm, slot->base_gfn + slot->npages);
+ }
+
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
+}
+
+static int kvm_s390_pv_deinit_vm_fast(struct kvm *kvm, u16 *rc, u16 *rrc)
+{
+ struct uv_cb_destroy_fast uvcb = {
+ .header.cmd = UVC_CMD_DESTROY_SEC_CONF_FAST,
+ .header.len = sizeof(uvcb),
+ .handle = kvm_s390_pv_get_handle(kvm),
+ };
+ int cc;
+
+ cc = uv_call_sched(0, (u64)&uvcb);
+ if (rc)
+ *rc = uvcb.header.rc;
+ if (rrc)
+ *rrc = uvcb.header.rrc;
+ WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
+ KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM FAST: rc %x rrc %x",
+ uvcb.header.rc, uvcb.header.rrc);
+ WARN_ONCE(cc && uvcb.header.rc != 0x104,
+ "protvirt destroy vm fast failed handle %llx rc %x rrc %x",
+ kvm_s390_pv_get_handle(kvm), uvcb.header.rc, uvcb.header.rrc);
+ /* Intended memory leak on "impossible" error */
+ if (!cc)
+ kvm_s390_pv_dealloc_vm(kvm);
+ return cc ? -EIO : 0;
+}
+
+static inline bool is_destroy_fast_available(void)
+{
+ return test_bit_inv(BIT_UVC_CMD_DESTROY_SEC_CONF_FAST, uv_info.inst_calls_list);
+}
+
+/**
+ * kvm_s390_pv_set_aside - Set aside a protected VM for later teardown.
+ * @kvm: the VM
+ * @rc: return value for the RC field of the UVCB
+ * @rrc: return value for the RRC field of the UVCB
+ *
+ * Set aside the protected VM for a subsequent teardown. The VM will be able
+ * to continue immediately as a non-secure VM, and the information needed to
+ * properly tear down the protected VM is set aside. If another protected VM
+ * was already set aside without starting its teardown, this function will
+ * fail.
+ * The CPUs of the protected VM need to be destroyed beforehand.
+ *
+ * Context: kvm->lock needs to be held
+ *
+ * Return: 0 in case of success, -EINVAL if another protected VM was already set
+ * aside, -ENOMEM if the system ran out of memory.
+ */
+int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc)
+{
+ struct pv_vm_to_be_destroyed *priv;
+ int res = 0;
+
+ lockdep_assert_held(&kvm->lock);
+ /*
+ * If another protected VM was already prepared for teardown, refuse.
+ * A normal deinitialization has to be performed instead.
+ */
+ if (kvm->arch.pv.set_aside)
+ return -EINVAL;
+
+ /* Guest with segment type ASCE, refuse to destroy asynchronously */
+ if ((kvm->arch.gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT)
+ return -EINVAL;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+
+ if (is_destroy_fast_available()) {
+ res = kvm_s390_pv_deinit_vm_fast(kvm, rc, rrc);
+ } else {
+ priv->stor_var = kvm->arch.pv.stor_var;
+ priv->stor_base = kvm->arch.pv.stor_base;
+ priv->handle = kvm_s390_pv_get_handle(kvm);
+ priv->old_gmap_table = (unsigned long)kvm->arch.gmap->table;
+ WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
+ if (s390_replace_asce(kvm->arch.gmap))
+ res = -ENOMEM;
+ }
+
+ if (res) {
+ kfree(priv);
+ return res;
+ }
+
+ kvm_s390_destroy_lower_2g(kvm);
+ kvm_s390_clear_pv_state(kvm);
+ kvm->arch.pv.set_aside = priv;
+
+ *rc = UVC_RC_EXECUTED;
+ *rrc = 42;
+ return 0;
+}
+
+/**
+ * kvm_s390_pv_deinit_vm - Deinitialize the current protected VM
+ * @kvm: the KVM whose protected VM needs to be deinitialized
+ * @rc: the RC code of the UVC
+ * @rrc: the RRC code of the UVC
+ *
+ * Deinitialize the current protected VM. This function will destroy and
+ * cleanup the current protected VM, but it will not cleanup the guest
+ * memory. This function should only be called when the protected VM has
+ * just been created and therefore does not have any guest memory, or when
+ * the caller cleans up the guest memory separately.
+ *
+ * This function should not fail, but if it does, the donated memory must
+ * not be freed.
+ *
+ * Context: kvm->lock needs to be held
+ *
+ * Return: 0 in case of success, otherwise -EIO
+ */
+int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
+{
+ int cc;
+
+ cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
+ UVC_CMD_DESTROY_SEC_CONF, rc, rrc);
+ WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
+ if (!cc) {
+ atomic_dec(&kvm->mm->context.protected_count);
+ kvm_s390_pv_dealloc_vm(kvm);
+ } else {
+ /* Intended memory leak on "impossible" error */
+ s390_replace_asce(kvm->arch.gmap);
+ }
+ KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM: rc %x rrc %x", *rc, *rrc);
+ WARN_ONCE(cc, "protvirt destroy vm failed rc %x rrc %x", *rc, *rrc);
+
+ return cc ? -EIO : 0;
+}
+
+/**
+ * kvm_s390_pv_deinit_cleanup_all - Clean up all protected VMs associated
+ * with a specific KVM.
+ * @kvm: the KVM to be cleaned up
+ * @rc: the RC code of the first failing UVC
+ * @rrc: the RRC code of the first failing UVC
+ *
+ * This function will clean up all protected VMs associated with a KVM.
+ * This includes the active one, the one prepared for deinitialization with
+ * kvm_s390_pv_set_aside, and any still pending in the need_cleanup list.
+ *
+ * Context: kvm->lock needs to be held unless being called from
+ * kvm_arch_destroy_vm.
+ *
+ * Return: 0 if all VMs are successfully cleaned up, otherwise -EIO
+ */
+int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc)
+{
+ struct pv_vm_to_be_destroyed *cur;
+ bool need_zap = false;
+ u16 _rc, _rrc;
+ int cc = 0;
+
+ /*
+ * Nothing to do if the counter was already 0. Otherwise make sure
+ * the counter does not reach 0 before calling s390_uv_destroy_range.
+ */
+ if (!atomic_inc_not_zero(&kvm->mm->context.protected_count))
+ return 0;
+
+ *rc = 1;
+ /* If the current VM is protected, destroy it */
+ if (kvm_s390_pv_get_handle(kvm)) {
+ cc = kvm_s390_pv_deinit_vm(kvm, rc, rrc);
+ need_zap = true;
+ }
+
+ /* If a previous protected VM was set aside, put it in the need_cleanup list */
+ if (kvm->arch.pv.set_aside) {
+ list_add(kvm->arch.pv.set_aside, &kvm->arch.pv.need_cleanup);
+ kvm->arch.pv.set_aside = NULL;
+ }
+
+ /* Cleanup all protected VMs in the need_cleanup list */
+ while (!list_empty(&kvm->arch.pv.need_cleanup)) {
+ cur = list_first_entry(&kvm->arch.pv.need_cleanup, typeof(*cur), list);
+ need_zap = true;
+ if (kvm_s390_pv_dispose_one_leftover(kvm, cur, &_rc, &_rrc)) {
+ cc = 1;
+ /*
+ * Only return the first error rc and rrc, so make
+ * sure it is not overwritten. All destroys will
+ * additionally be reported via KVM_UV_EVENT().
+ */
+ if (*rc == UVC_RC_EXECUTED) {
+ *rc = _rc;
+ *rrc = _rrc;
+ }
+ }
+ list_del(&cur->list);
+ kfree(cur);
+ }
+
+ /*
+ * If the mm still has a mapping, try to mark all its pages as
+ * accessible. The counter should not reach zero before this
+ * cleanup has been performed.
+ */
+ if (need_zap && mmget_not_zero(kvm->mm)) {
+ s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE);
+ mmput(kvm->mm);
+ }
+
+ /* Now the counter can safely reach 0 */
+ atomic_dec(&kvm->mm->context.protected_count);
+ return cc ? -EIO : 0;
+}
+
+/**
+ * kvm_s390_pv_deinit_aside_vm - Teardown a previously set aside protected VM.
+ * @kvm: the VM previously associated with the protected VM
+ * @rc: return value for the RC field of the UVCB
+ * @rrc: return value for the RRC field of the UVCB
+ *
+ * Tear down the protected VM that had been previously prepared for teardown
+ * using kvm_s390_pv_set_aside. Ideally this should be called by
+ * userspace asynchronously from a separate thread.
+ *
+ * Context: kvm->lock must not be held.
+ *
+ * Return: 0 in case of success, -EINVAL if no protected VM had been
+ * prepared for asynchronous teardown, -EIO in case of other errors.
+ */
+int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
+{
+ struct pv_vm_to_be_destroyed *p;
+ int ret = 0;
+
+ lockdep_assert_not_held(&kvm->lock);
+ mutex_lock(&kvm->lock);
+ p = kvm->arch.pv.set_aside;
+ kvm->arch.pv.set_aside = NULL;
+ mutex_unlock(&kvm->lock);
+ if (!p)
+ return -EINVAL;
+
+ /* When a fatal signal is received, stop immediately */
+ if (s390_uv_destroy_range_interruptible(kvm->mm, 0, TASK_SIZE_MAX))
+ goto done;
+ if (kvm_s390_pv_dispose_one_leftover(kvm, p, rc, rrc))
+ ret = -EIO;
+ kfree(p);
+ p = NULL;
+done:
+ /*
+ * p is not NULL if we aborted because of a fatal signal, in which
+ * case queue the leftover for later cleanup.
+ */
+ if (p) {
+ mutex_lock(&kvm->lock);
+ list_add(&p->list, &kvm->arch.pv.need_cleanup);
+ mutex_unlock(&kvm->lock);
+ /* Did not finish, but pretend things went well */
+ *rc = UVC_RC_EXECUTED;
+ *rrc = 42;
+ }
+ return ret;
+}
+
+static void kvm_s390_pv_mmu_notifier_release(struct mmu_notifier *subscription,
+ struct mm_struct *mm)
+{
+ struct kvm *kvm = container_of(subscription, struct kvm, arch.pv.mmu_notifier);
+ u16 dummy;
+ int r;
+
+ /*
+ * No locking is needed since this is the last thread of the last user of this
+ * struct mm.
+ * When the struct kvm gets deinitialized, this notifier is also
+ * unregistered. This means that if this notifier runs, then the
+ * struct kvm is still valid.
+ */
+ r = kvm_s390_cpus_from_pv(kvm, &dummy, &dummy);
+ if (!r && is_destroy_fast_available() && kvm_s390_pv_get_handle(kvm))
+ kvm_s390_pv_deinit_vm_fast(kvm, &dummy, &dummy);
+}
+
+static const struct mmu_notifier_ops kvm_s390_pv_mmu_notifier_ops = {
+ .release = kvm_s390_pv_mmu_notifier_release,
+};
+
+int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
+{
+ struct uv_cb_cgc uvcb = {
+ .header.cmd = UVC_CMD_CREATE_SEC_CONF,
+ .header.len = sizeof(uvcb)
+ };
+ int cc, ret;
+ u16 dummy;
+
+ ret = kvm_s390_pv_alloc_vm(kvm);
+ if (ret)
+ return ret;
+
+ /* Inputs */
+ uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */
+ uvcb.guest_stor_len = kvm->arch.pv.guest_len;
+ uvcb.guest_asce = kvm->arch.gmap->asce;
+ uvcb.guest_sca = virt_to_phys(kvm->arch.sca);
+ uvcb.conf_base_stor_origin =
+ virt_to_phys((void *)kvm->arch.pv.stor_base);
+ uvcb.conf_virt_stor_origin = (u64)kvm->arch.pv.stor_var;
+ uvcb.flags.ap_allow_instr = kvm->arch.model.uv_feat_guest.ap;
+ uvcb.flags.ap_instr_intr = kvm->arch.model.uv_feat_guest.ap_intr;
+
+ cc = uv_call_sched(0, (u64)&uvcb);
+ *rc = uvcb.header.rc;
+ *rrc = uvcb.header.rrc;
+ KVM_UV_EVENT(kvm, 3, "PROTVIRT CREATE VM: handle %llx len %llx rc %x rrc %x flags %04x",
+ uvcb.guest_handle, uvcb.guest_stor_len, *rc, *rrc, uvcb.flags.raw);
+
+ /* Outputs */
+ kvm->arch.pv.handle = uvcb.guest_handle;
+
+ atomic_inc(&kvm->mm->context.protected_count);
+ if (cc) {
+ if (uvcb.header.rc & UVC_RC_NEED_DESTROY) {
+ kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy);
+ } else {
+ atomic_dec(&kvm->mm->context.protected_count);
+ kvm_s390_pv_dealloc_vm(kvm);
+ }
+ return -EIO;
+ }
+ kvm->arch.gmap->guest_handle = uvcb.guest_handle;
+ /* Add the notifier only once. No races because we hold kvm->lock */
+ if (kvm->arch.pv.mmu_notifier.ops != &kvm_s390_pv_mmu_notifier_ops) {
+ kvm->arch.pv.mmu_notifier.ops = &kvm_s390_pv_mmu_notifier_ops;
+ mmu_notifier_register(&kvm->arch.pv.mmu_notifier, kvm->mm);
+ }
+ return 0;
+}
+
+int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc,
+ u16 *rrc)
+{
+ struct uv_cb_ssc uvcb = {
+ .header.cmd = UVC_CMD_SET_SEC_CONF_PARAMS,
+ .header.len = sizeof(uvcb),
+ .sec_header_origin = (u64)hdr,
+ .sec_header_len = length,
+ .guest_handle = kvm_s390_pv_get_handle(kvm),
+ };
+ int cc = uv_call(0, (u64)&uvcb);
+
+ *rc = uvcb.header.rc;
+ *rrc = uvcb.header.rrc;
+ KVM_UV_EVENT(kvm, 3, "PROTVIRT VM SET PARMS: rc %x rrc %x",
+ *rc, *rrc);
+ return cc ? -EINVAL : 0;
+}
+
+static int unpack_one(struct kvm *kvm, unsigned long addr, u64 tweak,
+ u64 offset, u16 *rc, u16 *rrc)
+{
+ struct uv_cb_unp uvcb = {
+ .header.cmd = UVC_CMD_UNPACK_IMG,
+ .header.len = sizeof(uvcb),
+ .guest_handle = kvm_s390_pv_get_handle(kvm),
+ .gaddr = addr,
+ .tweak[0] = tweak,
+ .tweak[1] = offset,
+ };
+ int ret = gmap_make_secure(kvm->arch.gmap, addr, &uvcb);
+
+ *rc = uvcb.header.rc;
+ *rrc = uvcb.header.rrc;
+
+ if (ret && ret != -EAGAIN)
+ KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: failed addr %llx with rc %x rrc %x",
+ uvcb.gaddr, *rc, *rrc);
+ return ret;
+}
+
+int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size,
+ unsigned long tweak, u16 *rc, u16 *rrc)
+{
+ u64 offset = 0;
+ int ret = 0;
+
+ if (addr & ~PAGE_MASK || !size || size & ~PAGE_MASK)
+ return -EINVAL;
+
+ KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: start addr %lx size %lx",
+ addr, size);
+
+ while (offset < size) {
+ ret = unpack_one(kvm, addr, tweak, offset, rc, rrc);
+ if (ret == -EAGAIN) {
+ cond_resched();
+ if (fatal_signal_pending(current))
+ break;
+ continue;
+ }
+ if (ret)
+ break;
+ addr += PAGE_SIZE;
+ offset += PAGE_SIZE;
+ }
+ if (!ret)
+ KVM_UV_EVENT(kvm, 3, "%s", "PROTVIRT VM UNPACK: successful");
+ return ret;
+}
+
+int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state)
+{
+ struct uv_cb_cpu_set_state uvcb = {
+ .header.cmd = UVC_CMD_CPU_SET_STATE,
+ .header.len = sizeof(uvcb),
+ .cpu_handle = kvm_s390_pv_cpu_get_handle(vcpu),
+ .state = state,
+ };
+ int cc;
+
+ cc = uv_call(0, (u64)&uvcb);
+ KVM_UV_EVENT(vcpu->kvm, 3, "PROTVIRT SET CPU %d STATE %d rc %x rrc %x",
+ vcpu->vcpu_id, state, uvcb.header.rc, uvcb.header.rrc);
+ if (cc)
+ return -EINVAL;
+ return 0;
+}
+
+int kvm_s390_pv_dump_cpu(struct kvm_vcpu *vcpu, void *buff, u16 *rc, u16 *rrc)
+{
+ struct uv_cb_dump_cpu uvcb = {
+ .header.cmd = UVC_CMD_DUMP_CPU,
+ .header.len = sizeof(uvcb),
+ .cpu_handle = vcpu->arch.pv.handle,
+ .dump_area_origin = (u64)buff,
+ };
+ int cc;
+
+ cc = uv_call_sched(0, (u64)&uvcb);
+ *rc = uvcb.header.rc;
+ *rrc = uvcb.header.rrc;
+ return cc;
+}
+
+/* Size of the cache for the storage state dump data. 1MB for now */
+#define DUMP_BUFF_LEN HPAGE_SIZE
+
+/**
+ * kvm_s390_pv_dump_stor_state
+ *
+ * @kvm: pointer to the guest's KVM struct
+ * @buff_user: Userspace pointer where we will write the results to
+ * @gaddr: Starting absolute guest address for which the storage state
+ * is requested.
+ * @buff_user_len: Length of the buff_user buffer
+ * @rc: Pointer to where the uvcb return code is stored
+ * @rrc: Pointer to where the uvcb return reason code is stored
+ *
+ * Stores buff_user_len bytes of tweak component values to buff_user
+ * starting with the 1MB block specified by the absolute guest address
+ * (gaddr). The gaddr pointer will be updated with the last address
+ * for which data was written when returning to userspace. buff_user
+ * might be written to even if an error rc is returned. For instance
+ * if we encounter a fault after writing the first page of data.
+ *
+ * Context: kvm->lock needs to be held
+ *
+ * Return:
+ * 0 on success
+ * -ENOMEM if allocating the cache fails
+ * -EINVAL if gaddr is not aligned to 1MB
+ * -EINVAL if buff_user_len is not aligned to uv_info.conf_dump_storage_state_len
+ * -EINVAL if the UV call fails, rc and rrc will be set in this case
+ * -EFAULT if copying the result to buff_user failed
+ */
+int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user,
+ u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc)
+{
+ struct uv_cb_dump_stor_state uvcb = {
+ .header.cmd = UVC_CMD_DUMP_CONF_STOR_STATE,
+ .header.len = sizeof(uvcb),
+ .config_handle = kvm->arch.pv.handle,
+ .gaddr = *gaddr,
+ .dump_area_origin = 0,
+ };
+ const u64 increment_len = uv_info.conf_dump_storage_state_len;
+ size_t buff_kvm_size;
+ size_t size_done = 0;
+ u8 *buff_kvm = NULL;
+ int cc, ret;
+
+ ret = -EINVAL;
+ /* UV call processes 1MB guest storage chunks at a time */
+ if (!IS_ALIGNED(*gaddr, HPAGE_SIZE))
+ goto out;
+
+ /*
+ * We provide the storage state for 1MB chunks of guest
+ * storage. The buffer will need to be aligned to
+ * conf_dump_storage_state_len so we don't end on a partial
+ * chunk.
+ */
+ if (!buff_user_len ||
+ !IS_ALIGNED(buff_user_len, increment_len))
+ goto out;
+
+ /*
+ * Allocate a buffer from which we will later copy to the user
+ * process. We don't want userspace to dictate our buffer size
+ * so we limit it to DUMP_BUFF_LEN.
+ */
+ ret = -ENOMEM;
+ buff_kvm_size = min_t(u64, buff_user_len, DUMP_BUFF_LEN);
+ buff_kvm = vzalloc(buff_kvm_size);
+ if (!buff_kvm)
+ goto out;
+
+ ret = 0;
+ uvcb.dump_area_origin = (u64)buff_kvm;
+ /* We will loop until the user buffer is filled or an error occurs */
+ do {
+ /* Get 1MB worth of guest storage state data */
+ cc = uv_call_sched(0, (u64)&uvcb);
+
+ /* All or nothing */
+ if (cc) {
+ ret = -EINVAL;
+ break;
+ }
+
+ size_done += increment_len;
+ uvcb.dump_area_origin += increment_len;
+ buff_user_len -= increment_len;
+ uvcb.gaddr += HPAGE_SIZE;
+
+ /* KVM Buffer full, time to copy to the process */
+ if (!buff_user_len || size_done == DUMP_BUFF_LEN) {
+ if (copy_to_user(buff_user, buff_kvm, size_done)) {
+ ret = -EFAULT;
+ break;
+ }
+
+ buff_user += size_done;
+ size_done = 0;
+ uvcb.dump_area_origin = (u64)buff_kvm;
+ }
+ } while (buff_user_len);
+
+ /* Report back where we ended dumping */
+ *gaddr = uvcb.gaddr;
+
+ /* Let's only log errors, we don't want to spam */
+out:
+ if (ret)
+ KVM_UV_EVENT(kvm, 3,
+ "PROTVIRT DUMP STORAGE STATE: addr %llx ret %d, uvcb rc %x rrc %x",
+ uvcb.gaddr, ret, uvcb.header.rc, uvcb.header.rrc);
+ *rc = uvcb.header.rc;
+ *rrc = uvcb.header.rrc;
+ vfree(buff_kvm);
+
+ return ret;
+}
+
+/**
+ * kvm_s390_pv_dump_complete
+ *
+ * @kvm: pointer to the guest's KVM struct
+ * @buff_user: Userspace pointer where we will write the results to
+ * @rc: Pointer to where the uvcb return code is stored
+ * @rrc: Pointer to where the uvcb return reason code is stored
+ *
+ * Completes the dumping operation and writes the completion data to
+ * user space.
+ *
+ * Context: kvm->lock needs to be held
+ *
+ * Return:
+ * 0 on success
+ * -ENOMEM if allocating the completion buffer fails
+ * -EINVAL if the UV call fails, rc and rrc will be set in this case
+ * -EFAULT if copying the result to buff_user failed
+ */
+int kvm_s390_pv_dump_complete(struct kvm *kvm, void __user *buff_user,
+ u16 *rc, u16 *rrc)
+{
+ struct uv_cb_dump_complete complete = {
+ .header.len = sizeof(complete),
+ .header.cmd = UVC_CMD_DUMP_COMPLETE,
+ .config_handle = kvm_s390_pv_get_handle(kvm),
+ };
+ u64 *compl_data;
+ int ret;
+
+ /* Allocate dump area */
+ compl_data = vzalloc(uv_info.conf_dump_finalize_len);
+ if (!compl_data)
+ return -ENOMEM;
+ complete.dump_area_origin = (u64)compl_data;
+
+ ret = uv_call_sched(0, (u64)&complete);
+ *rc = complete.header.rc;
+ *rrc = complete.header.rrc;
+ KVM_UV_EVENT(kvm, 3, "PROTVIRT DUMP COMPLETE: rc %x rrc %x",
+ complete.header.rc, complete.header.rrc);
+
+ if (!ret) {
+ /*
+ * kvm_s390_pv_dealloc_vm() will also (mem)set
+ * this to false on a reboot or other destroy
+ * operation for this vm.
+ */
+ kvm->arch.pv.dumping = false;
+ kvm_s390_vcpu_unblock_all(kvm);
+ ret = copy_to_user(buff_user, compl_data, uv_info.conf_dump_finalize_len);
+ if (ret)
+ ret = -EFAULT;
+ }
+ vfree(compl_data);
+ /* If the UVC returned an error, translate it to -EINVAL */
+ if (ret > 0)
+ ret = -EINVAL;
+ return ret;
+}
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index 683036c1c92a..d9696b530064 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -151,22 +151,10 @@ static int __sigp_stop_and_store_status(struct kvm_vcpu *vcpu,
static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter,
u64 *status_reg)
{
- unsigned int i;
- struct kvm_vcpu *v;
- bool all_stopped = true;
-
- kvm_for_each_vcpu(i, v, vcpu->kvm) {
- if (v == vcpu)
- continue;
- if (!is_vcpu_stopped(v))
- all_stopped = false;
- }
-
*status_reg &= 0xffffffff00000000UL;
/* Reject set arch order, with czam we're always in z/Arch mode. */
- *status_reg |= (all_stopped ? SIGP_STATUS_INVALID_PARAMETER :
- SIGP_STATUS_INCORRECT_STATE);
+ *status_reg |= SIGP_STATUS_INVALID_PARAMETER;
return SIGP_CC_STATUS_STORED;
}
@@ -288,6 +276,34 @@ static int handle_sigp_dst(struct kvm_vcpu *vcpu, u8 order_code,
if (!dst_vcpu)
return SIGP_CC_NOT_OPERATIONAL;
+ /*
+ * SIGP RESTART, SIGP STOP, and SIGP STOP AND STORE STATUS orders
+ * are processed asynchronously. Until the affected VCPU finishes
+ * its work and calls back into KVM to clear the (RESTART or STOP)
+ * interrupt, we need to return any new non-reset orders "busy".
+ *
+ * This is important because a single VCPU could issue:
+ * 1) SIGP STOP $DESTINATION
+ * 2) SIGP SENSE $DESTINATION
+ *
+ * If the SIGP SENSE would not be rejected as "busy", it could
+ * return an incorrect answer as to whether the VCPU is STOPPED
+ * or OPERATING.
+ */
+ if (order_code != SIGP_INITIAL_CPU_RESET &&
+ order_code != SIGP_CPU_RESET) {
+ /*
+ * Lockless check. Both SIGP STOP and SIGP (RE)START
+ * properly synchronize everything while processing
+ * their orders, while the guest cannot observe a
+ * difference when issuing other orders from two
+ * different VCPUs.
+ */
+ if (kvm_s390_is_stop_irq_pending(dst_vcpu) ||
+ kvm_s390_is_restart_irq_pending(dst_vcpu))
+ return SIGP_CC_BUSY;
+ }
+
switch (order_code) {
case SIGP_SENSE:
vcpu->stat.instruction_sigp_sense++;
@@ -453,7 +469,7 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
*
* This interception will occur at the source cpu when a source cpu sends an
* external call to a target cpu and the target cpu has the WAIT bit set in
- * its cpuflags. Interception will occurr after the interrupt indicator bits at
+ * its cpuflags. Interception will occur after the interrupt indicator bits at
* the target cpu have been set. All error cases will lead to instruction
* interception, therefore nothing is to be checked or prepared.
*/
@@ -464,9 +480,9 @@ int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu)
struct kvm_vcpu *dest_vcpu;
u8 order_code = kvm_s390_get_base_disp_rs(vcpu, NULL);
- trace_kvm_s390_handle_sigp_pei(vcpu, order_code, cpu_addr);
-
if (order_code == SIGP_EXTERNAL_CALL) {
+ trace_kvm_s390_handle_sigp_pei(vcpu, order_code, cpu_addr);
+
dest_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, cpu_addr);
BUG_ON(dest_vcpu == NULL);
diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h
index 6f0209d45164..9ac92dbf680d 100644
--- a/arch/s390/kvm/trace-s390.h
+++ b/arch/s390/kvm/trace-s390.h
@@ -333,6 +333,29 @@ TRACE_EVENT(kvm_s390_airq_suppressed,
__entry->id, __entry->isc)
);
+/*
+ * Trace point for gmap notifier calls.
+ */
+TRACE_EVENT(kvm_s390_gmap_notifier,
+ TP_PROTO(unsigned long start, unsigned long end, unsigned int shadow),
+ TP_ARGS(start, end, shadow),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, start)
+ __field(unsigned long, end)
+ __field(unsigned int, shadow)
+ ),
+
+ TP_fast_assign(
+ __entry->start = start;
+ __entry->end = end;
+ __entry->shadow = shadow;
+ ),
+
+ TP_printk("gmap notified (start:0x%lx end:0x%lx shadow:%d)",
+ __entry->start, __entry->end, __entry->shadow)
+ );
+
#endif /* _TRACE_KVMS390_H */
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 076090f9e666..fef42e2a80a2 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -18,6 +18,8 @@
#include <asm/sclp.h>
#include <asm/nmi.h>
#include <asm/dis.h>
+#include <asm/fpu/api.h>
+#include <asm/facility.h>
#include "kvm-s390.h"
#include "gaccess.h"
@@ -137,11 +139,15 @@ static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
}
/* Copy to APCB FORMAT1 from APCB FORMAT0 */
static int setup_apcb10(struct kvm_vcpu *vcpu, struct kvm_s390_apcb1 *apcb_s,
- unsigned long apcb_o, struct kvm_s390_apcb1 *apcb_h)
+ unsigned long crycb_gpa, struct kvm_s390_apcb1 *apcb_h)
{
struct kvm_s390_apcb0 tmp;
+ unsigned long apcb_gpa;
- if (read_guest_real(vcpu, apcb_o, &tmp, sizeof(struct kvm_s390_apcb0)))
+ apcb_gpa = crycb_gpa + offsetof(struct kvm_s390_crypto_cb, apcb0);
+
+ if (read_guest_real(vcpu, apcb_gpa, &tmp,
+ sizeof(struct kvm_s390_apcb0)))
return -EFAULT;
apcb_s->apm[0] = apcb_h->apm[0] & tmp.apm[0];
@@ -156,19 +162,24 @@ static int setup_apcb10(struct kvm_vcpu *vcpu, struct kvm_s390_apcb1 *apcb_s,
* setup_apcb00 - Copy to APCB FORMAT0 from APCB FORMAT0
* @vcpu: pointer to the virtual CPU
* @apcb_s: pointer to start of apcb in the shadow crycb
- * @apcb_o: pointer to start of original apcb in the guest2
+ * @crycb_gpa: guest physical address to start of original guest crycb
* @apcb_h: pointer to start of apcb in the guest1
*
* Returns 0 and -EFAULT on error reading guest apcb
*/
static int setup_apcb00(struct kvm_vcpu *vcpu, unsigned long *apcb_s,
- unsigned long apcb_o, unsigned long *apcb_h)
+ unsigned long crycb_gpa, unsigned long *apcb_h)
{
- if (read_guest_real(vcpu, apcb_o, apcb_s,
+ unsigned long apcb_gpa;
+
+ apcb_gpa = crycb_gpa + offsetof(struct kvm_s390_crypto_cb, apcb0);
+
+ if (read_guest_real(vcpu, apcb_gpa, apcb_s,
sizeof(struct kvm_s390_apcb0)))
return -EFAULT;
- bitmap_and(apcb_s, apcb_s, apcb_h, sizeof(struct kvm_s390_apcb0));
+ bitmap_and(apcb_s, apcb_s, apcb_h,
+ BITS_PER_BYTE * sizeof(struct kvm_s390_apcb0));
return 0;
}
@@ -177,20 +188,25 @@ static int setup_apcb00(struct kvm_vcpu *vcpu, unsigned long *apcb_s,
* setup_apcb11 - Copy the FORMAT1 APCB from the guest to the shadow CRYCB
* @vcpu: pointer to the virtual CPU
* @apcb_s: pointer to start of apcb in the shadow crycb
- * @apcb_o: pointer to start of original guest apcb
+ * @crycb_gpa: guest physical address to start of original guest crycb
* @apcb_h: pointer to start of apcb in the host
*
* Returns 0 and -EFAULT on error reading guest apcb
*/
static int setup_apcb11(struct kvm_vcpu *vcpu, unsigned long *apcb_s,
- unsigned long apcb_o,
+ unsigned long crycb_gpa,
unsigned long *apcb_h)
{
- if (read_guest_real(vcpu, apcb_o, apcb_s,
+ unsigned long apcb_gpa;
+
+ apcb_gpa = crycb_gpa + offsetof(struct kvm_s390_crypto_cb, apcb1);
+
+ if (read_guest_real(vcpu, apcb_gpa, apcb_s,
sizeof(struct kvm_s390_apcb1)))
return -EFAULT;
- bitmap_and(apcb_s, apcb_s, apcb_h, sizeof(struct kvm_s390_apcb1));
+ bitmap_and(apcb_s, apcb_s, apcb_h,
+ BITS_PER_BYTE * sizeof(struct kvm_s390_apcb1));
return 0;
}
@@ -199,7 +215,7 @@ static int setup_apcb11(struct kvm_vcpu *vcpu, unsigned long *apcb_s,
* setup_apcb - Create a shadow copy of the apcb.
* @vcpu: pointer to the virtual CPU
* @crycb_s: pointer to shadow crycb
- * @crycb_o: pointer to original guest crycb
+ * @crycb_gpa: guest physical address of original guest crycb
* @crycb_h: pointer to the host crycb
* @fmt_o: format of the original guest crycb.
* @fmt_h: format of the host crycb.
@@ -210,50 +226,46 @@ static int setup_apcb11(struct kvm_vcpu *vcpu, unsigned long *apcb_s,
* Return 0 or an error number if the guest and host crycb are incompatible.
*/
static int setup_apcb(struct kvm_vcpu *vcpu, struct kvm_s390_crypto_cb *crycb_s,
- const u32 crycb_o,
+ const u32 crycb_gpa,
struct kvm_s390_crypto_cb *crycb_h,
int fmt_o, int fmt_h)
{
- struct kvm_s390_crypto_cb *crycb;
-
- crycb = (struct kvm_s390_crypto_cb *) (unsigned long)crycb_o;
-
switch (fmt_o) {
case CRYCB_FORMAT2:
- if ((crycb_o & PAGE_MASK) != ((crycb_o + 256) & PAGE_MASK))
+ if ((crycb_gpa & PAGE_MASK) != ((crycb_gpa + 256) & PAGE_MASK))
return -EACCES;
if (fmt_h != CRYCB_FORMAT2)
return -EINVAL;
return setup_apcb11(vcpu, (unsigned long *)&crycb_s->apcb1,
- (unsigned long) &crycb->apcb1,
+ crycb_gpa,
(unsigned long *)&crycb_h->apcb1);
case CRYCB_FORMAT1:
switch (fmt_h) {
case CRYCB_FORMAT2:
return setup_apcb10(vcpu, &crycb_s->apcb1,
- (unsigned long) &crycb->apcb0,
+ crycb_gpa,
&crycb_h->apcb1);
case CRYCB_FORMAT1:
return setup_apcb00(vcpu,
(unsigned long *) &crycb_s->apcb0,
- (unsigned long) &crycb->apcb0,
+ crycb_gpa,
(unsigned long *) &crycb_h->apcb0);
}
break;
case CRYCB_FORMAT0:
- if ((crycb_o & PAGE_MASK) != ((crycb_o + 32) & PAGE_MASK))
+ if ((crycb_gpa & PAGE_MASK) != ((crycb_gpa + 32) & PAGE_MASK))
return -EACCES;
switch (fmt_h) {
case CRYCB_FORMAT2:
return setup_apcb10(vcpu, &crycb_s->apcb1,
- (unsigned long) &crycb->apcb0,
+ crycb_gpa,
&crycb_h->apcb1);
case CRYCB_FORMAT1:
case CRYCB_FORMAT0:
return setup_apcb00(vcpu,
(unsigned long *) &crycb_s->apcb0,
- (unsigned long) &crycb->apcb0,
+ crycb_gpa,
(unsigned long *) &crycb_h->apcb0);
}
}
@@ -416,11 +428,6 @@ static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
memcpy((void *)((u64)scb_o + 0xc0),
(void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0);
break;
- case ICPT_PARTEXEC:
- /* MVPG only */
- memcpy((void *)((u64)scb_o + 0xc0),
- (void *)((u64)scb_s + 0xc0), 0xd0 - 0xc0);
- break;
}
if (scb_s->ihcpu != 0xffffU)
@@ -498,7 +505,7 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
scb_s->mso = new_mso;
scb_s->prefix = new_prefix;
- /* We have to definetly flush the tlb if this scb never ran */
+ /* We have to definitely flush the tlb if this scb never ran */
if (scb_s->ihcpu != 0xffffU)
scb_s->ihcpu = scb_o->ihcpu;
@@ -507,6 +514,14 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
/* Host-protection-interruption introduced with ESOP */
if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP))
scb_s->ecb |= scb_o->ecb & ECB_HOSTPROTINT;
+ /*
+ * CPU Topology
+ * This facility only uses the utility field of the SCA and none of
+ * the cpu entries that are problematic with the other interpretation
+ * facilities so we can pass it through
+ */
+ if (test_kvm_facility(vcpu->kvm, 11))
+ scb_s->ecb |= scb_o->ecb & ECB_PTF;
/* transactional execution */
if (test_kvm_facility(vcpu->kvm, 73) && wants_tx) {
/* remap the prefix is tx is toggled on */
@@ -514,6 +529,8 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
prefix_unmapped(vsie_page);
scb_s->ecb |= ECB_TE;
}
+ /* specification exception interpretation */
+ scb_s->ecb |= scb_o->ecb & ECB_SPECI;
/* branch prediction */
if (test_kvm_facility(vcpu->kvm, 82))
scb_s->fpf |= scb_o->fpf & FPF_BPBC;
@@ -540,14 +557,17 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_CEI))
scb_s->eca |= scb_o->eca & ECA_CEI;
/* Epoch Extension */
- if (test_kvm_facility(vcpu->kvm, 139))
+ if (test_kvm_facility(vcpu->kvm, 139)) {
scb_s->ecd |= scb_o->ecd & ECD_MEF;
+ scb_s->epdx = scb_o->epdx;
+ }
/* etoken */
if (test_kvm_facility(vcpu->kvm, 156))
scb_s->ecd |= scb_o->ecd & ECD_ETOKENF;
scb_s->hpid = HPID_VSIE;
+ scb_s->cpnc = scb_o->cpnc;
prepare_ibc(vcpu, vsie_page);
rc = shadow_crycb(vcpu, vsie_page);
@@ -568,10 +588,6 @@ void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
if (!gmap_is_shadow(gmap))
return;
- if (start >= 1UL << 31)
- /* We are only interested in prefix pages */
- return;
-
/*
* Only new shadow blocks are added to the list during runtime,
* therefore we can safely reference them all the time.
@@ -618,10 +634,10 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
/* with mso/msl, the prefix lies at offset *mso* */
prefix += scb_s->mso;
- rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix);
+ rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix, NULL);
if (!rc && (scb_s->ecb & ECB_TE))
rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
- prefix + PAGE_SIZE);
+ prefix + PAGE_SIZE, NULL);
/*
* We don't have to mprotect, we will be called for all unshadows.
* SIE will detect if protection applies and trigger a validity.
@@ -647,7 +663,7 @@ static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa)
page = gfn_to_page(kvm, gpa_to_gfn(gpa));
if (is_error_page(page))
return -EINVAL;
- *hpa = (hpa_t) page_to_virt(page) + (gpa & ~PAGE_MASK);
+ *hpa = (hpa_t)page_to_phys(page) + (gpa & ~PAGE_MASK);
return 0;
}
@@ -862,7 +878,7 @@ static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
WARN_ON_ONCE(rc);
return 1;
}
- vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa;
+ vsie_page->scb_o = phys_to_virt(hpa);
return 0;
}
@@ -882,7 +898,7 @@ static int inject_fault(struct kvm_vcpu *vcpu, __u16 code, __u64 vaddr,
(vaddr & 0xfffffffffffff000UL) |
/* 52-53: store / fetch */
(((unsigned int) !write_flag) + 1) << 10,
- /* 62-63: asce id (alway primary == 0) */
+ /* 62-63: asce id (always primary == 0) */
.exc_access_id = 0, /* always primary */
.op_access_id = 0, /* not MVPG */
};
@@ -912,7 +928,7 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
current->thread.gmap_addr, 1);
rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
- current->thread.gmap_addr);
+ current->thread.gmap_addr, NULL);
if (rc > 0) {
rc = inject_fault(vcpu, rc,
current->thread.gmap_addr,
@@ -934,7 +950,7 @@ static void handle_last_fault(struct kvm_vcpu *vcpu,
{
if (vsie_page->fault_addr)
kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
- vsie_page->fault_addr);
+ vsie_page->fault_addr, NULL);
vsie_page->fault_addr = 0;
}
@@ -969,12 +985,26 @@ static void retry_vsie_icpt(struct vsie_page *vsie_page)
static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
{
struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
- __u32 fac = READ_ONCE(vsie_page->scb_o->fac) & 0x7ffffff8U;
+ __u32 fac = READ_ONCE(vsie_page->scb_o->fac);
+ /*
+ * Alternate-STFLE-Interpretive-Execution facilities are not supported
+ * -> format-0 flcb
+ */
if (fac && test_kvm_facility(vcpu->kvm, 7)) {
retry_vsie_icpt(vsie_page);
+ /*
+ * The facility list origin (FLO) is in bits 1 - 28 of the FLD
+ * so we need to mask here before reading.
+ */
+ fac = fac & 0x7ffffff8U;
+ /*
+ * format-0 -> size of nested guest's facility list == guest's size
+ * guest's size == host's size, since STFLE is interpretatively executed
+ * using a format-0 for the guest, too.
+ */
if (read_guest_real(vcpu, fac, &vsie_page->fac,
- sizeof(vsie_page->fac)))
+ stfle_size() * sizeof(u64)))
return set_validity_icpt(scb_s, 0x1090U);
scb_s->fac = (__u32)(__u64) &vsie_page->fac;
}
@@ -982,6 +1012,98 @@ static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
}
/*
+ * Get a register for a nested guest.
+ * @vcpu the vcpu of the guest
+ * @vsie_page the vsie_page for the nested guest
+ * @reg the register number, the upper 4 bits are ignored.
+ * returns: the value of the register.
+ */
+static u64 vsie_get_register(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, u8 reg)
+{
+ /* no need to validate the parameter and/or perform error handling */
+ reg &= 0xf;
+ switch (reg) {
+ case 15:
+ return vsie_page->scb_s.gg15;
+ case 14:
+ return vsie_page->scb_s.gg14;
+ default:
+ return vcpu->run->s.regs.gprs[reg];
+ }
+}
+
+static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ unsigned long pei_dest, pei_src, src, dest, mask, prefix;
+ u64 *pei_block = &vsie_page->scb_o->mcic;
+ int edat, rc_dest, rc_src;
+ union ctlreg0 cr0;
+
+ cr0.val = vcpu->arch.sie_block->gcr[0];
+ edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8);
+ mask = _kvm_s390_logical_to_effective(&scb_s->gpsw, PAGE_MASK);
+ prefix = scb_s->prefix << GUEST_PREFIX_SHIFT;
+
+ dest = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 20) & mask;
+ dest = _kvm_s390_real_to_abs(prefix, dest) + scb_s->mso;
+ src = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 16) & mask;
+ src = _kvm_s390_real_to_abs(prefix, src) + scb_s->mso;
+
+ rc_dest = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, dest, &pei_dest);
+ rc_src = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, src, &pei_src);
+ /*
+ * Either everything went well, or something non-critical went wrong
+ * e.g. because of a race. In either case, simply retry.
+ */
+ if (rc_dest == -EAGAIN || rc_src == -EAGAIN || (!rc_dest && !rc_src)) {
+ retry_vsie_icpt(vsie_page);
+ return -EAGAIN;
+ }
+ /* Something more serious went wrong, propagate the error */
+ if (rc_dest < 0)
+ return rc_dest;
+ if (rc_src < 0)
+ return rc_src;
+
+ /* The only possible suppressing exception: just deliver it */
+ if (rc_dest == PGM_TRANSLATION_SPEC || rc_src == PGM_TRANSLATION_SPEC) {
+ clear_vsie_icpt(vsie_page);
+ rc_dest = kvm_s390_inject_program_int(vcpu, PGM_TRANSLATION_SPEC);
+ WARN_ON_ONCE(rc_dest);
+ return 1;
+ }
+
+ /*
+ * Forward the PEI intercept to the guest if it was a page fault, or
+ * also for segment and region table faults if EDAT applies.
+ */
+ if (edat) {
+ rc_dest = rc_dest == PGM_ASCE_TYPE ? rc_dest : 0;
+ rc_src = rc_src == PGM_ASCE_TYPE ? rc_src : 0;
+ } else {
+ rc_dest = rc_dest != PGM_PAGE_TRANSLATION ? rc_dest : 0;
+ rc_src = rc_src != PGM_PAGE_TRANSLATION ? rc_src : 0;
+ }
+ if (!rc_dest && !rc_src) {
+ pei_block[0] = pei_dest;
+ pei_block[1] = pei_src;
+ return 1;
+ }
+
+ retry_vsie_icpt(vsie_page);
+
+ /*
+ * The host has edat, and the guest does not, or it was an ASCE type
+ * exception. The host needs to inject the appropriate DAT interrupts
+ * into the guest.
+ */
+ if (rc_dest)
+ return inject_fault(vcpu, rc_dest, dest, 1);
+ return inject_fault(vcpu, rc_src, src, 0);
+}
+
+/*
* Run the vsie on a shadow scb and a shadow gmap, without any further
* sanity checks, handling SIE faults.
*
@@ -1000,12 +1122,7 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
handle_last_fault(vcpu, vsie_page);
- if (need_resched())
- schedule();
- if (test_cpu_flag(CIF_MCCK_PENDING))
- s390_handle_mcck();
-
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+ kvm_vcpu_srcu_read_unlock(vcpu);
/* save current guest state of bp isolation override */
guest_bp_isolation = test_thread_flag(TIF_ISOLATE_BP_GUEST);
@@ -1032,6 +1149,8 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
*/
vcpu->arch.sie_block->prog0c |= PROG_IN_SIE;
barrier();
+ if (test_cpu_flag(CIF_FPU))
+ load_fpu_regs();
if (!kvm_s390_vcpu_sie_inhibited(vcpu))
rc = sie64a(scb_s, vcpu->run->s.regs.gprs);
barrier();
@@ -1045,7 +1164,7 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
if (!guest_bp_isolation)
clear_thread_flag(TIF_ISOLATE_BP_GUEST);
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_vcpu_srcu_read_lock(vcpu);
if (rc == -EINTR) {
VCPU_EVENT(vcpu, 3, "%s", "machine check");
@@ -1072,6 +1191,10 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
if ((scb_s->ipa & 0xf000) != 0xf000)
scb_s->ipa += 0x1000;
break;
+ case ICPT_PARTEXEC:
+ if (scb_s->ipa == 0xb254)
+ rc = vsie_handle_mvpg(vcpu, vsie_page);
+ break;
}
return rc;
}
@@ -1102,8 +1225,10 @@ static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
* we're holding has been unshadowed. If the gmap is still valid,
* we can safely reuse it.
*/
- if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat))
+ if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat)) {
+ vcpu->kvm->stat.gmap_shadow_reuse++;
return 0;
+ }
/* release the old shadow - if any, and mark the prefix as unmapped */
release_gmap_shadow(vsie_page);
@@ -1111,6 +1236,7 @@ static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
if (IS_ERR(gmap))
return PTR_ERR(gmap);
gmap->private = vcpu->kvm;
+ vcpu->kvm->stat.gmap_shadow_create++;
WRITE_ONCE(vsie_page->gmap, gmap);
return 0;
}
@@ -1185,6 +1311,7 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
kvm_s390_vcpu_has_irq(vcpu, 0) ||
kvm_s390_vcpu_sie_inhibited(vcpu))
break;
+ cond_resched();
}
if (rc == -EFAULT) {
@@ -1202,6 +1329,7 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
scb_s->iprcc = PGM_ADDRESSING;
scb_s->pgmilc = 4;
scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, 4);
+ rc = 1;
}
return rc;
}
@@ -1236,7 +1364,7 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr)
mutex_lock(&kvm->arch.vsie.mutex);
if (kvm->arch.vsie.page_count < nr_vcpus) {
- page = alloc_page(GFP_KERNEL | __GFP_ZERO | GFP_DMA);
+ page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO | GFP_DMA);
if (!page) {
mutex_unlock(&kvm->arch.vsie.mutex);
return ERR_PTR(-ENOMEM);
@@ -1338,7 +1466,7 @@ out_put:
void kvm_s390_vsie_init(struct kvm *kvm)
{
mutex_init(&kvm->arch.vsie.mutex);
- INIT_RADIX_TREE(&kvm->arch.vsie.addr_to_page, GFP_KERNEL);
+ INIT_RADIX_TREE(&kvm->arch.vsie.addr_to_page, GFP_KERNEL_ACCOUNT);
}
/* Destroy the vsie data structures. To be called when a vm is destroyed. */
diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile
index 28fd66d558ff..7c50eca85ca4 100644
--- a/arch/s390/lib/Makefile
+++ b/arch/s390/lib/Makefile
@@ -3,10 +3,12 @@
# Makefile for s390-specific library files..
#
-lib-y += delay.o string.o uaccess.o find.o spinlock.o
+lib-y += delay.o string.o uaccess.o find.o spinlock.o tishift.o
obj-y += mem.o xor.o
lib-$(CONFIG_KPROBES) += probes.o
lib-$(CONFIG_UPROBES) += probes.o
+obj-$(CONFIG_S390_KPROBES_SANITY_TEST) += test_kprobes_s390.o
+test_kprobes_s390-objs += test_kprobes_asm.o test_kprobes.o
# Instrumenting memory accesses to __user data (in different address space)
# produce false positives
@@ -14,3 +16,10 @@ KASAN_SANITIZE_uaccess.o := n
obj-$(CONFIG_S390_UNWIND_SELFTEST) += test_unwind.o
CFLAGS_test_unwind.o += -fno-optimize-sibling-calls
+
+obj-$(CONFIG_S390_MODULES_SANITY_TEST) += test_modules.o
+obj-$(CONFIG_S390_MODULES_SANITY_TEST_HELPERS) += test_modules_helpers.o
+
+lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
+
+obj-$(CONFIG_EXPOLINE_EXTERN) += expoline/
diff --git a/arch/s390/lib/delay.c b/arch/s390/lib/delay.c
index d4aa10795605..be14c58cb989 100644
--- a/arch/s390/lib/delay.c
+++ b/arch/s390/lib/delay.c
@@ -4,126 +4,42 @@
*
* Copyright IBM Corp. 1999, 2008
* Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>,
- * Heiko Carstens <heiko.carstens@de.ibm.com>,
*/
-#include <linux/sched.h>
+#include <linux/processor.h>
#include <linux/delay.h>
-#include <linux/timex.h>
-#include <linux/export.h>
-#include <linux/irqflags.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <asm/vtimer.h>
#include <asm/div64.h>
-#include <asm/idle.h>
+#include <asm/timex.h>
void __delay(unsigned long loops)
{
- /*
- * To end the bloody studid and useless discussion about the
- * BogoMips number I took the liberty to define the __delay
- * function in a way that that resulting BogoMips number will
- * yield the megahertz number of the cpu. The important function
- * is udelay and that is done using the tod clock. -- martin.
- */
+ /*
+ * Loop 'loops' times. Callers must not assume a specific
+ * amount of time passes before this function returns.
+ */
asm volatile("0: brct %0,0b" : : "d" ((loops/2) + 1));
}
EXPORT_SYMBOL(__delay);
-static void __udelay_disabled(unsigned long long usecs)
+static void delay_loop(unsigned long delta)
{
- unsigned long cr0, cr0_new, psw_mask;
- struct s390_idle_data idle;
- u64 end;
+ unsigned long end;
- end = get_tod_clock() + (usecs << 12);
- __ctl_store(cr0, 0, 0);
- cr0_new = cr0 & ~CR0_IRQ_SUBCLASS_MASK;
- cr0_new |= (1UL << (63 - 52)); /* enable clock comparator irq */
- __ctl_load(cr0_new, 0, 0);
- psw_mask = __extract_psw() | PSW_MASK_EXT | PSW_MASK_WAIT;
- set_clock_comparator(end);
- set_cpu_flag(CIF_IGNORE_IRQ);
- psw_idle(&idle, psw_mask);
- clear_cpu_flag(CIF_IGNORE_IRQ);
- set_clock_comparator(S390_lowcore.clock_comparator);
- __ctl_load(cr0, 0, 0);
-}
-
-static void __udelay_enabled(unsigned long long usecs)
-{
- u64 clock_saved, end;
-
- end = get_tod_clock_fast() + (usecs << 12);
- do {
- clock_saved = 0;
- if (tod_after(S390_lowcore.clock_comparator, end)) {
- clock_saved = local_tick_disable();
- set_clock_comparator(end);
- }
- enabled_wait();
- if (clock_saved)
- local_tick_enable(clock_saved);
- } while (get_tod_clock_fast() < end);
+ end = get_tod_clock_monotonic() + delta;
+ while (!tod_after(get_tod_clock_monotonic(), end))
+ cpu_relax();
}
-/*
- * Waits for 'usecs' microseconds using the TOD clock comparator.
- */
-void __udelay(unsigned long long usecs)
+void __udelay(unsigned long usecs)
{
- unsigned long flags;
-
- preempt_disable();
- local_irq_save(flags);
- if (in_irq()) {
- __udelay_disabled(usecs);
- goto out;
- }
- if (in_softirq()) {
- if (raw_irqs_disabled_flags(flags))
- __udelay_disabled(usecs);
- else
- __udelay_enabled(usecs);
- goto out;
- }
- if (raw_irqs_disabled_flags(flags)) {
- local_bh_disable();
- __udelay_disabled(usecs);
- _local_bh_enable();
- goto out;
- }
- __udelay_enabled(usecs);
-out:
- local_irq_restore(flags);
- preempt_enable();
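+ /* One microsecond corresponds to 4096 (1 << 12) TOD clock units */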
+ delay_loop(usecs << 12);
}
EXPORT_SYMBOL(__udelay);
-/*
- * Simple udelay variant. To be used on startup and reboot
- * when the interrupt handler isn't working.
- */
-void udelay_simple(unsigned long long usecs)
-{
- u64 end;
-
- end = get_tod_clock_fast() + (usecs << 12);
- while (get_tod_clock_fast() < end)
- cpu_relax();
-}
-
-void __ndelay(unsigned long long nsecs)
+void __ndelay(unsigned long nsecs)
{
- u64 end;
-
nsecs <<= 9;
do_div(nsecs, 125);
- end = get_tod_clock_fast() + nsecs;
- if (nsecs & ~0xfffUL)
- __udelay(nsecs >> 12);
- while (get_tod_clock_fast() < end)
- barrier();
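+ /* nsecs has been scaled above to TOD clock units (nsecs * 512 / 125 == nsecs * 4.096) */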
+ delay_loop(nsecs);
}
EXPORT_SYMBOL(__ndelay);
diff --git a/arch/s390/lib/error-inject.c b/arch/s390/lib/error-inject.c
new file mode 100644
index 000000000000..8c9d4da87eef
--- /dev/null
+++ b/arch/s390/lib/error-inject.c
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0+
+#include <asm/ptrace.h>
+#include <linux/error-injection.h>
+#include <linux/kprobes.h>
+
+void override_function_with_return(struct pt_regs *regs)
+{
+ /*
+ * Emulate 'br 14'. 'regs' is captured by kprobes on entry to some
+ * kernel function.
+ */
+ regs->psw.addr = regs->gprs[14];
+}
+NOKPROBE_SYMBOL(override_function_with_return);
diff --git a/arch/s390/lib/expoline/Makefile b/arch/s390/lib/expoline/Makefile
new file mode 100644
index 000000000000..854631d9cb03
--- /dev/null
+++ b/arch/s390/lib/expoline/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-y += expoline.o
diff --git a/arch/s390/lib/expoline/expoline.S b/arch/s390/lib/expoline/expoline.S
new file mode 100644
index 000000000000..92ed8409a7a4
--- /dev/null
+++ b/arch/s390/lib/expoline/expoline.S
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <asm/nospec-insn.h>
+#include <linux/linkage.h>
+
+.macro GEN_ALL_BR_THUNK_EXTERN
+ .irp r1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+ GEN_BR_THUNK_EXTERN %r\r1
+ .endr
+.endm
+
+GEN_ALL_BR_THUNK_EXTERN
diff --git a/arch/s390/lib/mem.S b/arch/s390/lib/mem.S
index dc0874f2e203..08f60a42b9a6 100644
--- a/arch/s390/lib/mem.S
+++ b/arch/s390/lib/mem.S
@@ -5,8 +5,8 @@
* Copyright IBM Corp. 2012
*/
+#include <linux/export.h>
#include <linux/linkage.h>
-#include <asm/export.h>
#include <asm/nospec-insn.h>
GEN_BR_THUNK %r14
@@ -14,8 +14,7 @@
/*
* void *memmove(void *dest, const void *src, size_t n)
*/
-WEAK(memmove)
-ENTRY(__memmove)
+SYM_FUNC_START(__memmove)
ltgr %r4,%r4
lgr %r1,%r2
jz .Lmemmove_exit
@@ -48,7 +47,10 @@ ENTRY(__memmove)
BR_EX %r14
.Lmemmove_mvc:
mvc 0(1,%r1),0(%r3)
-ENDPROC(__memmove)
+SYM_FUNC_END(__memmove)
+EXPORT_SYMBOL(__memmove)
+
+SYM_FUNC_ALIAS(memmove, __memmove)
EXPORT_SYMBOL(memmove)
/*
@@ -66,8 +68,7 @@ EXPORT_SYMBOL(memmove)
* return __builtin_memset(s, c, n);
* }
*/
-WEAK(memset)
-ENTRY(__memset)
+SYM_FUNC_START(__memset)
ltgr %r4,%r4
jz .Lmemset_exit
ltgr %r3,%r3
@@ -111,7 +112,10 @@ ENTRY(__memset)
xc 0(1,%r1),0(%r1)
.Lmemset_mvc:
mvc 1(1,%r1),0(%r1)
-ENDPROC(__memset)
+SYM_FUNC_END(__memset)
+EXPORT_SYMBOL(__memset)
+
+SYM_FUNC_ALIAS(memset, __memset)
EXPORT_SYMBOL(memset)
/*
@@ -119,8 +123,7 @@ EXPORT_SYMBOL(memset)
*
* void *memcpy(void *dest, const void *src, size_t n)
*/
-WEAK(memcpy)
-ENTRY(__memcpy)
+SYM_FUNC_START(__memcpy)
ltgr %r4,%r4
jz .Lmemcpy_exit
aghi %r4,-1
@@ -141,7 +144,10 @@ ENTRY(__memcpy)
j .Lmemcpy_remainder
.Lmemcpy_mvc:
mvc 0(1,%r1),0(%r3)
-ENDPROC(__memcpy)
+SYM_FUNC_END(__memcpy)
+EXPORT_SYMBOL(__memcpy)
+
+SYM_FUNC_ALIAS(memcpy, __memcpy)
EXPORT_SYMBOL(memcpy)
/*
@@ -152,7 +158,7 @@ EXPORT_SYMBOL(memcpy)
* void *__memset64(uint64_t *s, uint64_t v, size_t count)
*/
.macro __MEMSET bits,bytes,insn
-ENTRY(__memset\bits)
+SYM_FUNC_START(__memset\bits)
ltgr %r4,%r4
jz .L__memset_exit\bits
cghi %r4,\bytes
@@ -178,7 +184,7 @@ ENTRY(__memset\bits)
BR_EX %r14
.L__memset_mvc\bits:
mvc \bytes(1,%r1),0(%r1)
-ENDPROC(__memset\bits)
+SYM_FUNC_END(__memset\bits)
.endm
__MEMSET 16,2,sth
diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c
index ce1e4bbe53aa..81c53440b3e6 100644
--- a/arch/s390/lib/spinlock.c
+++ b/arch/s390/lib/spinlock.c
@@ -13,8 +13,8 @@
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/percpu.h>
+#include <linux/io.h>
#include <asm/alternative.h>
-#include <asm/io.h>
int spin_retry = -1;
@@ -26,7 +26,7 @@ static int __init spin_retry_init(void)
}
early_initcall(spin_retry_init);
-/**
+/*
* spin_retry= parameter
*/
static int __init spin_retry_setup(char *str)
@@ -75,7 +75,7 @@ static inline int arch_load_niai4(int *lock)
int owner;
asm_inline volatile(
- ALTERNATIVE("", ".long 0xb2fa0040", 49) /* NIAI 4 */
+ ALTERNATIVE("nop", ".insn rre,0xb2fa0000,4,0", 49) /* NIAI 4 */
" l %0,%1\n"
: "=d" (owner) : "Q" (*lock) : "memory");
return owner;
@@ -86,7 +86,7 @@ static inline int arch_cmpxchg_niai8(int *lock, int old, int new)
int expected = old;
asm_inline volatile(
- ALTERNATIVE("", ".long 0xb2fa0080", 49) /* NIAI 8 */
+ ALTERNATIVE("nop", ".insn rre,0xb2fa0000,8,0", 49) /* NIAI 8 */
" cs %0,%3,%1\n"
: "=d" (old), "=Q" (*lock)
: "0" (old), "d" (new), "Q" (*lock)
@@ -242,7 +242,6 @@ static inline void arch_spin_lock_classic(arch_spinlock_t *lp)
void arch_spin_lock_wait(arch_spinlock_t *lp)
{
- /* Use classic spinlocks + niai if the steal time is >= 10% */
if (test_cpu_flag(CIF_DEDICATED_CPU))
arch_spin_lock_queued(lp);
else
diff --git a/arch/s390/lib/string.c b/arch/s390/lib/string.c
index 0e30e6e43b0c..7d8741818239 100644
--- a/arch/s390/lib/string.c
+++ b/arch/s390/lib/string.c
@@ -8,6 +8,9 @@
*/
#define IN_ARCH_STRING_C 1
+#ifndef __NO_FORTIFY
+# define __NO_FORTIFY
+#endif
#include <linux/types.h>
#include <linux/string.h>
@@ -18,23 +21,30 @@
*/
static inline char *__strend(const char *s)
{
- register unsigned long r0 asm("0") = 0;
-
- asm volatile ("0: srst %0,%1\n"
- " jo 0b"
- : "+d" (r0), "+a" (s) : : "cc", "memory");
- return (char *) r0;
+ unsigned long e = 0;
+
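+ /* SRST scans for the character in %r0; 0 means search for the NUL terminator */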
+ asm volatile(
+ " lghi 0,0\n"
+ "0: srst %[e],%[s]\n"
+ " jo 0b\n"
+ : [e] "+&a" (e), [s] "+&a" (s)
+ :
+ : "cc", "memory", "0");
+ return (char *)e;
}
static inline char *__strnend(const char *s, size_t n)
{
- register unsigned long r0 asm("0") = 0;
const char *p = s + n;
- asm volatile ("0: srst %0,%1\n"
- " jo 0b"
- : "+d" (p), "+a" (s) : "d" (r0) : "cc", "memory");
- return (char *) p;
+ asm volatile(
+ " lghi 0,0\n"
+ "0: srst %[p],%[s]\n"
+ " jo 0b\n"
+ : [p] "+&d" (p), [s] "+&a" (s)
+ :
+ : "cc", "memory", "0");
+ return (char *)p;
}
/**
@@ -76,45 +86,21 @@ EXPORT_SYMBOL(strnlen);
#ifdef __HAVE_ARCH_STRCPY
char *strcpy(char *dest, const char *src)
{
- register int r0 asm("0") = 0;
char *ret = dest;
- asm volatile ("0: mvst %0,%1\n"
- " jo 0b"
- : "+&a" (dest), "+&a" (src) : "d" (r0)
- : "cc", "memory" );
+ asm volatile(
+ " lghi 0,0\n"
+ "0: mvst %[dest],%[src]\n"
+ " jo 0b\n"
+ : [dest] "+&a" (dest), [src] "+&a" (src)
+ :
+ : "cc", "memory", "0");
return ret;
}
EXPORT_SYMBOL(strcpy);
#endif
/**
- * strlcpy - Copy a %NUL terminated string into a sized buffer
- * @dest: Where to copy the string to
- * @src: Where to copy the string from
- * @size: size of destination buffer
- *
- * Compatible with *BSD: the result is always a valid
- * NUL-terminated string that fits in the buffer (unless,
- * of course, the buffer size is zero). It does not pad
- * out the result like strncpy() does.
- */
-#ifdef __HAVE_ARCH_STRLCPY
-size_t strlcpy(char *dest, const char *src, size_t size)
-{
- size_t ret = __strend(src) - src;
-
- if (size) {
- size_t len = (ret >= size) ? size-1 : ret;
- dest[len] = '\0';
- memcpy(dest, src, len);
- }
- return ret;
-}
-EXPORT_SYMBOL(strlcpy);
-#endif
-
-/**
* strncpy - Copy a length-limited, %NUL-terminated string
* @dest: Where to copy the string to
* @src: Where to copy the string from
@@ -144,16 +130,18 @@ EXPORT_SYMBOL(strncpy);
#ifdef __HAVE_ARCH_STRCAT
char *strcat(char *dest, const char *src)
{
- register int r0 asm("0") = 0;
- unsigned long dummy;
+ unsigned long dummy = 0;
char *ret = dest;
- asm volatile ("0: srst %0,%1\n"
- " jo 0b\n"
- "1: mvst %0,%2\n"
- " jo 1b"
- : "=&a" (dummy), "+a" (dest), "+a" (src)
- : "d" (r0), "0" (0UL) : "cc", "memory" );
+ asm volatile(
+ " lghi 0,0\n"
+ "0: srst %[dummy],%[dest]\n"
+ " jo 0b\n"
+ "1: mvst %[dummy],%[src]\n"
+ " jo 1b\n"
+ : [dummy] "+&a" (dummy), [dest] "+&a" (dest), [src] "+&a" (src)
+ :
+ : "cc", "memory", "0");
return ret;
}
EXPORT_SYMBOL(strcat);
@@ -221,58 +209,40 @@ EXPORT_SYMBOL(strncat);
#ifdef __HAVE_ARCH_STRCMP
int strcmp(const char *s1, const char *s2)
{
- register int r0 asm("0") = 0;
int ret = 0;
- asm volatile ("0: clst %2,%3\n"
- " jo 0b\n"
- " je 1f\n"
- " ic %0,0(%2)\n"
- " ic %1,0(%3)\n"
- " sr %0,%1\n"
- "1:"
- : "+d" (ret), "+d" (r0), "+a" (s1), "+a" (s2)
- : : "cc", "memory");
+ asm volatile(
+ " lghi 0,0\n"
+ "0: clst %[s1],%[s2]\n"
+ " jo 0b\n"
+ " je 1f\n"
+ " ic %[ret],0(%[s1])\n"
+ " ic 0,0(%[s2])\n"
+ " sr %[ret],0\n"
+ "1:"
+ : [ret] "+&d" (ret), [s1] "+&a" (s1), [s2] "+&a" (s2)
+ :
+ : "cc", "memory", "0");
return ret;
}
EXPORT_SYMBOL(strcmp);
#endif
-/**
- * strrchr - Find the last occurrence of a character in a string
- * @s: The string to be searched
- * @c: The character to search for
- */
-#ifdef __HAVE_ARCH_STRRCHR
-char *strrchr(const char *s, int c)
-{
- size_t len = __strend(s) - s;
-
- if (len)
- do {
- if (s[len] == (char) c)
- return (char *) s + len;
- } while (--len > 0);
- return NULL;
-}
-EXPORT_SYMBOL(strrchr);
-#endif
-
static inline int clcle(const char *s1, unsigned long l1,
const char *s2, unsigned long l2)
{
- register unsigned long r2 asm("2") = (unsigned long) s1;
- register unsigned long r3 asm("3") = (unsigned long) l1;
- register unsigned long r4 asm("4") = (unsigned long) s2;
- register unsigned long r5 asm("5") = (unsigned long) l2;
+ union register_pair r1 = { .even = (unsigned long)s1, .odd = l1, };
+ union register_pair r3 = { .even = (unsigned long)s2, .odd = l2, };
int cc;
- asm volatile ("0: clcle %1,%3,0\n"
- " jo 0b\n"
- " ipm %0\n"
- " srl %0,28"
- : "=&d" (cc), "+a" (r2), "+a" (r3),
- "+a" (r4), "+a" (r5) : : "cc", "memory");
+ asm volatile(
+ "0: clcle %[r1],%[r3],0\n"
+ " jo 0b\n"
+ " ipm %[cc]\n"
+ " srl %[cc],28\n"
+ : [cc] "=&d" (cc), [r1] "+&d" (r1.pair), [r3] "+&d" (r3.pair)
+ :
+ : "cc", "memory");
return cc;
}
@@ -315,15 +285,18 @@ EXPORT_SYMBOL(strstr);
#ifdef __HAVE_ARCH_MEMCHR
void *memchr(const void *s, int c, size_t n)
{
- register int r0 asm("0") = (char) c;
const void *ret = s + n;
- asm volatile ("0: srst %0,%1\n"
- " jo 0b\n"
- " jl 1f\n"
- " la %0,0\n"
- "1:"
- : "+a" (ret), "+&a" (s) : "d" (r0) : "cc", "memory");
+ asm volatile(
+ " lgr 0,%[c]\n"
+ "0: srst %[ret],%[s]\n"
+ " jo 0b\n"
+ " jl 1f\n"
+ " la %[ret],0\n"
+ "1:"
+ : [ret] "+&a" (ret), [s] "+&a" (s)
+ : [c] "d" (c)
+ : "cc", "memory", "0");
return (void *) ret;
}
EXPORT_SYMBOL(memchr);
@@ -333,7 +306,7 @@ EXPORT_SYMBOL(memchr);
* memcmp - Compare two areas of memory
* @s1: One area of memory
* @s2: Another area of memory
- * @count: The size of the area.
+ * @n: The size of the area.
*/
#ifdef __HAVE_ARCH_MEMCMP
int memcmp(const void *s1, const void *s2, size_t n)
@@ -360,13 +333,16 @@ EXPORT_SYMBOL(memcmp);
#ifdef __HAVE_ARCH_MEMSCAN
void *memscan(void *s, int c, size_t n)
{
- register int r0 asm("0") = (char) c;
const void *ret = s + n;
- asm volatile ("0: srst %0,%1\n"
- " jo 0b\n"
- : "+a" (ret), "+&a" (s) : "d" (r0) : "cc", "memory");
- return (void *) ret;
+ asm volatile(
+ " lgr 0,%[c]\n"
+ "0: srst %[ret],%[s]\n"
+ " jo 0b\n"
+ : [ret] "+&a" (ret), [s] "+&a" (s)
+ : [c] "d" (c)
+ : "cc", "memory", "0");
+ return (void *)ret;
}
EXPORT_SYMBOL(memscan);
#endif
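Both srst-based scanners above search the first n bytes for (unsigned char)c; the only difference is how a miss is reported: memchr() returns NULL, memscan() returns the address one past the searched area. A hedged C reference of the memscan() semantics (not part of the patch):

#include <stddef.h>

/* Reference semantics of memscan(): pointer to the match, or s + n. */
static void *memscan_sketch(void *s, int c, size_t n)
{
	unsigned char *p = s;

	while (n-- && *p != (unsigned char)c)
		p++;
	return p;
}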
diff --git a/arch/s390/lib/test_kprobes.c b/arch/s390/lib/test_kprobes.c
new file mode 100644
index 000000000000..9e62d62812e5
--- /dev/null
+++ b/arch/s390/lib/test_kprobes.c
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <linux/random.h>
+#include <kunit/test.h>
+#include "test_kprobes.h"
+
+static struct kprobe kp;
+
+static void setup_kprobe(struct kunit *test, struct kprobe *kp,
+ const char *symbol, int offset)
+{
+ kp->offset = offset;
+ kp->addr = NULL;
+ kp->symbol_name = symbol;
+}
+
+static void test_kprobe_offset(struct kunit *test, struct kprobe *kp,
+ const char *target, int offset)
+{
+ int ret;
+
+ setup_kprobe(test, kp, target, 0);
+ ret = register_kprobe(kp);
+ if (!ret)
+ unregister_kprobe(kp);
+ KUNIT_EXPECT_EQ(test, 0, ret);
+ setup_kprobe(test, kp, target, offset);
+ ret = register_kprobe(kp);
+ KUNIT_EXPECT_EQ(test, -EINVAL, ret);
+ if (!ret)
+ unregister_kprobe(kp);
+}
+
+static void test_kprobe_odd(struct kunit *test)
+{
+ test_kprobe_offset(test, &kp, "kprobes_target_odd",
+ kprobes_target_odd_offs);
+}
+
+static void test_kprobe_in_insn4(struct kunit *test)
+{
+ test_kprobe_offset(test, &kp, "kprobes_target_in_insn4",
+ kprobes_target_in_insn4_offs);
+}
+
+static void test_kprobe_in_insn6_lo(struct kunit *test)
+{
+ test_kprobe_offset(test, &kp, "kprobes_target_in_insn6_lo",
+ kprobes_target_in_insn6_lo_offs);
+}
+
+static void test_kprobe_in_insn6_hi(struct kunit *test)
+{
+ test_kprobe_offset(test, &kp, "kprobes_target_in_insn6_hi",
+ kprobes_target_in_insn6_hi_offs);
+}
+
+static struct kunit_case kprobes_testcases[] = {
+ KUNIT_CASE(test_kprobe_odd),
+ KUNIT_CASE(test_kprobe_in_insn4),
+ KUNIT_CASE(test_kprobe_in_insn6_lo),
+ KUNIT_CASE(test_kprobe_in_insn6_hi),
+ {}
+};
+
+static struct kunit_suite kprobes_test_suite = {
+ .name = "kprobes_test_s390",
+ .test_cases = kprobes_testcases,
+};
+
+kunit_test_suites(&kprobes_test_suite);
+
+MODULE_LICENSE("GPL");
diff --git a/arch/s390/lib/test_kprobes.h b/arch/s390/lib/test_kprobes.h
new file mode 100644
index 000000000000..2b4c9bc337f1
--- /dev/null
+++ b/arch/s390/lib/test_kprobes.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+#ifndef TEST_KPROBES_H
+#define TEST_KPROBES_H
+
+extern unsigned long kprobes_target_odd_offs;
+extern unsigned long kprobes_target_in_insn4_offs;
+extern unsigned long kprobes_target_in_insn6_lo_offs;
+extern unsigned long kprobes_target_in_insn6_hi_offs;
+
+#endif
diff --git a/arch/s390/lib/test_kprobes_asm.S b/arch/s390/lib/test_kprobes_asm.S
new file mode 100644
index 000000000000..ade7a3042334
--- /dev/null
+++ b/arch/s390/lib/test_kprobes_asm.S
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+#include <linux/linkage.h>
+#include <asm/ftrace.h>
+
+#define KPROBES_TARGET_START(name) \
+ SYM_FUNC_START(name); \
+ FTRACE_GEN_NOP_ASM(name)
+
+#define KPROBES_TARGET_END(name) \
+ SYM_FUNC_END(name); \
+ SYM_DATA(name##_offs, .quad 1b - name)
+
+KPROBES_TARGET_START(kprobes_target_in_insn4)
+ .word 0x4700 // bc 0,0
+1: .word 0x0000
+ br %r14
+KPROBES_TARGET_END(kprobes_target_in_insn4)
+
+KPROBES_TARGET_START(kprobes_target_in_insn6_lo)
+ .word 0xe310 // ly 1,0
+1: .word 0x0000
+ .word 0x0058
+ br %r14
+KPROBES_TARGET_END(kprobes_target_in_insn6_lo)
+
+KPROBES_TARGET_START(kprobes_target_in_insn6_hi)
+ .word 0xe310 // ly 1,0
+ .word 0x0000
+1: .word 0x0058
+ br %r14
+KPROBES_TARGET_END(kprobes_target_in_insn6_hi)
+
+KPROBES_TARGET_START(kprobes_target_bp)
+ nop
+ .word 0x0000
+ nop
+1: br %r14
+KPROBES_TARGET_END(kprobes_target_bp)
+
+KPROBES_TARGET_START(kprobes_target_odd)
+ .byte 0x07
+1: .byte 0x07
+ br %r14
+KPROBES_TARGET_END(kprobes_target_odd)
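The "1:" label in each target above marks a byte offset that is not a valid instruction boundary, and the SYM_DATA() emitted by KPROBES_TARGET_END() records that offset for the KUnit test earlier in this patch, which expects register_kprobe() to return -EINVAL there while offset 0 is accepted. A hedged summary of the recorded offsets, assuming the instruction encodings shown in the comments:

/* Sketch only: offsets of label "1:" within each probe target. */
static const struct {
	const char *target;
	unsigned long offs;
} kprobe_offs_sketch[] = {
	{ "kprobes_target_odd",         1 },	/* odd address            */
	{ "kprobes_target_in_insn4",    2 },	/* inside the 4-byte bc   */
	{ "kprobes_target_in_insn6_lo", 2 },	/* inside the 6-byte ly   */
	{ "kprobes_target_in_insn6_hi", 4 },	/* inside the 6-byte ly   */
};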
diff --git a/arch/s390/lib/test_modules.c b/arch/s390/lib/test_modules.c
new file mode 100644
index 000000000000..9894009fc1f2
--- /dev/null
+++ b/arch/s390/lib/test_modules.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+#include <kunit/test.h>
+#include <linux/module.h>
+
+#include "test_modules.h"
+
+/*
+ * Test that modules with many relocations are loaded properly.
+ */
+static void test_modules_many_vmlinux_relocs(struct kunit *test)
+{
+ int result = 0;
+
+#define CALL_RETURN(i) result += test_modules_return_ ## i()
+ REPEAT_10000(CALL_RETURN);
+ KUNIT_ASSERT_EQ(test, result, 49995000);
+}
+
+static struct kunit_case modules_testcases[] = {
+ KUNIT_CASE(test_modules_many_vmlinux_relocs),
+ {}
+};
+
+static struct kunit_suite modules_test_suite = {
+ .name = "modules_test_s390",
+ .test_cases = modules_testcases,
+};
+
+kunit_test_suites(&modules_test_suite);
+
+MODULE_LICENSE("GPL");
diff --git a/arch/s390/lib/test_modules.h b/arch/s390/lib/test_modules.h
new file mode 100644
index 000000000000..6371fcf17684
--- /dev/null
+++ b/arch/s390/lib/test_modules.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+#ifndef TEST_MODULES_H
+#define TEST_MODULES_H
+
+#define __REPEAT_10000_3(f, x) \
+ f(x ## 0); \
+ f(x ## 1); \
+ f(x ## 2); \
+ f(x ## 3); \
+ f(x ## 4); \
+ f(x ## 5); \
+ f(x ## 6); \
+ f(x ## 7); \
+ f(x ## 8); \
+ f(x ## 9)
+#define __REPEAT_10000_2(f, x) \
+ __REPEAT_10000_3(f, x ## 0); \
+ __REPEAT_10000_3(f, x ## 1); \
+ __REPEAT_10000_3(f, x ## 2); \
+ __REPEAT_10000_3(f, x ## 3); \
+ __REPEAT_10000_3(f, x ## 4); \
+ __REPEAT_10000_3(f, x ## 5); \
+ __REPEAT_10000_3(f, x ## 6); \
+ __REPEAT_10000_3(f, x ## 7); \
+ __REPEAT_10000_3(f, x ## 8); \
+ __REPEAT_10000_3(f, x ## 9)
+#define __REPEAT_10000_1(f, x) \
+ __REPEAT_10000_2(f, x ## 0); \
+ __REPEAT_10000_2(f, x ## 1); \
+ __REPEAT_10000_2(f, x ## 2); \
+ __REPEAT_10000_2(f, x ## 3); \
+ __REPEAT_10000_2(f, x ## 4); \
+ __REPEAT_10000_2(f, x ## 5); \
+ __REPEAT_10000_2(f, x ## 6); \
+ __REPEAT_10000_2(f, x ## 7); \
+ __REPEAT_10000_2(f, x ## 8); \
+ __REPEAT_10000_2(f, x ## 9)
+#define REPEAT_10000(f) \
+ __REPEAT_10000_1(f, 0); \
+ __REPEAT_10000_1(f, 1); \
+ __REPEAT_10000_1(f, 2); \
+ __REPEAT_10000_1(f, 3); \
+ __REPEAT_10000_1(f, 4); \
+ __REPEAT_10000_1(f, 5); \
+ __REPEAT_10000_1(f, 6); \
+ __REPEAT_10000_1(f, 7); \
+ __REPEAT_10000_1(f, 8); \
+ __REPEAT_10000_1(f, 9)
+
+#define DECLARE_RETURN(i) int test_modules_return_ ## i(void)
+REPEAT_10000(DECLARE_RETURN);
+
+#endif
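The REPEAT_10000() machinery expands its argument once for every four-digit token 0000..9999, so DEFINE_RETURN(0042) in test_modules_helpers.c produces a helper returning 10042 - 10000 = 42. Summing all ten thousand helpers therefore yields 0 + 1 + ... + 9999 = 49995000, the value asserted by the KUnit test above. A hedged userspace sketch of that arithmetic:

#include <stdio.h>

int main(void)
{
	long sum = 0;

	for (int i = 0; i < 10000; i++)
		sum += i;		/* what test_modules_return_XXXX() returns */
	printf("%ld\n", sum);		/* prints 49995000 */
	return 0;
}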
diff --git a/arch/s390/lib/test_modules_helpers.c b/arch/s390/lib/test_modules_helpers.c
new file mode 100644
index 000000000000..1670349a03eb
--- /dev/null
+++ b/arch/s390/lib/test_modules_helpers.c
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+#include <linux/export.h>
+
+#include "test_modules.h"
+
+#define DEFINE_RETURN(i) \
+ int test_modules_return_ ## i(void) \
+ { \
+ return 1 ## i - 10000; \
+ } \
+ EXPORT_SYMBOL_GPL(test_modules_return_ ## i)
+REPEAT_10000(DEFINE_RETURN);
diff --git a/arch/s390/lib/test_unwind.c b/arch/s390/lib/test_unwind.c
index bda7ac0ddd29..2848e3fb2ff5 100644
--- a/arch/s390/lib/test_unwind.c
+++ b/arch/s390/lib/test_unwind.c
@@ -3,20 +3,28 @@
* Test module for unwind_for_each_frame
*/
-#define pr_fmt(fmt) "test_unwind: " fmt
+#include <kunit/test.h>
#include <asm/unwind.h>
#include <linux/completion.h>
#include <linux/kallsyms.h>
#include <linux/kthread.h>
+#include <linux/ftrace.h>
#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/slab.h>
#include <linux/string.h>
#include <linux/kprobes.h>
#include <linux/wait.h>
#include <asm/irq.h>
-#include <asm/delay.h>
+
+static struct kunit *current_test;
#define BT_BUF_SIZE (PAGE_SIZE * 4)
+static bool force_bt;
+module_param_named(backtrace, force_bt, bool, 0444);
+MODULE_PARM_DESC(backtrace, "print backtraces for all tests");
+
/*
* To avoid printk line limit split backtrace by lines
*/
@@ -28,7 +36,7 @@ static void print_backtrace(char *bt)
p = strsep(&bt, "\n");
if (!p)
break;
- pr_err("%s\n", p);
+ kunit_err(current_test, "%s\n", p);
}
}
@@ -39,7 +47,7 @@ static void print_backtrace(char *bt)
static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs,
unsigned long sp)
{
- int frame_count, prev_is_func2, seen_func2_func1;
+ int frame_count, prev_is_func2, seen_func2_func1, seen_arch_rethook_trampoline;
const int max_frames = 128;
struct unwind_state state;
size_t bt_pos = 0;
@@ -48,13 +56,14 @@ static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs,
bt = kmalloc(BT_BUF_SIZE, GFP_ATOMIC);
if (!bt) {
- pr_err("failed to allocate backtrace buffer\n");
+ kunit_err(current_test, "failed to allocate backtrace buffer\n");
return -ENOMEM;
}
/* Unwind. */
frame_count = 0;
prev_is_func2 = 0;
seen_func2_func1 = 0;
+ seen_arch_rethook_trampoline = 0;
unwind_for_each_frame(&state, task, regs, sp) {
unsigned long addr = unwind_get_return_address(&state);
char sym[KSYM_SYMBOL_LEN];
@@ -62,8 +71,9 @@ static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs,
if (frame_count++ == max_frames)
break;
if (state.reliable && !addr) {
- pr_err("unwind state reliable but addr is 0\n");
- return -EINVAL;
+ kunit_err(current_test, "unwind state reliable but addr is 0\n");
+ ret = -EINVAL;
+ break;
}
sprint_symbol(sym, addr);
if (bt_pos < BT_BUF_SIZE) {
@@ -73,28 +83,34 @@ static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs,
stack_type_name(state.stack_info.type),
(void *)state.sp, (void *)state.ip);
if (bt_pos >= BT_BUF_SIZE)
- pr_err("backtrace buffer is too small\n");
+ kunit_err(current_test, "backtrace buffer is too small\n");
}
frame_count += 1;
if (prev_is_func2 && str_has_prefix(sym, "unwindme_func1"))
seen_func2_func1 = 1;
prev_is_func2 = str_has_prefix(sym, "unwindme_func2");
+ if (str_has_prefix(sym, "arch_rethook_trampoline+0x0/"))
+ seen_arch_rethook_trampoline = 1;
}
/* Check the results. */
if (unwind_error(&state)) {
- pr_err("unwind error\n");
+ kunit_err(current_test, "unwind error\n");
ret = -EINVAL;
}
if (!seen_func2_func1) {
- pr_err("unwindme_func2 and unwindme_func1 not found\n");
+ kunit_err(current_test, "unwindme_func2 and unwindme_func1 not found\n");
ret = -EINVAL;
}
if (frame_count == max_frames) {
- pr_err("Maximum number of frames exceeded\n");
+ kunit_err(current_test, "Maximum number of frames exceeded\n");
ret = -EINVAL;
}
- if (ret)
+ if (seen_arch_rethook_trampoline) {
+ kunit_err(current_test, "arch_rethook_trampoline+0x0 in unwinding results\n");
+ ret = -EINVAL;
+ }
+ if (ret || force_bt)
print_backtrace(bt);
kfree(bt);
return ret;
@@ -118,31 +134,187 @@ static struct unwindme *unwindme;
#define UWM_REGS 0x2 /* Pass regs to test_unwind(). */
#define UWM_SP 0x4 /* Pass sp to test_unwind(). */
#define UWM_CALLER 0x8 /* Unwind starting from caller. */
-#define UWM_SWITCH_STACK 0x10 /* Use CALL_ON_STACK. */
+#define UWM_SWITCH_STACK 0x10 /* Use call_on_stack. */
#define UWM_IRQ 0x20 /* Unwind from irq context. */
-#define UWM_PGM 0x40 /* Unwind from program check handler. */
+#define UWM_PGM 0x40 /* Unwind from program check handler */
+#define UWM_KPROBE_ON_FTRACE 0x80 /* Unwind from kprobe handler called via ftrace. */
+#define UWM_FTRACE 0x100 /* Unwind from ftrace handler. */
+#define UWM_KRETPROBE 0x200 /* Unwind through kretprobed function. */
+#define UWM_KRETPROBE_HANDLER 0x400 /* Unwind from kretprobe handler. */
-static __always_inline unsigned long get_psw_addr(void)
+static __always_inline struct pt_regs fake_pt_regs(void)
{
- unsigned long psw_addr;
+ struct pt_regs regs;
+
+ memset(&regs, 0, sizeof(regs));
+ regs.gprs[15] = current_stack_pointer;
asm volatile(
"basr %[psw_addr],0\n"
- : [psw_addr] "=d" (psw_addr));
- return psw_addr;
+ : [psw_addr] "=d" (regs.psw.addr));
+ return regs;
}
-#ifdef CONFIG_KPROBES
-static int pgm_pre_handler(struct kprobe *p, struct pt_regs *regs)
+static int kretprobe_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
struct unwindme *u = unwindme;
+ if (!(u->flags & UWM_KRETPROBE_HANDLER))
+ return 0;
+
u->ret = test_unwind(NULL, (u->flags & UWM_REGS) ? regs : NULL,
(u->flags & UWM_SP) ? u->sp : 0);
+
return 0;
}
+
+static noinline notrace int test_unwind_kretprobed_func(struct unwindme *u)
+{
+ struct pt_regs regs;
+
+ if (!(u->flags & UWM_KRETPROBE))
+ return 0;
+
+ regs = fake_pt_regs();
+ return test_unwind(NULL, (u->flags & UWM_REGS) ? &regs : NULL,
+ (u->flags & UWM_SP) ? u->sp : 0);
+}
+
+static noinline int test_unwind_kretprobed_func_caller(struct unwindme *u)
+{
+ return test_unwind_kretprobed_func(u);
+}
+
+static int test_unwind_kretprobe(struct unwindme *u)
+{
+ int ret;
+ struct kretprobe my_kretprobe;
+
+ if (!IS_ENABLED(CONFIG_KPROBES))
+ kunit_skip(current_test, "requires CONFIG_KPROBES");
+
+ u->ret = -1; /* make sure kprobe is called */
+ unwindme = u;
+
+ memset(&my_kretprobe, 0, sizeof(my_kretprobe));
+ my_kretprobe.handler = kretprobe_ret_handler;
+ my_kretprobe.maxactive = 1;
+ my_kretprobe.kp.addr = (kprobe_opcode_t *)test_unwind_kretprobed_func;
+
+ ret = register_kretprobe(&my_kretprobe);
+
+ if (ret < 0) {
+ kunit_err(current_test, "register_kretprobe failed %d\n", ret);
+ return -EINVAL;
+ }
+
+ ret = test_unwind_kretprobed_func_caller(u);
+ unregister_kretprobe(&my_kretprobe);
+ unwindme = NULL;
+ if (u->flags & UWM_KRETPROBE_HANDLER)
+ ret = u->ret;
+ return ret;
+}
+
+static int kprobe_pre_handler(struct kprobe *p, struct pt_regs *regs)
+{
+ struct unwindme *u = unwindme;
+
+ u->ret = test_unwind(NULL, (u->flags & UWM_REGS) ? regs : NULL,
+ (u->flags & UWM_SP) ? u->sp : 0);
+ return 0;
+}
+
+extern const char test_unwind_kprobed_insn[];
+
+static noinline void test_unwind_kprobed_func(void)
+{
+ asm volatile(
+ " nopr %%r7\n"
+ "test_unwind_kprobed_insn:\n"
+ " nopr %%r7\n"
+ :);
+}
+
+static int test_unwind_kprobe(struct unwindme *u)
+{
+ struct kprobe kp;
+ int ret;
+
+ if (!IS_ENABLED(CONFIG_KPROBES))
+ kunit_skip(current_test, "requires CONFIG_KPROBES");
+ if (!IS_ENABLED(CONFIG_KPROBES_ON_FTRACE) && u->flags & UWM_KPROBE_ON_FTRACE)
+ kunit_skip(current_test, "requires CONFIG_KPROBES_ON_FTRACE");
+
+ u->ret = -1; /* make sure kprobe is called */
+ unwindme = u;
+ memset(&kp, 0, sizeof(kp));
+ kp.pre_handler = kprobe_pre_handler;
+ kp.addr = u->flags & UWM_KPROBE_ON_FTRACE ?
+ (kprobe_opcode_t *)test_unwind_kprobed_func :
+ (kprobe_opcode_t *)test_unwind_kprobed_insn;
+ ret = register_kprobe(&kp);
+ if (ret < 0) {
+ kunit_err(current_test, "register_kprobe failed %d\n", ret);
+ return -EINVAL;
+ }
+
+ test_unwind_kprobed_func();
+ unregister_kprobe(&kp);
+ unwindme = NULL;
+ return u->ret;
+}
+
+static void notrace __used test_unwind_ftrace_handler(unsigned long ip,
+ unsigned long parent_ip,
+ struct ftrace_ops *fops,
+ struct ftrace_regs *fregs)
+{
+ struct unwindme *u = (struct unwindme *)fregs->regs.gprs[2];
+
+ u->ret = test_unwind(NULL, (u->flags & UWM_REGS) ? &fregs->regs : NULL,
+ (u->flags & UWM_SP) ? u->sp : 0);
+}
+
+static noinline int test_unwind_ftraced_func(struct unwindme *u)
+{
+ return READ_ONCE(u)->ret;
+}
+
+static int test_unwind_ftrace(struct unwindme *u)
+{
+ int ret;
+#ifdef CONFIG_DYNAMIC_FTRACE
+ struct ftrace_ops *fops;
+
+ fops = kunit_kzalloc(current_test, sizeof(*fops), GFP_KERNEL);
+ fops->func = test_unwind_ftrace_handler;
+ fops->flags = FTRACE_OPS_FL_DYNAMIC |
+ FTRACE_OPS_FL_RECURSION |
+ FTRACE_OPS_FL_SAVE_REGS |
+ FTRACE_OPS_FL_PERMANENT;
+#else
+ kunit_skip(current_test, "requires CONFIG_DYNAMIC_FTRACE");
#endif
+ ret = ftrace_set_filter_ip(fops, (unsigned long)test_unwind_ftraced_func, 0, 0);
+ if (ret) {
+ kunit_err(current_test, "failed to set ftrace filter (%d)\n", ret);
+ return -1;
+ }
+
+ ret = register_ftrace_function(fops);
+ if (!ret) {
+ ret = test_unwind_ftraced_func(u);
+ unregister_ftrace_function(fops);
+ } else {
+ kunit_err(current_test, "failed to register ftrace handler (%d)\n", ret);
+ }
+
+ ftrace_set_filter_ip(fops, (unsigned long)test_unwind_ftraced_func, 1, 0);
+ return ret;
+}
+
/* This function may or may not appear in the backtrace. */
static noinline int unwindme_func4(struct unwindme *u)
{
@@ -153,40 +325,15 @@ static noinline int unwindme_func4(struct unwindme *u)
wait_event(u->task_wq, kthread_should_park());
kthread_parkme();
return 0;
-#ifdef CONFIG_KPROBES
- } else if (u->flags & UWM_PGM) {
- struct kprobe kp;
- int ret;
-
- unwindme = u;
- memset(&kp, 0, sizeof(kp));
- kp.symbol_name = "do_report_trap";
- kp.pre_handler = pgm_pre_handler;
- ret = register_kprobe(&kp);
- if (ret < 0) {
- pr_err("register_kprobe failed %d\n", ret);
- return -EINVAL;
- }
-
- /*
- * trigger specification exception
- */
- asm volatile(
- " mvcl %%r1,%%r1\n"
- "0: nopr %%r7\n"
- EX_TABLE(0b, 0b)
- :);
-
- unregister_kprobe(&kp);
- unwindme = NULL;
- return u->ret;
-#endif
+ } else if (u->flags & (UWM_PGM | UWM_KPROBE_ON_FTRACE)) {
+ return test_unwind_kprobe(u);
+ } else if (u->flags & (UWM_KRETPROBE | UWM_KRETPROBE_HANDLER)) {
+ return test_unwind_kretprobe(u);
+ } else if (u->flags & UWM_FTRACE) {
+ return test_unwind_ftrace(u);
} else {
- struct pt_regs regs;
+ struct pt_regs regs = fake_pt_regs();
- memset(&regs, 0, sizeof(regs));
- regs.psw.addr = get_psw_addr();
- regs.gprs[15] = current_stack_pointer();
return test_unwind(NULL,
(u->flags & UWM_REGS) ? &regs : NULL,
(u->flags & UWM_SP) ? u->sp : 0);
@@ -203,12 +350,16 @@ static noinline int unwindme_func3(struct unwindme *u)
/* This function must appear in the backtrace. */
static noinline int unwindme_func2(struct unwindme *u)
{
+ unsigned long flags, mflags;
int rc;
if (u->flags & UWM_SWITCH_STACK) {
- preempt_disable();
- rc = CALL_ON_STACK(unwindme_func3, S390_lowcore.nodat_stack, 1, u);
- preempt_enable();
+ local_irq_save(flags);
+ local_mcck_save(mflags);
+ rc = call_on_stack(1, S390_lowcore.nodat_stack,
+ int, unwindme_func3, struct unwindme *, u);
+ local_mcck_restore(mflags);
+ local_irq_restore(flags);
return rc;
} else {
return unwindme_func3(u);
@@ -221,31 +372,27 @@ static noinline int unwindme_func1(void *u)
return unwindme_func2((struct unwindme *)u);
}
-static void unwindme_irq_handler(struct ext_code ext_code,
- unsigned int param32,
- unsigned long param64)
+static void unwindme_timer_fn(struct timer_list *unused)
{
struct unwindme *u = READ_ONCE(unwindme);
- if (u && u->task == current) {
+ if (u) {
unwindme = NULL;
u->task = NULL;
u->ret = unwindme_func1(u);
+ complete(&u->task_ready);
}
}
+static struct timer_list unwind_timer;
+
static int test_unwind_irq(struct unwindme *u)
{
- preempt_disable();
- if (register_external_irq(EXT_IRQ_CLK_COMP, unwindme_irq_handler)) {
- pr_info("Couldn't reqister external interrupt handler");
- return -1;
- }
- u->task = current;
unwindme = u;
- udelay(1);
- unregister_external_irq(EXT_IRQ_CLK_COMP, unwindme_irq_handler);
- preempt_enable();
+ init_completion(&u->task_ready);
+ timer_setup(&unwind_timer, unwindme_timer_fn, 0);
+ mod_timer(&unwind_timer, jiffies + 1);
+ wait_for_completion(&u->task_ready);
return u->ret;
}
@@ -265,7 +412,7 @@ static int test_unwind_task(struct unwindme *u)
*/
task = kthread_run(unwindme_func1, u, "%s", __func__);
if (IS_ERR(task)) {
- pr_err("kthread_run() failed\n");
+ kunit_err(current_test, "kthread_run() failed\n");
return PTR_ERR(task);
}
/*
@@ -280,68 +427,96 @@ static int test_unwind_task(struct unwindme *u)
return ret;
}
-static int test_unwind_flags(int flags)
+struct test_params {
+ int flags;
+ char *name;
+};
+
+/*
+ * Create required parameter list for tests
+ */
+#define TEST_WITH_FLAGS(f) { .flags = f, .name = #f }
+static const struct test_params param_list[] = {
+ TEST_WITH_FLAGS(UWM_DEFAULT),
+ TEST_WITH_FLAGS(UWM_SP),
+ TEST_WITH_FLAGS(UWM_REGS),
+ TEST_WITH_FLAGS(UWM_SWITCH_STACK),
+ TEST_WITH_FLAGS(UWM_SP | UWM_REGS),
+ TEST_WITH_FLAGS(UWM_CALLER | UWM_SP),
+ TEST_WITH_FLAGS(UWM_CALLER | UWM_SP | UWM_REGS),
+ TEST_WITH_FLAGS(UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK),
+ TEST_WITH_FLAGS(UWM_THREAD),
+ TEST_WITH_FLAGS(UWM_THREAD | UWM_SP),
+ TEST_WITH_FLAGS(UWM_THREAD | UWM_CALLER | UWM_SP),
+ TEST_WITH_FLAGS(UWM_IRQ),
+ TEST_WITH_FLAGS(UWM_IRQ | UWM_SWITCH_STACK),
+ TEST_WITH_FLAGS(UWM_IRQ | UWM_SP),
+ TEST_WITH_FLAGS(UWM_IRQ | UWM_REGS),
+ TEST_WITH_FLAGS(UWM_IRQ | UWM_SP | UWM_REGS),
+ TEST_WITH_FLAGS(UWM_IRQ | UWM_CALLER | UWM_SP),
+ TEST_WITH_FLAGS(UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS),
+ TEST_WITH_FLAGS(UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK),
+ TEST_WITH_FLAGS(UWM_PGM),
+ TEST_WITH_FLAGS(UWM_PGM | UWM_SP),
+ TEST_WITH_FLAGS(UWM_PGM | UWM_REGS),
+ TEST_WITH_FLAGS(UWM_PGM | UWM_SP | UWM_REGS),
+ TEST_WITH_FLAGS(UWM_KPROBE_ON_FTRACE),
+ TEST_WITH_FLAGS(UWM_KPROBE_ON_FTRACE | UWM_SP),
+ TEST_WITH_FLAGS(UWM_KPROBE_ON_FTRACE | UWM_REGS),
+ TEST_WITH_FLAGS(UWM_KPROBE_ON_FTRACE | UWM_SP | UWM_REGS),
+ TEST_WITH_FLAGS(UWM_FTRACE),
+ TEST_WITH_FLAGS(UWM_FTRACE | UWM_SP),
+ TEST_WITH_FLAGS(UWM_FTRACE | UWM_REGS),
+ TEST_WITH_FLAGS(UWM_FTRACE | UWM_SP | UWM_REGS),
+ TEST_WITH_FLAGS(UWM_KRETPROBE),
+ TEST_WITH_FLAGS(UWM_KRETPROBE | UWM_SP),
+ TEST_WITH_FLAGS(UWM_KRETPROBE | UWM_REGS),
+ TEST_WITH_FLAGS(UWM_KRETPROBE | UWM_SP | UWM_REGS),
+ TEST_WITH_FLAGS(UWM_KRETPROBE_HANDLER),
+ TEST_WITH_FLAGS(UWM_KRETPROBE_HANDLER | UWM_SP),
+ TEST_WITH_FLAGS(UWM_KRETPROBE_HANDLER | UWM_REGS),
+ TEST_WITH_FLAGS(UWM_KRETPROBE_HANDLER | UWM_SP | UWM_REGS),
+};
+
+/*
+ * Parameter description generator: required for KUNIT_ARRAY_PARAM()
+ */
+static void get_desc(const struct test_params *params, char *desc)
+{
+ strscpy(desc, params->name, KUNIT_PARAM_DESC_SIZE);
+}
+
+/*
+ * Create test_unwind_gen_params
+ */
+KUNIT_ARRAY_PARAM(test_unwind, param_list, get_desc);
+
+static void test_unwind_flags(struct kunit *test)
{
struct unwindme u;
+ const struct test_params *params;
- u.flags = flags;
+ current_test = test;
+ params = (const struct test_params *)test->param_value;
+ u.flags = params->flags;
if (u.flags & UWM_THREAD)
- return test_unwind_task(&u);
+ KUNIT_EXPECT_EQ(test, 0, test_unwind_task(&u));
else if (u.flags & UWM_IRQ)
- return test_unwind_irq(&u);
+ KUNIT_EXPECT_EQ(test, 0, test_unwind_irq(&u));
else
- return unwindme_func1(&u);
+ KUNIT_EXPECT_EQ(test, 0, unwindme_func1(&u));
}
-static int test_unwind_init(void)
-{
- int ret = 0;
-
-#define TEST(flags) \
-do { \
- pr_info("[ RUN ] " #flags "\n"); \
- if (!test_unwind_flags((flags))) { \
- pr_info("[ OK ] " #flags "\n"); \
- } else { \
- pr_err("[ FAILED ] " #flags "\n"); \
- ret = -EINVAL; \
- } \
-} while (0)
-
- TEST(UWM_DEFAULT);
- TEST(UWM_SP);
- TEST(UWM_REGS);
- TEST(UWM_SWITCH_STACK);
- TEST(UWM_SP | UWM_REGS);
- TEST(UWM_CALLER | UWM_SP);
- TEST(UWM_CALLER | UWM_SP | UWM_REGS);
- TEST(UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK);
- TEST(UWM_THREAD);
- TEST(UWM_THREAD | UWM_SP);
- TEST(UWM_THREAD | UWM_CALLER | UWM_SP);
- TEST(UWM_IRQ);
- TEST(UWM_IRQ | UWM_SWITCH_STACK);
- TEST(UWM_IRQ | UWM_SP);
- TEST(UWM_IRQ | UWM_REGS);
- TEST(UWM_IRQ | UWM_SP | UWM_REGS);
- TEST(UWM_IRQ | UWM_CALLER | UWM_SP);
- TEST(UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS);
- TEST(UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK);
-#ifdef CONFIG_KPROBES
- TEST(UWM_PGM);
- TEST(UWM_PGM | UWM_SP);
- TEST(UWM_PGM | UWM_REGS);
- TEST(UWM_PGM | UWM_SP | UWM_REGS);
-#endif
-#undef TEST
+static struct kunit_case unwind_test_cases[] = {
+ KUNIT_CASE_PARAM(test_unwind_flags, test_unwind_gen_params),
+ {}
+};
- return ret;
-}
+static struct kunit_suite test_unwind_suite = {
+ .name = "test_unwind",
+ .test_cases = unwind_test_cases,
+};
-static void test_unwind_exit(void)
-{
-}
+kunit_test_suites(&test_unwind_suite);
-module_init(test_unwind_init);
-module_exit(test_unwind_exit);
MODULE_LICENSE("GPL");
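The conversion above replaces the hand-rolled TEST() macro with KUnit's parameterized test support: KUNIT_ARRAY_PARAM(test_unwind, param_list, get_desc) emits a generator named test_unwind_gen_params, and KUNIT_CASE_PARAM() runs test_unwind_flags() once per array entry, with get_desc() naming each subtest after its flag combination. A hedged, minimal sketch of the same pattern with purely illustrative names:

#include <kunit/test.h>
#include <linux/string.h>

struct demo_param { int value; const char *name; };

static const struct demo_param demo_params[] = {
	{ .value = 1, .name = "one" },
	{ .value = 2, .name = "two" },
};

static void demo_get_desc(const struct demo_param *p, char *desc)
{
	strscpy(desc, p->name, KUNIT_PARAM_DESC_SIZE);
}

/* Generates demo_gen_params() for use in KUNIT_CASE_PARAM(). */
KUNIT_ARRAY_PARAM(demo, demo_params, demo_get_desc);

static void demo_case(struct kunit *test)
{
	const struct demo_param *p = test->param_value;

	KUNIT_EXPECT_GT(test, p->value, 0);
}

static struct kunit_case demo_cases[] = {
	KUNIT_CASE_PARAM(demo_case, demo_gen_params),
	{}
};

static struct kunit_suite demo_suite = {
	.name = "demo",
	.test_cases = demo_cases,
};
kunit_test_suites(&demo_suite);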
diff --git a/arch/s390/lib/tishift.S b/arch/s390/lib/tishift.S
new file mode 100644
index 000000000000..96214f51f49b
--- /dev/null
+++ b/arch/s390/lib/tishift.S
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/export.h>
+#include <linux/linkage.h>
+#include <asm/nospec-insn.h>
+
+ .section .noinstr.text, "ax"
+
+ GEN_BR_THUNK %r14
+
+SYM_FUNC_START(__ashlti3)
+ lmg %r0,%r1,0(%r3)
+ cije %r4,0,1f
+ lhi %r3,64
+ sr %r3,%r4
+ jnh 0f
+ srlg %r3,%r1,0(%r3)
+ sllg %r0,%r0,0(%r4)
+ sllg %r1,%r1,0(%r4)
+ ogr %r0,%r3
+ j 1f
+0: sllg %r0,%r1,-64(%r4)
+ lghi %r1,0
+1: stmg %r0,%r1,0(%r2)
+ BR_EX %r14
+SYM_FUNC_END(__ashlti3)
+EXPORT_SYMBOL(__ashlti3)
+
+SYM_FUNC_START(__ashrti3)
+ lmg %r0,%r1,0(%r3)
+ cije %r4,0,1f
+ lhi %r3,64
+ sr %r3,%r4
+ jnh 0f
+ sllg %r3,%r0,0(%r3)
+ srlg %r1,%r1,0(%r4)
+ srag %r0,%r0,0(%r4)
+ ogr %r1,%r3
+ j 1f
+0: srag %r1,%r0,-64(%r4)
+ srag %r0,%r0,63
+1: stmg %r0,%r1,0(%r2)
+ BR_EX %r14
+SYM_FUNC_END(__ashrti3)
+EXPORT_SYMBOL(__ashrti3)
+
+SYM_FUNC_START(__lshrti3)
+ lmg %r0,%r1,0(%r3)
+ cije %r4,0,1f
+ lhi %r3,64
+ sr %r3,%r4
+ jnh 0f
+ sllg %r3,%r0,0(%r3)
+ srlg %r1,%r1,0(%r4)
+ srlg %r0,%r0,0(%r4)
+ ogr %r1,%r3
+ j 1f
+0: srlg %r1,%r0,-64(%r4)
+ lghi %r0,0
+1: stmg %r0,%r1,0(%r2)
+ BR_EX %r14
+SYM_FUNC_END(__lshrti3)
+EXPORT_SYMBOL(__lshrti3)
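These helpers implement the 128-bit shifts the compiler may emit for __int128 arithmetic; the value is handled as a high/low pair of 64-bit doublewords loaded with lmg and stored with stmg, with separate paths for shift counts below and at-or-above 64. A hedged C equivalent of __ashlti3 (the other two follow the same shape with right shifts and zero or sign fill):

#include <stdint.h>

struct u128_halves { uint64_t hi, lo; };

/* Sketch only: C equivalent of __ashlti3 for 0 <= n < 128. */
static struct u128_halves ashlti3_sketch(struct u128_halves v, unsigned int n)
{
	struct u128_halves r = v;

	if (n == 0)
		return r;
	if (n < 64) {
		r.hi = (v.hi << n) | (v.lo >> (64 - n));
		r.lo = v.lo << n;
	} else {
		r.hi = v.lo << (n - 64);
		r.lo = 0;
	}
	return r;
}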
diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c
index c4f8039a35e8..61d8dcd95bbc 100644
--- a/arch/s390/lib/uaccess.c
+++ b/arch/s390/lib/uaccess.c
@@ -8,437 +8,179 @@
* Gerald Schaefer (gerald.schaefer@de.ibm.com)
*/
-#include <linux/jump_label.h>
#include <linux/uaccess.h>
#include <linux/export.h>
-#include <linux/errno.h>
#include <linux/mm.h>
-#include <asm/mmu_context.h>
-#include <asm/facility.h>
+#include <asm/asm-extable.h>
+#include <asm/ctlreg.h>
-#ifndef CONFIG_HAVE_MARCH_Z10_FEATURES
-static DEFINE_STATIC_KEY_FALSE(have_mvcos);
-
-static int __init uaccess_init(void)
+#ifdef CONFIG_DEBUG_ENTRY
+void debug_user_asce(int exit)
{
- if (test_facility(27))
- static_branch_enable(&have_mvcos);
- return 0;
-}
-early_initcall(uaccess_init);
+ struct ctlreg cr1, cr7;
-static inline int copy_with_mvcos(void)
-{
- if (static_branch_likely(&have_mvcos))
- return 1;
- return 0;
-}
-#else
-static inline int copy_with_mvcos(void)
-{
- return 1;
+ local_ctl_store(1, &cr1);
+ local_ctl_store(7, &cr7);
+ if (cr1.val == S390_lowcore.kernel_asce.val && cr7.val == S390_lowcore.user_asce.val)
+ return;
+ panic("incorrect ASCE on kernel %s\n"
+ "cr1: %016lx cr7: %016lx\n"
+ "kernel: %016lx user: %016lx\n",
+ exit ? "exit" : "entry", cr1.val, cr7.val,
+ S390_lowcore.kernel_asce.val, S390_lowcore.user_asce.val);
}
-#endif
+#endif /*CONFIG_DEBUG_ENTRY */
-void set_fs(mm_segment_t fs)
+static unsigned long raw_copy_from_user_key(void *to, const void __user *from,
+ unsigned long size, unsigned long key)
{
- current->thread.mm_segment = fs;
- if (fs == USER_DS) {
- __ctl_load(S390_lowcore.user_asce, 1, 1);
- clear_cpu_flag(CIF_ASCE_PRIMARY);
- } else {
- __ctl_load(S390_lowcore.kernel_asce, 1, 1);
- set_cpu_flag(CIF_ASCE_PRIMARY);
- }
- if (fs & 1) {
- if (fs == USER_DS_SACF)
- __ctl_load(S390_lowcore.user_asce, 7, 7);
- else
- __ctl_load(S390_lowcore.kernel_asce, 7, 7);
- set_cpu_flag(CIF_ASCE_SECONDARY);
- }
-}
-EXPORT_SYMBOL(set_fs);
+ unsigned long rem;
+ union oac spec = {
+ .oac2.key = key,
+ .oac2.as = PSW_BITS_AS_SECONDARY,
+ .oac2.k = 1,
+ .oac2.a = 1,
+ };
-mm_segment_t enable_sacf_uaccess(void)
-{
- mm_segment_t old_fs;
- unsigned long asce, cr;
-
- old_fs = current->thread.mm_segment;
- if (old_fs & 1)
- return old_fs;
- current->thread.mm_segment |= 1;
- asce = S390_lowcore.kernel_asce;
- if (likely(old_fs == USER_DS)) {
- __ctl_store(cr, 1, 1);
- if (cr != S390_lowcore.kernel_asce) {
- __ctl_load(S390_lowcore.kernel_asce, 1, 1);
- set_cpu_flag(CIF_ASCE_PRIMARY);
- }
- asce = S390_lowcore.user_asce;
- }
- __ctl_store(cr, 7, 7);
- if (cr != asce) {
- __ctl_load(asce, 7, 7);
- set_cpu_flag(CIF_ASCE_SECONDARY);
- }
- return old_fs;
-}
-EXPORT_SYMBOL(enable_sacf_uaccess);
-
-void disable_sacf_uaccess(mm_segment_t old_fs)
-{
- current->thread.mm_segment = old_fs;
- if (old_fs == USER_DS && test_facility(27)) {
- __ctl_load(S390_lowcore.user_asce, 1, 1);
- clear_cpu_flag(CIF_ASCE_PRIMARY);
- }
-}
-EXPORT_SYMBOL(disable_sacf_uaccess);
-
-static inline unsigned long copy_from_user_mvcos(void *x, const void __user *ptr,
- unsigned long size)
-{
- register unsigned long reg0 asm("0") = 0x01UL;
- unsigned long tmp1, tmp2;
-
- tmp1 = -4096UL;
- asm volatile(
- "0: .insn ss,0xc80000000000,0(%0,%2),0(%1),0\n"
- "6: jz 4f\n"
- "1: algr %0,%3\n"
- " slgr %1,%3\n"
- " slgr %2,%3\n"
- " j 0b\n"
- "2: la %4,4095(%1)\n"/* %4 = ptr + 4095 */
- " nr %4,%3\n" /* %4 = (ptr + 4095) & -4096 */
- " slgr %4,%1\n"
- " clgr %0,%4\n" /* copy crosses next page boundary? */
- " jnh 5f\n"
- "3: .insn ss,0xc80000000000,0(%4,%2),0(%1),0\n"
- "7: slgr %0,%4\n"
- " j 5f\n"
- "4: slgr %0,%0\n"
- "5:\n"
- EX_TABLE(0b,2b) EX_TABLE(3b,5b) EX_TABLE(6b,2b) EX_TABLE(7b,5b)
- : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2)
- : "d" (reg0) : "cc", "memory");
- return size;
-}
-
-static inline unsigned long copy_from_user_mvcp(void *x, const void __user *ptr,
- unsigned long size)
-{
- unsigned long tmp1, tmp2;
- mm_segment_t old_fs;
-
- old_fs = enable_sacf_uaccess();
- tmp1 = -256UL;
asm volatile(
- " sacf 0\n"
- "0: mvcp 0(%0,%2),0(%1),%3\n"
- "7: jz 5f\n"
- "1: algr %0,%3\n"
- " la %1,256(%1)\n"
- " la %2,256(%2)\n"
- "2: mvcp 0(%0,%2),0(%1),%3\n"
- "8: jnz 1b\n"
- " j 5f\n"
- "3: la %4,255(%1)\n" /* %4 = ptr + 255 */
- " lghi %3,-4096\n"
- " nr %4,%3\n" /* %4 = (ptr + 255) & -4096 */
- " slgr %4,%1\n"
- " clgr %0,%4\n" /* copy crosses next page boundary? */
- " jnh 6f\n"
- "4: mvcp 0(%4,%2),0(%1),%3\n"
- "9: slgr %0,%4\n"
- " j 6f\n"
- "5: slgr %0,%0\n"
- "6: sacf 768\n"
- EX_TABLE(0b,3b) EX_TABLE(2b,3b) EX_TABLE(4b,6b)
- EX_TABLE(7b,3b) EX_TABLE(8b,3b) EX_TABLE(9b,6b)
- : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2)
- : : "cc", "memory");
- disable_sacf_uaccess(old_fs);
+ " lr 0,%[spec]\n"
+ "0: mvcos 0(%[to]),0(%[from]),%[size]\n"
+ "1: jz 5f\n"
+ " algr %[size],%[val]\n"
+ " slgr %[from],%[val]\n"
+ " slgr %[to],%[val]\n"
+ " j 0b\n"
+ "2: la %[rem],4095(%[from])\n" /* rem = from + 4095 */
+ " nr %[rem],%[val]\n" /* rem = (from + 4095) & -4096 */
+ " slgr %[rem],%[from]\n"
+ " clgr %[size],%[rem]\n" /* copy crosses next page boundary? */
+ " jnh 6f\n"
+ "3: mvcos 0(%[to]),0(%[from]),%[rem]\n"
+ "4: slgr %[size],%[rem]\n"
+ " j 6f\n"
+ "5: slgr %[size],%[size]\n"
+ "6:\n"
+ EX_TABLE(0b, 2b)
+ EX_TABLE(1b, 2b)
+ EX_TABLE(3b, 6b)
+ EX_TABLE(4b, 6b)
+ : [size] "+&a" (size), [from] "+&a" (from), [to] "+&a" (to), [rem] "=&a" (rem)
+ : [val] "a" (-4096UL), [spec] "d" (spec.val)
+ : "cc", "memory", "0");
return size;
}
unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n)
{
- if (copy_with_mvcos())
- return copy_from_user_mvcos(to, from, n);
- return copy_from_user_mvcp(to, from, n);
+ return raw_copy_from_user_key(to, from, n, 0);
}
EXPORT_SYMBOL(raw_copy_from_user);
-static inline unsigned long copy_to_user_mvcos(void __user *ptr, const void *x,
- unsigned long size)
+unsigned long _copy_from_user_key(void *to, const void __user *from,
+ unsigned long n, unsigned long key)
{
- register unsigned long reg0 asm("0") = 0x010000UL;
- unsigned long tmp1, tmp2;
+ unsigned long res = n;
- tmp1 = -4096UL;
- asm volatile(
- "0: .insn ss,0xc80000000000,0(%0,%1),0(%2),0\n"
- "6: jz 4f\n"
- "1: algr %0,%3\n"
- " slgr %1,%3\n"
- " slgr %2,%3\n"
- " j 0b\n"
- "2: la %4,4095(%1)\n"/* %4 = ptr + 4095 */
- " nr %4,%3\n" /* %4 = (ptr + 4095) & -4096 */
- " slgr %4,%1\n"
- " clgr %0,%4\n" /* copy crosses next page boundary? */
- " jnh 5f\n"
- "3: .insn ss,0xc80000000000,0(%4,%1),0(%2),0\n"
- "7: slgr %0,%4\n"
- " j 5f\n"
- "4: slgr %0,%0\n"
- "5:\n"
- EX_TABLE(0b,2b) EX_TABLE(3b,5b) EX_TABLE(6b,2b) EX_TABLE(7b,5b)
- : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2)
- : "d" (reg0) : "cc", "memory");
- return size;
+ might_fault();
+ if (!should_fail_usercopy()) {
+ instrument_copy_from_user_before(to, from, n);
+ res = raw_copy_from_user_key(to, from, n, key);
+ instrument_copy_from_user_after(to, from, n, res);
+ }
+ if (unlikely(res))
+ memset(to + (n - res), 0, res);
+ return res;
}
+EXPORT_SYMBOL(_copy_from_user_key);
-static inline unsigned long copy_to_user_mvcs(void __user *ptr, const void *x,
- unsigned long size)
+static unsigned long raw_copy_to_user_key(void __user *to, const void *from,
+ unsigned long size, unsigned long key)
{
- unsigned long tmp1, tmp2;
- mm_segment_t old_fs;
+ unsigned long rem;
+ union oac spec = {
+ .oac1.key = key,
+ .oac1.as = PSW_BITS_AS_SECONDARY,
+ .oac1.k = 1,
+ .oac1.a = 1,
+ };
- old_fs = enable_sacf_uaccess();
- tmp1 = -256UL;
asm volatile(
- " sacf 0\n"
- "0: mvcs 0(%0,%1),0(%2),%3\n"
- "7: jz 5f\n"
- "1: algr %0,%3\n"
- " la %1,256(%1)\n"
- " la %2,256(%2)\n"
- "2: mvcs 0(%0,%1),0(%2),%3\n"
- "8: jnz 1b\n"
- " j 5f\n"
- "3: la %4,255(%1)\n" /* %4 = ptr + 255 */
- " lghi %3,-4096\n"
- " nr %4,%3\n" /* %4 = (ptr + 255) & -4096 */
- " slgr %4,%1\n"
- " clgr %0,%4\n" /* copy crosses next page boundary? */
- " jnh 6f\n"
- "4: mvcs 0(%4,%1),0(%2),%3\n"
- "9: slgr %0,%4\n"
- " j 6f\n"
- "5: slgr %0,%0\n"
- "6: sacf 768\n"
- EX_TABLE(0b,3b) EX_TABLE(2b,3b) EX_TABLE(4b,6b)
- EX_TABLE(7b,3b) EX_TABLE(8b,3b) EX_TABLE(9b,6b)
- : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2)
- : : "cc", "memory");
- disable_sacf_uaccess(old_fs);
+ " lr 0,%[spec]\n"
+ "0: mvcos 0(%[to]),0(%[from]),%[size]\n"
+ "1: jz 5f\n"
+ " algr %[size],%[val]\n"
+ " slgr %[to],%[val]\n"
+ " slgr %[from],%[val]\n"
+ " j 0b\n"
+ "2: la %[rem],4095(%[to])\n" /* rem = to + 4095 */
+ " nr %[rem],%[val]\n" /* rem = (to + 4095) & -4096 */
+ " slgr %[rem],%[to]\n"
+ " clgr %[size],%[rem]\n" /* copy crosses next page boundary? */
+ " jnh 6f\n"
+ "3: mvcos 0(%[to]),0(%[from]),%[rem]\n"
+ "4: slgr %[size],%[rem]\n"
+ " j 6f\n"
+ "5: slgr %[size],%[size]\n"
+ "6:\n"
+ EX_TABLE(0b, 2b)
+ EX_TABLE(1b, 2b)
+ EX_TABLE(3b, 6b)
+ EX_TABLE(4b, 6b)
+ : [size] "+&a" (size), [to] "+&a" (to), [from] "+&a" (from), [rem] "=&a" (rem)
+ : [val] "a" (-4096UL), [spec] "d" (spec.val)
+ : "cc", "memory", "0");
return size;
}
unsigned long raw_copy_to_user(void __user *to, const void *from, unsigned long n)
{
- if (copy_with_mvcos())
- return copy_to_user_mvcos(to, from, n);
- return copy_to_user_mvcs(to, from, n);
+ return raw_copy_to_user_key(to, from, n, 0);
}
EXPORT_SYMBOL(raw_copy_to_user);
-static inline unsigned long copy_in_user_mvcos(void __user *to, const void __user *from,
- unsigned long size)
-{
- register unsigned long reg0 asm("0") = 0x010001UL;
- unsigned long tmp1, tmp2;
-
- tmp1 = -4096UL;
- /* FIXME: copy with reduced length. */
- asm volatile(
- "0: .insn ss,0xc80000000000,0(%0,%1),0(%2),0\n"
- " jz 2f\n"
- "1: algr %0,%3\n"
- " slgr %1,%3\n"
- " slgr %2,%3\n"
- " j 0b\n"
- "2:slgr %0,%0\n"
- "3: \n"
- EX_TABLE(0b,3b)
- : "+a" (size), "+a" (to), "+a" (from), "+a" (tmp1), "=a" (tmp2)
- : "d" (reg0) : "cc", "memory");
- return size;
-}
-
-static inline unsigned long copy_in_user_mvc(void __user *to, const void __user *from,
- unsigned long size)
+unsigned long _copy_to_user_key(void __user *to, const void *from,
+ unsigned long n, unsigned long key)
{
- mm_segment_t old_fs;
- unsigned long tmp1;
-
- old_fs = enable_sacf_uaccess();
- asm volatile(
- " sacf 256\n"
- " aghi %0,-1\n"
- " jo 5f\n"
- " bras %3,3f\n"
- "0: aghi %0,257\n"
- "1: mvc 0(1,%1),0(%2)\n"
- " la %1,1(%1)\n"
- " la %2,1(%2)\n"
- " aghi %0,-1\n"
- " jnz 1b\n"
- " j 5f\n"
- "2: mvc 0(256,%1),0(%2)\n"
- " la %1,256(%1)\n"
- " la %2,256(%2)\n"
- "3: aghi %0,-256\n"
- " jnm 2b\n"
- "4: ex %0,1b-0b(%3)\n"
- "5: slgr %0,%0\n"
- "6: sacf 768\n"
- EX_TABLE(1b,6b) EX_TABLE(2b,0b) EX_TABLE(4b,0b)
- : "+a" (size), "+a" (to), "+a" (from), "=a" (tmp1)
- : : "cc", "memory");
- disable_sacf_uaccess(old_fs);
- return size;
-}
-
-unsigned long raw_copy_in_user(void __user *to, const void __user *from, unsigned long n)
-{
- if (copy_with_mvcos())
- return copy_in_user_mvcos(to, from, n);
- return copy_in_user_mvc(to, from, n);
-}
-EXPORT_SYMBOL(raw_copy_in_user);
-
-static inline unsigned long clear_user_mvcos(void __user *to, unsigned long size)
-{
- register unsigned long reg0 asm("0") = 0x010000UL;
- unsigned long tmp1, tmp2;
-
- tmp1 = -4096UL;
- asm volatile(
- "0: .insn ss,0xc80000000000,0(%0,%1),0(%4),0\n"
- " jz 4f\n"
- "1: algr %0,%2\n"
- " slgr %1,%2\n"
- " j 0b\n"
- "2: la %3,4095(%1)\n"/* %4 = to + 4095 */
- " nr %3,%2\n" /* %4 = (to + 4095) & -4096 */
- " slgr %3,%1\n"
- " clgr %0,%3\n" /* copy crosses next page boundary? */
- " jnh 5f\n"
- "3: .insn ss,0xc80000000000,0(%3,%1),0(%4),0\n"
- " slgr %0,%3\n"
- " j 5f\n"
- "4: slgr %0,%0\n"
- "5:\n"
- EX_TABLE(0b,2b) EX_TABLE(3b,5b)
- : "+a" (size), "+a" (to), "+a" (tmp1), "=a" (tmp2)
- : "a" (empty_zero_page), "d" (reg0) : "cc", "memory");
- return size;
-}
-
-static inline unsigned long clear_user_xc(void __user *to, unsigned long size)
-{
- mm_segment_t old_fs;
- unsigned long tmp1, tmp2;
-
- old_fs = enable_sacf_uaccess();
- asm volatile(
- " sacf 256\n"
- " aghi %0,-1\n"
- " jo 5f\n"
- " bras %3,3f\n"
- " xc 0(1,%1),0(%1)\n"
- "0: aghi %0,257\n"
- " la %2,255(%1)\n" /* %2 = ptr + 255 */
- " srl %2,12\n"
- " sll %2,12\n" /* %2 = (ptr + 255) & -4096 */
- " slgr %2,%1\n"
- " clgr %0,%2\n" /* clear crosses next page boundary? */
- " jnh 5f\n"
- " aghi %2,-1\n"
- "1: ex %2,0(%3)\n"
- " aghi %2,1\n"
- " slgr %0,%2\n"
- " j 5f\n"
- "2: xc 0(256,%1),0(%1)\n"
- " la %1,256(%1)\n"
- "3: aghi %0,-256\n"
- " jnm 2b\n"
- "4: ex %0,0(%3)\n"
- "5: slgr %0,%0\n"
- "6: sacf 768\n"
- EX_TABLE(1b,6b) EX_TABLE(2b,0b) EX_TABLE(4b,0b)
- : "+a" (size), "+a" (to), "=a" (tmp1), "=a" (tmp2)
- : : "cc", "memory");
- disable_sacf_uaccess(old_fs);
- return size;
+ might_fault();
+ if (should_fail_usercopy())
+ return n;
+ instrument_copy_to_user(to, from, n);
+ return raw_copy_to_user_key(to, from, n, key);
}
+EXPORT_SYMBOL(_copy_to_user_key);
unsigned long __clear_user(void __user *to, unsigned long size)
{
- if (copy_with_mvcos())
- return clear_user_mvcos(to, size);
- return clear_user_xc(to, size);
-}
-EXPORT_SYMBOL(__clear_user);
-
-static inline unsigned long strnlen_user_srst(const char __user *src,
- unsigned long size)
-{
- register unsigned long reg0 asm("0") = 0;
- unsigned long tmp1, tmp2;
+ unsigned long rem;
+ union oac spec = {
+ .oac1.as = PSW_BITS_AS_SECONDARY,
+ .oac1.a = 1,
+ };
asm volatile(
- " la %2,0(%1)\n"
- " la %3,0(%0,%1)\n"
- " slgr %0,%0\n"
- " sacf 256\n"
- "0: srst %3,%2\n"
- " jo 0b\n"
- " la %0,1(%3)\n" /* strnlen_user results includes \0 */
- " slgr %0,%1\n"
- "1: sacf 768\n"
- EX_TABLE(0b,1b)
- : "+a" (size), "+a" (src), "=a" (tmp1), "=a" (tmp2)
- : "d" (reg0) : "cc", "memory");
+ " lr 0,%[spec]\n"
+ "0: mvcos 0(%[to]),0(%[zeropg]),%[size]\n"
+ "1: jz 5f\n"
+ " algr %[size],%[val]\n"
+ " slgr %[to],%[val]\n"
+ " j 0b\n"
+ "2: la %[rem],4095(%[to])\n" /* rem = to + 4095 */
+ " nr %[rem],%[val]\n" /* rem = (to + 4095) & -4096 */
+ " slgr %[rem],%[to]\n"
+ " clgr %[size],%[rem]\n" /* copy crosses next page boundary? */
+ " jnh 6f\n"
+ "3: mvcos 0(%[to]),0(%[zeropg]),%[rem]\n"
+ "4: slgr %[size],%[rem]\n"
+ " j 6f\n"
+ "5: slgr %[size],%[size]\n"
+ "6:\n"
+ EX_TABLE(0b, 2b)
+ EX_TABLE(1b, 2b)
+ EX_TABLE(3b, 6b)
+ EX_TABLE(4b, 6b)
+ : [size] "+&a" (size), [to] "+&a" (to), [rem] "=&a" (rem)
+ : [val] "a" (-4096UL), [zeropg] "a" (empty_zero_page), [spec] "d" (spec.val)
+ : "cc", "memory", "0");
return size;
}
-
-unsigned long __strnlen_user(const char __user *src, unsigned long size)
-{
- mm_segment_t old_fs;
- unsigned long len;
-
- if (unlikely(!size))
- return 0;
- old_fs = enable_sacf_uaccess();
- len = strnlen_user_srst(src, size);
- disable_sacf_uaccess(old_fs);
- return len;
-}
-EXPORT_SYMBOL(__strnlen_user);
-
-long __strncpy_from_user(char *dst, const char __user *src, long size)
-{
- size_t done, len, offset, len_str;
-
- if (unlikely(size <= 0))
- return 0;
- done = 0;
- do {
- offset = (size_t)src & (L1_CACHE_BYTES - 1);
- len = min(size - done, L1_CACHE_BYTES - offset);
- if (copy_from_user(dst, src, len))
- return -EFAULT;
- len_str = strnlen(dst, len);
- done += len_str;
- src += len_str;
- dst += len_str;
- } while ((len_str == len) && (done < size));
- return done;
-}
-EXPORT_SYMBOL(__strncpy_from_user);
+EXPORT_SYMBOL(__clear_user);
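All three mvcos-based routines above share the same fault-handling idea: mvcos copies with the key and address-space controls taken from the oac word in GPR 0, the loop advancing 4 KiB at a time while mvcos reports partial completion; if an access exception occurs, the exception table entries route to the label that recomputes the length as just the bytes below the next 4 KiB boundary of the user-side operand and retries, so the return value is the number of bytes that could not be copied. A hedged, simplified C sketch of that flow (it ignores the 4 KiB stepping loop; mvcos_chunk() is an assumed stand-in that returns nonzero on fault):

/* Assumed helper, not a real kernel API: one copy attempt, nonzero means it faulted. */
static int mvcos_chunk(void *to, const void *from, unsigned long len);

/* Sketch only: fault-retry flow of raw_copy_from_user_key() above. */
static unsigned long copy_from_user_sketch(void *to, const void *from,
					   unsigned long size)
{
	unsigned long rem;

	if (!mvcos_chunk(to, from, size))
		return 0;			/* no fault, everything copied */
	rem = -(unsigned long)from & 4095;	/* bytes below next 4 KiB boundary */
	if (size <= rem)
		return size;			/* fault within the first page */
	if (mvcos_chunk(to, from, rem))
		return size;			/* reduced copy faulted as well */
	return size - rem;			/* bytes left uncopied */
}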
diff --git a/arch/s390/lib/xor.c b/arch/s390/lib/xor.c
index 29d9470dbceb..fb924a8041dc 100644
--- a/arch/s390/lib/xor.c
+++ b/arch/s390/lib/xor.c
@@ -11,7 +11,8 @@
#include <linux/raid/xor.h>
#include <asm/xor.h>
-static void xor_xc_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+static void xor_xc_2(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2)
{
asm volatile(
" larl 1,2f\n"
@@ -32,8 +33,9 @@ static void xor_xc_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
: "0", "1", "cc", "memory");
}
-static void xor_xc_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3)
+static void xor_xc_3(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3)
{
asm volatile(
" larl 1,2f\n"
@@ -58,8 +60,10 @@ static void xor_xc_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
: : "0", "1", "cc", "memory");
}
-static void xor_xc_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3, unsigned long *p4)
+static void xor_xc_4(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4)
{
asm volatile(
" larl 1,2f\n"
@@ -88,12 +92,12 @@ static void xor_xc_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
: : "0", "1", "cc", "memory");
}
-static void xor_xc_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3, unsigned long *p4, unsigned long *p5)
+static void xor_xc_5(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4,
+ const unsigned long * __restrict p5)
{
- /* Get around a gcc oddity */
- register unsigned long *reg7 asm ("7") = p5;
-
asm volatile(
" larl 1,2f\n"
" aghi %0,-1\n"
@@ -122,7 +126,7 @@ static void xor_xc_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
" xc 0(1,%1),0(%5)\n"
"3:\n"
: "+d" (bytes), "+a" (p1), "+a" (p2), "+a" (p3), "+a" (p4),
- "+a" (reg7)
+ "+a" (p5)
: : "0", "1", "cc", "memory");
}
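The __restrict qualifiers match the updated xor_block_template prototypes these routines are registered with; the routines themselves still XOR the source buffers into p1 using the xc instruction, 256 bytes per iteration with ex handling the tail. A hedged C equivalent of xor_xc_2(), assuming bytes is a multiple of sizeof(unsigned long) as it is for RAID pages:

/* Sketch only: what xor_xc_2() computes. */
static void xor_2_sketch(unsigned long bytes,
			 unsigned long *restrict p1,
			 const unsigned long *restrict p2)
{
	unsigned long i;

	for (i = 0; i < bytes / sizeof(unsigned long); i++)
		p1[i] ^= p2[i];
}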
diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile
index 3175413186b9..352ff520fd94 100644
--- a/arch/s390/mm/Makefile
+++ b/arch/s390/mm/Makefile
@@ -4,12 +4,10 @@
#
obj-y := init.o fault.o extmem.o mmap.o vmem.o maccess.o
-obj-y += page-states.o pageattr.o pgtable.o pgalloc.o
+obj-y += page-states.o pageattr.o pgtable.o pgalloc.o extable.o
obj-$(CONFIG_CMM) += cmm.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
-obj-$(CONFIG_S390_PTDUMP) += dump_pagetables.o
+obj-$(CONFIG_PTDUMP_CORE) += dump_pagetables.o
obj-$(CONFIG_PGSTE) += gmap.o
-
-KASAN_SANITIZE_kasan_init.o := n
-obj-$(CONFIG_KASAN) += kasan_init.o
+obj-$(CONFIG_PFAULT) += pfault.o
diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c
index a51c892f14f3..f8b13f247646 100644
--- a/arch/s390/mm/cmm.c
+++ b/arch/s390/mm/cmm.c
@@ -14,15 +14,13 @@
#include <linux/moduleparam.h>
#include <linux/gfp.h>
#include <linux/sched.h>
+#include <linux/string_helpers.h>
#include <linux/sysctl.h>
-#include <linux/ctype.h>
#include <linux/swap.h>
#include <linux/kthread.h>
#include <linux/oom.h>
-#include <linux/suspend.h>
#include <linux/uaccess.h>
-#include <asm/pgalloc.h>
#include <asm/diag.h>
#ifdef CONFIG_CMM_IUCV
@@ -49,7 +47,6 @@ static volatile long cmm_pages_target;
static volatile long cmm_timed_pages_target;
static long cmm_timeout_pages;
static long cmm_timeout_seconds;
-static int cmm_suspended;
static struct cmm_page_array *cmm_page_list;
static struct cmm_page_array *cmm_timed_page_list;
@@ -93,7 +90,7 @@ static long cmm_alloc_pages(long nr, long *counter,
} else
free_page((unsigned long) npa);
}
- diag10_range(addr >> PAGE_SHIFT, 1);
+ diag10_range(virt_to_pfn((void *)addr), 1);
pa->pages[pa->index++] = addr;
(*counter)++;
spin_unlock(&cmm_lock);
@@ -151,9 +148,9 @@ static int cmm_thread(void *dummy)
while (1) {
rc = wait_event_interruptible(cmm_thread_wait,
- (!cmm_suspended && (cmm_pages != cmm_pages_target ||
- cmm_timed_pages != cmm_timed_pages_target)) ||
- kthread_should_stop());
+ cmm_pages != cmm_pages_target ||
+ cmm_timed_pages != cmm_timed_pages_target ||
+ kthread_should_stop());
if (kthread_should_stop() || rc == -ERESTARTSYS) {
cmm_pages_target = cmm_pages;
cmm_timed_pages_target = cmm_timed_pages;
@@ -191,7 +188,7 @@ static void cmm_set_timer(void)
del_timer(&cmm_timer);
return;
}
- mod_timer(&cmm_timer, jiffies + cmm_timeout_seconds * HZ);
+ mod_timer(&cmm_timer, jiffies + msecs_to_jiffies(cmm_timeout_seconds * MSEC_PER_SEC));
}
static void cmm_timer_fn(struct timer_list *unused)
@@ -247,7 +244,7 @@ static int cmm_skip_blanks(char *cp, char **endp)
}
static int cmm_pages_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
long nr = cmm_get_pages();
struct ctl_table ctl_entry = {
@@ -266,7 +263,7 @@ static int cmm_pages_handler(struct ctl_table *ctl, int write,
}
static int cmm_timed_pages_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp,
+ void *buffer, size_t *lenp,
loff_t *ppos)
{
long nr = cmm_get_timed_pages();
@@ -286,7 +283,7 @@ static int cmm_timed_pages_handler(struct ctl_table *ctl, int write,
}
static int cmm_timeout_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
char buf[64], *p;
long nr, seconds;
@@ -299,8 +296,7 @@ static int cmm_timeout_handler(struct ctl_table *ctl, int write,
if (write) {
len = min(*lenp, sizeof(buf));
- if (copy_from_user(buf, buffer, len))
- return -EFAULT;
+ memcpy(buf, buffer, len);
buf[len - 1] = '\0';
cmm_skip_blanks(buf, &p);
nr = simple_strtoul(p, &p, 0);
@@ -313,8 +309,7 @@ static int cmm_timeout_handler(struct ctl_table *ctl, int write,
cmm_timeout_pages, cmm_timeout_seconds);
if (len > *lenp)
len = *lenp;
- if (copy_to_user(buffer, buf, len))
- return -EFAULT;
+ memcpy(buffer, buf, len);
*lenp = len;
*ppos += len;
}
@@ -337,17 +332,6 @@ static struct ctl_table cmm_table[] = {
.mode = 0644,
.proc_handler = cmm_timeout_handler,
},
- { }
-};
-
-static struct ctl_table cmm_dir_table[] = {
- {
- .procname = "vm",
- .maxlen = 0,
- .mode = 0555,
- .child = cmm_table,
- },
- { }
};
#ifdef CONFIG_CMM_IUCV
@@ -390,54 +374,19 @@ static void cmm_smsg_target(const char *from, char *msg)
static struct ctl_table_header *cmm_sysctl_header;
-static int cmm_suspend(void)
-{
- cmm_suspended = 1;
- cmm_free_pages(cmm_pages, &cmm_pages, &cmm_page_list);
- cmm_free_pages(cmm_timed_pages, &cmm_timed_pages, &cmm_timed_page_list);
- return 0;
-}
-
-static int cmm_resume(void)
-{
- cmm_suspended = 0;
- cmm_kick_thread();
- return 0;
-}
-
-static int cmm_power_event(struct notifier_block *this,
- unsigned long event, void *ptr)
-{
- switch (event) {
- case PM_POST_HIBERNATION:
- return cmm_resume();
- case PM_HIBERNATION_PREPARE:
- return cmm_suspend();
- default:
- return NOTIFY_DONE;
- }
-}
-
-static struct notifier_block cmm_power_notifier = {
- .notifier_call = cmm_power_event,
-};
-
static int __init cmm_init(void)
{
int rc = -ENOMEM;
- cmm_sysctl_header = register_sysctl_table(cmm_dir_table);
+ cmm_sysctl_header = register_sysctl("vm", cmm_table);
if (!cmm_sysctl_header)
goto out_sysctl;
#ifdef CONFIG_CMM_IUCV
/* convert sender to uppercase characters */
- if (sender) {
- int len = strlen(sender);
- while (len--)
- sender[len] = toupper(sender[len]);
- } else {
+ if (sender)
+ string_upper(sender, sender);
+ else
sender = cmm_default_sender;
- }
rc = smsg_register_callback(SMSG_PREFIX, cmm_smsg_target);
if (rc < 0)
@@ -446,16 +395,11 @@ static int __init cmm_init(void)
rc = register_oom_notifier(&cmm_oom_nb);
if (rc < 0)
goto out_oom_notify;
- rc = register_pm_notifier(&cmm_power_notifier);
- if (rc)
- goto out_pm;
cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread");
if (!IS_ERR(cmm_thread_ptr))
return 0;
rc = PTR_ERR(cmm_thread_ptr);
- unregister_pm_notifier(&cmm_power_notifier);
-out_pm:
unregister_oom_notifier(&cmm_oom_nb);
out_oom_notify:
#ifdef CONFIG_CMM_IUCV
@@ -475,7 +419,6 @@ static void __exit cmm_exit(void)
#ifdef CONFIG_CMM_IUCV
smsg_unregister_callback(SMSG_PREFIX, cmm_smsg_target);
#endif
- unregister_pm_notifier(&cmm_power_notifier);
unregister_oom_notifier(&cmm_oom_nb);
kthread_stop(cmm_thread_ptr);
del_timer_sync(&cmm_timer);
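register_sysctl() takes the directory path as a string, so the extra cmm_dir_table that only existed to supply the "vm" parent via .child can be dropped. A hedged minimal sketch of the new registration pattern, with purely illustrative names and assuming a kernel where register_sysctl() sizes the table itself (as the removal of the sentinel entry above suggests):

#include <linux/sysctl.h>
#include <linux/errno.h>
#include <linux/init.h>

static int demo_value;

static struct ctl_table demo_table[] = {
	{
		.procname	= "demo_knob",
		.data		= &demo_value,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
};

static struct ctl_table_header *demo_header;

static int __init demo_sysctl_init(void)
{
	demo_header = register_sysctl("vm", demo_table);
	return demo_header ? 0 : -ENOMEM;
}
late_initcall(demo_sysctl_init);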
diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c
index 5d67b81c704a..d37a8f607b71 100644
--- a/arch/s390/mm/dump_pagetables.c
+++ b/arch/s390/mm/dump_pagetables.c
@@ -1,12 +1,17 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/set_memory.h>
+#include <linux/ptdump.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
-#include <linux/sched.h>
#include <linux/mm.h>
+#include <linux/kfence.h>
#include <linux/kasan.h>
+#include <asm/ptdump.h>
#include <asm/kasan.h>
+#include <asm/abs_lowcore.h>
+#include <asm/nospec-branch.h>
#include <asm/sections.h>
-#include <asm/pgtable.h>
+#include <asm/maccess.h>
static unsigned long max_addr;
@@ -16,277 +21,295 @@ struct addr_marker {
};
enum address_markers_idx {
- IDENTITY_NR = 0,
+ IDENTITY_BEFORE_NR = 0,
+ IDENTITY_BEFORE_END_NR,
+ AMODE31_START_NR,
+ AMODE31_END_NR,
KERNEL_START_NR,
KERNEL_END_NR,
-#ifdef CONFIG_KASAN
- KASAN_SHADOW_START_NR,
- KASAN_SHADOW_END_NR,
+#ifdef CONFIG_KFENCE
+ KFENCE_START_NR,
+ KFENCE_END_NR,
#endif
+ IDENTITY_AFTER_NR,
+ IDENTITY_AFTER_END_NR,
VMEMMAP_NR,
+ VMEMMAP_END_NR,
VMALLOC_NR,
+ VMALLOC_END_NR,
MODULES_NR,
+ MODULES_END_NR,
+ ABS_LOWCORE_NR,
+ ABS_LOWCORE_END_NR,
+ MEMCPY_REAL_NR,
+ MEMCPY_REAL_END_NR,
+#ifdef CONFIG_KASAN
+ KASAN_SHADOW_START_NR,
+ KASAN_SHADOW_END_NR,
+#endif
};
static struct addr_marker address_markers[] = {
- [IDENTITY_NR] = {0, "Identity Mapping"},
+ [IDENTITY_BEFORE_NR] = {0, "Identity Mapping Start"},
+ [IDENTITY_BEFORE_END_NR] = {(unsigned long)_stext, "Identity Mapping End"},
+ [AMODE31_START_NR] = {0, "Amode31 Area Start"},
+ [AMODE31_END_NR] = {0, "Amode31 Area End"},
[KERNEL_START_NR] = {(unsigned long)_stext, "Kernel Image Start"},
[KERNEL_END_NR] = {(unsigned long)_end, "Kernel Image End"},
+#ifdef CONFIG_KFENCE
+ [KFENCE_START_NR] = {0, "KFence Pool Start"},
+ [KFENCE_END_NR] = {0, "KFence Pool End"},
+#endif
+ [IDENTITY_AFTER_NR] = {(unsigned long)_end, "Identity Mapping Start"},
+ [IDENTITY_AFTER_END_NR] = {0, "Identity Mapping End"},
+ [VMEMMAP_NR] = {0, "vmemmap Area Start"},
+ [VMEMMAP_END_NR] = {0, "vmemmap Area End"},
+ [VMALLOC_NR] = {0, "vmalloc Area Start"},
+ [VMALLOC_END_NR] = {0, "vmalloc Area End"},
+ [MODULES_NR] = {0, "Modules Area Start"},
+ [MODULES_END_NR] = {0, "Modules Area End"},
+ [ABS_LOWCORE_NR] = {0, "Lowcore Area Start"},
+ [ABS_LOWCORE_END_NR] = {0, "Lowcore Area End"},
+ [MEMCPY_REAL_NR] = {0, "Real Memory Copy Area Start"},
+ [MEMCPY_REAL_END_NR] = {0, "Real Memory Copy Area End"},
#ifdef CONFIG_KASAN
[KASAN_SHADOW_START_NR] = {KASAN_SHADOW_START, "Kasan Shadow Start"},
[KASAN_SHADOW_END_NR] = {KASAN_SHADOW_END, "Kasan Shadow End"},
#endif
- [VMEMMAP_NR] = {0, "vmemmap Area"},
- [VMALLOC_NR] = {0, "vmalloc Area"},
- [MODULES_NR] = {0, "Modules Area"},
{ -1, NULL }
};
struct pg_state {
+ struct ptdump_state ptdump;
+ struct seq_file *seq;
int level;
unsigned int current_prot;
+ bool check_wx;
+ unsigned long wx_pages;
unsigned long start_address;
- unsigned long current_address;
const struct addr_marker *marker;
};
+#define pt_dump_seq_printf(m, fmt, args...) \
+({ \
+ struct seq_file *__m = (m); \
+ \
+ if (__m) \
+ seq_printf(__m, fmt, ##args); \
+})
+
+#define pt_dump_seq_puts(m, fmt) \
+({ \
+ struct seq_file *__m = (m); \
+ \
+ if (__m) \
+ seq_printf(__m, fmt); \
+})
+
static void print_prot(struct seq_file *m, unsigned int pr, int level)
{
static const char * const level_name[] =
{ "ASCE", "PGD", "PUD", "PMD", "PTE" };
- seq_printf(m, "%s ", level_name[level]);
+ pt_dump_seq_printf(m, "%s ", level_name[level]);
if (pr & _PAGE_INVALID) {
- seq_printf(m, "I\n");
+ pt_dump_seq_printf(m, "I\n");
return;
}
- seq_puts(m, (pr & _PAGE_PROTECT) ? "RO " : "RW ");
- seq_puts(m, (pr & _PAGE_NOEXEC) ? "NX\n" : "X\n");
+ pt_dump_seq_puts(m, (pr & _PAGE_PROTECT) ? "RO " : "RW ");
+ pt_dump_seq_puts(m, (pr & _PAGE_NOEXEC) ? "NX\n" : "X\n");
}
-static void note_page(struct seq_file *m, struct pg_state *st,
- unsigned int new_prot, int level)
+static void note_prot_wx(struct pg_state *st, unsigned long addr)
+{
+#ifdef CONFIG_DEBUG_WX
+ if (!st->check_wx)
+ return;
+ if (st->current_prot & _PAGE_INVALID)
+ return;
+ if (st->current_prot & _PAGE_PROTECT)
+ return;
+ if (st->current_prot & _PAGE_NOEXEC)
+ return;
+ /*
+ * The first lowcore page is W+X if spectre mitigations are using
+ * trampolines or the BEAR enhancements facility is not installed,
+ * in which case we have two lpswe instructions in lowcore that need
+ * to be executable.
+ */
+ if (addr == PAGE_SIZE && (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)))
+ return;
+ WARN_ONCE(1, "s390/mm: Found insecure W+X mapping at address %pS\n",
+ (void *)st->start_address);
+ st->wx_pages += (addr - st->start_address) / PAGE_SIZE;
+#endif /* CONFIG_DEBUG_WX */
+}
+
+static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val)
{
- static const char units[] = "KMGTPE";
int width = sizeof(unsigned long) * 2;
+ static const char units[] = "KMGTPE";
const char *unit = units;
- unsigned int prot, cur;
unsigned long delta;
+ struct pg_state *st;
+ struct seq_file *m;
+ unsigned int prot;
- /*
- * If we have a "break" in the series, we need to flush the state
- * that we have now. "break" is either changing perms, levels or
- * address space marker.
- */
- prot = new_prot;
- cur = st->current_prot;
-
- if (!st->level) {
- /* First entry */
- st->current_prot = new_prot;
+ st = container_of(pt_st, struct pg_state, ptdump);
+ m = st->seq;
+ prot = val & (_PAGE_PROTECT | _PAGE_NOEXEC);
+ if (level == 4 && (val & _PAGE_INVALID))
+ prot = _PAGE_INVALID;
+ /* For pmd_none() & friends val gets passed as zero. */
+ if (level != 4 && !val)
+ prot = _PAGE_INVALID;
+ /* Final flush from generic code. */
+ if (level == -1)
+ addr = max_addr;
+ if (st->level == -1) {
+ pt_dump_seq_printf(m, "---[ %s ]---\n", st->marker->name);
+ st->start_address = addr;
+ st->current_prot = prot;
st->level = level;
- st->marker = address_markers;
- seq_printf(m, "---[ %s ]---\n", st->marker->name);
- } else if (prot != cur || level != st->level ||
- st->current_address >= st->marker[1].start_address) {
- /* Print the actual finished series */
- seq_printf(m, "0x%0*lx-0x%0*lx ",
- width, st->start_address,
- width, st->current_address);
- delta = (st->current_address - st->start_address) >> 10;
+ } else if (prot != st->current_prot || level != st->level ||
+ addr >= st->marker[1].start_address) {
+ note_prot_wx(st, addr);
+ pt_dump_seq_printf(m, "0x%0*lx-0x%0*lx ",
+ width, st->start_address,
+ width, addr);
+ delta = (addr - st->start_address) >> 10;
while (!(delta & 0x3ff) && unit[1]) {
delta >>= 10;
unit++;
}
- seq_printf(m, "%9lu%c ", delta, *unit);
+ pt_dump_seq_printf(m, "%9lu%c ", delta, *unit);
print_prot(m, st->current_prot, st->level);
- while (st->current_address >= st->marker[1].start_address) {
+ while (addr >= st->marker[1].start_address) {
st->marker++;
- seq_printf(m, "---[ %s ]---\n", st->marker->name);
+ pt_dump_seq_printf(m, "---[ %s ]---\n", st->marker->name);
}
- st->start_address = st->current_address;
- st->current_prot = new_prot;
+ st->start_address = addr;
+ st->current_prot = prot;
st->level = level;
}
}
-#ifdef CONFIG_KASAN
-static void note_kasan_early_shadow_page(struct seq_file *m,
- struct pg_state *st)
-{
- unsigned int prot;
-
- prot = pte_val(*kasan_early_shadow_pte) &
- (_PAGE_PROTECT | _PAGE_INVALID | _PAGE_NOEXEC);
- note_page(m, st, prot, 4);
-}
-#endif
-
-/*
- * The actual page table walker functions. In order to keep the
- * implementation of print_prot() short, we only check and pass
- * _PAGE_INVALID and _PAGE_PROTECT flags to note_page() if a region,
- * segment or page table entry is invalid or read-only.
- * After all it's just a hint that the current level being walked
- * contains an invalid or read-only entry.
- */
-static void walk_pte_level(struct seq_file *m, struct pg_state *st,
- pmd_t *pmd, unsigned long addr)
-{
- unsigned int prot;
- pte_t *pte;
- int i;
-
- for (i = 0; i < PTRS_PER_PTE && addr < max_addr; i++) {
- st->current_address = addr;
- pte = pte_offset_kernel(pmd, addr);
- prot = pte_val(*pte) &
- (_PAGE_PROTECT | _PAGE_INVALID | _PAGE_NOEXEC);
- note_page(m, st, prot, 4);
- addr += PAGE_SIZE;
- }
-}
-
-static void walk_pmd_level(struct seq_file *m, struct pg_state *st,
- pud_t *pud, unsigned long addr)
-{
- unsigned int prot;
- pmd_t *pmd;
- int i;
-
-#ifdef CONFIG_KASAN
- if ((pud_val(*pud) & PAGE_MASK) == __pa(kasan_early_shadow_pmd)) {
- note_kasan_early_shadow_page(m, st);
- return;
- }
-#endif
-
- pmd = pmd_offset(pud, addr);
- for (i = 0; i < PTRS_PER_PMD && addr < max_addr; i++, pmd++) {
- st->current_address = addr;
- if (!pmd_none(*pmd)) {
- if (pmd_large(*pmd)) {
- prot = pmd_val(*pmd) &
- (_SEGMENT_ENTRY_PROTECT |
- _SEGMENT_ENTRY_NOEXEC);
- note_page(m, st, prot, 3);
- } else
- walk_pte_level(m, st, pmd, addr);
- } else
- note_page(m, st, _PAGE_INVALID, 3);
- addr += PMD_SIZE;
- }
-}
-
-static void walk_pud_level(struct seq_file *m, struct pg_state *st,
- p4d_t *p4d, unsigned long addr)
+#ifdef CONFIG_DEBUG_WX
+void ptdump_check_wx(void)
{
- unsigned int prot;
- pud_t *pud;
- int i;
+ struct pg_state st = {
+ .ptdump = {
+ .note_page = note_page,
+ .range = (struct ptdump_range[]) {
+ {.start = 0, .end = max_addr},
+ {.start = 0, .end = 0},
+ }
+ },
+ .seq = NULL,
+ .level = -1,
+ .current_prot = 0,
+ .check_wx = true,
+ .wx_pages = 0,
+ .start_address = 0,
+ .marker = (struct addr_marker[]) {
+ { .start_address = 0, .name = NULL},
+ { .start_address = -1, .name = NULL},
+ },
+ };
-#ifdef CONFIG_KASAN
- if ((p4d_val(*p4d) & PAGE_MASK) == __pa(kasan_early_shadow_pud)) {
- note_kasan_early_shadow_page(m, st);
+ if (!MACHINE_HAS_NX)
return;
- }
-#endif
-
- pud = pud_offset(p4d, addr);
- for (i = 0; i < PTRS_PER_PUD && addr < max_addr; i++, pud++) {
- st->current_address = addr;
- if (!pud_none(*pud))
- if (pud_large(*pud)) {
- prot = pud_val(*pud) &
- (_REGION_ENTRY_PROTECT |
- _REGION_ENTRY_NOEXEC);
- note_page(m, st, prot, 2);
- } else
- walk_pmd_level(m, st, pud, addr);
- else
- note_page(m, st, _PAGE_INVALID, 2);
- addr += PUD_SIZE;
- }
+ ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
+ if (st.wx_pages)
+ pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", st.wx_pages);
+ else
+ pr_info("Checked W+X mappings: passed, no %sW+X pages found\n",
+ (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)) ?
+ "unexpected " : "");
}
+#endif /* CONFIG_DEBUG_WX */
-static void walk_p4d_level(struct seq_file *m, struct pg_state *st,
- pgd_t *pgd, unsigned long addr)
+#ifdef CONFIG_PTDUMP_DEBUGFS
+static int ptdump_show(struct seq_file *m, void *v)
{
- p4d_t *p4d;
- int i;
-
-#ifdef CONFIG_KASAN
- if ((pgd_val(*pgd) & PAGE_MASK) == __pa(kasan_early_shadow_p4d)) {
- note_kasan_early_shadow_page(m, st);
- return;
- }
-#endif
+ struct pg_state st = {
+ .ptdump = {
+ .note_page = note_page,
+ .range = (struct ptdump_range[]) {
+ {.start = 0, .end = max_addr},
+ {.start = 0, .end = 0},
+ }
+ },
+ .seq = m,
+ .level = -1,
+ .current_prot = 0,
+ .check_wx = false,
+ .wx_pages = 0,
+ .start_address = 0,
+ .marker = address_markers,
+ };
- p4d = p4d_offset(pgd, addr);
- for (i = 0; i < PTRS_PER_P4D && addr < max_addr; i++, p4d++) {
- st->current_address = addr;
- if (!p4d_none(*p4d))
- walk_pud_level(m, st, p4d, addr);
- else
- note_page(m, st, _PAGE_INVALID, 2);
- addr += P4D_SIZE;
- }
+ get_online_mems();
+ mutex_lock(&cpa_mutex);
+ ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
+ mutex_unlock(&cpa_mutex);
+ put_online_mems();
+ return 0;
}
+DEFINE_SHOW_ATTRIBUTE(ptdump);
+#endif /* CONFIG_PTDUMP_DEBUGFS */
-static void walk_pgd_level(struct seq_file *m)
+/*
+ * Heapsort from lib/sort.c is not a stable sorting algorithm; do a simple
+ * insertion sort to preserve the original order of markers with the same
+ * start address.
+ */
+static void sort_address_markers(void)
{
- unsigned long addr = 0;
- struct pg_state st;
- pgd_t *pgd;
- int i;
+ struct addr_marker tmp;
+ int i, j;
- memset(&st, 0, sizeof(st));
- for (i = 0; i < PTRS_PER_PGD && addr < max_addr; i++) {
- st.current_address = addr;
- pgd = pgd_offset_k(addr);
- if (!pgd_none(*pgd))
- walk_p4d_level(m, &st, pgd, addr);
- else
- note_page(m, &st, _PAGE_INVALID, 1);
- addr += PGDIR_SIZE;
- cond_resched();
+ for (i = 1; i < ARRAY_SIZE(address_markers) - 1; i++) {
+ tmp = address_markers[i];
+ for (j = i - 1; j >= 0 && address_markers[j].start_address > tmp.start_address; j--)
+ address_markers[j + 1] = address_markers[j];
+ address_markers[j + 1] = tmp;
}
- /* Flush out the last page */
- st.current_address = max_addr;
- note_page(m, &st, 0, 0);
-}
-
-static int ptdump_show(struct seq_file *m, void *v)
-{
- walk_pgd_level(m);
- return 0;
-}
-
-static int ptdump_open(struct inode *inode, struct file *filp)
-{
- return single_open(filp, ptdump_show, NULL);
}
-static const struct file_operations ptdump_fops = {
- .open = ptdump_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
static int pt_dump_init(void)
{
+#ifdef CONFIG_KFENCE
+ unsigned long kfence_start = (unsigned long)__kfence_pool;
+#endif
/*
* Figure out the maximum virtual address being accessible with the
* kernel ASCE. We need this to keep the page table walker functions
* from accessing non-existent entries.
*/
- max_addr = (S390_lowcore.kernel_asce & _REGION_ENTRY_TYPE_MASK) >> 2;
+ max_addr = (S390_lowcore.kernel_asce.val & _REGION_ENTRY_TYPE_MASK) >> 2;
max_addr = 1UL << (max_addr * 11 + 31);
+ address_markers[IDENTITY_AFTER_END_NR].start_address = ident_map_size;
+ address_markers[AMODE31_START_NR].start_address = (unsigned long)__samode31;
+ address_markers[AMODE31_END_NR].start_address = (unsigned long)__eamode31;
address_markers[MODULES_NR].start_address = MODULES_VADDR;
+ address_markers[MODULES_END_NR].start_address = MODULES_END;
+ address_markers[ABS_LOWCORE_NR].start_address = __abs_lowcore;
+ address_markers[ABS_LOWCORE_END_NR].start_address = __abs_lowcore + ABS_LOWCORE_MAP_SIZE;
+ address_markers[MEMCPY_REAL_NR].start_address = __memcpy_real_area;
+ address_markers[MEMCPY_REAL_END_NR].start_address = __memcpy_real_area + MEMCPY_REAL_SIZE;
address_markers[VMEMMAP_NR].start_address = (unsigned long) vmemmap;
+ address_markers[VMEMMAP_END_NR].start_address = (unsigned long)vmemmap + vmemmap_size;
address_markers[VMALLOC_NR].start_address = VMALLOC_START;
+ address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
+#ifdef CONFIG_KFENCE
+ address_markers[KFENCE_START_NR].start_address = kfence_start;
+ address_markers[KFENCE_END_NR].start_address = kfence_start + KFENCE_POOL_SIZE;
+#endif
+ sort_address_markers();
+#ifdef CONFIG_PTDUMP_DEBUGFS
debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops);
+#endif /* CONFIG_PTDUMP_DEBUGFS */
return 0;
}
device_initcall(pt_dump_init);
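The sort_address_markers() change above depends on the sort being stable: two markers can share a start address (the end of one region and the start of the next), and their original order decides which banner line is printed first. Below is a minimal, self-contained sketch of that property with made-up marker names, not the kernel's table; it only illustrates why the strict '>' comparison keeps equal keys in their original order.

#include <stdio.h>

struct marker {
	unsigned long start;
	const char *name;
};

/* Same insertion sort as above: strict '>' keeps equal keys in order. */
static void stable_sort(struct marker *m, int n)
{
	struct marker tmp;
	int i, j;

	for (i = 1; i < n; i++) {
		tmp = m[i];
		for (j = i - 1; j >= 0 && m[j].start > tmp.start; j--)
			m[j + 1] = m[j];
		m[j + 1] = tmp;
	}
}

int main(void)
{
	struct marker m[] = {
		{ 0x3000, "Region B End" },	/* deliberately out of order */
		{ 0x1000, "Region A Start" },
		{ 0x2000, "Region A End" },
		{ 0x2000, "Region B Start" },	/* same key as "Region A End" */
	};
	int i;

	stable_sort(m, 4);
	/* "Region A End" still precedes "Region B Start" after sorting. */
	for (i = 0; i < 4; i++)
		printf("%#lx %s\n", m[i].start, m[i].name);
	return 0;
}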
diff --git a/arch/s390/mm/extable.c b/arch/s390/mm/extable.c
new file mode 100644
index 000000000000..0a0738a473af
--- /dev/null
+++ b/arch/s390/mm/extable.c
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bitfield.h>
+#include <linux/extable.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/panic.h>
+#include <asm/asm-extable.h>
+#include <asm/extable.h>
+
+const struct exception_table_entry *s390_search_extables(unsigned long addr)
+{
+ const struct exception_table_entry *fixup;
+ size_t num;
+
+ fixup = search_exception_tables(addr);
+ if (fixup)
+ return fixup;
+ num = __stop_amode31_ex_table - __start_amode31_ex_table;
+ return search_extable(__start_amode31_ex_table, num, addr);
+}
+
+static bool ex_handler_fixup(const struct exception_table_entry *ex, struct pt_regs *regs)
+{
+ regs->psw.addr = extable_fixup(ex);
+ return true;
+}
+
+static bool ex_handler_ua_store(const struct exception_table_entry *ex, struct pt_regs *regs)
+{
+ unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data);
+
+ regs->gprs[reg_err] = -EFAULT;
+ regs->psw.addr = extable_fixup(ex);
+ return true;
+}
+
+static bool ex_handler_ua_load_mem(const struct exception_table_entry *ex, struct pt_regs *regs)
+{
+ unsigned int reg_addr = FIELD_GET(EX_DATA_REG_ADDR, ex->data);
+ unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data);
+ size_t len = FIELD_GET(EX_DATA_LEN, ex->data);
+
+ regs->gprs[reg_err] = -EFAULT;
+ memset((void *)regs->gprs[reg_addr], 0, len);
+ regs->psw.addr = extable_fixup(ex);
+ return true;
+}
+
+static bool ex_handler_ua_load_reg(const struct exception_table_entry *ex,
+ bool pair, struct pt_regs *regs)
+{
+ unsigned int reg_zero = FIELD_GET(EX_DATA_REG_ADDR, ex->data);
+ unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data);
+
+ regs->gprs[reg_err] = -EFAULT;
+ regs->gprs[reg_zero] = 0;
+ if (pair)
+ regs->gprs[reg_zero + 1] = 0;
+ regs->psw.addr = extable_fixup(ex);
+ return true;
+}
+
+static bool ex_handler_zeropad(const struct exception_table_entry *ex, struct pt_regs *regs)
+{
+ unsigned int reg_addr = FIELD_GET(EX_DATA_REG_ADDR, ex->data);
+ unsigned int reg_data = FIELD_GET(EX_DATA_REG_ERR, ex->data);
+ unsigned long data, addr, offset;
+
+ addr = regs->gprs[reg_addr];
+ offset = addr & (sizeof(unsigned long) - 1);
+ addr &= ~(sizeof(unsigned long) - 1);
+ data = *(unsigned long *)addr;
+ data <<= BITS_PER_BYTE * offset;
+ regs->gprs[reg_data] = data;
+ regs->psw.addr = extable_fixup(ex);
+ return true;
+}
+
+bool fixup_exception(struct pt_regs *regs)
+{
+ const struct exception_table_entry *ex;
+
+ ex = s390_search_extables(instruction_pointer(regs));
+ if (!ex)
+ return false;
+ switch (ex->type) {
+ case EX_TYPE_FIXUP:
+ return ex_handler_fixup(ex, regs);
+ case EX_TYPE_BPF:
+ return ex_handler_bpf(ex, regs);
+ case EX_TYPE_UA_STORE:
+ return ex_handler_ua_store(ex, regs);
+ case EX_TYPE_UA_LOAD_MEM:
+ return ex_handler_ua_load_mem(ex, regs);
+ case EX_TYPE_UA_LOAD_REG:
+ return ex_handler_ua_load_reg(ex, false, regs);
+ case EX_TYPE_UA_LOAD_REGPAIR:
+ return ex_handler_ua_load_reg(ex, true, regs);
+ case EX_TYPE_ZEROPAD:
+ return ex_handler_zeropad(ex, regs);
+ }
+ panic("invalid exception table entry");
+}
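The new ex_handler_zeropad() above re-reads the doubleword at the rounded-down address and shifts it left by the byte offset, so the result is what an unaligned big-endian load would have returned if the inaccessible tail read as zero. A small, self-contained sketch of just that shift arithmetic, using made-up address and data values rather than real fault state:

#include <stdio.h>

int main(void)
{
	/* Made-up values: a fault at offset 5 within an aligned doubleword. */
	unsigned long addr = 0x1005;
	unsigned long aligned_val = 0x1122334455667788UL; /* dword at 0x1000 */
	unsigned long offset = addr & (sizeof(unsigned long) - 1);	/* 5 */
	unsigned long data = aligned_val << (8 * offset);

	/*
	 * Prints 0x6677880000000000: on big-endian s390 these are the three
	 * bytes at 0x1005..0x1007, now left-justified with zero padding.
	 */
	printf("%#018lx\n", data);
	return 0;
}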
diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c
index fd0dae9d10f4..e41869f5cc95 100644
--- a/arch/s390/mm/extmem.c
+++ b/arch/s390/mm/extmem.c
@@ -20,9 +20,9 @@
#include <linux/ctype.h>
#include <linux/ioport.h>
#include <linux/refcount.h>
+#include <linux/pgtable.h>
#include <asm/diag.h>
#include <asm/page.h>
-#include <asm/pgtable.h>
#include <asm/ebcdic.h>
#include <asm/errno.h>
#include <asm/extmem.h>
@@ -289,15 +289,17 @@ segment_overlaps_others (struct dcss_segment *seg)
/*
* real segment loading function, called from segment_load
+ * Must return either an error code < 0, or the segment type code >= 0
*/
static int
__segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long *end)
{
unsigned long start_addr, end_addr, dummy;
struct dcss_segment *seg;
- int rc, diag_cc;
+ int rc, diag_cc, segtype;
start_addr = end_addr = 0;
+ segtype = -1;
seg = kmalloc(sizeof(*seg), GFP_KERNEL | GFP_DMA);
if (seg == NULL) {
rc = -ENOMEM;
@@ -313,15 +315,10 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long
goto out_free;
}
- rc = vmem_add_mapping(seg->start_addr, seg->end - seg->start_addr + 1);
-
- if (rc)
- goto out_free;
-
seg->res = kzalloc(sizeof(struct resource), GFP_KERNEL);
if (seg->res == NULL) {
rc = -ENOMEM;
- goto out_shared;
+ goto out_free;
}
seg->res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
seg->res->start = seg->start_addr;
@@ -331,16 +328,21 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long
seg->res_name[8] = '\0';
strlcat(seg->res_name, " (DCSS)", sizeof(seg->res_name));
seg->res->name = seg->res_name;
- rc = seg->vm_segtype;
- if (rc == SEG_TYPE_SC ||
- ((rc == SEG_TYPE_SR || rc == SEG_TYPE_ER) && !do_nonshared))
+ segtype = seg->vm_segtype;
+ if (segtype == SEG_TYPE_SC ||
+ ((segtype == SEG_TYPE_SR || segtype == SEG_TYPE_ER) && !do_nonshared))
seg->res->flags |= IORESOURCE_READONLY;
+
+ /* Check for overlapping resources before adding the mapping. */
if (request_resource(&iomem_resource, seg->res)) {
rc = -EBUSY;
- kfree(seg->res);
- goto out_shared;
+ goto out_free_resource;
}
+ rc = vmem_add_mapping(seg->start_addr, seg->end - seg->start_addr + 1);
+ if (rc)
+ goto out_resource;
+
if (do_nonshared)
diag_cc = dcss_diag(&loadnsr_scode, seg->dcss_name,
&start_addr, &end_addr);
@@ -351,14 +353,14 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long
dcss_diag(&purgeseg_scode, seg->dcss_name,
&dummy, &dummy);
rc = diag_cc;
- goto out_resource;
+ goto out_mapping;
}
if (diag_cc > 1) {
pr_warn("Loading DCSS %s failed with rc=%ld\n", name, end_addr);
rc = dcss_diag_translate_rc(end_addr);
dcss_diag(&purgeseg_scode, seg->dcss_name,
&dummy, &dummy);
- goto out_resource;
+ goto out_mapping;
}
seg->start_addr = start_addr;
seg->end = end_addr;
@@ -377,15 +379,16 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long
(void*) seg->end, segtype_string[seg->vm_segtype]);
}
goto out;
+ out_mapping:
+ vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1);
out_resource:
release_resource(seg->res);
+ out_free_resource:
kfree(seg->res);
- out_shared:
- vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1);
out_free:
kfree(seg);
out:
- return rc;
+ return rc < 0 ? rc : segtype;
}
/*
@@ -400,8 +403,7 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long
* -EIO : could not perform query or load diagnose
* -ENOENT : no such segment
* -EOPNOTSUPP: multi-part segment cannot be used with linux
- * -ENOSPC : segment cannot be used (overlaps with storage)
- * -EBUSY : segment can temporarily not be used (overlaps with dcss)
+ * -EBUSY : segment cannot be used (overlaps with dcss or storage)
* -ERANGE : segment cannot be used (exceeds kernel mapping range)
* -EPERM : segment is currently loaded with incompatible permissions
* -ENOMEM : out of memory
@@ -626,10 +628,6 @@ void segment_warning(int rc, char *seg_name)
pr_err("DCSS %s has multiple page ranges and cannot be "
"loaded or queried\n", seg_name);
break;
- case -ENOSPC:
- pr_err("DCSS %s overlaps with used storage and cannot "
- "be loaded\n", seg_name);
- break;
case -EBUSY:
pr_err("%s needs used memory resources and cannot be "
"loaded or queried\n", seg_name);
@@ -642,10 +640,13 @@ void segment_warning(int rc, char *seg_name)
pr_err("There is not enough memory to load or query "
"DCSS %s\n", seg_name);
break;
- case -ERANGE:
- pr_err("DCSS %s exceeds the kernel mapping range (%lu) "
- "and cannot be loaded\n", seg_name, VMEM_MAX_PHYS);
+ case -ERANGE: {
+ struct range mhp_range = arch_get_mappable_range();
+
+ pr_err("DCSS %s exceeds the kernel mapping range (%llu) "
+ "and cannot be loaded\n", seg_name, mhp_range.end + 1);
break;
+ }
default:
break;
}
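The reworked __segment_load() above acquires the resource before the vmem mapping and unwinds through one label per step (out_mapping, out_resource, out_free_resource, out_free), so each error path releases exactly what was set up so far. A generic, self-contained sketch of that cascading-unwind pattern, using hypothetical acquire/release helpers rather than the DCSS code:

#include <stdio.h>

/* Hypothetical acquire/release stubs for illustration only. */
static int acquire_a(void) { return 0; }
static int acquire_b(void) { return 0; }
static int acquire_c(void) { return -1; }	/* pretend this step fails */
static void release_a(void) { puts("release a"); }
static void release_b(void) { puts("release b"); }

static int setup_example(void)
{
	int rc;

	rc = acquire_a();
	if (rc)
		goto out;
	rc = acquire_b();
	if (rc)
		goto out_release_a;
	rc = acquire_c();
	if (rc)
		goto out_release_b;
	return 0;
out_release_b:
	release_b();	/* undo step b, then fall through to undo step a */
out_release_a:
	release_a();
out:
	return rc;
}

int main(void)
{
	printf("setup_example() = %d\n", setup_example());
	return 0;
}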
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 7b0bb475c166..ac4c78546d97 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -3,17 +3,19 @@
* S390 version
* Copyright IBM Corp. 1999
* Author(s): Hartmut Penner (hp@de.ibm.com)
- * Ulrich Weigand (uweigand@de.ibm.com)
+ * Ulrich Weigand (uweigand@de.ibm.com)
*
* Derived from "arch/i386/mm/fault.c"
* Copyright (C) 1995 Linus Torvalds
*/
#include <linux/kernel_stat.h>
+#include <linux/mmu_context.h>
#include <linux/perf_event.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
+#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
@@ -31,38 +33,30 @@
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/hugetlb.h>
+#include <linux/kfence.h>
+#include <asm/asm-extable.h>
#include <asm/asm-offsets.h>
+#include <asm/ptrace.h>
+#include <asm/fault.h>
#include <asm/diag.h>
-#include <asm/pgtable.h>
#include <asm/gmap.h>
#include <asm/irq.h>
-#include <asm/mmu_context.h>
#include <asm/facility.h>
+#include <asm/uv.h>
#include "../kernel/entry.h"
-#define __FAIL_ADDR_MASK -4096L
-#define __SUBCODE_MASK 0x0600
-#define __PF_RES_FIELD 0x8000000000000000ULL
-
-#define VM_FAULT_BADCONTEXT 0x010000
-#define VM_FAULT_BADMAP 0x020000
-#define VM_FAULT_BADACCESS 0x040000
-#define VM_FAULT_SIGNAL 0x080000
-#define VM_FAULT_PFAULT 0x100000
-
enum fault_type {
KERNEL_FAULT,
USER_FAULT,
- VDSO_FAULT,
GMAP_FAULT,
};
-static unsigned long store_indication __read_mostly;
+static DEFINE_STATIC_KEY_FALSE(have_store_indication);
static int __init fault_init(void)
{
if (test_facility(75))
- store_indication = 0xc00;
+ static_branch_enable(&have_store_indication);
return 0;
}
early_initcall(fault_init);
@@ -72,88 +66,88 @@ early_initcall(fault_init);
*/
static enum fault_type get_fault_type(struct pt_regs *regs)
{
- unsigned long trans_exc_code;
+ union teid teid = { .val = regs->int_parm_long };
- trans_exc_code = regs->int_parm_long & 3;
- if (likely(trans_exc_code == 0)) {
- /* primary space exception */
- if (IS_ENABLED(CONFIG_PGSTE) &&
- test_pt_regs_flag(regs, PIF_GUEST_FAULT))
- return GMAP_FAULT;
- if (current->thread.mm_segment == USER_DS)
+ if (likely(teid.as == PSW_BITS_AS_PRIMARY)) {
+ if (user_mode(regs))
return USER_FAULT;
- return KERNEL_FAULT;
- }
- if (trans_exc_code == 2) {
- /* secondary space exception */
- if (current->thread.mm_segment & 1) {
- if (current->thread.mm_segment == USER_DS_SACF)
- return USER_FAULT;
+ if (!IS_ENABLED(CONFIG_PGSTE))
return KERNEL_FAULT;
- }
- return VDSO_FAULT;
+ if (test_pt_regs_flag(regs, PIF_GUEST_FAULT))
+ return GMAP_FAULT;
+ return KERNEL_FAULT;
}
- if (trans_exc_code == 1) {
- /* access register mode, not used in the kernel */
+ if (teid.as == PSW_BITS_AS_SECONDARY)
return USER_FAULT;
- }
- /* home space exception -> access via kernel ASCE */
+ /* Access register mode, not used in the kernel */
+ if (teid.as == PSW_BITS_AS_ACCREG)
+ return USER_FAULT;
+ /* Home space -> access via kernel ASCE */
return KERNEL_FAULT;
}
-static int bad_address(void *p)
+static unsigned long get_fault_address(struct pt_regs *regs)
{
- unsigned long dummy;
+ union teid teid = { .val = regs->int_parm_long };
- return probe_kernel_address((unsigned long *)p, dummy);
+ return teid.addr * PAGE_SIZE;
+}
+
+static __always_inline bool fault_is_write(struct pt_regs *regs)
+{
+ union teid teid = { .val = regs->int_parm_long };
+
+ if (static_branch_likely(&have_store_indication))
+ return teid.fsi == TEID_FSI_STORE;
+ return false;
}
static void dump_pagetable(unsigned long asce, unsigned long address)
{
- unsigned long *table = __va(asce & _ASCE_ORIGIN);
+ unsigned long entry, *table = __va(asce & _ASCE_ORIGIN);
pr_alert("AS:%016lx ", asce);
switch (asce & _ASCE_TYPE_MASK) {
case _ASCE_TYPE_REGION1:
table += (address & _REGION1_INDEX) >> _REGION1_SHIFT;
- if (bad_address(table))
+ if (get_kernel_nofault(entry, table))
goto bad;
- pr_cont("R1:%016lx ", *table);
- if (*table & _REGION_ENTRY_INVALID)
+ pr_cont("R1:%016lx ", entry);
+ if (entry & _REGION_ENTRY_INVALID)
goto out;
- table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
- /* fallthrough */
+ table = __va(entry & _REGION_ENTRY_ORIGIN);
+ fallthrough;
case _ASCE_TYPE_REGION2:
table += (address & _REGION2_INDEX) >> _REGION2_SHIFT;
- if (bad_address(table))
+ if (get_kernel_nofault(entry, table))
goto bad;
- pr_cont("R2:%016lx ", *table);
- if (*table & _REGION_ENTRY_INVALID)
+ pr_cont("R2:%016lx ", entry);
+ if (entry & _REGION_ENTRY_INVALID)
goto out;
- table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
- /* fallthrough */
+ table = __va(entry & _REGION_ENTRY_ORIGIN);
+ fallthrough;
case _ASCE_TYPE_REGION3:
table += (address & _REGION3_INDEX) >> _REGION3_SHIFT;
- if (bad_address(table))
+ if (get_kernel_nofault(entry, table))
goto bad;
- pr_cont("R3:%016lx ", *table);
- if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE))
+ pr_cont("R3:%016lx ", entry);
+ if (entry & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE))
goto out;
- table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
- /* fallthrough */
+ table = __va(entry & _REGION_ENTRY_ORIGIN);
+ fallthrough;
case _ASCE_TYPE_SEGMENT:
table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
- if (bad_address(table))
+ if (get_kernel_nofault(entry, table))
goto bad;
- pr_cont("S:%016lx ", *table);
- if (*table & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE))
+ pr_cont("S:%016lx ", entry);
+ if (entry & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE))
goto out;
- table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
+ table = __va(entry & _SEGMENT_ENTRY_ORIGIN);
}
table += (address & _PAGE_INDEX) >> _PAGE_SHIFT;
- if (bad_address(table))
+ if (get_kernel_nofault(entry, table))
goto bad;
- pr_cont("P:%016lx ", *table);
+ pr_cont("P:%016lx ", entry);
out:
pr_cont("\n");
return;
@@ -163,212 +157,113 @@ bad:
static void dump_fault_info(struct pt_regs *regs)
{
+ union teid teid = { .val = regs->int_parm_long };
unsigned long asce;
pr_alert("Failing address: %016lx TEID: %016lx\n",
- regs->int_parm_long & __FAIL_ADDR_MASK, regs->int_parm_long);
+ get_fault_address(regs), teid.val);
pr_alert("Fault in ");
- switch (regs->int_parm_long & 3) {
- case 3:
+ switch (teid.as) {
+ case PSW_BITS_AS_HOME:
pr_cont("home space ");
break;
- case 2:
+ case PSW_BITS_AS_SECONDARY:
pr_cont("secondary space ");
break;
- case 1:
+ case PSW_BITS_AS_ACCREG:
pr_cont("access register ");
break;
- case 0:
+ case PSW_BITS_AS_PRIMARY:
pr_cont("primary space ");
break;
}
pr_cont("mode while using ");
switch (get_fault_type(regs)) {
case USER_FAULT:
- asce = S390_lowcore.user_asce;
+ asce = S390_lowcore.user_asce.val;
pr_cont("user ");
break;
- case VDSO_FAULT:
- asce = S390_lowcore.vdso_asce;
- pr_cont("vdso ");
- break;
case GMAP_FAULT:
- asce = ((struct gmap *) S390_lowcore.gmap)->asce;
+ asce = ((struct gmap *)S390_lowcore.gmap)->asce;
pr_cont("gmap ");
break;
case KERNEL_FAULT:
- asce = S390_lowcore.kernel_asce;
+ asce = S390_lowcore.kernel_asce.val;
pr_cont("kernel ");
break;
default:
unreachable();
}
pr_cont("ASCE.\n");
- dump_pagetable(asce, regs->int_parm_long & __FAIL_ADDR_MASK);
+ dump_pagetable(asce, get_fault_address(regs));
}
int show_unhandled_signals = 1;
void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault)
{
+ static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
+
if ((task_pid_nr(current) > 1) && !show_unhandled_signals)
return;
if (!unhandled_signal(current, signr))
return;
- if (!printk_ratelimit())
+ if (!__ratelimit(&rs))
return;
- printk(KERN_ALERT "User process fault: interruption code %04x ilc:%d ",
- regs->int_code & 0xffff, regs->int_code >> 17);
+ pr_alert("User process fault: interruption code %04x ilc:%d ",
+ regs->int_code & 0xffff, regs->int_code >> 17);
print_vma_addr(KERN_CONT "in ", regs->psw.addr);
- printk(KERN_CONT "\n");
+ pr_cont("\n");
if (is_mm_fault)
dump_fault_info(regs);
show_regs(regs);
}
-/*
- * Send SIGSEGV to task. This is an external routine
- * to keep the stack usage of do_page_fault small.
- */
-static noinline void do_sigsegv(struct pt_regs *regs, int si_code)
+static void do_sigsegv(struct pt_regs *regs, int si_code)
{
report_user_fault(regs, SIGSEGV, 1);
- force_sig_fault(SIGSEGV, si_code,
- (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
+ force_sig_fault(SIGSEGV, si_code, (void __user *)get_fault_address(regs));
}
-const struct exception_table_entry *s390_search_extables(unsigned long addr)
+static void handle_fault_error_nolock(struct pt_regs *regs, int si_code)
{
- const struct exception_table_entry *fixup;
-
- fixup = search_extable(__start_dma_ex_table,
- __stop_dma_ex_table - __start_dma_ex_table,
- addr);
- if (!fixup)
- fixup = search_exception_tables(addr);
- return fixup;
-}
-
-static noinline void do_no_context(struct pt_regs *regs)
-{
- const struct exception_table_entry *fixup;
+ enum fault_type fault_type;
+ unsigned long address;
+ bool is_write;
- /* Are we prepared to handle this kernel fault? */
- fixup = s390_search_extables(regs->psw.addr);
- if (fixup) {
- regs->psw.addr = extable_fixup(fixup);
+ if (user_mode(regs)) {
+ if (WARN_ON_ONCE(!si_code))
+ si_code = SEGV_MAPERR;
+ return do_sigsegv(regs, si_code);
+ }
+ if (fixup_exception(regs))
return;
+ fault_type = get_fault_type(regs);
+ if (fault_type == KERNEL_FAULT) {
+ address = get_fault_address(regs);
+ is_write = fault_is_write(regs);
+ if (kfence_handle_page_fault(address, is_write, regs))
+ return;
}
-
- /*
- * Oops. The kernel tried to access some bad page. We'll have to
- * terminate things with extreme prejudice.
- */
- if (get_fault_type(regs) == KERNEL_FAULT)
- printk(KERN_ALERT "Unable to handle kernel pointer dereference"
- " in virtual kernel address space\n");
+ if (fault_type == KERNEL_FAULT)
+ pr_alert("Unable to handle kernel pointer dereference in virtual kernel address space\n");
else
- printk(KERN_ALERT "Unable to handle kernel paging request"
- " in virtual user address space\n");
+ pr_alert("Unable to handle kernel paging request in virtual user address space\n");
dump_fault_info(regs);
die(regs, "Oops");
- do_exit(SIGKILL);
}
-static noinline void do_low_address(struct pt_regs *regs)
+static void handle_fault_error(struct pt_regs *regs, int si_code)
{
- /* Low-address protection hit in kernel mode means
- NULL pointer write access in kernel mode. */
- if (regs->psw.mask & PSW_MASK_PSTATE) {
- /* Low-address protection hit in user mode 'cannot happen'. */
- die (regs, "Low-address protection");
- do_exit(SIGKILL);
- }
+ struct mm_struct *mm = current->mm;
- do_no_context(regs);
-}
-
-static noinline void do_sigbus(struct pt_regs *regs)
-{
- /*
- * Send a sigbus, regardless of whether we were in kernel
- * or user mode.
- */
- force_sig_fault(SIGBUS, BUS_ADRERR,
- (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
+ mmap_read_unlock(mm);
+ handle_fault_error_nolock(regs, si_code);
}
-static noinline int signal_return(struct pt_regs *regs)
+static void do_sigbus(struct pt_regs *regs)
{
- u16 instruction;
- int rc;
-
- rc = __get_user(instruction, (u16 __user *) regs->psw.addr);
- if (rc)
- return rc;
- if (instruction == 0x0a77) {
- set_pt_regs_flag(regs, PIF_SYSCALL);
- regs->int_code = 0x00040077;
- return 0;
- } else if (instruction == 0x0aad) {
- set_pt_regs_flag(regs, PIF_SYSCALL);
- regs->int_code = 0x000400ad;
- return 0;
- }
- return -EACCES;
-}
-
-static noinline void do_fault_error(struct pt_regs *regs, int access,
- vm_fault_t fault)
-{
- int si_code;
-
- switch (fault) {
- case VM_FAULT_BADACCESS:
- if (access == VM_EXEC && signal_return(regs) == 0)
- break;
- /* fallthrough */
- case VM_FAULT_BADMAP:
- /* Bad memory access. Check if it is kernel or user space. */
- if (user_mode(regs)) {
- /* User mode accesses just cause a SIGSEGV */
- si_code = (fault == VM_FAULT_BADMAP) ?
- SEGV_MAPERR : SEGV_ACCERR;
- do_sigsegv(regs, si_code);
- break;
- }
- /* fallthrough */
- case VM_FAULT_BADCONTEXT:
- /* fallthrough */
- case VM_FAULT_PFAULT:
- do_no_context(regs);
- break;
- case VM_FAULT_SIGNAL:
- if (!user_mode(regs))
- do_no_context(regs);
- break;
- default: /* fault & VM_FAULT_ERROR */
- if (fault & VM_FAULT_OOM) {
- if (!user_mode(regs))
- do_no_context(regs);
- else
- pagefault_out_of_memory();
- } else if (fault & VM_FAULT_SIGSEGV) {
- /* Kernel mode? Handle exceptions or die */
- if (!user_mode(regs))
- do_no_context(regs);
- else
- do_sigsegv(regs, SEGV_MAPERR);
- } else if (fault & VM_FAULT_SIGBUS) {
- /* Kernel mode? Handle exceptions or die */
- if (!user_mode(regs))
- do_no_context(regs);
- else
- do_sigbus(regs);
- } else
- BUG();
- break;
- }
+ force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)get_fault_address(regs));
}
/*
@@ -377,178 +272,178 @@ static noinline void do_fault_error(struct pt_regs *regs, int access,
* routines.
*
* interruption code (int_code):
- * 04 Protection -> Write-Protection (suprression)
- * 10 Segment translation -> Not present (nullification)
- * 11 Page translation -> Not present (nullification)
- * 3b Region third trans. -> Not present (nullification)
+ * 04 Protection -> Write-Protection (suppression)
+ * 10 Segment translation -> Not present (nullification)
+ * 11 Page translation -> Not present (nullification)
+ * 3b Region third trans. -> Not present (nullification)
*/
-static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
+static void do_exception(struct pt_regs *regs, int access)
{
- struct gmap *gmap;
- struct task_struct *tsk;
- struct mm_struct *mm;
struct vm_area_struct *vma;
- enum fault_type type;
- unsigned long trans_exc_code;
unsigned long address;
+ struct mm_struct *mm;
+ enum fault_type type;
unsigned int flags;
+ struct gmap *gmap;
vm_fault_t fault;
+ bool is_write;
- tsk = current;
/*
* The instruction that caused the program check has
* been nullified. Don't signal single step via SIGTRAP.
*/
- clear_pt_regs_flag(regs, PIF_PER_TRAP);
-
+ clear_thread_flag(TIF_PER_TRAP);
if (kprobe_page_fault(regs, 14))
- return 0;
-
- mm = tsk->mm;
- trans_exc_code = regs->int_parm_long;
-
- /*
- * Verify that the fault happened in user space, that
- * we are not in an interrupt and that there is a
- * user context.
- */
- fault = VM_FAULT_BADCONTEXT;
+ return;
+ mm = current->mm;
+ address = get_fault_address(regs);
+ is_write = fault_is_write(regs);
type = get_fault_type(regs);
switch (type) {
case KERNEL_FAULT:
- goto out;
- case VDSO_FAULT:
- fault = VM_FAULT_BADMAP;
- goto out;
+ return handle_fault_error_nolock(regs, 0);
case USER_FAULT:
case GMAP_FAULT:
if (faulthandler_disabled() || !mm)
- goto out;
+ return handle_fault_error_nolock(regs, 0);
break;
}
-
- address = trans_exc_code & __FAIL_ADDR_MASK;
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
- flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+ flags = FAULT_FLAG_DEFAULT;
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
- if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400)
+ if (is_write)
+ access = VM_WRITE;
+ if (access == VM_WRITE)
flags |= FAULT_FLAG_WRITE;
- down_read(&mm->mmap_sem);
+ if (!(flags & FAULT_FLAG_USER))
+ goto lock_mmap;
+ vma = lock_vma_under_rcu(mm, address);
+ if (!vma)
+ goto lock_mmap;
+ if (!(vma->vm_flags & access)) {
+ vma_end_read(vma);
+ goto lock_mmap;
+ }
+ fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
+ if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
+ vma_end_read(vma);
+ if (!(fault & VM_FAULT_RETRY)) {
+ count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+ if (unlikely(fault & VM_FAULT_ERROR))
+ goto error;
+ return;
+ }
+ count_vm_vma_lock_event(VMA_LOCK_RETRY);
+ if (fault & VM_FAULT_MAJOR)
+ flags |= FAULT_FLAG_TRIED;
+ /* Quick path to respond to signals */
+ if (fault_signal_pending(fault, regs)) {
+ if (!user_mode(regs))
+ handle_fault_error_nolock(regs, 0);
+ return;
+ }
+lock_mmap:
+ mmap_read_lock(mm);
gmap = NULL;
if (IS_ENABLED(CONFIG_PGSTE) && type == GMAP_FAULT) {
- gmap = (struct gmap *) S390_lowcore.gmap;
+ gmap = (struct gmap *)S390_lowcore.gmap;
current->thread.gmap_addr = address;
current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE);
current->thread.gmap_int_code = regs->int_code & 0xffff;
address = __gmap_translate(gmap, address);
- if (address == -EFAULT) {
- fault = VM_FAULT_BADMAP;
- goto out_up;
- }
+ if (address == -EFAULT)
+ return handle_fault_error(regs, SEGV_MAPERR);
if (gmap->pfault_enabled)
flags |= FAULT_FLAG_RETRY_NOWAIT;
}
-
retry:
- fault = VM_FAULT_BADMAP;
vma = find_vma(mm, address);
if (!vma)
- goto out_up;
-
+ return handle_fault_error(regs, SEGV_MAPERR);
if (unlikely(vma->vm_start > address)) {
if (!(vma->vm_flags & VM_GROWSDOWN))
- goto out_up;
- if (expand_stack(vma, address))
- goto out_up;
+ return handle_fault_error(regs, SEGV_MAPERR);
+ vma = expand_stack(mm, address);
+ if (!vma)
+ return handle_fault_error_nolock(regs, SEGV_MAPERR);
}
-
- /*
- * Ok, we have a good vm_area for this memory access, so
- * we can handle it..
- */
- fault = VM_FAULT_BADACCESS;
if (unlikely(!(vma->vm_flags & access)))
- goto out_up;
-
- if (is_vm_hugetlb_page(vma))
- address &= HPAGE_MASK;
- /*
- * If for any reason at all we couldn't handle the fault,
- * make sure we exit gracefully rather than endlessly redo
- * the fault.
- */
- fault = handle_mm_fault(vma, address, flags);
- /* No reason to continue if interrupted by SIGKILL. */
- if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) {
- fault = VM_FAULT_SIGNAL;
+ return handle_fault_error(regs, SEGV_ACCERR);
+ fault = handle_mm_fault(vma, address, flags, regs);
+ if (fault_signal_pending(fault, regs)) {
if (flags & FAULT_FLAG_RETRY_NOWAIT)
- goto out_up;
- goto out;
+ mmap_read_unlock(mm);
+ if (!user_mode(regs))
+ handle_fault_error_nolock(regs, 0);
+ return;
}
- if (unlikely(fault & VM_FAULT_ERROR))
- goto out_up;
-
- /*
- * Major/minor page fault accounting is only done on the
- * initial attempt. If we go through a retry, it is extremely
- * likely that the page will be found in page cache at that point.
- */
- if (flags & FAULT_FLAG_ALLOW_RETRY) {
- if (fault & VM_FAULT_MAJOR) {
- tsk->maj_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
- regs, address);
- } else {
- tsk->min_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
- regs, address);
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED) {
+ if (gmap) {
+ mmap_read_lock(mm);
+ goto gmap;
}
- if (fault & VM_FAULT_RETRY) {
- if (IS_ENABLED(CONFIG_PGSTE) && gmap &&
- (flags & FAULT_FLAG_RETRY_NOWAIT)) {
- /* FAULT_FLAG_RETRY_NOWAIT has been set,
- * mmap_sem has not been released */
- current->thread.gmap_pfault = 1;
- fault = VM_FAULT_PFAULT;
- goto out_up;
- }
- /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
- * of starvation. */
- flags &= ~(FAULT_FLAG_ALLOW_RETRY |
- FAULT_FLAG_RETRY_NOWAIT);
- flags |= FAULT_FLAG_TRIED;
- down_read(&mm->mmap_sem);
- goto retry;
+ return;
+ }
+ if (unlikely(fault & VM_FAULT_ERROR)) {
+ mmap_read_unlock(mm);
+ goto error;
+ }
+ if (fault & VM_FAULT_RETRY) {
+ if (IS_ENABLED(CONFIG_PGSTE) && gmap && (flags & FAULT_FLAG_RETRY_NOWAIT)) {
+ /*
+ * FAULT_FLAG_RETRY_NOWAIT has been set,
+ * mmap_lock has not been released
+ */
+ current->thread.gmap_pfault = 1;
+ return handle_fault_error(regs, 0);
}
+ flags &= ~FAULT_FLAG_RETRY_NOWAIT;
+ flags |= FAULT_FLAG_TRIED;
+ mmap_read_lock(mm);
+ goto retry;
}
+gmap:
if (IS_ENABLED(CONFIG_PGSTE) && gmap) {
address = __gmap_link(gmap, current->thread.gmap_addr,
address);
- if (address == -EFAULT) {
- fault = VM_FAULT_BADMAP;
- goto out_up;
- }
+ if (address == -EFAULT)
+ return handle_fault_error(regs, SEGV_MAPERR);
if (address == -ENOMEM) {
fault = VM_FAULT_OOM;
- goto out_up;
+ mmap_read_unlock(mm);
+ goto error;
}
}
- fault = 0;
-out_up:
- up_read(&mm->mmap_sem);
-out:
- return fault;
+ mmap_read_unlock(mm);
+ return;
+error:
+ if (fault & VM_FAULT_OOM) {
+ if (!user_mode(regs))
+ handle_fault_error_nolock(regs, 0);
+ else
+ pagefault_out_of_memory();
+ } else if (fault & VM_FAULT_SIGSEGV) {
+ if (!user_mode(regs))
+ handle_fault_error_nolock(regs, 0);
+ else
+ do_sigsegv(regs, SEGV_MAPERR);
+ } else if (fault & VM_FAULT_SIGBUS) {
+ if (!user_mode(regs))
+ handle_fault_error_nolock(regs, 0);
+ else
+ do_sigbus(regs);
+ } else {
+ BUG();
+ }
}
void do_protection_exception(struct pt_regs *regs)
{
- unsigned long trans_exc_code;
- int access;
- vm_fault_t fault;
+ union teid teid = { .val = regs->int_parm_long };
- trans_exc_code = regs->int_parm_long;
/*
* Protection exceptions are suppressing, decrement psw address.
* The exception to this rule are aborted transactions, for these
@@ -561,258 +456,140 @@ void do_protection_exception(struct pt_regs *regs)
* as a special case because the translation exception code
* field is not guaranteed to contain valid data in this case.
*/
- if (unlikely(!(trans_exc_code & 4))) {
- do_low_address(regs);
- return;
+ if (unlikely(!teid.b61)) {
+ if (user_mode(regs)) {
+ /* Low-address protection in user mode: cannot happen */
+ die(regs, "Low-address protection");
+ }
+ /*
+ * Low-address protection in kernel mode means
+ * NULL pointer write access in kernel mode.
+ */
+ return handle_fault_error_nolock(regs, 0);
}
- if (unlikely(MACHINE_HAS_NX && (trans_exc_code & 0x80))) {
- regs->int_parm_long = (trans_exc_code & ~PAGE_MASK) |
- (regs->psw.addr & PAGE_MASK);
- access = VM_EXEC;
- fault = VM_FAULT_BADACCESS;
- } else {
- access = VM_WRITE;
- fault = do_exception(regs, access);
+ if (unlikely(MACHINE_HAS_NX && teid.b56)) {
+ regs->int_parm_long = (teid.addr * PAGE_SIZE) | (regs->psw.addr & PAGE_MASK);
+ return handle_fault_error_nolock(regs, SEGV_ACCERR);
}
- if (unlikely(fault))
- do_fault_error(regs, access, fault);
+ do_exception(regs, VM_WRITE);
}
NOKPROBE_SYMBOL(do_protection_exception);
void do_dat_exception(struct pt_regs *regs)
{
- int access;
- vm_fault_t fault;
-
- access = VM_READ | VM_EXEC | VM_WRITE;
- fault = do_exception(regs, access);
- if (unlikely(fault))
- do_fault_error(regs, access, fault);
+ do_exception(regs, VM_ACCESS_FLAGS);
}
NOKPROBE_SYMBOL(do_dat_exception);
-#ifdef CONFIG_PFAULT
-/*
- * 'pfault' pseudo page faults routines.
- */
-static int pfault_disable;
-
-static int __init nopfault(char *str)
-{
- pfault_disable = 1;
- return 1;
-}
-
-__setup("nopfault", nopfault);
-
-struct pfault_refbk {
- u16 refdiagc;
- u16 reffcode;
- u16 refdwlen;
- u16 refversn;
- u64 refgaddr;
- u64 refselmk;
- u64 refcmpmk;
- u64 reserved;
-} __attribute__ ((packed, aligned(8)));
-
-static struct pfault_refbk pfault_init_refbk = {
- .refdiagc = 0x258,
- .reffcode = 0,
- .refdwlen = 5,
- .refversn = 2,
- .refgaddr = __LC_LPP,
- .refselmk = 1ULL << 48,
- .refcmpmk = 1ULL << 48,
- .reserved = __PF_RES_FIELD
-};
-
-int pfault_init(void)
-{
- int rc;
-
- if (pfault_disable)
- return -1;
- diag_stat_inc(DIAG_STAT_X258);
- asm volatile(
- " diag %1,%0,0x258\n"
- "0: j 2f\n"
- "1: la %0,8\n"
- "2:\n"
- EX_TABLE(0b,1b)
- : "=d" (rc)
- : "a" (&pfault_init_refbk), "m" (pfault_init_refbk) : "cc");
- return rc;
-}
-
-static struct pfault_refbk pfault_fini_refbk = {
- .refdiagc = 0x258,
- .reffcode = 1,
- .refdwlen = 5,
- .refversn = 2,
-};
+#if IS_ENABLED(CONFIG_PGSTE)
-void pfault_fini(void)
+void do_secure_storage_access(struct pt_regs *regs)
{
-
- if (pfault_disable)
- return;
- diag_stat_inc(DIAG_STAT_X258);
- asm volatile(
- " diag %0,0,0x258\n"
- "0: nopr %%r7\n"
- EX_TABLE(0b,0b)
- : : "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) : "cc");
-}
-
-static DEFINE_SPINLOCK(pfault_lock);
-static LIST_HEAD(pfault_list);
-
-#define PF_COMPLETE 0x0080
-
-/*
- * The mechanism of our pfault code: if Linux is running as guest, runs a user
- * space process and the user space process accesses a page that the host has
- * paged out we get a pfault interrupt.
- *
- * This allows us, within the guest, to schedule a different process. Without
- * this mechanism the host would have to suspend the whole virtual cpu until
- * the page has been paged in.
- *
- * So when we get such an interrupt then we set the state of the current task
- * to uninterruptible and also set the need_resched flag. Both happens within
- * interrupt context(!). If we later on want to return to user space we
- * recognize the need_resched flag and then call schedule(). It's not very
- * obvious how this works...
- *
- * Of course we have a lot of additional fun with the completion interrupt (->
- * host signals that a page of a process has been paged in and the process can
- * continue to run). This interrupt can arrive on any cpu and, since we have
- * virtual cpus, actually appear before the interrupt that signals that a page
- * is missing.
- */
-static void pfault_interrupt(struct ext_code ext_code,
- unsigned int param32, unsigned long param64)
-{
- struct task_struct *tsk;
- __u16 subcode;
- pid_t pid;
+ union teid teid = { .val = regs->int_parm_long };
+ unsigned long addr = get_fault_address(regs);
+ struct vm_area_struct *vma;
+ struct mm_struct *mm;
+ struct page *page;
+ struct gmap *gmap;
+ int rc;
/*
- * Get the external interruption subcode & pfault initial/completion
- * signal bit. VM stores this in the 'cpu address' field associated
- * with the external interrupt.
+ * Bit 61 indicates whether the address is valid; if it is not, the
+ * kernel should be stopped or SIGSEGV should be sent to the
+ * process. Bit 61 is not reliable without the misc UV feature,
+ * therefore this needs to be checked too.
*/
- subcode = ext_code.subcode;
- if ((subcode & 0xff00) != __SUBCODE_MASK)
- return;
- inc_irq_stat(IRQEXT_PFL);
- /* Get the token (= pid of the affected task). */
- pid = param64 & LPP_PID_MASK;
- rcu_read_lock();
- tsk = find_task_by_pid_ns(pid, &init_pid_ns);
- if (tsk)
- get_task_struct(tsk);
- rcu_read_unlock();
- if (!tsk)
- return;
- spin_lock(&pfault_lock);
- if (subcode & PF_COMPLETE) {
- /* signal bit is set -> a page has been swapped in by VM */
- if (tsk->thread.pfault_wait == 1) {
- /* Initial interrupt was faster than the completion
- * interrupt. pfault_wait is valid. Set pfault_wait
- * back to zero and wake up the process. This can
- * safely be done because the task is still sleeping
- * and can't produce new pfaults. */
- tsk->thread.pfault_wait = 0;
- list_del(&tsk->thread.list);
- wake_up_process(tsk);
- put_task_struct(tsk);
- } else {
- /* Completion interrupt was faster than initial
- * interrupt. Set pfault_wait to -1 so the initial
- * interrupt doesn't put the task to sleep.
- * If the task is not running, ignore the completion
- * interrupt since it must be a leftover of a PFAULT
- * CANCEL operation which didn't remove all pending
- * completion interrupts. */
- if (tsk->state == TASK_RUNNING)
- tsk->thread.pfault_wait = -1;
+ if (uv_has_feature(BIT_UV_FEAT_MISC) && !teid.b61) {
+ /*
+ * When this happens, userspace did something that it
+ * was not supposed to do, e.g. branching into secure
+ * memory. Trigger a segmentation fault.
+ */
+ if (user_mode(regs)) {
+ send_sig(SIGSEGV, current, 0);
+ return;
}
- } else {
- /* signal bit not set -> a real page is missing. */
- if (WARN_ON_ONCE(tsk != current))
- goto out;
- if (tsk->thread.pfault_wait == 1) {
- /* Already on the list with a reference: put to sleep */
- goto block;
- } else if (tsk->thread.pfault_wait == -1) {
- /* Completion interrupt was faster than the initial
- * interrupt (pfault_wait == -1). Set pfault_wait
- * back to zero and exit. */
- tsk->thread.pfault_wait = 0;
- } else {
- /* Initial interrupt arrived before completion
- * interrupt. Let the task sleep.
- * An extra task reference is needed since a different
- * cpu may set the task state to TASK_RUNNING again
- * before the scheduler is reached. */
- get_task_struct(tsk);
- tsk->thread.pfault_wait = 1;
- list_add(&tsk->thread.list, &pfault_list);
-block:
- /* Since this must be a userspace fault, there
- * is no kernel task state to trample. Rely on the
- * return to userspace schedule() to block. */
- __set_current_state(TASK_UNINTERRUPTIBLE);
- set_tsk_need_resched(tsk);
- set_preempt_need_resched();
+ /*
+ * The kernel should never run into this case and
+ * there is no way out of this situation.
+ */
+ panic("Unexpected PGM 0x3d with TEID bit 61=0");
+ }
+ switch (get_fault_type(regs)) {
+ case GMAP_FAULT:
+ mm = current->mm;
+ gmap = (struct gmap *)S390_lowcore.gmap;
+ mmap_read_lock(mm);
+ addr = __gmap_translate(gmap, addr);
+ mmap_read_unlock(mm);
+ if (IS_ERR_VALUE(addr))
+ return handle_fault_error_nolock(regs, SEGV_MAPERR);
+ fallthrough;
+ case USER_FAULT:
+ mm = current->mm;
+ mmap_read_lock(mm);
+ vma = find_vma(mm, addr);
+ if (!vma)
+ return handle_fault_error(regs, SEGV_MAPERR);
+ page = follow_page(vma, addr, FOLL_WRITE | FOLL_GET);
+ if (IS_ERR_OR_NULL(page)) {
+ mmap_read_unlock(mm);
+ break;
}
+ if (arch_make_page_accessible(page))
+ send_sig(SIGSEGV, current, 0);
+ put_page(page);
+ mmap_read_unlock(mm);
+ break;
+ case KERNEL_FAULT:
+ page = phys_to_page(addr);
+ if (unlikely(!try_get_page(page)))
+ break;
+ rc = arch_make_page_accessible(page);
+ put_page(page);
+ if (rc)
+ BUG();
+ break;
+ default:
+ unreachable();
}
-out:
- spin_unlock(&pfault_lock);
- put_task_struct(tsk);
}
+NOKPROBE_SYMBOL(do_secure_storage_access);
-static int pfault_cpu_dead(unsigned int cpu)
+void do_non_secure_storage_access(struct pt_regs *regs)
{
- struct thread_struct *thread, *next;
- struct task_struct *tsk;
-
- spin_lock_irq(&pfault_lock);
- list_for_each_entry_safe(thread, next, &pfault_list, list) {
- thread->pfault_wait = 0;
- list_del(&thread->list);
- tsk = container_of(thread, struct task_struct, thread);
- wake_up_process(tsk);
- put_task_struct(tsk);
- }
- spin_unlock_irq(&pfault_lock);
- return 0;
+ struct gmap *gmap = (struct gmap *)S390_lowcore.gmap;
+ unsigned long gaddr = get_fault_address(regs);
+
+ if (WARN_ON_ONCE(get_fault_type(regs) != GMAP_FAULT))
+ return handle_fault_error_nolock(regs, SEGV_MAPERR);
+ if (gmap_convert_to_secure(gmap, gaddr) == -EINVAL)
+ send_sig(SIGSEGV, current, 0);
}
+NOKPROBE_SYMBOL(do_non_secure_storage_access);
-static int __init pfault_irq_init(void)
+void do_secure_storage_violation(struct pt_regs *regs)
{
- int rc;
-
- rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
- if (rc)
- goto out_extint;
- rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP;
- if (rc)
- goto out_pfault;
- irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL);
- cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead",
- NULL, pfault_cpu_dead);
- return 0;
+ struct gmap *gmap = (struct gmap *)S390_lowcore.gmap;
+ unsigned long gaddr = get_fault_address(regs);
-out_pfault:
- unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
-out_extint:
- pfault_disable = 1;
- return rc;
+ /*
+ * If the VM has been rebooted, its address space might still contain
+ * secure pages from the previous boot.
+ * Clear the page so it can be reused.
+ */
+ if (!gmap_destroy_page(gmap, gaddr))
+ return;
+ /*
+ * Either KVM messed up the secure guest mapping or the same
+ * page is mapped into multiple secure guests.
+ *
+ * This exception is only triggered when a guest 2 is running
+ * and can therefore never occur in kernel context.
+ */
+ pr_warn_ratelimited("Secure storage violation in task: %s, pid %d\n",
+ current->comm, current->pid);
+ send_sig(SIGSEGV, current, 0);
}
-early_initcall(pfault_irq_init);
-#endif /* CONFIG_PFAULT */
+#endif /* CONFIG_PGSTE */
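The fault.c rework above replaces the open-coded masking of regs->int_parm_long with the union teid accessors (teid.addr, teid.as, and so on), where teid.addr * PAGE_SIZE reproduces the old masked address. Below is a simplified, self-contained sketch of the extraction the old masks performed, so the new field names can be read against it; the layout shown is only what the old __FAIL_ADDR_MASK and "& 3" code implied, not the asm/fault.h definition.

#include <stdio.h>

#define PAGE_SIZE 4096UL

/* Page-aligned failing address: what the old "& __FAIL_ADDR_MASK" did. */
static unsigned long teid_fault_address(unsigned long teid)
{
	return teid & ~(PAGE_SIZE - 1);
}

/*
 * Address-space bits: what the old "& 3" did
 * (0 primary, 1 access register, 2 secondary, 3 home).
 */
static unsigned int teid_address_space(unsigned long teid)
{
	return teid & 3;
}

int main(void)
{
	unsigned long teid = 0x000003ff80001002UL;	/* made-up TEID value */

	/* Prints addr=0x3ff80001000 as=2 for the value above. */
	printf("addr=%#lx as=%u\n",
	       teid_fault_address(teid), teid_address_space(teid));
	return 0;
}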
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index edcdca97e85e..6f96b5a71c63 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -2,7 +2,7 @@
/*
* KVM guest address space mapping code
*
- * Copyright IBM Corp. 2007, 2016, 2018
+ * Copyright IBM Corp. 2007, 2020
* Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
* David Hildenbrand <david@redhat.com>
* Janosch Frank <frankja@linux.vnet.ibm.com>
@@ -17,17 +17,28 @@
#include <linux/swapops.h>
#include <linux/ksm.h>
#include <linux/mman.h>
-
-#include <asm/pgtable.h>
+#include <linux/pgtable.h>
+#include <asm/page-states.h>
#include <asm/pgalloc.h>
#include <asm/gmap.h>
+#include <asm/page.h>
#include <asm/tlb.h>
#define GMAP_SHADOW_FAKE_TABLE 1ULL
+static struct page *gmap_alloc_crst(void)
+{
+ struct page *page;
+
+ page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
+ if (!page)
+ return NULL;
+ __arch_set_page_dat(page_to_virt(page), 1UL << CRST_ALLOC_ORDER);
+ return page;
+}
+
/**
* gmap_alloc - allocate and initialize a guest address space
- * @mm: pointer to the parent mm_struct
* @limit: maximum address of the gmap address space
*
* Returns a guest address space structure.
@@ -56,24 +67,24 @@ static struct gmap *gmap_alloc(unsigned long limit)
atype = _ASCE_TYPE_REGION1;
etype = _REGION1_ENTRY_EMPTY;
}
- gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
+ gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT);
if (!gmap)
goto out;
INIT_LIST_HEAD(&gmap->crst_list);
INIT_LIST_HEAD(&gmap->children);
INIT_LIST_HEAD(&gmap->pt_list);
- INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
- INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
- INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC);
+ INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT);
+ INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT);
+ INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT);
spin_lock_init(&gmap->guest_table_lock);
spin_lock_init(&gmap->shadow_lock);
refcount_set(&gmap->ref_count, 1);
- page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+ page = gmap_alloc_crst();
if (!page)
goto out_free;
page->index = 0;
list_add(&page->lru, &gmap->crst_list);
- table = (unsigned long *) page_to_phys(page);
+ table = page_to_virt(page);
crst_table_init(table, etype);
gmap->table = table;
gmap->asce = atype | _ASCE_TABLE_LENGTH |
@@ -300,7 +311,7 @@ struct gmap *gmap_get_enabled(void)
EXPORT_SYMBOL_GPL(gmap_get_enabled);
/*
- * gmap_alloc_table is assumed to be called with mmap_sem held
+ * gmap_alloc_table is assumed to be called with mmap_lock held
*/
static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
unsigned long init, unsigned long gaddr)
@@ -309,15 +320,15 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
unsigned long *new;
 /* since we don't free the gmap table until gmap_free we can unlock */
- page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+ page = gmap_alloc_crst();
if (!page)
return -ENOMEM;
- new = (unsigned long *) page_to_phys(page);
+ new = page_to_virt(page);
crst_table_init(new, init);
spin_lock(&gmap->guest_table_lock);
if (*table & _REGION_ENTRY_INVALID) {
list_add(&page->lru, &gmap->crst_list);
- *table = (unsigned long) new | _REGION_ENTRY_LENGTH |
+ *table = __pa(new) | _REGION_ENTRY_LENGTH |
(*table & _REGION_ENTRY_TYPE_MASK);
page->index = gaddr;
page = NULL;
@@ -337,12 +348,11 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
static unsigned long __gmap_segment_gaddr(unsigned long *entry)
{
struct page *page;
- unsigned long offset, mask;
+ unsigned long offset;
offset = (unsigned long) entry / sizeof(unsigned long);
offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE;
- mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
- page = virt_to_page((void *)((unsigned long) entry & mask));
+ page = pmd_pgtable_page((pmd_t *) entry);
return page->index + offset;
}
@@ -405,10 +415,10 @@ int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
return -EINVAL;
flush = 0;
- down_write(&gmap->mm->mmap_sem);
+ mmap_write_lock(gmap->mm);
for (off = 0; off < len; off += PMD_SIZE)
flush |= __gmap_unmap_by_gaddr(gmap, to + off);
- up_write(&gmap->mm->mmap_sem);
+ mmap_write_unlock(gmap->mm);
if (flush)
gmap_flush_tlb(gmap);
return 0;
@@ -438,7 +448,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from,
return -EINVAL;
flush = 0;
- down_write(&gmap->mm->mmap_sem);
+ mmap_write_lock(gmap->mm);
for (off = 0; off < len; off += PMD_SIZE) {
/* Remove old translation */
flush |= __gmap_unmap_by_gaddr(gmap, to + off);
@@ -448,7 +458,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from,
(void *) from + off))
break;
}
- up_write(&gmap->mm->mmap_sem);
+ mmap_write_unlock(gmap->mm);
if (flush)
gmap_flush_tlb(gmap);
if (off >= len)
@@ -466,7 +476,7 @@ EXPORT_SYMBOL_GPL(gmap_map_segment);
* Returns user space address which corresponds to the guest address or
* -EFAULT if no such mapping exists.
* This function does not establish potentially missing page table entries.
- * The mmap_sem of the mm that belongs to the address space must be held
+ * The mmap_lock of the mm that belongs to the address space must be held
* when this function gets called.
*
* Note: Can also be called for shadow gmaps.
@@ -495,16 +505,16 @@ unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
unsigned long rc;
- down_read(&gmap->mm->mmap_sem);
+ mmap_read_lock(gmap->mm);
rc = __gmap_translate(gmap, gaddr);
- up_read(&gmap->mm->mmap_sem);
+ mmap_read_unlock(gmap->mm);
return rc;
}
EXPORT_SYMBOL_GPL(gmap_translate);
/**
* gmap_unlink - disconnect a page table from the gmap shadow tables
- * @gmap: pointer to guest mapping meta data structure
+ * @mm: pointer to the parent mm_struct
* @table: pointer to the host page table
* @vmaddr: vm address associated with the host page table
*/
@@ -527,14 +537,14 @@ static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new,
unsigned long gaddr);
/**
- * gmap_link - set up shadow page tables to connect a host to a guest address
+ * __gmap_link - set up shadow page tables to connect a host to a guest address
* @gmap: pointer to guest mapping meta data structure
* @gaddr: guest address
* @vmaddr: vm address
*
* Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
* if the vm address is already mapped to a different guest segment.
- * The mmap_sem of the mm that belongs to the address space must be held
+ * The mmap_lock of the mm that belongs to the address space must be held
* when this function gets called.
*/
int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
@@ -558,7 +568,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
gaddr & _REGION1_MASK))
return -ENOMEM;
- table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ table = __va(*table & _REGION_ENTRY_ORIGIN);
}
if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
@@ -566,7 +576,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
gaddr & _REGION2_MASK))
return -ENOMEM;
- table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ table = __va(*table & _REGION_ENTRY_ORIGIN);
}
if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
@@ -574,7 +584,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
gaddr & _REGION3_MASK))
return -ENOMEM;
- table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ table = __va(*table & _REGION_ENTRY_ORIGIN);
}
table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
/* Walk the parent mm page table */
@@ -594,7 +604,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
if (pmd_large(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m)
return -EFAULT;
/* Link gmap segment table entry location to page table. */
- rc = radix_tree_preload(GFP_KERNEL);
+ rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
if (rc)
return rc;
ptl = pmd_lock(mm, pmd);
@@ -640,7 +650,7 @@ int gmap_fault(struct gmap *gmap, unsigned long gaddr,
int rc;
bool unlocked;
- down_read(&gmap->mm->mmap_sem);
+ mmap_read_lock(gmap->mm);
retry:
unlocked = false;
@@ -649,13 +659,13 @@ retry:
rc = vmaddr;
goto out_up;
}
- if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags,
+ if (fixup_user_fault(gmap->mm, vmaddr, fault_flags,
&unlocked)) {
rc = -EFAULT;
goto out_up;
}
/*
- * In the case that fixup_user_fault unlocked the mmap_sem during
+ * In the case that fixup_user_fault unlocked the mmap_lock during
* faultin redo __gmap_translate to not race with a map/unmap_segment.
*/
if (unlocked)
@@ -663,16 +673,17 @@ retry:
rc = __gmap_link(gmap, gaddr, vmaddr);
out_up:
- up_read(&gmap->mm->mmap_sem);
+ mmap_read_unlock(gmap->mm);
return rc;
}
EXPORT_SYMBOL_GPL(gmap_fault);
/*
- * this function is assumed to be called with mmap_sem held
+ * this function is assumed to be called with mmap_lock held
*/
void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
{
+ struct vm_area_struct *vma;
unsigned long vmaddr;
spinlock_t *ptl;
pte_t *ptep;
@@ -682,11 +693,17 @@ void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
gaddr >> PMD_SHIFT);
if (vmaddr) {
vmaddr |= gaddr & ~PMD_MASK;
+
+ vma = vma_lookup(gmap->mm, vmaddr);
+ if (!vma || is_vm_hugetlb_page(vma))
+ return;
+
/* Get pointer to the page table entry */
ptep = get_locked_pte(gmap->mm, vmaddr, &ptl);
- if (likely(ptep))
+ if (likely(ptep)) {
ptep_zap_unused(gmap->mm, vmaddr, ptep, 0);
- pte_unmap_unlock(ptep, ptl);
+ pte_unmap_unlock(ptep, ptl);
+ }
}
}
EXPORT_SYMBOL_GPL(__gmap_zap);
@@ -696,7 +713,7 @@ void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
unsigned long gaddr, vmaddr, size;
struct vm_area_struct *vma;
- down_read(&gmap->mm->mmap_sem);
+ mmap_read_lock(gmap->mm);
for (gaddr = from; gaddr < to;
gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
/* Find the vm address for the guest address */
@@ -717,9 +734,9 @@ void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
if (is_vm_hugetlb_page(vma))
continue;
size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
- zap_page_range(vma, vmaddr, size);
+ zap_page_range_single(vma, vmaddr, size, NULL);
}
- up_read(&gmap->mm->mmap_sem);
+ mmap_read_unlock(gmap->mm);
}
EXPORT_SYMBOL_GPL(gmap_discard);
@@ -787,47 +804,51 @@ static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
static inline unsigned long *gmap_table_walk(struct gmap *gmap,
unsigned long gaddr, int level)
{
- unsigned long *table;
+ const int asce_type = gmap->asce & _ASCE_TYPE_MASK;
+ unsigned long *table = gmap->table;
- if ((gmap->asce & _ASCE_TYPE_MASK) + 4 < (level * 4))
- return NULL;
if (gmap_is_shadow(gmap) && gmap->removed)
return NULL;
- if (gaddr & (-1UL << (31 + ((gmap->asce & _ASCE_TYPE_MASK) >> 2)*11)))
+
+ if (WARN_ON_ONCE(level > (asce_type >> 2) + 1))
return NULL;
- table = gmap->table;
- switch (gmap->asce & _ASCE_TYPE_MASK) {
+
+ if (asce_type != _ASCE_TYPE_REGION1 &&
+ gaddr & (-1UL << (31 + (asce_type >> 2) * 11)))
+ return NULL;
+
+ switch (asce_type) {
case _ASCE_TYPE_REGION1:
table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
if (level == 4)
break;
if (*table & _REGION_ENTRY_INVALID)
return NULL;
- table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
- /* Fallthrough */
+ table = __va(*table & _REGION_ENTRY_ORIGIN);
+ fallthrough;
case _ASCE_TYPE_REGION2:
table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
if (level == 3)
break;
if (*table & _REGION_ENTRY_INVALID)
return NULL;
- table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
- /* Fallthrough */
+ table = __va(*table & _REGION_ENTRY_ORIGIN);
+ fallthrough;
case _ASCE_TYPE_REGION3:
table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
if (level == 2)
break;
if (*table & _REGION_ENTRY_INVALID)
return NULL;
- table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
- /* Fallthrough */
+ table = __va(*table & _REGION_ENTRY_ORIGIN);
+ fallthrough;
case _ASCE_TYPE_SEGMENT:
table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
if (level == 1)
break;
if (*table & _REGION_ENTRY_INVALID)
return NULL;
- table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
+ table = __va(*table & _SEGMENT_ENTRY_ORIGIN);
table += (gaddr & _PAGE_INDEX) >> _PAGE_SHIFT;
}
return table;
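
The reworked gmap_table_walk() above computes the ASCE type once, rejects walk levels the ASCE cannot reach, and then descends through the fallthrough cases, converting each stored table origin from a physical address into a virtual pointer with __va(). A minimal sketch of that recurring idiom, reusing the macros visible above (the helper name is purely illustrative and not part of the patch):

/* Illustrative only: follow one CRST entry to the next-level table.
 * The entry stores the physical origin of the lower table; the gmap
 * code now converts it with __va() instead of a raw pointer cast. */
static inline unsigned long *follow_origin(unsigned long entry)
{
	if (entry & _REGION_ENTRY_INVALID)
		return NULL;			/* no lower-level table */
	return __va(entry & _REGION_ENTRY_ORIGIN);
}
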
@@ -875,10 +896,10 @@ static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
BUG_ON(gmap_is_shadow(gmap));
fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
- if (fixup_user_fault(current, mm, vmaddr, fault_flags, &unlocked))
+ if (fixup_user_fault(mm, vmaddr, fault_flags, &unlocked))
return -EFAULT;
if (unlocked)
- /* lost mmap_sem, caller has to retry __gmap_translate */
+ /* lost mmap_lock, caller has to retry __gmap_translate */
return 0;
/* Connect the page tables */
return __gmap_link(gmap, gaddr, vmaddr);
@@ -886,12 +907,12 @@ static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
/**
* gmap_pte_op_end - release the page table lock
- * @ptl: pointer to the spinlock pointer
+ * @ptep: pointer to the locked pte
+ * @ptl: pointer to the page table spinlock
*/
-static void gmap_pte_op_end(spinlock_t *ptl)
+static void gmap_pte_op_end(pte_t *ptep, spinlock_t *ptl)
{
- if (ptl)
- spin_unlock(ptl);
+ pte_unmap_unlock(ptep, ptl);
}
/**
@@ -949,7 +970,7 @@ static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp)
* -EAGAIN if a fixup is needed
* -EINVAL if unsupported notifier bits have been specified
*
- * Expected to be called with sg->mm->mmap_sem in read and
+ * Expected to be called with sg->mm->mmap_lock in read and
* guest_table_lock held.
*/
static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr,
@@ -964,18 +985,18 @@ static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr,
return -EAGAIN;
if (prot == PROT_NONE && !pmd_i) {
- pmd_val(new) |= _SEGMENT_ENTRY_INVALID;
+ new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID));
gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
}
if (prot == PROT_READ && !pmd_p) {
- pmd_val(new) &= ~_SEGMENT_ENTRY_INVALID;
- pmd_val(new) |= _SEGMENT_ENTRY_PROTECT;
+ new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID));
+ new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_PROTECT));
gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
}
if (bits & GMAP_NOTIFY_MPROT)
- pmd_val(*pmdp) |= _SEGMENT_ENTRY_GMAP_IN;
+ set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN)));
/* Shadow GMAP protection needs split PMDs */
if (bits & GMAP_NOTIFY_SHADOW)
@@ -995,14 +1016,14 @@ static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr,
* Returns 0 if successfully protected, -ENOMEM if out of memory and
* -EAGAIN if a fixup is needed.
*
- * Expected to be called with sg->mm->mmap_sem in read
+ * Expected to be called with sg->mm->mmap_lock in read
*/
static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr,
pmd_t *pmdp, int prot, unsigned long bits)
{
int rc;
pte_t *ptep;
- spinlock_t *ptl = NULL;
+ spinlock_t *ptl;
unsigned long pbits = 0;
if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
@@ -1016,7 +1037,7 @@ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr,
pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0;
/* Protect and unlock. */
rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits);
- gmap_pte_op_end(ptl);
+ gmap_pte_op_end(ptep, ptl);
return rc;
}
@@ -1031,7 +1052,7 @@ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr,
* Returns 0 if successfully protected, -ENOMEM if out of memory and
* -EFAULT if gaddr is invalid (or mapping for shadows is missing).
*
- * Called with sg->mm->mmap_sem in read.
+ * Called with sg->mm->mmap_lock in read.
*/
static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
unsigned long len, int prot, unsigned long bits)
@@ -1102,9 +1123,9 @@ int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr,
return -EINVAL;
if (!MACHINE_HAS_ESOP && prot == PROT_READ)
return -EINVAL;
- down_read(&gmap->mm->mmap_sem);
+ mmap_read_lock(gmap->mm);
rc = gmap_protect_range(gmap, gaddr, len, prot, GMAP_NOTIFY_MPROT);
- up_read(&gmap->mm->mmap_sem);
+ mmap_read_unlock(gmap->mm);
return rc;
}
EXPORT_SYMBOL_GPL(gmap_mprotect_notify);
@@ -1120,7 +1141,7 @@ EXPORT_SYMBOL_GPL(gmap_mprotect_notify);
* if reading using the virtual address failed. -EINVAL if called on a gmap
* shadow.
*
- * Called with gmap->mm->mmap_sem in read.
+ * Called with gmap->mm->mmap_lock in read.
*/
int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
{
@@ -1140,12 +1161,12 @@ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) {
address = pte_val(pte) & PAGE_MASK;
address += gaddr & ~PAGE_MASK;
- *val = *(unsigned long *) address;
- pte_val(*ptep) |= _PAGE_YOUNG;
+ *val = *(unsigned long *)__va(address);
+ set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG)));
/* Do *NOT* clear the _PAGE_INVALID bit! */
rc = 0;
}
- gmap_pte_op_end(ptl);
+ gmap_pte_op_end(ptep, ptl);
}
if (!rc)
break;
@@ -1173,6 +1194,7 @@ EXPORT_SYMBOL_GPL(gmap_read_table);
static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
struct gmap_rmap *rmap)
{
+ struct gmap_rmap *temp;
void __rcu **slot;
BUG_ON(!gmap_is_shadow(sg));
@@ -1180,6 +1202,12 @@ static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
if (slot) {
rmap->next = radix_tree_deref_slot_protected(slot,
&sg->guest_table_lock);
+ for (temp = rmap->next; temp; temp = temp->next) {
+ if (temp->raddr == rmap->raddr) {
+ kfree(rmap);
+ return;
+ }
+ }
radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap);
} else {
rmap->next = NULL;
@@ -1214,11 +1242,11 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
vmaddr = __gmap_translate(parent, paddr);
if (IS_ERR_VALUE(vmaddr))
return vmaddr;
- rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
+ rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
if (!rmap)
return -ENOMEM;
rmap->raddr = raddr;
- rc = radix_tree_preload(GFP_KERNEL);
+ rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
if (rc) {
kfree(rmap);
return rc;
@@ -1232,7 +1260,7 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
if (!rc)
gmap_insert_rmap(sg, vmaddr, rmap);
spin_unlock(&sg->guest_table_lock);
- gmap_pte_op_end(ptl);
+ gmap_pte_op_end(ptep, ptl);
}
radix_tree_preload_end();
if (rc) {
@@ -1268,7 +1296,7 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr)
{
asm volatile(
- " .insn rrf,0xb98e0000,%0,%1,0,0"
+ " idte %0,0,%1"
: : "a" (asce), "a" (vaddr) : "cc", "memory");
}
@@ -1318,7 +1346,8 @@ static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr,
*/
static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
{
- unsigned long sto, *ste, *pgt;
+ unsigned long *ste;
+ phys_addr_t sto, pgt;
struct page *page;
BUG_ON(!gmap_is_shadow(sg));
@@ -1326,13 +1355,13 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN))
return;
gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1);
- sto = (unsigned long) (ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT));
+ sto = __pa(ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT));
gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr);
- pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN);
+ pgt = *ste & _SEGMENT_ENTRY_ORIGIN;
*ste = _SEGMENT_ENTRY_EMPTY;
- __gmap_unshadow_pgt(sg, raddr, pgt);
+ __gmap_unshadow_pgt(sg, raddr, __va(pgt));
/* Free page table */
- page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
+ page = phys_to_page(pgt);
list_del(&page->lru);
page_table_free_pgste(page);
}
@@ -1348,19 +1377,19 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
unsigned long *sgt)
{
- unsigned long *pgt;
struct page *page;
+ phys_addr_t pgt;
int i;
BUG_ON(!gmap_is_shadow(sg));
for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) {
if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN))
continue;
- pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN);
+ pgt = sgt[i] & _REGION_ENTRY_ORIGIN;
sgt[i] = _SEGMENT_ENTRY_EMPTY;
- __gmap_unshadow_pgt(sg, raddr, pgt);
+ __gmap_unshadow_pgt(sg, raddr, __va(pgt));
/* Free page table */
- page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
+ page = phys_to_page(pgt);
list_del(&page->lru);
page_table_free_pgste(page);
}
@@ -1375,7 +1404,8 @@ static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
*/
static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
{
- unsigned long r3o, *r3e, *sgt;
+ unsigned long r3o, *r3e;
+ phys_addr_t sgt;
struct page *page;
BUG_ON(!gmap_is_shadow(sg));
@@ -1384,12 +1414,12 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
return;
gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1);
r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT));
- gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr);
- sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN);
+ gmap_idte_one(__pa(r3o) | _ASCE_TYPE_REGION3, raddr);
+ sgt = *r3e & _REGION_ENTRY_ORIGIN;
*r3e = _REGION3_ENTRY_EMPTY;
- __gmap_unshadow_sgt(sg, raddr, sgt);
+ __gmap_unshadow_sgt(sg, raddr, __va(sgt));
/* Free segment table */
- page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
+ page = phys_to_page(sgt);
list_del(&page->lru);
__free_pages(page, CRST_ALLOC_ORDER);
}
@@ -1405,19 +1435,19 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
unsigned long *r3t)
{
- unsigned long *sgt;
struct page *page;
+ phys_addr_t sgt;
int i;
BUG_ON(!gmap_is_shadow(sg));
for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) {
if (!(r3t[i] & _REGION_ENTRY_ORIGIN))
continue;
- sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN);
+ sgt = r3t[i] & _REGION_ENTRY_ORIGIN;
r3t[i] = _REGION3_ENTRY_EMPTY;
- __gmap_unshadow_sgt(sg, raddr, sgt);
+ __gmap_unshadow_sgt(sg, raddr, __va(sgt));
/* Free segment table */
- page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
+ page = phys_to_page(sgt);
list_del(&page->lru);
__free_pages(page, CRST_ALLOC_ORDER);
}
@@ -1432,7 +1462,8 @@ static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
*/
static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
{
- unsigned long r2o, *r2e, *r3t;
+ unsigned long r2o, *r2e;
+ phys_addr_t r3t;
struct page *page;
BUG_ON(!gmap_is_shadow(sg));
@@ -1441,12 +1472,12 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
return;
gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1);
r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT));
- gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr);
- r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN);
+ gmap_idte_one(__pa(r2o) | _ASCE_TYPE_REGION2, raddr);
+ r3t = *r2e & _REGION_ENTRY_ORIGIN;
*r2e = _REGION2_ENTRY_EMPTY;
- __gmap_unshadow_r3t(sg, raddr, r3t);
+ __gmap_unshadow_r3t(sg, raddr, __va(r3t));
/* Free region 3 table */
- page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
+ page = phys_to_page(r3t);
list_del(&page->lru);
__free_pages(page, CRST_ALLOC_ORDER);
}
@@ -1462,7 +1493,7 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
unsigned long *r2t)
{
- unsigned long *r3t;
+ phys_addr_t r3t;
struct page *page;
int i;
@@ -1470,11 +1501,11 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) {
if (!(r2t[i] & _REGION_ENTRY_ORIGIN))
continue;
- r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN);
+ r3t = r2t[i] & _REGION_ENTRY_ORIGIN;
r2t[i] = _REGION2_ENTRY_EMPTY;
- __gmap_unshadow_r3t(sg, raddr, r3t);
+ __gmap_unshadow_r3t(sg, raddr, __va(r3t));
/* Free region 3 table */
- page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
+ page = phys_to_page(r3t);
list_del(&page->lru);
__free_pages(page, CRST_ALLOC_ORDER);
}
@@ -1489,8 +1520,9 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
*/
static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
{
- unsigned long r1o, *r1e, *r2t;
+ unsigned long r1o, *r1e;
struct page *page;
+ phys_addr_t r2t;
BUG_ON(!gmap_is_shadow(sg));
r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */
@@ -1498,12 +1530,12 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
return;
gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1);
r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT));
- gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr);
- r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN);
+ gmap_idte_one(__pa(r1o) | _ASCE_TYPE_REGION1, raddr);
+ r2t = *r1e & _REGION_ENTRY_ORIGIN;
*r1e = _REGION1_ENTRY_EMPTY;
- __gmap_unshadow_r2t(sg, raddr, r2t);
+ __gmap_unshadow_r2t(sg, raddr, __va(r2t));
/* Free region 2 table */
- page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
+ page = phys_to_page(r2t);
list_del(&page->lru);
__free_pages(page, CRST_ALLOC_ORDER);
}
@@ -1519,22 +1551,23 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
unsigned long *r1t)
{
- unsigned long asce, *r2t;
+ unsigned long asce;
struct page *page;
+ phys_addr_t r2t;
int i;
BUG_ON(!gmap_is_shadow(sg));
- asce = (unsigned long) r1t | _ASCE_TYPE_REGION1;
+ asce = __pa(r1t) | _ASCE_TYPE_REGION1;
for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) {
if (!(r1t[i] & _REGION_ENTRY_ORIGIN))
continue;
- r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN);
- __gmap_unshadow_r2t(sg, raddr, r2t);
+ r2t = r1t[i] & _REGION_ENTRY_ORIGIN;
+ __gmap_unshadow_r2t(sg, raddr, __va(r2t));
/* Clear entry and flush translation r1t -> r2t */
gmap_idte_one(asce, raddr);
r1t[i] = _REGION1_ENTRY_EMPTY;
/* Free region 2 table */
- page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
+ page = phys_to_page(r2t);
list_del(&page->lru);
__free_pages(page, CRST_ALLOC_ORDER);
}
@@ -1556,7 +1589,7 @@ static void gmap_unshadow(struct gmap *sg)
sg->removed = 1;
gmap_call_notifier(sg, 0, -1UL);
gmap_flush_tlb(sg);
- table = (unsigned long *)(sg->asce & _ASCE_ORIGIN);
+ table = __va(sg->asce & _ASCE_ORIGIN);
switch (sg->asce & _ASCE_TYPE_MASK) {
case _ASCE_TYPE_REGION1:
__gmap_unshadow_r1t(sg, 0, table);
@@ -1692,11 +1725,11 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
}
spin_unlock(&parent->shadow_lock);
/* protect after insertion, so it will get properly invalidated */
- down_read(&parent->mm->mmap_sem);
+ mmap_read_lock(parent->mm);
rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN,
((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE,
PROT_READ, GMAP_NOTIFY_SHADOW);
- up_read(&parent->mm->mmap_sem);
+ mmap_read_unlock(parent->mm);
spin_lock(&parent->shadow_lock);
new->initialized = true;
if (rc) {
@@ -1719,31 +1752,32 @@ EXPORT_SYMBOL_GPL(gmap_shadow);
* The r2t parameter specifies the address of the source table. The
* four pages of the source table are made read-only in the parent gmap
* address space. A write to the source table area @r2t will automatically
- * remove the shadow r2 table and all of its decendents.
+ * remove the shadow r2 table and all of its descendants.
*
* Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
* shadow table structure is incomplete, -ENOMEM if out of memory and
* -EFAULT if an address in the parent gmap could not be resolved.
*
- * Called with sg->mm->mmap_sem in read.
+ * Called with sg->mm->mmap_lock in read.
*/
int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
int fake)
{
unsigned long raddr, origin, offset, len;
- unsigned long *s_r2t, *table;
+ unsigned long *table;
+ phys_addr_t s_r2t;
struct page *page;
int rc;
BUG_ON(!gmap_is_shadow(sg));
/* Allocate a shadow region second table */
- page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+ page = gmap_alloc_crst();
if (!page)
return -ENOMEM;
page->index = r2t & _REGION_ENTRY_ORIGIN;
if (fake)
page->index |= GMAP_SHADOW_FAKE_TABLE;
- s_r2t = (unsigned long *) page_to_phys(page);
+ s_r2t = page_to_phys(page);
/* Install shadow region second table */
spin_lock(&sg->guest_table_lock);
table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */
@@ -1758,9 +1792,9 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
rc = -EAGAIN; /* Race with shadow */
goto out_free;
}
- crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY);
+ crst_table_init(__va(s_r2t), _REGION2_ENTRY_EMPTY);
/* mark as invalid as long as the parent table is not protected */
- *table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH |
+ *table = s_r2t | _REGION_ENTRY_LENGTH |
_REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID;
if (sg->edat_level >= 1)
*table |= (r2t & _REGION_ENTRY_PROTECT);
@@ -1781,8 +1815,7 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
spin_lock(&sg->guest_table_lock);
if (!rc) {
table = gmap_table_walk(sg, saddr, 4);
- if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
- (unsigned long) s_r2t)
+ if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r2t)
rc = -EAGAIN; /* Race with unshadow */
else
*table &= ~_REGION_ENTRY_INVALID;
@@ -1809,25 +1842,26 @@ EXPORT_SYMBOL_GPL(gmap_shadow_r2t);
* shadow table structure is incomplete, -ENOMEM if out of memory and
* -EFAULT if an address in the parent gmap could not be resolved.
*
- * Called with sg->mm->mmap_sem in read.
+ * Called with sg->mm->mmap_lock in read.
*/
int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
int fake)
{
unsigned long raddr, origin, offset, len;
- unsigned long *s_r3t, *table;
+ unsigned long *table;
+ phys_addr_t s_r3t;
struct page *page;
int rc;
BUG_ON(!gmap_is_shadow(sg));
/* Allocate a shadow region third table */
- page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+ page = gmap_alloc_crst();
if (!page)
return -ENOMEM;
page->index = r3t & _REGION_ENTRY_ORIGIN;
if (fake)
page->index |= GMAP_SHADOW_FAKE_TABLE;
- s_r3t = (unsigned long *) page_to_phys(page);
+ s_r3t = page_to_phys(page);
/* Install shadow region third table */
spin_lock(&sg->guest_table_lock);
table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */
@@ -1840,10 +1874,11 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
goto out_free;
} else if (*table & _REGION_ENTRY_ORIGIN) {
rc = -EAGAIN; /* Race with shadow */
+ goto out_free;
}
- crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY);
+ crst_table_init(__va(s_r3t), _REGION3_ENTRY_EMPTY);
/* mark as invalid as long as the parent table is not protected */
- *table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH |
+ *table = s_r3t | _REGION_ENTRY_LENGTH |
_REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
if (sg->edat_level >= 1)
*table |= (r3t & _REGION_ENTRY_PROTECT);
@@ -1864,8 +1899,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
spin_lock(&sg->guest_table_lock);
if (!rc) {
table = gmap_table_walk(sg, saddr, 3);
- if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
- (unsigned long) s_r3t)
+ if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r3t)
rc = -EAGAIN; /* Race with unshadow */
else
*table &= ~_REGION_ENTRY_INVALID;
@@ -1892,25 +1926,26 @@ EXPORT_SYMBOL_GPL(gmap_shadow_r3t);
* shadow table structure is incomplete, -ENOMEM if out of memory and
* -EFAULT if an address in the parent gmap could not be resolved.
*
- * Called with sg->mm->mmap_sem in read.
+ * Called with sg->mm->mmap_lock in read.
*/
int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
int fake)
{
unsigned long raddr, origin, offset, len;
- unsigned long *s_sgt, *table;
+ unsigned long *table;
+ phys_addr_t s_sgt;
struct page *page;
int rc;
BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE));
/* Allocate a shadow segment table */
- page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+ page = gmap_alloc_crst();
if (!page)
return -ENOMEM;
page->index = sgt & _REGION_ENTRY_ORIGIN;
if (fake)
page->index |= GMAP_SHADOW_FAKE_TABLE;
- s_sgt = (unsigned long *) page_to_phys(page);
+ s_sgt = page_to_phys(page);
/* Install shadow segment table */
spin_lock(&sg->guest_table_lock);
table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */
@@ -1925,9 +1960,9 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
rc = -EAGAIN; /* Race with shadow */
goto out_free;
}
- crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY);
+ crst_table_init(__va(s_sgt), _SEGMENT_ENTRY_EMPTY);
/* mark as invalid as long as the parent table is not protected */
- *table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH |
+ *table = s_sgt | _REGION_ENTRY_LENGTH |
_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
if (sg->edat_level >= 1)
*table |= sgt & _REGION_ENTRY_PROTECT;
@@ -1948,8 +1983,7 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
spin_lock(&sg->guest_table_lock);
if (!rc) {
table = gmap_table_walk(sg, saddr, 2);
- if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
- (unsigned long) s_sgt)
+ if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_sgt)
rc = -EAGAIN; /* Race with unshadow */
else
*table &= ~_REGION_ENTRY_INVALID;
@@ -1966,7 +2000,7 @@ out_free:
EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
/**
- * gmap_shadow_lookup_pgtable - find a shadow page table
+ * gmap_shadow_pgt_lookup - find a shadow page table
* @sg: pointer to the shadow guest address space structure
* @saddr: the address in the shadow guest address space
* @pgt: parent gmap address of the page table to get shadowed
@@ -1976,7 +2010,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
* Returns 0 if the shadow page table was found and -EAGAIN if the page
* table was not found.
*
- * Called with sg->mm->mmap_sem in read.
+ * Called with sg->mm->mmap_lock in read.
*/
int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
unsigned long *pgt, int *dat_protection,
@@ -2016,14 +2050,15 @@ EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup);
* shadow table structure is incomplete, -ENOMEM if out of memory,
* -EFAULT if an address in the parent gmap could not be resolved.
*
- * Called with gmap->mm->mmap_sem in read
+ * Called with gmap->mm->mmap_lock in read
*/
int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
int fake)
{
unsigned long raddr, origin;
- unsigned long *s_pgt, *table;
+ unsigned long *table;
struct page *page;
+ phys_addr_t s_pgt;
int rc;
BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE));
@@ -2034,7 +2069,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
page->index = pgt & _SEGMENT_ENTRY_ORIGIN;
if (fake)
page->index |= GMAP_SHADOW_FAKE_TABLE;
- s_pgt = (unsigned long *) page_to_phys(page);
+ s_pgt = page_to_phys(page);
/* Install shadow page table */
spin_lock(&sg->guest_table_lock);
table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
@@ -2067,8 +2102,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
spin_lock(&sg->guest_table_lock);
if (!rc) {
table = gmap_table_walk(sg, saddr, 1);
- if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) !=
- (unsigned long) s_pgt)
+ if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != s_pgt)
rc = -EAGAIN; /* Race with unshadow */
else
*table &= ~_SEGMENT_ENTRY_INVALID;
@@ -2095,7 +2129,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_pgt);
* shadow table structure is incomplete, -ENOMEM if out of memory and
* -EFAULT if an address in the parent gmap could not be resolved.
*
- * Called with sg->mm->mmap_sem in read.
+ * Called with sg->mm->mmap_lock in read.
*/
int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
{
@@ -2111,7 +2145,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
parent = sg->parent;
prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE;
- rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
+ rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
if (!rmap)
return -ENOMEM;
rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE;
@@ -2123,7 +2157,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
rc = vmaddr;
break;
}
- rc = radix_tree_preload(GFP_KERNEL);
+ rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
if (rc)
break;
rc = -EAGAIN;
@@ -2134,7 +2168,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
tptep = (pte_t *) gmap_table_walk(sg, saddr, 0);
if (!tptep) {
spin_unlock(&sg->guest_table_lock);
- gmap_pte_op_end(ptl);
+ gmap_pte_op_end(sptep, ptl);
radix_tree_preload_end();
break;
}
@@ -2145,7 +2179,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
rmap = NULL;
rc = 0;
}
- gmap_pte_op_end(ptl);
+ gmap_pte_op_end(sptep, ptl);
spin_unlock(&sg->guest_table_lock);
}
radix_tree_preload_end();
@@ -2160,7 +2194,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
}
EXPORT_SYMBOL_GPL(gmap_shadow_page);
-/**
+/*
* gmap_shadow_notify - handle notifications for shadow gmap
*
* Called with sg->parent->shadow_lock.
@@ -2220,7 +2254,7 @@ static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
/**
* ptep_notify - call all invalidation callbacks for a specific pte.
* @mm: pointer to the process mm_struct
- * @addr: virtual address in the process address space
+ * @vmaddr: virtual address in the process address space
* @pte: pointer to the page table entry
* @bits: bits from the pgste that caused the notify call
*
@@ -2264,7 +2298,7 @@ EXPORT_SYMBOL_GPL(ptep_notify);
static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp,
unsigned long gaddr)
{
- pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_IN;
+ set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN)));
gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1);
}
@@ -2283,7 +2317,7 @@ static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new,
{
gaddr &= HPAGE_MASK;
pmdp_notify_gmap(gmap, pmdp, gaddr);
- pmd_val(new) &= ~_SEGMENT_ENTRY_GMAP_IN;
+ new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN));
if (MACHINE_HAS_TLB_GUEST)
__pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce,
IDTE_GLOBAL);
@@ -2291,7 +2325,7 @@ static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new,
__pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL);
else
__pmdp_csp(pmdp);
- *pmdp = new;
+ set_pmd(pmdp, new);
}
static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr,
@@ -2313,7 +2347,7 @@ static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr,
_SEGMENT_ENTRY_GMAP_UC));
if (purge)
__pmdp_csp(pmdp);
- pmd_val(*pmdp) = _SEGMENT_ENTRY_EMPTY;
+ set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
}
spin_unlock(&gmap->guest_table_lock);
}
@@ -2436,7 +2470,7 @@ static bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp,
return false;
/* Clear UC indication and reset protection */
- pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_UC;
+ set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_UC)));
gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0);
return true;
}
@@ -2473,36 +2507,55 @@ void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4],
continue;
if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep))
set_bit(i, bitmap);
- spin_unlock(ptl);
+ pte_unmap_unlock(ptep, ptl);
}
}
gmap_pmd_op_end(gmap, pmdp);
}
EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct vm_area_struct *vma = walk->vma;
+
+ split_huge_pmd(vma, pmd, addr);
+ return 0;
+}
+
+static const struct mm_walk_ops thp_split_walk_ops = {
+ .pmd_entry = thp_split_walk_pmd_entry,
+ .walk_lock = PGWALK_WRLOCK_VERIFY,
+};
+
static inline void thp_split_mm(struct mm_struct *mm)
{
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct vm_area_struct *vma;
- unsigned long addr;
+ VMA_ITERATOR(vmi, mm, 0);
- for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
- for (addr = vma->vm_start;
- addr < vma->vm_end;
- addr += PAGE_SIZE)
- follow_page(vma, addr, FOLL_SPLIT);
- vma->vm_flags &= ~VM_HUGEPAGE;
- vma->vm_flags |= VM_NOHUGEPAGE;
+ for_each_vma(vmi, vma) {
+ vm_flags_mod(vma, VM_NOHUGEPAGE, VM_HUGEPAGE);
+ walk_page_vma(vma, &thp_split_walk_ops, NULL);
}
mm->def_flags |= VM_NOHUGEPAGE;
-#endif
}
+#else
+static inline void thp_split_mm(struct mm_struct *mm)
+{
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/*
* Remove all empty zero pages from the mapping for lazy refaulting
* - This must be called after mm->context.has_pgste is set, to avoid
* future creation of zero pages
- * - This must be called after THP was enabled
+ * - This must be called after THP was disabled.
+ *
+ * The core mm contracts with s390 that, even if it removes a page table
+ * (racing with the loop below and so causing pte_offset_map_lock() to fail),
+ * it will never insert a page table containing empty zero pages once
+ * mm_forbids_zeropage(mm), i.e. mm->context.has_pgste, is set.
*/
static int __zap_zero_pages(pmd_t *pmd, unsigned long start,
unsigned long end, struct mm_walk *walk)
@@ -2514,6 +2567,8 @@ static int __zap_zero_pages(pmd_t *pmd, unsigned long start,
spinlock_t *ptl;
ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ if (!ptep)
+ break;
if (is_zero_pfn(pte_pfn(*ptep)))
ptep_xchg_direct(walk->mm, addr, ptep, __pte(_PAGE_INVALID));
pte_unmap_unlock(ptep, ptl);
@@ -2523,6 +2578,7 @@ static int __zap_zero_pages(pmd_t *pmd, unsigned long start,
static const struct mm_walk_ops zap_zero_walk_ops = {
.pmd_entry = __zap_zero_pages,
+ .walk_lock = PGWALK_WRLOCK,
};
/*
@@ -2538,16 +2594,27 @@ int s390_enable_sie(void)
/* Fail if the page tables are 2K */
if (!mm_alloc_pgste(mm))
return -EINVAL;
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
mm->context.has_pgste = 1;
/* split thp mappings and disable thp for future mappings */
thp_split_mm(mm);
walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL);
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);
+int gmap_mark_unmergeable(void)
+{
+ /*
+ * Make sure to disable KSM (if enabled for the whole process or
+ * individual VMAs). Note that nothing currently hinders user space
+ * from re-enabling it.
+ */
+ return ksm_disable(current->mm);
+}
+EXPORT_SYMBOL_GPL(gmap_mark_unmergeable);
+
/*
* Enable storage key handling from now on and initialize the storage
* keys with the default key.
@@ -2560,6 +2627,18 @@ static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr,
return 0;
}
+/*
+ * Give a chance to schedule after setting a key to 256 pages.
+ * We only hold the mm lock, which is a rwsem and the kvm srcu.
+ * Both can sleep.
+ */
+static int __s390_enable_skey_pmd(pmd_t *pmd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ cond_resched();
+ return 0;
+}
+
static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr,
unsigned long hmask, unsigned long next,
struct mm_walk *walk)
@@ -2582,39 +2661,36 @@ static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr,
end = start + HPAGE_SIZE - 1;
__storage_key_init_range(start, end);
set_bit(PG_arch_1, &page->flags);
+ cond_resched();
return 0;
}
static const struct mm_walk_ops enable_skey_walk_ops = {
.hugetlb_entry = __s390_enable_skey_hugetlb,
.pte_entry = __s390_enable_skey_pte,
+ .pmd_entry = __s390_enable_skey_pmd,
+ .walk_lock = PGWALK_WRLOCK,
};
int s390_enable_skey(void)
{
struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
int rc = 0;
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
if (mm_uses_skeys(mm))
goto out_up;
mm->context.uses_skeys = 1;
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- if (ksm_madvise(vma, vma->vm_start, vma->vm_end,
- MADV_UNMERGEABLE, &vma->vm_flags)) {
- mm->context.uses_skeys = 0;
- rc = -ENOMEM;
- goto out_up;
- }
+ rc = gmap_mark_unmergeable();
+ if (rc) {
+ mm->context.uses_skeys = 0;
+ goto out_up;
}
- mm->def_flags &= ~VM_MERGEABLE;
-
walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL);
out_up:
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
return rc;
}
EXPORT_SYMBOL_GPL(s390_enable_skey);
@@ -2631,12 +2707,188 @@ static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
static const struct mm_walk_ops reset_cmma_walk_ops = {
.pte_entry = __s390_reset_cmma,
+ .walk_lock = PGWALK_WRLOCK,
};
void s390_reset_cmma(struct mm_struct *mm)
{
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL);
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
}
EXPORT_SYMBOL_GPL(s390_reset_cmma);
+
+#define GATHER_GET_PAGES 32
+
+struct reset_walk_state {
+ unsigned long next;
+ unsigned long count;
+ unsigned long pfns[GATHER_GET_PAGES];
+};
+
+static int s390_gather_pages(pte_t *ptep, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ struct reset_walk_state *p = walk->private;
+ pte_t pte = READ_ONCE(*ptep);
+
+ if (pte_present(pte)) {
+ /* we have a reference from the mapping, take an extra one */
+ get_page(phys_to_page(pte_val(pte)));
+ p->pfns[p->count] = phys_to_pfn(pte_val(pte));
+ p->next = next;
+ p->count++;
+ }
+ return p->count >= GATHER_GET_PAGES;
+}
+
+static const struct mm_walk_ops gather_pages_ops = {
+ .pte_entry = s390_gather_pages,
+ .walk_lock = PGWALK_RDLOCK,
+};
+
+/*
+ * Call the Destroy secure page UVC on each page in the given array of PFNs.
+ * Each page needs to have an extra reference, which will be released here.
+ */
+void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns)
+{
+ unsigned long i;
+
+ for (i = 0; i < count; i++) {
+ /* we always have an extra reference */
+ uv_destroy_owned_page(pfn_to_phys(pfns[i]));
+ /* get rid of the extra reference */
+ put_page(pfn_to_page(pfns[i]));
+ cond_resched();
+ }
+}
+EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns);
+
+/**
+ * __s390_uv_destroy_range - Call the destroy secure page UVC on each page
+ * in the given range of the given address space.
+ * @mm: the mm to operate on
+ * @start: the start of the range
+ * @end: the end of the range
+ * @interruptible: if not 0, stop when a fatal signal is received
+ *
+ * Walk the given range of the given address space and call the destroy
+ * secure page UVC on each page. Optionally exit early if a fatal signal is
+ * pending.
+ *
+ * Return: 0 on success, -EINTR if the function stopped before completing
+ */
+int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, bool interruptible)
+{
+ struct reset_walk_state state = { .next = start };
+ int r = 1;
+
+ while (r > 0) {
+ state.count = 0;
+ mmap_read_lock(mm);
+ r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state);
+ mmap_read_unlock(mm);
+ cond_resched();
+ s390_uv_destroy_pfns(state.count, state.pfns);
+ if (interruptible && fatal_signal_pending(current))
+ return -EINTR;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__s390_uv_destroy_range);
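
The walk above gathers up to GATHER_GET_PAGES present ptes per pass under the mmap read lock, takes an extra reference on each page, drops the lock, and only then issues the destroy UVC for each gathered page, repeating until the range is exhausted. A hedged sketch of how a caller might drive this for a whole guest address space (the function name and range are assumptions for illustration):

/* Hypothetical caller: destroy all secure pages of a guest mm before
 * teardown, bailing out early when a fatal signal is pending. */
static int destroy_guest_secure_pages(struct mm_struct *mm, unsigned long asce_limit)
{
	return __s390_uv_destroy_range(mm, 0, asce_limit, true);
}
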
+
+/**
+ * s390_unlist_old_asce - Remove the topmost level of page tables from the
+ * list of page tables of the gmap.
+ * @gmap: the gmap whose table is to be removed
+ *
+ * On s390x, KVM keeps a list of all pages containing the page tables of the
+ * gmap (the CRST list). This list is used at tear down time to free all
+ * pages that are now not needed anymore.
+ *
+ * This function removes the topmost page of the tree (the one pointed to by
+ * the ASCE) from the CRST list.
+ *
+ * This means that it will not be freed when the VM is torn down, and needs
+ * to be handled separately by the caller, unless a leak is actually
+ * intended. Notice that this function will only remove the page from the
+ * list, the page will still be used as a top level page table (and ASCE).
+ */
+void s390_unlist_old_asce(struct gmap *gmap)
+{
+ struct page *old;
+
+ old = virt_to_page(gmap->table);
+ spin_lock(&gmap->guest_table_lock);
+ list_del(&old->lru);
+ /*
+ * Sometimes the topmost page might need to be "removed" multiple
+ * times, for example if the VM is rebooted into secure mode several
+ * times concurrently, or if s390_replace_asce fails after calling
+ * s390_unlist_old_asce and is attempted again later. In that case
+ * the old asce has been removed from the list, and therefore it
+ * will not be freed when the VM terminates, but the ASCE is still
+ * in use and still pointed to.
+ * A subsequent call to replace_asce will follow the pointer and try
+ * to remove the same page from the list again.
+ * Therefore it's necessary that the page of the ASCE has valid
+ * pointers, so list_del can work (and do nothing) without
+ * dereferencing stale or invalid pointers.
+ */
+ INIT_LIST_HEAD(&old->lru);
+ spin_unlock(&gmap->guest_table_lock);
+}
+EXPORT_SYMBOL_GPL(s390_unlist_old_asce);
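
The INIT_LIST_HEAD() call above is what makes repeated unlisting of the same top-level page safe: deleting a self-linked node only rewrites that node's own pointers. A minimal sketch of the property being relied on (illustrative only, assuming list debugging does not trip on this path):

/* Illustrative only: after re-initialization the node points at itself,
 * so a later list_del() touches no other list element. */
static void unlist_twice_is_safe(struct list_head *node)
{
	list_del(node);		/* first removal, from the real list */
	INIT_LIST_HEAD(node);	/* node now links to itself */
	list_del(node);		/* harmless: rewrites only node's own links */
}
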
+
+/**
+ * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy
+ * @gmap: the gmap whose ASCE needs to be replaced
+ *
+ * If the ASCE is a SEGMENT type then this function will return -EINVAL;
+ * replacing such an ASCE would leave the pointers in the host_to_guest
+ * radix tree pointing to the wrong pages, causing use-after-free and
+ * memory corruption.
+ * If the allocation of the new top level page table fails, the ASCE is not
+ * replaced.
+ * In any case, the old ASCE is always removed from the gmap CRST list.
+ * Therefore the caller has to make sure to save a pointer to it
+ * beforehand, unless a leak is actually intended.
+ */
+int s390_replace_asce(struct gmap *gmap)
+{
+ unsigned long asce;
+ struct page *page;
+ void *table;
+
+ s390_unlist_old_asce(gmap);
+
+ /* Replacing segment type ASCEs would cause serious issues */
+ if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT)
+ return -EINVAL;
+
+ page = gmap_alloc_crst();
+ if (!page)
+ return -ENOMEM;
+ page->index = 0;
+ table = page_to_virt(page);
+ memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT));
+
+ /*
+ * The caller has to deal with the old ASCE, but here we make sure
+ * the new one is properly added to the CRST list, so that
+ * it will be freed when the VM is torn down.
+ */
+ spin_lock(&gmap->guest_table_lock);
+ list_add(&page->lru, &gmap->crst_list);
+ spin_unlock(&gmap->guest_table_lock);
+
+ /* Set new table origin while preserving existing ASCE control bits */
+ asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table);
+ WRITE_ONCE(gmap->asce, asce);
+ WRITE_ONCE(gmap->mm->context.gmap_asce, asce);
+ WRITE_ONCE(gmap->table, table);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(s390_replace_asce);
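
Because s390_replace_asce() always unlists the old top-level table first, that table is no longer freed at gmap teardown. A hedged caller sketch of the bookkeeping the kernel-doc above asks for (the helper name and ownership model are illustrative only):

/* Hypothetical caller: remember the old top-level table so it can be
 * freed explicitly later, then install the copy as the new ASCE. */
static int replace_asce_and_save_old(struct gmap *gmap, unsigned long **old_table)
{
	*old_table = gmap->table;	/* must be freed by the caller later */
	return s390_replace_asce(gmap);
}
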
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index b0246c705a19..297a6d897d5a 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -2,15 +2,19 @@
/*
* IBM System z Huge TLB Page Support for Kernel.
*
- * Copyright IBM Corp. 2007,2016
+ * Copyright IBM Corp. 2007,2020
* Author(s): Gerald Schaefer <gerald.schaefer@de.ibm.com>
*/
#define KMSG_COMPONENT "hugetlb"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#include <asm/pgalloc.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
+#include <linux/mman.h>
+#include <linux/sched/mm.h>
+#include <linux/security.h>
/*
* If the bit selected by single-bit bitmask "a" is set within "x", move
@@ -69,8 +73,8 @@ static inline unsigned long __pte_to_rste(pte_t pte)
static inline pte_t __rste_to_pte(unsigned long rste)
{
+ unsigned long pteval;
int present;
- pte_t pte;
if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
present = pud_present(__pud(rste));
@@ -98,29 +102,21 @@ static inline pte_t __rste_to_pte(unsigned long rste)
* u unused, l large
*/
if (present) {
- pte_val(pte) = rste & _SEGMENT_ENTRY_ORIGIN_LARGE;
- pte_val(pte) |= _PAGE_LARGE | _PAGE_PRESENT;
- pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_READ,
- _PAGE_READ);
- pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_WRITE,
- _PAGE_WRITE);
- pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_INVALID,
- _PAGE_INVALID);
- pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_PROTECT,
- _PAGE_PROTECT);
- pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_DIRTY,
- _PAGE_DIRTY);
- pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_YOUNG,
- _PAGE_YOUNG);
+ pteval = rste & _SEGMENT_ENTRY_ORIGIN_LARGE;
+ pteval |= _PAGE_LARGE | _PAGE_PRESENT;
+ pteval |= move_set_bit(rste, _SEGMENT_ENTRY_READ, _PAGE_READ);
+ pteval |= move_set_bit(rste, _SEGMENT_ENTRY_WRITE, _PAGE_WRITE);
+ pteval |= move_set_bit(rste, _SEGMENT_ENTRY_INVALID, _PAGE_INVALID);
+ pteval |= move_set_bit(rste, _SEGMENT_ENTRY_PROTECT, _PAGE_PROTECT);
+ pteval |= move_set_bit(rste, _SEGMENT_ENTRY_DIRTY, _PAGE_DIRTY);
+ pteval |= move_set_bit(rste, _SEGMENT_ENTRY_YOUNG, _PAGE_YOUNG);
#ifdef CONFIG_MEM_SOFT_DIRTY
- pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY,
- _PAGE_DIRTY);
+ pteval |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY, _PAGE_SOFT_DIRTY);
#endif
- pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_NOEXEC,
- _PAGE_NOEXEC);
+ pteval |= move_set_bit(rste, _SEGMENT_ENTRY_NOEXEC, _PAGE_NOEXEC);
} else
- pte_val(pte) = _PAGE_INVALID;
- return pte;
+ pteval = _PAGE_INVALID;
+ return __pte(pteval);
}
static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste)
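
The pte/rste conversion above relies on the bit-moving helper described by the comment near the top of this file. A hedged sketch of one plausible form of that helper (an illustration, not necessarily the exact definition in hugetlbpage.c):

#include <linux/log2.h>

/* If the single bit selected by mask a is set in x, return a value with
 * the single bit selected by mask b set; otherwise return 0. */
#define move_set_bit(x, a, b)	(((x) & (a)) >> ilog2(a) << ilog2(b))
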
@@ -146,7 +142,7 @@ static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste)
__storage_key_init_range(paddr, paddr + size - 1);
}
-void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte)
{
unsigned long rste;
@@ -156,12 +152,21 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
rste &= ~_SEGMENT_ENTRY_NOEXEC;
/* Set correct table type for 2G hugepages */
- if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
- rste |= _REGION_ENTRY_TYPE_R3 | _REGION3_ENTRY_LARGE;
- else
+ if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) {
+ if (likely(pte_present(pte)))
+ rste |= _REGION3_ENTRY_LARGE;
+ rste |= _REGION_ENTRY_TYPE_R3;
+ } else if (likely(pte_present(pte)))
rste |= _SEGMENT_ENTRY_LARGE;
+
clear_huge_pte_skeys(mm, rste);
- pte_val(*ptep) = rste;
+ set_pte(ptep, __pte(rste));
+}
+
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte, unsigned long sz)
+{
+ __set_huge_pte_at(mm, addr, ptep, pte);
}
pte_t huge_ptep_get(pte_t *ptep)
@@ -183,7 +188,7 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
return pte;
}
-pte_t *huge_pte_alloc(struct mm_struct *mm,
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, unsigned long sz)
{
pgd_t *pgdp;
@@ -238,32 +243,100 @@ int pud_huge(pud_t pud)
return pud_large(pud);
}
-struct page *
-follow_huge_pud(struct mm_struct *mm, unsigned long address,
- pud_t *pud, int flags)
+bool __init arch_hugetlb_valid_size(unsigned long size)
{
- if (flags & FOLL_GET)
- return NULL;
+ if (MACHINE_HAS_EDAT1 && size == PMD_SIZE)
+ return true;
+ else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE)
+ return true;
+ else
+ return false;
+}
- return pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
+static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
+ unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags)
+{
+ struct hstate *h = hstate_file(file);
+ struct vm_unmapped_area_info info;
+
+ info.flags = 0;
+ info.length = len;
+ info.low_limit = current->mm->mmap_base;
+ info.high_limit = TASK_SIZE;
+ info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+ info.align_offset = 0;
+ return vm_unmapped_area(&info);
}
-static __init int setup_hugepagesz(char *opt)
+static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
+ unsigned long addr0, unsigned long len,
+ unsigned long pgoff, unsigned long flags)
{
- unsigned long size;
- char *string = opt;
-
- size = memparse(opt, &opt);
- if (MACHINE_HAS_EDAT1 && size == PMD_SIZE) {
- hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
- } else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE) {
- hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
- } else {
- hugetlb_bad_size();
- pr_err("hugepagesz= specifies an unsupported page size %s\n",
- string);
- return 0;
+ struct hstate *h = hstate_file(file);
+ struct vm_unmapped_area_info info;
+ unsigned long addr;
+
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ info.length = len;
+ info.low_limit = PAGE_SIZE;
+ info.high_limit = current->mm->mmap_base;
+ info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+ info.align_offset = 0;
+ addr = vm_unmapped_area(&info);
+
+ /*
+ * A failed mmap() very likely causes application failure,
+ * so fall back to the bottom-up function here. This scenario
+ * can happen with large stack limits and large mmap()
+ * allocations.
+ */
+ if (addr & ~PAGE_MASK) {
+ VM_BUG_ON(addr != -ENOMEM);
+ info.flags = 0;
+ info.low_limit = TASK_UNMAPPED_BASE;
+ info.high_limit = TASK_SIZE;
+ addr = vm_unmapped_area(&info);
}
- return 1;
+
+ return addr;
+}
+
+unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+ struct hstate *h = hstate_file(file);
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+
+ if (len & ~huge_page_mask(h))
+ return -EINVAL;
+ if (len > TASK_SIZE - mmap_min_addr)
+ return -ENOMEM;
+
+ if (flags & MAP_FIXED) {
+ if (prepare_hugepage_range(file, addr, len))
+ return -EINVAL;
+ goto check_asce_limit;
+ }
+
+ if (addr) {
+ addr = ALIGN(addr, huge_page_size(h));
+ vma = find_vma(mm, addr);
+ if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
+ (!vma || addr + len <= vm_start_gap(vma)))
+ goto check_asce_limit;
+ }
+
+ if (mm->get_unmapped_area == arch_get_unmapped_area)
+ addr = hugetlb_get_unmapped_area_bottomup(file, addr, len,
+ pgoff, flags);
+ else
+ addr = hugetlb_get_unmapped_area_topdown(file, addr, len,
+ pgoff, flags);
+ if (offset_in_page(addr))
+ return addr;
+
+check_asce_limit:
+ return check_asce_limit(mm, addr, len);
}
-__setup("hugepagesz=", setup_hugepagesz);
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index f0ce22220565..43e612bc2bcd 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -31,30 +31,35 @@
#include <linux/cma.h>
#include <linux/gfp.h>
#include <linux/dma-direct.h>
+#include <linux/percpu.h>
#include <asm/processor.h>
#include <linux/uaccess.h>
-#include <asm/pgtable.h>
#include <asm/pgalloc.h>
+#include <asm/ctlreg.h>
+#include <asm/kfence.h>
+#include <asm/ptdump.h>
#include <asm/dma.h>
-#include <asm/lowcore.h>
+#include <asm/abs_lowcore.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
-#include <asm/ctl_reg.h>
#include <asm/sclp.h>
#include <asm/set_memory.h>
#include <asm/kasan.h>
#include <asm/dma-mapping.h>
#include <asm/uv.h>
+#include <linux/virtio_anchor.h>
+#include <linux/virtio_config.h>
-pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(.bss..swapper_pg_dir);
+pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(".bss..swapper_pg_dir");
+pgd_t invalid_pg_dir[PTRS_PER_PGD] __section(".bss..invalid_pg_dir");
+
+struct ctlreg __bootdata_preserved(s390_invalid_asce);
unsigned long empty_zero_page, zero_page_mask;
EXPORT_SYMBOL(empty_zero_page);
EXPORT_SYMBOL(zero_page_mask);
-bool initmem_freed;
-
static void __init setup_zero_pages(void)
{
unsigned int order;
@@ -88,70 +93,44 @@ static void __init setup_zero_pages(void)
void __init paging_init(void)
{
unsigned long max_zone_pfns[MAX_NR_ZONES];
- unsigned long pgd_type, asce_bits;
- psw_t psw;
-
- init_mm.pgd = swapper_pg_dir;
- if (VMALLOC_END > _REGION2_SIZE) {
- asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
- pgd_type = _REGION2_ENTRY_EMPTY;
- } else {
- asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
- pgd_type = _REGION3_ENTRY_EMPTY;
- }
- init_mm.context.asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits;
- S390_lowcore.kernel_asce = init_mm.context.asce;
- S390_lowcore.user_asce = S390_lowcore.kernel_asce;
- crst_table_init((unsigned long *) init_mm.pgd, pgd_type);
+
vmem_map_init();
- kasan_copy_shadow(init_mm.pgd);
-
- /* enable virtual mapping in kernel mode */
- __ctl_load(S390_lowcore.kernel_asce, 1, 1);
- __ctl_load(S390_lowcore.kernel_asce, 7, 7);
- __ctl_load(S390_lowcore.kernel_asce, 13, 13);
- psw.mask = __extract_psw();
- psw_bits(psw).dat = 1;
- psw_bits(psw).as = PSW_BITS_AS_HOME;
- __load_psw_mask(psw.mask);
- kasan_free_early_identity();
-
- sparse_memory_present_with_active_regions(MAX_NUMNODES);
sparse_init();
zone_dma_bits = 31;
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
- max_zone_pfns[ZONE_DMA] = PFN_DOWN(MAX_DMA_ADDRESS);
+ max_zone_pfns[ZONE_DMA] = virt_to_pfn(MAX_DMA_ADDRESS);
max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
- free_area_init_nodes(max_zone_pfns);
+ free_area_init(max_zone_pfns);
}
void mark_rodata_ro(void)
{
unsigned long size = __end_ro_after_init - __start_ro_after_init;
- set_memory_ro((unsigned long)__start_ro_after_init, size >> PAGE_SHIFT);
+ __set_memory_ro(__start_ro_after_init, __end_ro_after_init);
pr_info("Write protected read-only-after-init data: %luk\n", size >> 10);
+ debug_checkwx();
}
-int set_memory_encrypted(unsigned long addr, int numpages)
+int set_memory_encrypted(unsigned long vaddr, int numpages)
{
int i;
/* make specified pages unshared (swiotlb, dma_free) */
for (i = 0; i < numpages; ++i) {
- uv_remove_shared(addr);
- addr += PAGE_SIZE;
+ uv_remove_shared(virt_to_phys((void *)vaddr));
+ vaddr += PAGE_SIZE;
}
return 0;
}
-int set_memory_decrypted(unsigned long addr, int numpages)
+int set_memory_decrypted(unsigned long vaddr, int numpages)
{
int i;
/* make specified pages shared (swiotlb, dma_alloc) */
for (i = 0; i < numpages; ++i) {
- uv_set_shared(addr);
- addr += PAGE_SIZE;
+ uv_set_shared(virt_to_phys((void *)vaddr));
+ vaddr += PAGE_SIZE;
}
return 0;
}
@@ -168,10 +147,11 @@ static void pv_init(void)
if (!is_prot_virt_guest())
return;
+ virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc);
+
/* make sure bounce buffers are shared */
- swiotlb_init(1);
+ swiotlb_init(true, SWIOTLB_FORCE | SWIOTLB_VERBOSE);
swiotlb_update_mem_attributes();
- swiotlb_force = SWIOTLB_FORCE;
}
void __init mem_init(void)
@@ -183,25 +163,17 @@ void __init mem_init(void)
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
pv_init();
-
- /* Setup guest page hinting */
- cmma_init();
+ kfence_split_mapping();
/* this will put all low memory onto the freelists */
memblock_free_all();
setup_zero_pages(); /* Setup zeroed pages. */
-
- cmma_init_nodat();
-
- mem_init_print_info(NULL);
}
void free_initmem(void)
{
- initmem_freed = true;
- __set_memory((unsigned long)_sinittext,
- (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT,
- SET_MEMORY_RW | SET_MEMORY_NX);
+ set_memory_rwnx((unsigned long)_sinittext,
+ (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT);
free_initmem_default(POISON_FREE_INITMEM);
}
@@ -214,6 +186,41 @@ unsigned long memory_block_size_bytes(void)
return max_t(unsigned long, MIN_MEMORY_BLOCK_SIZE, sclp.rzm);
}
+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
+EXPORT_SYMBOL(__per_cpu_offset);
+
+static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
+{
+ return LOCAL_DISTANCE;
+}
+
+static int __init pcpu_cpu_to_node(int cpu)
+{
+ return 0;
+}
+
+void __init setup_per_cpu_areas(void)
+{
+ unsigned long delta;
+ unsigned int cpu;
+ int rc;
+
+ /*
+ * Always reserve area for module percpu variables. That's
+ * what the legacy allocator did.
+ */
+ rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
+ PERCPU_DYNAMIC_RESERVE, PAGE_SIZE,
+ pcpu_cpu_distance,
+ pcpu_cpu_to_node);
+ if (rc < 0)
+ panic("Failed to initialize percpu areas.");
+
+ delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
+ for_each_possible_cpu(cpu)
+ __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
+}
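
setup_per_cpu_areas() above records, for each CPU, the delta between the static per-cpu template section and that CPU's chunk. Generic per-cpu accessors then add __per_cpu_offset[cpu] to the address of the template variable; a hedged illustration of that arithmetic follows (not the real accessor implementation):

/* Illustrative only: conceptually how a per-CPU pointer is formed from
 * the offsets computed in setup_per_cpu_areas() above. */
#define illustrative_per_cpu_ptr(var, cpu) \
	((typeof(var) *)((unsigned long)&(var) + __per_cpu_offset[cpu]))
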
+
#ifdef CONFIG_MEMORY_HOTPLUG
#ifdef CONFIG_CMA
@@ -268,34 +275,35 @@ device_initcall(s390_cma_mem_init);
#endif /* CONFIG_CMA */
int arch_add_memory(int nid, u64 start, u64 size,
- struct mhp_restrictions *restrictions)
+ struct mhp_params *params)
{
unsigned long start_pfn = PFN_DOWN(start);
unsigned long size_pages = PFN_DOWN(size);
int rc;
- if (WARN_ON_ONCE(restrictions->altmap))
+ if (WARN_ON_ONCE(params->altmap))
+ return -EINVAL;
+
+ if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot))
return -EINVAL;
+ VM_BUG_ON(!mhp_range_allowed(start, size, true));
rc = vmem_add_mapping(start, size);
if (rc)
return rc;
- rc = __add_pages(nid, start_pfn, size_pages, restrictions);
+ rc = __add_pages(nid, start_pfn, size_pages, params);
if (rc)
vmem_remove_mapping(start, size);
return rc;
}
-void arch_remove_memory(int nid, u64 start, u64 size,
- struct vmem_altmap *altmap)
+void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
- struct zone *zone;
- zone = page_zone(pfn_to_page(start_pfn));
- __remove_pages(zone, start_pfn, nr_pages, altmap);
+ __remove_pages(start_pfn, nr_pages, altmap);
vmem_remove_mapping(start, size);
}
#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/s390/mm/kasan_init.c b/arch/s390/mm/kasan_init.c
deleted file mode 100644
index 460f25572940..000000000000
--- a/arch/s390/mm/kasan_init.c
+++ /dev/null
@@ -1,382 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/kasan.h>
-#include <linux/sched/task.h>
-#include <linux/memblock.h>
-#include <asm/pgalloc.h>
-#include <asm/pgtable.h>
-#include <asm/kasan.h>
-#include <asm/mem_detect.h>
-#include <asm/processor.h>
-#include <asm/sclp.h>
-#include <asm/facility.h>
-#include <asm/sections.h>
-#include <asm/setup.h>
-
-static unsigned long segment_pos __initdata;
-static unsigned long segment_low __initdata;
-static unsigned long pgalloc_pos __initdata;
-static unsigned long pgalloc_low __initdata;
-static unsigned long pgalloc_freeable __initdata;
-static bool has_edat __initdata;
-static bool has_nx __initdata;
-
-#define __sha(x) ((unsigned long)kasan_mem_to_shadow((void *)x))
-
-static pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE);
-
-static void __init kasan_early_panic(const char *reason)
-{
- sclp_early_printk("The Linux kernel failed to boot with the KernelAddressSanitizer:\n");
- sclp_early_printk(reason);
- disabled_wait();
-}
-
-static void * __init kasan_early_alloc_segment(void)
-{
- segment_pos -= _SEGMENT_SIZE;
-
- if (segment_pos < segment_low)
- kasan_early_panic("out of memory during initialisation\n");
-
- return (void *)segment_pos;
-}
-
-static void * __init kasan_early_alloc_pages(unsigned int order)
-{
- pgalloc_pos -= (PAGE_SIZE << order);
-
- if (pgalloc_pos < pgalloc_low)
- kasan_early_panic("out of memory during initialisation\n");
-
- return (void *)pgalloc_pos;
-}
-
-static void * __init kasan_early_crst_alloc(unsigned long val)
-{
- unsigned long *table;
-
- table = kasan_early_alloc_pages(CRST_ALLOC_ORDER);
- if (table)
- crst_table_init(table, val);
- return table;
-}
-
-static pte_t * __init kasan_early_pte_alloc(void)
-{
- static void *pte_leftover;
- pte_t *pte;
-
- BUILD_BUG_ON(_PAGE_TABLE_SIZE * 2 != PAGE_SIZE);
-
- if (!pte_leftover) {
- pte_leftover = kasan_early_alloc_pages(0);
- pte = pte_leftover + _PAGE_TABLE_SIZE;
- } else {
- pte = pte_leftover;
- pte_leftover = NULL;
- }
- memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE);
- return pte;
-}
-
-enum populate_mode {
- POPULATE_ONE2ONE,
- POPULATE_MAP,
- POPULATE_ZERO_SHADOW
-};
-static void __init kasan_early_vmemmap_populate(unsigned long address,
- unsigned long end,
- enum populate_mode mode)
-{
- unsigned long pgt_prot_zero, pgt_prot, sgt_prot;
- pgd_t *pg_dir;
- p4d_t *p4_dir;
- pud_t *pu_dir;
- pmd_t *pm_dir;
- pte_t *pt_dir;
-
- pgt_prot_zero = pgprot_val(PAGE_KERNEL_RO);
- if (!has_nx)
- pgt_prot_zero &= ~_PAGE_NOEXEC;
- pgt_prot = pgprot_val(PAGE_KERNEL_EXEC);
- sgt_prot = pgprot_val(SEGMENT_KERNEL_EXEC);
-
- while (address < end) {
- pg_dir = pgd_offset_k(address);
- if (pgd_none(*pg_dir)) {
- if (mode == POPULATE_ZERO_SHADOW &&
- IS_ALIGNED(address, PGDIR_SIZE) &&
- end - address >= PGDIR_SIZE) {
- pgd_populate(&init_mm, pg_dir,
- kasan_early_shadow_p4d);
- address = (address + PGDIR_SIZE) & PGDIR_MASK;
- continue;
- }
- p4_dir = kasan_early_crst_alloc(_REGION2_ENTRY_EMPTY);
- pgd_populate(&init_mm, pg_dir, p4_dir);
- }
-
- p4_dir = p4d_offset(pg_dir, address);
- if (p4d_none(*p4_dir)) {
- if (mode == POPULATE_ZERO_SHADOW &&
- IS_ALIGNED(address, P4D_SIZE) &&
- end - address >= P4D_SIZE) {
- p4d_populate(&init_mm, p4_dir,
- kasan_early_shadow_pud);
- address = (address + P4D_SIZE) & P4D_MASK;
- continue;
- }
- pu_dir = kasan_early_crst_alloc(_REGION3_ENTRY_EMPTY);
- p4d_populate(&init_mm, p4_dir, pu_dir);
- }
-
- pu_dir = pud_offset(p4_dir, address);
- if (pud_none(*pu_dir)) {
- if (mode == POPULATE_ZERO_SHADOW &&
- IS_ALIGNED(address, PUD_SIZE) &&
- end - address >= PUD_SIZE) {
- pud_populate(&init_mm, pu_dir,
- kasan_early_shadow_pmd);
- address = (address + PUD_SIZE) & PUD_MASK;
- continue;
- }
- pm_dir = kasan_early_crst_alloc(_SEGMENT_ENTRY_EMPTY);
- pud_populate(&init_mm, pu_dir, pm_dir);
- }
-
- pm_dir = pmd_offset(pu_dir, address);
- if (pmd_none(*pm_dir)) {
- if (mode == POPULATE_ZERO_SHADOW &&
- IS_ALIGNED(address, PMD_SIZE) &&
- end - address >= PMD_SIZE) {
- pmd_populate(&init_mm, pm_dir,
- kasan_early_shadow_pte);
- address = (address + PMD_SIZE) & PMD_MASK;
- continue;
- }
- /* the first megabyte of 1:1 is mapped with 4k pages */
- if (has_edat && address && end - address >= PMD_SIZE &&
- mode != POPULATE_ZERO_SHADOW) {
- void *page;
-
- if (mode == POPULATE_ONE2ONE) {
- page = (void *)address;
- } else {
- page = kasan_early_alloc_segment();
- memset(page, 0, _SEGMENT_SIZE);
- }
- pmd_val(*pm_dir) = __pa(page) | sgt_prot;
- address = (address + PMD_SIZE) & PMD_MASK;
- continue;
- }
-
- pt_dir = kasan_early_pte_alloc();
- pmd_populate(&init_mm, pm_dir, pt_dir);
- } else if (pmd_large(*pm_dir)) {
- address = (address + PMD_SIZE) & PMD_MASK;
- continue;
- }
-
- pt_dir = pte_offset_kernel(pm_dir, address);
- if (pte_none(*pt_dir)) {
- void *page;
-
- switch (mode) {
- case POPULATE_ONE2ONE:
- page = (void *)address;
- pte_val(*pt_dir) = __pa(page) | pgt_prot;
- break;
- case POPULATE_MAP:
- page = kasan_early_alloc_pages(0);
- memset(page, 0, PAGE_SIZE);
- pte_val(*pt_dir) = __pa(page) | pgt_prot;
- break;
- case POPULATE_ZERO_SHADOW:
- page = kasan_early_shadow_page;
- pte_val(*pt_dir) = __pa(page) | pgt_prot_zero;
- break;
- }
- }
- address += PAGE_SIZE;
- }
-}
-
-static void __init kasan_set_pgd(pgd_t *pgd, unsigned long asce_type)
-{
- unsigned long asce_bits;
-
- asce_bits = asce_type | _ASCE_TABLE_LENGTH;
- S390_lowcore.kernel_asce = (__pa(pgd) & PAGE_MASK) | asce_bits;
- S390_lowcore.user_asce = S390_lowcore.kernel_asce;
-
- __ctl_load(S390_lowcore.kernel_asce, 1, 1);
- __ctl_load(S390_lowcore.kernel_asce, 7, 7);
- __ctl_load(S390_lowcore.kernel_asce, 13, 13);
-}
-
-static void __init kasan_enable_dat(void)
-{
- psw_t psw;
-
- psw.mask = __extract_psw();
- psw_bits(psw).dat = 1;
- psw_bits(psw).as = PSW_BITS_AS_HOME;
- __load_psw_mask(psw.mask);
-}
-
-static void __init kasan_early_detect_facilities(void)
-{
- if (test_facility(8)) {
- has_edat = true;
- __ctl_set_bit(0, 23);
- }
- if (!noexec_disabled && test_facility(130)) {
- has_nx = true;
- __ctl_set_bit(0, 20);
- }
-}
-
-void __init kasan_early_init(void)
-{
- unsigned long untracked_mem_end;
- unsigned long shadow_alloc_size;
- unsigned long initrd_end;
- unsigned long asce_type;
- unsigned long memsize;
- unsigned long vmax;
- unsigned long pgt_prot = pgprot_val(PAGE_KERNEL_RO);
- pte_t pte_z;
- pmd_t pmd_z = __pmd(__pa(kasan_early_shadow_pte) | _SEGMENT_ENTRY);
- pud_t pud_z = __pud(__pa(kasan_early_shadow_pmd) | _REGION3_ENTRY);
- p4d_t p4d_z = __p4d(__pa(kasan_early_shadow_pud) | _REGION2_ENTRY);
-
- kasan_early_detect_facilities();
- if (!has_nx)
- pgt_prot &= ~_PAGE_NOEXEC;
- pte_z = __pte(__pa(kasan_early_shadow_page) | pgt_prot);
-
- memsize = get_mem_detect_end();
- if (!memsize)
- kasan_early_panic("cannot detect physical memory size\n");
- /* respect mem= cmdline parameter */
- if (memory_end_set && memsize > memory_end)
- memsize = memory_end;
- if (IS_ENABLED(CONFIG_CRASH_DUMP) && OLDMEM_BASE)
- memsize = min(memsize, OLDMEM_SIZE);
- memsize = min(memsize, KASAN_SHADOW_START);
-
- if (IS_ENABLED(CONFIG_KASAN_S390_4_LEVEL_PAGING)) {
- /* 4 level paging */
- BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, P4D_SIZE));
- BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, P4D_SIZE));
- crst_table_init((unsigned long *)early_pg_dir,
- _REGION2_ENTRY_EMPTY);
- untracked_mem_end = vmax = _REGION1_SIZE;
- asce_type = _ASCE_TYPE_REGION2;
- } else {
- /* 3 level paging */
- BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, PUD_SIZE));
- BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, PUD_SIZE));
- crst_table_init((unsigned long *)early_pg_dir,
- _REGION3_ENTRY_EMPTY);
- untracked_mem_end = vmax = _REGION2_SIZE;
- asce_type = _ASCE_TYPE_REGION3;
- }
-
- /* init kasan zero shadow */
- crst_table_init((unsigned long *)kasan_early_shadow_p4d,
- p4d_val(p4d_z));
- crst_table_init((unsigned long *)kasan_early_shadow_pud,
- pud_val(pud_z));
- crst_table_init((unsigned long *)kasan_early_shadow_pmd,
- pmd_val(pmd_z));
- memset64((u64 *)kasan_early_shadow_pte, pte_val(pte_z), PTRS_PER_PTE);
-
- shadow_alloc_size = memsize >> KASAN_SHADOW_SCALE_SHIFT;
- pgalloc_low = round_up((unsigned long)_end, _SEGMENT_SIZE);
- if (IS_ENABLED(CONFIG_BLK_DEV_INITRD)) {
- initrd_end =
- round_up(INITRD_START + INITRD_SIZE, _SEGMENT_SIZE);
- pgalloc_low = max(pgalloc_low, initrd_end);
- }
-
- if (pgalloc_low + shadow_alloc_size > memsize)
- kasan_early_panic("out of memory during initialisation\n");
-
- if (has_edat) {
- segment_pos = round_down(memsize, _SEGMENT_SIZE);
- segment_low = segment_pos - shadow_alloc_size;
- pgalloc_pos = segment_low;
- } else {
- pgalloc_pos = memsize;
- }
- init_mm.pgd = early_pg_dir;
- /*
- * Current memory layout:
- * +- 0 -------------+ +- shadow start -+
- * | 1:1 ram mapping | /| 1/8 ram |
- * +- end of ram ----+ / +----------------+
- * | ... gap ... |/ | kasan |
- * +- shadow start --+ | zero |
- * | 1/8 addr space | | page |
- * +- shadow end -+ | mapping |
- * | ... gap ... |\ | (untracked) |
- * +- modules vaddr -+ \ +----------------+
- * | 2Gb | \| unmapped | allocated per module
- * +-----------------+ +- shadow end ---+
- */
- /* populate kasan shadow (for identity mapping and zero page mapping) */
- kasan_early_vmemmap_populate(__sha(0), __sha(memsize), POPULATE_MAP);
- if (IS_ENABLED(CONFIG_MODULES))
- untracked_mem_end = vmax - MODULES_LEN;
- kasan_early_vmemmap_populate(__sha(max_physmem_end),
- __sha(untracked_mem_end),
- POPULATE_ZERO_SHADOW);
- /* memory allocated for identity mapping structs will be freed later */
- pgalloc_freeable = pgalloc_pos;
- /* populate identity mapping */
- kasan_early_vmemmap_populate(0, memsize, POPULATE_ONE2ONE);
- kasan_set_pgd(early_pg_dir, asce_type);
- kasan_enable_dat();
- /* enable kasan */
- init_task.kasan_depth = 0;
- memblock_reserve(pgalloc_pos, memsize - pgalloc_pos);
- sclp_early_printk("KernelAddressSanitizer initialized\n");
-}
-
-void __init kasan_copy_shadow(pgd_t *pg_dir)
-{
- /*
- * At this point we are still running on early pages setup early_pg_dir,
- * while swapper_pg_dir has just been initialized with identity mapping.
- * Carry over shadow memory region from early_pg_dir to swapper_pg_dir.
- */
-
- pgd_t *pg_dir_src;
- pgd_t *pg_dir_dst;
- p4d_t *p4_dir_src;
- p4d_t *p4_dir_dst;
- pud_t *pu_dir_src;
- pud_t *pu_dir_dst;
-
- pg_dir_src = pgd_offset_raw(early_pg_dir, KASAN_SHADOW_START);
- pg_dir_dst = pgd_offset_raw(pg_dir, KASAN_SHADOW_START);
- p4_dir_src = p4d_offset(pg_dir_src, KASAN_SHADOW_START);
- p4_dir_dst = p4d_offset(pg_dir_dst, KASAN_SHADOW_START);
- if (!p4d_folded(*p4_dir_src)) {
- /* 4 level paging */
- memcpy(p4_dir_dst, p4_dir_src,
- (KASAN_SHADOW_SIZE >> P4D_SHIFT) * sizeof(p4d_t));
- return;
- }
- /* 3 level paging */
- pu_dir_src = pud_offset(p4_dir_src, KASAN_SHADOW_START);
- pu_dir_dst = pud_offset(p4_dir_dst, KASAN_SHADOW_START);
- memcpy(pu_dir_dst, pu_dir_src,
- (KASAN_SHADOW_SIZE >> PUD_SHIFT) * sizeof(pud_t));
-}
-
-void __init kasan_free_early_identity(void)
-{
- memblock_free(pgalloc_pos, pgalloc_freeable - pgalloc_pos);
-}
diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c
index de7ca4b6718f..632c3a55feed 100644
--- a/arch/s390/mm/maccess.c
+++ b/arch/s390/mm/maccess.c
@@ -4,8 +4,6 @@
*
* Copyright IBM Corp. 2009, 2015
*
- * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>,
- *
*/
#include <linux/uaccess.h>
@@ -14,9 +12,17 @@
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/cpu.h>
-#include <asm/ctl_reg.h>
-#include <asm/io.h>
+#include <linux/uio.h>
+#include <linux/io.h>
+#include <asm/asm-extable.h>
+#include <asm/abs_lowcore.h>
#include <asm/stacktrace.h>
+#include <asm/maccess.h>
+#include <asm/ctlreg.h>
+
+unsigned long __bootdata_preserved(__memcpy_real_area);
+pte_t *__bootdata_preserved(memcpy_real_ptep);
+static DEFINE_MUTEX(memcpy_real_mutex);
static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t size)
{
@@ -55,155 +61,84 @@ static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t siz
*/
static DEFINE_SPINLOCK(s390_kernel_write_lock);
-void notrace s390_kernel_write(void *dst, const void *src, size_t size)
+notrace void *s390_kernel_write(void *dst, const void *src, size_t size)
{
+ void *tmp = dst;
unsigned long flags;
long copied;
spin_lock_irqsave(&s390_kernel_write_lock, flags);
while (size) {
- copied = s390_kernel_write_odd(dst, src, size);
- dst += copied;
+ copied = s390_kernel_write_odd(tmp, src, size);
+ tmp += copied;
src += copied;
size -= copied;
}
spin_unlock_irqrestore(&s390_kernel_write_lock, flags);
-}
-
-static int __no_sanitize_address __memcpy_real(void *dest, void *src, size_t count)
-{
- register unsigned long _dest asm("2") = (unsigned long) dest;
- register unsigned long _len1 asm("3") = (unsigned long) count;
- register unsigned long _src asm("4") = (unsigned long) src;
- register unsigned long _len2 asm("5") = (unsigned long) count;
- int rc = -EFAULT;
-
- asm volatile (
- "0: mvcle %1,%2,0x0\n"
- "1: jo 0b\n"
- " lhi %0,0x0\n"
- "2:\n"
- EX_TABLE(1b,2b)
- : "+d" (rc), "+d" (_dest), "+d" (_src), "+d" (_len1),
- "+d" (_len2), "=m" (*((long *) dest))
- : "m" (*((long *) src))
- : "cc", "memory");
- return rc;
-}
-
-static unsigned long __no_sanitize_address _memcpy_real(unsigned long dest,
- unsigned long src,
- unsigned long count)
-{
- int irqs_disabled, rc;
- unsigned long flags;
- if (!count)
- return 0;
- flags = arch_local_irq_save();
- irqs_disabled = arch_irqs_disabled_flags(flags);
- if (!irqs_disabled)
- trace_hardirqs_off();
- __arch_local_irq_stnsm(0xf8); // disable DAT
- rc = __memcpy_real((void *) dest, (void *) src, (size_t) count);
- if (flags & PSW_MASK_DAT)
- __arch_local_irq_stosm(0x04); // enable DAT
- if (!irqs_disabled)
- trace_hardirqs_on();
- __arch_local_irq_ssm(flags);
- return rc;
+ return dst;
}
-/*
- * Copy memory in real mode (kernel to kernel)
- */
-int memcpy_real(void *dest, void *src, size_t count)
+size_t memcpy_real_iter(struct iov_iter *iter, unsigned long src, size_t count)
{
- int rc;
-
- if (S390_lowcore.nodat_stack != 0) {
- preempt_disable();
- rc = CALL_ON_STACK(_memcpy_real, S390_lowcore.nodat_stack, 3,
- dest, src, count);
- preempt_enable();
- return rc;
- }
- /*
- * This is a really early memcpy_real call, the stacks are
- * not set up yet. Just call _memcpy_real on the early boot
- * stack
- */
- return _memcpy_real((unsigned long) dest,(unsigned long) src,
- (unsigned long) count);
-}
-
-/*
- * Copy memory in absolute mode (kernel to kernel)
- */
-void memcpy_absolute(void *dest, void *src, size_t count)
-{
- unsigned long cr0, flags, prefix;
-
- flags = arch_local_irq_save();
- __ctl_store(cr0, 0, 0);
- __ctl_clear_bit(0, 28); /* disable lowcore protection */
- prefix = store_prefix();
- if (prefix) {
- local_mcck_disable();
- set_prefix(0);
- memcpy(dest, src, count);
- set_prefix(prefix);
- local_mcck_enable();
- } else {
- memcpy(dest, src, count);
+ size_t len, copied, res = 0;
+ unsigned long phys, offset;
+ void *chunk;
+ pte_t pte;
+
+ BUILD_BUG_ON(MEMCPY_REAL_SIZE != PAGE_SIZE);
+ while (count) {
+ phys = src & MEMCPY_REAL_MASK;
+ offset = src & ~MEMCPY_REAL_MASK;
+ chunk = (void *)(__memcpy_real_area + offset);
+ len = min(count, MEMCPY_REAL_SIZE - offset);
+ pte = mk_pte_phys(phys, PAGE_KERNEL_RO);
+
+ mutex_lock(&memcpy_real_mutex);
+ if (pte_val(pte) != pte_val(*memcpy_real_ptep)) {
+ __ptep_ipte(__memcpy_real_area, memcpy_real_ptep, 0, 0, IPTE_GLOBAL);
+ set_pte(memcpy_real_ptep, pte);
+ }
+ copied = copy_to_iter(chunk, len, iter);
+ mutex_unlock(&memcpy_real_mutex);
+
+ count -= copied;
+ src += copied;
+ res += copied;
+ if (copied < len)
+ break;
}
- __ctl_load(cr0, 0, 0);
- arch_local_irq_restore(flags);
+ return res;
}
-/*
- * Copy memory from kernel (real) to user (virtual)
- */
-int copy_to_user_real(void __user *dest, void *src, unsigned long count)
+int memcpy_real(void *dest, unsigned long src, size_t count)
{
- int offs = 0, size, rc;
- char *buf;
-
- buf = (char *) __get_free_page(GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
- rc = -EFAULT;
- while (offs < count) {
- size = min(PAGE_SIZE, count - offs);
- if (memcpy_real(buf, src + offs, size))
- goto out;
- if (copy_to_user(dest + offs, buf, size))
- goto out;
- offs += size;
- }
- rc = 0;
-out:
- free_page((unsigned long) buf);
- return rc;
+ struct iov_iter iter;
+ struct kvec kvec;
+
+ kvec.iov_base = dest;
+ kvec.iov_len = count;
+ iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count);
+ if (memcpy_real_iter(&iter, src, count) < count)
+ return -EFAULT;
+ return 0;
}
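memcpy_real() above is now only a kvec wrapper around memcpy_real_iter(); the iterator interface also allows copying real memory straight into a user buffer without an intermediate kernel bounce page. A minimal sketch of that pattern, assuming a crash-dump style read path (the helper name and its callers are illustrative, not part of this patch):

static ssize_t copy_real_to_user(void __user *dst, unsigned long src, size_t count)
{
	struct iov_iter iter;
	struct iovec iov = {
		.iov_base = dst,
		.iov_len = count,
	};
	size_t copied;

	/* ITER_DEST: data flows from real memory at 'src' into the iterator */
	iov_iter_init(&iter, ITER_DEST, &iov, 1, count);
	copied = memcpy_real_iter(&iter, src, count);
	return copied < count ? -EFAULT : copied;
}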
/*
- * Check if physical address is within prefix or zero page
+ * Find CPU that owns swapped prefix page
*/
-static int is_swapped(unsigned long addr)
+static int get_swapped_owner(phys_addr_t addr)
{
- unsigned long lc;
+ phys_addr_t lc;
int cpu;
- if (addr < sizeof(struct lowcore))
- return 1;
for_each_online_cpu(cpu) {
- lc = (unsigned long) lowcore_ptr[cpu];
+ lc = virt_to_phys(lowcore_ptr[cpu]);
if (addr > lc + sizeof(struct lowcore) - 1 || addr < lc)
continue;
- return 1;
+ return cpu;
}
- return 0;
+ return -1;
}
/*
@@ -214,27 +149,45 @@ static int is_swapped(unsigned long addr)
*/
void *xlate_dev_mem_ptr(phys_addr_t addr)
{
- void *bounce = (void *) addr;
+ void *ptr = phys_to_virt(addr);
+ void *bounce = ptr;
+ struct lowcore *abs_lc;
unsigned long size;
+ int this_cpu, cpu;
- get_online_cpus();
- preempt_disable();
- if (is_swapped(addr)) {
- size = PAGE_SIZE - (addr & ~PAGE_MASK);
- bounce = (void *) __get_free_page(GFP_ATOMIC);
- if (bounce)
- memcpy_absolute(bounce, (void *) addr, size);
+ cpus_read_lock();
+ this_cpu = get_cpu();
+ if (addr >= sizeof(struct lowcore)) {
+ cpu = get_swapped_owner(addr);
+ if (cpu < 0)
+ goto out;
}
- preempt_enable();
- put_online_cpus();
+ bounce = (void *)__get_free_page(GFP_ATOMIC);
+ if (!bounce)
+ goto out;
+ size = PAGE_SIZE - (addr & ~PAGE_MASK);
+ if (addr < sizeof(struct lowcore)) {
+ abs_lc = get_abs_lowcore();
+ ptr = (void *)abs_lc + addr;
+ memcpy(bounce, ptr, size);
+ put_abs_lowcore(abs_lc);
+ } else if (cpu == this_cpu) {
+ ptr = (void *)(addr - virt_to_phys(lowcore_ptr[cpu]));
+ memcpy(bounce, ptr, size);
+ } else {
+ memcpy(bounce, ptr, size);
+ }
+out:
+ put_cpu();
+ cpus_read_unlock();
return bounce;
}
/*
* Free converted buffer for /dev/mem access (if necessary)
*/
-void unxlate_dev_mem_ptr(phys_addr_t addr, void *buf)
+void unxlate_dev_mem_ptr(phys_addr_t addr, void *ptr)
{
- if ((void *) addr != buf)
- free_page((unsigned long) buf);
+ if (addr != virt_to_phys(ptr))
+ free_page((unsigned long)ptr);
}
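For context, the expected pairing is the one used by the /dev/mem driver: translate, copy, then release the bounce page if one was handed out. A hedged sketch, assuming the copied range stays within the page containing 'p' (matching the bounce-buffer size computed above); the helper name is illustrative:

static int read_devmem_chunk(phys_addr_t p, void __user *buf, size_t sz)
{
	void *ptr = xlate_dev_mem_ptr(p);
	int rc = 0;

	if (!ptr)
		return -EFAULT;
	if (copy_to_user(buf, ptr, sz))
		rc = -EFAULT;
	/* frees the bounce page only if xlate_dev_mem_ptr() allocated one */
	unxlate_dev_mem_ptr(p, ptr);
	return rc;
}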
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
index cbc718ba6d78..fc9a7dc26c5e 100644
--- a/arch/s390/mm/mmap.c
+++ b/arch/s390/mm/mmap.c
@@ -17,7 +17,6 @@
#include <linux/random.h>
#include <linux/compat.h>
#include <linux/security.h>
-#include <asm/pgalloc.h>
#include <asm/elf.h>
static unsigned long stack_maxrandom_size(void)
@@ -38,7 +37,7 @@ static inline int mmap_is_legacy(struct rlimit *rlim_stack)
unsigned long arch_mmap_rnd(void)
{
- return (get_random_int() & MMAP_RND_MASK) << PAGE_SHIFT;
+ return (get_random_u32() & MMAP_RND_MASK) << PAGE_SHIFT;
}
static unsigned long mmap_base_legacy(unsigned long rnd)
@@ -59,9 +58,9 @@ static inline unsigned long mmap_base(unsigned long rnd,
/*
* Top of mmap area (just below the process stack).
- * Leave at least a ~32 MB hole.
+ * Leave at least a ~128 MB hole.
*/
- gap_min = 32 * 1024 * 1024UL;
+ gap_min = SZ_128M;
gap_max = (STACK_TOP / 6) * 5;
if (gap < gap_min)
@@ -72,14 +71,13 @@ static inline unsigned long mmap_base(unsigned long rnd,
return PAGE_ALIGN(STACK_TOP - gap - rnd);
}
-unsigned long
-arch_get_unmapped_area(struct file *filp, unsigned long addr,
- unsigned long len, unsigned long pgoff, unsigned long flags)
+unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
struct vm_unmapped_area_info info;
- int rc;
if (len > TASK_SIZE - mmap_min_addr)
return -ENOMEM;
@@ -105,30 +103,20 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
info.align_mask = 0;
info.align_offset = pgoff << PAGE_SHIFT;
addr = vm_unmapped_area(&info);
- if (addr & ~PAGE_MASK)
+ if (offset_in_page(addr))
return addr;
check_asce_limit:
- if (addr + len > current->mm->context.asce_limit &&
- addr + len <= TASK_SIZE) {
- rc = crst_table_upgrade(mm, addr + len);
- if (rc)
- return (unsigned long) rc;
- }
-
- return addr;
+ return check_asce_limit(mm, addr, len);
}
-unsigned long
-arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
- const unsigned long len, const unsigned long pgoff,
- const unsigned long flags)
+unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
{
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
- unsigned long addr = addr0;
struct vm_unmapped_area_info info;
- int rc;
/* requested length too big for entire address space */
if (len > TASK_SIZE - mmap_min_addr)
@@ -148,7 +136,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
- info.low_limit = max(PAGE_SIZE, mmap_min_addr);
+ info.low_limit = PAGE_SIZE;
info.high_limit = mm->mmap_base;
if (filp || (flags & MAP_SHARED))
info.align_mask = MMAP_ALIGN_MASK << PAGE_SHIFT;
@@ -163,25 +151,18 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
* can happen with large stack limits and large mmap()
* allocations.
*/
- if (addr & ~PAGE_MASK) {
+ if (offset_in_page(addr)) {
VM_BUG_ON(addr != -ENOMEM);
info.flags = 0;
info.low_limit = TASK_UNMAPPED_BASE;
info.high_limit = TASK_SIZE;
addr = vm_unmapped_area(&info);
- if (addr & ~PAGE_MASK)
+ if (offset_in_page(addr))
return addr;
}
check_asce_limit:
- if (addr + len > current->mm->context.asce_limit &&
- addr + len <= TASK_SIZE) {
- rc = crst_table_upgrade(mm, addr + len);
- if (rc)
- return (unsigned long) rc;
- }
-
- return addr;
+ return check_asce_limit(mm, addr, len);
}
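Both allocators now tail-call check_asce_limit() instead of open-coding the crst_table_upgrade() check removed above. A sketch of that helper, assumed to live in asm/mmu_context.h and to carry over the removed logic unchanged:

static inline unsigned long check_asce_limit(struct mm_struct *mm, unsigned long addr,
					     unsigned long len)
{
	int rc;

	if (addr + len > mm->context.asce_limit &&
	    addr + len <= TASK_SIZE) {
		rc = crst_table_upgrade(mm, addr + len);
		if (rc)
			return (unsigned long) rc;
	}
	return addr;
}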
/*
@@ -207,3 +188,23 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
}
}
+
+static const pgprot_t protection_map[16] = {
+ [VM_NONE] = PAGE_NONE,
+ [VM_READ] = PAGE_RO,
+ [VM_WRITE] = PAGE_RO,
+ [VM_WRITE | VM_READ] = PAGE_RO,
+ [VM_EXEC] = PAGE_RX,
+ [VM_EXEC | VM_READ] = PAGE_RX,
+ [VM_EXEC | VM_WRITE] = PAGE_RX,
+ [VM_EXEC | VM_WRITE | VM_READ] = PAGE_RX,
+ [VM_SHARED] = PAGE_NONE,
+ [VM_SHARED | VM_READ] = PAGE_RO,
+ [VM_SHARED | VM_WRITE] = PAGE_RW,
+ [VM_SHARED | VM_WRITE | VM_READ] = PAGE_RW,
+ [VM_SHARED | VM_EXEC] = PAGE_RX,
+ [VM_SHARED | VM_EXEC | VM_READ] = PAGE_RX,
+ [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_RWX,
+ [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_RWX
+};
+DECLARE_VM_GET_PAGE_PROT
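DECLARE_VM_GET_PAGE_PROT supplies the generic accessor that indexes this table; it expands to roughly the following (sketch of the common helper from linux/pgtable.h, not part of this diff):

pgprot_t vm_get_page_prot(unsigned long vm_flags)
{
	return protection_map[vm_flags & (VM_READ | VM_WRITE | VM_EXEC | VM_SHARED)];
}
EXPORT_SYMBOL(vm_get_page_prot);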
diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c
index fc141893d028..01f9b39e65f5 100644
--- a/arch/s390/mm/page-states.c
+++ b/arch/s390/mm/page-states.c
@@ -7,211 +7,18 @@
* Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
*/
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/types.h>
#include <linux/mm.h>
-#include <linux/memblock.h>
-#include <linux/gfp.h>
-#include <linux/init.h>
-#include <asm/facility.h>
#include <asm/page-states.h>
+#include <asm/sections.h>
+#include <asm/page.h>
-static int cmma_flag = 1;
-
-static int __init cmma(char *str)
-{
- bool enabled;
-
- if (!kstrtobool(str, &enabled))
- cmma_flag = enabled;
- return 1;
-}
-__setup("cmma=", cmma);
-
-static inline int cmma_test_essa(void)
-{
- register unsigned long tmp asm("0") = 0;
- register int rc asm("1");
-
- /* test ESSA_GET_STATE */
- asm volatile(
- " .insn rrf,0xb9ab0000,%1,%1,%2,0\n"
- "0: la %0,0\n"
- "1:\n"
- EX_TABLE(0b,1b)
- : "=&d" (rc), "+&d" (tmp)
- : "i" (ESSA_GET_STATE), "0" (-EOPNOTSUPP));
- return rc;
-}
-
-void __init cmma_init(void)
-{
- if (!cmma_flag)
- return;
- if (cmma_test_essa()) {
- cmma_flag = 0;
- return;
- }
- if (test_facility(147))
- cmma_flag = 2;
-}
-
-static inline unsigned char get_page_state(struct page *page)
-{
- unsigned char state;
-
- asm volatile(" .insn rrf,0xb9ab0000,%0,%1,%2,0"
- : "=&d" (state)
- : "a" (page_to_phys(page)),
- "i" (ESSA_GET_STATE));
- return state & 0x3f;
-}
-
-static inline void set_page_unused(struct page *page, int order)
-{
- int i, rc;
-
- for (i = 0; i < (1 << order); i++)
- asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0"
- : "=&d" (rc)
- : "a" (page_to_phys(page + i)),
- "i" (ESSA_SET_UNUSED));
-}
-
-static inline void set_page_stable_dat(struct page *page, int order)
-{
- int i, rc;
-
- for (i = 0; i < (1 << order); i++)
- asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0"
- : "=&d" (rc)
- : "a" (page_to_phys(page + i)),
- "i" (ESSA_SET_STABLE));
-}
-
-static inline void set_page_stable_nodat(struct page *page, int order)
-{
- int i, rc;
-
- for (i = 0; i < (1 << order); i++)
- asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0"
- : "=&d" (rc)
- : "a" (page_to_phys(page + i)),
- "i" (ESSA_SET_STABLE_NODAT));
-}
-
-static void mark_kernel_pmd(pud_t *pud, unsigned long addr, unsigned long end)
-{
- unsigned long next;
- struct page *page;
- pmd_t *pmd;
-
- pmd = pmd_offset(pud, addr);
- do {
- next = pmd_addr_end(addr, end);
- if (pmd_none(*pmd) || pmd_large(*pmd))
- continue;
- page = virt_to_page(pmd_val(*pmd));
- set_bit(PG_arch_1, &page->flags);
- } while (pmd++, addr = next, addr != end);
-}
-
-static void mark_kernel_pud(p4d_t *p4d, unsigned long addr, unsigned long end)
-{
- unsigned long next;
- struct page *page;
- pud_t *pud;
- int i;
-
- pud = pud_offset(p4d, addr);
- do {
- next = pud_addr_end(addr, end);
- if (pud_none(*pud) || pud_large(*pud))
- continue;
- if (!pud_folded(*pud)) {
- page = virt_to_page(pud_val(*pud));
- for (i = 0; i < 3; i++)
- set_bit(PG_arch_1, &page[i].flags);
- }
- mark_kernel_pmd(pud, addr, next);
- } while (pud++, addr = next, addr != end);
-}
-
-static void mark_kernel_p4d(pgd_t *pgd, unsigned long addr, unsigned long end)
-{
- unsigned long next;
- struct page *page;
- p4d_t *p4d;
- int i;
-
- p4d = p4d_offset(pgd, addr);
- do {
- next = p4d_addr_end(addr, end);
- if (p4d_none(*p4d))
- continue;
- if (!p4d_folded(*p4d)) {
- page = virt_to_page(p4d_val(*p4d));
- for (i = 0; i < 3; i++)
- set_bit(PG_arch_1, &page[i].flags);
- }
- mark_kernel_pud(p4d, addr, next);
- } while (p4d++, addr = next, addr != end);
-}
-
-static void mark_kernel_pgd(void)
-{
- unsigned long addr, next;
- struct page *page;
- pgd_t *pgd;
- int i;
-
- addr = 0;
- pgd = pgd_offset_k(addr);
- do {
- next = pgd_addr_end(addr, MODULES_END);
- if (pgd_none(*pgd))
- continue;
- if (!pgd_folded(*pgd)) {
- page = virt_to_page(pgd_val(*pgd));
- for (i = 0; i < 3; i++)
- set_bit(PG_arch_1, &page[i].flags);
- }
- mark_kernel_p4d(pgd, addr, next);
- } while (pgd++, addr = next, addr != MODULES_END);
-}
-
-void __init cmma_init_nodat(void)
-{
- struct memblock_region *reg;
- struct page *page;
- unsigned long start, end, ix;
-
- if (cmma_flag < 2)
- return;
- /* Mark pages used in kernel page tables */
- mark_kernel_pgd();
-
- /* Set all kernel pages not used for page tables to stable/no-dat */
- for_each_memblock(memory, reg) {
- start = memblock_region_memory_base_pfn(reg);
- end = memblock_region_memory_end_pfn(reg);
- page = pfn_to_page(start);
- for (ix = start; ix < end; ix++, page++) {
- if (__test_and_clear_bit(PG_arch_1, &page->flags))
- continue; /* skip page table pages */
- if (!list_empty(&page->lru))
- continue; /* skip free pages */
- set_page_stable_nodat(page, 0);
- }
- }
-}
+int __bootdata_preserved(cmma_flag);
void arch_free_page(struct page *page, int order)
{
if (!cmma_flag)
return;
- set_page_unused(page, order);
+ __set_page_unused(page_to_virt(page), 1UL << order);
}
void arch_alloc_page(struct page *page, int order)
@@ -219,57 +26,7 @@ void arch_alloc_page(struct page *page, int order)
if (!cmma_flag)
return;
if (cmma_flag < 2)
- set_page_stable_dat(page, order);
+ __set_page_stable_dat(page_to_virt(page), 1UL << order);
else
- set_page_stable_nodat(page, order);
-}
-
-void arch_set_page_dat(struct page *page, int order)
-{
- if (!cmma_flag)
- return;
- set_page_stable_dat(page, order);
-}
-
-void arch_set_page_nodat(struct page *page, int order)
-{
- if (cmma_flag < 2)
- return;
- set_page_stable_nodat(page, order);
-}
-
-int arch_test_page_nodat(struct page *page)
-{
- unsigned char state;
-
- if (cmma_flag < 2)
- return 0;
- state = get_page_state(page);
- return !!(state & 0x20);
-}
-
-void arch_set_page_states(int make_stable)
-{
- unsigned long flags, order, t;
- struct list_head *l;
- struct page *page;
- struct zone *zone;
-
- if (!cmma_flag)
- return;
- if (make_stable)
- drain_local_pages(NULL);
- for_each_populated_zone(zone) {
- spin_lock_irqsave(&zone->lock, flags);
- for_each_migratetype_order(order, t) {
- list_for_each(l, &zone->free_area[order].free_list[t]) {
- page = list_entry(l, struct page, lru);
- if (make_stable)
- set_page_stable_dat(page, order);
- else
- set_page_unused(page, order);
- }
- }
- spin_unlock_irqrestore(&zone->lock, flags);
- }
+ __set_page_stable_nodat(page_to_virt(page), 1UL << order);
}
diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c
index f8c6faab41f4..631e3a4ee2de 100644
--- a/arch/s390/mm/pageattr.c
+++ b/arch/s390/mm/pageattr.c
@@ -4,11 +4,13 @@
* Author(s): Jan Glauber <jang@linux.vnet.ibm.com>
*/
#include <linux/hugetlb.h>
+#include <linux/proc_fs.h>
+#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <asm/cacheflush.h>
#include <asm/facility.h>
-#include <asm/pgtable.h>
#include <asm/pgalloc.h>
+#include <asm/kfence.h>
#include <asm/page.h>
#include <asm/set_memory.h>
@@ -41,7 +43,7 @@ void __storage_key_init_range(unsigned long start, unsigned long end)
}
#ifdef CONFIG_PROC_FS
-atomic_long_t direct_pages_count[PG_DIRECT_MAP_MAX];
+atomic_long_t __bootdata_preserved(direct_pages_count[PG_DIRECT_MAP_MAX]);
void arch_report_meminfo(struct seq_file *m)
{
@@ -57,7 +59,7 @@ void arch_report_meminfo(struct seq_file *m)
static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr,
unsigned long dtt)
{
- unsigned long table, mask;
+ unsigned long *table, mask;
mask = 0;
if (MACHINE_HAS_EDAT2) {
@@ -72,8 +74,8 @@ static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr,
mask = ~(PTRS_PER_PTE * sizeof(pte_t) - 1);
break;
}
- table = (unsigned long)old & mask;
- crdte(*old, new, table, dtt, addr, S390_lowcore.kernel_asce);
+ table = (unsigned long *)((unsigned long)old & mask);
+ crdte(*old, new, table, dtt, addr, S390_lowcore.kernel_asce.val);
} else if (MACHINE_HAS_IDTE) {
cspg(old, *old, new);
} else {
@@ -86,7 +88,9 @@ static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end,
{
pte_t *ptep, new;
- ptep = pte_offset(pmdp, addr);
+ if (flags == SET_MEMORY_4K)
+ return 0;
+ ptep = pte_offset_kernel(pmdp, addr);
do {
new = *ptep;
if (pte_none(new))
@@ -94,11 +98,19 @@ static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end,
if (flags & SET_MEMORY_RO)
new = pte_wrprotect(new);
else if (flags & SET_MEMORY_RW)
- new = pte_mkwrite(pte_mkdirty(new));
+ new = pte_mkwrite_novma(pte_mkdirty(new));
if (flags & SET_MEMORY_NX)
- pte_val(new) |= _PAGE_NOEXEC;
+ new = set_pte_bit(new, __pgprot(_PAGE_NOEXEC));
else if (flags & SET_MEMORY_X)
- pte_val(new) &= ~_PAGE_NOEXEC;
+ new = clear_pte_bit(new, __pgprot(_PAGE_NOEXEC));
+ if (flags & SET_MEMORY_INV) {
+ new = set_pte_bit(new, __pgprot(_PAGE_INVALID));
+ } else if (flags & SET_MEMORY_DEF) {
+ new = __pte(pte_val(new) & PAGE_MASK);
+ new = set_pte_bit(new, PAGE_KERNEL);
+ if (!MACHINE_HAS_NX)
+ new = clear_pte_bit(new, __pgprot(_PAGE_NOEXEC));
+ }
pgt_set((unsigned long *)ptep, pte_val(new), addr, CRDTE_DTT_PAGE);
ptep++;
addr += PAGE_SIZE;
@@ -125,11 +137,11 @@ static int split_pmd_page(pmd_t *pmdp, unsigned long addr)
prot &= ~_PAGE_NOEXEC;
ptep = pt_dir;
for (i = 0; i < PTRS_PER_PTE; i++) {
- pte_val(*ptep) = pte_addr | prot;
+ set_pte(ptep, __pte(pte_addr | prot));
pte_addr += PAGE_SIZE;
ptep++;
}
- pmd_val(new) = __pa(pt_dir) | _SEGMENT_ENTRY;
+ new = __pmd(__pa(pt_dir) | _SEGMENT_ENTRY);
pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT);
update_page_count(PG_DIRECT_MAP_4K, PTRS_PER_PTE);
update_page_count(PG_DIRECT_MAP_1M, -1);
@@ -144,11 +156,19 @@ static void modify_pmd_page(pmd_t *pmdp, unsigned long addr,
if (flags & SET_MEMORY_RO)
new = pmd_wrprotect(new);
else if (flags & SET_MEMORY_RW)
- new = pmd_mkwrite(pmd_mkdirty(new));
+ new = pmd_mkwrite_novma(pmd_mkdirty(new));
if (flags & SET_MEMORY_NX)
- pmd_val(new) |= _SEGMENT_ENTRY_NOEXEC;
+ new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC));
else if (flags & SET_MEMORY_X)
- pmd_val(new) &= ~_SEGMENT_ENTRY_NOEXEC;
+ new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC));
+ if (flags & SET_MEMORY_INV) {
+ new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID));
+ } else if (flags & SET_MEMORY_DEF) {
+ new = __pmd(pmd_val(new) & PMD_MASK);
+ new = set_pmd_bit(new, SEGMENT_KERNEL);
+ if (!MACHINE_HAS_NX)
+ new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC));
+ }
pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT);
}
@@ -156,6 +176,7 @@ static int walk_pmd_level(pud_t *pudp, unsigned long addr, unsigned long end,
unsigned long flags)
{
unsigned long next;
+ int need_split;
pmd_t *pmdp;
int rc = 0;
@@ -165,7 +186,10 @@ static int walk_pmd_level(pud_t *pudp, unsigned long addr, unsigned long end,
return -EINVAL;
next = pmd_addr_end(addr, end);
if (pmd_large(*pmdp)) {
- if (addr & ~PMD_MASK || addr + PMD_SIZE > next) {
+ need_split = !!(flags & SET_MEMORY_4K);
+ need_split |= !!(addr & ~PMD_MASK);
+ need_split |= !!(addr + PMD_SIZE > next);
+ if (need_split) {
rc = split_pmd_page(pmdp, addr);
if (rc)
return rc;
@@ -202,11 +226,11 @@ static int split_pud_page(pud_t *pudp, unsigned long addr)
prot &= ~_SEGMENT_ENTRY_NOEXEC;
pmdp = pm_dir;
for (i = 0; i < PTRS_PER_PMD; i++) {
- pmd_val(*pmdp) = pmd_addr | prot;
+ set_pmd(pmdp, __pmd(pmd_addr | prot));
pmd_addr += PMD_SIZE;
pmdp++;
}
- pud_val(new) = __pa(pm_dir) | _REGION3_ENTRY;
+ new = __pud(__pa(pm_dir) | _REGION3_ENTRY);
pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3);
update_page_count(PG_DIRECT_MAP_1M, PTRS_PER_PMD);
update_page_count(PG_DIRECT_MAP_2G, -1);
@@ -223,9 +247,17 @@ static void modify_pud_page(pud_t *pudp, unsigned long addr,
else if (flags & SET_MEMORY_RW)
new = pud_mkwrite(pud_mkdirty(new));
if (flags & SET_MEMORY_NX)
- pud_val(new) |= _REGION_ENTRY_NOEXEC;
+ new = set_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC));
else if (flags & SET_MEMORY_X)
- pud_val(new) &= ~_REGION_ENTRY_NOEXEC;
+ new = clear_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC));
+ if (flags & SET_MEMORY_INV) {
+ new = set_pud_bit(new, __pgprot(_REGION_ENTRY_INVALID));
+ } else if (flags & SET_MEMORY_DEF) {
+ new = __pud(pud_val(new) & PUD_MASK);
+ new = set_pud_bit(new, REGION3_KERNEL);
+ if (!MACHINE_HAS_NX)
+ new = clear_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC));
+ }
pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3);
}
@@ -233,6 +265,7 @@ static int walk_pud_level(p4d_t *p4d, unsigned long addr, unsigned long end,
unsigned long flags)
{
unsigned long next;
+ int need_split;
pud_t *pudp;
int rc = 0;
@@ -242,7 +275,10 @@ static int walk_pud_level(p4d_t *p4d, unsigned long addr, unsigned long end,
return -EINVAL;
next = pud_addr_end(addr, end);
if (pud_large(*pudp)) {
- if (addr & ~PUD_MASK || addr + PUD_SIZE > next) {
+ need_split = !!(flags & SET_MEMORY_4K);
+ need_split |= !!(addr & ~PUD_MASK);
+ need_split |= !!(addr + PUD_SIZE > next);
+ if (need_split) {
rc = split_pud_page(pudp, addr);
if (rc)
break;
@@ -279,7 +315,7 @@ static int walk_p4d_level(pgd_t *pgd, unsigned long addr, unsigned long end,
return rc;
}
-static DEFINE_MUTEX(cpa_mutex);
+DEFINE_MUTEX(cpa_mutex);
static int change_page_attr(unsigned long addr, unsigned long end,
unsigned long flags)
@@ -288,11 +324,6 @@ static int change_page_attr(unsigned long addr, unsigned long end,
int rc = -EINVAL;
pgd_t *pgdp;
- if (addr == end)
- return 0;
- if (end >= MODULES_END)
- return -EINVAL;
- mutex_lock(&cpa_mutex);
pgdp = pgd_offset_k(addr);
do {
if (pgd_none(*pgdp))
@@ -303,21 +334,79 @@ static int change_page_attr(unsigned long addr, unsigned long end,
break;
cond_resched();
} while (pgdp++, addr = next, addr < end && !rc);
- mutex_unlock(&cpa_mutex);
return rc;
}
-int __set_memory(unsigned long addr, int numpages, unsigned long flags)
+static int change_page_attr_alias(unsigned long addr, unsigned long end,
+ unsigned long flags)
{
+ unsigned long alias, offset, va_start, va_end;
+ struct vm_struct *area;
+ int rc = 0;
+
+ /*
+ * Changes to read-only permissions on kernel VA mappings are also
+ * applied to the kernel direct mapping. Execute permissions are
+ * intentionally not transferred to keep all allocated pages within
+ * the direct mapping non-executable.
+ */
+ flags &= SET_MEMORY_RO | SET_MEMORY_RW;
+ if (!flags)
+ return 0;
+ area = NULL;
+ while (addr < end) {
+ if (!area)
+ area = find_vm_area((void *)addr);
+ if (!area || !(area->flags & VM_ALLOC))
+ return 0;
+ va_start = (unsigned long)area->addr;
+ va_end = va_start + area->nr_pages * PAGE_SIZE;
+ offset = (addr - va_start) >> PAGE_SHIFT;
+ alias = (unsigned long)page_address(area->pages[offset]);
+ rc = change_page_attr(alias, alias + PAGE_SIZE, flags);
+ if (rc)
+ break;
+ addr += PAGE_SIZE;
+ if (addr >= va_end)
+ area = NULL;
+ }
+ return rc;
+}
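In practice this means that write-protecting a vmalloc or module mapping now also write-protects the backing pages in the kernel direct mapping. A minimal usage sketch, assuming the usual set_memory_ro() wrapper around __set_memory() (illustrative only):

static void seal_vmalloc_blob(void *vaddr, unsigned long numpages)
{
	/*
	 * Protects the vmalloc alias and, via change_page_attr_alias(),
	 * the corresponding direct-mapping pages as well.
	 */
	set_memory_ro((unsigned long)vaddr, numpages);
}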
+
+int __set_memory(unsigned long addr, unsigned long numpages, unsigned long flags)
+{
+ unsigned long end;
+ int rc;
+
if (!MACHINE_HAS_NX)
flags &= ~(SET_MEMORY_NX | SET_MEMORY_X);
if (!flags)
return 0;
+ if (!numpages)
+ return 0;
addr &= PAGE_MASK;
- return change_page_attr(addr, addr + numpages * PAGE_SIZE, flags);
+ end = addr + numpages * PAGE_SIZE;
+ mutex_lock(&cpa_mutex);
+ rc = change_page_attr(addr, end, flags);
+ if (rc)
+ goto out;
+ rc = change_page_attr_alias(addr, end, flags);
+out:
+ mutex_unlock(&cpa_mutex);
+ return rc;
+}
+
+int set_direct_map_invalid_noflush(struct page *page)
+{
+ return __set_memory((unsigned long)page_to_virt(page), 1, SET_MEMORY_INV);
+}
+
+int set_direct_map_default_noflush(struct page *page)
+{
+ return __set_memory((unsigned long)page_to_virt(page), 1, SET_MEMORY_DEF);
}
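set_direct_map_invalid_noflush() and set_direct_map_default_noflush() are the hooks generic code (for example the secretmem driver) uses to unmap and re-map single pages in the direct mapping. A hedged sketch of the pattern; the helper name is illustrative:

static void guard_page(struct page *page, bool protect)
{
	if (protect)
		set_direct_map_invalid_noflush(page);	/* PTE becomes invalid */
	else
		set_direct_map_default_noflush(page);	/* back to PAGE_KERNEL */
}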
-#ifdef CONFIG_DEBUG_PAGEALLOC
+#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE)
static void ipte_range(pte_t *pte, unsigned long address, int nr)
{
@@ -337,50 +426,27 @@ static void ipte_range(pte_t *pte, unsigned long address, int nr)
void __kernel_map_pages(struct page *page, int numpages, int enable)
{
unsigned long address;
+ pte_t *ptep, pte;
int nr, i, j;
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
for (i = 0; i < numpages;) {
- address = page_to_phys(page + i);
- pgd = pgd_offset_k(address);
- p4d = p4d_offset(pgd, address);
- pud = pud_offset(p4d, address);
- pmd = pmd_offset(pud, address);
- pte = pte_offset_kernel(pmd, address);
- nr = (unsigned long)pte >> ilog2(sizeof(long));
+ address = (unsigned long)page_to_virt(page + i);
+ ptep = virt_to_kpte(address);
+ nr = (unsigned long)ptep >> ilog2(sizeof(long));
nr = PTRS_PER_PTE - (nr & (PTRS_PER_PTE - 1));
nr = min(numpages - i, nr);
if (enable) {
for (j = 0; j < nr; j++) {
- pte_val(*pte) &= ~_PAGE_INVALID;
+ pte = clear_pte_bit(*ptep, __pgprot(_PAGE_INVALID));
+ set_pte(ptep, pte);
address += PAGE_SIZE;
- pte++;
+ ptep++;
}
} else {
- ipte_range(pte, address, nr);
+ ipte_range(ptep, address, nr);
}
i += nr;
}
}
-#ifdef CONFIG_HIBERNATION
-bool kernel_page_present(struct page *page)
-{
- unsigned long addr;
- int cc;
-
- addr = page_to_phys(page);
- asm volatile(
- " lra %1,0(%1)\n"
- " ipm %0\n"
- " srl %0,28"
- : "=d" (cc), "+a" (addr) : : "cc");
- return cc == 0;
-}
-#endif /* CONFIG_HIBERNATION */
-
#endif /* CONFIG_DEBUG_PAGEALLOC */
diff --git a/arch/s390/mm/pfault.c b/arch/s390/mm/pfault.c
new file mode 100644
index 000000000000..1aac13bb8f53
--- /dev/null
+++ b/arch/s390/mm/pfault.c
@@ -0,0 +1,248 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corp. 1999, 2023
+ */
+
+#include <linux/cpuhotplug.h>
+#include <linux/sched/task.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/irq.h>
+#include <asm/asm-extable.h>
+#include <asm/pfault.h>
+#include <asm/diag.h>
+
+#define __SUBCODE_MASK 0x0600
+#define __PF_RES_FIELD 0x8000000000000000UL
+
+/*
+ * 'pfault' pseudo page faults routines.
+ */
+static int pfault_disable;
+
+static int __init nopfault(char *str)
+{
+ pfault_disable = 1;
+ return 1;
+}
+early_param("nopfault", nopfault);
+
+struct pfault_refbk {
+ u16 refdiagc;
+ u16 reffcode;
+ u16 refdwlen;
+ u16 refversn;
+ u64 refgaddr;
+ u64 refselmk;
+ u64 refcmpmk;
+ u64 reserved;
+};
+
+static struct pfault_refbk pfault_init_refbk = {
+ .refdiagc = 0x258,
+ .reffcode = 0,
+ .refdwlen = 5,
+ .refversn = 2,
+ .refgaddr = __LC_LPP,
+ .refselmk = 1UL << 48,
+ .refcmpmk = 1UL << 48,
+ .reserved = __PF_RES_FIELD
+};
+
+int __pfault_init(void)
+{
+ int rc = -EOPNOTSUPP;
+
+ if (pfault_disable)
+ return rc;
+ diag_stat_inc(DIAG_STAT_X258);
+ asm volatile(
+ " diag %[refbk],%[rc],0x258\n"
+ "0: nopr %%r7\n"
+ EX_TABLE(0b, 0b)
+ : [rc] "+d" (rc)
+ : [refbk] "a" (&pfault_init_refbk), "m" (pfault_init_refbk)
+ : "cc");
+ return rc;
+}
+
+static struct pfault_refbk pfault_fini_refbk = {
+ .refdiagc = 0x258,
+ .reffcode = 1,
+ .refdwlen = 5,
+ .refversn = 2,
+};
+
+void __pfault_fini(void)
+{
+ if (pfault_disable)
+ return;
+ diag_stat_inc(DIAG_STAT_X258);
+ asm volatile(
+ " diag %[refbk],0,0x258\n"
+ "0: nopr %%r7\n"
+ EX_TABLE(0b, 0b)
+ :
+ : [refbk] "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk)
+ : "cc");
+}
+
+static DEFINE_SPINLOCK(pfault_lock);
+static LIST_HEAD(pfault_list);
+
+#define PF_COMPLETE 0x0080
+
+/*
+ * The mechanism of our pfault code: if Linux is running as a guest, a user
+ * space process is running and that process accesses a page which the host
+ * has paged out, we get a pfault interrupt.
+ *
+ * This allows us, within the guest, to schedule a different process. Without
+ * this mechanism the host would have to suspend the whole virtual cpu until
+ * the page has been paged in.
+ *
+ * So when we get such an interrupt we set the state of the current task
+ * to uninterruptible and also set the need_resched flag. Both happen within
+ * interrupt context(!). If we later on want to return to user space we
+ * recognize the need_resched flag and then call schedule(). It's not very
+ * obvious how this works...
+ *
+ * Of course we have a lot of additional fun with the completion interrupt (->
+ * host signals that a page of a process has been paged in and the process can
+ * continue to run). This interrupt can arrive on any cpu and, since we have
+ * virtual cpus, actually appear before the interrupt that signals that a page
+ * is missing.
+ */
+static void pfault_interrupt(struct ext_code ext_code,
+ unsigned int param32, unsigned long param64)
+{
+ struct task_struct *tsk;
+ __u16 subcode;
+ pid_t pid;
+
+ /*
+ * Get the external interruption subcode & pfault initial/completion
+ * signal bit. VM stores this in the 'cpu address' field associated
+ * with the external interrupt.
+ */
+ subcode = ext_code.subcode;
+ if ((subcode & 0xff00) != __SUBCODE_MASK)
+ return;
+ inc_irq_stat(IRQEXT_PFL);
+ /* Get the token (= pid of the affected task). */
+ pid = param64 & LPP_PID_MASK;
+ rcu_read_lock();
+ tsk = find_task_by_pid_ns(pid, &init_pid_ns);
+ if (tsk)
+ get_task_struct(tsk);
+ rcu_read_unlock();
+ if (!tsk)
+ return;
+ spin_lock(&pfault_lock);
+ if (subcode & PF_COMPLETE) {
+ /* signal bit is set -> a page has been swapped in by VM */
+ if (tsk->thread.pfault_wait == 1) {
+ /*
+ * Initial interrupt was faster than the completion
+ * interrupt. pfault_wait is valid. Set pfault_wait
+ * back to zero and wake up the process. This can
+ * safely be done because the task is still sleeping
+ * and can't produce new pfaults.
+ */
+ tsk->thread.pfault_wait = 0;
+ list_del(&tsk->thread.list);
+ wake_up_process(tsk);
+ put_task_struct(tsk);
+ } else {
+ /*
+ * Completion interrupt was faster than initial
+ * interrupt. Set pfault_wait to -1 so the initial
+ * interrupt doesn't put the task to sleep.
+ * If the task is not running, ignore the completion
+ * interrupt since it must be a leftover of a PFAULT
+ * CANCEL operation which didn't remove all pending
+ * completion interrupts.
+ */
+ if (task_is_running(tsk))
+ tsk->thread.pfault_wait = -1;
+ }
+ } else {
+ /* signal bit not set -> a real page is missing. */
+ if (WARN_ON_ONCE(tsk != current))
+ goto out;
+ if (tsk->thread.pfault_wait == 1) {
+ /* Already on the list with a reference: put to sleep */
+ goto block;
+ } else if (tsk->thread.pfault_wait == -1) {
+ /*
+ * Completion interrupt was faster than the initial
+ * interrupt (pfault_wait == -1). Set pfault_wait
+ * back to zero and exit.
+ */
+ tsk->thread.pfault_wait = 0;
+ } else {
+ /*
+ * Initial interrupt arrived before completion
+ * interrupt. Let the task sleep.
+ * An extra task reference is needed since a different
+ * cpu may set the task state to TASK_RUNNING again
+ * before the scheduler is reached.
+ */
+ get_task_struct(tsk);
+ tsk->thread.pfault_wait = 1;
+ list_add(&tsk->thread.list, &pfault_list);
+block:
+ /*
+ * Since this must be a userspace fault, there
+ * is no kernel task state to trample. Rely on the
+ * return to userspace schedule() to block.
+ */
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ set_tsk_need_resched(tsk);
+ set_preempt_need_resched();
+ }
+ }
+out:
+ spin_unlock(&pfault_lock);
+ put_task_struct(tsk);
+}
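The pfault_wait transitions handled above can be summarised as follows (derived from the code, not part of the patch):

/*
 *  pfault_wait  event                 action
 *  -----------  --------------------  ----------------------------------------
 *       0       initial interrupt     -> 1, task queued on pfault_list, sleeps
 *       1       completion interrupt  -> 0, task removed from list and woken
 *       0       completion interrupt  -> -1 (completion overtook the initial)
 *      -1       initial interrupt     -> 0, task keeps running, nothing to do
 */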
+
+static int pfault_cpu_dead(unsigned int cpu)
+{
+ struct thread_struct *thread, *next;
+ struct task_struct *tsk;
+
+ spin_lock_irq(&pfault_lock);
+ list_for_each_entry_safe(thread, next, &pfault_list, list) {
+ thread->pfault_wait = 0;
+ list_del(&thread->list);
+ tsk = container_of(thread, struct task_struct, thread);
+ wake_up_process(tsk);
+ put_task_struct(tsk);
+ }
+ spin_unlock_irq(&pfault_lock);
+ return 0;
+}
+
+static int __init pfault_irq_init(void)
+{
+ int rc;
+
+ rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
+ if (rc)
+ goto out_extint;
+ rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP;
+ if (rc)
+ goto out_pfault;
+ irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL);
+ cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead",
+ NULL, pfault_cpu_dead);
+ return 0;
+
+out_pfault:
+ unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
+out_extint:
+ pfault_disable = 1;
+ return rc;
+}
+early_initcall(pfault_irq_init);
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 3dd253f81a77..008e487c94a6 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -10,6 +10,7 @@
#include <linux/slab.h>
#include <linux/mm.h>
#include <asm/mmu_context.h>
+#include <asm/page-states.h>
#include <asm/pgalloc.h>
#include <asm/gmap.h>
#include <asm/tlb.h>
@@ -30,22 +31,11 @@ static struct ctl_table page_table_sysctl[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
- { }
-};
-
-static struct ctl_table page_table_sysctl_dir[] = {
- {
- .procname = "vm",
- .maxlen = 0,
- .mode = 0555,
- .child = page_table_sysctl,
- },
- { }
};
static int __init page_table_register_sysctl(void)
{
- return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM;
+ return register_sysctl("vm", page_table_sysctl) ? 0 : -ENOMEM;
}
__initcall(page_table_register_sysctl);
@@ -53,266 +43,187 @@ __initcall(page_table_register_sysctl);
unsigned long *crst_table_alloc(struct mm_struct *mm)
{
- struct page *page = alloc_pages(GFP_KERNEL, 2);
+ struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER);
+ unsigned long *table;
- if (!page)
+ if (!ptdesc)
return NULL;
- arch_set_page_dat(page, 2);
- return (unsigned long *) page_to_phys(page);
+ table = ptdesc_to_virt(ptdesc);
+ __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER);
+ return table;
}
void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
- free_pages((unsigned long) table, 2);
+ pagetable_free(virt_to_ptdesc(table));
}
static void __crst_table_upgrade(void *arg)
{
struct mm_struct *mm = arg;
- if (current->active_mm == mm)
- set_user_asce(mm);
+ /* change all active ASCEs to avoid the creation of new TLBs */
+ if (current->active_mm == mm) {
+ S390_lowcore.user_asce.val = mm->context.asce;
+ local_ctl_load(7, &S390_lowcore.user_asce);
+ }
__tlb_flush_local();
}
int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
{
- unsigned long *table, *pgd;
- int rc, notify;
+ unsigned long *pgd = NULL, *p4d = NULL, *__pgd;
+ unsigned long asce_limit = mm->context.asce_limit;
/* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */
- VM_BUG_ON(mm->context.asce_limit < _REGION2_SIZE);
- rc = 0;
- notify = 0;
- while (mm->context.asce_limit < end) {
- table = crst_table_alloc(mm);
- if (!table) {
- rc = -ENOMEM;
- break;
- }
- spin_lock_bh(&mm->page_table_lock);
- pgd = (unsigned long *) mm->pgd;
- if (mm->context.asce_limit == _REGION2_SIZE) {
- crst_table_init(table, _REGION2_ENTRY_EMPTY);
- p4d_populate(mm, (p4d_t *) table, (pud_t *) pgd);
- mm->pgd = (pgd_t *) table;
- mm->context.asce_limit = _REGION1_SIZE;
- mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
- _ASCE_USER_BITS | _ASCE_TYPE_REGION2;
- mm_inc_nr_puds(mm);
- } else {
- crst_table_init(table, _REGION1_ENTRY_EMPTY);
- pgd_populate(mm, (pgd_t *) table, (p4d_t *) pgd);
- mm->pgd = (pgd_t *) table;
- mm->context.asce_limit = -PAGE_SIZE;
- mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
- _ASCE_USER_BITS | _ASCE_TYPE_REGION1;
- }
- notify = 1;
- spin_unlock_bh(&mm->page_table_lock);
- }
- if (notify)
- on_each_cpu(__crst_table_upgrade, mm, 0);
- return rc;
-}
+ VM_BUG_ON(asce_limit < _REGION2_SIZE);
-void crst_table_downgrade(struct mm_struct *mm)
-{
- pgd_t *pgd;
+ if (end <= asce_limit)
+ return 0;
- /* downgrade should only happen from 3 to 2 levels (compat only) */
- VM_BUG_ON(mm->context.asce_limit != _REGION2_SIZE);
+ if (asce_limit == _REGION2_SIZE) {
+ p4d = crst_table_alloc(mm);
+ if (unlikely(!p4d))
+ goto err_p4d;
+ crst_table_init(p4d, _REGION2_ENTRY_EMPTY);
+ }
+ if (end > _REGION1_SIZE) {
+ pgd = crst_table_alloc(mm);
+ if (unlikely(!pgd))
+ goto err_pgd;
+ crst_table_init(pgd, _REGION1_ENTRY_EMPTY);
+ }
- if (current->active_mm == mm) {
- clear_user_asce();
- __tlb_flush_mm(mm);
+ spin_lock_bh(&mm->page_table_lock);
+
+ /*
+	 * This routine gets called with mmap_lock held, so there is
+	 * no reason to optimize for the opposite case. However, if
+	 * that should ever change, the check below will let us know.
+ */
+ VM_BUG_ON(asce_limit != mm->context.asce_limit);
+
+ if (p4d) {
+ __pgd = (unsigned long *) mm->pgd;
+ p4d_populate(mm, (p4d_t *) p4d, (pud_t *) __pgd);
+ mm->pgd = (pgd_t *) p4d;
+ mm->context.asce_limit = _REGION1_SIZE;
+ mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
+ _ASCE_USER_BITS | _ASCE_TYPE_REGION2;
+ mm_inc_nr_puds(mm);
+ }
+ if (pgd) {
+ __pgd = (unsigned long *) mm->pgd;
+ pgd_populate(mm, (pgd_t *) pgd, (p4d_t *) __pgd);
+ mm->pgd = (pgd_t *) pgd;
+ mm->context.asce_limit = TASK_SIZE_MAX;
+ mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
+ _ASCE_USER_BITS | _ASCE_TYPE_REGION1;
}
- pgd = mm->pgd;
- mm_dec_nr_pmds(mm);
- mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
- mm->context.asce_limit = _REGION3_SIZE;
- mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
- _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT;
- crst_table_free(mm, (unsigned long *) pgd);
+ spin_unlock_bh(&mm->page_table_lock);
- if (current->active_mm == mm)
- set_user_asce(mm);
-}
+ on_each_cpu(__crst_table_upgrade, mm, 0);
-static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
-{
- unsigned int old, new;
+ return 0;
- do {
- old = atomic_read(v);
- new = old ^ bits;
- } while (atomic_cmpxchg(v, old, new) != old);
- return new;
+err_pgd:
+ crst_table_free(mm, p4d);
+err_p4d:
+ return -ENOMEM;
}
#ifdef CONFIG_PGSTE
struct page *page_table_alloc_pgste(struct mm_struct *mm)
{
- struct page *page;
+ struct ptdesc *ptdesc;
u64 *table;
- page = alloc_page(GFP_KERNEL);
- if (page) {
- table = (u64 *)page_to_phys(page);
+ ptdesc = pagetable_alloc(GFP_KERNEL, 0);
+ if (ptdesc) {
+ table = (u64 *)ptdesc_to_virt(ptdesc);
+ __arch_set_page_dat(table, 1);
memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
}
- return page;
+ return ptdesc_page(ptdesc);
}
void page_table_free_pgste(struct page *page)
{
- __free_page(page);
+ pagetable_free(page_ptdesc(page));
}
#endif /* CONFIG_PGSTE */
-/*
- * page table entry allocation/free routines.
- */
unsigned long *page_table_alloc(struct mm_struct *mm)
{
+ struct ptdesc *ptdesc;
unsigned long *table;
- struct page *page;
- unsigned int mask, bit;
-
- /* Try to get a fragment of a 4K page as a 2K page table */
- if (!mm_alloc_pgste(mm)) {
- table = NULL;
- spin_lock_bh(&mm->context.lock);
- if (!list_empty(&mm->context.pgtable_list)) {
- page = list_first_entry(&mm->context.pgtable_list,
- struct page, lru);
- mask = atomic_read(&page->_refcount) >> 24;
- mask = (mask | (mask >> 4)) & 3;
- if (mask != 3) {
- table = (unsigned long *) page_to_phys(page);
- bit = mask & 1; /* =1 -> second 2K */
- if (bit)
- table += PTRS_PER_PTE;
- atomic_xor_bits(&page->_refcount,
- 1U << (bit + 24));
- list_del(&page->lru);
- }
- }
- spin_unlock_bh(&mm->context.lock);
- if (table)
- return table;
- }
- /* Allocate a fresh page */
- page = alloc_page(GFP_KERNEL);
- if (!page)
+
+ ptdesc = pagetable_alloc(GFP_KERNEL, 0);
+ if (!ptdesc)
return NULL;
- if (!pgtable_pte_page_ctor(page)) {
- __free_page(page);
+ if (!pagetable_pte_ctor(ptdesc)) {
+ pagetable_free(ptdesc);
return NULL;
}
- arch_set_page_dat(page, 0);
- /* Initialize page table */
- table = (unsigned long *) page_to_phys(page);
- if (mm_alloc_pgste(mm)) {
- /* Return 4K page table with PGSTEs */
- atomic_xor_bits(&page->_refcount, 3 << 24);
- memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
- memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
- } else {
- /* Return the first 2K fragment of the page */
- atomic_xor_bits(&page->_refcount, 1 << 24);
- memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
- spin_lock_bh(&mm->context.lock);
- list_add(&page->lru, &mm->context.pgtable_list);
- spin_unlock_bh(&mm->context.lock);
- }
+ table = ptdesc_to_virt(ptdesc);
+ __arch_set_page_dat(table, 1);
+ /* pt_list is used by gmap only */
+ INIT_LIST_HEAD(&ptdesc->pt_list);
+ memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
+ memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
return table;
}
+static void pagetable_pte_dtor_free(struct ptdesc *ptdesc)
+{
+ pagetable_pte_dtor(ptdesc);
+ pagetable_free(ptdesc);
+}
+
void page_table_free(struct mm_struct *mm, unsigned long *table)
{
- struct page *page;
- unsigned int bit, mask;
-
- page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
- if (!mm_alloc_pgste(mm)) {
- /* Free 2K page table fragment of a 4K page */
- bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
- spin_lock_bh(&mm->context.lock);
- mask = atomic_xor_bits(&page->_refcount, 1U << (bit + 24));
- mask >>= 24;
- if (mask & 3)
- list_add(&page->lru, &mm->context.pgtable_list);
- else
- list_del(&page->lru);
- spin_unlock_bh(&mm->context.lock);
- if (mask != 0)
- return;
- } else {
- atomic_xor_bits(&page->_refcount, 3U << 24);
- }
+ struct ptdesc *ptdesc = virt_to_ptdesc(table);
- pgtable_pte_page_dtor(page);
- __free_page(page);
+ pagetable_pte_dtor_free(ptdesc);
}
-void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
- unsigned long vmaddr)
+void __tlb_remove_table(void *table)
{
- struct mm_struct *mm;
- struct page *page;
- unsigned int bit, mask;
-
- mm = tlb->mm;
- page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
- if (mm_alloc_pgste(mm)) {
- gmap_unlink(mm, table, vmaddr);
- table = (unsigned long *) (__pa(table) | 3);
- tlb_remove_table(tlb, table);
+ struct ptdesc *ptdesc = virt_to_ptdesc(table);
+ struct page *page = ptdesc_page(ptdesc);
+
+ if (compound_order(page) == CRST_ALLOC_ORDER) {
+ /* pmd, pud, or p4d */
+ pagetable_free(ptdesc);
return;
}
- bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
- spin_lock_bh(&mm->context.lock);
- mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
- mask >>= 24;
- if (mask & 3)
- list_add_tail(&page->lru, &mm->context.pgtable_list);
- else
- list_del(&page->lru);
- spin_unlock_bh(&mm->context.lock);
- table = (unsigned long *) (__pa(table) | (1U << bit));
- tlb_remove_table(tlb, table);
+ pagetable_pte_dtor_free(ptdesc);
}
-void __tlb_remove_table(void *_table)
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void pte_free_now(struct rcu_head *head)
{
- unsigned int mask = (unsigned long) _table & 3;
- void *table = (void *)((unsigned long) _table ^ mask);
- struct page *page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+ struct ptdesc *ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
- switch (mask) {
- case 0: /* pmd, pud, or p4d */
- free_pages((unsigned long) table, 2);
- break;
- case 1: /* lower 2K of a 4K page table */
- case 2: /* higher 2K of a 4K page table */
- mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24));
- mask >>= 24;
- if (mask != 0)
- break;
- /* fallthrough */
- case 3: /* 4K page table with pgstes */
- if (mask & 3)
- atomic_xor_bits(&page->_refcount, 3 << 24);
- pgtable_pte_page_dtor(page);
- __free_page(page);
- break;
- }
+ pagetable_pte_dtor_free(ptdesc);
+}
+
+void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
+{
+ struct ptdesc *ptdesc = virt_to_ptdesc(pgtable);
+
+ call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
+ /*
+ * THPs are not allowed for KVM guests. Warn if pgste ever reaches here.
+ * Turn to the generic pte_free_defer() version once gmap is removed.
+ */
+ WARN_ON_ONCE(mm_has_pgste(mm));
}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/*
* Base infrastructure required to generate basic asces, region, segment,
@@ -321,34 +232,37 @@ void __tlb_remove_table(void *_table)
static struct kmem_cache *base_pgt_cache;
-static unsigned long base_pgt_alloc(void)
+static unsigned long *base_pgt_alloc(void)
{
- u64 *table;
+ unsigned long *table;
table = kmem_cache_alloc(base_pgt_cache, GFP_KERNEL);
if (table)
- memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
- return (unsigned long) table;
+ memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
+ return table;
}
-static void base_pgt_free(unsigned long table)
+static void base_pgt_free(unsigned long *table)
{
- kmem_cache_free(base_pgt_cache, (void *) table);
+ kmem_cache_free(base_pgt_cache, table);
}
-static unsigned long base_crst_alloc(unsigned long val)
+static unsigned long *base_crst_alloc(unsigned long val)
{
- unsigned long table;
+ unsigned long *table;
+ struct ptdesc *ptdesc;
- table = __get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
- if (table)
- crst_table_init((unsigned long *)table, val);
+ ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER);
+ if (!ptdesc)
+ return NULL;
+ table = ptdesc_address(ptdesc);
+ crst_table_init(table, val);
return table;
}
-static void base_crst_free(unsigned long table)
+static void base_crst_free(unsigned long *table)
{
- free_pages(table, CRST_ALLOC_ORDER);
+ pagetable_free(virt_to_ptdesc(table));
}
#define BASE_ADDR_END_FUNC(NAME, SIZE) \
@@ -376,14 +290,14 @@ static inline unsigned long base_lra(unsigned long address)
return real;
}
-static int base_page_walk(unsigned long origin, unsigned long addr,
+static int base_page_walk(unsigned long *origin, unsigned long addr,
unsigned long end, int alloc)
{
unsigned long *pte, next;
if (!alloc)
return 0;
- pte = (unsigned long *) origin;
+ pte = origin;
pte += (addr & _PAGE_INDEX) >> _PAGE_SHIFT;
do {
next = base_page_addr_end(addr, end);
@@ -392,13 +306,13 @@ static int base_page_walk(unsigned long origin, unsigned long addr,
return 0;
}
-static int base_segment_walk(unsigned long origin, unsigned long addr,
+static int base_segment_walk(unsigned long *origin, unsigned long addr,
unsigned long end, int alloc)
{
- unsigned long *ste, next, table;
+ unsigned long *ste, next, *table;
int rc;
- ste = (unsigned long *) origin;
+ ste = origin;
ste += (addr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
do {
next = base_segment_addr_end(addr, end);
@@ -408,9 +322,9 @@ static int base_segment_walk(unsigned long origin, unsigned long addr,
table = base_pgt_alloc();
if (!table)
return -ENOMEM;
- *ste = table | _SEGMENT_ENTRY;
+ *ste = __pa(table) | _SEGMENT_ENTRY;
}
- table = *ste & _SEGMENT_ENTRY_ORIGIN;
+ table = __va(*ste & _SEGMENT_ENTRY_ORIGIN);
rc = base_page_walk(table, addr, next, alloc);
if (rc)
return rc;
@@ -421,13 +335,13 @@ static int base_segment_walk(unsigned long origin, unsigned long addr,
return 0;
}
-static int base_region3_walk(unsigned long origin, unsigned long addr,
+static int base_region3_walk(unsigned long *origin, unsigned long addr,
unsigned long end, int alloc)
{
- unsigned long *rtte, next, table;
+ unsigned long *rtte, next, *table;
int rc;
- rtte = (unsigned long *) origin;
+ rtte = origin;
rtte += (addr & _REGION3_INDEX) >> _REGION3_SHIFT;
do {
next = base_region3_addr_end(addr, end);
@@ -437,9 +351,9 @@ static int base_region3_walk(unsigned long origin, unsigned long addr,
table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
if (!table)
return -ENOMEM;
- *rtte = table | _REGION3_ENTRY;
+ *rtte = __pa(table) | _REGION3_ENTRY;
}
- table = *rtte & _REGION_ENTRY_ORIGIN;
+ table = __va(*rtte & _REGION_ENTRY_ORIGIN);
rc = base_segment_walk(table, addr, next, alloc);
if (rc)
return rc;
@@ -449,13 +363,13 @@ static int base_region3_walk(unsigned long origin, unsigned long addr,
return 0;
}
-static int base_region2_walk(unsigned long origin, unsigned long addr,
+static int base_region2_walk(unsigned long *origin, unsigned long addr,
unsigned long end, int alloc)
{
- unsigned long *rste, next, table;
+ unsigned long *rste, next, *table;
int rc;
- rste = (unsigned long *) origin;
+ rste = origin;
rste += (addr & _REGION2_INDEX) >> _REGION2_SHIFT;
do {
next = base_region2_addr_end(addr, end);
@@ -465,9 +379,9 @@ static int base_region2_walk(unsigned long origin, unsigned long addr,
table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
if (!table)
return -ENOMEM;
- *rste = table | _REGION2_ENTRY;
+ *rste = __pa(table) | _REGION2_ENTRY;
}
- table = *rste & _REGION_ENTRY_ORIGIN;
+ table = __va(*rste & _REGION_ENTRY_ORIGIN);
rc = base_region3_walk(table, addr, next, alloc);
if (rc)
return rc;
@@ -477,13 +391,13 @@ static int base_region2_walk(unsigned long origin, unsigned long addr,
return 0;
}
-static int base_region1_walk(unsigned long origin, unsigned long addr,
+static int base_region1_walk(unsigned long *origin, unsigned long addr,
unsigned long end, int alloc)
{
- unsigned long *rfte, next, table;
+ unsigned long *rfte, next, *table;
int rc;
- rfte = (unsigned long *) origin;
+ rfte = origin;
rfte += (addr & _REGION1_INDEX) >> _REGION1_SHIFT;
do {
next = base_region1_addr_end(addr, end);
@@ -493,9 +407,9 @@ static int base_region1_walk(unsigned long origin, unsigned long addr,
table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
if (!table)
return -ENOMEM;
- *rfte = table | _REGION1_ENTRY;
+ *rfte = __pa(table) | _REGION1_ENTRY;
}
- table = *rfte & _REGION_ENTRY_ORIGIN;
+ table = __va(*rfte & _REGION_ENTRY_ORIGIN);
rc = base_region2_walk(table, addr, next, alloc);
if (rc)
return rc;
@@ -514,7 +428,7 @@ static int base_region1_walk(unsigned long origin, unsigned long addr,
*/
void base_asce_free(unsigned long asce)
{
- unsigned long table = asce & _ASCE_ORIGIN;
+ unsigned long *table = __va(asce & _ASCE_ORIGIN);
if (!asce)
return;
@@ -529,7 +443,7 @@ void base_asce_free(unsigned long asce)
base_region2_walk(table, 0, _REGION1_SIZE, 0);
break;
case _ASCE_TYPE_REGION1:
- base_region1_walk(table, 0, -_PAGE_SIZE, 0);
+ base_region1_walk(table, 0, TASK_SIZE_MAX, 0);
break;
}
base_crst_free(table);
@@ -566,7 +480,7 @@ static int base_pgt_cache_init(void)
*/
unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages)
{
- unsigned long asce, table, end;
+ unsigned long asce, *table, end;
int rc;
if (base_pgt_cache_init())
@@ -577,25 +491,25 @@ unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages)
if (!table)
return 0;
rc = base_segment_walk(table, addr, end, 1);
- asce = table | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH;
+ asce = __pa(table) | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH;
} else if (end <= _REGION2_SIZE) {
table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
if (!table)
return 0;
rc = base_region3_walk(table, addr, end, 1);
- asce = table | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
+ asce = __pa(table) | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
} else if (end <= _REGION1_SIZE) {
table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
if (!table)
return 0;
rc = base_region2_walk(table, addr, end, 1);
- asce = table | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
+ asce = __pa(table) | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
} else {
table = base_crst_alloc(_REGION1_ENTRY_EMPTY);
if (!table)
return 0;
rc = base_region1_walk(table, addr, end, 1);
- asce = table | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH;
+ asce = __pa(table) | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH;
}
if (rc) {
base_asce_free(asce);
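
The base-ASCE hunks above change the walk helpers to pass page tables around as virtual pointers instead of unsigned long physical addresses, converting with __pa()/__va() only where a table origin is stored into or read back from a table entry. A minimal sketch of that convention follows; it is illustrative only and the helper names are not part of the patch.

/* Illustrative helpers, not patch code: the __pa()/__va() convention. */
static inline unsigned long base_table_to_entry(unsigned long *table,
						unsigned long type_bits)
{
	/* entries hold the physical table origin plus the entry type bits */
	return __pa(table) | type_bits;
}

static inline unsigned long *base_entry_to_table(unsigned long entry,
						 unsigned long origin_mask)
{
	/* the walk continues through a virtual pointer to the next table */
	return __va(entry & origin_mask);
}
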
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 9ebd01219812..99422926efe1 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -19,13 +19,31 @@
#include <linux/ksm.h>
#include <linux/mman.h>
-#include <asm/pgtable.h>
-#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/page-states.h>
+pgprot_t pgprot_writecombine(pgprot_t prot)
+{
+ /*
+ * mio_wb_bit_mask may be set on a different CPU, but it is only set
+ * once at init and only read afterwards.
+ */
+ return __pgprot(pgprot_val(prot) | mio_wb_bit_mask);
+}
+EXPORT_SYMBOL_GPL(pgprot_writecombine);
+
+pgprot_t pgprot_writethrough(pgprot_t prot)
+{
+ /*
+ * mio_wb_bit_mask may be set on a different CPU, but it is only set
+ * once at init and only read afterwards.
+ */
+ return __pgprot(pgprot_val(prot) & ~mio_wb_bit_mask);
+}
+EXPORT_SYMBOL_GPL(pgprot_writethrough);
+
static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, int nodat)
{
@@ -97,7 +115,7 @@ static inline pte_t ptep_flush_lazy(struct mm_struct *mm,
atomic_inc(&mm->context.flush_count);
if (cpumask_equal(&mm->context.cpu_attach_mask,
cpumask_of(smp_processor_id()))) {
- pte_val(*ptep) |= _PAGE_INVALID;
+ set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_INVALID)));
mm->context.flush_mm = 1;
} else
ptep_ipte_global(mm, addr, ptep, nodat);
@@ -107,32 +125,23 @@ static inline pte_t ptep_flush_lazy(struct mm_struct *mm,
static inline pgste_t pgste_get_lock(pte_t *ptep)
{
- unsigned long new = 0;
+ unsigned long value = 0;
#ifdef CONFIG_PGSTE
- unsigned long old;
-
- asm(
- " lg %0,%2\n"
- "0: lgr %1,%0\n"
- " nihh %0,0xff7f\n" /* clear PCL bit in old */
- " oihh %1,0x0080\n" /* set PCL bit in new */
- " csg %0,%1,%2\n"
- " jl 0b\n"
- : "=&d" (old), "=&d" (new), "=Q" (ptep[PTRS_PER_PTE])
- : "Q" (ptep[PTRS_PER_PTE]) : "cc", "memory");
+ unsigned long *ptr = (unsigned long *)(ptep + PTRS_PER_PTE);
+
+ do {
+ value = __atomic64_or_barrier(PGSTE_PCL_BIT, ptr);
+ } while (value & PGSTE_PCL_BIT);
+ value |= PGSTE_PCL_BIT;
#endif
- return __pgste(new);
+ return __pgste(value);
}
static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste)
{
#ifdef CONFIG_PGSTE
- asm(
- " nihh %1,0xff7f\n" /* clear PCL bit */
- " stg %1,%0\n"
- : "=Q" (ptep[PTRS_PER_PTE])
- : "d" (pgste_val(pgste)), "Q" (ptep[PTRS_PER_PTE])
- : "cc", "memory");
+ barrier();
+ WRITE_ONCE(*(unsigned long *)(ptep + PTRS_PER_PTE), pgste_val(pgste) & ~PGSTE_PCL_BIT);
#endif
}
@@ -206,15 +215,15 @@ static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry)
* Without enhanced suppression-on-protection force
* the dirty bit on for all writable ptes.
*/
- pte_val(entry) |= _PAGE_DIRTY;
- pte_val(entry) &= ~_PAGE_PROTECT;
+ entry = set_pte_bit(entry, __pgprot(_PAGE_DIRTY));
+ entry = clear_pte_bit(entry, __pgprot(_PAGE_PROTECT));
}
if (!(pte_val(entry) & _PAGE_PROTECT))
/* This pte allows write access, set user-dirty */
pgste_val(pgste) |= PGSTE_UC_BIT;
}
#endif
- *ptep = entry;
+ set_pte(ptep, entry);
return pgste;
}
@@ -257,12 +266,12 @@ static inline pte_t ptep_xchg_commit(struct mm_struct *mm,
pgste = pgste_update_all(old, pgste, mm);
if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) ==
_PGSTE_GPS_USAGE_UNUSED)
- pte_val(old) |= _PAGE_UNUSED;
+ old = set_pte_bit(old, __pgprot(_PAGE_UNUSED));
}
pgste = pgste_set_pte(ptep, pgste, new);
pgste_set_unlock(ptep, pgste);
} else {
- *ptep = new;
+ set_pte(ptep, new);
}
return old;
}
@@ -284,6 +293,31 @@ pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr,
}
EXPORT_SYMBOL(ptep_xchg_direct);
+/*
+ * Caller must check that new PTE only differs in _PAGE_PROTECT HW bit, so that
+ * RDP can be used instead of IPTE. See also comments at pte_allow_rdp().
+ */
+void ptep_reset_dat_prot(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
+ pte_t new)
+{
+ preempt_disable();
+ atomic_inc(&mm->context.flush_count);
+ if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
+ __ptep_rdp(addr, ptep, 0, 0, 1);
+ else
+ __ptep_rdp(addr, ptep, 0, 0, 0);
+ /*
+ * PTE is not invalidated by RDP, only _PAGE_PROTECT is cleared. That
+ * means it is still valid and active, and must not be changed according
+ * to the architecture. But writing a new value that only differs in SW
+ * bits is allowed.
+ */
+ set_pte(ptep, new);
+ atomic_dec(&mm->context.flush_count);
+ preempt_enable();
+}
+EXPORT_SYMBOL(ptep_reset_dat_prot);
+
pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t new)
{
@@ -327,14 +361,14 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
struct mm_struct *mm = vma->vm_mm;
if (!MACHINE_HAS_NX)
- pte_val(pte) &= ~_PAGE_NOEXEC;
+ pte = clear_pte_bit(pte, __pgprot(_PAGE_NOEXEC));
if (mm_has_pgste(mm)) {
pgste = pgste_get(ptep);
pgste_set_key(ptep, pgste, pte, mm);
pgste = pgste_set_pte(ptep, pgste, pte);
pgste_set_unlock(ptep, pgste);
} else {
- *ptep = pte;
+ set_pte(ptep, pte);
}
preempt_enable();
}
@@ -399,7 +433,7 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm,
atomic_inc(&mm->context.flush_count);
if (cpumask_equal(&mm->context.cpu_attach_mask,
cpumask_of(smp_processor_id()))) {
- pmd_val(*pmdp) |= _SEGMENT_ENTRY_INVALID;
+ set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_INVALID)));
mm->context.flush_mm = 1;
if (mm_has_pgste(mm))
gmap_pmdp_invalidate(mm, addr);
@@ -411,22 +445,36 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm,
}
#ifdef CONFIG_PGSTE
-static pmd_t *pmd_alloc_map(struct mm_struct *mm, unsigned long addr)
+static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp)
{
+ struct vm_area_struct *vma;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
- pmd_t *pmd;
+
+ /* We need a valid VMA, otherwise this is clearly a fault. */
+ vma = vma_lookup(mm, addr);
+ if (!vma)
+ return -EFAULT;
pgd = pgd_offset(mm, addr);
- p4d = p4d_alloc(mm, pgd, addr);
- if (!p4d)
- return NULL;
- pud = pud_alloc(mm, p4d, addr);
- if (!pud)
- return NULL;
- pmd = pmd_alloc(mm, pud, addr);
- return pmd;
+ if (!pgd_present(*pgd))
+ return -ENOENT;
+
+ p4d = p4d_offset(pgd, addr);
+ if (!p4d_present(*p4d))
+ return -ENOENT;
+
+ pud = pud_offset(p4d, addr);
+ if (!pud_present(*pud))
+ return -ENOENT;
+
+ /* Large PUDs are not supported yet. */
+ if (pud_large(*pud))
+ return -EFAULT;
+
+ *pmdp = pmd_offset(pud, addr);
+ return 0;
}
#endif
@@ -437,7 +485,7 @@ pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr,
preempt_disable();
old = pmdp_flush_direct(mm, addr, pmdp);
- *pmdp = new;
+ set_pmd(pmdp, new);
preempt_enable();
return old;
}
@@ -450,7 +498,7 @@ pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr,
preempt_disable();
old = pmdp_flush_lazy(mm, addr, pmdp);
- *pmdp = new;
+ set_pmd(pmdp, new);
preempt_enable();
return old;
}
@@ -507,7 +555,7 @@ pud_t pudp_xchg_direct(struct mm_struct *mm, unsigned long addr,
preempt_disable();
old = pudp_flush_direct(mm, addr, pudp);
- *pudp = new;
+ set_pud(pudp, new);
preempt_enable();
return old;
}
@@ -547,9 +595,9 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
list_del(lh);
}
ptep = (pte_t *) pgtable;
- pte_val(*ptep) = _PAGE_INVALID;
+ set_pte(ptep, __pte(_PAGE_INVALID));
ptep++;
- pte_val(*ptep) = _PAGE_INVALID;
+ set_pte(ptep, __pte(_PAGE_INVALID));
return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -614,12 +662,12 @@ int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
if (prot == PROT_NONE && !pte_i) {
ptep_flush_direct(mm, addr, ptep, nodat);
pgste = pgste_update_all(entry, pgste, mm);
- pte_val(entry) |= _PAGE_INVALID;
+ entry = set_pte_bit(entry, __pgprot(_PAGE_INVALID));
}
if (prot == PROT_READ && !pte_p) {
ptep_flush_direct(mm, addr, ptep, nodat);
- pte_val(entry) &= ~_PAGE_INVALID;
- pte_val(entry) |= _PAGE_PROTECT;
+ entry = clear_pte_bit(entry, __pgprot(_PAGE_INVALID));
+ entry = set_pte_bit(entry, __pgprot(_PAGE_PROTECT));
}
pgste_val(pgste) |= bit;
pgste = pgste_set_pte(ptep, pgste, entry);
@@ -643,8 +691,8 @@ int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
!(pte_val(pte) & _PAGE_PROTECT))) {
pgste_val(spgste) |= PGSTE_VSIE_BIT;
tpgste = pgste_get_lock(tptep);
- pte_val(tpte) = (pte_val(spte) & PAGE_MASK) |
- (pte_val(pte) & _PAGE_PROTECT);
+ tpte = __pte((pte_val(spte) & PAGE_MASK) |
+ (pte_val(pte) & _PAGE_PROTECT));
/* don't touch the storage key - it belongs to parent pgste */
tpgste = pgste_set_pte(tptep, tpgste, tpte);
pgste_set_unlock(tptep, tpgste);
@@ -673,7 +721,7 @@ static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
if (!non_swap_entry(entry))
dec_mm_counter(mm, MM_SWAPENTS);
else if (is_migration_entry(entry)) {
- struct page *page = migration_entry_to_page(entry);
+ struct page *page = pfn_swap_entry_to_page(entry);
dec_mm_counter(mm, mm_counter(page));
}
@@ -699,7 +747,7 @@ void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
pte_clear(mm, addr, ptep);
}
if (reset)
- pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
+ pgste_val(pgste) &= ~(_PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT);
pgste_set_unlock(ptep, pgste);
preempt_enable();
}
@@ -716,7 +764,7 @@ void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
pgste_val(pgste) |= PGSTE_GR_BIT | PGSTE_GC_BIT;
ptev = pte_val(*ptep);
if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
- page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1);
+ page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 0);
pgste_set_unlock(ptep, pgste);
preempt_enable();
}
@@ -741,10 +789,10 @@ bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr,
nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
ptep_ipte_global(mm, addr, ptep, nodat);
if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE))
- pte_val(pte) |= _PAGE_PROTECT;
+ pte = set_pte_bit(pte, __pgprot(_PAGE_PROTECT));
else
- pte_val(pte) |= _PAGE_INVALID;
- *ptep = pte;
+ pte = set_pte_bit(pte, __pgprot(_PAGE_INVALID));
+ set_pte(ptep, pte);
}
pgste_set_unlock(ptep, pgste);
return dirty;
@@ -760,14 +808,23 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp;
pte_t *ptep;
- pmdp = pmd_alloc_map(mm, addr);
- if (unlikely(!pmdp))
+ /*
+ * If we don't have a PTE table and if there is no huge page mapped,
+ * we can ignore attempts to set the key to 0, because it already is 0.
+ */
+ switch (pmd_lookup(mm, addr, &pmdp)) {
+ case -ENOENT:
+ return key ? -EFAULT : 0;
+ case 0:
+ break;
+ default:
return -EFAULT;
-
+ }
+again:
ptl = pmd_lock(mm, pmdp);
if (!pmd_present(*pmdp)) {
spin_unlock(ptl);
- return -EFAULT;
+ return key ? -EFAULT : 0;
}
if (pmd_large(*pmdp)) {
@@ -783,10 +840,9 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
}
spin_unlock(ptl);
- ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl);
- if (unlikely(!ptep))
- return -EFAULT;
-
+ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ if (!ptep)
+ goto again;
new = old = pgste_get_lock(ptep);
pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
PGSTE_ACC_BITS | PGSTE_FP_BIT);
@@ -816,7 +872,7 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
}
EXPORT_SYMBOL(set_guest_storage_key);
-/**
+/*
* Conditionally set a guest storage key (handling csske).
* oldkey will be updated when either mr or mc is set and a pointer is given.
*
@@ -849,7 +905,7 @@ int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
}
EXPORT_SYMBOL(cond_set_guest_storage_key);
-/**
+/*
* Reset a guest reference bit (rrbe), returning the reference and changed bit.
*
* Returns < 0 in case of error, otherwise the cc to be reported to the guest.
@@ -863,14 +919,23 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr)
pte_t *ptep;
int cc = 0;
- pmdp = pmd_alloc_map(mm, addr);
- if (unlikely(!pmdp))
+ /*
+ * If we don't have a PTE table and if there is no huge page mapped,
+ * the storage key is 0 and there is nothing for us to do.
+ */
+ switch (pmd_lookup(mm, addr, &pmdp)) {
+ case -ENOENT:
+ return 0;
+ case 0:
+ break;
+ default:
return -EFAULT;
-
+ }
+again:
ptl = pmd_lock(mm, pmdp);
if (!pmd_present(*pmdp)) {
spin_unlock(ptl);
- return -EFAULT;
+ return 0;
}
if (pmd_large(*pmdp)) {
@@ -882,10 +947,9 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr)
}
spin_unlock(ptl);
- ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl);
- if (unlikely(!ptep))
- return -EFAULT;
-
+ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ if (!ptep)
+ goto again;
new = old = pgste_get_lock(ptep);
/* Reset guest reference bit only */
pgste_val(new) &= ~PGSTE_GR_BIT;
@@ -917,15 +981,24 @@ int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp;
pte_t *ptep;
- pmdp = pmd_alloc_map(mm, addr);
- if (unlikely(!pmdp))
- return -EFAULT;
+ /*
+ * If we don't have a PTE table and if there is no huge page mapped,
+ * the storage key is 0.
+ */
+ *key = 0;
+ switch (pmd_lookup(mm, addr, &pmdp)) {
+ case -ENOENT:
+ return 0;
+ case 0:
+ break;
+ default:
+ return -EFAULT;
+ }
+again:
ptl = pmd_lock(mm, pmdp);
if (!pmd_present(*pmdp)) {
- /* Not yet mapped memory has a zero key */
spin_unlock(ptl);
- *key = 0;
return 0;
}
@@ -938,10 +1011,9 @@ int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
}
spin_unlock(ptl);
- ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl);
- if (unlikely(!ptep))
- return -EFAULT;
-
+ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ if (!ptep)
+ goto again;
pgste = pgste_get_lock(ptep);
*key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
paddr = pte_val(*ptep) & PAGE_MASK;
@@ -970,6 +1042,7 @@ EXPORT_SYMBOL(get_guest_storage_key);
int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc,
unsigned long *oldpte, unsigned long *oldpgste)
{
+ struct vm_area_struct *vma;
unsigned long pgstev;
spinlock_t *ptl;
pgste_t pgste;
@@ -979,6 +1052,10 @@ int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc,
WARN_ON_ONCE(orc > ESSA_MAX);
if (unlikely(orc > ESSA_MAX))
return -EINVAL;
+
+ vma = vma_lookup(mm, hva);
+ if (!vma || is_vm_hugetlb_page(vma))
+ return -EFAULT;
ptep = get_locked_pte(mm, hva, &ptl);
if (unlikely(!ptep))
return -EFAULT;
@@ -1071,10 +1148,14 @@ EXPORT_SYMBOL(pgste_perform_essa);
int set_pgste_bits(struct mm_struct *mm, unsigned long hva,
unsigned long bits, unsigned long value)
{
+ struct vm_area_struct *vma;
spinlock_t *ptl;
pgste_t new;
pte_t *ptep;
+ vma = vma_lookup(mm, hva);
+ if (!vma || is_vm_hugetlb_page(vma))
+ return -EFAULT;
ptep = get_locked_pte(mm, hva, &ptl);
if (unlikely(!ptep))
return -EFAULT;
@@ -1099,9 +1180,13 @@ EXPORT_SYMBOL(set_pgste_bits);
*/
int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep)
{
+ struct vm_area_struct *vma;
spinlock_t *ptl;
pte_t *ptep;
+ vma = vma_lookup(mm, hva);
+ if (!vma || is_vm_hugetlb_page(vma))
+ return -EFAULT;
ptep = get_locked_pte(mm, hva, &ptl);
if (unlikely(!ptep))
return -EFAULT;
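
The storage-key hunks above stop allocating page tables during lookup and must now cope with pte_offset_map_lock() returning NULL when the PTE table is freed concurrently, which is why each helper gained an "again:" retry label. A condensed sketch of that flow, assuming the caller already resolved the PMD via pmd_lookup(); the function name is invented for illustration and is not part of the patch.

static pte_t *guest_key_pte_lock(struct mm_struct *mm, unsigned long addr,
				 pmd_t *pmdp, spinlock_t **ptlp)
{
	spinlock_t *ptl;
	pte_t *ptep;

again:
	ptl = pmd_lock(mm, pmdp);
	if (!pmd_present(*pmdp) || pmd_large(*pmdp)) {
		/* caller treats "not mapped" as key 0 and handles huge mappings */
		spin_unlock(ptl);
		return NULL;
	}
	spin_unlock(ptl);
	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	if (!ptep)
		goto again;	/* PTE table vanished under us, retry */
	*ptlp = ptl;
	return ptep;
}
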
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index b403fa14847d..186a020857cf 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -1,9 +1,9 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright IBM Corp. 2006
- * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>
*/
+#include <linux/memory_hotplug.h>
#include <linux/memblock.h>
#include <linux/pfn.h>
#include <linux/mm.h>
@@ -11,9 +11,12 @@
#include <linux/list.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
+#include <linux/sort.h>
+#include <asm/page-states.h>
#include <asm/cacheflush.h>
+#include <asm/nospec-branch.h>
+#include <asm/ctlreg.h>
#include <asm/pgalloc.h>
-#include <asm/pgtable.h>
#include <asm/setup.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
@@ -21,21 +24,22 @@
static DEFINE_MUTEX(vmem_mutex);
-struct memory_segment {
- struct list_head list;
- unsigned long start;
- unsigned long size;
-};
-
-static LIST_HEAD(mem_segs);
-
static void __ref *vmem_alloc_pages(unsigned int order)
{
unsigned long size = PAGE_SIZE << order;
if (slab_is_available())
return (void *)__get_free_pages(GFP_KERNEL, order);
- return (void *) memblock_phys_alloc(size, size);
+ return memblock_alloc(size, size);
+}
+
+static void vmem_free_pages(unsigned long addr, int order)
+{
+ /* We don't expect boot memory to be removed ever. */
+ if (!slab_is_available() ||
+ WARN_ON_ONCE(PageReserved(virt_to_page((void *)addr))))
+ return;
+ free_pages(addr, order);
}
void *vmem_crst_alloc(unsigned long val)
@@ -43,8 +47,10 @@ void *vmem_crst_alloc(unsigned long val)
unsigned long *table;
table = vmem_alloc_pages(CRST_ALLOC_ORDER);
- if (table)
- crst_table_init(table, val);
+ if (!table)
+ return NULL;
+ crst_table_init(table, val);
+ __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER);
return table;
}
@@ -56,389 +62,613 @@ pte_t __ref *vmem_pte_alloc(void)
if (slab_is_available())
pte = (pte_t *) page_table_alloc(&init_mm);
else
- pte = (pte_t *) memblock_phys_alloc(size, size);
+ pte = (pte_t *) memblock_alloc(size, size);
if (!pte)
return NULL;
memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE);
+ __arch_set_page_dat(pte, 1);
return pte;
}
+static void vmem_pte_free(unsigned long *table)
+{
+ /* We don't expect boot memory to be removed ever. */
+ if (!slab_is_available() ||
+ WARN_ON_ONCE(PageReserved(virt_to_page(table))))
+ return;
+ page_table_free(&init_mm, table);
+}
+
+#define PAGE_UNUSED 0xFD
+
/*
- * Add a physical memory range to the 1:1 mapping.
+ * The unused vmemmap range, which was not yet memset(PAGE_UNUSED), ranges
+ * from unused_sub_pmd_start to the next PMD_SIZE boundary.
*/
-static int vmem_add_mem(unsigned long start, unsigned long size)
-{
- unsigned long pgt_prot, sgt_prot, r3_prot;
- unsigned long pages4k, pages1m, pages2g;
- unsigned long end = start + size;
- unsigned long address = start;
- pgd_t *pg_dir;
- p4d_t *p4_dir;
- pud_t *pu_dir;
- pmd_t *pm_dir;
- pte_t *pt_dir;
- int ret = -ENOMEM;
+static unsigned long unused_sub_pmd_start;
+
+static void vmemmap_flush_unused_sub_pmd(void)
+{
+ if (!unused_sub_pmd_start)
+ return;
+ memset((void *)unused_sub_pmd_start, PAGE_UNUSED,
+ ALIGN(unused_sub_pmd_start, PMD_SIZE) - unused_sub_pmd_start);
+ unused_sub_pmd_start = 0;
+}
- pgt_prot = pgprot_val(PAGE_KERNEL);
- sgt_prot = pgprot_val(SEGMENT_KERNEL);
- r3_prot = pgprot_val(REGION3_KERNEL);
- if (!MACHINE_HAS_NX) {
- pgt_prot &= ~_PAGE_NOEXEC;
- sgt_prot &= ~_SEGMENT_ENTRY_NOEXEC;
- r3_prot &= ~_REGION_ENTRY_NOEXEC;
+static void vmemmap_mark_sub_pmd_used(unsigned long start, unsigned long end)
+{
+ /*
+ * As we expect to add in the same granularity as we remove, it's
+ * sufficient to mark only some piece used to block the memmap page from
+ * getting removed (just in case the memmap never gets initialized,
+ * e.g., because the memory block never gets onlined).
+ */
+ memset((void *)start, 0, sizeof(struct page));
+}
+
+static void vmemmap_use_sub_pmd(unsigned long start, unsigned long end)
+{
+ /*
+ * We only optimize if the new used range directly follows the
+ * previously unused range (esp., when populating consecutive sections).
+ */
+ if (unused_sub_pmd_start == start) {
+ unused_sub_pmd_start = end;
+ if (likely(IS_ALIGNED(unused_sub_pmd_start, PMD_SIZE)))
+ unused_sub_pmd_start = 0;
+ return;
}
- pages4k = pages1m = pages2g = 0;
- while (address < end) {
- pg_dir = pgd_offset_k(address);
- if (pgd_none(*pg_dir)) {
- p4_dir = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
- if (!p4_dir)
- goto out;
- pgd_populate(&init_mm, pg_dir, p4_dir);
- }
- p4_dir = p4d_offset(pg_dir, address);
- if (p4d_none(*p4_dir)) {
- pu_dir = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
- if (!pu_dir)
- goto out;
- p4d_populate(&init_mm, p4_dir, pu_dir);
- }
- pu_dir = pud_offset(p4_dir, address);
- if (MACHINE_HAS_EDAT2 && pud_none(*pu_dir) && address &&
- !(address & ~PUD_MASK) && (address + PUD_SIZE <= end) &&
- !debug_pagealloc_enabled()) {
- pud_val(*pu_dir) = address | r3_prot;
- address += PUD_SIZE;
- pages2g++;
- continue;
- }
- if (pud_none(*pu_dir)) {
- pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
- if (!pm_dir)
- goto out;
- pud_populate(&init_mm, pu_dir, pm_dir);
- }
- pm_dir = pmd_offset(pu_dir, address);
- if (MACHINE_HAS_EDAT1 && pmd_none(*pm_dir) && address &&
- !(address & ~PMD_MASK) && (address + PMD_SIZE <= end) &&
- !debug_pagealloc_enabled()) {
- pmd_val(*pm_dir) = address | sgt_prot;
- address += PMD_SIZE;
- pages1m++;
+ vmemmap_flush_unused_sub_pmd();
+ vmemmap_mark_sub_pmd_used(start, end);
+}
+
+static void vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end)
+{
+ unsigned long page = ALIGN_DOWN(start, PMD_SIZE);
+
+ vmemmap_flush_unused_sub_pmd();
+
+ /* Could be our memmap page is filled with PAGE_UNUSED already ... */
+ vmemmap_mark_sub_pmd_used(start, end);
+
+ /* Mark the unused parts of the new memmap page PAGE_UNUSED. */
+ if (!IS_ALIGNED(start, PMD_SIZE))
+ memset((void *)page, PAGE_UNUSED, start - page);
+ /*
+ * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of
+ * consecutive sections. Remember for the last added PMD the last
+ * unused range in the populated PMD.
+ */
+ if (!IS_ALIGNED(end, PMD_SIZE))
+ unused_sub_pmd_start = end;
+}
+
+/* Returns true if the PMD is completely unused and can be freed. */
+static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end)
+{
+ unsigned long page = ALIGN_DOWN(start, PMD_SIZE);
+
+ vmemmap_flush_unused_sub_pmd();
+ memset((void *)start, PAGE_UNUSED, end - start);
+ return !memchr_inv((void *)page, PAGE_UNUSED, PMD_SIZE);
+}
+
+/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
+static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr,
+ unsigned long end, bool add, bool direct)
+{
+ unsigned long prot, pages = 0;
+ int ret = -ENOMEM;
+ pte_t *pte;
+
+ prot = pgprot_val(PAGE_KERNEL);
+ if (!MACHINE_HAS_NX)
+ prot &= ~_PAGE_NOEXEC;
+
+ pte = pte_offset_kernel(pmd, addr);
+ for (; addr < end; addr += PAGE_SIZE, pte++) {
+ if (!add) {
+ if (pte_none(*pte))
+ continue;
+ if (!direct)
+ vmem_free_pages((unsigned long) pfn_to_virt(pte_pfn(*pte)), 0);
+ pte_clear(&init_mm, addr, pte);
+ } else if (pte_none(*pte)) {
+ if (!direct) {
+ void *new_page = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);
+
+ if (!new_page)
+ goto out;
+ set_pte(pte, __pte(__pa(new_page) | prot));
+ } else {
+ set_pte(pte, __pte(__pa(addr) | prot));
+ }
+ } else {
continue;
}
- if (pmd_none(*pm_dir)) {
- pt_dir = vmem_pte_alloc();
- if (!pt_dir)
- goto out;
- pmd_populate(&init_mm, pm_dir, pt_dir);
- }
-
- pt_dir = pte_offset_kernel(pm_dir, address);
- pte_val(*pt_dir) = address | pgt_prot;
- address += PAGE_SIZE;
- pages4k++;
+ pages++;
}
ret = 0;
out:
- update_page_count(PG_DIRECT_MAP_4K, pages4k);
- update_page_count(PG_DIRECT_MAP_1M, pages1m);
- update_page_count(PG_DIRECT_MAP_2G, pages2g);
+ if (direct)
+ update_page_count(PG_DIRECT_MAP_4K, add ? pages : -pages);
return ret;
}
-/*
- * Remove a physical memory range from the 1:1 mapping.
- * Currently only invalidates page table entries.
- */
-static void vmem_remove_range(unsigned long start, unsigned long size)
+static void try_free_pte_table(pmd_t *pmd, unsigned long start)
{
- unsigned long pages4k, pages1m, pages2g;
- unsigned long end = start + size;
- unsigned long address = start;
- pgd_t *pg_dir;
- p4d_t *p4_dir;
- pud_t *pu_dir;
- pmd_t *pm_dir;
- pte_t *pt_dir;
-
- pages4k = pages1m = pages2g = 0;
- while (address < end) {
- pg_dir = pgd_offset_k(address);
- if (pgd_none(*pg_dir)) {
- address += PGDIR_SIZE;
- continue;
- }
- p4_dir = p4d_offset(pg_dir, address);
- if (p4d_none(*p4_dir)) {
- address += P4D_SIZE;
- continue;
- }
- pu_dir = pud_offset(p4_dir, address);
- if (pud_none(*pu_dir)) {
- address += PUD_SIZE;
- continue;
- }
- if (pud_large(*pu_dir)) {
- pud_clear(pu_dir);
- address += PUD_SIZE;
- pages2g++;
- continue;
- }
- pm_dir = pmd_offset(pu_dir, address);
- if (pmd_none(*pm_dir)) {
- address += PMD_SIZE;
- continue;
- }
- if (pmd_large(*pm_dir)) {
- pmd_clear(pm_dir);
- address += PMD_SIZE;
- pages1m++;
- continue;
- }
- pt_dir = pte_offset_kernel(pm_dir, address);
- pte_clear(&init_mm, address, pt_dir);
- address += PAGE_SIZE;
- pages4k++;
+ pte_t *pte;
+ int i;
+
+ /* We can safely assume this is fully in 1:1 mapping & vmemmap area */
+ pte = pte_offset_kernel(pmd, start);
+ for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
+ if (!pte_none(*pte))
+ return;
}
- flush_tlb_kernel_range(start, end);
- update_page_count(PG_DIRECT_MAP_4K, -pages4k);
- update_page_count(PG_DIRECT_MAP_1M, -pages1m);
- update_page_count(PG_DIRECT_MAP_2G, -pages2g);
+ vmem_pte_free((unsigned long *) pmd_deref(*pmd));
+ pmd_clear(pmd);
}
-/*
- * Add a backed mem_map array to the virtual mem_map array.
- */
-int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
- struct vmem_altmap *altmap)
-{
- unsigned long pgt_prot, sgt_prot;
- unsigned long address = start;
- pgd_t *pg_dir;
- p4d_t *p4_dir;
- pud_t *pu_dir;
- pmd_t *pm_dir;
- pte_t *pt_dir;
+/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
+static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
+ unsigned long end, bool add, bool direct)
+{
+ unsigned long next, prot, pages = 0;
int ret = -ENOMEM;
+ pmd_t *pmd;
+ pte_t *pte;
- pgt_prot = pgprot_val(PAGE_KERNEL);
- sgt_prot = pgprot_val(SEGMENT_KERNEL);
- if (!MACHINE_HAS_NX) {
- pgt_prot &= ~_PAGE_NOEXEC;
- sgt_prot &= ~_SEGMENT_ENTRY_NOEXEC;
- }
- for (address = start; address < end;) {
- pg_dir = pgd_offset_k(address);
- if (pgd_none(*pg_dir)) {
- p4_dir = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
- if (!p4_dir)
- goto out;
- pgd_populate(&init_mm, pg_dir, p4_dir);
- }
+ prot = pgprot_val(SEGMENT_KERNEL);
+ if (!MACHINE_HAS_NX)
+ prot &= ~_SEGMENT_ENTRY_NOEXEC;
- p4_dir = p4d_offset(pg_dir, address);
- if (p4d_none(*p4_dir)) {
- pu_dir = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
- if (!pu_dir)
- goto out;
- p4d_populate(&init_mm, p4_dir, pu_dir);
- }
+ pmd = pmd_offset(pud, addr);
+ for (; addr < end; addr = next, pmd++) {
+ next = pmd_addr_end(addr, end);
+ if (!add) {
+ if (pmd_none(*pmd))
+ continue;
+ if (pmd_large(*pmd)) {
+ if (IS_ALIGNED(addr, PMD_SIZE) &&
+ IS_ALIGNED(next, PMD_SIZE)) {
+ if (!direct)
+ vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE));
+ pmd_clear(pmd);
+ pages++;
+ } else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) {
+ vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE));
+ pmd_clear(pmd);
+ }
+ continue;
+ }
+ } else if (pmd_none(*pmd)) {
+ if (IS_ALIGNED(addr, PMD_SIZE) &&
+ IS_ALIGNED(next, PMD_SIZE) &&
+ MACHINE_HAS_EDAT1 && direct &&
+ !debug_pagealloc_enabled()) {
+ set_pmd(pmd, __pmd(__pa(addr) | prot));
+ pages++;
+ continue;
+ } else if (!direct && MACHINE_HAS_EDAT1) {
+ void *new_page;
- pu_dir = pud_offset(p4_dir, address);
- if (pud_none(*pu_dir)) {
- pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
- if (!pm_dir)
+ /*
+ * Use 1MB frames for vmemmap if available. We
+ * always use large frames even if they are only
+ * partially used. Otherwise we would also have
+ * page tables, since vmemmap_populate gets
+ * called for each section separately.
+ */
+ new_page = vmemmap_alloc_block(PMD_SIZE, NUMA_NO_NODE);
+ if (new_page) {
+ set_pmd(pmd, __pmd(__pa(new_page) | prot));
+ if (!IS_ALIGNED(addr, PMD_SIZE) ||
+ !IS_ALIGNED(next, PMD_SIZE)) {
+ vmemmap_use_new_sub_pmd(addr, next);
+ }
+ continue;
+ }
+ }
+ pte = vmem_pte_alloc();
+ if (!pte)
goto out;
- pud_populate(&init_mm, pu_dir, pm_dir);
+ pmd_populate(&init_mm, pmd, pte);
+ } else if (pmd_large(*pmd)) {
+ if (!direct)
+ vmemmap_use_sub_pmd(addr, next);
+ continue;
}
+ ret = modify_pte_table(pmd, addr, next, add, direct);
+ if (ret)
+ goto out;
+ if (!add)
+ try_free_pte_table(pmd, addr & PMD_MASK);
+ }
+ ret = 0;
+out:
+ if (direct)
+ update_page_count(PG_DIRECT_MAP_1M, add ? pages : -pages);
+ return ret;
+}
- pm_dir = pmd_offset(pu_dir, address);
- if (pmd_none(*pm_dir)) {
- /* Use 1MB frames for vmemmap if available. We always
- * use large frames even if they are only partially
- * used.
- * Otherwise we would have also page tables since
- * vmemmap_populate gets called for each section
- * separately. */
- if (MACHINE_HAS_EDAT1) {
- void *new_page;
+static void try_free_pmd_table(pud_t *pud, unsigned long start)
+{
+ pmd_t *pmd;
+ int i;
+
+ pmd = pmd_offset(pud, start);
+ for (i = 0; i < PTRS_PER_PMD; i++, pmd++)
+ if (!pmd_none(*pmd))
+ return;
+ vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER);
+ pud_clear(pud);
+}
- new_page = vmemmap_alloc_block(PMD_SIZE, node);
- if (!new_page)
- goto out;
- pmd_val(*pm_dir) = __pa(new_page) | sgt_prot;
- address = (address + PMD_SIZE) & PMD_MASK;
+static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
+ bool add, bool direct)
+{
+ unsigned long next, prot, pages = 0;
+ int ret = -ENOMEM;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ prot = pgprot_val(REGION3_KERNEL);
+ if (!MACHINE_HAS_NX)
+ prot &= ~_REGION_ENTRY_NOEXEC;
+ pud = pud_offset(p4d, addr);
+ for (; addr < end; addr = next, pud++) {
+ next = pud_addr_end(addr, end);
+ if (!add) {
+ if (pud_none(*pud))
+ continue;
+ if (pud_large(*pud)) {
+ if (IS_ALIGNED(addr, PUD_SIZE) &&
+ IS_ALIGNED(next, PUD_SIZE)) {
+ pud_clear(pud);
+ pages++;
+ }
+ continue;
+ }
+ } else if (pud_none(*pud)) {
+ if (IS_ALIGNED(addr, PUD_SIZE) &&
+ IS_ALIGNED(next, PUD_SIZE) &&
+ MACHINE_HAS_EDAT2 && direct &&
+ !debug_pagealloc_enabled()) {
+ set_pud(pud, __pud(__pa(addr) | prot));
+ pages++;
continue;
}
- pt_dir = vmem_pte_alloc();
- if (!pt_dir)
+ pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
+ if (!pmd)
goto out;
- pmd_populate(&init_mm, pm_dir, pt_dir);
- } else if (pmd_large(*pm_dir)) {
- address = (address + PMD_SIZE) & PMD_MASK;
+ pud_populate(&init_mm, pud, pmd);
+ } else if (pud_large(*pud)) {
continue;
}
+ ret = modify_pmd_table(pud, addr, next, add, direct);
+ if (ret)
+ goto out;
+ if (!add)
+ try_free_pmd_table(pud, addr & PUD_MASK);
+ }
+ ret = 0;
+out:
+ if (direct)
+ update_page_count(PG_DIRECT_MAP_2G, add ? pages : -pages);
+ return ret;
+}
- pt_dir = pte_offset_kernel(pm_dir, address);
- if (pte_none(*pt_dir)) {
- void *new_page;
+static void try_free_pud_table(p4d_t *p4d, unsigned long start)
+{
+ pud_t *pud;
+ int i;
+
+ pud = pud_offset(p4d, start);
+ for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
+ if (!pud_none(*pud))
+ return;
+ }
+ vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER);
+ p4d_clear(p4d);
+}
- new_page = vmemmap_alloc_block(PAGE_SIZE, node);
- if (!new_page)
+static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end,
+ bool add, bool direct)
+{
+ unsigned long next;
+ int ret = -ENOMEM;
+ p4d_t *p4d;
+ pud_t *pud;
+
+ p4d = p4d_offset(pgd, addr);
+ for (; addr < end; addr = next, p4d++) {
+ next = p4d_addr_end(addr, end);
+ if (!add) {
+ if (p4d_none(*p4d))
+ continue;
+ } else if (p4d_none(*p4d)) {
+ pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
+ if (!pud)
goto out;
- pte_val(*pt_dir) = __pa(new_page) | pgt_prot;
+ p4d_populate(&init_mm, p4d, pud);
}
- address += PAGE_SIZE;
+ ret = modify_pud_table(p4d, addr, next, add, direct);
+ if (ret)
+ goto out;
+ if (!add)
+ try_free_pud_table(p4d, addr & P4D_MASK);
}
ret = 0;
out:
return ret;
}
-void vmemmap_free(unsigned long start, unsigned long end,
- struct vmem_altmap *altmap)
+static void try_free_p4d_table(pgd_t *pgd, unsigned long start)
{
+ p4d_t *p4d;
+ int i;
+
+ p4d = p4d_offset(pgd, start);
+ for (i = 0; i < PTRS_PER_P4D; i++, p4d++) {
+ if (!p4d_none(*p4d))
+ return;
+ }
+ vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER);
+ pgd_clear(pgd);
}
-/*
- * Add memory segment to the segment list if it doesn't overlap with
- * an already present segment.
- */
-static int insert_memory_segment(struct memory_segment *seg)
+static int modify_pagetable(unsigned long start, unsigned long end, bool add,
+ bool direct)
{
- struct memory_segment *tmp;
+ unsigned long addr, next;
+ int ret = -ENOMEM;
+ pgd_t *pgd;
+ p4d_t *p4d;
+
+ if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end)))
+ return -EINVAL;
+ /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
+ if (WARN_ON_ONCE(end > VMALLOC_START))
+ return -EINVAL;
+ for (addr = start; addr < end; addr = next) {
+ next = pgd_addr_end(addr, end);
+ pgd = pgd_offset_k(addr);
+
+ if (!add) {
+ if (pgd_none(*pgd))
+ continue;
+ } else if (pgd_none(*pgd)) {
+ p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
+ if (!p4d)
+ goto out;
+ pgd_populate(&init_mm, pgd, p4d);
+ }
+ ret = modify_p4d_table(pgd, addr, next, add, direct);
+ if (ret)
+ goto out;
+ if (!add)
+ try_free_p4d_table(pgd, addr & PGDIR_MASK);
+ }
+ ret = 0;
+out:
+ if (!add)
+ flush_tlb_kernel_range(start, end);
+ return ret;
+}
- if (seg->start + seg->size > VMEM_MAX_PHYS ||
- seg->start + seg->size < seg->start)
- return -ERANGE;
+static int add_pagetable(unsigned long start, unsigned long end, bool direct)
+{
+ return modify_pagetable(start, end, true, direct);
+}
- list_for_each_entry(tmp, &mem_segs, list) {
- if (seg->start >= tmp->start + tmp->size)
- continue;
- if (seg->start + seg->size <= tmp->start)
- continue;
- return -ENOSPC;
- }
- list_add(&seg->list, &mem_segs);
- return 0;
+static int remove_pagetable(unsigned long start, unsigned long end, bool direct)
+{
+ return modify_pagetable(start, end, false, direct);
}
/*
- * Remove memory segment from the segment list.
+ * Add a physical memory range to the 1:1 mapping.
*/
-static void remove_memory_segment(struct memory_segment *seg)
+static int vmem_add_range(unsigned long start, unsigned long size)
{
- list_del(&seg->list);
+ start = (unsigned long)__va(start);
+ return add_pagetable(start, start + size, true);
}
-static void __remove_shared_memory(struct memory_segment *seg)
+/*
+ * Remove a physical memory range from the 1:1 mapping.
+ */
+static void vmem_remove_range(unsigned long start, unsigned long size)
{
- remove_memory_segment(seg);
- vmem_remove_range(seg->start, seg->size);
+ start = (unsigned long)__va(start);
+ remove_pagetable(start, start + size, true);
}
-int vmem_remove_mapping(unsigned long start, unsigned long size)
+/*
+ * Add a backed mem_map array to the virtual mem_map array.
+ */
+int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
+ struct vmem_altmap *altmap)
{
- struct memory_segment *seg;
int ret;
mutex_lock(&vmem_mutex);
+ /* We don't care about the node, just use NUMA_NO_NODE on allocations */
+ ret = add_pagetable(start, end, false);
+ if (ret)
+ remove_pagetable(start, end, false);
+ mutex_unlock(&vmem_mutex);
+ return ret;
+}
- ret = -ENOENT;
- list_for_each_entry(seg, &mem_segs, list) {
- if (seg->start == start && seg->size == size)
- break;
- }
+#ifdef CONFIG_MEMORY_HOTPLUG
- if (seg->start != start || seg->size != size)
- goto out;
+void vmemmap_free(unsigned long start, unsigned long end,
+ struct vmem_altmap *altmap)
+{
+ mutex_lock(&vmem_mutex);
+ remove_pagetable(start, end, false);
+ mutex_unlock(&vmem_mutex);
+}
- ret = 0;
- __remove_shared_memory(seg);
- kfree(seg);
-out:
+#endif
+
+void vmem_remove_mapping(unsigned long start, unsigned long size)
+{
+ mutex_lock(&vmem_mutex);
+ vmem_remove_range(start, size);
mutex_unlock(&vmem_mutex);
- return ret;
+}
+
+struct range arch_get_mappable_range(void)
+{
+ struct range mhp_range;
+
+ mhp_range.start = 0;
+ mhp_range.end = max_mappable - 1;
+ return mhp_range;
}
int vmem_add_mapping(unsigned long start, unsigned long size)
{
- struct memory_segment *seg;
+ struct range range = arch_get_mappable_range();
int ret;
- mutex_lock(&vmem_mutex);
- ret = -ENOMEM;
- seg = kzalloc(sizeof(*seg), GFP_KERNEL);
- if (!seg)
- goto out;
- seg->start = start;
- seg->size = size;
-
- ret = insert_memory_segment(seg);
- if (ret)
- goto out_free;
+ if (start < range.start ||
+ start + size > range.end + 1 ||
+ start + size < start)
+ return -ERANGE;
- ret = vmem_add_mem(start, size);
+ mutex_lock(&vmem_mutex);
+ ret = vmem_add_range(start, size);
if (ret)
- goto out_remove;
- goto out;
-
-out_remove:
- __remove_shared_memory(seg);
-out_free:
- kfree(seg);
-out:
+ vmem_remove_range(start, size);
mutex_unlock(&vmem_mutex);
return ret;
}
/*
- * map whole physical memory to virtual memory (identity mapping)
- * we reserve enough space in the vmalloc area for vmemmap to hotplug
- * additional memory segments.
+ * Allocate new or return existing page-table entry, but do not map it
+ * to any physical address. If missing, allocate segment- and region-
+ * table entries along the way. Meeting a large segment- or region-table entry
+ * while traversing is an error, since the function is expected to be
+ * called against virtual regions reserved for 4KB mappings only.
*/
-void __init vmem_map_init(void)
+pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc)
{
- struct memblock_region *reg;
-
- for_each_memblock(memory, reg)
- vmem_add_mem(reg->base, reg->size);
- __set_memory((unsigned long)_stext,
- (unsigned long)(_etext - _stext) >> PAGE_SHIFT,
- SET_MEMORY_RO | SET_MEMORY_X);
- __set_memory((unsigned long)_etext,
- (unsigned long)(__end_rodata - _etext) >> PAGE_SHIFT,
- SET_MEMORY_RO);
- __set_memory((unsigned long)_sinittext,
- (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT,
- SET_MEMORY_RO | SET_MEMORY_X);
- __set_memory(__stext_dma, (__etext_dma - __stext_dma) >> PAGE_SHIFT,
- SET_MEMORY_RO | SET_MEMORY_X);
- pr_info("Write protected kernel read-only data: %luk\n",
- (unsigned long)(__end_rodata - _stext) >> 10);
+ pte_t *ptep = NULL;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = pgd_offset_k(addr);
+ if (pgd_none(*pgd)) {
+ if (!alloc)
+ goto out;
+ p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
+ if (!p4d)
+ goto out;
+ pgd_populate(&init_mm, pgd, p4d);
+ }
+ p4d = p4d_offset(pgd, addr);
+ if (p4d_none(*p4d)) {
+ if (!alloc)
+ goto out;
+ pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
+ if (!pud)
+ goto out;
+ p4d_populate(&init_mm, p4d, pud);
+ }
+ pud = pud_offset(p4d, addr);
+ if (pud_none(*pud)) {
+ if (!alloc)
+ goto out;
+ pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
+ if (!pmd)
+ goto out;
+ pud_populate(&init_mm, pud, pmd);
+ } else if (WARN_ON_ONCE(pud_large(*pud))) {
+ goto out;
+ }
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd)) {
+ if (!alloc)
+ goto out;
+ pte = vmem_pte_alloc();
+ if (!pte)
+ goto out;
+ pmd_populate(&init_mm, pmd, pte);
+ } else if (WARN_ON_ONCE(pmd_large(*pmd))) {
+ goto out;
+ }
+ ptep = pte_offset_kernel(pmd, addr);
+out:
+ return ptep;
}
-/*
- * Convert memblock.memory to a memory segment list so there is a single
- * list that contains all memory segments.
- */
-static int __init vmem_convert_memory_chunk(void)
+int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc)
{
- struct memblock_region *reg;
- struct memory_segment *seg;
+ pte_t *ptep, pte;
+
+ if (!IS_ALIGNED(addr, PAGE_SIZE))
+ return -EINVAL;
+ ptep = vmem_get_alloc_pte(addr, alloc);
+ if (!ptep)
+ return -ENOMEM;
+ __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
+ pte = mk_pte_phys(phys, prot);
+ set_pte(ptep, pte);
+ return 0;
+}
+
+int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot)
+{
+ int rc;
mutex_lock(&vmem_mutex);
- for_each_memblock(memory, reg) {
- seg = kzalloc(sizeof(*seg), GFP_KERNEL);
- if (!seg)
- panic("Out of memory...\n");
- seg->start = reg->base;
- seg->size = reg->size;
- insert_memory_segment(seg);
- }
+ rc = __vmem_map_4k_page(addr, phys, prot, true);
mutex_unlock(&vmem_mutex);
- return 0;
+ return rc;
}
-core_initcall(vmem_convert_memory_chunk);
+void vmem_unmap_4k_page(unsigned long addr)
+{
+ pte_t *ptep;
+
+ mutex_lock(&vmem_mutex);
+ ptep = virt_to_kpte(addr);
+ __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
+ pte_clear(&init_mm, addr, ptep);
+ mutex_unlock(&vmem_mutex);
+}
+
+void __init vmem_map_init(void)
+{
+ __set_memory_rox(_stext, _etext);
+ __set_memory_ro(_etext, __end_rodata);
+ __set_memory_rox(_sinittext, _einittext);
+ __set_memory_rox(__stext_amode31, __etext_amode31);
+ /*
+ * If the BEAR-enhancement facility is not installed the first
+ * prefix page is used to return to the previous context with
+ * an LPSWE instruction and therefore must be executable.
+ */
+ if (!static_key_enabled(&cpu_has_bear))
+ set_memory_x(0, 1);
+ if (debug_pagealloc_enabled()) {
+ /*
+ * Use RELOC_HIDE() as long as __va(0) translates to NULL,
+ * since performing pointer arithmetic on a NULL pointer
+ * has undefined behavior and generates compiler warnings.
+ */
+ __set_memory_4k(__va(0), RELOC_HIDE(__va(0), ident_map_size));
+ }
+ if (MACHINE_HAS_NX)
+ system_ctl_set_bit(0, CR0_INSTRUCTION_EXEC_PROTECTION_BIT);
+ pr_info("Write protected kernel read-only data: %luk\n",
+ (unsigned long)(__end_rodata - _stext) >> 10);
+}
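
The vmem.c rewrite above routes both the kernel 1:1 mapping and the vmemmap through a single recursive walker, modify_pagetable(): add selects populate versus unmap, and direct selects the 1:1 mapping (which maintains the PG_DIRECT_MAP_* counters) versus the vmemmap, whose partially used 1 MB frames are tracked with the PAGE_UNUSED byte. A rough sketch of both points; the wrapper names are illustrative, not patch code.

/* Illustrative wrappers around the walker introduced above. */
static int direct_map_add(unsigned long phys, unsigned long size)
{
	unsigned long start = (unsigned long)__va(phys);

	return modify_pagetable(start, start + size, true, true);
}

static int vmemmap_range_add(unsigned long start, unsigned long end)
{
	return modify_pagetable(start, end, true, false);
}

/*
 * A 1 MB vmemmap frame may be freed once its unused parts, filled with
 * PAGE_UNUSED by vmemmap_unuse_sub_pmd(), cover the whole frame.
 */
static bool vmemmap_pmd_fully_unused(unsigned long pmd_start)
{
	return !memchr_inv((void *)pmd_start, PAGE_UNUSED, PMD_SIZE);
}
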
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 8d2134136290..b418333bb086 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -7,7 +7,6 @@
* - HAVE_MARCH_Z196_FEATURES: laal, laalg
* - HAVE_MARCH_Z10_FEATURES: msfi, cgrj, clgrj
* - HAVE_MARCH_Z9_109_FEATURES: alfi, llilf, clfi, oilf, nilf
- * - PACK_STACK
* - 64BIT
*
* Copyright IBM Corp. 2012,2015
@@ -26,10 +25,12 @@
#include <linux/mm.h>
#include <linux/kernel.h>
#include <asm/cacheflush.h>
+#include <asm/extable.h>
#include <asm/dis.h>
#include <asm/facility.h>
#include <asm/nospec-branch.h>
#include <asm/set_memory.h>
+#include <asm/text-patching.h>
#include "bpf_jit.h"
struct bpf_jit {
@@ -49,13 +50,14 @@ struct bpf_jit {
int r1_thunk_ip; /* Address of expoline thunk for 'br %r1' */
int r14_thunk_ip; /* Address of expoline thunk for 'br %r14' */
int tail_call_start; /* Tail call start offset */
- int labels[1]; /* Labels for local jumps */
+ int excnt; /* Number of exception table entries */
+ int prologue_plt_ret; /* Return address for prologue hotpatch PLT */
+ int prologue_plt; /* Start of prologue hotpatch PLT */
};
#define SEEN_MEM BIT(0) /* use mem[] for temporary storage */
#define SEEN_LITERAL BIT(1) /* code uses literals */
#define SEEN_FUNC BIT(2) /* calls C functions */
-#define SEEN_TAIL_CALL BIT(3) /* code uses tail calls */
#define SEEN_STACK (SEEN_FUNC | SEEN_MEM)
/*
@@ -68,6 +70,10 @@ struct bpf_jit {
#define REG_0 REG_W0 /* Register 0 */
#define REG_1 REG_W1 /* Register 1 */
#define REG_2 BPF_REG_1 /* Register 2 */
+#define REG_3 BPF_REG_2 /* Register 3 */
+#define REG_4 BPF_REG_3 /* Register 4 */
+#define REG_7 BPF_REG_6 /* Register 7 */
+#define REG_8 BPF_REG_7 /* Register 8 */
#define REG_14 BPF_REG_0 /* Register 14 */
/*
@@ -112,7 +118,7 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1)
{
u32 r1 = reg2hex[b1];
- if (!jit->seen_reg[r1] && r1 >= 6 && r1 <= 15)
+ if (r1 >= 6 && r1 <= 15 && !jit->seen_reg[r1])
jit->seen_reg[r1] = 1;
}
@@ -228,18 +234,18 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1)
REG_SET_SEEN(b3); \
})
-#define EMIT6_PCREL_LABEL(op1, op2, b1, b2, label, mask) \
+#define EMIT6_PCREL_RIEB(op1, op2, b1, b2, mask, target) \
({ \
- int rel = (jit->labels[label] - jit->prg) >> 1; \
+ unsigned int rel = (int)((target) - jit->prg) / 2; \
_EMIT6((op1) | reg(b1, b2) << 16 | (rel & 0xffff), \
(op2) | (mask) << 12); \
REG_SET_SEEN(b1); \
REG_SET_SEEN(b2); \
})
-#define EMIT6_PCREL_IMM_LABEL(op1, op2, b1, imm, label, mask) \
+#define EMIT6_PCREL_RIEC(op1, op2, b1, imm, mask, target) \
({ \
- int rel = (jit->labels[label] - jit->prg) >> 1; \
+ unsigned int rel = (int)((target) - jit->prg) / 2; \
_EMIT6((op1) | (reg_high(b1) | (mask)) << 16 | \
(rel & 0xffff), (op2) | ((imm) & 0xff) << 8); \
REG_SET_SEEN(b1); \
@@ -248,8 +254,7 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1)
#define EMIT6_PCREL(op1, op2, b1, b2, i, off, mask) \
({ \
- /* Branch instruction needs 6 bytes */ \
- int rel = (addrs[(i) + (off) + 1] - (addrs[(i) + 1] - 6)) / 2;\
+ int rel = (addrs[(i) + (off) + 1] - jit->prg) / 2; \
_EMIT6((op1) | reg(b1, b2) << 16 | (rel & 0xffff), (op2) | (mask));\
REG_SET_SEEN(b1); \
REG_SET_SEEN(b2); \
@@ -489,21 +494,79 @@ static void save_restore_regs(struct bpf_jit *jit, int op, u32 stack_depth)
} while (re <= last);
}
+static void bpf_skip(struct bpf_jit *jit, int size)
+{
+ if (size >= 6 && !is_valid_rel(size)) {
+ /* brcl 0xf,size */
+ EMIT6_PCREL_RIL(0xc0f4000000, size);
+ size -= 6;
+ } else if (size >= 4 && is_valid_rel(size)) {
+ /* brc 0xf,size */
+ EMIT4_PCREL(0xa7f40000, size);
+ size -= 4;
+ }
+ while (size >= 2) {
+ /* bcr 0,%0 */
+ _EMIT2(0x0700);
+ size -= 2;
+ }
+}
+
+/*
+ * PLT for hotpatchable calls. The calling convention is the same as for the
+ * ftrace hotpatch trampolines: %r0 is return address, %r1 is clobbered.
+ */
+extern const char bpf_plt[];
+extern const char bpf_plt_ret[];
+extern const char bpf_plt_target[];
+extern const char bpf_plt_end[];
+#define BPF_PLT_SIZE 32
+asm(
+ ".pushsection .rodata\n"
+ " .balign 8\n"
+ "bpf_plt:\n"
+ " lgrl %r0,bpf_plt_ret\n"
+ " lgrl %r1,bpf_plt_target\n"
+ " br %r1\n"
+ " .balign 8\n"
+ "bpf_plt_ret: .quad 0\n"
+ "bpf_plt_target: .quad 0\n"
+ "bpf_plt_end:\n"
+ " .popsection\n"
+);
+
+static void bpf_jit_plt(void *plt, void *ret, void *target)
+{
+ memcpy(plt, bpf_plt, BPF_PLT_SIZE);
+ *(void **)((char *)plt + (bpf_plt_ret - bpf_plt)) = ret;
+ *(void **)((char *)plt + (bpf_plt_target - bpf_plt)) = target ?: ret;
+}
+
/*
* Emit function prologue
*
* Save registers and create stack frame if necessary.
- * See stack frame layout desription in "bpf_jit.h"!
+ * See stack frame layout description in "bpf_jit.h"!
*/
-static void bpf_jit_prologue(struct bpf_jit *jit, u32 stack_depth)
+static void bpf_jit_prologue(struct bpf_jit *jit, struct bpf_prog *fp,
+ u32 stack_depth)
{
- if (jit->seen & SEEN_TAIL_CALL) {
+ /* No-op for hotpatching */
+ /* brcl 0,prologue_plt */
+ EMIT6_PCREL_RILC(0xc0040000, 0, jit->prologue_plt);
+ jit->prologue_plt_ret = jit->prg;
+
+ if (!bpf_is_subprog(fp)) {
+ /* Initialize the tail call counter in the main program. */
/* xc STK_OFF_TCCNT(4,%r15),STK_OFF_TCCNT(%r15) */
_EMIT6(0xd703f000 | STK_OFF_TCCNT, 0xf000 | STK_OFF_TCCNT);
} else {
- /* j tail_call_start: NOP if no tail calls are used */
- EMIT4_PCREL(0xa7f40000, 6);
- _EMIT2(0);
+ /*
+ * Skip the tail call counter initialization in subprograms.
+ * Insert nops in order to have tail_call_start at a
+ * predictable offset.
+ */
+ bpf_skip(jit, 6);
}
/* Tail calls have to skip above initialization */
jit->tail_call_start = jit->prg;
@@ -539,6 +602,43 @@ static void bpf_jit_prologue(struct bpf_jit *jit, u32 stack_depth)
}
/*
+ * Emit an expoline for a jump that follows
+ */
+static void emit_expoline(struct bpf_jit *jit)
+{
+ /* exrl %r0,.+10 */
+ EMIT6_PCREL_RIL(0xc6000000, jit->prg + 10);
+ /* j . */
+ EMIT4_PCREL(0xa7f40000, 0);
+}
+
+/*
+ * Emit __s390_indirect_jump_r1 thunk if necessary
+ */
+static void emit_r1_thunk(struct bpf_jit *jit)
+{
+ if (nospec_uses_trampoline()) {
+ jit->r1_thunk_ip = jit->prg;
+ emit_expoline(jit);
+ /* br %r1 */
+ _EMIT2(0x07f1);
+ }
+}
+
+/*
+ * Call r1 either directly or via __s390_indirect_jump_r1 thunk
+ */
+static void call_r1(struct bpf_jit *jit)
+{
+ if (nospec_uses_trampoline())
+ /* brasl %r14,__s390_indirect_jump_r1 */
+ EMIT6_PCREL_RILB(0xc0050000, REG_14, jit->r1_thunk_ip);
+ else
+ /* basr %r14,%r1 */
+ EMIT2(0x0d00, REG_14, REG_1);
+}
+
+/*
* Function epilogue
*/
static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth)
@@ -548,42 +648,124 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth)
EMIT4(0xb9040000, REG_2, BPF_REG_0);
/* Restore registers */
save_restore_regs(jit, REGS_RESTORE, stack_depth);
- if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) {
+ if (nospec_uses_trampoline()) {
jit->r14_thunk_ip = jit->prg;
/* Generate __s390_indirect_jump_r14 thunk */
- if (test_facility(35)) {
- /* exrl %r0,.+10 */
- EMIT6_PCREL_RIL(0xc6000000, jit->prg + 10);
- } else {
- /* larl %r1,.+14 */
- EMIT6_PCREL_RILB(0xc0000000, REG_1, jit->prg + 14);
- /* ex 0,0(%r1) */
- EMIT4_DISP(0x44000000, REG_0, REG_1, 0);
- }
- /* j . */
- EMIT4_PCREL(0xa7f40000, 0);
+ emit_expoline(jit);
}
/* br %r14 */
_EMIT2(0x07fe);
- if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable &&
- (is_first_pass(jit) || (jit->seen & SEEN_FUNC))) {
- jit->r1_thunk_ip = jit->prg;
- /* Generate __s390_indirect_jump_r1 thunk */
- if (test_facility(35)) {
- /* exrl %r0,.+10 */
- EMIT6_PCREL_RIL(0xc6000000, jit->prg + 10);
- /* j . */
- EMIT4_PCREL(0xa7f40000, 0);
- /* br %r1 */
- _EMIT2(0x07f1);
- } else {
- /* ex 0,S390_lowcore.br_r1_tampoline */
- EMIT4_DISP(0x44000000, REG_0, REG_0,
- offsetof(struct lowcore, br_r1_trampoline));
- /* j . */
- EMIT4_PCREL(0xa7f40000, 0);
- }
+ if (is_first_pass(jit) || (jit->seen & SEEN_FUNC))
+ emit_r1_thunk(jit);
+
+ jit->prg = ALIGN(jit->prg, 8);
+ jit->prologue_plt = jit->prg;
+ if (jit->prg_buf)
+ bpf_jit_plt(jit->prg_buf + jit->prg,
+ jit->prg_buf + jit->prologue_plt_ret, NULL);
+ jit->prg += BPF_PLT_SIZE;
+}
+
+static int get_probe_mem_regno(const u8 *insn)
+{
+ /*
+ * insn must point to llgc, llgh, llgf, lg, lgb, lgh or lgf, which have
+ * destination register at the same position.
+ */
+ if (insn[0] != 0xe3) /* common prefix */
+ return -1;
+ if (insn[5] != 0x90 && /* llgc */
+ insn[5] != 0x91 && /* llgh */
+ insn[5] != 0x16 && /* llgf */
+ insn[5] != 0x04 && /* lg */
+ insn[5] != 0x77 && /* lgb */
+ insn[5] != 0x15 && /* lgh */
+ insn[5] != 0x14) /* lgf */
+ return -1;
+ return insn[1] >> 4;
+}
+
+bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs)
+{
+ regs->psw.addr = extable_fixup(x);
+ regs->gprs[x->data] = 0;
+ return true;
+}
+
+static int bpf_jit_probe_mem(struct bpf_jit *jit, struct bpf_prog *fp,
+ int probe_prg, int nop_prg)
+{
+ struct exception_table_entry *ex;
+ int reg, prg;
+ s64 delta;
+ u8 *insn;
+ int i;
+
+ if (!fp->aux->extable)
+ /* Do nothing during early JIT passes. */
+ return 0;
+ insn = jit->prg_buf + probe_prg;
+ reg = get_probe_mem_regno(insn);
+ if (WARN_ON_ONCE(reg < 0))
+ /* JIT bug - unexpected probe instruction. */
+ return -1;
+ if (WARN_ON_ONCE(probe_prg + insn_length(*insn) != nop_prg))
+ /* JIT bug - gap between probe and nop instructions. */
+ return -1;
+ for (i = 0; i < 2; i++) {
+ if (WARN_ON_ONCE(jit->excnt >= fp->aux->num_exentries))
+ /* Verifier bug - not enough entries. */
+ return -1;
+ ex = &fp->aux->extable[jit->excnt];
+ /* Add extable entries for probe and nop instructions. */
+ prg = i == 0 ? probe_prg : nop_prg;
+ delta = jit->prg_buf + prg - (u8 *)&ex->insn;
+ if (WARN_ON_ONCE(delta < INT_MIN || delta > INT_MAX))
+ /* JIT bug - code and extable must be close. */
+ return -1;
+ ex->insn = delta;
+ /*
+ * Always land on the nop. Note that extable infrastructure
+ * ignores fixup field, it is handled by ex_handler_bpf().
+ */
+ delta = jit->prg_buf + nop_prg - (u8 *)&ex->fixup;
+ if (WARN_ON_ONCE(delta < INT_MIN || delta > INT_MAX))
+ /* JIT bug - landing pad and extable must be close. */
+ return -1;
+ ex->fixup = delta;
+ ex->type = EX_TYPE_BPF;
+ ex->data = reg;
+ jit->excnt++;
+ }
+ return 0;
+}
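The two entries written above rely on the kernel's self-relative exception-table encoding: each 32-bit field stores the distance from the field itself to the target, which is why both deltas are range-checked against INT_MIN/INT_MAX. A hedged sketch of how such a relative fixup is turned back into an absolute address (this mirrors the generic helper, it is not code from this patch):

/*
 * Sketch, assuming self-relative extable entries: the stored delta is
 * added back to the address of the field holding it, reconstructing
 * the absolute landing address (here, the nop emitted after the probe).
 */
static unsigned long example_extable_fixup(const struct exception_table_entry *x)
{
	return (unsigned long)&x->fixup + x->fixup;
}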
+
+/*
+ * Sign-extend the register if necessary
+ */
+static int sign_extend(struct bpf_jit *jit, int r, u8 size, u8 flags)
+{
+ if (!(flags & BTF_FMODEL_SIGNED_ARG))
+ return 0;
+
+ switch (size) {
+ case 1:
+ /* lgbr %r,%r */
+ EMIT4(0xb9060000, r, r);
+ return 0;
+ case 2:
+ /* lghr %r,%r */
+ EMIT4(0xb9070000, r, r);
+ return 0;
+ case 4:
+ /* lgfr %r,%r */
+ EMIT4(0xb9140000, r, r);
+ return 0;
+ case 8:
+ return 0;
+ default:
+ return -1;
}
}
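The helper above exists because BPF keeps sub-64-bit values zero-extended in its 64-bit registers, while the s390x calling convention expects signed narrow arguments to arrive sign-extended. A hypothetical example of the mismatch (the kfunc name and signature are made up for illustration):

/*
 * Assume a kfunc  long do_work(s32 delta).  The s390x ABI wants
 * "delta" sign-extended to 64 bits in %r2, but the BPF caller may
 * leave the upper 32 bits zeroed; emitting "lgfr %r2,%r2" (the
 * size == 4 case above) restores the ABI contract before the call.
 */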
@@ -594,30 +776,71 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth)
* stack space for the large switch statement.
*/
static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
- int i, bool extra_pass)
+ int i, bool extra_pass, u32 stack_depth)
{
struct bpf_insn *insn = &fp->insnsi[i];
+ s32 branch_oc_off = insn->off;
u32 dst_reg = insn->dst_reg;
u32 src_reg = insn->src_reg;
int last, insn_count = 1;
u32 *addrs = jit->addrs;
s32 imm = insn->imm;
s16 off = insn->off;
+ int probe_prg = -1;
unsigned int mask;
+ int nop_prg;
+ int err;
+
+ if (BPF_CLASS(insn->code) == BPF_LDX &&
+ (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
+ BPF_MODE(insn->code) == BPF_PROBE_MEMSX))
+ probe_prg = jit->prg;
switch (insn->code) {
/*
* BPF_MOV
*/
- case BPF_ALU | BPF_MOV | BPF_X: /* dst = (u32) src */
- /* llgfr %dst,%src */
- EMIT4(0xb9160000, dst_reg, src_reg);
- if (insn_is_zext(&insn[1]))
- insn_count = 2;
+ case BPF_ALU | BPF_MOV | BPF_X:
+ switch (insn->off) {
+ case 0: /* DST = (u32) SRC */
+ /* llgfr %dst,%src */
+ EMIT4(0xb9160000, dst_reg, src_reg);
+ if (insn_is_zext(&insn[1]))
+ insn_count = 2;
+ break;
+ case 8: /* DST = (u32)(s8) SRC */
+ /* lbr %dst,%src */
+ EMIT4(0xb9260000, dst_reg, src_reg);
+ /* llgfr %dst,%dst */
+ EMIT4(0xb9160000, dst_reg, dst_reg);
+ break;
+ case 16: /* DST = (u32)(s16) SRC */
+ /* lhr %dst,%src */
+ EMIT4(0xb9270000, dst_reg, src_reg);
+ /* llgfr %dst,%dst */
+ EMIT4(0xb9160000, dst_reg, dst_reg);
+ break;
+ }
break;
- case BPF_ALU64 | BPF_MOV | BPF_X: /* dst = src */
- /* lgr %dst,%src */
- EMIT4(0xb9040000, dst_reg, src_reg);
+ case BPF_ALU64 | BPF_MOV | BPF_X:
+ switch (insn->off) {
+ case 0: /* DST = SRC */
+ /* lgr %dst,%src */
+ EMIT4(0xb9040000, dst_reg, src_reg);
+ break;
+ case 8: /* DST = (s8) SRC */
+ /* lgbr %dst,%src */
+ EMIT4(0xb9060000, dst_reg, src_reg);
+ break;
+ case 16: /* DST = (s16) SRC */
+ /* lghr %dst,%src */
+ EMIT4(0xb9070000, dst_reg, src_reg);
+ break;
+ case 32: /* DST = (s32) SRC */
+ /* lgfr %dst,%src */
+ EMIT4(0xb9140000, dst_reg, src_reg);
+ break;
+ }
break;
case BPF_ALU | BPF_MOV | BPF_K: /* dst = (u32) imm */
/* llilf %dst,imm */
@@ -656,10 +879,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
EMIT4(0xb9080000, dst_reg, src_reg);
break;
case BPF_ALU | BPF_ADD | BPF_K: /* dst = (u32) dst + (u32) imm */
- if (!imm)
- break;
- /* alfi %dst,imm */
- EMIT6_IMM(0xc20b0000, dst_reg, imm);
+ if (imm != 0) {
+ /* alfi %dst,imm */
+ EMIT6_IMM(0xc20b0000, dst_reg, imm);
+ }
EMIT_ZERO(dst_reg);
break;
case BPF_ALU64 | BPF_ADD | BPF_K: /* dst = dst + imm */
@@ -681,17 +904,22 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
EMIT4(0xb9090000, dst_reg, src_reg);
break;
case BPF_ALU | BPF_SUB | BPF_K: /* dst = (u32) dst - (u32) imm */
- if (!imm)
- break;
- /* alfi %dst,-imm */
- EMIT6_IMM(0xc20b0000, dst_reg, -imm);
+ if (imm != 0) {
+ /* alfi %dst,-imm */
+ EMIT6_IMM(0xc20b0000, dst_reg, -imm);
+ }
EMIT_ZERO(dst_reg);
break;
case BPF_ALU64 | BPF_SUB | BPF_K: /* dst = dst - imm */
if (!imm)
break;
- /* agfi %dst,-imm */
- EMIT6_IMM(0xc2080000, dst_reg, -imm);
+ if (imm == -0x80000000) {
+ /* algfi %dst,0x80000000 */
+ EMIT6_IMM(0xc20a0000, dst_reg, 0x80000000);
+ } else {
+ /* agfi %dst,-imm */
+ EMIT6_IMM(0xc2080000, dst_reg, -imm);
+ }
break;
/*
* BPF_MUL
@@ -706,10 +934,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
EMIT4(0xb90c0000, dst_reg, src_reg);
break;
case BPF_ALU | BPF_MUL | BPF_K: /* dst = (u32) dst * (u32) imm */
- if (imm == 1)
- break;
- /* msfi %r5,imm */
- EMIT6_IMM(0xc2010000, dst_reg, imm);
+ if (imm != 1) {
+			/* msfi %dst,imm */
+ EMIT6_IMM(0xc2010000, dst_reg, imm);
+ }
EMIT_ZERO(dst_reg);
break;
case BPF_ALU64 | BPF_MUL | BPF_K: /* dst = dst * imm */
@@ -721,64 +949,115 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
/*
* BPF_DIV / BPF_MOD
*/
- case BPF_ALU | BPF_DIV | BPF_X: /* dst = (u32) dst / (u32) src */
- case BPF_ALU | BPF_MOD | BPF_X: /* dst = (u32) dst % (u32) src */
+ case BPF_ALU | BPF_DIV | BPF_X:
+ case BPF_ALU | BPF_MOD | BPF_X:
{
int rc_reg = BPF_OP(insn->code) == BPF_DIV ? REG_W1 : REG_W0;
- /* lhi %w0,0 */
- EMIT4_IMM(0xa7080000, REG_W0, 0);
- /* lr %w1,%dst */
- EMIT2(0x1800, REG_W1, dst_reg);
- /* dlr %w0,%src */
- EMIT4(0xb9970000, REG_W0, src_reg);
+ switch (off) {
+ case 0: /* dst = (u32) dst {/,%} (u32) src */
+ /* xr %w0,%w0 */
+ EMIT2(0x1700, REG_W0, REG_W0);
+ /* lr %w1,%dst */
+ EMIT2(0x1800, REG_W1, dst_reg);
+ /* dlr %w0,%src */
+ EMIT4(0xb9970000, REG_W0, src_reg);
+ break;
+ case 1: /* dst = (u32) ((s32) dst {/,%} (s32) src) */
+ /* lgfr %r1,%dst */
+ EMIT4(0xb9140000, REG_W1, dst_reg);
+ /* dsgfr %r0,%src */
+ EMIT4(0xb91d0000, REG_W0, src_reg);
+ break;
+ }
/* llgfr %dst,%rc */
EMIT4(0xb9160000, dst_reg, rc_reg);
if (insn_is_zext(&insn[1]))
insn_count = 2;
break;
}
- case BPF_ALU64 | BPF_DIV | BPF_X: /* dst = dst / src */
- case BPF_ALU64 | BPF_MOD | BPF_X: /* dst = dst % src */
+ case BPF_ALU64 | BPF_DIV | BPF_X:
+ case BPF_ALU64 | BPF_MOD | BPF_X:
{
int rc_reg = BPF_OP(insn->code) == BPF_DIV ? REG_W1 : REG_W0;
- /* lghi %w0,0 */
- EMIT4_IMM(0xa7090000, REG_W0, 0);
- /* lgr %w1,%dst */
- EMIT4(0xb9040000, REG_W1, dst_reg);
- /* dlgr %w0,%dst */
- EMIT4(0xb9870000, REG_W0, src_reg);
+ switch (off) {
+ case 0: /* dst = dst {/,%} src */
+ /* lghi %w0,0 */
+ EMIT4_IMM(0xa7090000, REG_W0, 0);
+ /* lgr %w1,%dst */
+ EMIT4(0xb9040000, REG_W1, dst_reg);
+ /* dlgr %w0,%src */
+ EMIT4(0xb9870000, REG_W0, src_reg);
+ break;
+ case 1: /* dst = (s64) dst {/,%} (s64) src */
+ /* lgr %w1,%dst */
+ EMIT4(0xb9040000, REG_W1, dst_reg);
+ /* dsgr %w0,%src */
+ EMIT4(0xb90d0000, REG_W0, src_reg);
+ break;
+ }
/* lgr %dst,%rc */
EMIT4(0xb9040000, dst_reg, rc_reg);
break;
}
- case BPF_ALU | BPF_DIV | BPF_K: /* dst = (u32) dst / (u32) imm */
- case BPF_ALU | BPF_MOD | BPF_K: /* dst = (u32) dst % (u32) imm */
+ case BPF_ALU | BPF_DIV | BPF_K:
+ case BPF_ALU | BPF_MOD | BPF_K:
{
int rc_reg = BPF_OP(insn->code) == BPF_DIV ? REG_W1 : REG_W0;
if (imm == 1) {
if (BPF_OP(insn->code) == BPF_MOD)
- /* lhgi %dst,0 */
+ /* lghi %dst,0 */
EMIT4_IMM(0xa7090000, dst_reg, 0);
+ else
+ EMIT_ZERO(dst_reg);
break;
}
- /* lhi %w0,0 */
- EMIT4_IMM(0xa7080000, REG_W0, 0);
- /* lr %w1,%dst */
- EMIT2(0x1800, REG_W1, dst_reg);
if (!is_first_pass(jit) && can_use_ldisp_for_lit32(jit)) {
- /* dl %w0,<d(imm)>(%l) */
- EMIT6_DISP_LH(0xe3000000, 0x0097, REG_W0, REG_0, REG_L,
- EMIT_CONST_U32(imm));
+ switch (off) {
+ case 0: /* dst = (u32) dst {/,%} (u32) imm */
+ /* xr %w0,%w0 */
+ EMIT2(0x1700, REG_W0, REG_W0);
+ /* lr %w1,%dst */
+ EMIT2(0x1800, REG_W1, dst_reg);
+ /* dl %w0,<d(imm)>(%l) */
+ EMIT6_DISP_LH(0xe3000000, 0x0097, REG_W0, REG_0,
+ REG_L, EMIT_CONST_U32(imm));
+ break;
+ case 1: /* dst = (s32) dst {/,%} (s32) imm */
+ /* lgfr %r1,%dst */
+ EMIT4(0xb9140000, REG_W1, dst_reg);
+ /* dsgf %r0,<d(imm)>(%l) */
+ EMIT6_DISP_LH(0xe3000000, 0x001d, REG_W0, REG_0,
+ REG_L, EMIT_CONST_U32(imm));
+ break;
+ }
} else {
- /* lgfrl %dst,imm */
- EMIT6_PCREL_RILB(0xc40c0000, dst_reg,
- _EMIT_CONST_U32(imm));
- jit->seen |= SEEN_LITERAL;
- /* dlr %w0,%dst */
- EMIT4(0xb9970000, REG_W0, dst_reg);
+ switch (off) {
+ case 0: /* dst = (u32) dst {/,%} (u32) imm */
+ /* xr %w0,%w0 */
+ EMIT2(0x1700, REG_W0, REG_W0);
+ /* lr %w1,%dst */
+ EMIT2(0x1800, REG_W1, dst_reg);
+ /* lrl %dst,imm */
+ EMIT6_PCREL_RILB(0xc40d0000, dst_reg,
+ _EMIT_CONST_U32(imm));
+ jit->seen |= SEEN_LITERAL;
+ /* dlr %w0,%dst */
+ EMIT4(0xb9970000, REG_W0, dst_reg);
+ break;
+ case 1: /* dst = (s32) dst {/,%} (s32) imm */
+ /* lgfr %w1,%dst */
+ EMIT4(0xb9140000, REG_W1, dst_reg);
+ /* lgfrl %dst,imm */
+ EMIT6_PCREL_RILB(0xc40c0000, dst_reg,
+ _EMIT_CONST_U32(imm));
+ jit->seen |= SEEN_LITERAL;
+ /* dsgr %w0,%dst */
+ EMIT4(0xb90d0000, REG_W0, dst_reg);
+ break;
+ }
}
/* llgfr %dst,%rc */
EMIT4(0xb9160000, dst_reg, rc_reg);
@@ -786,8 +1065,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
insn_count = 2;
break;
}
- case BPF_ALU64 | BPF_DIV | BPF_K: /* dst = dst / imm */
- case BPF_ALU64 | BPF_MOD | BPF_K: /* dst = dst % imm */
+ case BPF_ALU64 | BPF_DIV | BPF_K:
+ case BPF_ALU64 | BPF_MOD | BPF_K:
{
int rc_reg = BPF_OP(insn->code) == BPF_DIV ? REG_W1 : REG_W0;
@@ -797,21 +1076,50 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
EMIT4_IMM(0xa7090000, dst_reg, 0);
break;
}
- /* lghi %w0,0 */
- EMIT4_IMM(0xa7090000, REG_W0, 0);
- /* lgr %w1,%dst */
- EMIT4(0xb9040000, REG_W1, dst_reg);
if (!is_first_pass(jit) && can_use_ldisp_for_lit64(jit)) {
- /* dlg %w0,<d(imm)>(%l) */
- EMIT6_DISP_LH(0xe3000000, 0x0087, REG_W0, REG_0, REG_L,
- EMIT_CONST_U64(imm));
+ switch (off) {
+ case 0: /* dst = dst {/,%} imm */
+ /* lghi %w0,0 */
+ EMIT4_IMM(0xa7090000, REG_W0, 0);
+ /* lgr %w1,%dst */
+ EMIT4(0xb9040000, REG_W1, dst_reg);
+ /* dlg %w0,<d(imm)>(%l) */
+ EMIT6_DISP_LH(0xe3000000, 0x0087, REG_W0, REG_0,
+ REG_L, EMIT_CONST_U64(imm));
+ break;
+ case 1: /* dst = (s64) dst {/,%} (s64) imm */
+ /* lgr %w1,%dst */
+ EMIT4(0xb9040000, REG_W1, dst_reg);
+ /* dsg %w0,<d(imm)>(%l) */
+ EMIT6_DISP_LH(0xe3000000, 0x000d, REG_W0, REG_0,
+ REG_L, EMIT_CONST_U64(imm));
+ break;
+ }
} else {
- /* lgrl %dst,imm */
- EMIT6_PCREL_RILB(0xc4080000, dst_reg,
- _EMIT_CONST_U64(imm));
- jit->seen |= SEEN_LITERAL;
- /* dlgr %w0,%dst */
- EMIT4(0xb9870000, REG_W0, dst_reg);
+ switch (off) {
+ case 0: /* dst = dst {/,%} imm */
+ /* lghi %w0,0 */
+ EMIT4_IMM(0xa7090000, REG_W0, 0);
+ /* lgr %w1,%dst */
+ EMIT4(0xb9040000, REG_W1, dst_reg);
+ /* lgrl %dst,imm */
+ EMIT6_PCREL_RILB(0xc4080000, dst_reg,
+ _EMIT_CONST_U64(imm));
+ jit->seen |= SEEN_LITERAL;
+ /* dlgr %w0,%dst */
+ EMIT4(0xb9870000, REG_W0, dst_reg);
+ break;
+ case 1: /* dst = (s64) dst {/,%} (s64) imm */
+ /* lgr %w1,%dst */
+ EMIT4(0xb9040000, REG_W1, dst_reg);
+ /* lgrl %dst,imm */
+ EMIT6_PCREL_RILB(0xc4080000, dst_reg,
+ _EMIT_CONST_U64(imm));
+ jit->seen |= SEEN_LITERAL;
+ /* dsgr %w0,%dst */
+ EMIT4(0xb90d0000, REG_W0, dst_reg);
+ break;
+ }
}
/* lgr %dst,%rc */
EMIT4(0xb9040000, dst_reg, rc_reg);
@@ -894,10 +1202,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
EMIT4(0xb9820000, dst_reg, src_reg);
break;
case BPF_ALU | BPF_XOR | BPF_K: /* dst = (u32) dst ^ (u32) imm */
- if (!imm)
- break;
- /* xilf %dst,imm */
- EMIT6_IMM(0xc0070000, dst_reg, imm);
+ if (imm != 0) {
+ /* xilf %dst,imm */
+ EMIT6_IMM(0xc0070000, dst_reg, imm);
+ }
EMIT_ZERO(dst_reg);
break;
case BPF_ALU64 | BPF_XOR | BPF_K: /* dst = dst ^ imm */
@@ -928,10 +1236,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
EMIT6_DISP_LH(0xeb000000, 0x000d, dst_reg, dst_reg, src_reg, 0);
break;
case BPF_ALU | BPF_LSH | BPF_K: /* dst = (u32) dst << (u32) imm */
- if (imm == 0)
- break;
- /* sll %dst,imm(%r0) */
- EMIT4_DISP(0x89000000, dst_reg, REG_0, imm);
+ if (imm != 0) {
+ /* sll %dst,imm(%r0) */
+ EMIT4_DISP(0x89000000, dst_reg, REG_0, imm);
+ }
EMIT_ZERO(dst_reg);
break;
case BPF_ALU64 | BPF_LSH | BPF_K: /* dst = dst << imm */
@@ -953,10 +1261,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
EMIT6_DISP_LH(0xeb000000, 0x000c, dst_reg, dst_reg, src_reg, 0);
break;
case BPF_ALU | BPF_RSH | BPF_K: /* dst = (u32) dst >> (u32) imm */
- if (imm == 0)
- break;
- /* srl %dst,imm(%r0) */
- EMIT4_DISP(0x88000000, dst_reg, REG_0, imm);
+ if (imm != 0) {
+ /* srl %dst,imm(%r0) */
+ EMIT4_DISP(0x88000000, dst_reg, REG_0, imm);
+ }
EMIT_ZERO(dst_reg);
break;
case BPF_ALU64 | BPF_RSH | BPF_K: /* dst = dst >> imm */
@@ -978,10 +1286,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
EMIT6_DISP_LH(0xeb000000, 0x000a, dst_reg, dst_reg, src_reg, 0);
break;
case BPF_ALU | BPF_ARSH | BPF_K: /* ((s32) dst >> imm */
- if (imm == 0)
- break;
- /* sra %dst,imm(%r0) */
- EMIT4_DISP(0x8a000000, dst_reg, REG_0, imm);
+ if (imm != 0) {
+ /* sra %dst,imm(%r0) */
+ EMIT4_DISP(0x8a000000, dst_reg, REG_0, imm);
+ }
EMIT_ZERO(dst_reg);
break;
case BPF_ALU64 | BPF_ARSH | BPF_K: /* ((s64) dst) >>= imm */
@@ -1024,6 +1332,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
}
break;
case BPF_ALU | BPF_END | BPF_FROM_LE:
+ case BPF_ALU64 | BPF_END | BPF_FROM_LE:
switch (imm) {
case 16: /* dst = (u16) cpu_to_le16(dst) */
/* lrvr %dst,%dst */
@@ -1049,6 +1358,11 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
}
break;
/*
+ * BPF_NOSPEC (speculation barrier)
+ */
+ case BPF_ST | BPF_NOSPEC:
+ break;
+ /*
* BPF_ST(X)
*/
case BPF_STX | BPF_MEM | BPF_B: /* *(u8 *)(dst + off) = src_reg */
@@ -1100,45 +1414,118 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
jit->seen |= SEEN_MEM;
break;
/*
- * BPF_STX XADD (atomic_add)
+ * BPF_ATOMIC
*/
- case BPF_STX | BPF_XADD | BPF_W: /* *(u32 *)(dst + off) += src */
- /* laal %w0,%src,off(%dst) */
- EMIT6_DISP_LH(0xeb000000, 0x00fa, REG_W0, src_reg,
- dst_reg, off);
- jit->seen |= SEEN_MEM;
- break;
- case BPF_STX | BPF_XADD | BPF_DW: /* *(u64 *)(dst + off) += src */
- /* laalg %w0,%src,off(%dst) */
- EMIT6_DISP_LH(0xeb000000, 0x00ea, REG_W0, src_reg,
- dst_reg, off);
+ case BPF_STX | BPF_ATOMIC | BPF_DW:
+ case BPF_STX | BPF_ATOMIC | BPF_W:
+ {
+ bool is32 = BPF_SIZE(insn->code) == BPF_W;
+
+ switch (insn->imm) {
+/* {op32|op64} {%w0|%src},%src,off(%dst) */
+#define EMIT_ATOMIC(op32, op64) do { \
+ EMIT6_DISP_LH(0xeb000000, is32 ? (op32) : (op64), \
+ (insn->imm & BPF_FETCH) ? src_reg : REG_W0, \
+ src_reg, dst_reg, off); \
+ if (is32 && (insn->imm & BPF_FETCH)) \
+ EMIT_ZERO(src_reg); \
+} while (0)
+ case BPF_ADD:
+ case BPF_ADD | BPF_FETCH:
+ /* {laal|laalg} */
+ EMIT_ATOMIC(0x00fa, 0x00ea);
+ break;
+ case BPF_AND:
+ case BPF_AND | BPF_FETCH:
+ /* {lan|lang} */
+ EMIT_ATOMIC(0x00f4, 0x00e4);
+ break;
+ case BPF_OR:
+ case BPF_OR | BPF_FETCH:
+ /* {lao|laog} */
+ EMIT_ATOMIC(0x00f6, 0x00e6);
+ break;
+ case BPF_XOR:
+ case BPF_XOR | BPF_FETCH:
+ /* {lax|laxg} */
+ EMIT_ATOMIC(0x00f7, 0x00e7);
+ break;
+#undef EMIT_ATOMIC
+ case BPF_XCHG:
+ /* {ly|lg} %w0,off(%dst) */
+ EMIT6_DISP_LH(0xe3000000,
+ is32 ? 0x0058 : 0x0004, REG_W0, REG_0,
+ dst_reg, off);
+ /* 0: {csy|csg} %w0,%src,off(%dst) */
+ EMIT6_DISP_LH(0xeb000000, is32 ? 0x0014 : 0x0030,
+ REG_W0, src_reg, dst_reg, off);
+ /* brc 4,0b */
+ EMIT4_PCREL_RIC(0xa7040000, 4, jit->prg - 6);
+ /* {llgfr|lgr} %src,%w0 */
+ EMIT4(is32 ? 0xb9160000 : 0xb9040000, src_reg, REG_W0);
+ if (is32 && insn_is_zext(&insn[1]))
+ insn_count = 2;
+ break;
+ case BPF_CMPXCHG:
+ /* 0: {csy|csg} %b0,%src,off(%dst) */
+ EMIT6_DISP_LH(0xeb000000, is32 ? 0x0014 : 0x0030,
+ BPF_REG_0, src_reg, dst_reg, off);
+ break;
+ default:
+ pr_err("Unknown atomic operation %02x\n", insn->imm);
+ return -1;
+ }
+
jit->seen |= SEEN_MEM;
break;
+ }
/*
* BPF_LDX
*/
case BPF_LDX | BPF_MEM | BPF_B: /* dst = *(u8 *)(ul) (src + off) */
+ case BPF_LDX | BPF_PROBE_MEM | BPF_B:
/* llgc %dst,0(off,%src) */
EMIT6_DISP_LH(0xe3000000, 0x0090, dst_reg, src_reg, REG_0, off);
jit->seen |= SEEN_MEM;
if (insn_is_zext(&insn[1]))
insn_count = 2;
break;
+ case BPF_LDX | BPF_MEMSX | BPF_B: /* dst = *(s8 *)(ul) (src + off) */
+ case BPF_LDX | BPF_PROBE_MEMSX | BPF_B:
+ /* lgb %dst,0(off,%src) */
+ EMIT6_DISP_LH(0xe3000000, 0x0077, dst_reg, src_reg, REG_0, off);
+ jit->seen |= SEEN_MEM;
+ break;
case BPF_LDX | BPF_MEM | BPF_H: /* dst = *(u16 *)(ul) (src + off) */
+ case BPF_LDX | BPF_PROBE_MEM | BPF_H:
/* llgh %dst,0(off,%src) */
EMIT6_DISP_LH(0xe3000000, 0x0091, dst_reg, src_reg, REG_0, off);
jit->seen |= SEEN_MEM;
if (insn_is_zext(&insn[1]))
insn_count = 2;
break;
+ case BPF_LDX | BPF_MEMSX | BPF_H: /* dst = *(s16 *)(ul) (src + off) */
+ case BPF_LDX | BPF_PROBE_MEMSX | BPF_H:
+ /* lgh %dst,0(off,%src) */
+ EMIT6_DISP_LH(0xe3000000, 0x0015, dst_reg, src_reg, REG_0, off);
+ jit->seen |= SEEN_MEM;
+ break;
case BPF_LDX | BPF_MEM | BPF_W: /* dst = *(u32 *)(ul) (src + off) */
+ case BPF_LDX | BPF_PROBE_MEM | BPF_W:
/* llgf %dst,off(%src) */
jit->seen |= SEEN_MEM;
EMIT6_DISP_LH(0xe3000000, 0x0016, dst_reg, src_reg, REG_0, off);
if (insn_is_zext(&insn[1]))
insn_count = 2;
break;
+ case BPF_LDX | BPF_MEMSX | BPF_W: /* dst = *(s32 *)(ul) (src + off) */
+ case BPF_LDX | BPF_PROBE_MEMSX | BPF_W:
+ /* lgf %dst,off(%src) */
+ jit->seen |= SEEN_MEM;
+ EMIT6_DISP_LH(0xe3000000, 0x0014, dst_reg, src_reg, REG_0, off);
+ break;
case BPF_LDX | BPF_MEM | BPF_DW: /* dst = *(u64 *)(ul) (src + off) */
+ case BPF_LDX | BPF_PROBE_MEM | BPF_DW:
/* lg %dst,0(off,%src) */
jit->seen |= SEEN_MEM;
EMIT6_DISP_LH(0xe3000000, 0x0004, dst_reg, src_reg, REG_0, off);
@@ -1148,9 +1535,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
*/
case BPF_JMP | BPF_CALL:
{
- u64 func;
+ const struct btf_func_model *m;
bool func_addr_fixed;
- int ret;
+ int j, ret;
+ u64 func;
ret = bpf_jit_get_func_addr(fp, insn, extra_pass,
&func, &func_addr_fixed);
@@ -1159,29 +1547,51 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
REG_SET_SEEN(BPF_REG_5);
jit->seen |= SEEN_FUNC;
+ /*
+ * Copy the tail call counter to where the callee expects it.
+ *
+ * Note 1: The callee can increment the tail call counter, but
+ * we do not load it back, since the x86 JIT does not do this
+ * either.
+ *
+ * Note 2: We assume that the verifier does not let us call the
+ * main program, which clears the tail call counter on entry.
+ */
+ /* mvc STK_OFF_TCCNT(4,%r15),N(%r15) */
+ _EMIT6(0xd203f000 | STK_OFF_TCCNT,
+ 0xf000 | (STK_OFF_TCCNT + STK_OFF + stack_depth));
+
+ /* Sign-extend the kfunc arguments. */
+ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+ m = bpf_jit_find_kfunc_model(fp, insn);
+ if (!m)
+ return -1;
+
+ for (j = 0; j < m->nr_args; j++) {
+ if (sign_extend(jit, BPF_REG_1 + j,
+ m->arg_size[j],
+ m->arg_flags[j]))
+ return -1;
+ }
+ }
+
/* lgrl %w1,func */
EMIT6_PCREL_RILB(0xc4080000, REG_W1, _EMIT_CONST_U64(func));
- if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) {
- /* brasl %r14,__s390_indirect_jump_r1 */
- EMIT6_PCREL_RILB(0xc0050000, REG_14, jit->r1_thunk_ip);
- } else {
- /* basr %r14,%w1 */
- EMIT2(0x0d00, REG_14, REG_W1);
- }
+ /* %r1() */
+ call_r1(jit);
/* lgr %b0,%r2: load return value into %b0 */
EMIT4(0xb9040000, BPF_REG_0, REG_2);
break;
}
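A short aside on the raw mvc word built for the tail-call-counter copy above: 0xd2 is the MVC opcode, the following byte holds the operand length minus one, and each operand is a 4-bit base register followed by a 12-bit displacement. Worked out with purely illustrative displacements (not the real STK_OFF_TCCNT value):

/*
 * With a destination displacement of 0x020 and a source displacement
 * of 0x1a0 (numbers chosen only for this example), the two halves are
 *
 *   0xd203f000 | 0x020 = 0xd203f020   d2 = mvc, 03 = length - 1 (4 bytes),
 *                                     f = base %r15, 020 = displacement
 *       0xf000 | 0x1a0 =     0xf1a0   f = base %r15, 1a0 = displacement
 *
 * i.e. "mvc 0x20(4,%r15),0x1a0(%r15)".
 */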
- case BPF_JMP | BPF_TAIL_CALL:
+ case BPF_JMP | BPF_TAIL_CALL: {
+ int patch_1_clrj, patch_2_clij, patch_3_brc;
+
/*
* Implicit input:
* B1: pointer to ctx
* B2: pointer to bpf_array
* B3: index in bpf_array
- */
- jit->seen |= SEEN_TAIL_CALL;
-
- /*
+ *
* if (index >= array->map.max_entries)
* goto out;
*/
@@ -1190,40 +1600,28 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
EMIT6_DISP_LH(0xe3000000, 0x0016, REG_W1, REG_0, BPF_REG_2,
offsetof(struct bpf_array, map.max_entries));
/* if ((u32)%b3 >= (u32)%w1) goto out; */
- if (!is_first_pass(jit) && can_use_rel(jit, jit->labels[0])) {
- /* clrj %b3,%w1,0xa,label0 */
- EMIT6_PCREL_LABEL(0xec000000, 0x0077, BPF_REG_3,
- REG_W1, 0, 0xa);
- } else {
- /* clr %b3,%w1 */
- EMIT2(0x1500, BPF_REG_3, REG_W1);
- /* brcl 0xa,label0 */
- EMIT6_PCREL_RILC(0xc0040000, 0xa, jit->labels[0]);
- }
+ /* clrj %b3,%w1,0xa,out */
+ patch_1_clrj = jit->prg;
+ EMIT6_PCREL_RIEB(0xec000000, 0x0077, BPF_REG_3, REG_W1, 0xa,
+ jit->prg);
/*
- * if (tail_call_cnt++ > MAX_TAIL_CALL_CNT)
+ * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
* goto out;
*/
if (jit->seen & SEEN_STACK)
- off = STK_OFF_TCCNT + STK_OFF + fp->aux->stack_depth;
+ off = STK_OFF_TCCNT + STK_OFF + stack_depth;
else
off = STK_OFF_TCCNT;
/* lhi %w0,1 */
EMIT4_IMM(0xa7080000, REG_W0, 1);
/* laal %w1,%w0,off(%r15) */
EMIT6_DISP_LH(0xeb000000, 0x00fa, REG_W1, REG_W0, REG_15, off);
- if (!is_first_pass(jit) && can_use_rel(jit, jit->labels[0])) {
- /* clij %w1,MAX_TAIL_CALL_CNT,0x2,label0 */
- EMIT6_PCREL_IMM_LABEL(0xec000000, 0x007f, REG_W1,
- MAX_TAIL_CALL_CNT, 0, 0x2);
- } else {
- /* clfi %w1,MAX_TAIL_CALL_CNT */
- EMIT6_IMM(0xc20f0000, REG_W1, MAX_TAIL_CALL_CNT);
- /* brcl 0x2,label0 */
- EMIT6_PCREL_RILC(0xc0040000, 0x2, jit->labels[0]);
- }
+ /* clij %w1,MAX_TAIL_CALL_CNT-1,0x2,out */
+ patch_2_clij = jit->prg;
+ EMIT6_PCREL_RIEC(0xec000000, 0x007f, REG_W1, MAX_TAIL_CALL_CNT - 1,
+ 2, jit->prg);
/*
* prog = array->ptrs[index];
@@ -1238,18 +1636,14 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
/* ltg %r1,prog(%b2,%r1) */
EMIT6_DISP_LH(0xe3000000, 0x0002, REG_1, BPF_REG_2,
REG_1, offsetof(struct bpf_array, ptrs));
- if (!is_first_pass(jit) && can_use_rel(jit, jit->labels[0])) {
- /* brc 0x8,label0 */
- EMIT4_PCREL_RIC(0xa7040000, 0x8, jit->labels[0]);
- } else {
- /* brcl 0x8,label0 */
- EMIT6_PCREL_RILC(0xc0040000, 0x8, jit->labels[0]);
- }
+ /* brc 0x8,out */
+ patch_3_brc = jit->prg;
+ EMIT4_PCREL_RIC(0xa7040000, 8, jit->prg);
/*
* Restore registers before calling function
*/
- save_restore_regs(jit, REGS_RESTORE, fp->aux->stack_depth);
+ save_restore_regs(jit, REGS_RESTORE, stack_depth);
/*
* goto *(prog->bpf_func + tail_call_start);
@@ -1258,17 +1652,37 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
/* lg %r1,bpf_func(%r1) */
EMIT6_DISP_LH(0xe3000000, 0x0004, REG_1, REG_1, REG_0,
offsetof(struct bpf_prog, bpf_func));
- /* bc 0xf,tail_call_start(%r1) */
- _EMIT4(0x47f01000 + jit->tail_call_start);
+ if (nospec_uses_trampoline()) {
+ jit->seen |= SEEN_FUNC;
+ /* aghi %r1,tail_call_start */
+ EMIT4_IMM(0xa70b0000, REG_1, jit->tail_call_start);
+ /* brcl 0xf,__s390_indirect_jump_r1 */
+ EMIT6_PCREL_RILC(0xc0040000, 0xf, jit->r1_thunk_ip);
+ } else {
+ /* bc 0xf,tail_call_start(%r1) */
+ _EMIT4(0x47f01000 + jit->tail_call_start);
+ }
/* out: */
- jit->labels[0] = jit->prg;
+ if (jit->prg_buf) {
+ *(u16 *)(jit->prg_buf + patch_1_clrj + 2) =
+ (jit->prg - patch_1_clrj) >> 1;
+ *(u16 *)(jit->prg_buf + patch_2_clij + 2) =
+ (jit->prg - patch_2_clij) >> 1;
+ *(u16 *)(jit->prg_buf + patch_3_brc + 2) =
+ (jit->prg - patch_3_brc) >> 1;
+ }
break;
+ }
case BPF_JMP | BPF_EXIT: /* return b0 */
last = (i == fp->len - 1) ? 1 : 0;
if (last)
break;
- /* j <exit> */
- EMIT4_PCREL(0xa7f40000, jit->exit_ip - jit->prg);
+ if (!is_first_pass(jit) && can_use_rel(jit, jit->exit_ip))
+ /* brc 0xf, <exit> */
+ EMIT4_PCREL_RIC(0xa7040000, 0xf, jit->exit_ip);
+ else
+ /* brcl 0xf, <exit> */
+ EMIT6_PCREL_RILC(0xc0040000, 0xf, jit->exit_ip);
break;
/*
* Branch relative (number of skipped instructions) to offset on
@@ -1290,6 +1704,9 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
* instruction itself (loop) and for BPF with offset 0 we
* branch to the instruction behind the branch.
*/
+ case BPF_JMP32 | BPF_JA: /* if (true) */
+ branch_oc_off = imm;
+ fallthrough;
case BPF_JMP | BPF_JA: /* if (true) */
mask = 0xf000; /* j */
goto branch_oc;
@@ -1416,21 +1833,10 @@ branch_ks:
}
break;
branch_ku:
- is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
- /* clfi or clgfi %dst,imm */
- EMIT6_IMM(is_jmp32 ? 0xc20f0000 : 0xc20e0000,
- dst_reg, imm);
- if (!is_first_pass(jit) &&
- can_use_rel(jit, addrs[i + off + 1])) {
- /* brc mask,off */
- EMIT4_PCREL_RIC(0xa7040000,
- mask >> 12, addrs[i + off + 1]);
- } else {
- /* brcl mask,off */
- EMIT6_PCREL_RILC(0xc0040000,
- mask >> 12, addrs[i + off + 1]);
- }
- break;
+ /* lgfi %w1,imm (load sign extend imm) */
+ src_reg = REG_1;
+ EMIT6_IMM(0xc0010000, src_reg, imm);
+ goto branch_xu;
branch_xs:
is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
if (!is_first_pass(jit) &&
@@ -1469,14 +1875,16 @@ branch_xu:
break;
branch_oc:
if (!is_first_pass(jit) &&
- can_use_rel(jit, addrs[i + off + 1])) {
+ can_use_rel(jit, addrs[i + branch_oc_off + 1])) {
/* brc mask,off */
EMIT4_PCREL_RIC(0xa7040000,
- mask >> 12, addrs[i + off + 1]);
+ mask >> 12,
+ addrs[i + branch_oc_off + 1]);
} else {
/* brcl mask,off */
EMIT6_PCREL_RILC(0xc0040000,
- mask >> 12, addrs[i + off + 1]);
+ mask >> 12,
+ addrs[i + branch_oc_off + 1]);
}
break;
}
@@ -1484,6 +1892,23 @@ branch_oc:
pr_err("Unknown opcode %02x\n", insn->code);
return -1;
}
+
+ if (probe_prg != -1) {
+ /*
+ * Handlers of certain exceptions leave psw.addr pointing to
+ * the instruction directly after the failing one. Therefore,
+ * create two exception table entries and also add a nop in
+ * case two probing instructions come directly after each
+ * other.
+ */
+ nop_prg = jit->prg;
+ /* bcr 0,%0 */
+ _EMIT2(0x0700);
+ err = bpf_jit_probe_mem(jit, fp, probe_prg, nop_prg);
+ if (err < 0)
+ return err;
+ }
+
return insn_count;
}
@@ -1509,7 +1934,14 @@ static bool bpf_is_new_addr_sane(struct bpf_jit *jit, int i)
*/
static int bpf_set_addr(struct bpf_jit *jit, int i)
{
- if (!bpf_is_new_addr_sane(jit, i))
+ int delta;
+
+ if (is_codegen_pass(jit)) {
+ delta = jit->prg - jit->addrs[i];
+ if (delta < 0)
+ bpf_skip(jit, -delta);
+ }
+ if (WARN_ON_ONCE(!bpf_is_new_addr_sane(jit, i)))
return -1;
jit->addrs[i] = jit->prg;
return 0;
@@ -1519,26 +1951,27 @@ static int bpf_set_addr(struct bpf_jit *jit, int i)
* Compile eBPF program into s390x code
*/
static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp,
- bool extra_pass)
+ bool extra_pass, u32 stack_depth)
{
int i, insn_count, lit32_size, lit64_size;
jit->lit32 = jit->lit32_start;
jit->lit64 = jit->lit64_start;
jit->prg = 0;
+ jit->excnt = 0;
- bpf_jit_prologue(jit, fp->aux->stack_depth);
+ bpf_jit_prologue(jit, fp, stack_depth);
if (bpf_set_addr(jit, 0) < 0)
return -1;
for (i = 0; i < fp->len; i += insn_count) {
- insn_count = bpf_jit_insn(jit, fp, i, extra_pass);
+ insn_count = bpf_jit_insn(jit, fp, i, extra_pass, stack_depth);
if (insn_count < 0)
return -1;
/* Next instruction address */
if (bpf_set_addr(jit, i + insn_count) < 0)
return -1;
}
- bpf_jit_epilogue(jit, fp->aux->stack_depth);
+ bpf_jit_epilogue(jit, stack_depth);
lit32_size = jit->lit32 - jit->lit32_start;
lit64_size = jit->lit64 - jit->lit64_start;
@@ -1550,6 +1983,12 @@ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp,
jit->lit64_start = ALIGN(jit->lit64_start, 8);
jit->size = jit->lit64_start + lit64_size;
jit->size_prg = jit->prg;
+
+ if (WARN_ON_ONCE(fp->aux->extable &&
+ jit->excnt != fp->aux->num_exentries))
+ /* Verifier bug - too many entries. */
+ return -1;
+
return 0;
}
@@ -1564,11 +2003,35 @@ struct s390_jit_data {
int pass;
};
+static struct bpf_binary_header *bpf_jit_alloc(struct bpf_jit *jit,
+ struct bpf_prog *fp)
+{
+ struct bpf_binary_header *header;
+ u32 extable_size;
+ u32 code_size;
+
+ /* We need two entries per insn. */
+ fp->aux->num_exentries *= 2;
+
+ code_size = roundup(jit->size,
+ __alignof__(struct exception_table_entry));
+ extable_size = fp->aux->num_exentries *
+ sizeof(struct exception_table_entry);
+ header = bpf_jit_binary_alloc(code_size + extable_size, &jit->prg_buf,
+ 8, jit_fill_hole);
+ if (!header)
+ return NULL;
+ fp->aux->extable = (struct exception_table_entry *)
+ (jit->prg_buf + code_size);
+ return header;
+}
+
/*
* Compile eBPF program "fp"
*/
struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
{
+ u32 stack_depth = round_up(fp->aux->stack_depth, 8);
struct bpf_prog *tmp, *orig_fp = fp;
struct bpf_binary_header *header;
struct s390_jit_data *jit_data;
@@ -1577,6 +2040,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
struct bpf_jit jit;
int pass;
+ if (WARN_ON_ONCE(bpf_plt_end - bpf_plt != BPF_PLT_SIZE))
+ return orig_fp;
+
if (!fp->jit_requested)
return orig_fp;
@@ -1613,15 +2079,15 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
jit.addrs = kvcalloc(fp->len + 1, sizeof(*jit.addrs), GFP_KERNEL);
if (jit.addrs == NULL) {
fp = orig_fp;
- goto out;
+ goto free_addrs;
}
/*
* Three initial passes:
* - 1/2: Determine clobbered registers
- * - 3: Calculate program size and addrs arrray
+ * - 3: Calculate program size and addrs array
*/
for (pass = 1; pass <= 3; pass++) {
- if (bpf_jit_prog(&jit, fp, extra_pass)) {
+ if (bpf_jit_prog(&jit, fp, extra_pass, stack_depth)) {
fp = orig_fp;
goto free_addrs;
}
@@ -1629,13 +2095,13 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
/*
* Final pass: Allocate and generate program
*/
- header = bpf_jit_binary_alloc(jit.size, &jit.prg_buf, 8, jit_fill_hole);
+ header = bpf_jit_alloc(&jit, fp);
if (!header) {
fp = orig_fp;
goto free_addrs;
}
skip_init_ctx:
- if (bpf_jit_prog(&jit, fp, extra_pass)) {
+ if (bpf_jit_prog(&jit, fp, extra_pass, stack_depth)) {
bpf_jit_binary_free(header);
fp = orig_fp;
goto free_addrs;
@@ -1668,3 +2134,556 @@ out:
tmp : orig_fp);
return fp;
}
+
+bool bpf_jit_supports_kfunc_call(void)
+{
+ return true;
+}
+
+bool bpf_jit_supports_far_kfunc_call(void)
+{
+ return true;
+}
+
+int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
+ void *old_addr, void *new_addr)
+{
+ struct {
+ u16 opc;
+ s32 disp;
+ } __packed insn;
+ char expected_plt[BPF_PLT_SIZE];
+ char current_plt[BPF_PLT_SIZE];
+ char new_plt[BPF_PLT_SIZE];
+ char *plt;
+ char *ret;
+ int err;
+
+ /* Verify the branch to be patched. */
+ err = copy_from_kernel_nofault(&insn, ip, sizeof(insn));
+ if (err < 0)
+ return err;
+ if (insn.opc != (0xc004 | (old_addr ? 0xf0 : 0)))
+ return -EINVAL;
+
+ if (t == BPF_MOD_JUMP &&
+ insn.disp == ((char *)new_addr - (char *)ip) >> 1) {
+ /*
+ * The branch already points to the destination,
+ * there is no PLT.
+ */
+ } else {
+ /* Verify the PLT. */
+ plt = (char *)ip + (insn.disp << 1);
+ err = copy_from_kernel_nofault(current_plt, plt, BPF_PLT_SIZE);
+ if (err < 0)
+ return err;
+ ret = (char *)ip + 6;
+ bpf_jit_plt(expected_plt, ret, old_addr);
+ if (memcmp(current_plt, expected_plt, BPF_PLT_SIZE))
+ return -EINVAL;
+ /* Adjust the call address. */
+ bpf_jit_plt(new_plt, ret, new_addr);
+ s390_kernel_write(plt + (bpf_plt_target - bpf_plt),
+ new_plt + (bpf_plt_target - bpf_plt),
+ sizeof(void *));
+ }
+
+ /* Adjust the mask of the branch. */
+ insn.opc = 0xc004 | (new_addr ? 0xf0 : 0);
+ s390_kernel_write((char *)ip + 1, (char *)&insn.opc + 1, 1);
+
+ /* Make the new code visible to the other CPUs. */
+ text_poke_sync_lock();
+
+ return 0;
+}
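The displacement comparison above works because s390 RIL-format relative branches (brcl, brasl) encode their signed 32-bit immediate in halfwords. A minimal sketch of the conversion, assuming such an instruction at "ip" (not code from this patch):

/*
 * Halfword-relative displacement of a brcl/brasl at "ip" targeting
 * "target"; the inverse of "ip + (disp << 1)".
 */
static long example_ril_disp(void *ip, void *target)
{
	return ((char *)target - (char *)ip) >> 1;
}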
+
+struct bpf_tramp_jit {
+ struct bpf_jit common;
+ int orig_stack_args_off;/* Offset of arguments placed on stack by the
+ * func_addr's original caller
+ */
+ int stack_size; /* Trampoline stack size */
+ int backchain_off; /* Offset of backchain */
+ int stack_args_off; /* Offset of stack arguments for calling
+ * func_addr, has to be at the top
+ */
+ int reg_args_off; /* Offset of register arguments for calling
+ * func_addr
+ */
+ int ip_off; /* For bpf_get_func_ip(), has to be at
+ * (ctx - 16)
+ */
+ int arg_cnt_off; /* For bpf_get_func_arg_cnt(), has to be at
+ * (ctx - 8)
+ */
+ int bpf_args_off; /* Offset of BPF_PROG context, which consists
+ * of BPF arguments followed by return value
+ */
+ int retval_off; /* Offset of return value (see above) */
+ int r7_r8_off; /* Offset of saved %r7 and %r8, which are used
+ * for __bpf_prog_enter() return value and
+ * func_addr respectively
+ */
+ int run_ctx_off; /* Offset of struct bpf_tramp_run_ctx */
+ int tccnt_off; /* Offset of saved tailcall counter */
+ int r14_off; /* Offset of saved %r14, has to be at the
+ * bottom */
+ int do_fexit; /* do_fexit: label */
+};
+
+static void load_imm64(struct bpf_jit *jit, int dst_reg, u64 val)
+{
+ /* llihf %dst_reg,val_hi */
+ EMIT6_IMM(0xc00e0000, dst_reg, (val >> 32));
+	/* oilf %dst_reg,val_lo */
+ EMIT6_IMM(0xc00d0000, dst_reg, val);
+}
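A worked example of the two-instruction immediate load above (the value is arbitrary): llihf writes the upper 32 bits and clears the lower 32, so the following oilf can safely OR in the low half.

/*
 * load_imm64(jit, dst, 0x1122334455667788) emits, in effect:
 *
 *   llihf %dst,0x11223344   - bits 0..31 set, bits 32..63 cleared
 *   oilf  %dst,0x55667788   - OR the low 32 bits into bits 32..63
 */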
+
+static int invoke_bpf_prog(struct bpf_tramp_jit *tjit,
+ const struct btf_func_model *m,
+ struct bpf_tramp_link *tlink, bool save_ret)
+{
+ struct bpf_jit *jit = &tjit->common;
+ int cookie_off = tjit->run_ctx_off +
+ offsetof(struct bpf_tramp_run_ctx, bpf_cookie);
+ struct bpf_prog *p = tlink->link.prog;
+ int patch;
+
+ /*
+ * run_ctx.cookie = tlink->cookie;
+ */
+
+ /* %r0 = tlink->cookie */
+ load_imm64(jit, REG_W0, tlink->cookie);
+ /* stg %r0,cookie_off(%r15) */
+ EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W0, REG_0, REG_15, cookie_off);
+
+ /*
+ * if ((start = __bpf_prog_enter(p, &run_ctx)) == 0)
+ * goto skip;
+ */
+
+ /* %r1 = __bpf_prog_enter */
+ load_imm64(jit, REG_1, (u64)bpf_trampoline_enter(p));
+ /* %r2 = p */
+ load_imm64(jit, REG_2, (u64)p);
+ /* la %r3,run_ctx_off(%r15) */
+ EMIT4_DISP(0x41000000, REG_3, REG_15, tjit->run_ctx_off);
+ /* %r1() */
+ call_r1(jit);
+ /* ltgr %r7,%r2 */
+ EMIT4(0xb9020000, REG_7, REG_2);
+ /* brcl 8,skip */
+ patch = jit->prg;
+ EMIT6_PCREL_RILC(0xc0040000, 8, 0);
+
+ /*
+ * retval = bpf_func(args, p->insnsi);
+ */
+
+ /* %r1 = p->bpf_func */
+ load_imm64(jit, REG_1, (u64)p->bpf_func);
+ /* la %r2,bpf_args_off(%r15) */
+ EMIT4_DISP(0x41000000, REG_2, REG_15, tjit->bpf_args_off);
+ /* %r3 = p->insnsi */
+ if (!p->jited)
+ load_imm64(jit, REG_3, (u64)p->insnsi);
+ /* %r1() */
+ call_r1(jit);
+ /* stg %r2,retval_off(%r15) */
+ if (save_ret) {
+ if (sign_extend(jit, REG_2, m->ret_size, m->ret_flags))
+ return -1;
+ EMIT6_DISP_LH(0xe3000000, 0x0024, REG_2, REG_0, REG_15,
+ tjit->retval_off);
+ }
+
+ /* skip: */
+ if (jit->prg_buf)
+ *(u32 *)&jit->prg_buf[patch + 2] = (jit->prg - patch) >> 1;
+
+ /*
+ * __bpf_prog_exit(p, start, &run_ctx);
+ */
+
+ /* %r1 = __bpf_prog_exit */
+ load_imm64(jit, REG_1, (u64)bpf_trampoline_exit(p));
+ /* %r2 = p */
+ load_imm64(jit, REG_2, (u64)p);
+ /* lgr %r3,%r7 */
+ EMIT4(0xb9040000, REG_3, REG_7);
+ /* la %r4,run_ctx_off(%r15) */
+ EMIT4_DISP(0x41000000, REG_4, REG_15, tjit->run_ctx_off);
+ /* %r1() */
+ call_r1(jit);
+
+ return 0;
+}
+
+static int alloc_stack(struct bpf_tramp_jit *tjit, size_t size)
+{
+ int stack_offset = tjit->stack_size;
+
+ tjit->stack_size += size;
+ return stack_offset;
+}
+
+/* ABI uses %r2 - %r6 for parameter passing. */
+#define MAX_NR_REG_ARGS 5
+
+/* The "L" field of the "mvc" instruction is 8 bits. */
+#define MAX_MVC_SIZE 256
+#define MAX_NR_STACK_ARGS (MAX_MVC_SIZE / sizeof(u64))
+
+/* -mfentry generates a 6-byte nop on s390x. */
+#define S390X_PATCH_SIZE 6
+
+static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im,
+ struct bpf_tramp_jit *tjit,
+ const struct btf_func_model *m,
+ u32 flags,
+ struct bpf_tramp_links *tlinks,
+ void *func_addr)
+{
+ struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN];
+ struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
+ struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT];
+ int nr_bpf_args, nr_reg_args, nr_stack_args;
+ struct bpf_jit *jit = &tjit->common;
+ int arg, bpf_arg_off;
+ int i, j;
+
+ /* Support as many stack arguments as "mvc" instruction can handle. */
+ nr_reg_args = min_t(int, m->nr_args, MAX_NR_REG_ARGS);
+ nr_stack_args = m->nr_args - nr_reg_args;
+ if (nr_stack_args > MAX_NR_STACK_ARGS)
+ return -ENOTSUPP;
+
+ /* Return to %r14, since func_addr and %r0 are not available. */
+ if ((!func_addr && !(flags & BPF_TRAMP_F_ORIG_STACK)) ||
+ (flags & BPF_TRAMP_F_INDIRECT))
+ flags |= BPF_TRAMP_F_SKIP_FRAME;
+
+ /*
+ * Compute how many arguments we need to pass to BPF programs.
+ * BPF ABI mirrors that of x86_64: arguments that are 16 bytes or
+ * smaller are packed into 1 or 2 registers; larger arguments are
+ * passed via pointers.
+ * In s390x ABI, arguments that are 8 bytes or smaller are packed into
+ * a register; larger arguments are passed via pointers.
+ * We need to deal with this difference.
+ */
+ nr_bpf_args = 0;
+ for (i = 0; i < m->nr_args; i++) {
+ if (m->arg_size[i] <= 8)
+ nr_bpf_args += 1;
+ else if (m->arg_size[i] <= 16)
+ nr_bpf_args += 2;
+ else
+ return -ENOTSUPP;
+ }
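A brief worked example of the counting above, with argument sizes chosen purely for illustration:

/*
 * For a traced function whose arguments have sizes { 4, 16, 8 }, the
 * loop computes nr_bpf_args = 1 + 2 + 1 = 4: the 16-byte argument is
 * passed by pointer in one s390x register yet occupies two 8-byte
 * slots in the BPF program's context.
 */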
+
+ /*
+ * Calculate the stack layout.
+ */
+
+ /*
+ * Allocate STACK_FRAME_OVERHEAD bytes for the callees. As the s390x
+ * ABI requires, put our backchain at the end of the allocated memory.
+ */
+ tjit->stack_size = STACK_FRAME_OVERHEAD;
+ tjit->backchain_off = tjit->stack_size - sizeof(u64);
+ tjit->stack_args_off = alloc_stack(tjit, nr_stack_args * sizeof(u64));
+ tjit->reg_args_off = alloc_stack(tjit, nr_reg_args * sizeof(u64));
+ tjit->ip_off = alloc_stack(tjit, sizeof(u64));
+ tjit->arg_cnt_off = alloc_stack(tjit, sizeof(u64));
+ tjit->bpf_args_off = alloc_stack(tjit, nr_bpf_args * sizeof(u64));
+ tjit->retval_off = alloc_stack(tjit, sizeof(u64));
+ tjit->r7_r8_off = alloc_stack(tjit, 2 * sizeof(u64));
+ tjit->run_ctx_off = alloc_stack(tjit,
+ sizeof(struct bpf_tramp_run_ctx));
+ tjit->tccnt_off = alloc_stack(tjit, sizeof(u64));
+ tjit->r14_off = alloc_stack(tjit, sizeof(u64) * 2);
+ /*
+ * In accordance with the s390x ABI, the caller has allocated
+ * STACK_FRAME_OVERHEAD bytes for us. 8 of them contain the caller's
+ * backchain, and the rest we can use.
+ */
+ tjit->stack_size -= STACK_FRAME_OVERHEAD - sizeof(u64);
+ tjit->orig_stack_args_off = tjit->stack_size + STACK_FRAME_OVERHEAD;
+
+ /* lgr %r1,%r15 */
+ EMIT4(0xb9040000, REG_1, REG_15);
+ /* aghi %r15,-stack_size */
+ EMIT4_IMM(0xa70b0000, REG_15, -tjit->stack_size);
+ /* stg %r1,backchain_off(%r15) */
+ EMIT6_DISP_LH(0xe3000000, 0x0024, REG_1, REG_0, REG_15,
+ tjit->backchain_off);
+ /* mvc tccnt_off(4,%r15),stack_size+STK_OFF_TCCNT(%r15) */
+ _EMIT6(0xd203f000 | tjit->tccnt_off,
+ 0xf000 | (tjit->stack_size + STK_OFF_TCCNT));
+ /* stmg %r2,%rN,fwd_reg_args_off(%r15) */
+ if (nr_reg_args)
+ EMIT6_DISP_LH(0xeb000000, 0x0024, REG_2,
+ REG_2 + (nr_reg_args - 1), REG_15,
+ tjit->reg_args_off);
+ for (i = 0, j = 0; i < m->nr_args; i++) {
+ if (i < MAX_NR_REG_ARGS)
+ arg = REG_2 + i;
+ else
+ arg = tjit->orig_stack_args_off +
+ (i - MAX_NR_REG_ARGS) * sizeof(u64);
+ bpf_arg_off = tjit->bpf_args_off + j * sizeof(u64);
+ if (m->arg_size[i] <= 8) {
+ if (i < MAX_NR_REG_ARGS)
+ /* stg %arg,bpf_arg_off(%r15) */
+ EMIT6_DISP_LH(0xe3000000, 0x0024, arg,
+ REG_0, REG_15, bpf_arg_off);
+ else
+ /* mvc bpf_arg_off(8,%r15),arg(%r15) */
+ _EMIT6(0xd207f000 | bpf_arg_off,
+ 0xf000 | arg);
+ j += 1;
+ } else {
+ if (i < MAX_NR_REG_ARGS) {
+ /* mvc bpf_arg_off(16,%r15),0(%arg) */
+ _EMIT6(0xd20ff000 | bpf_arg_off,
+ reg2hex[arg] << 12);
+ } else {
+ /* lg %r1,arg(%r15) */
+ EMIT6_DISP_LH(0xe3000000, 0x0004, REG_1, REG_0,
+ REG_15, arg);
+ /* mvc bpf_arg_off(16,%r15),0(%r1) */
+ _EMIT6(0xd20ff000 | bpf_arg_off, 0x1000);
+ }
+ j += 2;
+ }
+ }
+ /* stmg %r7,%r8,r7_r8_off(%r15) */
+ EMIT6_DISP_LH(0xeb000000, 0x0024, REG_7, REG_8, REG_15,
+ tjit->r7_r8_off);
+ /* stg %r14,r14_off(%r15) */
+ EMIT6_DISP_LH(0xe3000000, 0x0024, REG_14, REG_0, REG_15, tjit->r14_off);
+
+ if (flags & BPF_TRAMP_F_ORIG_STACK) {
+ /*
+ * The ftrace trampoline puts the return address (which is the
+ * address of the original function + S390X_PATCH_SIZE) into
+ * %r0; see ftrace_shared_hotpatch_trampoline_br and
+ * ftrace_init_nop() for details.
+ */
+
+ /* lgr %r8,%r0 */
+ EMIT4(0xb9040000, REG_8, REG_0);
+ } else {
+ /* %r8 = func_addr + S390X_PATCH_SIZE */
+ load_imm64(jit, REG_8, (u64)func_addr + S390X_PATCH_SIZE);
+ }
+
+ /*
+ * ip = func_addr;
+ * arg_cnt = m->nr_args;
+ */
+
+ if (flags & BPF_TRAMP_F_IP_ARG) {
+ /* %r0 = func_addr */
+ load_imm64(jit, REG_0, (u64)func_addr);
+ /* stg %r0,ip_off(%r15) */
+ EMIT6_DISP_LH(0xe3000000, 0x0024, REG_0, REG_0, REG_15,
+ tjit->ip_off);
+ }
+ /* lghi %r0,nr_bpf_args */
+ EMIT4_IMM(0xa7090000, REG_0, nr_bpf_args);
+ /* stg %r0,arg_cnt_off(%r15) */
+ EMIT6_DISP_LH(0xe3000000, 0x0024, REG_0, REG_0, REG_15,
+ tjit->arg_cnt_off);
+
+ if (flags & BPF_TRAMP_F_CALL_ORIG) {
+ /*
+ * __bpf_tramp_enter(im);
+ */
+
+ /* %r1 = __bpf_tramp_enter */
+ load_imm64(jit, REG_1, (u64)__bpf_tramp_enter);
+ /* %r2 = im */
+ load_imm64(jit, REG_2, (u64)im);
+ /* %r1() */
+ call_r1(jit);
+ }
+
+ for (i = 0; i < fentry->nr_links; i++)
+ if (invoke_bpf_prog(tjit, m, fentry->links[i],
+ flags & BPF_TRAMP_F_RET_FENTRY_RET))
+ return -EINVAL;
+
+ if (fmod_ret->nr_links) {
+ /*
+ * retval = 0;
+ */
+
+ /* xc retval_off(8,%r15),retval_off(%r15) */
+ _EMIT6(0xd707f000 | tjit->retval_off,
+ 0xf000 | tjit->retval_off);
+
+ for (i = 0; i < fmod_ret->nr_links; i++) {
+ if (invoke_bpf_prog(tjit, m, fmod_ret->links[i], true))
+ return -EINVAL;
+
+ /*
+ * if (retval)
+ * goto do_fexit;
+ */
+
+ /* ltg %r0,retval_off(%r15) */
+ EMIT6_DISP_LH(0xe3000000, 0x0002, REG_0, REG_0, REG_15,
+ tjit->retval_off);
+ /* brcl 7,do_fexit */
+ EMIT6_PCREL_RILC(0xc0040000, 7, tjit->do_fexit);
+ }
+ }
+
+ if (flags & BPF_TRAMP_F_CALL_ORIG) {
+ /*
+ * retval = func_addr(args);
+ */
+
+ /* lmg %r2,%rN,reg_args_off(%r15) */
+ if (nr_reg_args)
+ EMIT6_DISP_LH(0xeb000000, 0x0004, REG_2,
+ REG_2 + (nr_reg_args - 1), REG_15,
+ tjit->reg_args_off);
+ /* mvc stack_args_off(N,%r15),orig_stack_args_off(%r15) */
+ if (nr_stack_args)
+ _EMIT6(0xd200f000 |
+ (nr_stack_args * sizeof(u64) - 1) << 16 |
+ tjit->stack_args_off,
+ 0xf000 | tjit->orig_stack_args_off);
+ /* mvc STK_OFF_TCCNT(4,%r15),tccnt_off(%r15) */
+ _EMIT6(0xd203f000 | STK_OFF_TCCNT, 0xf000 | tjit->tccnt_off);
+ /* lgr %r1,%r8 */
+ EMIT4(0xb9040000, REG_1, REG_8);
+ /* %r1() */
+ call_r1(jit);
+ /* stg %r2,retval_off(%r15) */
+ EMIT6_DISP_LH(0xe3000000, 0x0024, REG_2, REG_0, REG_15,
+ tjit->retval_off);
+
+ im->ip_after_call = jit->prg_buf + jit->prg;
+
+ /*
+ * The following nop will be patched by bpf_tramp_image_put().
+ */
+
+ /* brcl 0,im->ip_epilogue */
+ EMIT6_PCREL_RILC(0xc0040000, 0, (u64)im->ip_epilogue);
+ }
+
+ /* do_fexit: */
+ tjit->do_fexit = jit->prg;
+ for (i = 0; i < fexit->nr_links; i++)
+ if (invoke_bpf_prog(tjit, m, fexit->links[i], false))
+ return -EINVAL;
+
+ if (flags & BPF_TRAMP_F_CALL_ORIG) {
+ im->ip_epilogue = jit->prg_buf + jit->prg;
+
+ /*
+ * __bpf_tramp_exit(im);
+ */
+
+ /* %r1 = __bpf_tramp_exit */
+ load_imm64(jit, REG_1, (u64)__bpf_tramp_exit);
+ /* %r2 = im */
+ load_imm64(jit, REG_2, (u64)im);
+ /* %r1() */
+ call_r1(jit);
+ }
+
+ /* lmg %r2,%rN,reg_args_off(%r15) */
+ if ((flags & BPF_TRAMP_F_RESTORE_REGS) && nr_reg_args)
+ EMIT6_DISP_LH(0xeb000000, 0x0004, REG_2,
+ REG_2 + (nr_reg_args - 1), REG_15,
+ tjit->reg_args_off);
+ /* lgr %r1,%r8 */
+ if (!(flags & BPF_TRAMP_F_SKIP_FRAME))
+ EMIT4(0xb9040000, REG_1, REG_8);
+ /* lmg %r7,%r8,r7_r8_off(%r15) */
+ EMIT6_DISP_LH(0xeb000000, 0x0004, REG_7, REG_8, REG_15,
+ tjit->r7_r8_off);
+ /* lg %r14,r14_off(%r15) */
+ EMIT6_DISP_LH(0xe3000000, 0x0004, REG_14, REG_0, REG_15, tjit->r14_off);
+ /* lg %r2,retval_off(%r15) */
+ if (flags & (BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_RET_FENTRY_RET))
+ EMIT6_DISP_LH(0xe3000000, 0x0004, REG_2, REG_0, REG_15,
+ tjit->retval_off);
+ /* mvc stack_size+STK_OFF_TCCNT(4,%r15),tccnt_off(%r15) */
+ _EMIT6(0xd203f000 | (tjit->stack_size + STK_OFF_TCCNT),
+ 0xf000 | tjit->tccnt_off);
+ /* aghi %r15,stack_size */
+ EMIT4_IMM(0xa70b0000, REG_15, tjit->stack_size);
+ /* Emit an expoline for the following indirect jump. */
+ if (nospec_uses_trampoline())
+ emit_expoline(jit);
+ if (flags & BPF_TRAMP_F_SKIP_FRAME)
+ /* br %r14 */
+ _EMIT2(0x07fe);
+ else
+ /* br %r1 */
+ _EMIT2(0x07f1);
+
+ emit_r1_thunk(jit);
+
+ return 0;
+}
+
+int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
+ struct bpf_tramp_links *tlinks, void *orig_call)
+{
+ struct bpf_tramp_image im;
+ struct bpf_tramp_jit tjit;
+ int ret;
+
+ memset(&tjit, 0, sizeof(tjit));
+
+ ret = __arch_prepare_bpf_trampoline(&im, &tjit, m, flags,
+ tlinks, orig_call);
+
+ return ret < 0 ? ret : tjit.common.prg;
+}
+
+int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
+ void *image_end, const struct btf_func_model *m,
+ u32 flags, struct bpf_tramp_links *tlinks,
+ void *func_addr)
+{
+ struct bpf_tramp_jit tjit;
+ int ret;
+
+ /* Compute offsets, check whether the code fits. */
+ memset(&tjit, 0, sizeof(tjit));
+ ret = __arch_prepare_bpf_trampoline(im, &tjit, m, flags,
+ tlinks, func_addr);
+
+ if (ret < 0)
+ return ret;
+ if (tjit.common.prg > (char *)image_end - (char *)image)
+ /*
+ * Use the same error code as for exceeding
+ * BPF_MAX_TRAMP_LINKS.
+ */
+ return -E2BIG;
+
+ tjit.common.prg = 0;
+ tjit.common.prg_buf = image;
+ ret = __arch_prepare_bpf_trampoline(im, &tjit, m, flags,
+ tlinks, func_addr);
+
+ return ret < 0 ? ret : tjit.common.prg;
+}
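Both entry points above drive __arch_prepare_bpf_trampoline() in a dry-run-then-emit fashion; a condensed sketch of the idea (not the literal call sites):

/*
 * Pass 1: tjit.common.prg_buf == NULL, so only the program counter
 *         (tjit.common.prg) advances - this measures the size.
 * Pass 2: tjit.common.prg_buf == image, and the very same code path
 *         now also writes the bytes, so the measured and emitted
 *         sizes cannot diverge.
 */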
+
+bool bpf_jit_supports_subprog_tailcalls(void)
+{
+ return true;
+}
diff --git a/arch/s390/numa/Makefile b/arch/s390/numa/Makefile
deleted file mode 100644
index 66c2dff74895..000000000000
--- a/arch/s390/numa/Makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-obj-y += numa.o
-obj-y += toptree.o
-obj-$(CONFIG_NUMA_EMU) += mode_emu.o
diff --git a/arch/s390/numa/mode_emu.c b/arch/s390/numa/mode_emu.c
deleted file mode 100644
index 72d742bb2d17..000000000000
--- a/arch/s390/numa/mode_emu.c
+++ /dev/null
@@ -1,577 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * NUMA support for s390
- *
- * NUMA emulation (aka fake NUMA) distributes the available memory to nodes
- * without using real topology information about the physical memory of the
- * machine.
- *
- * It distributes the available CPUs to nodes while respecting the original
- * machine topology information. This is done by trying to avoid to separate
- * CPUs which reside on the same book or even on the same MC.
- *
- * Because the current Linux scheduler code requires a stable cpu to node
- * mapping, cores are pinned to nodes when the first CPU thread is set online.
- *
- * Copyright IBM Corp. 2015
- */
-
-#define KMSG_COMPONENT "numa_emu"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/cpumask.h>
-#include <linux/memblock.h>
-#include <linux/node.h>
-#include <linux/memory.h>
-#include <linux/slab.h>
-#include <asm/smp.h>
-#include <asm/topology.h>
-#include "numa_mode.h"
-#include "toptree.h"
-
-/* Distances between the different system components */
-#define DIST_EMPTY 0
-#define DIST_CORE 1
-#define DIST_MC 2
-#define DIST_BOOK 3
-#define DIST_DRAWER 4
-#define DIST_MAX 5
-
-/* Node distance reported to common code */
-#define EMU_NODE_DIST 10
-
-/* Node ID for free (not yet pinned) cores */
-#define NODE_ID_FREE -1
-
-/* Different levels of toptree */
-enum toptree_level {CORE, MC, BOOK, DRAWER, NODE, TOPOLOGY};
-
-/* The two toptree IDs */
-enum {TOPTREE_ID_PHYS, TOPTREE_ID_NUMA};
-
-/* Number of NUMA nodes */
-static int emu_nodes = 1;
-/* NUMA stripe size */
-static unsigned long emu_size;
-
-/*
- * Node to core pinning information updates are protected by
- * "sched_domains_mutex".
- */
-static struct {
- s32 to_node_id[CONFIG_NR_CPUS]; /* Pinned core to node mapping */
- int total; /* Total number of pinned cores */
- int per_node_target; /* Cores per node without extra cores */
- int per_node[MAX_NUMNODES]; /* Number of cores pinned to node */
-} *emu_cores;
-
-/*
- * Pin a core to a node
- */
-static void pin_core_to_node(int core_id, int node_id)
-{
- if (emu_cores->to_node_id[core_id] == NODE_ID_FREE) {
- emu_cores->per_node[node_id]++;
- emu_cores->to_node_id[core_id] = node_id;
- emu_cores->total++;
- } else {
- WARN_ON(emu_cores->to_node_id[core_id] != node_id);
- }
-}
-
-/*
- * Number of pinned cores of a node
- */
-static int cores_pinned(struct toptree *node)
-{
- return emu_cores->per_node[node->id];
-}
-
-/*
- * ID of the node where the core is pinned (or NODE_ID_FREE)
- */
-static int core_pinned_to_node_id(struct toptree *core)
-{
- return emu_cores->to_node_id[core->id];
-}
-
-/*
- * Number of cores in the tree that are not yet pinned
- */
-static int cores_free(struct toptree *tree)
-{
- struct toptree *core;
- int count = 0;
-
- toptree_for_each(core, tree, CORE) {
- if (core_pinned_to_node_id(core) == NODE_ID_FREE)
- count++;
- }
- return count;
-}
-
-/*
- * Return node of core
- */
-static struct toptree *core_node(struct toptree *core)
-{
- return core->parent->parent->parent->parent;
-}
-
-/*
- * Return drawer of core
- */
-static struct toptree *core_drawer(struct toptree *core)
-{
- return core->parent->parent->parent;
-}
-
-/*
- * Return book of core
- */
-static struct toptree *core_book(struct toptree *core)
-{
- return core->parent->parent;
-}
-
-/*
- * Return mc of core
- */
-static struct toptree *core_mc(struct toptree *core)
-{
- return core->parent;
-}
-
-/*
- * Distance between two cores
- */
-static int dist_core_to_core(struct toptree *core1, struct toptree *core2)
-{
- if (core_drawer(core1)->id != core_drawer(core2)->id)
- return DIST_DRAWER;
- if (core_book(core1)->id != core_book(core2)->id)
- return DIST_BOOK;
- if (core_mc(core1)->id != core_mc(core2)->id)
- return DIST_MC;
- /* Same core or sibling on same MC */
- return DIST_CORE;
-}
-
-/*
- * Distance of a node to a core
- */
-static int dist_node_to_core(struct toptree *node, struct toptree *core)
-{
- struct toptree *core_node;
- int dist_min = DIST_MAX;
-
- toptree_for_each(core_node, node, CORE)
- dist_min = min(dist_min, dist_core_to_core(core_node, core));
- return dist_min == DIST_MAX ? DIST_EMPTY : dist_min;
-}
-
-/*
- * Unify will delete empty nodes, therefore recreate nodes.
- */
-static void toptree_unify_tree(struct toptree *tree)
-{
- int nid;
-
- toptree_unify(tree);
- for (nid = 0; nid < emu_nodes; nid++)
- toptree_get_child(tree, nid);
-}
-
-/*
- * Find the best/nearest node for a given core and ensure that no node
- * gets more than "emu_cores->per_node_target + extra" cores.
- */
-static struct toptree *node_for_core(struct toptree *numa, struct toptree *core,
- int extra)
-{
- struct toptree *node, *node_best = NULL;
- int dist_cur, dist_best, cores_target;
-
- cores_target = emu_cores->per_node_target + extra;
- dist_best = DIST_MAX;
- node_best = NULL;
- toptree_for_each(node, numa, NODE) {
- /* Already pinned cores must use their nodes */
- if (core_pinned_to_node_id(core) == node->id) {
- node_best = node;
- break;
- }
- /* Skip nodes that already have enough cores */
- if (cores_pinned(node) >= cores_target)
- continue;
- dist_cur = dist_node_to_core(node, core);
- if (dist_cur < dist_best) {
- dist_best = dist_cur;
- node_best = node;
- }
- }
- return node_best;
-}
-
-/*
- * Find the best node for each core with respect to "extra" core count
- */
-static void toptree_to_numa_single(struct toptree *numa, struct toptree *phys,
- int extra)
-{
- struct toptree *node, *core, *tmp;
-
- toptree_for_each_safe(core, tmp, phys, CORE) {
- node = node_for_core(numa, core, extra);
- if (!node)
- return;
- toptree_move(core, node);
- pin_core_to_node(core->id, node->id);
- }
-}
-
-/*
- * Move structures of given level to specified NUMA node
- */
-static void move_level_to_numa_node(struct toptree *node, struct toptree *phys,
- enum toptree_level level, bool perfect)
-{
- int cores_free, cores_target = emu_cores->per_node_target;
- struct toptree *cur, *tmp;
-
- toptree_for_each_safe(cur, tmp, phys, level) {
- cores_free = cores_target - toptree_count(node, CORE);
- if (perfect) {
- if (cores_free == toptree_count(cur, CORE))
- toptree_move(cur, node);
- } else {
- if (cores_free >= toptree_count(cur, CORE))
- toptree_move(cur, node);
- }
- }
-}
-
-/*
- * Move structures of a given level to NUMA nodes. If "perfect" is specified
- * move only perfectly fitting structures. Otherwise move also smaller
- * than needed structures.
- */
-static void move_level_to_numa(struct toptree *numa, struct toptree *phys,
- enum toptree_level level, bool perfect)
-{
- struct toptree *node;
-
- toptree_for_each(node, numa, NODE)
- move_level_to_numa_node(node, phys, level, perfect);
-}
-
-/*
- * For the first run try to move the big structures
- */
-static void toptree_to_numa_first(struct toptree *numa, struct toptree *phys)
-{
- struct toptree *core;
-
- /* Always try to move perfectly fitting structures first */
- move_level_to_numa(numa, phys, DRAWER, true);
- move_level_to_numa(numa, phys, DRAWER, false);
- move_level_to_numa(numa, phys, BOOK, true);
- move_level_to_numa(numa, phys, BOOK, false);
- move_level_to_numa(numa, phys, MC, true);
- move_level_to_numa(numa, phys, MC, false);
- /* Now pin all the moved cores */
- toptree_for_each(core, numa, CORE)
- pin_core_to_node(core->id, core_node(core)->id);
-}
-
-/*
- * Allocate new topology and create required nodes
- */
-static struct toptree *toptree_new(int id, int nodes)
-{
- struct toptree *tree;
- int nid;
-
- tree = toptree_alloc(TOPOLOGY, id);
- if (!tree)
- goto fail;
- for (nid = 0; nid < nodes; nid++) {
- if (!toptree_get_child(tree, nid))
- goto fail;
- }
- return tree;
-fail:
- panic("NUMA emulation could not allocate topology");
-}
-
-/*
- * Allocate and initialize core to node mapping
- */
-static void __ref create_core_to_node_map(void)
-{
- int i;
-
- emu_cores = memblock_alloc(sizeof(*emu_cores), 8);
- if (!emu_cores)
- panic("%s: Failed to allocate %zu bytes align=0x%x\n",
- __func__, sizeof(*emu_cores), 8);
- for (i = 0; i < ARRAY_SIZE(emu_cores->to_node_id); i++)
- emu_cores->to_node_id[i] = NODE_ID_FREE;
-}
-
-/*
- * Move cores from physical topology into NUMA target topology
- * and try to keep as much of the physical topology as possible.
- */
-static struct toptree *toptree_to_numa(struct toptree *phys)
-{
- static int first = 1;
- struct toptree *numa;
- int cores_total;
-
- cores_total = emu_cores->total + cores_free(phys);
- emu_cores->per_node_target = cores_total / emu_nodes;
- numa = toptree_new(TOPTREE_ID_NUMA, emu_nodes);
- if (first) {
- toptree_to_numa_first(numa, phys);
- first = 0;
- }
- toptree_to_numa_single(numa, phys, 0);
- toptree_to_numa_single(numa, phys, 1);
- toptree_unify_tree(numa);
-
- WARN_ON(cpumask_weight(&phys->mask));
- return numa;
-}
-
-/*
- * Create a toptree out of the physical topology that we got from the hypervisor
- */
-static struct toptree *toptree_from_topology(void)
-{
- struct toptree *phys, *node, *drawer, *book, *mc, *core;
- struct cpu_topology_s390 *top;
- int cpu;
-
- phys = toptree_new(TOPTREE_ID_PHYS, 1);
-
- for_each_cpu(cpu, &cpus_with_topology) {
- top = &cpu_topology[cpu];
- node = toptree_get_child(phys, 0);
- drawer = toptree_get_child(node, top->drawer_id);
- book = toptree_get_child(drawer, top->book_id);
- mc = toptree_get_child(book, top->socket_id);
- core = toptree_get_child(mc, smp_get_base_cpu(cpu));
- if (!drawer || !book || !mc || !core)
- panic("NUMA emulation could not allocate memory");
- cpumask_set_cpu(cpu, &core->mask);
- toptree_update_mask(mc);
- }
- return phys;
-}
-
-/*
- * Add toptree core to topology and create correct CPU masks
- */
-static void topology_add_core(struct toptree *core)
-{
- struct cpu_topology_s390 *top;
- int cpu;
-
- for_each_cpu(cpu, &core->mask) {
- top = &cpu_topology[cpu];
- cpumask_copy(&top->thread_mask, &core->mask);
- cpumask_copy(&top->core_mask, &core_mc(core)->mask);
- cpumask_copy(&top->book_mask, &core_book(core)->mask);
- cpumask_copy(&top->drawer_mask, &core_drawer(core)->mask);
- cpumask_set_cpu(cpu, &node_to_cpumask_map[core_node(core)->id]);
- top->node_id = core_node(core)->id;
- }
-}
-
-/*
- * Apply toptree to topology and create CPU masks
- */
-static void toptree_to_topology(struct toptree *numa)
-{
- struct toptree *core;
- int i;
-
- /* Clear all node masks */
- for (i = 0; i < MAX_NUMNODES; i++)
- cpumask_clear(&node_to_cpumask_map[i]);
-
- /* Rebuild all masks */
- toptree_for_each(core, numa, CORE)
- topology_add_core(core);
-}
-
-/*
- * Show the node to core mapping
- */
-static void print_node_to_core_map(void)
-{
- int nid, cid;
-
- if (!numa_debug_enabled)
- return;
- printk(KERN_DEBUG "NUMA node to core mapping\n");
- for (nid = 0; nid < emu_nodes; nid++) {
- printk(KERN_DEBUG " node %3d: ", nid);
- for (cid = 0; cid < ARRAY_SIZE(emu_cores->to_node_id); cid++) {
- if (emu_cores->to_node_id[cid] == nid)
- printk(KERN_CONT "%d ", cid);
- }
- printk(KERN_CONT "\n");
- }
-}
-
-static void pin_all_possible_cpus(void)
-{
- int core_id, node_id, cpu;
- static int initialized;
-
- if (initialized)
- return;
- print_node_to_core_map();
- node_id = 0;
- for_each_possible_cpu(cpu) {
- core_id = smp_get_base_cpu(cpu);
- if (emu_cores->to_node_id[core_id] != NODE_ID_FREE)
- continue;
- pin_core_to_node(core_id, node_id);
- cpu_topology[cpu].node_id = node_id;
- node_id = (node_id + 1) % emu_nodes;
- }
- print_node_to_core_map();
- initialized = 1;
-}
-
-/*
- * Transfer physical topology into a NUMA topology and modify CPU masks
- * according to the NUMA topology.
- *
- * Must be called with "sched_domains_mutex" lock held.
- */
-static void emu_update_cpu_topology(void)
-{
- struct toptree *phys, *numa;
-
- if (emu_cores == NULL)
- create_core_to_node_map();
- phys = toptree_from_topology();
- numa = toptree_to_numa(phys);
- toptree_free(phys);
- toptree_to_topology(numa);
- toptree_free(numa);
- pin_all_possible_cpus();
-}
-
-/*
- * If emu_size is not set, use CONFIG_EMU_SIZE. Then round to minimum
- * alignment (needed for memory hotplug).
- */
-static unsigned long emu_setup_size_adjust(unsigned long size)
-{
- unsigned long size_new;
-
- size = size ? : CONFIG_EMU_SIZE;
- size_new = roundup(size, memory_block_size_bytes());
- if (size_new == size)
- return size;
- pr_warn("Increasing memory stripe size from %ld MB to %ld MB\n",
- size >> 20, size_new >> 20);
- return size_new;
-}
-
-/*
- * If there is not enough memory for the specified nodes, reduce the node count.
- */
-static int emu_setup_nodes_adjust(int nodes)
-{
- int nodes_max;
-
- nodes_max = memblock.memory.total_size / emu_size;
- nodes_max = max(nodes_max, 1);
- if (nodes_max >= nodes)
- return nodes;
- pr_warn("Not enough memory for %d nodes, reducing node count\n", nodes);
- return nodes_max;
-}
-
-/*
- * Early emu setup
- */
-static void emu_setup(void)
-{
- int nid;
-
- emu_size = emu_setup_size_adjust(emu_size);
- emu_nodes = emu_setup_nodes_adjust(emu_nodes);
- for (nid = 0; nid < emu_nodes; nid++)
- node_set(nid, node_possible_map);
- pr_info("Creating %d nodes with memory stripe size %ld MB\n",
- emu_nodes, emu_size >> 20);
-}
-
-/*
- * Return node id for given page number
- */
-static int emu_pfn_to_nid(unsigned long pfn)
-{
- return (pfn / (emu_size >> PAGE_SHIFT)) % emu_nodes;
-}
-
-/*
- * Return stripe size
- */
-static unsigned long emu_align(void)
-{
- return emu_size;
-}
-
-/*
- * Return distance between two nodes
- */
-static int emu_distance(int node1, int node2)
-{
- return (node1 != node2) * EMU_NODE_DIST;
-}
-
-/*
- * Define callbacks for generic s390 NUMA infrastructure
- */
-const struct numa_mode numa_mode_emu = {
- .name = "emu",
- .setup = emu_setup,
- .update_cpu_topology = emu_update_cpu_topology,
- .__pfn_to_nid = emu_pfn_to_nid,
- .align = emu_align,
- .distance = emu_distance,
-};
-
-/*
- * Kernel parameter: emu_nodes=<n>
- */
-static int __init early_parse_emu_nodes(char *p)
-{
- int count;
-
- if (!p || kstrtoint(p, 0, &count) != 0 || count <= 0)
- return 0;
- emu_nodes = min(count, MAX_NUMNODES);
- return 0;
-}
-early_param("emu_nodes", early_parse_emu_nodes);
-
-/*
- * Kernel parameter: emu_size=[<n>[k|M|G|T]]
- */
-static int __init early_parse_emu_size(char *p)
-{
- if (p)
- emu_size = memparse(p, NULL);
- return 0;
-}
-early_param("emu_size", early_parse_emu_size);
diff --git a/arch/s390/numa/numa.c b/arch/s390/numa/numa.c
deleted file mode 100644
index d2910fa834c8..000000000000
--- a/arch/s390/numa/numa.c
+++ /dev/null
@@ -1,171 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * NUMA support for s390
- *
- * Implement NUMA core code.
- *
- * Copyright IBM Corp. 2015
- */
-
-#define KMSG_COMPONENT "numa"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/mmzone.h>
-#include <linux/cpumask.h>
-#include <linux/memblock.h>
-#include <linux/slab.h>
-#include <linux/node.h>
-
-#include <asm/numa.h>
-#include "numa_mode.h"
-
-pg_data_t *node_data[MAX_NUMNODES];
-EXPORT_SYMBOL(node_data);
-
-cpumask_t node_to_cpumask_map[MAX_NUMNODES];
-EXPORT_SYMBOL(node_to_cpumask_map);
-
-static void plain_setup(void)
-{
- node_set(0, node_possible_map);
-}
-
-const struct numa_mode numa_mode_plain = {
- .name = "plain",
- .setup = plain_setup,
-};
-
-static const struct numa_mode *mode = &numa_mode_plain;
-
-int numa_pfn_to_nid(unsigned long pfn)
-{
- return mode->__pfn_to_nid ? mode->__pfn_to_nid(pfn) : 0;
-}
-
-void numa_update_cpu_topology(void)
-{
- if (mode->update_cpu_topology)
- mode->update_cpu_topology();
-}
-
-int __node_distance(int a, int b)
-{
- return mode->distance ? mode->distance(a, b) : 0;
-}
-EXPORT_SYMBOL(__node_distance);
-
-int numa_debug_enabled;
-
-/*
- * numa_setup_memory() - Assign bootmem to nodes
- *
- * The memory is first added to memblock without any respect to nodes.
- * This is fixed before remaining memblock memory is handed over to the
- * buddy allocator.
- * An important side effect is that large bootmem allocations might easily
- * cross node boundaries, which can be needed for large allocations with
- * smaller memory stripes in each node (i.e. when using NUMA emulation).
- *
- * Memory defines nodes:
- * Therefore this routine also sets the nodes online with memory.
- */
-static void __init numa_setup_memory(void)
-{
- unsigned long cur_base, align, end_of_dram;
- int nid = 0;
-
- end_of_dram = memblock_end_of_DRAM();
- align = mode->align ? mode->align() : ULONG_MAX;
-
- /*
- * Step through all available memory and assign it to the nodes
- * indicated by the mode implementation.
- * All nodes which are seen here will be set online.
- */
- cur_base = 0;
- do {
- nid = numa_pfn_to_nid(PFN_DOWN(cur_base));
- node_set_online(nid);
- memblock_set_node(cur_base, align, &memblock.memory, nid);
- cur_base += align;
- } while (cur_base < end_of_dram);
-
- /* Allocate and fill out node_data */
- for (nid = 0; nid < MAX_NUMNODES; nid++) {
- NODE_DATA(nid) = memblock_alloc(sizeof(pg_data_t), 8);
- if (!NODE_DATA(nid))
- panic("%s: Failed to allocate %zu bytes align=0x%x\n",
- __func__, sizeof(pg_data_t), 8);
- }
-
- for_each_online_node(nid) {
- unsigned long start_pfn, end_pfn;
- unsigned long t_start, t_end;
- int i;
-
- start_pfn = ULONG_MAX;
- end_pfn = 0;
- for_each_mem_pfn_range(i, nid, &t_start, &t_end, NULL) {
- if (t_start < start_pfn)
- start_pfn = t_start;
- if (t_end > end_pfn)
- end_pfn = t_end;
- }
- NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
- NODE_DATA(nid)->node_id = nid;
- }
-}
-
-/*
- * numa_setup() - Earliest initialization
- *
- * Assign the mode and call the mode's setup routine.
- */
-void __init numa_setup(void)
-{
- pr_info("NUMA mode: %s\n", mode->name);
- nodes_clear(node_possible_map);
- /* Initially attach all possible CPUs to node 0. */
- cpumask_copy(&node_to_cpumask_map[0], cpu_possible_mask);
- if (mode->setup)
- mode->setup();
- numa_setup_memory();
- memblock_dump_all();
-}
-
-/*
- * numa_init_late() - Initialization initcall
- *
- * Register NUMA nodes.
- */
-static int __init numa_init_late(void)
-{
- int nid;
-
- for_each_online_node(nid)
- register_one_node(nid);
- return 0;
-}
-arch_initcall(numa_init_late);
-
-static int __init parse_debug(char *parm)
-{
- numa_debug_enabled = 1;
- return 0;
-}
-early_param("numa_debug", parse_debug);
-
-static int __init parse_numa(char *parm)
-{
- if (!parm)
- return 1;
- if (strcmp(parm, numa_mode_plain.name) == 0)
- mode = &numa_mode_plain;
-#ifdef CONFIG_NUMA_EMU
- if (strcmp(parm, numa_mode_emu.name) == 0)
- mode = &numa_mode_emu;
-#endif
- return 0;
-}
-early_param("numa", parse_numa);
diff --git a/arch/s390/numa/numa_mode.h b/arch/s390/numa/numa_mode.h
deleted file mode 100644
index dfd3e2784081..000000000000
--- a/arch/s390/numa/numa_mode.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * NUMA support for s390
- *
- * Define declarations used for communication between NUMA mode
- * implementations and NUMA core functionality.
- *
- * Copyright IBM Corp. 2015
- */
-#ifndef __S390_NUMA_MODE_H
-#define __S390_NUMA_MODE_H
-
-struct numa_mode {
- char *name; /* Name of mode */
- void (*setup)(void); /* Initialize mode */
- void (*update_cpu_topology)(void); /* Called by topology code */
- int (*__pfn_to_nid)(unsigned long pfn); /* PFN to node ID */
- unsigned long (*align)(void); /* Minimum node alignment */
- int (*distance)(int a, int b); /* Distance between two nodes */
-};
-
-extern const struct numa_mode numa_mode_plain;
-extern const struct numa_mode numa_mode_emu;
-
-#endif /* __S390_NUMA_MODE_H */
diff --git a/arch/s390/numa/toptree.c b/arch/s390/numa/toptree.c
deleted file mode 100644
index 71a608cd4f61..000000000000
--- a/arch/s390/numa/toptree.c
+++ /dev/null
@@ -1,351 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * NUMA support for s390
- *
- * A tree structure used for machine topology mangling
- *
- * Copyright IBM Corp. 2015
- */
-
-#include <linux/kernel.h>
-#include <linux/memblock.h>
-#include <linux/cpumask.h>
-#include <linux/list.h>
-#include <linux/list_sort.h>
-#include <linux/slab.h>
-#include <asm/numa.h>
-
-#include "toptree.h"
-
-/**
- * toptree_alloc - Allocate and initialize a new tree node.
- * @level: The node's vertical level; level 0 contains the leaves.
- * @id: ID number, explicitly not unique beyond scope of node's siblings
- *
- * Allocate a new tree node and initialize it.
- *
- * RETURNS:
- * Pointer to the new tree node or NULL on error
- */
-struct toptree __ref *toptree_alloc(int level, int id)
-{
- struct toptree *res;
-
- if (slab_is_available())
- res = kzalloc(sizeof(*res), GFP_KERNEL);
- else
- res = memblock_alloc(sizeof(*res), 8);
- if (!res)
- return res;
-
- INIT_LIST_HEAD(&res->children);
- INIT_LIST_HEAD(&res->sibling);
- cpumask_clear(&res->mask);
- res->level = level;
- res->id = id;
- return res;
-}
-
-/**
- * toptree_remove - Remove a tree node from a tree
- * @cand: Pointer to the node to remove
- *
- * The node is detached from its parent node. The parent node's
- * masks will be updated to reflect the loss of the child.
- */
-static void toptree_remove(struct toptree *cand)
-{
- struct toptree *oldparent;
-
- list_del_init(&cand->sibling);
- oldparent = cand->parent;
- cand->parent = NULL;
- toptree_update_mask(oldparent);
-}
-
-/**
- * toptree_free - discard a tree node
- * @cand: Pointer to the tree node to discard
- *
- * Checks if @cand is attached to a parent node. Detaches it
- * cleanly using toptree_remove. Possible children are freed
- * recursively. In the end @cand itself is freed.
- */
-void __ref toptree_free(struct toptree *cand)
-{
- struct toptree *child, *tmp;
-
- if (cand->parent)
- toptree_remove(cand);
- toptree_for_each_child_safe(child, tmp, cand)
- toptree_free(child);
- if (slab_is_available())
- kfree(cand);
- else
- memblock_free_early((unsigned long)cand, sizeof(*cand));
-}
-
-/**
- * toptree_update_mask - Update node bitmasks
- * @cand: Pointer to a tree node
- *
- * The node's cpumask will be updated by combining all children's
- * masks. Then toptree_update_mask is called recursively for the
- * parent if applicable.
- *
- * NOTE:
- * This must not be called on leaves. If called on a leaf, its
- * CPU mask is cleared and lost.
- */
-void toptree_update_mask(struct toptree *cand)
-{
- struct toptree *child;
-
- cpumask_clear(&cand->mask);
- list_for_each_entry(child, &cand->children, sibling)
- cpumask_or(&cand->mask, &cand->mask, &child->mask);
- if (cand->parent)
- toptree_update_mask(cand->parent);
-}
-
-/**
- * toptree_insert - Insert a tree node into tree
- * @cand: Pointer to the node to insert
- * @target: Pointer to the node to which @cand will be added as a child
- *
- * Insert a tree node into a tree. Masks will be updated automatically.
- *
- * RETURNS:
- * 0 on success, -1 if NULL is passed as argument or the node levels
- * don't fit.
- */
-static int toptree_insert(struct toptree *cand, struct toptree *target)
-{
- if (!cand || !target)
- return -1;
- if (target->level != (cand->level + 1))
- return -1;
- list_add_tail(&cand->sibling, &target->children);
- cand->parent = target;
- toptree_update_mask(target);
- return 0;
-}
-
-/**
- * toptree_move_children - Move all child nodes of a node to a new place
- * @cand: Pointer to the node whose children are to be moved
- * @target: Pointer to the node to which @cand's children will be attached
- *
- * Take all child nodes of @cand and move them using toptree_move.
- */
-static void toptree_move_children(struct toptree *cand, struct toptree *target)
-{
- struct toptree *child, *tmp;
-
- toptree_for_each_child_safe(child, tmp, cand)
- toptree_move(child, target);
-}
-
-/**
- * toptree_unify - Merge children with same ID
- * @cand: Pointer to node whose direct children should be made unique
- *
- * When mangling the tree it is possible that a node has two or more children
- * which have the same ID. This routine merges these children into one and
- * moves all children of the merged nodes into the unified node.
- */
-void toptree_unify(struct toptree *cand)
-{
- struct toptree *child, *tmp, *cand_copy;
-
- /* Threads cannot be split, cores are not split */
- if (cand->level < 2)
- return;
-
- cand_copy = toptree_alloc(cand->level, 0);
- toptree_for_each_child_safe(child, tmp, cand) {
- struct toptree *tmpchild;
-
- if (!cpumask_empty(&child->mask)) {
- tmpchild = toptree_get_child(cand_copy, child->id);
- toptree_move_children(child, tmpchild);
- }
- toptree_free(child);
- }
- toptree_move_children(cand_copy, cand);
- toptree_free(cand_copy);
-
- toptree_for_each_child(child, cand)
- toptree_unify(child);
-}
-
-/**
- * toptree_move - Move a node to another context
- * @cand: Pointer to the node to move
- * @target: Pointer to the node where @cand should go
- *
- * In the easiest case @cand is exactly on the level below @target
- * and will be immediately moved to the target.
- *
- * If @target's level is not the direct parent level of @cand,
- * nodes for the missing levels are created and put between
- * @cand and @target. The "stacking" nodes' IDs are taken from
- * @cand's parents.
- *
- * After this it is likely to have redundant nodes in the tree
- * which are addressed by means of toptree_unify.
- */
-void toptree_move(struct toptree *cand, struct toptree *target)
-{
- struct toptree *stack_target, *real_insert_point, *ptr, *tmp;
-
- if (cand->level + 1 == target->level) {
- toptree_remove(cand);
- toptree_insert(cand, target);
- return;
- }
-
- real_insert_point = NULL;
- ptr = cand;
- stack_target = NULL;
-
- do {
- tmp = stack_target;
- stack_target = toptree_alloc(ptr->level + 1,
- ptr->parent->id);
- toptree_insert(tmp, stack_target);
- if (!real_insert_point)
- real_insert_point = stack_target;
- ptr = ptr->parent;
- } while (stack_target->level < (target->level - 1));
-
- toptree_remove(cand);
- toptree_insert(cand, real_insert_point);
- toptree_insert(stack_target, target);
-}
-
-/**
- * toptree_get_child - Access a tree node's child by its ID
- * @cand: Pointer to tree node whose child is to access
- * @id: The desired child's ID
- *
- * @cand's children are searched for a child with matching ID.
- * If no match can be found, a new child with the desired ID
- * is created and returned.
- */
-struct toptree *toptree_get_child(struct toptree *cand, int id)
-{
- struct toptree *child;
-
- toptree_for_each_child(child, cand)
- if (child->id == id)
- return child;
- child = toptree_alloc(cand->level-1, id);
- toptree_insert(child, cand);
- return child;
-}
-
-/**
- * toptree_first - Find the first descendant on specified level
- * @context: Pointer to tree node whose descendants are to be used
- * @level: The level of interest
- *
- * RETURNS:
- * @context's first descendant on the specified level, or NULL
- * if there is no matching descendant
- */
-struct toptree *toptree_first(struct toptree *context, int level)
-{
- struct toptree *child, *tmp;
-
- if (context->level == level)
- return context;
-
- if (!list_empty(&context->children)) {
- list_for_each_entry(child, &context->children, sibling) {
- tmp = toptree_first(child, level);
- if (tmp)
- return tmp;
- }
- }
- return NULL;
-}
-
-/**
- * toptree_next_sibling - Return next sibling
- * @cur: Pointer to a tree node
- *
- * RETURNS:
- * If @cur has a parent and is not the last in the parent's children list,
- * the next sibling is returned. Or NULL when there are no siblings left.
- */
-static struct toptree *toptree_next_sibling(struct toptree *cur)
-{
- if (cur->parent == NULL)
- return NULL;
-
- if (cur == list_last_entry(&cur->parent->children,
- struct toptree, sibling))
- return NULL;
- return (struct toptree *) list_next_entry(cur, sibling);
-}
-
-/**
- * toptree_next - Tree traversal function
- * @cur: Pointer to current element
- * @context: Pointer to the root node of the tree or subtree to
- * be traversed.
- * @level: The level of interest.
- *
- * RETURNS:
- * Pointer to the next node on level @level
- * or NULL when there is no next node.
- */
-struct toptree *toptree_next(struct toptree *cur, struct toptree *context,
- int level)
-{
- struct toptree *cur_context, *tmp;
-
- if (!cur)
- return NULL;
-
- if (context->level == level)
- return NULL;
-
- tmp = toptree_next_sibling(cur);
- if (tmp != NULL)
- return tmp;
-
- cur_context = cur;
- while (cur_context->level < context->level - 1) {
- /* Step up */
- cur_context = cur_context->parent;
- /* Step aside */
- tmp = toptree_next_sibling(cur_context);
- if (tmp != NULL) {
- /* Step down */
- tmp = toptree_first(tmp, level);
- if (tmp != NULL)
- return tmp;
- }
- }
- return NULL;
-}
-
-/**
- * toptree_count - Count descendants on specified level
- * @context: Pointer to node whose descendants are to be considered
- * @level: Only descendants on the specified level will be counted
- *
- * RETURNS:
- * Number of descendants on the specified level
- */
-int toptree_count(struct toptree *context, int level)
-{
- struct toptree *cur;
- int cnt = 0;
-
- toptree_for_each(cur, context, level)
- cnt++;
- return cnt;
-}
diff --git a/arch/s390/numa/toptree.h b/arch/s390/numa/toptree.h
deleted file mode 100644
index 5246371ec713..000000000000
--- a/arch/s390/numa/toptree.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * NUMA support for s390
- *
- * A tree structure used for machine topology mangling
- *
- * Copyright IBM Corp. 2015
- */
-#ifndef S390_TOPTREE_H
-#define S390_TOPTREE_H
-
-#include <linux/cpumask.h>
-#include <linux/list.h>
-
-struct toptree {
- int level;
- int id;
- cpumask_t mask;
- struct toptree *parent;
- struct list_head sibling;
- struct list_head children;
-};
-
-struct toptree *toptree_alloc(int level, int id);
-void toptree_free(struct toptree *cand);
-void toptree_update_mask(struct toptree *cand);
-void toptree_unify(struct toptree *cand);
-struct toptree *toptree_get_child(struct toptree *cand, int id);
-void toptree_move(struct toptree *cand, struct toptree *target);
-int toptree_count(struct toptree *context, int level);
-
-struct toptree *toptree_first(struct toptree *context, int level);
-struct toptree *toptree_next(struct toptree *cur, struct toptree *context,
- int level);
-
-#define toptree_for_each_child(child, ptree) \
- list_for_each_entry(child, &ptree->children, sibling)
-
-#define toptree_for_each_child_safe(child, ptmp, ptree) \
- list_for_each_entry_safe(child, ptmp, &ptree->children, sibling)
-
-#define toptree_is_last(ptree) \
- ((ptree->parent == NULL) || \
- (ptree->parent->children.prev == &ptree->sibling))
-
-#define toptree_for_each(ptree, cont, ttype) \
- for (ptree = toptree_first(cont, ttype); \
- ptree != NULL; \
- ptree = toptree_next(ptree, cont, ttype))
-
-#define toptree_for_each_safe(ptree, tmp, cont, ttype) \
- for (ptree = toptree_first(cont, ttype), \
- tmp = toptree_next(ptree, cont, ttype); \
- ptree != NULL; \
- ptree = tmp, \
- tmp = toptree_next(ptree, cont, ttype))
-
-#define toptree_for_each_sibling(ptree, start) \
- toptree_for_each(ptree, start->parent, start->level)
-
-#endif /* S390_TOPTREE_H */
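For context, a minimal sketch of how the removed toptree API declared above was typically traversed. It is illustrative only, not part of this patch; the helper name is hypothetical and the NODE/CORE level constants are assumed from the removed NUMA emulation code.

static void toptree_dump_core_counts(struct toptree *numa)
{
	struct toptree *node;

	/* Walk all NODE-level descendants of the NUMA root tree ... */
	toptree_for_each(node, numa, NODE) {
		/* ... and count the CORE-level leaves below each of them */
		pr_debug("node %d: %d cores\n", node->id,
			 toptree_count(node, CORE));
	}
}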
diff --git a/arch/s390/oprofile/Makefile b/arch/s390/oprofile/Makefile
deleted file mode 100644
index 36261f9d360b..000000000000
--- a/arch/s390/oprofile/Makefile
+++ /dev/null
@@ -1,10 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_OPROFILE) += oprofile.o
-
-DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
- oprof.o cpu_buffer.o buffer_sync.o \
- event_buffer.o oprofile_files.o \
- oprofilefs.o oprofile_stats.o \
- timer_int.o )
-
-oprofile-y := $(DRIVER_OBJS) init.o
diff --git a/arch/s390/oprofile/init.c b/arch/s390/oprofile/init.c
deleted file mode 100644
index 7441857df51b..000000000000
--- a/arch/s390/oprofile/init.c
+++ /dev/null
@@ -1,37 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * S390 Version
- * Copyright IBM Corp. 2002, 2011
- * Author(s): Thomas Spatzier (tspat@de.ibm.com)
- * Author(s): Mahesh Salgaonkar (mahesh@linux.vnet.ibm.com)
- * Author(s): Heinz Graalfs (graalfs@linux.vnet.ibm.com)
- * Author(s): Andreas Krebbel (krebbel@linux.vnet.ibm.com)
- *
- * @remark Copyright 2002-2011 OProfile authors
- */
-
-#include <linux/oprofile.h>
-#include <linux/init.h>
-#include <asm/processor.h>
-#include <asm/unwind.h>
-
-static void s390_backtrace(struct pt_regs *regs, unsigned int depth)
-{
- struct unwind_state state;
-
- unwind_for_each_frame(&state, current, regs, 0) {
- if (depth-- == 0)
- break;
- oprofile_add_trace(state.ip);
- }
-}
-
-int __init oprofile_arch_init(struct oprofile_operations *ops)
-{
- ops->backtrace = s390_backtrace;
- return 0;
-}
-
-void oprofile_arch_exit(void)
-{
-}
diff --git a/arch/s390/pci/Makefile b/arch/s390/pci/Makefile
index 748626a33028..0547a10406e7 100644
--- a/arch/s390/pci/Makefile
+++ b/arch/s390/pci/Makefile
@@ -3,5 +3,7 @@
# Makefile for the s390 PCI subsystem.
#
-obj-$(CONFIG_PCI) += pci.o pci_irq.o pci_dma.o pci_clp.o pci_sysfs.o \
- pci_event.o pci_debug.o pci_insn.o pci_mmio.o
+obj-$(CONFIG_PCI) += pci.o pci_irq.o pci_clp.o pci_sysfs.o \
+ pci_event.o pci_debug.o pci_insn.o pci_mmio.o \
+ pci_bus.o pci_kvm_hook.o
+obj-$(CONFIG_PCI_IOV) += pci_iov.o
diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
index 8e872951c07b..676ac74026a8 100644
--- a/arch/s390/pci/pci.c
+++ b/arch/s390/pci/pci.c
@@ -36,17 +36,22 @@
#include <asm/pci_clp.h>
#include <asm/pci_dma.h>
+#include "pci_bus.h"
+#include "pci_iov.h"
+
/* list of all detected zpci devices */
static LIST_HEAD(zpci_list);
static DEFINE_SPINLOCK(zpci_list_lock);
-static DECLARE_BITMAP(zpci_domain, ZPCI_NR_DEVICES);
+static DECLARE_BITMAP(zpci_domain, ZPCI_DOMAIN_BITMAP_SIZE);
static DEFINE_SPINLOCK(zpci_domain_lock);
#define ZPCI_IOMAP_ENTRIES \
min(((unsigned long) ZPCI_NR_DEVICES * PCI_STD_NUM_BARS / 2), \
ZPCI_IOMAP_MAX_ENTRIES)
+unsigned int s390_pci_no_rid;
+
static DEFINE_SPINLOCK(zpci_iomap_lock);
static unsigned long *zpci_iomap_bitmap;
struct zpci_iomap_entry *zpci_iomap_start;
@@ -56,6 +61,12 @@ DEFINE_STATIC_KEY_FALSE(have_mio);
static struct kmem_cache *zdev_fmb_cache;
+/* AEN structures that must be preserved over KVM module re-insertion */
+union zpci_sic_iib *zpci_aipb;
+EXPORT_SYMBOL_GPL(zpci_aipb);
+struct airq_iv *zpci_aif_sbv;
+EXPORT_SYMBOL_GPL(zpci_aif_sbv);
+
struct zpci_dev *get_zdev_by_fid(u32 fid)
{
struct zpci_dev *tmp, *zdev = NULL;
@@ -64,6 +75,7 @@ struct zpci_dev *get_zdev_by_fid(u32 fid)
list_for_each_entry(tmp, &zpci_list, entry) {
if (tmp->fid == fid) {
zdev = tmp;
+ zpci_zdev_get(zdev);
break;
}
}
@@ -87,17 +99,12 @@ void zpci_remove_reserved_devices(void)
spin_unlock(&zpci_list_lock);
list_for_each_entry_safe(zdev, tmp, &remove, entry)
- zpci_remove_device(zdev);
-}
-
-static struct zpci_dev *get_zdev_by_bus(struct pci_bus *bus)
-{
- return (bus && bus->sysdata) ? (struct zpci_dev *) bus->sysdata : NULL;
+ zpci_device_reserved(zdev);
}
int pci_domain_nr(struct pci_bus *bus)
{
- return ((struct zpci_dev *) bus->sysdata)->domain;
+ return ((struct zpci_bus *) bus->sysdata)->domain_nr;
}
EXPORT_SYMBOL_GPL(pci_domain_nr);
@@ -109,18 +116,27 @@ EXPORT_SYMBOL_GPL(pci_proc_domain);
/* Modify PCI: Register I/O address translation parameters */
int zpci_register_ioat(struct zpci_dev *zdev, u8 dmaas,
- u64 base, u64 limit, u64 iota)
+ u64 base, u64 limit, u64 iota, u8 *status)
{
u64 req = ZPCI_CREATE_REQ(zdev->fh, dmaas, ZPCI_MOD_FC_REG_IOAT);
struct zpci_fib fib = {0};
- u8 status;
+ u8 cc;
WARN_ON_ONCE(iota & 0x3fff);
fib.pba = base;
- fib.pal = limit;
+ /* Work around off by one in ISM virt device */
+ if (zdev->pft == PCI_FUNC_TYPE_ISM && limit > base)
+ fib.pal = limit + (1 << 12);
+ else
+ fib.pal = limit;
fib.iota = iota | ZPCI_IOTA_RTTO_FLAG;
- return zpci_mod_fc(req, &fib, &status) ? -EIO : 0;
+ fib.gd = zdev->gisa;
+ cc = zpci_mod_fc(req, &fib, status);
+ if (cc)
+ zpci_dbg(3, "reg ioat fid:%x, cc:%d, status:%d\n", zdev->fid, cc, *status);
+ return cc;
}
+EXPORT_SYMBOL_GPL(zpci_register_ioat);
/* Modify PCI: Unregister I/O address translation parameters */
int zpci_unregister_ioat(struct zpci_dev *zdev, u8 dmaas)
@@ -129,16 +145,19 @@ int zpci_unregister_ioat(struct zpci_dev *zdev, u8 dmaas)
struct zpci_fib fib = {0};
u8 cc, status;
+ fib.gd = zdev->gisa;
+
cc = zpci_mod_fc(req, &fib, &status);
- if (cc == 3) /* Function already gone. */
- cc = 0;
- return cc ? -EIO : 0;
+ if (cc)
+ zpci_dbg(3, "unreg ioat fid:%x, cc:%d, status:%d\n", zdev->fid, cc, status);
+ return cc;
}
/* Modify PCI: Set PCI function measurement parameters */
int zpci_fmb_enable_device(struct zpci_dev *zdev)
{
u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_SET_MEASURE);
+ struct zpci_iommu_ctrs *ctrs;
struct zpci_fib fib = {0};
u8 cc, status;
@@ -151,11 +170,18 @@ int zpci_fmb_enable_device(struct zpci_dev *zdev)
WARN_ON((u64) zdev->fmb & 0xf);
/* reset software counters */
- atomic64_set(&zdev->allocated_pages, 0);
- atomic64_set(&zdev->mapped_pages, 0);
- atomic64_set(&zdev->unmapped_pages, 0);
+ ctrs = zpci_get_iommu_ctrs(zdev);
+ if (ctrs) {
+ atomic64_set(&ctrs->mapped_pages, 0);
+ atomic64_set(&ctrs->unmapped_pages, 0);
+ atomic64_set(&ctrs->global_rpcits, 0);
+ atomic64_set(&ctrs->sync_map_rpcits, 0);
+ atomic64_set(&ctrs->sync_rpcits, 0);
+ }
+
fib.fmb_addr = virt_to_phys(zdev->fmb);
+ fib.gd = zdev->gisa;
cc = zpci_mod_fc(req, &fib, &status);
if (cc) {
kmem_cache_free(zdev_fmb_cache, zdev->fmb);
@@ -174,6 +200,8 @@ int zpci_fmb_disable_device(struct zpci_dev *zdev)
if (!zdev->fmb)
return -EINVAL;
+ fib.gd = zdev->gisa;
+
/* Function measurement is disabled if fmb address is zero */
cc = zpci_mod_fc(req, &fib, &status);
if (cc == 3) /* Function already gone. */
@@ -227,38 +255,25 @@ void __iowrite64_copy(void __iomem *to, const void *from, size_t count)
zpci_memcpy_toio(to, from, count);
}
-void __iomem *ioremap(unsigned long ioaddr, unsigned long size)
+void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
+ unsigned long prot)
{
- struct vm_struct *area;
- unsigned long offset;
-
- if (!size)
- return NULL;
-
+ /*
+ * When PCI MIO instructions are unavailable the "physical" address
+ * encodes a hint for accessing the PCI memory space it represents.
+ * Just pass it unchanged such that ioread/iowrite can decode it.
+ */
if (!static_branch_unlikely(&have_mio))
- return (void __iomem *) ioaddr;
+ return (void __iomem *)phys_addr;
- offset = ioaddr & ~PAGE_MASK;
- ioaddr &= PAGE_MASK;
- size = PAGE_ALIGN(size + offset);
- area = get_vm_area(size, VM_IOREMAP);
- if (!area)
- return NULL;
-
- if (ioremap_page_range((unsigned long) area->addr,
- (unsigned long) area->addr + size,
- ioaddr, PAGE_KERNEL)) {
- vunmap(area->addr);
- return NULL;
- }
- return (void __iomem *) ((unsigned long) area->addr + offset);
+ return generic_ioremap_prot(phys_addr, size, __pgprot(prot));
}
-EXPORT_SYMBOL(ioremap);
+EXPORT_SYMBOL(ioremap_prot);
void iounmap(volatile void __iomem *addr)
{
if (static_branch_likely(&have_mio))
- vunmap((__force void *) ((unsigned long) addr & PAGE_MASK));
+ generic_iounmap(addr);
}
EXPORT_SYMBOL(iounmap);
@@ -372,29 +387,17 @@ EXPORT_SYMBOL(pci_iounmap);
static int pci_read(struct pci_bus *bus, unsigned int devfn, int where,
int size, u32 *val)
{
- struct zpci_dev *zdev = get_zdev_by_bus(bus);
- int ret;
-
- if (!zdev || devfn != ZPCI_DEVFN)
- ret = -ENODEV;
- else
- ret = zpci_cfg_load(zdev, where, val, size);
+ struct zpci_dev *zdev = zdev_from_bus(bus, devfn);
- return ret;
+ return (zdev) ? zpci_cfg_load(zdev, where, val, size) : -ENODEV;
}
static int pci_write(struct pci_bus *bus, unsigned int devfn, int where,
int size, u32 val)
{
- struct zpci_dev *zdev = get_zdev_by_bus(bus);
- int ret;
+ struct zpci_dev *zdev = zdev_from_bus(bus, devfn);
- if (!zdev || devfn != ZPCI_DEVFN)
- ret = -ENODEV;
- else
- ret = zpci_cfg_store(zdev, where, val, size);
-
- return ret;
+ return (zdev) ? zpci_cfg_store(zdev, where, val, size) : -ENODEV;
}
static struct pci_ops pci_root_ops = {
@@ -402,15 +405,6 @@ static struct pci_ops pci_root_ops = {
.write = pci_write,
};
-#ifdef CONFIG_PCI_IOV
-static struct resource iov_res = {
- .name = "PCI IOV res",
- .start = 0,
- .end = -1,
- .flags = IORESOURCE_MEM,
-};
-#endif
-
static void zpci_map_resources(struct pci_dev *pdev)
{
struct zpci_dev *zdev = to_zpci(pdev);
@@ -424,23 +418,14 @@ static void zpci_map_resources(struct pci_dev *pdev)
if (zpci_use_mio(zdev))
pdev->resource[i].start =
- (resource_size_t __force) zdev->bars[i].mio_wb;
+ (resource_size_t __force) zdev->bars[i].mio_wt;
else
pdev->resource[i].start = (resource_size_t __force)
pci_iomap_range_fh(pdev, i, 0, 0);
pdev->resource[i].end = pdev->resource[i].start + len - 1;
}
-#ifdef CONFIG_PCI_IOV
- for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
- int bar = i + PCI_IOV_RESOURCES;
-
- len = pci_resource_len(pdev, bar);
- if (!len)
- continue;
- pdev->resource[bar].parent = &iov_res;
- }
-#endif
+ zpci_iov_map_resources(pdev);
}
static void zpci_unmap_resources(struct pci_dev *pdev)
@@ -484,6 +469,34 @@ static void zpci_free_iomap(struct zpci_dev *zdev, int entry)
spin_unlock(&zpci_iomap_lock);
}
+static void zpci_do_update_iomap_fh(struct zpci_dev *zdev, u32 fh)
+{
+ int bar, idx;
+
+ spin_lock(&zpci_iomap_lock);
+ for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) {
+ if (!zdev->bars[bar].size)
+ continue;
+ idx = zdev->bars[bar].map_idx;
+ if (!zpci_iomap_start[idx].count)
+ continue;
+ WRITE_ONCE(zpci_iomap_start[idx].fh, zdev->fh);
+ }
+ spin_unlock(&zpci_iomap_lock);
+}
+
+void zpci_update_fh(struct zpci_dev *zdev, u32 fh)
+{
+ if (!fh || zdev->fh == fh)
+ return;
+
+ zdev->fh = fh;
+ if (zpci_use_mio(zdev))
+ return;
+ if (zdev->has_resources && zdev_enabled(zdev))
+ zpci_do_update_iomap_fh(zdev, fh);
+}
+
static struct resource *__alloc_res(struct zpci_dev *zdev, unsigned long start,
unsigned long size, unsigned long flags)
{
@@ -505,15 +518,14 @@ static struct resource *__alloc_res(struct zpci_dev *zdev, unsigned long start,
return r;
}
-static int zpci_setup_bus_resources(struct zpci_dev *zdev,
- struct list_head *resources)
+int zpci_setup_bus_resources(struct zpci_dev *zdev)
{
unsigned long addr, size, flags;
struct resource *res;
int i, entry;
snprintf(zdev->res_name, sizeof(zdev->res_name),
- "PCI Bus %04x:%02x", zdev->domain, ZPCI_BUS_NR);
+ "PCI Bus %04x:%02x", zdev->uid, ZPCI_BUS_NR);
for (i = 0; i < PCI_STD_NUM_BARS; i++) {
if (!zdev->bars[i].size)
@@ -531,7 +543,7 @@ static int zpci_setup_bus_resources(struct zpci_dev *zdev,
flags |= IORESOURCE_MEM_64;
if (zpci_use_mio(zdev))
- addr = (unsigned long) zdev->bars[i].mio_wb;
+ addr = (unsigned long) zdev->bars[i].mio_wt;
else
addr = ZPCI_ADDR(entry);
size = 1UL << zdev->bars[i].size;
@@ -542,36 +554,45 @@ static int zpci_setup_bus_resources(struct zpci_dev *zdev,
return -ENOMEM;
}
zdev->bars[i].res = res;
- pci_add_resource(resources, res);
}
+ zdev->has_resources = 1;
return 0;
}
static void zpci_cleanup_bus_resources(struct zpci_dev *zdev)
{
+ struct resource *res;
int i;
+ pci_lock_rescan_remove();
for (i = 0; i < PCI_STD_NUM_BARS; i++) {
- if (!zdev->bars[i].size || !zdev->bars[i].res)
+ res = zdev->bars[i].res;
+ if (!res)
continue;
+ release_resource(res);
+ pci_bus_remove_resource(zdev->zbus->bus, res);
zpci_free_iomap(zdev, zdev->bars[i].map_idx);
- release_resource(zdev->bars[i].res);
- kfree(zdev->bars[i].res);
+ zdev->bars[i].res = NULL;
+ kfree(res);
}
+ zdev->has_resources = 0;
+ pci_unlock_rescan_remove();
}
-int pcibios_add_device(struct pci_dev *pdev)
+int pcibios_device_add(struct pci_dev *pdev)
{
+ struct zpci_dev *zdev = to_zpci(pdev);
struct resource *res;
int i;
+ /* The pdev has a reference to the zdev via its bus */
+ zpci_zdev_get(zdev);
if (pdev->is_physfn)
pdev->no_vf_scan = 1;
pdev->dev.groups = zpci_attr_groups;
- pdev->dev.dma_ops = &s390_pci_dma_ops;
zpci_map_resources(pdev);
for (i = 0; i < PCI_STD_NUM_BARS; i++) {
@@ -586,7 +607,10 @@ int pcibios_add_device(struct pci_dev *pdev)
void pcibios_release_device(struct pci_dev *pdev)
{
+ struct zpci_dev *zdev = to_zpci(pdev);
+
zpci_unmap_resources(pdev);
+ zpci_zdev_put(zdev);
}
int pcibios_enable_device(struct pci_dev *pdev, int mask)
@@ -607,210 +631,323 @@ void pcibios_disable_device(struct pci_dev *pdev)
zpci_debug_exit_device(zdev);
}
-#ifdef CONFIG_HIBERNATE_CALLBACKS
-static int zpci_restore(struct device *dev)
+static int __zpci_register_domain(int domain)
{
- struct pci_dev *pdev = to_pci_dev(dev);
- struct zpci_dev *zdev = to_zpci(pdev);
- int ret = 0;
-
- if (zdev->state != ZPCI_FN_STATE_ONLINE)
- goto out;
-
- ret = clp_enable_fh(zdev, ZPCI_NR_DMA_SPACES);
- if (ret)
- goto out;
-
- zpci_map_resources(pdev);
- zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
- (u64) zdev->dma_table);
-
-out:
- return ret;
+ spin_lock(&zpci_domain_lock);
+ if (test_bit(domain, zpci_domain)) {
+ spin_unlock(&zpci_domain_lock);
+ pr_err("Domain %04x is already assigned\n", domain);
+ return -EEXIST;
+ }
+ set_bit(domain, zpci_domain);
+ spin_unlock(&zpci_domain_lock);
+ return domain;
}
-static int zpci_freeze(struct device *dev)
+static int __zpci_alloc_domain(void)
{
- struct pci_dev *pdev = to_pci_dev(dev);
- struct zpci_dev *zdev = to_zpci(pdev);
+ int domain;
- if (zdev->state != ZPCI_FN_STATE_ONLINE)
- return 0;
-
- zpci_unregister_ioat(zdev, 0);
- zpci_unmap_resources(pdev);
- return clp_disable_fh(zdev);
+ spin_lock(&zpci_domain_lock);
+ /*
+ * We can always auto allocate domains below ZPCI_NR_DEVICES.
+ * There is either a free domain or we have reached the maximum in
+ * which case we would have bailed earlier.
+ */
+ domain = find_first_zero_bit(zpci_domain, ZPCI_NR_DEVICES);
+ set_bit(domain, zpci_domain);
+ spin_unlock(&zpci_domain_lock);
+ return domain;
}
-struct dev_pm_ops pcibios_pm_ops = {
- .thaw_noirq = zpci_restore,
- .freeze_noirq = zpci_freeze,
- .restore_noirq = zpci_restore,
- .poweroff_noirq = zpci_freeze,
-};
-#endif /* CONFIG_HIBERNATE_CALLBACKS */
-
-static int zpci_alloc_domain(struct zpci_dev *zdev)
+int zpci_alloc_domain(int domain)
{
if (zpci_unique_uid) {
- zdev->domain = (u16) zdev->uid;
- if (zdev->domain >= ZPCI_NR_DEVICES)
- return 0;
-
- spin_lock(&zpci_domain_lock);
- if (test_bit(zdev->domain, zpci_domain)) {
- spin_unlock(&zpci_domain_lock);
- pr_err("Adding PCI function %08x failed because domain %04x is already assigned\n",
- zdev->fid, zdev->domain);
- return -EEXIST;
- }
- set_bit(zdev->domain, zpci_domain);
- spin_unlock(&zpci_domain_lock);
- return 0;
- }
-
- spin_lock(&zpci_domain_lock);
- zdev->domain = find_first_zero_bit(zpci_domain, ZPCI_NR_DEVICES);
- if (zdev->domain == ZPCI_NR_DEVICES) {
- spin_unlock(&zpci_domain_lock);
- pr_err("Adding PCI function %08x failed because the configured limit of %d is reached\n",
- zdev->fid, ZPCI_NR_DEVICES);
- return -ENOSPC;
+ if (domain)
+ return __zpci_register_domain(domain);
+ pr_warn("UID checking was active but no UID is provided: switching to automatic domain allocation\n");
+ update_uid_checking(false);
}
- set_bit(zdev->domain, zpci_domain);
- spin_unlock(&zpci_domain_lock);
- return 0;
+ return __zpci_alloc_domain();
}
-static void zpci_free_domain(struct zpci_dev *zdev)
+void zpci_free_domain(int domain)
{
- if (zdev->domain >= ZPCI_NR_DEVICES)
- return;
-
spin_lock(&zpci_domain_lock);
- clear_bit(zdev->domain, zpci_domain);
+ clear_bit(domain, zpci_domain);
spin_unlock(&zpci_domain_lock);
}
-void pcibios_remove_bus(struct pci_bus *bus)
+
+int zpci_enable_device(struct zpci_dev *zdev)
{
- struct zpci_dev *zdev = get_zdev_by_bus(bus);
+ u32 fh = zdev->fh;
+ int rc = 0;
- zpci_exit_slot(zdev);
- zpci_cleanup_bus_resources(zdev);
- zpci_destroy_iommu(zdev);
- zpci_free_domain(zdev);
+ if (clp_enable_fh(zdev, &fh, ZPCI_NR_DMA_SPACES))
+ rc = -EIO;
+ else
+ zpci_update_fh(zdev, fh);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(zpci_enable_device);
- spin_lock(&zpci_list_lock);
- list_del(&zdev->entry);
- spin_unlock(&zpci_list_lock);
+int zpci_disable_device(struct zpci_dev *zdev)
+{
+ u32 fh = zdev->fh;
+ int cc, rc = 0;
- zpci_dbg(3, "rem fid:%x\n", zdev->fid);
- kfree(zdev);
+ cc = clp_disable_fh(zdev, &fh);
+ if (!cc) {
+ zpci_update_fh(zdev, fh);
+ } else if (cc == CLP_RC_SETPCIFN_ALRDY) {
+ pr_info("Disabling PCI function %08x had no effect as it was already disabled\n",
+ zdev->fid);
+ /* Function is already disabled - update handle */
+ rc = clp_refresh_fh(zdev->fid, &fh);
+ if (!rc) {
+ zpci_update_fh(zdev, fh);
+ rc = -EINVAL;
+ }
+ } else {
+ rc = -EIO;
+ }
+ return rc;
}
+EXPORT_SYMBOL_GPL(zpci_disable_device);
-static int zpci_scan_bus(struct zpci_dev *zdev)
+/**
+ * zpci_hot_reset_device - perform a reset of the given zPCI function
+ * @zdev: the slot which should be reset
+ *
+ * Performs a low level reset of the zPCI function. The reset is low level in
+ * the sense that the zPCI function can be reset without detaching it from the
+ * common PCI subsystem. The reset may be performed while under control of
+ * either DMA or IOMMU APIs in which case the existing DMA/IOMMU translation
+ * table is reinstated at the end of the reset.
+ *
+ * After the reset the function's internal state is reset to an initial state
+ * equivalent to its state during boot when first probing a driver.
+ * Consequently after reset the PCI function requires re-initialization via the
+ * common PCI code including re-enabling IRQs via pci_alloc_irq_vectors()
+ * and enabling the function via e.g. pci_enable_device_flags(). The caller
+ * must guard against concurrent reset attempts.
+ *
+ * In most cases this function should not be called directly but through
+ * pci_reset_function() or pci_reset_bus() which handle the save/restore and
+ * locking.
+ *
+ * Return: 0 on success and an error value otherwise
+ */
+int zpci_hot_reset_device(struct zpci_dev *zdev)
{
- LIST_HEAD(resources);
- int ret;
+ u8 status;
+ int rc;
- ret = zpci_setup_bus_resources(zdev, &resources);
- if (ret)
- goto error;
+ zpci_dbg(3, "rst fid:%x, fh:%x\n", zdev->fid, zdev->fh);
+ if (zdev_enabled(zdev)) {
+ /* Disables device access, DMAs and IRQs (reset state) */
+ rc = zpci_disable_device(zdev);
+ /*
+ * Due to a z/VM vs LPAR inconsistency in the error state the
+ * FH may indicate an enabled device but disable says the
+ * device is already disabled; don't treat it as an error here.
+ */
+ if (rc == -EINVAL)
+ rc = 0;
+ if (rc)
+ return rc;
+ }
- zdev->bus = pci_scan_root_bus(NULL, ZPCI_BUS_NR, &pci_root_ops,
- zdev, &resources);
- if (!zdev->bus) {
- ret = -EIO;
- goto error;
+ rc = zpci_enable_device(zdev);
+ if (rc)
+ return rc;
+
+ if (zdev->dma_table)
+ rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
+ virt_to_phys(zdev->dma_table), &status);
+ if (rc) {
+ zpci_disable_device(zdev);
+ return rc;
}
- zdev->bus->max_bus_speed = zdev->max_bus_speed;
- pci_bus_add_devices(zdev->bus);
- return 0;
-error:
- zpci_cleanup_bus_resources(zdev);
- pci_free_resource_list(&resources);
- return ret;
+ return 0;
}
-int zpci_enable_device(struct zpci_dev *zdev)
+/**
+ * zpci_create_device() - Create a new zpci_dev and add it to the zbus
+ * @fid: Function ID of the device to be created
+ * @fh: Current Function Handle of the device to be created
+ * @state: Initial state after creation either Standby or Configured
+ *
+ * Creates a new zpci device and adds it to its, possibly newly created, zbus
+ * as well as zpci_list.
+ *
+ * Returns: the zdev on success or an error pointer otherwise
+ */
+struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state)
{
+ struct zpci_dev *zdev;
int rc;
- rc = clp_enable_fh(zdev, ZPCI_NR_DMA_SPACES);
+ zpci_dbg(1, "add fid:%x, fh:%x, c:%d\n", fid, fh, state);
+ zdev = kzalloc(sizeof(*zdev), GFP_KERNEL);
+ if (!zdev)
+ return ERR_PTR(-ENOMEM);
+
+ /* FID and Function Handle are the static/dynamic identifiers */
+ zdev->fid = fid;
+ zdev->fh = fh;
+
+ /* Query function properties and update zdev */
+ rc = clp_query_pci_fn(zdev);
if (rc)
- goto out;
+ goto error;
+ zdev->state = state;
- rc = zpci_dma_init_device(zdev);
+ kref_init(&zdev->kref);
+ mutex_init(&zdev->lock);
+ mutex_init(&zdev->kzdev_lock);
+
+ rc = zpci_init_iommu(zdev);
if (rc)
- goto out_dma;
+ goto error;
- zdev->state = ZPCI_FN_STATE_ONLINE;
- return 0;
+ rc = zpci_bus_device_register(zdev, &pci_root_ops);
+ if (rc)
+ goto error_destroy_iommu;
-out_dma:
- clp_disable_fh(zdev);
-out:
- return rc;
+ spin_lock(&zpci_list_lock);
+ list_add_tail(&zdev->entry, &zpci_list);
+ spin_unlock(&zpci_list_lock);
+
+ return zdev;
+
+error_destroy_iommu:
+ zpci_destroy_iommu(zdev);
+error:
+ zpci_dbg(0, "add fid:%x, rc:%d\n", fid, rc);
+ kfree(zdev);
+ return ERR_PTR(rc);
}
-EXPORT_SYMBOL_GPL(zpci_enable_device);
-int zpci_disable_device(struct zpci_dev *zdev)
+bool zpci_is_device_configured(struct zpci_dev *zdev)
{
- zpci_dma_exit_device(zdev);
- return clp_disable_fh(zdev);
+ enum zpci_state state = zdev->state;
+
+ return state != ZPCI_FN_STATE_RESERVED &&
+ state != ZPCI_FN_STATE_STANDBY;
}
-EXPORT_SYMBOL_GPL(zpci_disable_device);
-int zpci_create_device(struct zpci_dev *zdev)
+/**
+ * zpci_scan_configured_device() - Scan a freshly configured zpci_dev
+ * @zdev: The zpci_dev to be configured
+ * @fh: The general function handle supplied by the platform
+ *
+ * Given a device in the configuration state Configured, enables, scans and
+ * adds it to the common code PCI subsystem if possible. If any failure occurs,
+ * the zpci_dev is left disabled.
+ *
+ * Return: 0 on success, or an error code otherwise
+ */
+int zpci_scan_configured_device(struct zpci_dev *zdev, u32 fh)
{
- int rc;
+ zpci_update_fh(zdev, fh);
+ return zpci_bus_scan_device(zdev);
+}
- rc = zpci_alloc_domain(zdev);
- if (rc)
- goto out;
+/**
+ * zpci_deconfigure_device() - Deconfigure a zpci_dev
+ * @zdev: The zpci_dev to deconfigure
+ *
+ * Deconfigure a zPCI function that is currently configured and possibly known
+ * to the common code PCI subsystem.
+ * If any failure occurs the device is left as is.
+ *
+ * Return: 0 on success, or an error code otherwise
+ */
+int zpci_deconfigure_device(struct zpci_dev *zdev)
+{
+ int rc;
- rc = zpci_init_iommu(zdev);
- if (rc)
- goto out_free;
+ if (zdev->zbus->bus)
+ zpci_bus_remove_device(zdev, false);
- mutex_init(&zdev->lock);
- if (zdev->state == ZPCI_FN_STATE_CONFIGURED) {
- rc = zpci_enable_device(zdev);
+ if (zdev_enabled(zdev)) {
+ rc = zpci_disable_device(zdev);
if (rc)
- goto out_destroy_iommu;
+ return rc;
}
- rc = zpci_scan_bus(zdev);
+
+ rc = sclp_pci_deconfigure(zdev->fid);
+ zpci_dbg(3, "deconf fid:%x, rc:%d\n", zdev->fid, rc);
if (rc)
- goto out_disable;
+ return rc;
+ zdev->state = ZPCI_FN_STATE_STANDBY;
+
+ return 0;
+}
+/**
+ * zpci_device_reserved() - Mark device as reserved
+ * @zdev: the zpci_dev that was reserved
+ *
+ * Handle the case that a given zPCI function was reserved by another system.
+ * After a call to this function the zpci_dev can not be found via
+ * get_zdev_by_fid() anymore but may still be accessible via existing
+ * references though it will not be functional anymore.
+ */
+void zpci_device_reserved(struct zpci_dev *zdev)
+{
+ if (zdev->has_hp_slot)
+ zpci_exit_slot(zdev);
+ /*
+ * Remove device from zpci_list as it is going away. This also
+ * makes sure we ignore subsequent zPCI events for this device.
+ */
spin_lock(&zpci_list_lock);
- list_add_tail(&zdev->entry, &zpci_list);
+ list_del(&zdev->entry);
spin_unlock(&zpci_list_lock);
+ zdev->state = ZPCI_FN_STATE_RESERVED;
+ zpci_dbg(3, "rsv fid:%x\n", zdev->fid);
+ zpci_zdev_put(zdev);
+}
- zpci_init_slot(zdev);
+void zpci_release_device(struct kref *kref)
+{
+ struct zpci_dev *zdev = container_of(kref, struct zpci_dev, kref);
+ int ret;
- return 0;
+ if (zdev->zbus->bus)
+ zpci_bus_remove_device(zdev, false);
-out_disable:
- if (zdev->state == ZPCI_FN_STATE_ONLINE)
+ if (zdev_enabled(zdev))
zpci_disable_device(zdev);
-out_destroy_iommu:
- zpci_destroy_iommu(zdev);
-out_free:
- zpci_free_domain(zdev);
-out:
- return rc;
-}
-void zpci_remove_device(struct zpci_dev *zdev)
-{
- if (!zdev->bus)
- return;
-
- pci_stop_root_bus(zdev->bus);
- pci_remove_root_bus(zdev->bus);
+ switch (zdev->state) {
+ case ZPCI_FN_STATE_CONFIGURED:
+ ret = sclp_pci_deconfigure(zdev->fid);
+ zpci_dbg(3, "deconf fid:%x, rc:%d\n", zdev->fid, ret);
+ fallthrough;
+ case ZPCI_FN_STATE_STANDBY:
+ if (zdev->has_hp_slot)
+ zpci_exit_slot(zdev);
+ spin_lock(&zpci_list_lock);
+ list_del(&zdev->entry);
+ spin_unlock(&zpci_list_lock);
+ zpci_dbg(3, "rsv fid:%x\n", zdev->fid);
+ fallthrough;
+ case ZPCI_FN_STATE_RESERVED:
+ if (zdev->has_resources)
+ zpci_cleanup_bus_resources(zdev);
+ zpci_bus_device_unregister(zdev);
+ zpci_destroy_iommu(zdev);
+ fallthrough;
+ default:
+ break;
+ }
+ zpci_dbg(3, "rem fid:%x\n", zdev->fid);
+ kfree_rcu(zdev, rcu);
}
int zpci_report_error(struct pci_dev *pdev,
@@ -822,6 +959,59 @@ int zpci_report_error(struct pci_dev *pdev,
}
EXPORT_SYMBOL(zpci_report_error);
+/**
+ * zpci_clear_error_state() - Clears the zPCI error state of the device
+ * @zdev: The zdev for which the zPCI error state should be reset
+ *
+ * Clear the zPCI error state of the device. If clearing the zPCI error state
+ * fails the device is left in the error state. In this case it may make sense
+ * to call zpci_io_perm_failure() on the associated pdev if it exists.
+ *
+ * Returns: 0 on success, -EIO otherwise
+ */
+int zpci_clear_error_state(struct zpci_dev *zdev)
+{
+ u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_RESET_ERROR);
+ struct zpci_fib fib = {0};
+ u8 status;
+ int cc;
+
+ cc = zpci_mod_fc(req, &fib, &status);
+ if (cc) {
+ zpci_dbg(3, "ces fid:%x, cc:%d, status:%x\n", zdev->fid, cc, status);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+/**
+ * zpci_reset_load_store_blocked() - Re-enables L/S from error state
+ * @zdev: The zdev for which to unblock load/store access
+ *
+ * Re-enables load/store access for a PCI function in the error state while
+ * keeping DMA blocked. In this state drivers can poke MMIO space to determine
+ * if error recovery is possible while catching any rogue DMA access from the
+ * device.
+ *
+ * Returns: 0 on success, -EIO otherwise
+ */
+int zpci_reset_load_store_blocked(struct zpci_dev *zdev)
+{
+ u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_RESET_BLOCK);
+ struct zpci_fib fib = {0};
+ u8 status;
+ int cc;
+
+ cc = zpci_mod_fc(req, &fib, &status);
+ if (cc) {
+ zpci_dbg(3, "rls fid:%x, cc:%d, status:%x\n", zdev->fid, cc, status);
+ return -EIO;
+ }
+
+ return 0;
+}
+
static int zpci_mem_init(void)
{
BUILD_BUG_ON(!is_power_of_2(__alignof__(struct zpci_fmb)) ||
@@ -842,6 +1032,9 @@ static int zpci_mem_init(void)
if (!zpci_iomap_bitmap)
goto error_iomap_bitmap;
+ if (static_branch_likely(&have_mio))
+ clp_setup_writeback_mio();
+
return 0;
error_iomap_bitmap:
kfree(zpci_iomap_start);
@@ -859,7 +1052,6 @@ static void zpci_mem_exit(void)
}
static unsigned int s390_pci_probe __initdata = 1;
-static unsigned int s390_pci_no_mio __initdata;
unsigned int s390_pci_force_floating __initdata;
static unsigned int s390_pci_initialized;
@@ -870,13 +1062,17 @@ char * __init pcibios_setup(char *str)
return NULL;
}
if (!strcmp(str, "nomio")) {
- s390_pci_no_mio = 1;
+ S390_lowcore.machine_flags &= ~MACHINE_FLAG_PCI_MIO;
return NULL;
}
if (!strcmp(str, "force_floating")) {
s390_pci_force_floating = 1;
return NULL;
}
+ if (!strcmp(str, "norid")) {
+ s390_pci_no_rid = 1;
+ return NULL;
+ }
return str;
}
@@ -892,12 +1088,14 @@ static int __init pci_base_init(void)
if (!s390_pci_probe)
return 0;
- if (!test_facility(69) || !test_facility(71))
+ if (!test_facility(69) || !test_facility(71)) {
+ pr_info("PCI is not supported because CPU facilities 69 or 71 are not available\n");
return 0;
+ }
- if (test_facility(153) && !s390_pci_no_mio) {
+ if (MACHINE_HAS_PCI_MIO) {
static_branch_enable(&have_mio);
- ctl_set_bit(2, 5);
+ system_ctl_set_bit(2, CR2_MIO_ADDRESSING_BIT);
}
rc = zpci_debug_init();
@@ -912,20 +1110,15 @@ static int __init pci_base_init(void)
if (rc)
goto out_irq;
- rc = zpci_dma_init();
- if (rc)
- goto out_dma;
-
rc = clp_scan_pci_devices();
if (rc)
goto out_find;
+ zpci_bus_scan_busses();
s390_pci_initialized = 1;
return 0;
out_find:
- zpci_dma_exit();
-out_dma:
zpci_irq_exit();
out_irq:
zpci_mem_exit();
@@ -935,9 +1128,3 @@ out:
return rc;
}
subsys_initcall_sync(pci_base_init);
-
-void zpci_rescan(void)
-{
- if (zpci_is_enabled())
- clp_rescan_pci_devices_simple();
-}
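With the hunk above, get_zdev_by_fid() now takes a reference on the zpci_dev it returns, so callers are expected to balance the lookup with zpci_zdev_put() from pci_bus.h (added below). A minimal caller sketch, illustrative only and not part of the patch; the function name is hypothetical.

static int example_use_function(u32 fid)
{
	struct zpci_dev *zdev;

	zdev = get_zdev_by_fid(fid);	/* takes a reference on zdev */
	if (!zdev)
		return -ENODEV;

	/* ... use zdev while the reference is held ... */

	zpci_zdev_put(zdev);		/* drop the lookup reference */
	return 0;
}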
diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c
new file mode 100644
index 000000000000..daa5d7450c7d
--- /dev/null
+++ b/arch/s390/pci/pci_bus.c
@@ -0,0 +1,374 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corp. 2020
+ *
+ * Author(s):
+ * Pierre Morel <pmorel@linux.ibm.com>
+ *
+ */
+
+#define KMSG_COMPONENT "zpci"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/delay.h>
+#include <linux/seq_file.h>
+#include <linux/jump_label.h>
+#include <linux/pci.h>
+#include <linux/printk.h>
+
+#include <asm/pci_clp.h>
+#include <asm/pci_dma.h>
+
+#include "pci_bus.h"
+#include "pci_iov.h"
+
+static LIST_HEAD(zbus_list);
+static DEFINE_MUTEX(zbus_list_lock);
+static int zpci_nb_devices;
+
+/* zpci_bus_prepare_device - Prepare a zPCI function for scanning
+ * @zdev: the zPCI function to be prepared
+ *
+ * The PCI resources for the function are set up and added to its zbus and the
+ * function is enabled. The function must be added to a zbus which must have
+ * a PCI bus created. If an error occurs the zPCI function is not enabled.
+ *
+ * Return: 0 on success, an error code otherwise
+ */
+static int zpci_bus_prepare_device(struct zpci_dev *zdev)
+{
+ int rc, i;
+
+ if (!zdev_enabled(zdev)) {
+ rc = zpci_enable_device(zdev);
+ if (rc)
+ return rc;
+ }
+
+ if (!zdev->has_resources) {
+ zpci_setup_bus_resources(zdev);
+ for (i = 0; i < PCI_STD_NUM_BARS; i++) {
+ if (zdev->bars[i].res)
+ pci_bus_add_resource(zdev->zbus->bus, zdev->bars[i].res, 0);
+ }
+ }
+
+ return 0;
+}
+
+/* zpci_bus_scan_device - Scan a single device adding it to the PCI core
+ * @zdev: the zdev to be scanned
+ *
+ * Scans the PCI function making it available to the common PCI code.
+ *
+ * Return: 0 on success, an error value otherwise
+ */
+int zpci_bus_scan_device(struct zpci_dev *zdev)
+{
+ struct pci_dev *pdev;
+ int rc;
+
+ rc = zpci_bus_prepare_device(zdev);
+ if (rc)
+ return rc;
+
+ pdev = pci_scan_single_device(zdev->zbus->bus, zdev->devfn);
+ if (!pdev)
+ return -ENODEV;
+
+ pci_lock_rescan_remove();
+ pci_bus_add_device(pdev);
+ pci_unlock_rescan_remove();
+
+ return 0;
+}
+
+/* zpci_bus_remove_device - Removes the given zdev from the PCI core
+ * @zdev: the zdev to be removed from the PCI core
+ * @set_error: if true the device's error state is set to permanent failure
+ *
+ * Sets a zPCI device to a configured but offline state; the zPCI
+ * device is still accessible through its hotplug slot and the zPCI
+ * API but is removed from the common code PCI bus, making it
+ * no longer available to drivers.
+ */
+void zpci_bus_remove_device(struct zpci_dev *zdev, bool set_error)
+{
+ struct zpci_bus *zbus = zdev->zbus;
+ struct pci_dev *pdev;
+
+ if (!zdev->zbus->bus)
+ return;
+
+ pdev = pci_get_slot(zbus->bus, zdev->devfn);
+ if (pdev) {
+ if (set_error)
+ pdev->error_state = pci_channel_io_perm_failure;
+ if (pdev->is_virtfn) {
+ zpci_iov_remove_virtfn(pdev, zdev->vfn);
+ /* balance pci_get_slot */
+ pci_dev_put(pdev);
+ return;
+ }
+ pci_stop_and_remove_bus_device_locked(pdev);
+ /* balance pci_get_slot */
+ pci_dev_put(pdev);
+ }
+}
+
+/* zpci_bus_scan_bus - Scan all configured zPCI functions on the bus
+ * @zbus: the zbus to be scanned
+ *
+ * Enables and scans all PCI functions on the bus making them available to the
+ * common PCI code. If a PCI function fails to be initialized an error will be
+ * returned but attempts will still be made for all other functions on the bus.
+ *
+ * Return: 0 on success, an error value otherwise
+ */
+int zpci_bus_scan_bus(struct zpci_bus *zbus)
+{
+ struct zpci_dev *zdev;
+ int devfn, rc, ret = 0;
+
+ for (devfn = 0; devfn < ZPCI_FUNCTIONS_PER_BUS; devfn++) {
+ zdev = zbus->function[devfn];
+ if (zdev && zdev->state == ZPCI_FN_STATE_CONFIGURED) {
+ rc = zpci_bus_prepare_device(zdev);
+ if (rc)
+ ret = -EIO;
+ }
+ }
+
+ pci_lock_rescan_remove();
+ pci_scan_child_bus(zbus->bus);
+ pci_bus_add_devices(zbus->bus);
+ pci_unlock_rescan_remove();
+
+ return ret;
+}
+
+/* zpci_bus_scan_busses - Scan all registered busses
+ *
+ * Scan all available zbusses
+ *
+ */
+void zpci_bus_scan_busses(void)
+{
+ struct zpci_bus *zbus = NULL;
+
+ mutex_lock(&zbus_list_lock);
+ list_for_each_entry(zbus, &zbus_list, bus_next) {
+ zpci_bus_scan_bus(zbus);
+ cond_resched();
+ }
+ mutex_unlock(&zbus_list_lock);
+}
+
+/* zpci_bus_create_pci_bus - Create the PCI bus associated with this zbus
+ * @zbus: the zbus holding the zdevices
+ * @fr: PCI root function that will determine the bus's domain and bus speed
+ * @ops: the pci operations
+ *
+ * The PCI function @fr determines the domain (its UID), multifunction property
+ * and maximum bus speed of the entire bus.
+ *
+ * Return: 0 on success, an error code otherwise
+ */
+static int zpci_bus_create_pci_bus(struct zpci_bus *zbus, struct zpci_dev *fr, struct pci_ops *ops)
+{
+ struct pci_bus *bus;
+ int domain;
+
+ domain = zpci_alloc_domain((u16)fr->uid);
+ if (domain < 0)
+ return domain;
+
+ zbus->domain_nr = domain;
+ zbus->multifunction = fr->rid_available;
+ zbus->max_bus_speed = fr->max_bus_speed;
+
+ /*
+ * Note that zbus->resources is taken over by pci_create_root_bus() and
+ * is left empty after a successful call
+ */
+ bus = pci_create_root_bus(NULL, ZPCI_BUS_NR, ops, zbus, &zbus->resources);
+ if (!bus) {
+ zpci_free_domain(zbus->domain_nr);
+ return -EFAULT;
+ }
+
+ zbus->bus = bus;
+
+ return 0;
+}
+
+static void zpci_bus_release(struct kref *kref)
+{
+ struct zpci_bus *zbus = container_of(kref, struct zpci_bus, kref);
+
+ if (zbus->bus) {
+ pci_lock_rescan_remove();
+ pci_stop_root_bus(zbus->bus);
+
+ zpci_free_domain(zbus->domain_nr);
+ pci_free_resource_list(&zbus->resources);
+
+ pci_remove_root_bus(zbus->bus);
+ pci_unlock_rescan_remove();
+ }
+
+ mutex_lock(&zbus_list_lock);
+ list_del(&zbus->bus_next);
+ mutex_unlock(&zbus_list_lock);
+ kfree(zbus);
+}
+
+static void zpci_bus_put(struct zpci_bus *zbus)
+{
+ kref_put(&zbus->kref, zpci_bus_release);
+}
+
+static struct zpci_bus *zpci_bus_get(int pchid)
+{
+ struct zpci_bus *zbus;
+
+ mutex_lock(&zbus_list_lock);
+ list_for_each_entry(zbus, &zbus_list, bus_next) {
+ if (pchid == zbus->pchid) {
+ kref_get(&zbus->kref);
+ goto out_unlock;
+ }
+ }
+ zbus = NULL;
+out_unlock:
+ mutex_unlock(&zbus_list_lock);
+ return zbus;
+}
+
+static struct zpci_bus *zpci_bus_alloc(int pchid)
+{
+ struct zpci_bus *zbus;
+
+ zbus = kzalloc(sizeof(*zbus), GFP_KERNEL);
+ if (!zbus)
+ return NULL;
+
+ zbus->pchid = pchid;
+ INIT_LIST_HEAD(&zbus->bus_next);
+ mutex_lock(&zbus_list_lock);
+ list_add_tail(&zbus->bus_next, &zbus_list);
+ mutex_unlock(&zbus_list_lock);
+
+ kref_init(&zbus->kref);
+ INIT_LIST_HEAD(&zbus->resources);
+
+ zbus->bus_resource.start = 0;
+ zbus->bus_resource.end = ZPCI_BUS_NR;
+ zbus->bus_resource.flags = IORESOURCE_BUS;
+ pci_add_resource(&zbus->resources, &zbus->bus_resource);
+
+ return zbus;
+}
+
+void pcibios_bus_add_device(struct pci_dev *pdev)
+{
+ struct zpci_dev *zdev = to_zpci(pdev);
+
+ /*
+ * With pdev->no_vf_scan the common PCI probing code does not
+ * perform PF/VF linking.
+ */
+ if (zdev->vfn) {
+ zpci_iov_setup_virtfn(zdev->zbus, pdev, zdev->vfn);
+ pdev->no_command_memory = 1;
+ }
+}
+
+static int zpci_bus_add_device(struct zpci_bus *zbus, struct zpci_dev *zdev)
+{
+ int rc = -EINVAL;
+
+ if (zbus->function[zdev->devfn]) {
+ pr_err("devfn %04x is already assigned\n", zdev->devfn);
+ return rc;
+ }
+
+ zdev->zbus = zbus;
+ zbus->function[zdev->devfn] = zdev;
+ zpci_nb_devices++;
+
+ if (zbus->multifunction && !zdev->rid_available) {
+ WARN_ONCE(1, "rid_available not set for multifunction\n");
+ goto error;
+ }
+ rc = zpci_init_slot(zdev);
+ if (rc)
+ goto error;
+ zdev->has_hp_slot = 1;
+
+ return 0;
+
+error:
+ zbus->function[zdev->devfn] = NULL;
+ zdev->zbus = NULL;
+ zpci_nb_devices--;
+ return rc;
+}
+
+int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops)
+{
+ struct zpci_bus *zbus = NULL;
+ int rc = -EBADF;
+
+ if (zpci_nb_devices == ZPCI_NR_DEVICES) {
+ pr_warn("Adding PCI function %08x failed because the configured limit of %d is reached\n",
+ zdev->fid, ZPCI_NR_DEVICES);
+ return -ENOSPC;
+ }
+
+ if (zdev->devfn >= ZPCI_FUNCTIONS_PER_BUS)
+ return -EINVAL;
+
+ if (!s390_pci_no_rid && zdev->rid_available)
+ zbus = zpci_bus_get(zdev->pchid);
+
+ if (!zbus) {
+ zbus = zpci_bus_alloc(zdev->pchid);
+ if (!zbus)
+ return -ENOMEM;
+ }
+
+ if (!zbus->bus) {
+ /* The UID of the first PCI function registered with a zpci_bus
+ * is used as the domain number for that bus. Currently there
+ * is exactly one zpci_bus per domain.
+ */
+ rc = zpci_bus_create_pci_bus(zbus, zdev, ops);
+ if (rc)
+ goto error;
+ }
+
+ rc = zpci_bus_add_device(zbus, zdev);
+ if (rc)
+ goto error;
+
+ return 0;
+
+error:
+ pr_err("Adding PCI function %08x failed\n", zdev->fid);
+ zpci_bus_put(zbus);
+ return rc;
+}
+
+void zpci_bus_device_unregister(struct zpci_dev *zdev)
+{
+ struct zpci_bus *zbus = zdev->zbus;
+
+ zpci_nb_devices--;
+ zbus->function[zdev->devfn] = NULL;
+ zpci_bus_put(zbus);
+}
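
The bus lifetime handling added above follows a get-or-create pattern: zpci_bus_get() looks up an existing bus on the global list and takes a kref, zpci_bus_alloc() creates and enlists a new one otherwise, and the final zpci_bus_put() tears everything down in zpci_bus_release(). As a rough illustration only (not the kernel code itself), the following self-contained userspace sketch mirrors that pattern with a pthread mutex and a plain reference count; the names bus_get, bus_alloc and bus_put are hypothetical.

/* Simplified userspace sketch of the get-or-create refcount pattern used by
 * zpci_bus_get()/zpci_bus_alloc()/zpci_bus_put(). Names are hypothetical.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct bus {
	int pchid;
	int refcount;		/* plays the role of the kref */
	struct bus *next;	/* plays the role of the zbus_list entry */
};

static struct bus *bus_list;
static pthread_mutex_t bus_list_lock = PTHREAD_MUTEX_INITIALIZER;

static struct bus *bus_get(int pchid)
{
	struct bus *b;

	pthread_mutex_lock(&bus_list_lock);
	for (b = bus_list; b; b = b->next) {
		if (b->pchid == pchid) {
			b->refcount++;	/* like kref_get() */
			break;
		}
	}
	pthread_mutex_unlock(&bus_list_lock);
	return b;
}

static struct bus *bus_alloc(int pchid)
{
	struct bus *b = calloc(1, sizeof(*b));

	if (!b)
		return NULL;
	b->pchid = pchid;
	b->refcount = 1;	/* like kref_init() */
	pthread_mutex_lock(&bus_list_lock);
	b->next = bus_list;
	bus_list = b;
	pthread_mutex_unlock(&bus_list_lock);
	return b;
}

static void bus_put(struct bus *b)
{
	struct bus **p;

	pthread_mutex_lock(&bus_list_lock);
	if (--b->refcount == 0) {	/* like kref_put() reaching zero */
		for (p = &bus_list; *p; p = &(*p)->next) {
			if (*p == b) {
				*p = b->next;
				break;
			}
		}
		free(b);
	}
	pthread_mutex_unlock(&bus_list_lock);
}

int main(void)
{
	struct bus *b = bus_get(1);

	if (!b)
		b = bus_alloc(1);
	if (!b)
		return 1;
	printf("pchid %d, refcount %d\n", b->pchid, b->refcount);
	bus_put(b);
	return 0;
}

Unlike this sketch, the kernel variant drops the reference with an atomic kref and only takes the list mutex in the release path, but the lookup-then-get versus allocate-then-enlist split is the same.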
diff --git a/arch/s390/pci/pci_bus.h b/arch/s390/pci/pci_bus.h
new file mode 100644
index 000000000000..af9f0ac79a1b
--- /dev/null
+++ b/arch/s390/pci/pci_bus.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright IBM Corp. 2020
+ *
+ * Author(s):
+ * Pierre Morel <pmorel@linux.ibm.com>
+ *
+ */
+
+int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops);
+void zpci_bus_device_unregister(struct zpci_dev *zdev);
+
+int zpci_bus_scan_bus(struct zpci_bus *zbus);
+void zpci_bus_scan_busses(void);
+
+int zpci_bus_scan_device(struct zpci_dev *zdev);
+void zpci_bus_remove_device(struct zpci_dev *zdev, bool set_error);
+
+void zpci_release_device(struct kref *kref);
+static inline void zpci_zdev_put(struct zpci_dev *zdev)
+{
+ if (zdev)
+ kref_put(&zdev->kref, zpci_release_device);
+}
+
+static inline void zpci_zdev_get(struct zpci_dev *zdev)
+{
+ kref_get(&zdev->kref);
+}
+
+int zpci_alloc_domain(int domain);
+void zpci_free_domain(int domain);
+int zpci_setup_bus_resources(struct zpci_dev *zdev);
+
+static inline struct zpci_dev *zdev_from_bus(struct pci_bus *bus,
+ unsigned int devfn)
+{
+ struct zpci_bus *zbus = bus->sysdata;
+
+ return (devfn >= ZPCI_FUNCTIONS_PER_BUS) ? NULL : zbus->function[devfn];
+}
+
diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c
index 4c613e569fe0..ee90a91ed888 100644
--- a/arch/s390/pci/pci_clp.c
+++ b/arch/s390/pci/pci_clp.c
@@ -17,17 +17,20 @@
#include <linux/delay.h>
#include <linux/pci.h>
#include <linux/uaccess.h>
+#include <asm/asm-extable.h>
#include <asm/pci_debug.h>
#include <asm/pci_clp.h>
#include <asm/clp.h>
#include <uapi/asm/clp.h>
+#include "pci_bus.h"
+
bool zpci_unique_uid;
-static void update_uid_checking(bool new)
+void update_uid_checking(bool new)
{
if (zpci_unique_uid != new)
- zpci_dbg(1, "uid checking:%d\n", new);
+ zpci_dbg(3, "uid checking:%d\n", new);
zpci_unique_uid = new;
}
@@ -102,6 +105,9 @@ static void clp_store_query_pci_fngrp(struct zpci_dev *zdev,
zdev->msi_addr = response->msia;
zdev->max_msi = response->noi;
zdev->fmb_update = response->mui;
+ zdev->version = response->version;
+ zdev->maxstbl = response->maxstbl;
+ zdev->dtsm = response->dtsm;
switch (response->version) {
case 1:
@@ -155,13 +161,19 @@ static int clp_store_query_pci_fn(struct zpci_dev *zdev,
zdev->pfgid = response->pfgid;
zdev->pft = response->pft;
zdev->vfn = response->vfn;
+ zdev->port = response->port;
zdev->uid = response->uid;
zdev->fmb_length = sizeof(u32) * response->fmb_len;
+ zdev->rid_available = response->rid_avail;
+ zdev->is_physfn = response->is_physfn;
+ if (!s390_pci_no_rid && zdev->rid_available)
+ zdev->devfn = response->rid & ZPCI_RID_MASK_DEVFN;
memcpy(zdev->pfip, response->pfip, sizeof(zdev->pfip));
if (response->util_str_avail) {
memcpy(zdev->util_str, response->util_str,
sizeof(zdev->util_str));
+ zdev->util_str_avail = 1;
}
zdev->mio_capable = response->mio_addr_avail;
for (i = 0; i < PCI_STD_NUM_BARS; i++) {
@@ -174,7 +186,7 @@ static int clp_store_query_pci_fn(struct zpci_dev *zdev,
return 0;
}
-static int clp_query_pci_fn(struct zpci_dev *zdev, u32 fh)
+int clp_query_pci_fn(struct zpci_dev *zdev)
{
struct clp_req_rsp_query_pci *rrb;
int rc;
@@ -187,7 +199,7 @@ static int clp_query_pci_fn(struct zpci_dev *zdev, u32 fh)
rrb->request.hdr.len = sizeof(rrb->request);
rrb->request.hdr.cmd = CLP_QUERY_PCI_FN;
rrb->response.hdr.len = sizeof(rrb->response);
- rrb->request.fh = fh;
+ rrb->request.fh = zdev->fh;
rc = clp_req(rrb, CLP_LPS_PCI);
if (!rc && rrb->response.hdr.rsp == CLP_RC_OK) {
@@ -205,60 +217,39 @@ out:
return rc;
}
-int clp_add_pci_device(u32 fid, u32 fh, int configured)
-{
- struct zpci_dev *zdev;
- int rc = -ENOMEM;
-
- zpci_dbg(3, "add fid:%x, fh:%x, c:%d\n", fid, fh, configured);
- zdev = kzalloc(sizeof(*zdev), GFP_KERNEL);
- if (!zdev)
- goto error;
-
- zdev->fh = fh;
- zdev->fid = fid;
-
- /* Query function properties and update zdev */
- rc = clp_query_pci_fn(zdev, fh);
- if (rc)
- goto error;
-
- if (configured)
- zdev->state = ZPCI_FN_STATE_CONFIGURED;
- else
- zdev->state = ZPCI_FN_STATE_STANDBY;
-
- rc = zpci_create_device(zdev);
- if (rc)
- goto error;
- return 0;
-
-error:
- zpci_dbg(0, "add fid:%x, rc:%d\n", fid, rc);
- kfree(zdev);
- return rc;
-}
-
-/*
- * Enable/Disable a given PCI function defined by its function handle.
+/**
+ * clp_set_pci_fn() - Execute a command on a PCI function
+ * @zdev: Function that will be affected
+ * @fh: Out parameter for updated function handle
+ * @nr_dma_as: DMA address space number
+ * @command: The command code to execute
+ *
+ * Returns: 0 on success, < 0 for Linux errors (e.g. -ENOMEM), and
+ * > 0 for non-success platform responses
*/
-static int clp_set_pci_fn(u32 *fh, u8 nr_dma_as, u8 command)
+static int clp_set_pci_fn(struct zpci_dev *zdev, u32 *fh, u8 nr_dma_as, u8 command)
{
struct clp_req_rsp_set_pci *rrb;
int rc, retries = 100;
+ u32 gisa = 0;
+ *fh = 0;
rrb = clp_alloc_block(GFP_KERNEL);
if (!rrb)
return -ENOMEM;
+ if (command != CLP_SET_DISABLE_PCI_FN)
+ gisa = zdev->gisa;
+
do {
memset(rrb, 0, sizeof(*rrb));
rrb->request.hdr.len = sizeof(rrb->request);
rrb->request.hdr.cmd = CLP_SET_PCI_FN;
rrb->response.hdr.len = sizeof(rrb->response);
- rrb->request.fh = *fh;
+ rrb->request.fh = zdev->fh;
rrb->request.oc = command;
rrb->request.ndas = nr_dma_as;
+ rrb->request.gisa = gisa;
rc = clp_req(rrb, CLP_LPS_PCI);
if (rrb->response.hdr.rsp == CLP_RC_SETPCIFN_BUSY) {
@@ -269,50 +260,107 @@ static int clp_set_pci_fn(u32 *fh, u8 nr_dma_as, u8 command)
}
} while (rrb->response.hdr.rsp == CLP_RC_SETPCIFN_BUSY);
- if (!rc && rrb->response.hdr.rsp == CLP_RC_OK)
+ if (!rc && rrb->response.hdr.rsp == CLP_RC_OK) {
*fh = rrb->response.fh;
- else {
+ } else {
zpci_err("Set PCI FN:\n");
zpci_err_clp(rrb->response.hdr.rsp, rc);
- rc = -EIO;
+ if (!rc)
+ rc = rrb->response.hdr.rsp;
}
clp_free_block(rrb);
return rc;
}
-int clp_enable_fh(struct zpci_dev *zdev, u8 nr_dma_as)
+int clp_setup_writeback_mio(void)
{
- u32 fh = zdev->fh;
+ struct clp_req_rsp_slpc_pci *rrb;
+ u8 wb_bit_pos;
int rc;
- rc = clp_set_pci_fn(&fh, nr_dma_as, CLP_SET_ENABLE_PCI_FN);
- zpci_dbg(3, "ena fid:%x, fh:%x, rc:%d\n", zdev->fid, fh, rc);
- if (rc)
- goto out;
+ rrb = clp_alloc_block(GFP_KERNEL);
+ if (!rrb)
+ return -ENOMEM;
+
+ memset(rrb, 0, sizeof(*rrb));
+ rrb->request.hdr.len = sizeof(rrb->request);
+ rrb->request.hdr.cmd = CLP_SLPC;
+ rrb->response.hdr.len = sizeof(rrb->response);
+
+ rc = clp_req(rrb, CLP_LPS_PCI);
+ if (!rc && rrb->response.hdr.rsp == CLP_RC_OK) {
+ if (rrb->response.vwb) {
+ wb_bit_pos = rrb->response.mio_wb;
+ set_bit_inv(wb_bit_pos, &mio_wb_bit_mask);
+ zpci_dbg(3, "wb bit: %d\n", wb_bit_pos);
+ } else {
+ zpci_dbg(3, "wb bit: n.a.\n");
+ }
+
+ } else {
+ zpci_err("SLPC PCI:\n");
+ zpci_err_clp(rrb->response.hdr.rsp, rc);
+ rc = -EIO;
+ }
+ clp_free_block(rrb);
+ return rc;
+}
+
+int clp_enable_fh(struct zpci_dev *zdev, u32 *fh, u8 nr_dma_as)
+{
+ int rc;
- zdev->fh = fh;
- if (zpci_use_mio(zdev)) {
- rc = clp_set_pci_fn(&fh, nr_dma_as, CLP_SET_ENABLE_MIO);
- zpci_dbg(3, "ena mio fid:%x, fh:%x, rc:%d\n", zdev->fid, fh, rc);
+ rc = clp_set_pci_fn(zdev, fh, nr_dma_as, CLP_SET_ENABLE_PCI_FN);
+ zpci_dbg(3, "ena fid:%x, fh:%x, rc:%d\n", zdev->fid, *fh, rc);
+ if (!rc && zpci_use_mio(zdev)) {
+ rc = clp_set_pci_fn(zdev, fh, nr_dma_as, CLP_SET_ENABLE_MIO);
+ zpci_dbg(3, "ena mio fid:%x, fh:%x, rc:%d\n",
+ zdev->fid, *fh, rc);
if (rc)
- clp_disable_fh(zdev);
+ clp_disable_fh(zdev, fh);
}
-out:
return rc;
}
-int clp_disable_fh(struct zpci_dev *zdev)
+int clp_disable_fh(struct zpci_dev *zdev, u32 *fh)
{
- u32 fh = zdev->fh;
int rc;
if (!zdev_enabled(zdev))
return 0;
- rc = clp_set_pci_fn(&fh, 0, CLP_SET_DISABLE_PCI_FN);
- zpci_dbg(3, "dis fid:%x, fh:%x, rc:%d\n", zdev->fid, fh, rc);
- if (!rc)
- zdev->fh = fh;
+ rc = clp_set_pci_fn(zdev, fh, 0, CLP_SET_DISABLE_PCI_FN);
+ zpci_dbg(3, "dis fid:%x, fh:%x, rc:%d\n", zdev->fid, *fh, rc);
+ return rc;
+}
+
+static int clp_list_pci_req(struct clp_req_rsp_list_pci *rrb,
+ u64 *resume_token, int *nentries)
+{
+ int rc;
+
+ memset(rrb, 0, sizeof(*rrb));
+ rrb->request.hdr.len = sizeof(rrb->request);
+ rrb->request.hdr.cmd = CLP_LIST_PCI;
+ /* store as many entries as possible */
+ rrb->response.hdr.len = CLP_BLK_SIZE - LIST_PCI_HDR_LEN;
+ rrb->request.resume_token = *resume_token;
+
+ /* Get PCI function handle list */
+ rc = clp_req(rrb, CLP_LPS_PCI);
+ if (rc || rrb->response.hdr.rsp != CLP_RC_OK) {
+ zpci_err("List PCI FN:\n");
+ zpci_err_clp(rrb->response.hdr.rsp, rc);
+ return -EIO;
+ }
+
+ update_uid_checking(rrb->response.uid_checking);
+ WARN_ON_ONCE(rrb->response.entry_size !=
+ sizeof(struct clp_fh_list_entry));
+
+ *nentries = (rrb->response.hdr.len - LIST_PCI_HDR_LEN) /
+ rrb->response.entry_size;
+ *resume_token = rrb->response.resume_token;
return rc;
}
@@ -321,53 +369,43 @@ static int clp_list_pci(struct clp_req_rsp_list_pci *rrb, void *data,
void (*cb)(struct clp_fh_list_entry *, void *))
{
u64 resume_token = 0;
- int entries, i, rc;
+ int nentries, i, rc;
do {
- memset(rrb, 0, sizeof(*rrb));
- rrb->request.hdr.len = sizeof(rrb->request);
- rrb->request.hdr.cmd = CLP_LIST_PCI;
- /* store as many entries as possible */
- rrb->response.hdr.len = CLP_BLK_SIZE - LIST_PCI_HDR_LEN;
- rrb->request.resume_token = resume_token;
-
- /* Get PCI function handle list */
- rc = clp_req(rrb, CLP_LPS_PCI);
- if (rc || rrb->response.hdr.rsp != CLP_RC_OK) {
- zpci_err("List PCI FN:\n");
- zpci_err_clp(rrb->response.hdr.rsp, rc);
- rc = -EIO;
- goto out;
- }
-
- update_uid_checking(rrb->response.uid_checking);
- WARN_ON_ONCE(rrb->response.entry_size !=
- sizeof(struct clp_fh_list_entry));
-
- entries = (rrb->response.hdr.len - LIST_PCI_HDR_LEN) /
- rrb->response.entry_size;
-
- resume_token = rrb->response.resume_token;
- for (i = 0; i < entries; i++)
+ rc = clp_list_pci_req(rrb, &resume_token, &nentries);
+ if (rc)
+ return rc;
+ for (i = 0; i < nentries; i++)
cb(&rrb->response.fh_list[i], data);
} while (resume_token);
-out:
+
return rc;
}
-static void __clp_add(struct clp_fh_list_entry *entry, void *data)
+static int clp_find_pci(struct clp_req_rsp_list_pci *rrb, u32 fid,
+ struct clp_fh_list_entry *entry)
{
- struct zpci_dev *zdev;
+ struct clp_fh_list_entry *fh_list;
+ u64 resume_token = 0;
+ int nentries, i, rc;
- if (!entry->vendor_id)
- return;
+ do {
+ rc = clp_list_pci_req(rrb, &resume_token, &nentries);
+ if (rc)
+ return rc;
+ fh_list = rrb->response.fh_list;
+ for (i = 0; i < nentries; i++) {
+ if (fh_list[i].fid == fid) {
+ *entry = fh_list[i];
+ return 0;
+ }
+ }
+ } while (resume_token);
- zdev = get_zdev_by_fid(entry->fid);
- if (!zdev)
- clp_add_pci_device(entry->fid, entry->fh, entry->config_state);
+ return -ENODEV;
}
-static void __clp_update(struct clp_fh_list_entry *entry, void *data)
+static void __clp_add(struct clp_fh_list_entry *entry, void *data)
{
struct zpci_dev *zdev;
@@ -375,10 +413,11 @@ static void __clp_update(struct clp_fh_list_entry *entry, void *data)
return;
zdev = get_zdev_by_fid(entry->fid);
- if (!zdev)
+ if (zdev) {
+ zpci_zdev_put(zdev);
return;
-
- zdev->fh = entry->fh;
+ }
+ zpci_create_device(entry->fid, entry->fh, entry->config_state);
}
int clp_scan_pci_devices(void)
@@ -396,66 +435,44 @@ int clp_scan_pci_devices(void)
return rc;
}
-int clp_rescan_pci_devices(void)
-{
- struct clp_req_rsp_list_pci *rrb;
- int rc;
-
- zpci_remove_reserved_devices();
-
- rrb = clp_alloc_block(GFP_KERNEL);
- if (!rrb)
- return -ENOMEM;
-
- rc = clp_list_pci(rrb, NULL, __clp_add);
-
- clp_free_block(rrb);
- return rc;
-}
-
-int clp_rescan_pci_devices_simple(void)
+/*
+ * Get the current function handle of the function matching @fid
+ */
+int clp_refresh_fh(u32 fid, u32 *fh)
{
struct clp_req_rsp_list_pci *rrb;
+ struct clp_fh_list_entry entry;
int rc;
rrb = clp_alloc_block(GFP_NOWAIT);
if (!rrb)
return -ENOMEM;
- rc = clp_list_pci(rrb, NULL, __clp_update);
+ rc = clp_find_pci(rrb, fid, &entry);
+ if (!rc)
+ *fh = entry.fh;
clp_free_block(rrb);
return rc;
}
-struct clp_state_data {
- u32 fid;
- enum zpci_state state;
-};
-
-static void __clp_get_state(struct clp_fh_list_entry *entry, void *data)
-{
- struct clp_state_data *sd = data;
-
- if (entry->fid != sd->fid)
- return;
-
- sd->state = entry->config_state;
-}
-
int clp_get_state(u32 fid, enum zpci_state *state)
{
struct clp_req_rsp_list_pci *rrb;
- struct clp_state_data sd = {fid, ZPCI_FN_STATE_RESERVED};
+ struct clp_fh_list_entry entry;
int rc;
rrb = clp_alloc_block(GFP_ATOMIC);
if (!rrb)
return -ENOMEM;
- rc = clp_list_pci(rrb, &sd, __clp_get_state);
- if (!rc)
- *state = sd.state;
+ rc = clp_find_pci(rrb, fid, &entry);
+ if (!rc) {
+ *state = entry.config_state;
+ } else if (rc == -ENODEV) {
+ *state = ZPCI_FN_STATE_RESERVED;
+ rc = 0;
+ }
clp_free_block(rrb);
return rc;
@@ -481,7 +498,7 @@ static int clp_base_command(struct clp_req *req, struct clp_req_hdr *lpcb)
}
}
-static int clp_pci_slpc(struct clp_req *req, struct clp_req_rsp_slpc *lpcb)
+static int clp_pci_slpc(struct clp_req *req, struct clp_req_rsp_slpc_pci *lpcb)
{
unsigned long limit = PAGE_SIZE - sizeof(lpcb->request);
@@ -649,9 +666,4 @@ static struct miscdevice clp_misc_device = {
.fops = &clp_misc_fops,
};
-static int __init clp_misc_init(void)
-{
- return misc_register(&clp_misc_device);
-}
-
-device_initcall(clp_misc_init);
+builtin_misc_device(clp_misc_device);
diff --git a/arch/s390/pci/pci_debug.c b/arch/s390/pci/pci_debug.c
index 3408c0df3ebf..6dde2263c79d 100644
--- a/arch/s390/pci/pci_debug.c
+++ b/arch/s390/pci/pci_debug.c
@@ -53,9 +53,11 @@ static char *pci_fmt3_names[] = {
};
static char *pci_sw_names[] = {
- "Allocated pages",
"Mapped pages",
"Unmapped pages",
+ "Global RPCITs",
+ "Sync Map RPCITs",
+ "Sync RPCITs",
};
static void pci_fmb_show(struct seq_file *m, char *name[], int length,
@@ -69,10 +71,14 @@ static void pci_fmb_show(struct seq_file *m, char *name[], int length,
static void pci_sw_counter_show(struct seq_file *m)
{
- struct zpci_dev *zdev = m->private;
- atomic64_t *counter = &zdev->allocated_pages;
+ struct zpci_iommu_ctrs *ctrs = zpci_get_iommu_ctrs(m->private);
+ atomic64_t *counter;
int i;
+ if (!ctrs)
+ return;
+
+ counter = &ctrs->mapped_pages;
for (i = 0; i < ARRAY_SIZE(pci_sw_names); i++, counter++)
seq_printf(m, "%26s:\t%llu\n", pci_sw_names[i],
atomic64_read(counter));
@@ -196,7 +202,7 @@ int __init zpci_debug_init(void)
if (!pci_debug_err_id)
return -EINVAL;
debug_register_view(pci_debug_err_id, &debug_hex_ascii_view);
- debug_set_level(pci_debug_err_id, 6);
+ debug_set_level(pci_debug_err_id, 3);
debugfs_root = debugfs_create_dir("pci", NULL);
return 0;
diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c
deleted file mode 100644
index 64b1399a73f0..000000000000
--- a/arch/s390/pci/pci_dma.c
+++ /dev/null
@@ -1,684 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright IBM Corp. 2012
- *
- * Author(s):
- * Jan Glauber <jang@linux.vnet.ibm.com>
- */
-
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/export.h>
-#include <linux/iommu-helper.h>
-#include <linux/dma-mapping.h>
-#include <linux/vmalloc.h>
-#include <linux/pci.h>
-#include <asm/pci_dma.h>
-
-static struct kmem_cache *dma_region_table_cache;
-static struct kmem_cache *dma_page_table_cache;
-static int s390_iommu_strict;
-
-static int zpci_refresh_global(struct zpci_dev *zdev)
-{
- return zpci_refresh_trans((u64) zdev->fh << 32, zdev->start_dma,
- zdev->iommu_pages * PAGE_SIZE);
-}
-
-unsigned long *dma_alloc_cpu_table(void)
-{
- unsigned long *table, *entry;
-
- table = kmem_cache_alloc(dma_region_table_cache, GFP_ATOMIC);
- if (!table)
- return NULL;
-
- for (entry = table; entry < table + ZPCI_TABLE_ENTRIES; entry++)
- *entry = ZPCI_TABLE_INVALID;
- return table;
-}
-
-static void dma_free_cpu_table(void *table)
-{
- kmem_cache_free(dma_region_table_cache, table);
-}
-
-static unsigned long *dma_alloc_page_table(void)
-{
- unsigned long *table, *entry;
-
- table = kmem_cache_alloc(dma_page_table_cache, GFP_ATOMIC);
- if (!table)
- return NULL;
-
- for (entry = table; entry < table + ZPCI_PT_ENTRIES; entry++)
- *entry = ZPCI_PTE_INVALID;
- return table;
-}
-
-static void dma_free_page_table(void *table)
-{
- kmem_cache_free(dma_page_table_cache, table);
-}
-
-static unsigned long *dma_get_seg_table_origin(unsigned long *entry)
-{
- unsigned long *sto;
-
- if (reg_entry_isvalid(*entry))
- sto = get_rt_sto(*entry);
- else {
- sto = dma_alloc_cpu_table();
- if (!sto)
- return NULL;
-
- set_rt_sto(entry, sto);
- validate_rt_entry(entry);
- entry_clr_protected(entry);
- }
- return sto;
-}
-
-static unsigned long *dma_get_page_table_origin(unsigned long *entry)
-{
- unsigned long *pto;
-
- if (reg_entry_isvalid(*entry))
- pto = get_st_pto(*entry);
- else {
- pto = dma_alloc_page_table();
- if (!pto)
- return NULL;
- set_st_pto(entry, pto);
- validate_st_entry(entry);
- entry_clr_protected(entry);
- }
- return pto;
-}
-
-unsigned long *dma_walk_cpu_trans(unsigned long *rto, dma_addr_t dma_addr)
-{
- unsigned long *sto, *pto;
- unsigned int rtx, sx, px;
-
- rtx = calc_rtx(dma_addr);
- sto = dma_get_seg_table_origin(&rto[rtx]);
- if (!sto)
- return NULL;
-
- sx = calc_sx(dma_addr);
- pto = dma_get_page_table_origin(&sto[sx]);
- if (!pto)
- return NULL;
-
- px = calc_px(dma_addr);
- return &pto[px];
-}
-
-void dma_update_cpu_trans(unsigned long *entry, void *page_addr, int flags)
-{
- if (flags & ZPCI_PTE_INVALID) {
- invalidate_pt_entry(entry);
- } else {
- set_pt_pfaa(entry, page_addr);
- validate_pt_entry(entry);
- }
-
- if (flags & ZPCI_TABLE_PROTECTED)
- entry_set_protected(entry);
- else
- entry_clr_protected(entry);
-}
-
-static int __dma_update_trans(struct zpci_dev *zdev, unsigned long pa,
- dma_addr_t dma_addr, size_t size, int flags)
-{
- unsigned int nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
- u8 *page_addr = (u8 *) (pa & PAGE_MASK);
- unsigned long irq_flags;
- unsigned long *entry;
- int i, rc = 0;
-
- if (!nr_pages)
- return -EINVAL;
-
- spin_lock_irqsave(&zdev->dma_table_lock, irq_flags);
- if (!zdev->dma_table) {
- rc = -EINVAL;
- goto out_unlock;
- }
-
- for (i = 0; i < nr_pages; i++) {
- entry = dma_walk_cpu_trans(zdev->dma_table, dma_addr);
- if (!entry) {
- rc = -ENOMEM;
- goto undo_cpu_trans;
- }
- dma_update_cpu_trans(entry, page_addr, flags);
- page_addr += PAGE_SIZE;
- dma_addr += PAGE_SIZE;
- }
-
-undo_cpu_trans:
- if (rc && ((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID)) {
- flags = ZPCI_PTE_INVALID;
- while (i-- > 0) {
- page_addr -= PAGE_SIZE;
- dma_addr -= PAGE_SIZE;
- entry = dma_walk_cpu_trans(zdev->dma_table, dma_addr);
- if (!entry)
- break;
- dma_update_cpu_trans(entry, page_addr, flags);
- }
- }
-out_unlock:
- spin_unlock_irqrestore(&zdev->dma_table_lock, irq_flags);
- return rc;
-}
-
-static int __dma_purge_tlb(struct zpci_dev *zdev, dma_addr_t dma_addr,
- size_t size, int flags)
-{
- unsigned long irqflags;
- int ret;
-
- /*
- * With zdev->tlb_refresh == 0, rpcit is not required to establish new
- * translations when previously invalid translation-table entries are
- * validated. With lazy unmap, rpcit is skipped for previously valid
- * entries, but a global rpcit is then required before any address can
- * be re-used, i.e. after each iommu bitmap wrap-around.
- */
- if ((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID) {
- if (!zdev->tlb_refresh)
- return 0;
- } else {
- if (!s390_iommu_strict)
- return 0;
- }
-
- ret = zpci_refresh_trans((u64) zdev->fh << 32, dma_addr,
- PAGE_ALIGN(size));
- if (ret == -ENOMEM && !s390_iommu_strict) {
- /* enable the hypervisor to free some resources */
- if (zpci_refresh_global(zdev))
- goto out;
-
- spin_lock_irqsave(&zdev->iommu_bitmap_lock, irqflags);
- bitmap_andnot(zdev->iommu_bitmap, zdev->iommu_bitmap,
- zdev->lazy_bitmap, zdev->iommu_pages);
- bitmap_zero(zdev->lazy_bitmap, zdev->iommu_pages);
- spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, irqflags);
- ret = 0;
- }
-out:
- return ret;
-}
-
-static int dma_update_trans(struct zpci_dev *zdev, unsigned long pa,
- dma_addr_t dma_addr, size_t size, int flags)
-{
- int rc;
-
- rc = __dma_update_trans(zdev, pa, dma_addr, size, flags);
- if (rc)
- return rc;
-
- rc = __dma_purge_tlb(zdev, dma_addr, size, flags);
- if (rc && ((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID))
- __dma_update_trans(zdev, pa, dma_addr, size, ZPCI_PTE_INVALID);
-
- return rc;
-}
-
-void dma_free_seg_table(unsigned long entry)
-{
- unsigned long *sto = get_rt_sto(entry);
- int sx;
-
- for (sx = 0; sx < ZPCI_TABLE_ENTRIES; sx++)
- if (reg_entry_isvalid(sto[sx]))
- dma_free_page_table(get_st_pto(sto[sx]));
-
- dma_free_cpu_table(sto);
-}
-
-void dma_cleanup_tables(unsigned long *table)
-{
- int rtx;
-
- if (!table)
- return;
-
- for (rtx = 0; rtx < ZPCI_TABLE_ENTRIES; rtx++)
- if (reg_entry_isvalid(table[rtx]))
- dma_free_seg_table(table[rtx]);
-
- dma_free_cpu_table(table);
-}
-
-static unsigned long __dma_alloc_iommu(struct device *dev,
- unsigned long start, int size)
-{
- struct zpci_dev *zdev = to_zpci(to_pci_dev(dev));
- unsigned long boundary_size;
-
- boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
- PAGE_SIZE) >> PAGE_SHIFT;
- return iommu_area_alloc(zdev->iommu_bitmap, zdev->iommu_pages,
- start, size, zdev->start_dma >> PAGE_SHIFT,
- boundary_size, 0);
-}
-
-static dma_addr_t dma_alloc_address(struct device *dev, int size)
-{
- struct zpci_dev *zdev = to_zpci(to_pci_dev(dev));
- unsigned long offset, flags;
-
- spin_lock_irqsave(&zdev->iommu_bitmap_lock, flags);
- offset = __dma_alloc_iommu(dev, zdev->next_bit, size);
- if (offset == -1) {
- if (!s390_iommu_strict) {
- /* global flush before DMA addresses are reused */
- if (zpci_refresh_global(zdev))
- goto out_error;
-
- bitmap_andnot(zdev->iommu_bitmap, zdev->iommu_bitmap,
- zdev->lazy_bitmap, zdev->iommu_pages);
- bitmap_zero(zdev->lazy_bitmap, zdev->iommu_pages);
- }
- /* wrap-around */
- offset = __dma_alloc_iommu(dev, 0, size);
- if (offset == -1)
- goto out_error;
- }
- zdev->next_bit = offset + size;
- spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, flags);
-
- return zdev->start_dma + offset * PAGE_SIZE;
-
-out_error:
- spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, flags);
- return DMA_MAPPING_ERROR;
-}
-
-static void dma_free_address(struct device *dev, dma_addr_t dma_addr, int size)
-{
- struct zpci_dev *zdev = to_zpci(to_pci_dev(dev));
- unsigned long flags, offset;
-
- offset = (dma_addr - zdev->start_dma) >> PAGE_SHIFT;
-
- spin_lock_irqsave(&zdev->iommu_bitmap_lock, flags);
- if (!zdev->iommu_bitmap)
- goto out;
-
- if (s390_iommu_strict)
- bitmap_clear(zdev->iommu_bitmap, offset, size);
- else
- bitmap_set(zdev->lazy_bitmap, offset, size);
-
-out:
- spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, flags);
-}
-
-static inline void zpci_err_dma(unsigned long rc, unsigned long addr)
-{
- struct {
- unsigned long rc;
- unsigned long addr;
- } __packed data = {rc, addr};
-
- zpci_err_hex(&data, sizeof(data));
-}
-
-static dma_addr_t s390_dma_map_pages(struct device *dev, struct page *page,
- unsigned long offset, size_t size,
- enum dma_data_direction direction,
- unsigned long attrs)
-{
- struct zpci_dev *zdev = to_zpci(to_pci_dev(dev));
- unsigned long pa = page_to_phys(page) + offset;
- int flags = ZPCI_PTE_VALID;
- unsigned long nr_pages;
- dma_addr_t dma_addr;
- int ret;
-
- /* This rounds up number of pages based on size and offset */
- nr_pages = iommu_num_pages(pa, size, PAGE_SIZE);
- dma_addr = dma_alloc_address(dev, nr_pages);
- if (dma_addr == DMA_MAPPING_ERROR) {
- ret = -ENOSPC;
- goto out_err;
- }
-
- /* Use rounded up size */
- size = nr_pages * PAGE_SIZE;
-
- if (direction == DMA_NONE || direction == DMA_TO_DEVICE)
- flags |= ZPCI_TABLE_PROTECTED;
-
- ret = dma_update_trans(zdev, pa, dma_addr, size, flags);
- if (ret)
- goto out_free;
-
- atomic64_add(nr_pages, &zdev->mapped_pages);
- return dma_addr + (offset & ~PAGE_MASK);
-
-out_free:
- dma_free_address(dev, dma_addr, nr_pages);
-out_err:
- zpci_err("map error:\n");
- zpci_err_dma(ret, pa);
- return DMA_MAPPING_ERROR;
-}
-
-static void s390_dma_unmap_pages(struct device *dev, dma_addr_t dma_addr,
- size_t size, enum dma_data_direction direction,
- unsigned long attrs)
-{
- struct zpci_dev *zdev = to_zpci(to_pci_dev(dev));
- int npages, ret;
-
- npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
- dma_addr = dma_addr & PAGE_MASK;
- ret = dma_update_trans(zdev, 0, dma_addr, npages * PAGE_SIZE,
- ZPCI_PTE_INVALID);
- if (ret) {
- zpci_err("unmap error:\n");
- zpci_err_dma(ret, dma_addr);
- return;
- }
-
- atomic64_add(npages, &zdev->unmapped_pages);
- dma_free_address(dev, dma_addr, npages);
-}
-
-static void *s390_dma_alloc(struct device *dev, size_t size,
- dma_addr_t *dma_handle, gfp_t flag,
- unsigned long attrs)
-{
- struct zpci_dev *zdev = to_zpci(to_pci_dev(dev));
- struct page *page;
- unsigned long pa;
- dma_addr_t map;
-
- size = PAGE_ALIGN(size);
- page = alloc_pages(flag | __GFP_ZERO, get_order(size));
- if (!page)
- return NULL;
-
- pa = page_to_phys(page);
- map = s390_dma_map_pages(dev, page, 0, size, DMA_BIDIRECTIONAL, 0);
- if (dma_mapping_error(dev, map)) {
- free_pages(pa, get_order(size));
- return NULL;
- }
-
- atomic64_add(size / PAGE_SIZE, &zdev->allocated_pages);
- if (dma_handle)
- *dma_handle = map;
- return (void *) pa;
-}
-
-static void s390_dma_free(struct device *dev, size_t size,
- void *pa, dma_addr_t dma_handle,
- unsigned long attrs)
-{
- struct zpci_dev *zdev = to_zpci(to_pci_dev(dev));
-
- size = PAGE_ALIGN(size);
- atomic64_sub(size / PAGE_SIZE, &zdev->allocated_pages);
- s390_dma_unmap_pages(dev, dma_handle, size, DMA_BIDIRECTIONAL, 0);
- free_pages((unsigned long) pa, get_order(size));
-}
-
-/* Map a segment into a contiguous dma address area */
-static int __s390_dma_map_sg(struct device *dev, struct scatterlist *sg,
- size_t size, dma_addr_t *handle,
- enum dma_data_direction dir)
-{
- unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
- struct zpci_dev *zdev = to_zpci(to_pci_dev(dev));
- dma_addr_t dma_addr_base, dma_addr;
- int flags = ZPCI_PTE_VALID;
- struct scatterlist *s;
- unsigned long pa = 0;
- int ret;
-
- dma_addr_base = dma_alloc_address(dev, nr_pages);
- if (dma_addr_base == DMA_MAPPING_ERROR)
- return -ENOMEM;
-
- dma_addr = dma_addr_base;
- if (dir == DMA_NONE || dir == DMA_TO_DEVICE)
- flags |= ZPCI_TABLE_PROTECTED;
-
- for (s = sg; dma_addr < dma_addr_base + size; s = sg_next(s)) {
- pa = page_to_phys(sg_page(s));
- ret = __dma_update_trans(zdev, pa, dma_addr,
- s->offset + s->length, flags);
- if (ret)
- goto unmap;
-
- dma_addr += s->offset + s->length;
- }
- ret = __dma_purge_tlb(zdev, dma_addr_base, size, flags);
- if (ret)
- goto unmap;
-
- *handle = dma_addr_base;
- atomic64_add(nr_pages, &zdev->mapped_pages);
-
- return ret;
-
-unmap:
- dma_update_trans(zdev, 0, dma_addr_base, dma_addr - dma_addr_base,
- ZPCI_PTE_INVALID);
- dma_free_address(dev, dma_addr_base, nr_pages);
- zpci_err("map error:\n");
- zpci_err_dma(ret, pa);
- return ret;
-}
-
-static int s390_dma_map_sg(struct device *dev, struct scatterlist *sg,
- int nr_elements, enum dma_data_direction dir,
- unsigned long attrs)
-{
- struct scatterlist *s = sg, *start = sg, *dma = sg;
- unsigned int max = dma_get_max_seg_size(dev);
- unsigned int size = s->offset + s->length;
- unsigned int offset = s->offset;
- int count = 0, i;
-
- for (i = 1; i < nr_elements; i++) {
- s = sg_next(s);
-
- s->dma_address = DMA_MAPPING_ERROR;
- s->dma_length = 0;
-
- if (s->offset || (size & ~PAGE_MASK) ||
- size + s->length > max) {
- if (__s390_dma_map_sg(dev, start, size,
- &dma->dma_address, dir))
- goto unmap;
-
- dma->dma_address += offset;
- dma->dma_length = size - offset;
-
- size = offset = s->offset;
- start = s;
- dma = sg_next(dma);
- count++;
- }
- size += s->length;
- }
- if (__s390_dma_map_sg(dev, start, size, &dma->dma_address, dir))
- goto unmap;
-
- dma->dma_address += offset;
- dma->dma_length = size - offset;
-
- return count + 1;
-unmap:
- for_each_sg(sg, s, count, i)
- s390_dma_unmap_pages(dev, sg_dma_address(s), sg_dma_len(s),
- dir, attrs);
-
- return 0;
-}
-
-static void s390_dma_unmap_sg(struct device *dev, struct scatterlist *sg,
- int nr_elements, enum dma_data_direction dir,
- unsigned long attrs)
-{
- struct scatterlist *s;
- int i;
-
- for_each_sg(sg, s, nr_elements, i) {
- if (s->dma_length)
- s390_dma_unmap_pages(dev, s->dma_address, s->dma_length,
- dir, attrs);
- s->dma_address = 0;
- s->dma_length = 0;
- }
-}
-
-int zpci_dma_init_device(struct zpci_dev *zdev)
-{
- int rc;
-
- /*
- * At this point, if the device is part of an IOMMU domain, this would
- * be a strong hint towards a bug in the IOMMU API (common) code and/or
- * simultaneous access via IOMMU and DMA API. So let's issue a warning.
- */
- WARN_ON(zdev->s390_domain);
-
- spin_lock_init(&zdev->iommu_bitmap_lock);
- spin_lock_init(&zdev->dma_table_lock);
-
- zdev->dma_table = dma_alloc_cpu_table();
- if (!zdev->dma_table) {
- rc = -ENOMEM;
- goto out;
- }
-
- /*
- * Restrict the iommu bitmap size to the minimum of the following:
- * - main memory size
- * - 3-level pagetable address limit minus start_dma offset
- * - DMA address range allowed by the hardware (clp query pci fn)
- *
- * Also set zdev->end_dma to the actual end address of the usable
- * range, instead of the theoretical maximum as reported by hardware.
- */
- zdev->start_dma = PAGE_ALIGN(zdev->start_dma);
- zdev->iommu_size = min3((u64) high_memory,
- ZPCI_TABLE_SIZE_RT - zdev->start_dma,
- zdev->end_dma - zdev->start_dma + 1);
- zdev->end_dma = zdev->start_dma + zdev->iommu_size - 1;
- zdev->iommu_pages = zdev->iommu_size >> PAGE_SHIFT;
- zdev->iommu_bitmap = vzalloc(zdev->iommu_pages / 8);
- if (!zdev->iommu_bitmap) {
- rc = -ENOMEM;
- goto free_dma_table;
- }
- if (!s390_iommu_strict) {
- zdev->lazy_bitmap = vzalloc(zdev->iommu_pages / 8);
- if (!zdev->lazy_bitmap) {
- rc = -ENOMEM;
- goto free_bitmap;
- }
-
- }
- rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
- (u64) zdev->dma_table);
- if (rc)
- goto free_bitmap;
-
- return 0;
-free_bitmap:
- vfree(zdev->iommu_bitmap);
- zdev->iommu_bitmap = NULL;
- vfree(zdev->lazy_bitmap);
- zdev->lazy_bitmap = NULL;
-free_dma_table:
- dma_free_cpu_table(zdev->dma_table);
- zdev->dma_table = NULL;
-out:
- return rc;
-}
-
-void zpci_dma_exit_device(struct zpci_dev *zdev)
-{
- /*
- * At this point, if the device is part of an IOMMU domain, this would
- * be a strong hint towards a bug in the IOMMU API (common) code and/or
- * simultaneous access via IOMMU and DMA API. So let's issue a warning.
- */
- WARN_ON(zdev->s390_domain);
-
- if (zpci_unregister_ioat(zdev, 0))
- return;
-
- dma_cleanup_tables(zdev->dma_table);
- zdev->dma_table = NULL;
- vfree(zdev->iommu_bitmap);
- zdev->iommu_bitmap = NULL;
- vfree(zdev->lazy_bitmap);
- zdev->lazy_bitmap = NULL;
-
- zdev->next_bit = 0;
-}
-
-static int __init dma_alloc_cpu_table_caches(void)
-{
- dma_region_table_cache = kmem_cache_create("PCI_DMA_region_tables",
- ZPCI_TABLE_SIZE, ZPCI_TABLE_ALIGN,
- 0, NULL);
- if (!dma_region_table_cache)
- return -ENOMEM;
-
- dma_page_table_cache = kmem_cache_create("PCI_DMA_page_tables",
- ZPCI_PT_SIZE, ZPCI_PT_ALIGN,
- 0, NULL);
- if (!dma_page_table_cache) {
- kmem_cache_destroy(dma_region_table_cache);
- return -ENOMEM;
- }
- return 0;
-}
-
-int __init zpci_dma_init(void)
-{
- return dma_alloc_cpu_table_caches();
-}
-
-void zpci_dma_exit(void)
-{
- kmem_cache_destroy(dma_page_table_cache);
- kmem_cache_destroy(dma_region_table_cache);
-}
-
-const struct dma_map_ops s390_pci_dma_ops = {
- .alloc = s390_dma_alloc,
- .free = s390_dma_free,
- .map_sg = s390_dma_map_sg,
- .unmap_sg = s390_dma_unmap_sg,
- .map_page = s390_dma_map_pages,
- .unmap_page = s390_dma_unmap_pages,
- .mmap = dma_common_mmap,
- .get_sgtable = dma_common_get_sgtable,
- /* dma_supported is unconditionally true without a callback */
-};
-EXPORT_SYMBOL_GPL(s390_pci_dma_ops);
-
-static int __init s390_iommu_setup(char *str)
-{
- if (!strcmp(str, "strict"))
- s390_iommu_strict = 1;
- return 1;
-}
-
-__setup("s390_iommu=", s390_iommu_setup);
diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c
index 8d6ee4af4230..4d9773ef9e0a 100644
--- a/arch/s390/pci/pci_event.c
+++ b/arch/s390/pci/pci_event.c
@@ -12,8 +12,11 @@
#include <linux/kernel.h>
#include <linux/pci.h>
#include <asm/pci_debug.h>
+#include <asm/pci_dma.h>
#include <asm/sclp.h>
+#include "pci_bus.h"
+
/* Content Code Description for PCI Function Error */
struct zpci_ccdf_err {
u32 reserved1;
@@ -44,25 +47,254 @@ struct zpci_ccdf_avail {
u16 pec; /* PCI event code */
} __packed;
+static inline bool ers_result_indicates_abort(pci_ers_result_t ers_res)
+{
+ switch (ers_res) {
+ case PCI_ERS_RESULT_CAN_RECOVER:
+ case PCI_ERS_RESULT_RECOVERED:
+ case PCI_ERS_RESULT_NEED_RESET:
+ return false;
+ default:
+ return true;
+ }
+}
+
+static bool is_passed_through(struct pci_dev *pdev)
+{
+ struct zpci_dev *zdev = to_zpci(pdev);
+ bool ret;
+
+ mutex_lock(&zdev->kzdev_lock);
+ ret = !!zdev->kzdev;
+ mutex_unlock(&zdev->kzdev_lock);
+
+ return ret;
+}
+
+static bool is_driver_supported(struct pci_driver *driver)
+{
+ if (!driver || !driver->err_handler)
+ return false;
+ if (!driver->err_handler->error_detected)
+ return false;
+ if (!driver->err_handler->slot_reset)
+ return false;
+ if (!driver->err_handler->resume)
+ return false;
+ return true;
+}
+
+static pci_ers_result_t zpci_event_notify_error_detected(struct pci_dev *pdev,
+ struct pci_driver *driver)
+{
+ pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT;
+
+ ers_res = driver->err_handler->error_detected(pdev, pdev->error_state);
+ if (ers_result_indicates_abort(ers_res))
+ pr_info("%s: Automatic recovery failed after initial reporting\n", pci_name(pdev));
+ else if (ers_res == PCI_ERS_RESULT_NEED_RESET)
+ pr_debug("%s: Driver needs reset to recover\n", pci_name(pdev));
+
+ return ers_res;
+}
+
+static pci_ers_result_t zpci_event_do_error_state_clear(struct pci_dev *pdev,
+ struct pci_driver *driver)
+{
+ pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT;
+ struct zpci_dev *zdev = to_zpci(pdev);
+ int rc;
+
+ pr_info("%s: Unblocking device access for examination\n", pci_name(pdev));
+ rc = zpci_reset_load_store_blocked(zdev);
+ if (rc) {
+ pr_err("%s: Unblocking device access failed\n", pci_name(pdev));
+ /* Let's try a full reset instead */
+ return PCI_ERS_RESULT_NEED_RESET;
+ }
+
+ if (driver->err_handler->mmio_enabled) {
+ ers_res = driver->err_handler->mmio_enabled(pdev);
+ if (ers_result_indicates_abort(ers_res)) {
+ pr_info("%s: Automatic recovery failed after MMIO re-enable\n",
+ pci_name(pdev));
+ return ers_res;
+ } else if (ers_res == PCI_ERS_RESULT_NEED_RESET) {
+ pr_debug("%s: Driver needs reset to recover\n", pci_name(pdev));
+ return ers_res;
+ }
+ }
+
+ pr_debug("%s: Unblocking DMA\n", pci_name(pdev));
+ rc = zpci_clear_error_state(zdev);
+ if (!rc) {
+ pdev->error_state = pci_channel_io_normal;
+ } else {
+ pr_err("%s: Unblocking DMA failed\n", pci_name(pdev));
+ /* Let's try a full reset instead */
+ return PCI_ERS_RESULT_NEED_RESET;
+ }
+
+ return ers_res;
+}
+
+static pci_ers_result_t zpci_event_do_reset(struct pci_dev *pdev,
+ struct pci_driver *driver)
+{
+ pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT;
+
+ pr_info("%s: Initiating reset\n", pci_name(pdev));
+ if (zpci_hot_reset_device(to_zpci(pdev))) {
+ pr_err("%s: The reset request failed\n", pci_name(pdev));
+ return ers_res;
+ }
+ pdev->error_state = pci_channel_io_normal;
+ ers_res = driver->err_handler->slot_reset(pdev);
+ if (ers_result_indicates_abort(ers_res)) {
+ pr_info("%s: Automatic recovery failed after slot reset\n", pci_name(pdev));
+ return ers_res;
+ }
+
+ return ers_res;
+}
+
+/* zpci_event_attempt_error_recovery - Try to recover the given PCI function
+ * @pdev: PCI function to recover currently in the error state
+ *
+ * We follow the scheme outlined in Documentation/PCI/pci-error-recovery.rst,
+ * with the simplification that recovery always happens per function and
+ * that the platform determines which functions are affected for
+ * multi-function devices.
+ */
+static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
+{
+ pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT;
+ struct pci_driver *driver;
+
+ /*
+ * Ensure that the PCI function is not removed concurrently, no driver
+ * is unbound or probed and that userspace can't access its
+ * configuration space while we perform recovery.
+ */
+ pci_dev_lock(pdev);
+ if (pdev->error_state == pci_channel_io_perm_failure) {
+ ers_res = PCI_ERS_RESULT_DISCONNECT;
+ goto out_unlock;
+ }
+ pdev->error_state = pci_channel_io_frozen;
+
+ if (is_passed_through(pdev)) {
+ pr_info("%s: Cannot be recovered in the host because it is a pass-through device\n",
+ pci_name(pdev));
+ goto out_unlock;
+ }
+
+ driver = to_pci_driver(pdev->dev.driver);
+ if (!is_driver_supported(driver)) {
+ if (!driver)
+ pr_info("%s: Cannot be recovered because no driver is bound to the device\n",
+ pci_name(pdev));
+ else
+ pr_info("%s: The %s driver bound to the device does not support error recovery\n",
+ pci_name(pdev),
+ driver->name);
+ goto out_unlock;
+ }
+
+ ers_res = zpci_event_notify_error_detected(pdev, driver);
+ if (ers_result_indicates_abort(ers_res))
+ goto out_unlock;
+
+ if (ers_res == PCI_ERS_RESULT_CAN_RECOVER) {
+ ers_res = zpci_event_do_error_state_clear(pdev, driver);
+ if (ers_result_indicates_abort(ers_res))
+ goto out_unlock;
+ }
+
+ if (ers_res == PCI_ERS_RESULT_NEED_RESET)
+ ers_res = zpci_event_do_reset(pdev, driver);
+
+ if (ers_res != PCI_ERS_RESULT_RECOVERED) {
+ pr_err("%s: Automatic recovery failed; operator intervention is required\n",
+ pci_name(pdev));
+ goto out_unlock;
+ }
+
+ pr_info("%s: The device is ready to resume operations\n", pci_name(pdev));
+ if (driver->err_handler->resume)
+ driver->err_handler->resume(pdev);
+out_unlock:
+ pci_dev_unlock(pdev);
+
+ return ers_res;
+}
+
+/* zpci_event_io_failure - Report PCI channel failure state to driver
+ * @pdev: PCI function for which to report
+ * @es: PCI channel failure state to report
+ */
+static void zpci_event_io_failure(struct pci_dev *pdev, pci_channel_state_t es)
+{
+ struct pci_driver *driver;
+
+ pci_dev_lock(pdev);
+ pdev->error_state = es;
+ /*
+ * While vfio-pci's error_detected callback notifies user-space, QEMU
+ * reacts to this by freezing the guest. In an s390 environment PCI
+ * errors are rarely fatal, so this is overkill. Instead, in the future
+ * we will inject the error event and let the guest recover the device
+ * itself.
+ */
+ if (is_passed_through(pdev))
+ goto out;
+ driver = to_pci_driver(pdev->dev.driver);
+ if (driver && driver->err_handler && driver->err_handler->error_detected)
+ driver->err_handler->error_detected(pdev, pdev->error_state);
+out:
+ pci_dev_unlock(pdev);
+}
+
static void __zpci_event_error(struct zpci_ccdf_err *ccdf)
{
struct zpci_dev *zdev = get_zdev_by_fid(ccdf->fid);
struct pci_dev *pdev = NULL;
+ pci_ers_result_t ers_res;
+ zpci_dbg(3, "err fid:%x, fh:%x, pec:%x\n",
+ ccdf->fid, ccdf->fh, ccdf->pec);
zpci_err("error CCDF:\n");
zpci_err_hex(ccdf, sizeof(*ccdf));
- if (zdev)
- pdev = pci_get_slot(zdev->bus, ZPCI_DEVFN);
+ if (zdev) {
+ zpci_update_fh(zdev, ccdf->fh);
+ if (zdev->zbus->bus)
+ pdev = pci_get_slot(zdev->zbus->bus, zdev->devfn);
+ }
pr_err("%s: Event 0x%x reports an error for PCI function 0x%x\n",
pdev ? pci_name(pdev) : "n/a", ccdf->pec, ccdf->fid);
if (!pdev)
- return;
+ goto no_pdev;
- pdev->error_state = pci_channel_io_perm_failure;
+ switch (ccdf->pec) {
+ case 0x003a: /* Service Action or Error Recovery Successful */
+ ers_res = zpci_event_attempt_error_recovery(pdev);
+ if (ers_res != PCI_ERS_RESULT_RECOVERED)
+ zpci_event_io_failure(pdev, pci_channel_io_perm_failure);
+ break;
+ default:
+ /*
+ * Mark as frozen not permanently failed because the device
+ * could be subsequently recovered by the platform.
+ */
+ zpci_event_io_failure(pdev, pci_channel_io_frozen);
+ break;
+ }
pci_dev_put(pdev);
+no_pdev:
+ zpci_zdev_put(zdev);
}
void zpci_event_error(void *data)
@@ -71,90 +303,88 @@ void zpci_event_error(void *data)
__zpci_event_error(data);
}
+static void zpci_event_hard_deconfigured(struct zpci_dev *zdev, u32 fh)
+{
+ zpci_update_fh(zdev, fh);
+ /* Give the driver a hint that the function is
+ * already unusable.
+ */
+ zpci_bus_remove_device(zdev, true);
+ /* Even though the device is already gone, we still
+ * need to free zPCI resources as part of the disable.
+ */
+ if (zdev_enabled(zdev))
+ zpci_disable_device(zdev);
+ zdev->state = ZPCI_FN_STATE_STANDBY;
+}
+
static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf)
{
struct zpci_dev *zdev = get_zdev_by_fid(ccdf->fid);
- struct pci_dev *pdev = NULL;
+ bool existing_zdev = !!zdev;
enum zpci_state state;
- int ret;
-
- if (zdev)
- pdev = pci_get_slot(zdev->bus, ZPCI_DEVFN);
-
- pr_info("%s: Event 0x%x reconfigured PCI function 0x%x\n",
- pdev ? pci_name(pdev) : "n/a", ccdf->pec, ccdf->fid);
- zpci_err("avail CCDF:\n");
- zpci_err_hex(ccdf, sizeof(*ccdf));
+ zpci_dbg(3, "avl fid:%x, fh:%x, pec:%x\n",
+ ccdf->fid, ccdf->fh, ccdf->pec);
switch (ccdf->pec) {
case 0x0301: /* Reserved|Standby -> Configured */
if (!zdev) {
- ret = clp_add_pci_device(ccdf->fid, ccdf->fh, 0);
- if (ret)
+ zdev = zpci_create_device(ccdf->fid, ccdf->fh, ZPCI_FN_STATE_CONFIGURED);
+ if (IS_ERR(zdev))
break;
- zdev = get_zdev_by_fid(ccdf->fid);
+ } else {
+ /* the configuration request may be stale */
+ if (zdev->state != ZPCI_FN_STATE_STANDBY)
+ break;
+ zdev->state = ZPCI_FN_STATE_CONFIGURED;
}
- if (!zdev || zdev->state != ZPCI_FN_STATE_STANDBY)
- break;
- zdev->state = ZPCI_FN_STATE_CONFIGURED;
- zdev->fh = ccdf->fh;
- ret = zpci_enable_device(zdev);
- if (ret)
- break;
- pci_lock_rescan_remove();
- pci_rescan_bus(zdev->bus);
- pci_unlock_rescan_remove();
+ zpci_scan_configured_device(zdev, ccdf->fh);
break;
case 0x0302: /* Reserved -> Standby */
if (!zdev)
- clp_add_pci_device(ccdf->fid, ccdf->fh, 0);
+ zpci_create_device(ccdf->fid, ccdf->fh, ZPCI_FN_STATE_STANDBY);
+ else
+ zpci_update_fh(zdev, ccdf->fh);
break;
case 0x0303: /* Deconfiguration requested */
- if (!zdev)
- break;
- if (pdev)
- pci_stop_and_remove_bus_device_locked(pdev);
-
- ret = zpci_disable_device(zdev);
- if (ret)
- break;
-
- ret = sclp_pci_deconfigure(zdev->fid);
- zpci_dbg(3, "deconf fid:%x, rc:%d\n", zdev->fid, ret);
- if (!ret)
- zdev->state = ZPCI_FN_STATE_STANDBY;
-
+ if (zdev) {
+ /* The event may have been queued before we configured
+ * the device.
+ */
+ if (zdev->state != ZPCI_FN_STATE_CONFIGURED)
+ break;
+ zpci_update_fh(zdev, ccdf->fh);
+ zpci_deconfigure_device(zdev);
+ }
break;
case 0x0304: /* Configured -> Standby|Reserved */
- if (!zdev)
- break;
- if (pdev) {
- /* Give the driver a hint that the function is
- * already unusable. */
- pdev->error_state = pci_channel_io_perm_failure;
- pci_stop_and_remove_bus_device_locked(pdev);
- }
-
- zdev->fh = ccdf->fh;
- zpci_disable_device(zdev);
- zdev->state = ZPCI_FN_STATE_STANDBY;
- if (!clp_get_state(ccdf->fid, &state) &&
- state == ZPCI_FN_STATE_RESERVED) {
- zpci_remove_device(zdev);
+ if (zdev) {
+ /* The event may have been queued before we configured
+ * the device.
+ */
+ if (zdev->state == ZPCI_FN_STATE_CONFIGURED)
+ zpci_event_hard_deconfigured(zdev, ccdf->fh);
+ /* The 0x0304 event may immediately reserve the device */
+ if (!clp_get_state(zdev->fid, &state) &&
+ state == ZPCI_FN_STATE_RESERVED) {
+ zpci_device_reserved(zdev);
+ }
}
break;
case 0x0306: /* 0x308 or 0x302 for multiple devices */
- clp_rescan_pci_devices();
+ zpci_remove_reserved_devices();
+ clp_scan_pci_devices();
break;
case 0x0308: /* Standby -> Reserved */
if (!zdev)
break;
- zpci_remove_device(zdev);
+ zpci_device_reserved(zdev);
break;
default:
break;
}
- pci_dev_put(pdev);
+ if (existing_zdev)
+ zpci_zdev_put(zdev);
}
void zpci_event_availability(void *data)
diff --git a/arch/s390/pci/pci_insn.c b/arch/s390/pci/pci_insn.c
index 02f9505c99a8..56480be48244 100644
--- a/arch/s390/pci/pci_insn.c
+++ b/arch/s390/pci/pci_insn.c
@@ -9,6 +9,7 @@
#include <linux/errno.h>
#include <linux/delay.h>
#include <linux/jump_label.h>
+#include <asm/asm-extable.h>
#include <asm/facility.h>
#include <asm/pci_insn.h>
#include <asm/pci_debug.h>
@@ -17,16 +18,40 @@
#define ZPCI_INSN_BUSY_DELAY 1 /* 1 microsecond */
-static inline void zpci_err_insn(u8 cc, u8 status, u64 req, u64 offset)
+struct zpci_err_insn_data {
+ u8 insn;
+ u8 cc;
+ u8 status;
+ union {
+ struct {
+ u64 req;
+ u64 offset;
+ };
+ struct {
+ u64 addr;
+ u64 len;
+ };
+ };
+} __packed;
+
+static inline void zpci_err_insn_req(int lvl, u8 insn, u8 cc, u8 status,
+ u64 req, u64 offset)
{
- struct {
- u64 req;
- u64 offset;
- u8 cc;
- u8 status;
- } __packed data = {req, offset, cc, status};
-
- zpci_err_hex(&data, sizeof(data));
+ struct zpci_err_insn_data data = {
+ .insn = insn, .cc = cc, .status = status,
+ .req = req, .offset = offset};
+
+ zpci_err_hex_level(lvl, &data, sizeof(data));
+}
+
+static inline void zpci_err_insn_addr(int lvl, u8 insn, u8 cc, u8 status,
+ u64 addr, u64 len)
+{
+ struct zpci_err_insn_data data = {
+ .insn = insn, .cc = cc, .status = status,
+ .addr = addr, .len = len};
+
+ zpci_err_hex_level(lvl, &data, sizeof(data));
}
/* Modify PCI Function Controls */
@@ -46,33 +71,41 @@ static inline u8 __mpcifc(u64 req, struct zpci_fib *fib, u8 *status)
u8 zpci_mod_fc(u64 req, struct zpci_fib *fib, u8 *status)
{
+ bool retried = false;
u8 cc;
do {
cc = __mpcifc(req, fib, status);
- if (cc == 2)
+ if (cc == 2) {
msleep(ZPCI_INSN_BUSY_DELAY);
+ if (!retried) {
+ zpci_err_insn_req(1, 'M', cc, *status, req, 0);
+ retried = true;
+ }
+ }
} while (cc == 2);
if (cc)
- zpci_err_insn(cc, *status, req, 0);
+ zpci_err_insn_req(0, 'M', cc, *status, req, 0);
+ else if (retried)
+ zpci_err_insn_req(1, 'M', cc, *status, req, 0);
return cc;
}
+EXPORT_SYMBOL_GPL(zpci_mod_fc);
/* Refresh PCI Translations */
static inline u8 __rpcit(u64 fn, u64 addr, u64 range, u8 *status)
{
- register u64 __addr asm("2") = addr;
- register u64 __range asm("3") = range;
+ union register_pair addr_range = {.even = addr, .odd = range};
u8 cc;
asm volatile (
- " .insn rre,0xb9d30000,%[fn],%[addr]\n"
+ " .insn rre,0xb9d30000,%[fn],%[addr_range]\n"
" ipm %[cc]\n"
" srl %[cc],28\n"
: [cc] "=d" (cc), [fn] "+d" (fn)
- : [addr] "d" (__addr), "d" (__range)
+ : [addr_range] "d" (addr_range.pair)
: "cc");
*status = fn >> 24 & 0xff;
return cc;
@@ -80,16 +113,24 @@ static inline u8 __rpcit(u64 fn, u64 addr, u64 range, u8 *status)
int zpci_refresh_trans(u64 fn, u64 addr, u64 range)
{
+ bool retried = false;
u8 cc, status;
do {
cc = __rpcit(fn, addr, range, &status);
- if (cc == 2)
+ if (cc == 2) {
udelay(ZPCI_INSN_BUSY_DELAY);
+ if (!retried) {
+ zpci_err_insn_addr(1, 'R', cc, status, addr, range);
+ retried = true;
+ }
+ }
} while (cc == 2);
if (cc)
- zpci_err_insn(cc, status, addr, range);
+ zpci_err_insn_addr(0, 'R', cc, status, addr, range);
+ else if (retried)
+ zpci_err_insn_addr(1, 'R', cc, status, addr, range);
if (cc == 1 && (status == 4 || status == 16))
return -ENOMEM;
@@ -98,7 +139,7 @@ int zpci_refresh_trans(u64 fn, u64 addr, u64 range)
}
/* Set Interruption Controls */
-int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib)
+int zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib)
{
if (!test_facility(72))
return -EIO;
@@ -109,25 +150,24 @@ int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib)
return 0;
}
+EXPORT_SYMBOL_GPL(zpci_set_irq_ctrl);
/* PCI Load */
static inline int ____pcilg(u64 *data, u64 req, u64 offset, u8 *status)
{
- register u64 __req asm("2") = req;
- register u64 __offset asm("3") = offset;
+ union register_pair req_off = {.even = req, .odd = offset};
int cc = -ENXIO;
u64 __data;
asm volatile (
- " .insn rre,0xb9d20000,%[data],%[req]\n"
+ " .insn rre,0xb9d20000,%[data],%[req_off]\n"
"0: ipm %[cc]\n"
" srl %[cc],28\n"
"1:\n"
EX_TABLE(0b, 1b)
- : [cc] "+d" (cc), [data] "=d" (__data), [req] "+d" (__req)
- : "d" (__offset)
- : "cc");
- *status = __req >> 24 & 0xff;
+ : [cc] "+d" (cc), [data] "=d" (__data),
+ [req_off] "+&d" (req_off.pair) :: "cc");
+ *status = req_off.even >> 24 & 0xff;
*data = __data;
return cc;
}
@@ -146,17 +186,25 @@ static inline int __pcilg(u64 *data, u64 req, u64 offset, u8 *status)
int __zpci_load(u64 *data, u64 req, u64 offset)
{
+ bool retried = false;
u8 status;
int cc;
do {
cc = __pcilg(data, req, offset, &status);
- if (cc == 2)
+ if (cc == 2) {
udelay(ZPCI_INSN_BUSY_DELAY);
+ if (!retried) {
+ zpci_err_insn_req(1, 'l', cc, status, req, offset);
+ retried = true;
+ }
+ }
} while (cc == 2);
if (cc)
- zpci_err_insn(cc, status, req, offset);
+ zpci_err_insn_req(0, 'l', cc, status, req, offset);
+ else if (retried)
+ zpci_err_insn_req(1, 'l', cc, status, req, offset);
return (cc > 0) ? -EIO : cc;
}
@@ -166,28 +214,26 @@ static inline int zpci_load_fh(u64 *data, const volatile void __iomem *addr,
unsigned long len)
{
struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(addr)];
- u64 req = ZPCI_CREATE_REQ(entry->fh, entry->bar, len);
+ u64 req = ZPCI_CREATE_REQ(READ_ONCE(entry->fh), entry->bar, len);
return __zpci_load(data, req, ZPCI_OFFSET(addr));
}
static inline int __pcilg_mio(u64 *data, u64 ioaddr, u64 len, u8 *status)
{
- register u64 addr asm("2") = ioaddr;
- register u64 r3 asm("3") = len;
+ union register_pair ioaddr_len = {.even = ioaddr, .odd = len};
int cc = -ENXIO;
u64 __data;
asm volatile (
- " .insn rre,0xb9d60000,%[data],%[ioaddr]\n"
+ " .insn rre,0xb9d60000,%[data],%[ioaddr_len]\n"
"0: ipm %[cc]\n"
" srl %[cc],28\n"
"1:\n"
EX_TABLE(0b, 1b)
- : [cc] "+d" (cc), [data] "=d" (__data), "+d" (r3)
- : [ioaddr] "d" (addr)
- : "cc");
- *status = r3 >> 24 & 0xff;
+ : [cc] "+d" (cc), [data] "=d" (__data),
+ [ioaddr_len] "+&d" (ioaddr_len.pair) :: "cc");
+ *status = ioaddr_len.odd >> 24 & 0xff;
*data = __data;
return cc;
}
@@ -202,7 +248,7 @@ int zpci_load(u64 *data, const volatile void __iomem *addr, unsigned long len)
cc = __pcilg_mio(data, (__force u64) addr, len, &status);
if (cc)
- zpci_err_insn(cc, status, 0, (__force u64) addr);
+ zpci_err_insn_addr(0, 'L', cc, status, (__force u64) addr, len);
return (cc > 0) ? -EIO : cc;
}
@@ -211,36 +257,43 @@ EXPORT_SYMBOL_GPL(zpci_load);
/* PCI Store */
static inline int __pcistg(u64 data, u64 req, u64 offset, u8 *status)
{
- register u64 __req asm("2") = req;
- register u64 __offset asm("3") = offset;
+ union register_pair req_off = {.even = req, .odd = offset};
int cc = -ENXIO;
asm volatile (
- " .insn rre,0xb9d00000,%[data],%[req]\n"
+ " .insn rre,0xb9d00000,%[data],%[req_off]\n"
"0: ipm %[cc]\n"
" srl %[cc],28\n"
"1:\n"
EX_TABLE(0b, 1b)
- : [cc] "+d" (cc), [req] "+d" (__req)
- : "d" (__offset), [data] "d" (data)
+ : [cc] "+d" (cc), [req_off] "+&d" (req_off.pair)
+ : [data] "d" (data)
: "cc");
- *status = __req >> 24 & 0xff;
+ *status = req_off.even >> 24 & 0xff;
return cc;
}
int __zpci_store(u64 data, u64 req, u64 offset)
{
+ bool retried = false;
u8 status;
int cc;
do {
cc = __pcistg(data, req, offset, &status);
- if (cc == 2)
+ if (cc == 2) {
udelay(ZPCI_INSN_BUSY_DELAY);
+ if (!retried) {
+ zpci_err_insn_req(1, 's', cc, status, req, offset);
+ retried = true;
+ }
+ }
} while (cc == 2);
if (cc)
- zpci_err_insn(cc, status, req, offset);
+ zpci_err_insn_req(0, 's', cc, status, req, offset);
+ else if (retried)
+ zpci_err_insn_req(1, 's', cc, status, req, offset);
return (cc > 0) ? -EIO : cc;
}
@@ -250,27 +303,26 @@ static inline int zpci_store_fh(const volatile void __iomem *addr, u64 data,
unsigned long len)
{
struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(addr)];
- u64 req = ZPCI_CREATE_REQ(entry->fh, entry->bar, len);
+ u64 req = ZPCI_CREATE_REQ(READ_ONCE(entry->fh), entry->bar, len);
return __zpci_store(data, req, ZPCI_OFFSET(addr));
}
static inline int __pcistg_mio(u64 data, u64 ioaddr, u64 len, u8 *status)
{
- register u64 addr asm("2") = ioaddr;
- register u64 r3 asm("3") = len;
+ union register_pair ioaddr_len = {.even = ioaddr, .odd = len};
int cc = -ENXIO;
asm volatile (
- " .insn rre,0xb9d40000,%[data],%[ioaddr]\n"
+ " .insn rre,0xb9d40000,%[data],%[ioaddr_len]\n"
"0: ipm %[cc]\n"
" srl %[cc],28\n"
"1:\n"
EX_TABLE(0b, 1b)
- : [cc] "+d" (cc), "+d" (r3)
- : [data] "d" (data), [ioaddr] "d" (addr)
- : "cc");
- *status = r3 >> 24 & 0xff;
+ : [cc] "+d" (cc), [ioaddr_len] "+&d" (ioaddr_len.pair)
+ : [data] "d" (data)
+ : "cc", "memory");
+ *status = ioaddr_len.odd >> 24 & 0xff;
return cc;
}
@@ -284,7 +336,7 @@ int zpci_store(const volatile void __iomem *addr, u64 data, unsigned long len)
cc = __pcistg_mio(data, (__force u64) addr, len, &status);
if (cc)
- zpci_err_insn(cc, status, 0, (__force u64) addr);
+ zpci_err_insn_addr(0, 'S', cc, status, (__force u64) addr, len);
return (cc > 0) ? -EIO : cc;
}
@@ -310,17 +362,25 @@ static inline int __pcistb(const u64 *data, u64 req, u64 offset, u8 *status)
int __zpci_store_block(const u64 *data, u64 req, u64 offset)
{
+ bool retried = false;
u8 status;
int cc;
do {
cc = __pcistb(data, req, offset, &status);
- if (cc == 2)
+ if (cc == 2) {
udelay(ZPCI_INSN_BUSY_DELAY);
+ if (!retried) {
+ zpci_err_insn_req(0, 'b', cc, status, req, offset);
+ retried = true;
+ }
+ }
} while (cc == 2);
if (cc)
- zpci_err_insn(cc, status, req, offset);
+ zpci_err_insn_req(0, 'b', cc, status, req, offset);
+ else if (retried)
+ zpci_err_insn_req(1, 'b', cc, status, req, offset);
return (cc > 0) ? -EIO : cc;
}
@@ -364,7 +424,7 @@ int zpci_write_block(volatile void __iomem *dst,
cc = __pcistb_mio(src, (__force u64) dst, len, &status);
if (cc)
- zpci_err_insn(cc, status, 0, (__force u64) dst);
+ zpci_err_insn_addr(0, 'B', cc, status, (__force u64) dst, len);
return (cc > 0) ? -EIO : cc;
}
@@ -372,10 +432,7 @@ EXPORT_SYMBOL_GPL(zpci_write_block);
static inline void __pciwb_mio(void)
{
- unsigned long unused = 0;
-
- asm volatile (".insn rre,0xb9d50000,%[op],%[op]\n"
- : [op] "+d" (unused));
+ asm volatile (".insn rre,0xb9d50000,0,0\n");
}
void zpci_barrier(void)
diff --git a/arch/s390/pci/pci_iov.c b/arch/s390/pci/pci_iov.c
new file mode 100644
index 000000000000..ead062bf2b41
--- /dev/null
+++ b/arch/s390/pci/pci_iov.c
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corp. 2020
+ *
+ * Author(s):
+ * Niklas Schnelle <schnelle@linux.ibm.com>
+ *
+ */
+
+#define KMSG_COMPONENT "zpci"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+
+#include "pci_iov.h"
+
+static struct resource iov_res = {
+ .name = "PCI IOV res",
+ .start = 0,
+ .end = -1,
+ .flags = IORESOURCE_MEM,
+};
+
+void zpci_iov_map_resources(struct pci_dev *pdev)
+{
+ resource_size_t len;
+ int i;
+
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+ int bar = i + PCI_IOV_RESOURCES;
+
+ len = pci_resource_len(pdev, bar);
+ if (!len)
+ continue;
+ pdev->resource[bar].parent = &iov_res;
+ }
+}
+
+void zpci_iov_remove_virtfn(struct pci_dev *pdev, int vfn)
+{
+ pci_lock_rescan_remove();
+ /* Linux' vfids start at 0, vfn at 1 */
+ pci_iov_remove_virtfn(pdev->physfn, vfn - 1);
+ pci_unlock_rescan_remove();
+}
+
+static int zpci_iov_link_virtfn(struct pci_dev *pdev, struct pci_dev *virtfn, int vfid)
+{
+ int rc;
+
+ rc = pci_iov_sysfs_link(pdev, virtfn, vfid);
+ if (rc)
+ return rc;
+
+ virtfn->is_virtfn = 1;
+ virtfn->multifunction = 0;
+ virtfn->physfn = pci_dev_get(pdev);
+
+ return 0;
+}
+
+int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn)
+{
+ int i, cand_devfn;
+ struct zpci_dev *zdev;
+ struct pci_dev *pdev;
+ int vfid = vfn - 1; /* Linux' vfids start at 0, vfn at 1 */
+ int rc = 0;
+
+ if (!zbus->multifunction)
+ return 0;
+
+ /* If the parent PF for the given VF is also configured in the
+ * instance, it must be on the same zbus.
+ * We can then identify the parent PF by checking what
+ * devfn the VF would have if it belonged to that PF using the PF's
+ * stride and offset. Only if this candidate devfn matches the
+ * actual devfn will we link both functions.
+ */
+ for (i = 0; i < ZPCI_FUNCTIONS_PER_BUS; i++) {
+ zdev = zbus->function[i];
+ if (zdev && zdev->is_physfn) {
+ pdev = pci_get_slot(zbus->bus, zdev->devfn);
+ if (!pdev)
+ continue;
+ cand_devfn = pci_iov_virtfn_devfn(pdev, vfid);
+ if (cand_devfn == virtfn->devfn) {
+ rc = zpci_iov_link_virtfn(pdev, virtfn, vfid);
+ /* balance pci_get_slot() */
+ pci_dev_put(pdev);
+ break;
+ }
+ /* balance pci_get_slot() */
+ pci_dev_put(pdev);
+ }
+ }
+ return rc;
+}
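
The parent-PF search above keys off the candidate devfn: for each PF on the zbus, compute the devfn a VF with this vfid would get from that PF's SR-IOV offset and stride (what pci_iov_virtfn_devfn() does, modulo bus carry-over) and link only on an exact match. A simplified user-space sketch of that matching step, with made-up offset/stride values:

#include <stdio.h>

/* Hypothetical, simplified view of a physical function's SR-IOV data. */
struct pf {
    int devfn;   /* devfn of the PF itself      */
    int offset;  /* first VF offset (VF Offset) */
    int stride;  /* VF stride                   */
};

/* Candidate devfn a VF with index vfid would have under this PF;
 * a simplified model of what pci_iov_virtfn_devfn() computes. */
static int candidate_devfn(const struct pf *pf, int vfid)
{
    return pf->devfn + pf->offset + pf->stride * vfid;
}

int main(void)
{
    struct pf pfs[] = { {0, 1, 1}, {8, 4, 2} };
    int vf_devfn = 12, vfid = 0;

    for (unsigned int i = 0; i < sizeof(pfs) / sizeof(pfs[0]); i++) {
        if (candidate_devfn(&pfs[i], vfid) == vf_devfn) {
            printf("VF %d belongs to PF at devfn %d\n", vfid, pfs[i].devfn);
            return 0;
        }
    }
    printf("no matching parent PF on this bus\n");
    return 1;
}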
diff --git a/arch/s390/pci/pci_iov.h b/arch/s390/pci/pci_iov.h
new file mode 100644
index 000000000000..b2c828003bad
--- /dev/null
+++ b/arch/s390/pci/pci_iov.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright IBM Corp. 2020
+ *
+ * Author(s):
+ * Niklas Schnelle <schnelle@linux.ibm.com>
+ *
+ */
+
+#ifndef __S390_PCI_IOV_H
+#define __S390_PCI_IOV_H
+
+#ifdef CONFIG_PCI_IOV
+void zpci_iov_remove_virtfn(struct pci_dev *pdev, int vfn);
+
+void zpci_iov_map_resources(struct pci_dev *pdev);
+
+int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn);
+
+#else /* CONFIG_PCI_IOV */
+static inline void zpci_iov_remove_virtfn(struct pci_dev *pdev, int vfn) {}
+
+static inline void zpci_iov_map_resources(struct pci_dev *pdev) {}
+
+static inline int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn)
+{
+ return 0;
+}
+#endif /* CONFIG_PCI_IOV */
+#endif /* __S390_PCI_IOV_H */
diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c
index fbe97ab2e228..ff8f24854c64 100644
--- a/arch/s390/pci/pci_irq.c
+++ b/arch/s390/pci/pci_irq.c
@@ -11,16 +11,10 @@
#include <asm/isc.h>
#include <asm/airq.h>
+#include <asm/tpi.h>
static enum {FLOATING, DIRECTED} irq_delivery;
-#define SIC_IRQ_MODE_ALL 0
-#define SIC_IRQ_MODE_SINGLE 1
-#define SIC_IRQ_MODE_DIRECT 4
-#define SIC_IRQ_MODE_D_ALL 16
-#define SIC_IRQ_MODE_D_SINGLE 17
-#define SIC_IRQ_MODE_SET_CPU 18
-
/*
* summary bit vector
* FLOATING - summary bit per function
@@ -35,7 +29,7 @@ static struct airq_iv *zpci_sbv;
*/
static struct airq_iv **zpci_ibv;
-/* Modify PCI: Register adapter interruptions */
+/* Modify PCI: Register floating adapter interruptions */
static int zpci_set_airq(struct zpci_dev *zdev)
{
u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_REG_INT);
@@ -45,21 +39,24 @@ static int zpci_set_airq(struct zpci_dev *zdev)
fib.fmt0.isc = PCI_ISC;
fib.fmt0.sum = 1; /* enable summary notifications */
fib.fmt0.noi = airq_iv_end(zdev->aibv);
- fib.fmt0.aibv = (unsigned long) zdev->aibv->vector;
+ fib.fmt0.aibv = virt_to_phys(zdev->aibv->vector);
fib.fmt0.aibvo = 0; /* each zdev has its own interrupt vector */
- fib.fmt0.aisb = (unsigned long) zpci_sbv->vector + (zdev->aisb/64)*8;
+ fib.fmt0.aisb = virt_to_phys(zpci_sbv->vector) + (zdev->aisb / 64) * 8;
fib.fmt0.aisbo = zdev->aisb & 63;
+ fib.gd = zdev->gisa;
return zpci_mod_fc(req, &fib, &status) ? -EIO : 0;
}
-/* Modify PCI: Unregister adapter interruptions */
+/* Modify PCI: Unregister floating adapter interruptions */
static int zpci_clear_airq(struct zpci_dev *zdev)
{
u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_DEREG_INT);
struct zpci_fib fib = {0};
u8 cc, status;
+ fib.gd = zdev->gisa;
+
cc = zpci_mod_fc(req, &fib, &status);
if (cc == 3 || (cc == 1 && status == 24))
/* Function already gone or IRQs already deregistered. */
@@ -78,6 +75,7 @@ static int zpci_set_directed_irq(struct zpci_dev *zdev)
fib.fmt = 1;
fib.fmt1.noi = zdev->msi_nr_irqs;
fib.fmt1.dibvo = zdev->msi_first_bit;
+ fib.gd = zdev->gisa;
return zpci_mod_fc(req, &fib, &status) ? -EIO : 0;
}
@@ -90,6 +88,7 @@ static int zpci_clear_directed_irq(struct zpci_dev *zdev)
u8 cc, status;
fib.fmt = 1;
+ fib.gd = zdev->gisa;
cc = zpci_mod_fc(req, &fib, &status);
if (cc == 3 || (cc == 1 && status == 24))
/* Function already gone or IRQs already deregistered. */
@@ -98,14 +97,47 @@ static int zpci_clear_directed_irq(struct zpci_dev *zdev)
return cc ? -EIO : 0;
}
+/* Register adapter interruptions */
+static int zpci_set_irq(struct zpci_dev *zdev)
+{
+ int rc;
+
+ if (irq_delivery == DIRECTED)
+ rc = zpci_set_directed_irq(zdev);
+ else
+ rc = zpci_set_airq(zdev);
+
+ if (!rc)
+ zdev->irqs_registered = 1;
+
+ return rc;
+}
+
+/* Clear adapter interruptions */
+static int zpci_clear_irq(struct zpci_dev *zdev)
+{
+ int rc;
+
+ if (irq_delivery == DIRECTED)
+ rc = zpci_clear_directed_irq(zdev);
+ else
+ rc = zpci_clear_airq(zdev);
+
+ if (!rc)
+ zdev->irqs_registered = 0;
+
+ return rc;
+}
+
static int zpci_set_irq_affinity(struct irq_data *data, const struct cpumask *dest,
bool force)
{
- struct msi_desc *entry = irq_get_msi_desc(data->irq);
+ struct msi_desc *entry = irq_data_get_msi_desc(data);
struct msi_msg msg = entry->msg;
+ int cpu_addr = smp_cpu_get_cpu_address(cpumask_first(dest));
msg.address_lo &= 0xff0000ff;
- msg.address_lo |= (cpumask_first(dest) << 8);
+ msg.address_lo |= (cpu_addr << 8);
pci_write_msi_msg(data->irq, &msg);
return IRQ_SET_MASK_OK;
@@ -115,12 +147,12 @@ static struct irq_chip zpci_irq_chip = {
.name = "PCI-MSI",
.irq_unmask = pci_msi_unmask_irq,
.irq_mask = pci_msi_mask_irq,
- .irq_set_affinity = zpci_set_irq_affinity,
};
static void zpci_handle_cpu_local_irq(bool rescan)
{
struct airq_iv *dibv = zpci_ibv[smp_processor_id()];
+ union zpci_sic_iib iib = {{0}};
unsigned long bit;
int irqs_on = 0;
@@ -131,8 +163,8 @@ static void zpci_handle_cpu_local_irq(bool rescan)
if (!rescan || irqs_on++)
/* End of second scan with interrupts on. */
break;
- /* First scan complete, reenable interrupts. */
- if (zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC))
+ /* First scan complete, re-enable interrupts. */
+ if (zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC, &iib))
break;
bit = 0;
continue;
@@ -160,6 +192,7 @@ static void zpci_handle_remote_irq(void *data)
static void zpci_handle_fallback_irq(void)
{
struct cpu_irq_data *cpu_data;
+ union zpci_sic_iib iib = {{0}};
unsigned long cpu;
int irqs_on = 0;
@@ -169,8 +202,8 @@ static void zpci_handle_fallback_irq(void)
if (irqs_on++)
/* End of second scan with interrupts on. */
break;
- /* First scan complete, reenable interrupts. */
- if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC))
+ /* First scan complete, re-enable interrupts. */
+ if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC, &iib))
break;
cpu = 0;
continue;
@@ -179,15 +212,16 @@ static void zpci_handle_fallback_irq(void)
if (atomic_inc_return(&cpu_data->scheduled) > 1)
continue;
- cpu_data->csd.func = zpci_handle_remote_irq;
- cpu_data->csd.info = &cpu_data->scheduled;
- cpu_data->csd.flags = 0;
+ INIT_CSD(&cpu_data->csd, zpci_handle_remote_irq, &cpu_data->scheduled);
smp_call_function_single_async(cpu, &cpu_data->csd);
}
}
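
Both the per-CPU and the fallback handler use the same two-pass scan: process pending bits with further interrupts suppressed, and once a pass comes up empty, re-enable interrupts via SIC and scan one more time so bits set in the race window are not lost. A compact model of that control flow (next_set_bit()/irq_enable() are illustrative stand-ins for airq_iv_scan()/zpci_set_irq_ctrl()):

#include <stdio.h>

#define NBITS 8
static unsigned char pending[NBITS] = { 0, 1, 0, 0, 1, 0, 0, 0 };

/* Illustrative stand-in for airq_iv_scan(): find and clear next set bit. */
static int next_set_bit(int start)
{
    for (int i = start; i < NBITS; i++)
        if (pending[i]) {
            pending[i] = 0;
            return i;
        }
    return -1;
}

static void irq_enable(void)       /* stand-in for re-arming via SIC */
{
    printf("interrupts re-enabled\n");
}

static void handle(int bit)
{
    printf("handling bit %d\n", bit);
}

int main(void)
{
    int irqs_on = 0, bit = 0;

    while (1) {
        bit = next_set_bit(bit);
        if (bit == -1) {
            if (irqs_on++)
                break;          /* second empty pass: done */
            irq_enable();       /* first pass done: re-arm */
            bit = 0;
            continue;
        }
        handle(bit);            /* bit already cleared by the scan */
    }
    return 0;
}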
-static void zpci_directed_irq_handler(struct airq_struct *airq, bool floating)
+static void zpci_directed_irq_handler(struct airq_struct *airq,
+ struct tpi_info *tpi_info)
{
+ bool floating = !tpi_info->directed_irq;
+
if (floating) {
inc_irq_stat(IRQIO_PCF);
zpci_handle_fallback_irq();
@@ -197,8 +231,10 @@ static void zpci_directed_irq_handler(struct airq_struct *airq, bool floating)
}
}
-static void zpci_floating_irq_handler(struct airq_struct *airq, bool floating)
+static void zpci_floating_irq_handler(struct airq_struct *airq,
+ struct tpi_info *tpi_info)
{
+ union zpci_sic_iib iib = {{0}};
unsigned long si, ai;
struct airq_iv *aibv;
int irqs_on = 0;
@@ -211,8 +247,8 @@ static void zpci_floating_irq_handler(struct airq_struct *airq, bool floating)
if (irqs_on++)
/* End of second scan with interrupts on. */
break;
- /* First scan complete, reenable interrupts. */
- if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC))
+ /* First scan complete, re-enable interrupts. */
+ if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC, &iib))
break;
si = 0;
continue;
@@ -239,6 +275,7 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
unsigned long bit;
struct msi_desc *msi;
struct msi_msg msg;
+ int cpu_addr;
int rc, irq;
zdev->aisb = -1UL;
@@ -260,7 +297,7 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
zdev->aisb = bit;
/* Create adapter interrupt vector */
- zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK);
+ zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK, NULL);
if (!zdev->aibv)
return -ENOMEM;
@@ -272,11 +309,13 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
/* Request MSI interrupts */
hwirq = bit;
- for_each_pci_msi_entry(msi, pdev) {
+ msi_for_each_desc(msi, &pdev->dev, MSI_DESC_NOTASSOCIATED) {
rc = -EIO;
if (hwirq - bit >= msi_vecs)
break;
- irq = __irq_alloc_descs(-1, 0, 1, 0, THIS_MODULE, msi->affinity);
+ irq = __irq_alloc_descs(-1, 0, 1, 0, THIS_MODULE,
+ (irq_delivery == DIRECTED) ?
+ msi->affinity : NULL);
if (irq < 0)
return -ENOMEM;
rc = irq_set_msi_desc(irq, msi);
@@ -286,9 +325,15 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
handle_percpu_irq);
msg.data = hwirq - bit;
if (irq_delivery == DIRECTED) {
+ if (msi->affinity)
+ cpu = cpumask_first(&msi->affinity->mask);
+ else
+ cpu = 0;
+ cpu_addr = smp_cpu_get_cpu_address(cpu);
+
msg.address_lo = zdev->msi_addr & 0xff0000ff;
- msg.address_lo |= msi->affinity ?
- (cpumask_first(&msi->affinity->mask) << 8) : 0;
+ msg.address_lo |= (cpu_addr << 8);
+
for_each_possible_cpu(cpu) {
airq_iv_set_data(zpci_ibv[cpu], hwirq, irq);
}
@@ -304,10 +349,7 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
zdev->msi_first_bit = bit;
zdev->msi_nr_irqs = msi_vecs;
- if (irq_delivery == DIRECTED)
- rc = zpci_set_directed_irq(zdev);
- else
- rc = zpci_set_airq(zdev);
+ rc = zpci_set_irq(zdev);
if (rc)
return rc;
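
For directed delivery the target CPU ends up in the MSI address itself: the mask 0xff0000ff clears bits 8-23 of address_lo and the CPU address from smp_cpu_get_cpu_address() is shifted into that field. A small sketch of just the encoding step, with made-up msi_addr and CPU address values:

#include <stdint.h>
#include <stdio.h>

/* Place the CPU address into bits 8..23 of the low MSI address word,
 * keeping the device's msi_addr bits around it. */
static uint32_t directed_msi_address_lo(uint32_t msi_addr, uint16_t cpu_addr)
{
    uint32_t lo = msi_addr & 0xff0000ff;    /* clear bits 8..23 */

    lo |= (uint32_t)cpu_addr << 8;          /* insert CPU address */
    return lo;
}

int main(void)
{
    uint32_t lo = directed_msi_address_lo(0x90000004u, 0x12);

    printf("address_lo = 0x%08x\n", (unsigned int)lo);  /* 0x90001204 */
    return 0;
}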
@@ -321,21 +363,12 @@ void arch_teardown_msi_irqs(struct pci_dev *pdev)
int rc;
/* Disable interrupts */
- if (irq_delivery == DIRECTED)
- rc = zpci_clear_directed_irq(zdev);
- else
- rc = zpci_clear_airq(zdev);
+ rc = zpci_clear_irq(zdev);
if (rc)
return;
/* Release MSI interrupts */
- for_each_pci_msi_entry(msi, pdev) {
- if (!msi->irq)
- continue;
- if (msi->msi_attrib.is_msix)
- __pci_msix_desc_mask_irq(msi, 1);
- else
- __pci_msi_desc_mask_irq(msi, 1, 1);
+ msi_for_each_desc(msi, &pdev->dev, MSI_DESC_ASSOCIATED) {
irq_set_msi_desc(msi->irq, NULL);
irq_free_desc(msi->irq);
msi->msg.address_lo = 0;
@@ -358,6 +391,15 @@ void arch_teardown_msi_irqs(struct pci_dev *pdev)
airq_iv_free(zpci_ibv[0], zdev->msi_first_bit, zdev->msi_nr_irqs);
}
+bool arch_restore_msi_irqs(struct pci_dev *pdev)
+{
+ struct zpci_dev *zdev = to_zpci(pdev);
+
+ if (!zdev->irqs_registered)
+ zpci_set_irq(zdev);
+ return true;
+}
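
arch_restore_msi_irqs() depends on the new zdev->irqs_registered flag that zpci_set_irq()/zpci_clear_irq() maintain, so a resume only re-registers interrupts for functions whose registration was actually torn down. The bookkeeping, reduced to a hypothetical device struct:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical device state; in the kernel this is struct zpci_dev. */
struct dev {
    bool irqs_registered;
};

static int hw_register_irqs(struct dev *d)   { (void)d; return 0; }
static int hw_unregister_irqs(struct dev *d) { (void)d; return 0; }

static int set_irq(struct dev *d)
{
    int rc = hw_register_irqs(d);

    if (!rc)
        d->irqs_registered = true;
    return rc;
}

static int clear_irq(struct dev *d)
{
    int rc = hw_unregister_irqs(d);

    if (!rc)
        d->irqs_registered = false;
    return rc;
}

/* Resume path: only devices that lost their registration need it back. */
static void restore_irqs(struct dev *d)
{
    if (!d->irqs_registered)
        set_irq(d);
}

int main(void)
{
    struct dev d = { .irqs_registered = false };

    set_irq(&d);
    clear_irq(&d);
    restore_irqs(&d);
    printf("registered: %d\n", d.irqs_registered);
    return 0;
}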
+
static struct airq_struct zpci_airq = {
.handler = zpci_floating_irq_handler,
.isc = PCI_ISC,
@@ -366,11 +408,12 @@ static struct airq_struct zpci_airq = {
static void __init cpu_enable_directed_irq(void *unused)
{
union zpci_sic_iib iib = {{0}};
+ union zpci_sic_iib ziib = {{0}};
iib.cdiib.dibv_addr = (u64) zpci_ibv[smp_processor_id()]->vector;
- __zpci_set_irq_ctrl(SIC_IRQ_MODE_SET_CPU, 0, &iib);
- zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC);
+ zpci_set_irq_ctrl(SIC_IRQ_MODE_SET_CPU, 0, &iib);
+ zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC, &ziib);
}
static int __init zpci_directed_irq_init(void)
@@ -378,14 +421,14 @@ static int __init zpci_directed_irq_init(void)
union zpci_sic_iib iib = {{0}};
unsigned int cpu;
- zpci_sbv = airq_iv_create(num_possible_cpus(), 0);
+ zpci_sbv = airq_iv_create(num_possible_cpus(), 0, NULL);
if (!zpci_sbv)
return -ENOMEM;
iib.diib.isc = PCI_ISC;
iib.diib.nr_cpus = num_possible_cpus();
- iib.diib.disb_addr = (u64) zpci_sbv->vector;
- __zpci_set_irq_ctrl(SIC_IRQ_MODE_DIRECT, 0, &iib);
+ iib.diib.disb_addr = virt_to_phys(zpci_sbv->vector);
+ zpci_set_irq_ctrl(SIC_IRQ_MODE_DIRECT, 0, &iib);
zpci_ibv = kcalloc(num_possible_cpus(), sizeof(*zpci_ibv),
GFP_KERNEL);
@@ -400,7 +443,7 @@ static int __init zpci_directed_irq_init(void)
zpci_ibv[cpu] = airq_iv_create(cache_line_size() * BITS_PER_BYTE,
AIRQ_IV_DATA |
AIRQ_IV_CACHELINE |
- (!cpu ? AIRQ_IV_ALLOC : 0));
+ (!cpu ? AIRQ_IV_ALLOC : 0), NULL);
if (!zpci_ibv[cpu])
return -ENOMEM;
}
@@ -417,7 +460,7 @@ static int __init zpci_floating_irq_init(void)
if (!zpci_ibv)
return -ENOMEM;
- zpci_sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC);
+ zpci_sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC, NULL);
if (!zpci_sbv)
goto out_free;
@@ -430,6 +473,7 @@ out_free:
int __init zpci_irq_init(void)
{
+ union zpci_sic_iib iib = {{0}};
int rc;
irq_delivery = sclp.has_dirq ? DIRECTED : FLOATING;
@@ -461,7 +505,7 @@ int __init zpci_irq_init(void)
* Enable floating IRQs (with suppression after one IRQ). When using
* directed IRQs this enables the fallback path.
*/
- zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC);
+ zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC, &iib);
return 0;
out_airq:
diff --git a/arch/s390/pci/pci_kvm_hook.c b/arch/s390/pci/pci_kvm_hook.c
new file mode 100644
index 000000000000..ff34baf50a3e
--- /dev/null
+++ b/arch/s390/pci/pci_kvm_hook.c
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VFIO ZPCI devices support
+ *
+ * Copyright (C) IBM Corp. 2022. All rights reserved.
+ * Author(s): Pierre Morel <pmorel@linux.ibm.com>
+ */
+#include <linux/kvm_host.h>
+
+struct zpci_kvm_hook zpci_kvm_hook;
+EXPORT_SYMBOL_GPL(zpci_kvm_hook);
diff --git a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c
index 7d42a8794f10..a90499c087f0 100644
--- a/arch/s390/pci/pci_mmio.c
+++ b/arch/s390/pci/pci_mmio.c
@@ -11,25 +11,108 @@
#include <linux/mm.h>
#include <linux/errno.h>
#include <linux/pci.h>
+#include <asm/asm-extable.h>
+#include <asm/pci_io.h>
+#include <asm/pci_debug.h>
-static long get_pfn(unsigned long user_addr, unsigned long access,
- unsigned long *pfn)
+static inline void zpci_err_mmio(u8 cc, u8 status, u64 offset)
{
- struct vm_area_struct *vma;
- long ret;
+ struct {
+ u64 offset;
+ u8 cc;
+ u8 status;
+ } data = {offset, cc, status};
- down_read(&current->mm->mmap_sem);
- ret = -EINVAL;
- vma = find_vma(current->mm, user_addr);
- if (!vma)
- goto out;
- ret = -EACCES;
- if (!(vma->vm_flags & access))
- goto out;
- ret = follow_pfn(vma, user_addr, pfn);
-out:
- up_read(&current->mm->mmap_sem);
- return ret;
+ zpci_err_hex(&data, sizeof(data));
+}
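
zpci_err_mmio() packs condition code, status and the faulting offset into one record and feeds it to zpci_err_hex(), which stores it in the zPCI debug feature. A user-space approximation of that "log the raw record" idea, with a plain hex print standing in for zpci_err_hex():

#include <stdint.h>
#include <stdio.h>

struct mmio_err {
    uint64_t offset;
    uint8_t cc;
    uint8_t status;
};

/* Stand-in for zpci_err_hex(): dump a record byte-wise. */
static void err_hex(const void *data, size_t len)
{
    const unsigned char *p = data;

    for (size_t i = 0; i < len; i++)
        printf("%02x%s", p[i], (i + 1) % 16 ? " " : "\n");
    printf("\n");
}

int main(void)
{
    struct mmio_err e = { .offset = 0xdeadbeef, .cc = 1, .status = 0x28 };

    err_hex(&e, sizeof(e));
    return 0;
}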
+
+static inline int __pcistb_mio_inuser(
+ void __iomem *ioaddr, const void __user *src,
+ u64 len, u8 *status)
+{
+ int cc = -ENXIO;
+
+ asm volatile (
+ " sacf 256\n"
+ "0: .insn rsy,0xeb00000000d4,%[len],%[ioaddr],%[src]\n"
+ "1: ipm %[cc]\n"
+ " srl %[cc],28\n"
+ "2: sacf 768\n"
+ EX_TABLE(0b, 2b) EX_TABLE(1b, 2b)
+ : [cc] "+d" (cc), [len] "+d" (len)
+ : [ioaddr] "a" (ioaddr), [src] "Q" (*((u8 __force *)src))
+ : "cc", "memory");
+ *status = len >> 24 & 0xff;
+ return cc;
+}
+
+static inline int __pcistg_mio_inuser(
+ void __iomem *ioaddr, const void __user *src,
+ u64 ulen, u8 *status)
+{
+ union register_pair ioaddr_len = {.even = (u64 __force)ioaddr, .odd = ulen};
+ int cc = -ENXIO;
+ u64 val = 0;
+ u64 cnt = ulen;
+ u8 tmp;
+
+ /*
+ * copy 0 < @len <= 8 bytes from @src into the rightmost bytes of
+ * a register, then store it to PCI at @ioaddr while in secondary
+ * address space. pcistg then uses the user mappings.
+ */
+ asm volatile (
+ " sacf 256\n"
+ "0: llgc %[tmp],0(%[src])\n"
+ "4: sllg %[val],%[val],8\n"
+ " aghi %[src],1\n"
+ " ogr %[val],%[tmp]\n"
+ " brctg %[cnt],0b\n"
+ "1: .insn rre,0xb9d40000,%[val],%[ioaddr_len]\n"
+ "2: ipm %[cc]\n"
+ " srl %[cc],28\n"
+ "3: sacf 768\n"
+ EX_TABLE(0b, 3b) EX_TABLE(4b, 3b) EX_TABLE(1b, 3b) EX_TABLE(2b, 3b)
+ :
+ [src] "+a" (src), [cnt] "+d" (cnt),
+ [val] "+d" (val), [tmp] "=d" (tmp),
+ [cc] "+d" (cc), [ioaddr_len] "+&d" (ioaddr_len.pair)
+ :: "cc", "memory");
+ *status = ioaddr_len.odd >> 24 & 0xff;
+
+ /* did we read everything from user memory? */
+ if (!cc && cnt != 0)
+ cc = -EFAULT;
+
+ return cc;
+}
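
With the secondary address space active (sacf 256), the llgc/sllg/ogr loop above assembles up to eight user bytes, most significant first, into the register that pcistg then stores; a non-zero leftover cnt afterwards signals a fault mid-copy. The register-building step alone, in plain C (the fault handling and address-space switch are of course not reproducible here):

#include <stdint.h>
#include <stdio.h>

/* Build a big-endian value from the first len (1..8) bytes of src,
 * the way the llgc/sllg/ogr loop in __pcistg_mio_inuser() does. */
static uint64_t assemble_be(const uint8_t *src, unsigned int len)
{
    uint64_t val = 0;

    for (unsigned int i = 0; i < len; i++) {
        val <<= 8;
        val |= src[i];
    }
    return val;
}

int main(void)
{
    uint8_t buf[4] = { 0xde, 0xad, 0xbe, 0xef };

    printf("0x%llx\n", (unsigned long long)assemble_be(buf, 4));
    return 0;   /* prints 0xdeadbeef */
}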
+
+static inline int __memcpy_toio_inuser(void __iomem *dst,
+ const void __user *src, size_t n)
+{
+ int size, rc = 0;
+ u8 status = 0;
+
+ if (!src)
+ return -EINVAL;
+
+ while (n > 0) {
+ size = zpci_get_max_io_size((u64 __force) dst,
+ (u64 __force) src, n,
+ ZPCI_MAX_WRITE_SIZE);
+ if (size > 8) /* main path */
+ rc = __pcistb_mio_inuser(dst, src, size, &status);
+ else
+ rc = __pcistg_mio_inuser(dst, src, size, &status);
+ if (rc)
+ break;
+ src += size;
+ dst += size;
+ n -= size;
+ }
+ if (rc)
+ zpci_err_mmio(rc, status, (__force u64) dst);
+ return rc;
}
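
The loop above carves the user buffer into chunks via zpci_get_max_io_size() and sends anything larger than 8 bytes through the block-store path. As an assumption for the sketch below, the chunk rule is taken to be "the largest power of two that both addresses are aligned to, capped by the remaining length and the per-instruction maximum":

#include <stdint.h>
#include <stdio.h>

/* Assumed chunk-size rule; max_io_size() is an illustrative model,
 * not the kernel helper itself. */
static size_t max_io_size(uint64_t src, uint64_t dst, size_t n, size_t max)
{
    size_t size = 1;

    if (n > max)
        n = max;
    while (!(src & 1) && !(dst & 1) && (size << 1) <= n) {
        src >>= 1;
        dst >>= 1;
        size <<= 1;
    }
    return size;
}

int main(void)
{
    uint64_t src = 0x1003, dst = 0x9000000000000003ULL;
    size_t n = 20;

    while (n) {
        size_t size = max_io_size(src, dst, n, 128);

        printf("%zu-byte chunk at dst 0x%llx (%s path)\n", size,
               (unsigned long long)dst, size > 8 ? "pcistb" : "pcistg");
        src += size;
        dst += size;
        n -= size;
    }
    return 0;
}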
SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr,
@@ -38,7 +121,9 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr,
u8 local_buf[64];
void __iomem *io_addr;
void *buf;
- unsigned long pfn;
+ struct vm_area_struct *vma;
+ pte_t *ptep;
+ spinlock_t *ptl;
long ret;
if (!zpci_is_enabled())
@@ -46,6 +131,22 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr,
if (length <= 0 || PAGE_SIZE - (mmio_addr & ~PAGE_MASK) < length)
return -EINVAL;
+
+ /*
+ * We only support write access to MIO capable devices if we are on
+ * a MIO enabled system. Otherwise we would have to check for every
+ * address whether it is a special ZPCI_ADDR and would have to do
+ * a pfn lookup which we don't need for MIO capable devices. Currently
+ * ISM devices are the only devices without MIO support and there is no
+ * known need for accessing these from userspace.
+ */
+ if (static_branch_likely(&have_mio)) {
+ ret = __memcpy_toio_inuser((void __iomem *) mmio_addr,
+ user_buffer,
+ length);
+ return ret;
+ }
+
if (length > 64) {
buf = kmalloc(length, GFP_KERNEL);
if (!buf)
@@ -53,32 +154,118 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr,
} else
buf = local_buf;
- ret = get_pfn(mmio_addr, VM_WRITE, &pfn);
+ ret = -EFAULT;
+ if (copy_from_user(buf, user_buffer, length))
+ goto out_free;
+
+ mmap_read_lock(current->mm);
+ ret = -EINVAL;
+ vma = vma_lookup(current->mm, mmio_addr);
+ if (!vma)
+ goto out_unlock_mmap;
+ if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+ goto out_unlock_mmap;
+ ret = -EACCES;
+ if (!(vma->vm_flags & VM_WRITE))
+ goto out_unlock_mmap;
+
+ ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl);
if (ret)
- goto out;
- io_addr = (void __iomem *)((pfn << PAGE_SHIFT) | (mmio_addr & ~PAGE_MASK));
+ goto out_unlock_mmap;
- ret = -EFAULT;
- if ((unsigned long) io_addr < ZPCI_IOMAP_ADDR_BASE)
- goto out;
+ io_addr = (void __iomem *)((pte_pfn(*ptep) << PAGE_SHIFT) |
+ (mmio_addr & ~PAGE_MASK));
- if (copy_from_user(buf, user_buffer, length))
- goto out;
+ if ((unsigned long) io_addr < ZPCI_IOMAP_ADDR_BASE)
+ goto out_unlock_pt;
ret = zpci_memcpy_toio(io_addr, buf, length);
-out:
+out_unlock_pt:
+ pte_unmap_unlock(ptep, ptl);
+out_unlock_mmap:
+ mmap_read_unlock(current->mm);
+out_free:
if (buf != local_buf)
kfree(buf);
return ret;
}
+static inline int __pcilg_mio_inuser(
+ void __user *dst, const void __iomem *ioaddr,
+ u64 ulen, u8 *status)
+{
+ union register_pair ioaddr_len = {.even = (u64 __force)ioaddr, .odd = ulen};
+ u64 cnt = ulen;
+ int shift = ulen * 8;
+ int cc = -ENXIO;
+ u64 val, tmp;
+
+ /*
+ * read 0 < @len <= 8 bytes from the PCI memory mapped at @ioaddr (in
+ * user space) into a register using pcilg then store these bytes at
+ * user address @dst
+ */
+ asm volatile (
+ " sacf 256\n"
+ "0: .insn rre,0xb9d60000,%[val],%[ioaddr_len]\n"
+ "1: ipm %[cc]\n"
+ " srl %[cc],28\n"
+ " ltr %[cc],%[cc]\n"
+ " jne 4f\n"
+ "2: ahi %[shift],-8\n"
+ " srlg %[tmp],%[val],0(%[shift])\n"
+ "3: stc %[tmp],0(%[dst])\n"
+ "5: aghi %[dst],1\n"
+ " brctg %[cnt],2b\n"
+ "4: sacf 768\n"
+ EX_TABLE(0b, 4b) EX_TABLE(1b, 4b) EX_TABLE(3b, 4b) EX_TABLE(5b, 4b)
+ :
+ [ioaddr_len] "+&d" (ioaddr_len.pair),
+ [cc] "+d" (cc), [val] "=d" (val),
+ [dst] "+a" (dst), [cnt] "+d" (cnt), [tmp] "=d" (tmp),
+ [shift] "+d" (shift)
+ :: "cc", "memory");
+
+ /* did we write everything to the user space buffer? */
+ if (!cc && cnt != 0)
+ cc = -EFAULT;
+
+ *status = ioaddr_len.odd >> 24 & 0xff;
+ return cc;
+}
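
The read side mirrors the write side: pcilg leaves the data right-aligned in a register and the srlg/stc loop stores it to the user buffer one byte at a time, from the most significant byte down. The extraction step alone, in plain C:

#include <stdint.h>
#include <stdio.h>

/* Store the low len (1..8) bytes of val to dst, most significant
 * byte first, mirroring the srlg/stc loop in __pcilg_mio_inuser(). */
static void store_be(uint8_t *dst, uint64_t val, unsigned int len)
{
    int shift = len * 8;

    while (len--) {
        shift -= 8;
        *dst++ = (val >> shift) & 0xff;
    }
}

int main(void)
{
    uint8_t buf[4];

    store_be(buf, 0xdeadbeef, sizeof(buf));
    printf("%02x %02x %02x %02x\n", buf[0], buf[1], buf[2], buf[3]);
    return 0;   /* prints de ad be ef */
}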
+
+static inline int __memcpy_fromio_inuser(void __user *dst,
+ const void __iomem *src,
+ unsigned long n)
+{
+ int size, rc = 0;
+ u8 status;
+
+ while (n > 0) {
+ size = zpci_get_max_io_size((u64 __force) src,
+ (u64 __force) dst, n,
+ ZPCI_MAX_READ_SIZE);
+ rc = __pcilg_mio_inuser(dst, src, size, &status);
+ if (rc)
+ break;
+ src += size;
+ dst += size;
+ n -= size;
+ }
+ if (rc)
+ zpci_err_mmio(rc, status, (__force u64) dst);
+ return rc;
+}
+
SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr,
void __user *, user_buffer, size_t, length)
{
u8 local_buf[64];
void __iomem *io_addr;
void *buf;
- unsigned long pfn;
+ struct vm_area_struct *vma;
+ pte_t *ptep;
+ spinlock_t *ptl;
long ret;
if (!zpci_is_enabled())
@@ -86,29 +273,62 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr,
if (length <= 0 || PAGE_SIZE - (mmio_addr & ~PAGE_MASK) < length)
return -EINVAL;
+
+ /*
+ * We only support read access to MIO capable devices if we are on
+ * a MIO enabled system. Otherwise we would have to check for every
+ * address whether it is a special ZPCI_ADDR and would have to do
+ * a pfn lookup which we don't need for MIO capable devices. Currently
+ * ISM devices are the only devices without MIO support and there is no
+ * known need for accessing these from userspace.
+ */
+ if (static_branch_likely(&have_mio)) {
+ ret = __memcpy_fromio_inuser(
+ user_buffer, (const void __iomem *)mmio_addr,
+ length);
+ return ret;
+ }
+
if (length > 64) {
buf = kmalloc(length, GFP_KERNEL);
if (!buf)
return -ENOMEM;
- } else
+ } else {
buf = local_buf;
+ }
- ret = get_pfn(mmio_addr, VM_READ, &pfn);
+ mmap_read_lock(current->mm);
+ ret = -EINVAL;
+ vma = vma_lookup(current->mm, mmio_addr);
+ if (!vma)
+ goto out_unlock_mmap;
+ if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+ goto out_unlock_mmap;
+ ret = -EACCES;
+ if (!(vma->vm_flags & VM_WRITE))
+ goto out_unlock_mmap;
+
+ ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl);
if (ret)
- goto out;
- io_addr = (void __iomem *)((pfn << PAGE_SHIFT) | (mmio_addr & ~PAGE_MASK));
+ goto out_unlock_mmap;
+
+ io_addr = (void __iomem *)((pte_pfn(*ptep) << PAGE_SHIFT) |
+ (mmio_addr & ~PAGE_MASK));
if ((unsigned long) io_addr < ZPCI_IOMAP_ADDR_BASE) {
ret = -EFAULT;
- goto out;
+ goto out_unlock_pt;
}
ret = zpci_memcpy_fromio(buf, io_addr, length);
- if (ret)
- goto out;
- if (copy_to_user(user_buffer, buf, length))
+
+out_unlock_pt:
+ pte_unmap_unlock(ptep, ptl);
+out_unlock_mmap:
+ mmap_read_unlock(current->mm);
+
+ if (!ret && copy_to_user(user_buffer, buf, length))
ret = -EFAULT;
-out:
if (buf != local_buf)
kfree(buf);
return ret;
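
From user space these syscalls are typically paired with an mmap() of the function's sysfs resource file; plain loads and stores on that mapping are not usable in all configurations, hence the dedicated read/write calls. A rough usage sketch, s390x-only (placeholder device path, error handling trimmed):

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>

int main(void)
{
    /* Placeholder sysfs resource file of some PCI function. */
    int fd = open("/sys/bus/pci/devices/0000:00:00.0/resource0", O_RDWR);
    void *bar = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    uint32_t val = 0x1;
    uint32_t readback = 0;

    /* The mmap()ed address is handed to the syscalls, which perform
     * the actual MMIO access on the caller's behalf. */
    syscall(__NR_s390_pci_mmio_write, (unsigned long)bar, &val, sizeof(val));
    syscall(__NR_s390_pci_mmio_read, (unsigned long)bar + 4, &readback,
            sizeof(readback));

    printf("readback: 0x%x\n", readback);
    munmap(bar, 4096);
    close(fd);
    return 0;
}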
diff --git a/arch/s390/pci/pci_sysfs.c b/arch/s390/pci/pci_sysfs.c
index a433ba01a317..8a7abac51816 100644
--- a/arch/s390/pci/pci_sysfs.c
+++ b/arch/s390/pci/pci_sysfs.c
@@ -13,6 +13,8 @@
#include <linux/stat.h>
#include <linux/pci.h>
+#include "../../../drivers/pci/pci.h"
+
#include <asm/sclp.h>
#define zpci_attr(name, fmt, member) \
@@ -31,6 +33,7 @@ zpci_attr(pchid, "0x%04x\n", pchid);
zpci_attr(pfgid, "0x%02x\n", pfgid);
zpci_attr(vfn, "0x%04x\n", vfn);
zpci_attr(pft, "0x%02x\n", pft);
+zpci_attr(port, "%d\n", port);
zpci_attr(uid, "0x%x\n", uid);
zpci_attr(segment0, "0x%02x\n", pfip[0]);
zpci_attr(segment1, "0x%02x\n", pfip[1]);
@@ -49,31 +52,68 @@ static DEVICE_ATTR_RO(mio_enabled);
static ssize_t recover_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
+ struct kernfs_node *kn;
struct pci_dev *pdev = to_pci_dev(dev);
struct zpci_dev *zdev = to_zpci(pdev);
- int ret;
-
- if (!device_remove_file_self(dev, attr))
- return count;
-
+ int ret = 0;
+ u8 status;
+
+ /* Can't use device_remove_file_self() here as that would lead us to lock
+ * the pci_rescan_remove_lock while holding the device's kernfs lock.
+ * This would create a possible deadlock with disable_slot() which is
+ * not directly protected by the device's kernfs lock but takes it
+ * during the device removal which happens under
+ * pci_rescan_remove_lock.
+ *
+ * This is analogous to sdev_store_delete() in
+ * drivers/scsi/scsi_sysfs.c
+ */
+ kn = sysfs_break_active_protection(&dev->kobj, &attr->attr);
+ WARN_ON_ONCE(!kn);
+ /* device_remove_file() serializes concurrent calls, ignoring all but
+ * the first.
+ */
+ device_remove_file(dev, attr);
+
+ /* A concurrent call to recover_store() may slip between
+ * sysfs_break_active_protection() and the sysfs file removal.
+ * Once it unblocks from pci_lock_rescan_remove() the original pdev
+ * will already be removed.
+ */
pci_lock_rescan_remove();
- pci_stop_and_remove_bus_device(pdev);
- ret = zpci_disable_device(zdev);
- if (ret)
- goto error;
-
- ret = zpci_enable_device(zdev);
- if (ret)
- goto error;
-
- pci_rescan_bus(zdev->bus);
- pci_unlock_rescan_remove();
-
- return count;
-
-error:
+ if (pci_dev_is_added(pdev)) {
+ pci_stop_and_remove_bus_device(pdev);
+ if (zdev_enabled(zdev)) {
+ ret = zpci_disable_device(zdev);
+ /*
+ * Due to a z/VM vs LPAR inconsistency in the error
+ * state, the FH may indicate an enabled device while
+ * disable reports it as already disabled; don't
+ * treat that as an error here.
+ */
+ if (ret == -EINVAL)
+ ret = 0;
+ if (ret)
+ goto out;
+ }
+
+ ret = zpci_enable_device(zdev);
+ if (ret)
+ goto out;
+
+ if (zdev->dma_table) {
+ ret = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
+ virt_to_phys(zdev->dma_table), &status);
+ if (ret)
+ zpci_disable_device(zdev);
+ }
+ }
+out:
+ pci_rescan_bus(zdev->zbus->bus);
pci_unlock_rescan_remove();
- return ret;
+ if (kn)
+ sysfs_unbreak_active_protection(kn);
+ return ret ? ret : count;
}
static DEVICE_ATTR_WO(recover);
@@ -109,6 +149,45 @@ static ssize_t report_error_write(struct file *filp, struct kobject *kobj,
}
static BIN_ATTR(report_error, S_IWUSR, NULL, report_error_write, PAGE_SIZE);
+static ssize_t uid_is_unique_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", zpci_unique_uid ? 1 : 0);
+}
+static DEVICE_ATTR_RO(uid_is_unique);
+
+#ifndef CONFIG_DMI
+/* analogous to smbios index */
+static ssize_t index_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct zpci_dev *zdev = to_zpci(to_pci_dev(dev));
+ u32 index = ~0;
+
+ if (zpci_unique_uid)
+ index = zdev->uid;
+
+ return sysfs_emit(buf, "%u\n", index);
+}
+static DEVICE_ATTR_RO(index);
+
+static umode_t zpci_index_is_visible(struct kobject *kobj,
+ struct attribute *attr, int n)
+{
+ return zpci_unique_uid ? attr->mode : 0;
+}
+
+static struct attribute *zpci_ident_attrs[] = {
+ &dev_attr_index.attr,
+ NULL,
+};
+
+static struct attribute_group zpci_ident_attr_group = {
+ .attrs = zpci_ident_attrs,
+ .is_visible = zpci_index_is_visible,
+};
+#endif
+
static struct bin_attribute *zpci_bin_attrs[] = {
&bin_attr_util_string,
&bin_attr_report_error,
@@ -121,12 +200,15 @@ static struct attribute *zpci_dev_attrs[] = {
&dev_attr_pchid.attr,
&dev_attr_pfgid.attr,
&dev_attr_pft.attr,
+ &dev_attr_port.attr,
&dev_attr_vfn.attr,
&dev_attr_uid.attr,
&dev_attr_recover.attr,
&dev_attr_mio_enabled.attr,
+ &dev_attr_uid_is_unique.attr,
NULL,
};
+
static struct attribute_group zpci_attr_group = {
.attrs = zpci_dev_attrs,
.bin_attrs = zpci_bin_attrs,
@@ -147,5 +229,8 @@ static struct attribute_group pfip_attr_group = {
const struct attribute_group *zpci_attr_groups[] = {
&zpci_attr_group,
&pfip_attr_group,
+#ifndef CONFIG_DMI
+ &zpci_ident_attr_group,
+#endif
NULL,
};
diff --git a/arch/s390/purgatory/.gitignore b/arch/s390/purgatory/.gitignore
index 04a03433c720..97ca52779457 100644
--- a/arch/s390/purgatory/.gitignore
+++ b/arch/s390/purgatory/.gitignore
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0-only
purgatory
+purgatory.chk
purgatory.lds
purgatory.ro
diff --git a/arch/s390/purgatory/Makefile b/arch/s390/purgatory/Makefile
index bc0d7a0d0394..4e930f566878 100644
--- a/arch/s390/purgatory/Makefile
+++ b/arch/s390/purgatory/Makefile
@@ -4,40 +4,51 @@ OBJECT_FILES_NON_STANDARD := y
purgatory-y := head.o purgatory.o string.o sha256.o mem.o
-targets += $(purgatory-y) purgatory.lds purgatory purgatory.ro
+targets += $(purgatory-y) purgatory.lds purgatory purgatory.chk purgatory.ro
PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y))
$(obj)/sha256.o: $(srctree)/lib/crypto/sha256.c FORCE
$(call if_changed_rule,cc_o_c)
-CFLAGS_sha256.o := -D__DISABLE_EXPORTS
+CFLAGS_sha256.o := -D__DISABLE_EXPORTS -D__NO_FORTIFY
$(obj)/mem.o: $(srctree)/arch/s390/lib/mem.S FORCE
$(call if_changed_rule,as_o_S)
-$(obj)/string.o: $(srctree)/arch/s390/lib/string.c FORCE
- $(call if_changed_rule,cc_o_c)
+KCOV_INSTRUMENT := n
+GCOV_PROFILE := n
+UBSAN_SANITIZE := n
+KASAN_SANITIZE := n
+KCSAN_SANITIZE := n
KBUILD_CFLAGS := -fno-strict-aliasing -Wall -Wstrict-prototypes
KBUILD_CFLAGS += -Wno-pointer-sign -Wno-sign-compare
KBUILD_CFLAGS += -fno-zero-initialized-in-bss -fno-builtin -ffreestanding
-KBUILD_CFLAGS += -c -MD -Os -m64 -msoft-float -fno-common
+KBUILD_CFLAGS += -Os -m64 -msoft-float -fno-common
+KBUILD_CFLAGS += -fno-stack-protector
+KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
KBUILD_CFLAGS += $(CLANG_FLAGS)
KBUILD_CFLAGS += $(call cc-option,-fno-PIE)
KBUILD_AFLAGS := $(filter-out -DCC_USING_EXPOLINE,$(KBUILD_AFLAGS))
-LDFLAGS_purgatory := -r --no-undefined -nostdlib -z nodefaultlib -T
+# Since we link purgatory with -r, unresolved symbols are not checked, so we
+# also link a purgatory.chk binary without -r to check for unresolved symbols.
+PURGATORY_LDFLAGS := -nostdlib -z nodefaultlib
+LDFLAGS_purgatory := -r $(PURGATORY_LDFLAGS) -T
+LDFLAGS_purgatory.chk := -e purgatory_start $(PURGATORY_LDFLAGS)
$(obj)/purgatory: $(obj)/purgatory.lds $(PURGATORY_OBJS) FORCE
$(call if_changed,ld)
+$(obj)/purgatory.chk: $(obj)/purgatory FORCE
+ $(call if_changed,ld)
+
OBJCOPYFLAGS_purgatory.ro := -O elf64-s390
OBJCOPYFLAGS_purgatory.ro += --remove-section='*debug*'
OBJCOPYFLAGS_purgatory.ro += --remove-section='.comment'
OBJCOPYFLAGS_purgatory.ro += --remove-section='.note.*'
-$(obj)/purgatory.ro: $(obj)/purgatory FORCE
+$(obj)/purgatory.ro: $(obj)/purgatory $(obj)/purgatory.chk FORCE
$(call if_changed,objcopy)
-$(obj)/kexec-purgatory.o: $(obj)/kexec-purgatory.S $(obj)/purgatory.ro FORCE
- $(call if_changed_rule,as_o_S)
+$(obj)/kexec-purgatory.o: $(obj)/purgatory.ro
-obj-$(CONFIG_ARCH_HAS_KEXEC_PURGATORY) += kexec-purgatory.o
+obj-y += kexec-purgatory.o
diff --git a/arch/s390/purgatory/head.S b/arch/s390/purgatory/head.S
index 5a10ce34b95d..0f93f2e72eba 100644
--- a/arch/s390/purgatory/head.S
+++ b/arch/s390/purgatory/head.S
@@ -44,11 +44,14 @@
.endm
.macro MEMSWAP dst,src,buf,len
-10: cghi \len,bufsz
+10: larl %r0,purgatory_end
+ larl %r1,stack
+ slgr %r0,%r1
+ cgr \len,%r0
jh 11f
lgr %r4,\len
j 12f
-11: lghi %r4,bufsz
+11: lgr %r4,%r0
12: MEMCPY \buf,\dst,%r4
MEMCPY \dst,\src,%r4
@@ -62,19 +65,20 @@
jh 10b
.endm
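
MEMSWAP moves a region through the scratch buffer in chunks, and with this change the chunk size is the runtime distance purgatory_end - stack rather than a hard-coded bufsz. An illustrative C equivalent of a swap-through-bounce-buffer of that kind (not the purgatory code itself):

#include <stdio.h>
#include <string.h>

/* Swap the contents of dst and src (len bytes) through a bounce
 * buffer of bufsz bytes, chunk by chunk. */
static void memswap(char *dst, char *src, char *buf, size_t bufsz, size_t len)
{
    while (len) {
        size_t n = len > bufsz ? bufsz : len;

        memcpy(buf, dst, n);    /* dst -> buf */
        memcpy(dst, src, n);    /* src -> dst */
        memcpy(src, buf, n);    /* buf -> src */
        dst += n;
        src += n;
        len -= n;
    }
}

int main(void)
{
    char a[10] = "AAAAAAAAA", b[10] = "BBBBBBBBB", buf[4];

    memswap(a, b, buf, sizeof(buf), 9);
    printf("%s %s\n", a, b);    /* BBBBBBBBB AAAAAAAAA */
    return 0;
}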
-.macro START_NEXT_KERNEL base
+.macro START_NEXT_KERNEL base subcode
lg %r4,kernel_entry-\base(%r13)
lg %r5,load_psw_mask-\base(%r13)
ogr %r4,%r5
stg %r4,0(%r0)
xgr %r0,%r0
- diag %r0,%r0,0x308
+ lghi %r1,\subcode
+ diag %r0,%r1,0x308
.endm
-.text
-.align PAGE_SIZE
-ENTRY(purgatory_start)
+ .text
+ .balign PAGE_SIZE
+SYM_CODE_START(purgatory_start)
/* The purgatory might be called after a diag308 so better set
* architecture and addressing mode.
*/
@@ -96,7 +100,7 @@ ENTRY(purgatory_start)
* checksum verification only (%r2 = 0 -> verification only).
*
* Check now and preserve over C function call by storing in
- * %r10 whith
+ * %r10 with
* 1 -> checksum verification only
* 0 -> load new kernel
*/
@@ -123,7 +127,7 @@ ENTRY(purgatory_start)
je .start_crash_kernel
/* start normal kernel */
- START_NEXT_KERNEL .base_crash
+ START_NEXT_KERNEL .base_crash 0
.return_old_kernel:
lmg %r6,%r15,gprregs-.base_crash(%r13)
@@ -134,12 +138,18 @@ ENTRY(purgatory_start)
.start_crash_kernel:
/* Location of purgatory_start in crash memory */
+ larl %r0,.base_crash
+ larl %r1,purgatory_start
+ slgr %r0,%r1
lgr %r8,%r13
- aghi %r8,-(.base_crash-purgatory_start)
+ sgr %r8,%r0
/* Destination for this code i.e. end of memory to be swapped. */
+ larl %r0,purgatory_end
+ larl %r1,purgatory_start
+ slgr %r0,%r1
lg %r9,crash_size-.base_crash(%r13)
- aghi %r9,-(purgatory_end-purgatory_start)
+ sgr %r9,%r0
/* Destination in crash memory, i.e. same as r9 but in crash memory. */
lg %r10,crash_start-.base_crash(%r13)
@@ -148,15 +158,19 @@ ENTRY(purgatory_start)
/* Buffer location (in crash memory) and size. As the purgatory is
* behind the point of no return it can re-use the stack as buffer.
*/
- lghi %r11,bufsz
+ larl %r11,purgatory_end
larl %r12,stack
+ slgr %r11,%r12
MEMCPY %r12,%r9,%r11 /* dst -> (crash) buf */
MEMCPY %r9,%r8,%r11 /* self -> dst */
/* Jump to new location. */
lgr %r7,%r9
- aghi %r7,.jump_to_dst-purgatory_start
+ larl %r0,.jump_to_dst
+ larl %r1,purgatory_start
+ slgr %r0,%r1
+ agr %r7,%r0
br %r7
.jump_to_dst:
@@ -168,7 +182,10 @@ ENTRY(purgatory_start)
/* Load new buffer location after jump */
larl %r7,stack
- aghi %r10,stack-purgatory_start
+ lgr %r0,%r7
+ larl %r1,purgatory_start
+ slgr %r0,%r1
+ agr %r10,%r0
MEMCPY %r10,%r7,%r11 /* (new) buf -> (crash) buf */
/* Now the code is set up to run from its designated location. Start
@@ -227,46 +244,22 @@ ENTRY(purgatory_start)
MEMCPY %r9,%r10,%r11
/* start crash kernel */
- START_NEXT_KERNEL .base_dst
-
-
-load_psw_mask:
- .long 0x00080000,0x80000000
-
- .align 8
-disabled_wait_psw:
- .quad 0x0002000180000000
- .quad 0x0000000000000000 + .do_checksum_verification
-
-gprregs:
- .rept 10
- .quad 0
- .endr
-
-/* Macro to define a global variable with name and size (in bytes) to be
- * shared with C code.
- *
- * Add the .size and .type attribute to satisfy checks on the Elf_Sym during
- * purgatory load.
- */
-.macro GLOBAL_VARIABLE name,size
-\name:
- .global \name
- .size \name,\size
- .type \name,object
- .skip \size,0
-.endm
-
-GLOBAL_VARIABLE purgatory_sha256_digest,32
-GLOBAL_VARIABLE purgatory_sha_regions,16*__KEXEC_SHA_REGION_SIZE
-GLOBAL_VARIABLE kernel_entry,8
-GLOBAL_VARIABLE kernel_type,8
-GLOBAL_VARIABLE crash_start,8
-GLOBAL_VARIABLE crash_size,8
-
- .align PAGE_SIZE
-stack:
+ START_NEXT_KERNEL .base_dst 1
+SYM_CODE_END(purgatory_start)
+
+SYM_DATA_LOCAL(load_psw_mask, .long 0x00080000,0x80000000)
+ .balign 8
+SYM_DATA_LOCAL(disabled_wait_psw, .quad 0x0002000180000000,.do_checksum_verification)
+SYM_DATA_LOCAL(gprregs, .fill 10,8,0)
+SYM_DATA(purgatory_sha256_digest, .skip 32)
+SYM_DATA(purgatory_sha_regions, .skip 16*__KEXEC_SHA_REGION_SIZE)
+SYM_DATA(kernel_entry, .skip 8)
+SYM_DATA(kernel_type, .skip 8)
+SYM_DATA(crash_start, .skip 8)
+SYM_DATA(crash_size, .skip 8)
+ .balign PAGE_SIZE
+SYM_DATA_START_LOCAL(stack)
/* The buffer to move this code must be as big as the code. */
.skip stack-purgatory_start
- .align PAGE_SIZE
-purgatory_end:
+ .balign PAGE_SIZE
+SYM_DATA_END_LABEL(stack, SYM_L_LOCAL, purgatory_end)
diff --git a/arch/s390/purgatory/kexec-purgatory.S b/arch/s390/purgatory/kexec-purgatory.S
index 8293753100ae..25f512b1de12 100644
--- a/arch/s390/purgatory/kexec-purgatory.S
+++ b/arch/s390/purgatory/kexec-purgatory.S
@@ -1,14 +1,12 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
.section .rodata, "a"
- .align 8
-kexec_purgatory:
- .globl kexec_purgatory
+ .balign 8
+SYM_DATA_START(kexec_purgatory)
.incbin "arch/s390/purgatory/purgatory.ro"
-.Lkexec_purgatroy_end:
+SYM_DATA_END_LABEL(kexec_purgatory, SYM_L_LOCAL, kexec_purgatory_end)
- .align 8
-kexec_purgatory_size:
- .globl kexec_purgatory_size
- .quad .Lkexec_purgatroy_end - kexec_purgatory
+ .balign 8
+SYM_DATA(kexec_purgatory_size, .quad kexec_purgatory_end-kexec_purgatory)
diff --git a/arch/s390/purgatory/purgatory.c b/arch/s390/purgatory/purgatory.c
index 0a423bcf6746..030efda05dbe 100644
--- a/arch/s390/purgatory/purgatory.c
+++ b/arch/s390/purgatory/purgatory.c
@@ -9,7 +9,7 @@
#include <linux/kexec.h>
#include <linux/string.h>
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
#include <asm/purgatory.h>
int verify_sha256_digest(void)
diff --git a/arch/s390/purgatory/string.c b/arch/s390/purgatory/string.c
new file mode 100644
index 000000000000..c98c22a72db7
--- /dev/null
+++ b/arch/s390/purgatory/string.c
@@ -0,0 +1,3 @@
+// SPDX-License-Identifier: GPL-2.0
+#define __HAVE_ARCH_MEMCMP /* arch function */
+#include "../lib/string.c"
diff --git a/arch/s390/scripts/Makefile.chkbss b/arch/s390/scripts/Makefile.chkbss
deleted file mode 100644
index f4f4c2c6dee9..000000000000
--- a/arch/s390/scripts/Makefile.chkbss
+++ /dev/null
@@ -1,20 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-
-chkbss-target ?= built-in.a
-$(obj)/$(chkbss-target): chkbss
-
-chkbss-files := $(addsuffix .chkbss, $(chkbss))
-clean-files += $(chkbss-files)
-
-PHONY += chkbss
-chkbss: $(addprefix $(obj)/, $(chkbss-files))
-
-quiet_cmd_chkbss = CHKBSS $<
- cmd_chkbss = \
- if ! $(OBJSIZE) --common $< | $(AWK) 'END { if ($$3) exit 1 }'; then \
- echo "error: $< .bss section is not empty" >&2; exit 1; \
- fi; \
- touch $@;
-
-$(obj)/%.o.chkbss: $(obj)/%.o
- $(call cmd,chkbss)
diff --git a/arch/s390/tools/.gitignore b/arch/s390/tools/.gitignore
index 71bd6f8eebaf..ea62f37b79ef 100644
--- a/arch/s390/tools/.gitignore
+++ b/arch/s390/tools/.gitignore
@@ -1,2 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
gen_facilities
gen_opcode_table
diff --git a/arch/s390/tools/Makefile b/arch/s390/tools/Makefile
index b5e35e8f999a..f9dd47ff9ac4 100644
--- a/arch/s390/tools/Makefile
+++ b/arch/s390/tools/Makefile
@@ -10,8 +10,8 @@ PHONY += kapi
kapi: $(kapi-hdrs-y)
-hostprogs-y += gen_facilities
-hostprogs-y += gen_opcode_table
+hostprogs += gen_facilities
+hostprogs += gen_opcode_table
HOSTCFLAGS_gen_facilities.o += $(LINUXINCLUDE)
diff --git a/arch/s390/tools/gcc-thunk-extern.sh b/arch/s390/tools/gcc-thunk-extern.sh
new file mode 100755
index 000000000000..20bcbf6dd7ab
--- /dev/null
+++ b/arch/s390/tools/gcc-thunk-extern.sh
@@ -0,0 +1,24 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# Borrowed from gcc: gcc/testsuite/gcc.target/s390/nobp-section-type-conflict.c
+# Checks that we don't get error: section type conflict with ‘put_page’.
+
+cat << "END" | $@ -x c - -fno-PIE -march=z10 -mindirect-branch=thunk-extern -mfunction-return=thunk-extern -mindirect-branch-table -O2 -c -o /dev/null
+int a;
+int b (void);
+void c (int);
+
+static void
+put_page (void)
+{
+ if (b ())
+ c (a);
+}
+
+__attribute__ ((__section__ (".init.text"), __cold__)) void
+d (void)
+{
+ put_page ();
+ put_page ();
+}
+END
diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c
index 61ce5b59b828..68580cbea4e6 100644
--- a/arch/s390/tools/gen_facilities.c
+++ b/arch/s390/tools/gen_facilities.c
@@ -27,24 +27,16 @@ static struct facility_def facility_defs[] = {
*/
.name = "FACILITIES_ALS",
.bits = (int[]){
-#ifdef CONFIG_HAVE_MARCH_Z900_FEATURES
0, /* N3 instructions */
1, /* z/Arch mode installed */
-#endif
-#ifdef CONFIG_HAVE_MARCH_Z990_FEATURES
18, /* long displacement facility */
-#endif
-#ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES
21, /* extended-immediate facility */
25, /* store clock fast */
-#endif
-#ifdef CONFIG_HAVE_MARCH_Z10_FEATURES
27, /* mvcos */
32, /* compare and swap and store */
33, /* compare and swap and store 2 */
34, /* general instructions extension */
35, /* execute extensions */
-#endif
#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
45, /* fast-BCR, etc. */
#endif
@@ -54,6 +46,7 @@ static struct facility_def facility_defs[] = {
#endif
#ifdef CONFIG_HAVE_MARCH_Z13_FEATURES
53, /* load-and-zero-rightmost-byte, etc. */
+ 129, /* vector */
#endif
#ifdef CONFIG_HAVE_MARCH_Z14_FEATURES
58, /* miscellaneous-instruction-extension 2 */
@@ -115,6 +108,11 @@ static struct facility_def facility_defs[] = {
12, /* AP Query Configuration Information */
15, /* AP Facilities Test */
156, /* etoken facility */
+ 165, /* nnpa facility */
+ 193, /* bear enhancement facility */
+ 194, /* rdp enhancement facility */
+ 196, /* processor activity instrumentation facility */
+ 197, /* processor activity instrumentation extension 1 */
-1 /* END */
}
},
diff --git a/arch/s390/tools/opcodes.txt b/arch/s390/tools/opcodes.txt
index 46d8ed96cf06..5f008e794898 100644
--- a/arch/s390/tools/opcodes.txt
+++ b/arch/s390/tools/opcodes.txt
@@ -189,6 +189,8 @@ ad stosm SI_URD
ae sigp RS_RRRD
af mc SI_URD
b1 lra RX_RRRD
+b200 lbear S_RD
+b201 stbear S_RD
b202 stidp S_RD
b204 sck S_RD
b205 stck S_RD
@@ -274,6 +276,7 @@ b285 lpctl S_RD
b286 qsi S_RD
b287 lsctl S_RD
b28e qctri S_RD
+b28f qpaci S_RD
b299 srnm S_RD
b29c stfpc S_RD
b29d lfpc S_RD
@@ -523,6 +526,7 @@ b931 clgfr RRE_RR
b938 sortl RRE_RR
b939 dfltcc RRF_R0RR2
b93a kdsa RRE_RR
+b93b nnpa RRE_00
b93c ppno RRE_RR
b93e kimd RRE_RR
b93f klmd RRE_RR
@@ -562,6 +566,7 @@ b987 dlgr RRE_RR
b988 alcgr RRE_RR
b989 slbgr RRE_RR
b98a cspg RRE_RR
+b98b rdp RRF_RURR2
b98d epsw RRE_RR
b98e idte RRF_RURR2
b98f crdte RRF_RURR2
@@ -597,7 +602,7 @@ b9b3 cu42 RRE_RR
b9bd trtre RRF_U0RR
b9be srstu RRE_RR
b9bf trte RRF_U0RR
-b9c0 selhhhr RRF_RURR
+b9c0 selfhr RRF_RURR
b9c8 ahhhr RRF_R0RR2
b9c9 shhhr RRF_R0RR2
b9ca alhhhr RRF_R0RR2
@@ -876,19 +881,32 @@ e63d vstrl VSI_URDV
e63f vstrlr VRS_RRDV
e649 vlip VRI_V0UU2
e650 vcvb VRR_RV0UU
+e651 vclzdp VRR_VV0U2
e652 vcvbg VRR_RV0UU
+e654 vupkzh VRR_VV0U2
+e655 vcnf VRR_VV0UU2
+e656 vclfnh VRR_VV0UU2
e658 vcvd VRI_VR0UU
e659 vsrp VRI_VVUUU2
e65a vcvdg VRI_VR0UU
e65b vpsop VRI_VVUUU2
+e65c vupkzl VRR_VV0U2
+e65d vcfn VRR_VV0UU2
+e65e vclfnl VRR_VV0UU2
e65f vtp VRR_0V
+e670 vpkzr VRI_VVV0UU2
e671 vap VRI_VVV0UU2
+e672 vsrpr VRI_VVV0UU2
e673 vsp VRI_VVV0UU2
+e674 vschp VRR_VVV0U0U
+e675 vcrnf VRR_VVV0UU
e677 vcp VRR_0VV0U
e678 vmp VRI_VVV0UU2
e679 vmsp VRI_VVV0UU2
e67a vdp VRI_VVV0UU2
e67b vrp VRI_VVV0UU2
+e67c vscshp VRR_VVV
+e67d vcsph VRR_VVV0U0
e67e vsdp VRI_VVV0UU2
e700 vleb VRX_VRRDU
e701 vleh VRX_VRRDU
@@ -1081,6 +1099,7 @@ eb61 stric RSY_RDRU
eb62 mric RSY_RDRU
eb6a asi SIY_IRD
eb6e alsi SIY_IRD
+eb71 lpswey SIY_RD
eb7a agsi SIY_IRD
eb7e algsi SIY_IRD
eb80 icmh RSY_RURD