summaryrefslogtreecommitdiff
path: root/arch
diff options
context:
space:
mode:
Diffstat (limited to 'arch')
-rw-r--r--arch/Kconfig3
-rw-r--r--arch/alpha/include/uapi/asm/socket.h3
-rw-r--r--arch/arm/Kconfig2
-rw-r--r--arch/arm/boot/dts/intel/ixp/intel-ixp42x-linksys-wrv54g.dts92
-rw-r--r--arch/arm/configs/omap2plus_defconfig1
-rw-r--r--arch/arm64/Kconfig7
-rw-r--r--arch/arm64/boot/dts/mediatek/mt8370.dtsi16
-rw-r--r--arch/arm64/include/asm/assembler.h4
-rw-r--r--arch/arm64/include/asm/barrier.h3
-rw-r--r--arch/arm64/include/asm/cpufeature.h28
-rw-r--r--arch/arm64/include/asm/debug-monitors.h40
-rw-r--r--arch/arm64/include/asm/el2_setup.h116
-rw-r--r--arch/arm64/include/asm/exception.h14
-rw-r--r--arch/arm64/include/asm/gcs.h2
-rw-r--r--arch/arm64/include/asm/hwcap.h2
-rw-r--r--arch/arm64/include/asm/kgdb.h12
-rw-r--r--arch/arm64/include/asm/kprobes.h8
-rw-r--r--arch/arm64/include/asm/kvm_emulate.h51
-rw-r--r--arch/arm64/include/asm/kvm_host.h38
-rw-r--r--arch/arm64/include/asm/kvm_mmu.h18
-rw-r--r--arch/arm64/include/asm/kvm_nested.h2
-rw-r--r--arch/arm64/include/asm/memory.h6
-rw-r--r--arch/arm64/include/asm/processor.h2
-rw-r--r--arch/arm64/include/asm/smp.h24
-rw-r--r--arch/arm64/include/asm/stacktrace.h6
-rw-r--r--arch/arm64/include/asm/sysreg.h87
-rw-r--r--arch/arm64/include/asm/system_misc.h4
-rw-r--r--arch/arm64/include/asm/thread_info.h5
-rw-r--r--arch/arm64/include/asm/traps.h6
-rw-r--r--arch/arm64/include/asm/uprobes.h11
-rw-r--r--arch/arm64/include/asm/vncr_mapping.h2
-rw-r--r--arch/arm64/include/uapi/asm/hwcap.h2
-rw-r--r--arch/arm64/kernel/Makefile2
-rw-r--r--arch/arm64/kernel/acpi.c10
-rw-r--r--arch/arm64/kernel/cpufeature.c142
-rw-r--r--arch/arm64/kernel/cpuinfo.c2
-rw-r--r--arch/arm64/kernel/debug-monitors.c263
-rw-r--r--arch/arm64/kernel/efi.c5
-rw-r--r--arch/arm64/kernel/entry-common.c156
-rw-r--r--arch/arm64/kernel/entry.S6
-rw-r--r--arch/arm64/kernel/hw_breakpoint.c60
-rw-r--r--arch/arm64/kernel/irq.c13
-rw-r--r--arch/arm64/kernel/kgdb.c39
-rw-r--r--arch/arm64/kernel/module.c101
-rw-r--r--arch/arm64/kernel/mte.c11
-rw-r--r--arch/arm64/kernel/pi/Makefile2
-rw-r--r--arch/arm64/kernel/probes/kprobes.c31
-rw-r--r--arch/arm64/kernel/probes/kprobes_trampoline.S2
-rw-r--r--arch/arm64/kernel/probes/uprobes.c24
-rw-r--r--arch/arm64/kernel/process.c13
-rw-r--r--arch/arm64/kernel/sdei.c8
-rw-r--r--arch/arm64/kernel/signal.c7
-rw-r--r--arch/arm64/kernel/smp.c142
-rw-r--r--arch/arm64/kernel/stacktrace.c59
-rw-r--r--arch/arm64/kernel/traps.c84
-rw-r--r--arch/arm64/kernel/watchdog_hld.c58
-rw-r--r--arch/arm64/kvm/Makefile3
-rw-r--r--arch/arm64/kvm/arch_timer.c2
-rw-r--r--arch/arm64/kvm/arm.c36
-rw-r--r--arch/arm64/kvm/at.c80
-rw-r--r--arch/arm64/kvm/config.c255
-rw-r--r--arch/arm64/kvm/debug.c4
-rw-r--r--arch/arm64/kvm/emulate-nested.c49
-rw-r--r--arch/arm64/kvm/guest.c62
-rw-r--r--arch/arm64/kvm/handle_exit.c24
-rw-r--r--arch/arm64/kvm/hyp/exception.c16
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/switch.h53
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h49
-rw-r--r--arch/arm64/kvm/hyp/nvhe/debug-sr.c32
-rw-r--r--arch/arm64/kvm/hyp/nvhe/switch.c2
-rw-r--r--arch/arm64/kvm/hyp/vgic-v3-sr.c53
-rw-r--r--arch/arm64/kvm/hyp/vhe/switch.c14
-rw-r--r--arch/arm64/kvm/hyp/vhe/sysreg-sr.c6
-rw-r--r--arch/arm64/kvm/inject_fault.c235
-rw-r--r--arch/arm64/kvm/mmio.c12
-rw-r--r--arch/arm64/kvm/mmu.c105
-rw-r--r--arch/arm64/kvm/nested.c109
-rw-r--r--arch/arm64/kvm/sys_regs.c218
-rw-r--r--arch/arm64/kvm/sys_regs.h2
-rw-r--r--arch/arm64/kvm/trace_handle_exit.h2
-rw-r--r--arch/arm64/kvm/vgic-sys-reg-v3.c127
-rw-r--r--arch/arm64/kvm/vgic/vgic-init.c30
-rw-r--r--arch/arm64/kvm/vgic/vgic-its.c5
-rw-r--r--arch/arm64/kvm/vgic/vgic-kvm-device.c70
-rw-r--r--arch/arm64/kvm/vgic/vgic-mmio-v3.c33
-rw-r--r--arch/arm64/kvm/vgic/vgic-v3-nested.c2
-rw-r--r--arch/arm64/kvm/vgic/vgic-v4.c14
-rw-r--r--arch/arm64/kvm/vgic/vgic-v5.c52
-rw-r--r--arch/arm64/kvm/vgic/vgic.c4
-rw-r--r--arch/arm64/kvm/vgic/vgic.h48
-rw-r--r--arch/arm64/mm/contpte.c213
-rw-r--r--arch/arm64/mm/fault.c83
-rw-r--r--arch/arm64/mm/gcs.c6
-rw-r--r--arch/arm64/mm/hugetlbpage.c2
-rw-r--r--arch/arm64/mm/proc.S2
-rw-r--r--arch/arm64/net/bpf_jit.h5
-rw-r--r--arch/arm64/net/bpf_jit_comp.c167
-rw-r--r--arch/arm64/tools/cpucaps7
-rw-r--r--arch/arm64/tools/sysreg646
-rw-r--r--arch/csky/Kconfig1
-rw-r--r--arch/loongarch/Kconfig1
-rw-r--r--arch/loongarch/configs/loongson3_defconfig1
-rw-r--r--arch/loongarch/include/asm/kvm_host.h12
-rw-r--r--arch/loongarch/kvm/exit.c33
-rw-r--r--arch/loongarch/kvm/intc/eiointc.c553
-rw-r--r--arch/loongarch/kvm/intc/ipi.c28
-rw-r--r--arch/loongarch/kvm/intc/pch_pic.c4
-rw-r--r--arch/loongarch/kvm/interrupt.c25
-rw-r--r--arch/loongarch/kvm/trace.h14
-rw-r--r--arch/loongarch/kvm/vcpu.c8
-rw-r--r--arch/m68k/Kconfig1
-rw-r--r--arch/m68k/Kconfig.debug2
-rw-r--r--arch/m68k/configs/amiga_defconfig10
-rw-r--r--arch/m68k/configs/apollo_defconfig10
-rw-r--r--arch/m68k/configs/atari_defconfig10
-rw-r--r--arch/m68k/configs/bvme6000_defconfig10
-rw-r--r--arch/m68k/configs/hp300_defconfig10
-rw-r--r--arch/m68k/configs/mac_defconfig10
-rw-r--r--arch/m68k/configs/multi_defconfig10
-rw-r--r--arch/m68k/configs/mvme147_defconfig10
-rw-r--r--arch/m68k/configs/mvme16x_defconfig10
-rw-r--r--arch/m68k/configs/q40_defconfig10
-rw-r--r--arch/m68k/configs/sun3_defconfig10
-rw-r--r--arch/m68k/configs/sun3x_defconfig10
-rw-r--r--arch/m68k/include/asm/adb_iop.h4
-rw-r--r--arch/m68k/include/asm/bootinfo.h4
-rw-r--r--arch/m68k/include/asm/entry.h4
-rw-r--r--arch/m68k/include/asm/kexec.h4
-rw-r--r--arch/m68k/include/asm/mac_baboon.h4
-rw-r--r--arch/m68k/include/asm/mac_iop.h4
-rw-r--r--arch/m68k/include/asm/mac_oss.h4
-rw-r--r--arch/m68k/include/asm/mac_psc.h4
-rw-r--r--arch/m68k/include/asm/mac_via.h4
-rw-r--r--arch/m68k/include/asm/math-emu.h6
-rw-r--r--arch/m68k/include/asm/mcf_pgtable.h4
-rw-r--r--arch/m68k/include/asm/mcfmmu.h2
-rw-r--r--arch/m68k/include/asm/motorola_pgtable.h4
-rw-r--r--arch/m68k/include/asm/nettel.h4
-rw-r--r--arch/m68k/include/asm/openprom.h4
-rw-r--r--arch/m68k/include/asm/page.h4
-rw-r--r--arch/m68k/include/asm/page_mm.h4
-rw-r--r--arch/m68k/include/asm/page_no.h4
-rw-r--r--arch/m68k/include/asm/pgtable.h2
-rw-r--r--arch/m68k/include/asm/pgtable_mm.h8
-rw-r--r--arch/m68k/include/asm/ptrace.h4
-rw-r--r--arch/m68k/include/asm/setup.h10
-rw-r--r--arch/m68k/include/asm/sun3_pgtable.h8
-rw-r--r--arch/m68k/include/asm/sun3mmu.h4
-rw-r--r--arch/m68k/include/asm/thread_info.h6
-rw-r--r--arch/m68k/include/asm/traps.h6
-rw-r--r--arch/m68k/include/uapi/asm/bootinfo-vme.h4
-rw-r--r--arch/m68k/include/uapi/asm/bootinfo.h8
-rw-r--r--arch/m68k/include/uapi/asm/ptrace.h4
-rw-r--r--arch/m68k/kernel/early_printk.c42
-rw-r--r--arch/m68k/kernel/head.S81
-rw-r--r--arch/m68k/mac/via.c16
-rw-r--r--arch/m68k/math-emu/fp_emu.h8
-rw-r--r--arch/m68k/mm/motorola.c56
-rw-r--r--arch/microblaze/Kconfig1
-rw-r--r--arch/mips/Kconfig1
-rw-r--r--arch/mips/boot/dts/ralink/mt7620a.dtsi10
-rw-r--r--arch/mips/configs/fuloong2e_defconfig1
-rw-r--r--arch/mips/configs/ip22_defconfig1
-rw-r--r--arch/mips/configs/loongson2k_defconfig1
-rw-r--r--arch/mips/configs/loongson3_defconfig1
-rw-r--r--arch/mips/configs/malta_defconfig1
-rw-r--r--arch/mips/configs/malta_kvm_defconfig1
-rw-r--r--arch/mips/configs/maltaup_xpa_defconfig1
-rw-r--r--arch/mips/configs/rb532_defconfig1
-rw-r--r--arch/mips/configs/rm200_defconfig1
-rw-r--r--arch/mips/include/uapi/asm/socket.h3
-rw-r--r--arch/parisc/Kconfig1
-rw-r--r--arch/parisc/include/uapi/asm/socket.h3
-rw-r--r--arch/powerpc/Kconfig1
-rw-r--r--arch/powerpc/Makefile2
-rw-r--r--arch/powerpc/boot/dts/microwatt.dts2
-rw-r--r--arch/powerpc/configs/cell_defconfig1
-rw-r--r--arch/powerpc/configs/powernv_defconfig3
-rw-r--r--arch/powerpc/configs/ppc64_defconfig3
-rw-r--r--arch/powerpc/configs/ppc6xx_defconfig1
-rw-r--r--arch/powerpc/include/asm/floppy.h5
-rw-r--r--arch/powerpc/include/asm/hvcall.h1
-rw-r--r--arch/powerpc/include/uapi/asm/eeh.h13
-rw-r--r--arch/powerpc/include/uapi/asm/kvm.h13
-rw-r--r--arch/powerpc/include/uapi/asm/kvm_para.h13
-rw-r--r--arch/powerpc/include/uapi/asm/ps3fb.h13
-rw-r--r--arch/powerpc/kernel/eeh.c20
-rw-r--r--arch/powerpc/kernel/eeh_driver.c2
-rw-r--r--arch/powerpc/kernel/fadump.c11
-rw-r--r--arch/powerpc/kernel/rtas_flash.c64
-rw-r--r--arch/powerpc/kernel/smp.c25
-rw-r--r--arch/powerpc/kvm/trace_book3s.h1
-rw-r--r--arch/powerpc/mm/book3s64/radix_pgtable.c31
-rw-r--r--arch/powerpc/net/bpf_jit_comp64.c79
-rw-r--r--arch/powerpc/perf/hv-24x7.c8
-rw-r--r--arch/powerpc/platforms/512x/mpc512x_lpbfifo.c6
-rw-r--r--arch/powerpc/platforms/book3s/vas-api.c32
-rw-r--r--arch/powerpc/platforms/powernv/ocxl.c12
-rw-r--r--arch/powerpc/platforms/pseries/dlpar.c52
-rw-r--r--arch/powerpc/platforms/pseries/plpks-secvar.c104
-rw-r--r--arch/powerpc/sysdev/fsl_mpic_timer_wakeup.c14
-rw-r--r--arch/riscv/Kconfig1
-rw-r--r--arch/riscv/include/asm/kvm_aia.h2
-rw-r--r--arch/riscv/include/asm/kvm_gstage.h72
-rw-r--r--arch/riscv/include/asm/kvm_host.h105
-rw-r--r--arch/riscv/include/asm/kvm_mmu.h21
-rw-r--r--arch/riscv/include/asm/kvm_tlb.h84
-rw-r--r--arch/riscv/include/asm/kvm_vcpu_sbi.h12
-rw-r--r--arch/riscv/include/asm/kvm_vmid.h27
-rw-r--r--arch/riscv/include/uapi/asm/kvm.h1
-rw-r--r--arch/riscv/kvm/Kconfig1
-rw-r--r--arch/riscv/kvm/Makefile1
-rw-r--r--arch/riscv/kvm/aia_device.c6
-rw-r--r--arch/riscv/kvm/aia_imsic.c12
-rw-r--r--arch/riscv/kvm/gstage.c338
-rw-r--r--arch/riscv/kvm/main.c3
-rw-r--r--arch/riscv/kvm/mmu.c509
-rw-r--r--arch/riscv/kvm/tlb.c110
-rw-r--r--arch/riscv/kvm/vcpu.c48
-rw-r--r--arch/riscv/kvm/vcpu_exit.c20
-rw-r--r--arch/riscv/kvm/vcpu_onereg.c83
-rw-r--r--arch/riscv/kvm/vcpu_sbi.c49
-rw-r--r--arch/riscv/kvm/vcpu_sbi_replace.c17
-rw-r--r--arch/riscv/kvm/vcpu_sbi_sta.c3
-rw-r--r--arch/riscv/kvm/vcpu_sbi_v01.c25
-rw-r--r--arch/riscv/kvm/vm.c7
-rw-r--r--arch/riscv/kvm/vmid.c25
-rw-r--r--arch/riscv/mm/fault.c8
-rw-r--r--arch/s390/Kconfig1
-rw-r--r--arch/s390/appldata/appldata_base.c1
-rw-r--r--arch/s390/boot/Makefile6
-rw-r--r--arch/s390/boot/als.c2
-rw-r--r--arch/s390/boot/boot.h5
-rw-r--r--arch/s390/boot/ipl_data.c9
-rw-r--r--arch/s390/boot/startup.c2
-rw-r--r--arch/s390/boot/trampoline.S9
-rw-r--r--arch/s390/configs/debug_defconfig1
-rw-r--r--arch/s390/configs/defconfig1
-rw-r--r--arch/s390/crypto/arch_random.c1
-rw-r--r--arch/s390/crypto/sha_common.c1
-rw-r--r--arch/s390/include/asm/alternative.h6
-rw-r--r--arch/s390/include/asm/asm-const.h2
-rw-r--r--arch/s390/include/asm/cpu.h4
-rw-r--r--arch/s390/include/asm/cpu_mf-insn.h4
-rw-r--r--arch/s390/include/asm/ctlreg.h4
-rw-r--r--arch/s390/include/asm/dwarf.h4
-rw-r--r--arch/s390/include/asm/entry-common.h10
-rw-r--r--arch/s390/include/asm/extmem.h2
-rw-r--r--arch/s390/include/asm/fpu-insn-asm.h4
-rw-r--r--arch/s390/include/asm/fpu-insn.h4
-rw-r--r--arch/s390/include/asm/ftrace.h4
-rw-r--r--arch/s390/include/asm/irq.h4
-rw-r--r--arch/s390/include/asm/jump_label.h4
-rw-r--r--arch/s390/include/asm/kvm_host.h3
-rw-r--r--arch/s390/include/asm/lowcore.h6
-rw-r--r--arch/s390/include/asm/machine.h4
-rw-r--r--arch/s390/include/asm/mem_encrypt.h4
-rw-r--r--arch/s390/include/asm/nmi.h4
-rw-r--r--arch/s390/include/asm/nospec-branch.h4
-rw-r--r--arch/s390/include/asm/nospec-insn.h5
-rw-r--r--arch/s390/include/asm/page.h22
-rw-r--r--arch/s390/include/asm/processor.h4
-rw-r--r--arch/s390/include/asm/ptrace.h4
-rw-r--r--arch/s390/include/asm/purgatory.h4
-rw-r--r--arch/s390/include/asm/sclp.h4
-rw-r--r--arch/s390/include/asm/setup.h6
-rw-r--r--arch/s390/include/asm/sigp.h4
-rw-r--r--arch/s390/include/asm/skey.h32
-rw-r--r--arch/s390/include/asm/thread_info.h2
-rw-r--r--arch/s390/include/asm/timex.h13
-rw-r--r--arch/s390/include/asm/tpi.h4
-rw-r--r--arch/s390/include/asm/types.h4
-rw-r--r--arch/s390/include/asm/uaccess.h204
-rw-r--r--arch/s390/include/asm/vdso.h4
-rw-r--r--arch/s390/include/asm/vdso/getrandom.h4
-rw-r--r--arch/s390/include/asm/vdso/gettimeofday.h8
-rw-r--r--arch/s390/include/asm/vdso/time_data.h3
-rw-r--r--arch/s390/include/asm/vdso/vsyscall.h4
-rw-r--r--arch/s390/include/uapi/asm/ptrace.h5
-rw-r--r--arch/s390/include/uapi/asm/schid.h4
-rw-r--r--arch/s390/include/uapi/asm/types.h4
-rw-r--r--arch/s390/kernel/Makefile2
-rw-r--r--arch/s390/kernel/cpufeature.c1
-rw-r--r--arch/s390/kernel/crash_dump.c1
-rw-r--r--arch/s390/kernel/ctlreg.c1
-rw-r--r--arch/s390/kernel/dis.c1
-rw-r--r--arch/s390/kernel/early.c4
-rw-r--r--arch/s390/kernel/facility.c1
-rw-r--r--arch/s390/kernel/fpu.c2
-rw-r--r--arch/s390/kernel/nmi.c76
-rw-r--r--arch/s390/kernel/perf_cpum_cf.c1
-rw-r--r--arch/s390/kernel/perf_cpum_sf.c1
-rw-r--r--arch/s390/kernel/perf_event.c1
-rw-r--r--arch/s390/kernel/perf_pai_crypto.c3
-rw-r--r--arch/s390/kernel/perf_pai_ext.c1
-rw-r--r--arch/s390/kernel/process.c1
-rw-r--r--arch/s390/kernel/skey.c48
-rw-r--r--arch/s390/kernel/smp.c11
-rw-r--r--arch/s390/kernel/sthyi.c2
-rw-r--r--arch/s390/kernel/time.c121
-rw-r--r--arch/s390/kernel/topology.c10
-rw-r--r--arch/s390/kernel/unwind_bc.c2
-rw-r--r--arch/s390/kernel/uv.c1
-rw-r--r--arch/s390/kernel/vmlinux.lds.S7
-rw-r--r--arch/s390/kvm/interrupt.c1
-rw-r--r--arch/s390/kvm/kvm-s390.c52
-rw-r--r--arch/s390/kvm/pv.c2
-rw-r--r--arch/s390/kvm/vsie.c17
-rw-r--r--arch/s390/lib/delay.c1
-rw-r--r--arch/s390/lib/uaccess.c188
-rw-r--r--arch/s390/mm/gmap.c1
-rw-r--r--arch/s390/mm/gmap_helpers.c2
-rw-r--r--arch/s390/mm/pgalloc.c5
-rw-r--r--arch/s390/mm/pgtable.c1
-rw-r--r--arch/s390/net/bpf_jit.h55
-rw-r--r--arch/s390/net/bpf_jit_comp.c113
-rw-r--r--arch/s390/net/pnet.c1
-rw-r--r--arch/s390/pci/pci_bus.c1
-rw-r--r--arch/s390/pci/pci_kvm_hook.c2
-rw-r--r--arch/sh/Kconfig1
-rw-r--r--arch/sh/configs/titan_defconfig1
-rw-r--r--arch/sparc/Kconfig1
-rw-r--r--arch/sparc/include/uapi/asm/socket.h3
-rw-r--r--arch/um/Kconfig5
-rw-r--r--arch/um/drivers/Kconfig1
-rw-r--r--arch/um/drivers/rtc_user.c2
-rw-r--r--arch/um/drivers/vfio_kern.c62
-rw-r--r--arch/um/drivers/virt-pci.c45
-rw-r--r--arch/um/drivers/virtio_pcidev.c8
-rw-r--r--arch/um/include/asm/cpufeature.h4
-rw-r--r--arch/um/include/asm/current.h4
-rw-r--r--arch/um/include/asm/mmu_context.h9
-rw-r--r--arch/um/include/asm/page.h4
-rw-r--r--arch/um/include/asm/ptrace-generic.h2
-rw-r--r--arch/um/include/asm/thread_info.h8
-rw-r--r--arch/um/include/shared/as-layout.h2
-rw-r--r--arch/um/include/shared/skas/mm_id.h2
-rw-r--r--arch/um/include/shared/skas/skas.h1
-rw-r--r--arch/um/kernel/exec.c2
-rw-r--r--arch/um/kernel/process.c20
-rw-r--r--arch/um/kernel/ptrace.c9
-rw-r--r--arch/um/kernel/skas/mmu.c4
-rw-r--r--arch/um/kernel/skas/process.c2
-rw-r--r--arch/um/kernel/skas/syscall.c11
-rw-r--r--arch/um/os-Linux/skas/process.c35
-rw-r--r--arch/x86/Kconfig2
-rw-r--r--arch/x86/boot/Makefile2
-rw-r--r--arch/x86/boot/compressed/Makefile5
-rw-r--r--arch/x86/boot/compressed/sbat.S7
-rw-r--r--arch/x86/boot/compressed/vmlinux.lds.S8
-rw-r--r--arch/x86/boot/header.S31
-rw-r--r--arch/x86/coco/sev/core.c89
-rw-r--r--arch/x86/coco/sev/vc-handle.c9
-rw-r--r--arch/x86/configs/i386_defconfig19
-rw-r--r--arch/x86/configs/x86_64_defconfig9
-rw-r--r--arch/x86/events/intel/uncore.c7
-rw-r--r--arch/x86/events/intel/uncore.h2
-rw-r--r--arch/x86/events/intel/uncore_discovery.c89
-rw-r--r--arch/x86/events/intel/uncore_discovery.h7
-rw-r--r--arch/x86/events/intel/uncore_snb.c79
-rw-r--r--arch/x86/events/intel/uncore_snbep.c4
-rw-r--r--arch/x86/include/asm/apic.h66
-rw-r--r--arch/x86/include/asm/cpufeatures.h3
-rw-r--r--arch/x86/include/asm/fpu/types.h49
-rw-r--r--arch/x86/include/asm/fpu/xstate.h9
-rw-r--r--arch/x86/include/asm/init.h2
-rw-r--r--arch/x86/include/asm/irq_remapping.h17
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h5
-rw-r--r--arch/x86/include/asm/kvm_host.h76
-rw-r--r--arch/x86/include/asm/msr-index.h7
-rw-r--r--arch/x86/include/asm/pgtable_types.h3
-rw-r--r--arch/x86/include/asm/sev.h14
-rw-r--r--arch/x86/include/asm/smp.h23
-rw-r--r--arch/x86/include/asm/special_insns.h29
-rw-r--r--arch/x86/include/asm/svm.h13
-rw-r--r--arch/x86/kernel/apic/vector.c4
-rw-r--r--arch/x86/kernel/cpu/amd.c4
-rw-r--r--arch/x86/kernel/cpu/bugs.c465
-rw-r--r--arch/x86/kernel/cpu/common.c7
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c14
-rw-r--r--arch/x86/kernel/fpu/core.c53
-rw-r--r--arch/x86/kernel/fpu/init.c1
-rw-r--r--arch/x86/kernel/fpu/xstate.c40
-rw-r--r--arch/x86/kernel/itmt.c23
-rw-r--r--arch/x86/kernel/process.c20
-rw-r--r--arch/x86/kernel/process_64.c4
-rw-r--r--arch/x86/kernel/setup.c4
-rw-r--r--arch/x86/kernel/smpboot.c51
-rw-r--r--arch/x86/kvm/Kconfig10
-rw-r--r--arch/x86/kvm/Makefile7
-rw-r--r--arch/x86/kvm/cpuid.c1
-rw-r--r--arch/x86/kvm/hyperv.c10
-rw-r--r--arch/x86/kvm/hyperv.h3
-rw-r--r--arch/x86/kvm/i8254.c90
-rw-r--r--arch/x86/kvm/i8254.h17
-rw-r--r--arch/x86/kvm/i8259.c17
-rw-r--r--arch/x86/kvm/ioapic.c55
-rw-r--r--arch/x86/kvm/ioapic.h24
-rw-r--r--arch/x86/kvm/irq.c560
-rw-r--r--arch/x86/kvm/irq.h35
-rw-r--r--arch/x86/kvm/irq_comm.c469
-rw-r--r--arch/x86/kvm/lapic.c104
-rw-r--r--arch/x86/kvm/lapic.h26
-rw-r--r--arch/x86/kvm/mmu/mmu.c75
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h3
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h8
-rw-r--r--arch/x86/kvm/mmu/spte.c43
-rw-r--r--arch/x86/kvm/mmu/spte.h10
-rw-r--r--arch/x86/kvm/svm/avic.c688
-rw-r--r--arch/x86/kvm/svm/nested.c128
-rw-r--r--arch/x86/kvm/svm/sev.c149
-rw-r--r--arch/x86/kvm/svm/svm.c506
-rw-r--r--arch/x86/kvm/svm/svm.h137
-rw-r--r--arch/x86/kvm/trace.h99
-rw-r--r--arch/x86/kvm/vmx/capabilities.h1
-rw-r--r--arch/x86/kvm/vmx/common.h2
-rw-r--r--arch/x86/kvm/vmx/main.c61
-rw-r--r--arch/x86/kvm/vmx/nested.c27
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c8
-rw-r--r--arch/x86/kvm/vmx/posted_intr.c140
-rw-r--r--arch/x86/kvm/vmx/posted_intr.h10
-rw-r--r--arch/x86/kvm/vmx/run_flags.h10
-rw-r--r--arch/x86/kvm/vmx/tdx.c71
-rw-r--r--arch/x86/kvm/vmx/tdx.h1
-rw-r--r--arch/x86/kvm/vmx/vmx.c296
-rw-r--r--arch/x86/kvm/vmx/vmx.h57
-rw-r--r--arch/x86/kvm/vmx/x86_ops.h16
-rw-r--r--arch/x86/kvm/x86.c392
-rw-r--r--arch/x86/kvm/x86.h40
-rw-r--r--arch/x86/lib/cache-smp.c26
-rw-r--r--arch/x86/mm/extable.c5
-rw-r--r--arch/x86/mm/pti.c4
-rw-r--r--arch/x86/net/bpf_jit_comp.c10
-rw-r--r--arch/x86/platform/efi/efi_64.c4
-rw-r--r--arch/x86/tools/insn_decoder_test.c2
-rw-r--r--arch/x86/tools/insn_sanity.c4
-rw-r--r--arch/x86/um/asm/syscall.h2
-rw-r--r--arch/x86/um/shared/sysdep/ptrace.h12
-rw-r--r--arch/x86/um/shared/sysdep/syscalls.h6
-rw-r--r--arch/x86/um/shared/sysdep/syscalls_32.h14
-rw-r--r--arch/x86/um/shared/sysdep/syscalls_64.h28
-rw-r--r--arch/x86/um/tls_32.c2
442 files changed, 9290 insertions, 6155 deletions
diff --git a/arch/Kconfig b/arch/Kconfig
index e133c7d1b48f..3d81c5f1ba2d 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1772,4 +1772,7 @@ config ARCH_WANTS_PRE_LINK_VMLINUX
An architecture can select this if it provides arch/<arch>/tools/Makefile
with .arch.vmlinux.o target to be linked into vmlinux.
+config ARCH_HAS_CPU_ATTACK_VECTORS
+ bool
+
endmenu
diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index 8f1f18adcdb5..5ef57f88df6b 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -152,6 +152,9 @@
#define SO_PASSRIGHTS 83
+#define SO_INQ 84
+#define SCM_INQ SO_INQ
+
#if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index a6e80653abd1..b1f3df39ed40 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -100,12 +100,12 @@ config ARM
select HAVE_BUILDTIME_MCOUNT_SORT
select HAVE_DEBUG_KMEMLEAK if !XIP_KERNEL
select HAVE_DMA_CONTIGUOUS if MMU
+ select HAVE_EXTRA_IPI_TRACEPOINTS
select HAVE_DYNAMIC_FTRACE if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU
select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE
select HAVE_EFFICIENT_UNALIGNED_ACCESS if (CPU_V6 || CPU_V6K || CPU_V7) && MMU
select HAVE_EXIT_THREAD
select HAVE_GUP_FAST if ARM_LPAE
- select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL
select HAVE_FUNCTION_ERROR_INJECTION
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER if !XIP_KERNEL
diff --git a/arch/arm/boot/dts/intel/ixp/intel-ixp42x-linksys-wrv54g.dts b/arch/arm/boot/dts/intel/ixp/intel-ixp42x-linksys-wrv54g.dts
index 98275a363c57..cb1842c83ac8 100644
--- a/arch/arm/boot/dts/intel/ixp/intel-ixp42x-linksys-wrv54g.dts
+++ b/arch/arm/boot/dts/intel/ixp/intel-ixp42x-linksys-wrv54g.dts
@@ -72,10 +72,55 @@
cs-gpios = <&gpio0 5 GPIO_ACTIVE_LOW>;
num-chipselects = <1>;
- switch@0 {
+ ethernet-switch@0 {
compatible = "micrel,ks8995";
reg = <0>;
spi-max-frequency = <50000000>;
+
+ /*
+ * The PHYs are accessed over the external MDIO
+ * bus and not internally through the switch control
+ * registers.
+ */
+ ethernet-ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ ethernet-port@0 {
+ reg = <0>;
+ label = "1";
+ phy-mode = "mii";
+ phy-handle = <&phy1>;
+ };
+ ethernet-port@1 {
+ reg = <1>;
+ label = "2";
+ phy-mode = "mii";
+ phy-handle = <&phy2>;
+ };
+ ethernet-port@2 {
+ reg = <2>;
+ label = "3";
+ phy-mode = "mii";
+ phy-handle = <&phy3>;
+ };
+ ethernet-port@3 {
+ reg = <3>;
+ label = "4";
+ phy-mode = "mii";
+ phy-handle = <&phy4>;
+ };
+ ethernet-port@4 {
+ reg = <4>;
+ ethernet = <&ethb>;
+ phy-mode = "mii";
+ fixed-link {
+ speed = <100>;
+ full-duplex;
+ };
+ };
+
+ };
};
};
@@ -135,40 +180,59 @@
};
/*
- * EthB - connected to the KS8995 switch ports 1-4
- * FIXME: the boardfile defines .phy_mask = 0x1e for this port to enable output to
- * all four switch ports, also using an out of tree multiphy patch.
- * Do we need a new binding and property for this?
+ * EthB connects to the KS8995 CPU port and faces ports 1-4
+ * through the switch fabric.
+ *
+ * To complicate things, the MDIO channel is also only
+ * accessible through EthB, but used independently for PHY
+ * control.
*/
- ethernet@c8009000 {
+ ethb: ethernet@c8009000 {
status = "okay";
queue-rx = <&qmgr 3>;
queue-txready = <&qmgr 20>;
- phy-mode = "rgmii";
- phy-handle = <&phy4>;
+ phy-mode = "mii";
+ fixed-link {
+ speed = <100>;
+ full-duplex;
+ };
mdio {
#address-cells = <1>;
#size-cells = <0>;
- /* Should be ports 1-4 on the KS8995 switch */
+ /*
+ * LAN ports 1-4 on the KS8995 switch
+ * and PHY5 for WAN need to be accessed
+ * through this external MDIO channel.
+ */
+ phy1: ethernet-phy@1 {
+ reg = <1>;
+ };
+ phy2: ethernet-phy@2 {
+ reg = <2>;
+ };
+ phy3: ethernet-phy@3 {
+ reg = <3>;
+ };
phy4: ethernet-phy@4 {
reg = <4>;
};
-
- /* Should be port 5 on the KS8995 switch */
phy5: ethernet-phy@5 {
reg = <5>;
};
};
};
- /* EthC - connected to KS8995 switch port 5 */
- ethernet@c800a000 {
+ /*
+ * EthC connects to MII-P5 on the KS8995 bypassing
+ * all of the switch logic and facing PHY5
+ */
+ ethc: ethernet@c800a000 {
status = "okay";
queue-rx = <&qmgr 4>;
queue-txready = <&qmgr 21>;
- phy-mode = "rgmii";
+ phy-mode = "mii";
phy-handle = <&phy5>;
};
};
diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig
index 1491add30093..939913ed9a73 100644
--- a/arch/arm/configs/omap2plus_defconfig
+++ b/arch/arm/configs/omap2plus_defconfig
@@ -142,7 +142,6 @@ CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
CONFIG_NETFILTER_XT_MATCH_CPU=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m
CONFIG_NETFILTER_XT_MATCH_DSCP=m
CONFIG_NETFILTER_XT_MATCH_ESP=m
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 7c456aa838b9..3c117b1fa198 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -127,6 +127,7 @@ config ARM64
select ARM_GIC_V2M if PCI
select ARM_GIC_V3
select ARM_GIC_V3_ITS if PCI
+ select ARM_GIC_V5
select ARM_PSCI_FW
select BUILDTIME_TABLE_SORT
select CLONE_BACKWARDS
@@ -134,6 +135,7 @@ config ARM64
select CPU_PM if (SUSPEND || CPU_IDLE)
select CPUMASK_OFFSTACK if NR_CPUS > 256
select DCACHE_WORD_ACCESS
+ select HAVE_EXTRA_IPI_TRACEPOINTS
select DYNAMIC_FTRACE if FUNCTION_TRACER
select DMA_BOUNCE_UNALIGNED_KMALLOC
select DMA_DIRECT_REMAP
@@ -221,7 +223,6 @@ config ARM64
select HAVE_EFFICIENT_UNALIGNED_ACCESS
select HAVE_GUP_FAST
select HAVE_FTRACE_GRAPH_FUNC
- select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_TRACER
select HAVE_FUNCTION_ERROR_INJECTION
select HAVE_FUNCTION_GRAPH_FREGS
@@ -232,6 +233,7 @@ config ARM64
select HAVE_HW_BREAKPOINT if PERF_EVENTS
select HAVE_IOREMAP_PROT
select HAVE_IRQ_TIME_ACCOUNTING
+ select HAVE_LIVEPATCH
select HAVE_MOD_ARCH_SPECIFIC
select HAVE_NMI
select HAVE_PERF_EVENTS
@@ -240,6 +242,7 @@ config ARM64
select HAVE_PERF_USER_STACK_DUMP
select HAVE_PREEMPT_DYNAMIC_KEY
select HAVE_REGS_AND_STACK_ACCESS_API
+ select HAVE_RELIABLE_STACKTRACE
select HAVE_POSIX_CPU_TIMERS_TASK_WORK
select HAVE_FUNCTION_ARG_ACCESS_API
select MMU_GATHER_RCU_TABLE_FREE
@@ -278,6 +281,7 @@ config ARM64
select HAVE_SOFTIRQ_ON_OWN_STACK
select USER_STACKTRACE_SUPPORT
select VDSO_GETRANDOM
+ select VMAP_STACK
help
ARM 64-bit (AArch64) Linux support.
@@ -2498,3 +2502,4 @@ source "drivers/acpi/Kconfig"
source "arch/arm64/kvm/Kconfig"
+source "kernel/livepatch/Kconfig"
diff --git a/arch/arm64/boot/dts/mediatek/mt8370.dtsi b/arch/arm64/boot/dts/mediatek/mt8370.dtsi
index cf1a3759451f..7ac8b8d03494 100644
--- a/arch/arm64/boot/dts/mediatek/mt8370.dtsi
+++ b/arch/arm64/boot/dts/mediatek/mt8370.dtsi
@@ -59,6 +59,22 @@
<&cpu3 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>;
};
+/*
+ * Please note that overriding compatibles is a discouraged practice and is a
+ * clear indication of nodes not being, well, compatible!
+ *
+ * This is a special case, where the GPU is the same as MT8188, but with one
+ * of the cores fused out in this lower-binned SoC.
+ */
+&gpu {
+ compatible = "mediatek,mt8370-mali", "arm,mali-valhall-jm";
+
+ power-domains = <&spm MT8188_POWER_DOMAIN_MFG2>,
+ <&spm MT8188_POWER_DOMAIN_MFG3>;
+
+ power-domain-names = "core0", "core1";
+};
+
&ppi_cluster0 {
affinity = <&cpu0 &cpu1 &cpu2 &cpu3>;
};
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index c56c21bb1eec..23be85d93348 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -58,7 +58,7 @@
.macro disable_step_tsk, flgs, tmp
tbz \flgs, #TIF_SINGLESTEP, 9990f
mrs \tmp, mdscr_el1
- bic \tmp, \tmp, #DBG_MDSCR_SS
+ bic \tmp, \tmp, #MDSCR_EL1_SS
msr mdscr_el1, \tmp
isb // Take effect before a subsequent clear of DAIF.D
9990:
@@ -68,7 +68,7 @@
.macro enable_step_tsk, flgs, tmp
tbz \flgs, #TIF_SINGLESTEP, 9990f
mrs \tmp, mdscr_el1
- orr \tmp, \tmp, #DBG_MDSCR_SS
+ orr \tmp, \tmp, #MDSCR_EL1_SS
msr mdscr_el1, \tmp
9990:
.endm
diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
index 1ca947d5c939..f5801b0ba9e9 100644
--- a/arch/arm64/include/asm/barrier.h
+++ b/arch/arm64/include/asm/barrier.h
@@ -44,6 +44,9 @@
SB_BARRIER_INSN"nop\n", \
ARM64_HAS_SB))
+#define gsb_ack() asm volatile(GSB_ACK_BARRIER_INSN : : : "memory")
+#define gsb_sys() asm volatile(GSB_SYS_BARRIER_INSN : : : "memory")
+
#ifdef CONFIG_ARM64_PSEUDO_NMI
#define pmr_sync() \
do { \
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index c4326f1cb917..bf13d676aae2 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -275,6 +275,14 @@ extern struct arm64_ftr_reg arm64_ftr_reg_ctrel0;
#define ARM64_CPUCAP_OPTIONAL_FOR_LATE_CPU ((u16)BIT(5))
/* Panic when a conflict is detected */
#define ARM64_CPUCAP_PANIC_ON_CONFLICT ((u16)BIT(6))
+/*
+ * When paired with SCOPE_LOCAL_CPU, all early CPUs must satisfy the
+ * condition. This is different from SCOPE_SYSTEM where the check is performed
+ * only once at the end of the SMP boot on the sanitised ID registers.
+ * SCOPE_SYSTEM is not suitable for cases where the capability depends on
+ * properties local to a CPU like MIDR_EL1.
+ */
+#define ARM64_CPUCAP_MATCH_ALL_EARLY_CPUS ((u16)BIT(7))
/*
* CPU errata workarounds that need to be enabled at boot time if one or
@@ -304,6 +312,16 @@ extern struct arm64_ftr_reg arm64_ftr_reg_ctrel0;
(ARM64_CPUCAP_SCOPE_LOCAL_CPU | \
ARM64_CPUCAP_OPTIONAL_FOR_LATE_CPU | \
ARM64_CPUCAP_PERMITTED_FOR_LATE_CPU)
+/*
+ * CPU feature detected at boot time and present on all early CPUs. Late CPUs
+ * are permitted to have the feature even if it hasn't been enabled, although
+ * the feature will not be used by Linux in this case. If all early CPUs have
+ * the feature, then every late CPU must have it.
+ */
+#define ARM64_CPUCAP_EARLY_LOCAL_CPU_FEATURE \
+ (ARM64_CPUCAP_SCOPE_LOCAL_CPU | \
+ ARM64_CPUCAP_PERMITTED_FOR_LATE_CPU | \
+ ARM64_CPUCAP_MATCH_ALL_EARLY_CPUS)
/*
* CPU feature detected at boot time, on one or more CPUs. A late CPU
@@ -391,6 +409,11 @@ static inline int cpucap_default_scope(const struct arm64_cpu_capabilities *cap)
return cap->type & ARM64_CPUCAP_SCOPE_MASK;
}
+static inline bool cpucap_match_all_early_cpus(const struct arm64_cpu_capabilities *cap)
+{
+ return cap->type & ARM64_CPUCAP_MATCH_ALL_EARLY_CPUS;
+}
+
/*
* Generic helper for handling capabilities with multiple (match,enable) pairs
* of call backs, sharing the same capability bit.
@@ -848,6 +871,11 @@ static inline bool system_supports_pmuv3(void)
return cpus_have_final_cap(ARM64_HAS_PMUV3);
}
+static inline bool system_supports_bbml2_noabort(void)
+{
+ return alternative_has_cap_unlikely(ARM64_HAS_BBML2_NOABORT);
+}
+
int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
bool try_emulate_mrs(struct pt_regs *regs, u32 isn);
diff --git a/arch/arm64/include/asm/debug-monitors.h b/arch/arm64/include/asm/debug-monitors.h
index 8f6ba31b8658..f5e3ed2420ce 100644
--- a/arch/arm64/include/asm/debug-monitors.h
+++ b/arch/arm64/include/asm/debug-monitors.h
@@ -13,14 +13,8 @@
#include <asm/ptrace.h>
/* Low-level stepping controls. */
-#define DBG_MDSCR_SS (1 << 0)
#define DBG_SPSR_SS (1 << 21)
-/* MDSCR_EL1 enabling bits */
-#define DBG_MDSCR_KDE (1 << 13)
-#define DBG_MDSCR_MDE (1 << 15)
-#define DBG_MDSCR_MASK ~(DBG_MDSCR_KDE | DBG_MDSCR_MDE)
-
#define DBG_ESR_EVT(x) (((x) >> 27) & 0x7)
/* AArch64 */
@@ -62,30 +56,6 @@ struct task_struct;
#define DBG_HOOK_HANDLED 0
#define DBG_HOOK_ERROR 1
-struct step_hook {
- struct list_head node;
- int (*fn)(struct pt_regs *regs, unsigned long esr);
-};
-
-void register_user_step_hook(struct step_hook *hook);
-void unregister_user_step_hook(struct step_hook *hook);
-
-void register_kernel_step_hook(struct step_hook *hook);
-void unregister_kernel_step_hook(struct step_hook *hook);
-
-struct break_hook {
- struct list_head node;
- int (*fn)(struct pt_regs *regs, unsigned long esr);
- u16 imm;
- u16 mask; /* These bits are ignored when comparing with imm */
-};
-
-void register_user_break_hook(struct break_hook *hook);
-void unregister_user_break_hook(struct break_hook *hook);
-
-void register_kernel_break_hook(struct break_hook *hook);
-void unregister_kernel_break_hook(struct break_hook *hook);
-
u8 debug_monitors_arch(void);
enum dbg_active_el {
@@ -108,17 +78,15 @@ void kernel_rewind_single_step(struct pt_regs *regs);
void kernel_fastforward_single_step(struct pt_regs *regs);
#ifdef CONFIG_HAVE_HW_BREAKPOINT
-int reinstall_suspended_bps(struct pt_regs *regs);
+bool try_step_suspended_breakpoints(struct pt_regs *regs);
#else
-static inline int reinstall_suspended_bps(struct pt_regs *regs)
+static inline bool try_step_suspended_breakpoints(struct pt_regs *regs)
{
- return -ENODEV;
+ return false;
}
#endif
-int aarch32_break_handler(struct pt_regs *regs);
-
-void debug_traps_init(void);
+bool try_handle_aarch32_break(struct pt_regs *regs);
#endif /* __ASSEMBLY */
#endif /* __ASM_DEBUG_MONITORS_H */
diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h
index 9f38340d24c2..46033027510c 100644
--- a/arch/arm64/include/asm/el2_setup.h
+++ b/arch/arm64/include/asm/el2_setup.h
@@ -165,6 +165,50 @@
.Lskip_gicv3_\@:
.endm
+/* GICv5 system register access */
+.macro __init_el2_gicv5
+ mrs_s x0, SYS_ID_AA64PFR2_EL1
+ ubfx x0, x0, #ID_AA64PFR2_EL1_GCIE_SHIFT, #4
+ cbz x0, .Lskip_gicv5_\@
+
+ mov x0, #(ICH_HFGITR_EL2_GICRCDNMIA | \
+ ICH_HFGITR_EL2_GICRCDIA | \
+ ICH_HFGITR_EL2_GICCDDI | \
+ ICH_HFGITR_EL2_GICCDEOI | \
+ ICH_HFGITR_EL2_GICCDHM | \
+ ICH_HFGITR_EL2_GICCDRCFG | \
+ ICH_HFGITR_EL2_GICCDPEND | \
+ ICH_HFGITR_EL2_GICCDAFF | \
+ ICH_HFGITR_EL2_GICCDPRI | \
+ ICH_HFGITR_EL2_GICCDDIS | \
+ ICH_HFGITR_EL2_GICCDEN)
+ msr_s SYS_ICH_HFGITR_EL2, x0 // Disable instruction traps
+ mov_q x0, (ICH_HFGRTR_EL2_ICC_PPI_ACTIVERn_EL1 | \
+ ICH_HFGRTR_EL2_ICC_PPI_PRIORITYRn_EL1 | \
+ ICH_HFGRTR_EL2_ICC_PPI_PENDRn_EL1 | \
+ ICH_HFGRTR_EL2_ICC_PPI_ENABLERn_EL1 | \
+ ICH_HFGRTR_EL2_ICC_PPI_HMRn_EL1 | \
+ ICH_HFGRTR_EL2_ICC_IAFFIDR_EL1 | \
+ ICH_HFGRTR_EL2_ICC_ICSR_EL1 | \
+ ICH_HFGRTR_EL2_ICC_PCR_EL1 | \
+ ICH_HFGRTR_EL2_ICC_HPPIR_EL1 | \
+ ICH_HFGRTR_EL2_ICC_HAPR_EL1 | \
+ ICH_HFGRTR_EL2_ICC_CR0_EL1 | \
+ ICH_HFGRTR_EL2_ICC_IDRn_EL1 | \
+ ICH_HFGRTR_EL2_ICC_APR_EL1)
+ msr_s SYS_ICH_HFGRTR_EL2, x0 // Disable reg read traps
+ mov_q x0, (ICH_HFGWTR_EL2_ICC_PPI_ACTIVERn_EL1 | \
+ ICH_HFGWTR_EL2_ICC_PPI_PRIORITYRn_EL1 | \
+ ICH_HFGWTR_EL2_ICC_PPI_PENDRn_EL1 | \
+ ICH_HFGWTR_EL2_ICC_PPI_ENABLERn_EL1 | \
+ ICH_HFGWTR_EL2_ICC_ICSR_EL1 | \
+ ICH_HFGWTR_EL2_ICC_PCR_EL1 | \
+ ICH_HFGWTR_EL2_ICC_CR0_EL1 | \
+ ICH_HFGWTR_EL2_ICC_APR_EL1)
+ msr_s SYS_ICH_HFGWTR_EL2, x0 // Disable reg write traps
+.Lskip_gicv5_\@:
+.endm
+
.macro __init_el2_hstr
msr hstr_el2, xzr // Disable CP15 traps to EL2
.endm
@@ -189,6 +233,28 @@
.Lskip_set_cptr_\@:
.endm
+/*
+ * Configure BRBE to permit recording cycle counts and branch mispredicts.
+ *
+ * At any EL, to record cycle counts BRBE requires that both BRBCR_EL2.CC=1 and
+ * BRBCR_EL1.CC=1.
+ *
+ * At any EL, to record branch mispredicts BRBE requires that both
+ * BRBCR_EL2.MPRED=1 and BRBCR_EL1.MPRED=1.
+ *
+ * Set {CC,MPRED} in BRBCR_EL2 in case nVHE mode is used and we are
+ * executing in EL1.
+ */
+.macro __init_el2_brbe
+ mrs x1, id_aa64dfr0_el1
+ ubfx x1, x1, #ID_AA64DFR0_EL1_BRBE_SHIFT, #4
+ cbz x1, .Lskip_brbe_\@
+
+ mov_q x0, BRBCR_ELx_CC | BRBCR_ELx_MPRED
+ msr_s SYS_BRBCR_EL2, x0
+.Lskip_brbe_\@:
+.endm
+
/* Disable any fine grained traps */
.macro __init_el2_fgt
mrs x1, id_aa64mmfr0_el1
@@ -196,20 +262,62 @@
cbz x1, .Lskip_fgt_\@
mov x0, xzr
+ mov x2, xzr
mrs x1, id_aa64dfr0_el1
ubfx x1, x1, #ID_AA64DFR0_EL1_PMSVer_SHIFT, #4
cmp x1, #3
b.lt .Lskip_spe_fgt_\@
/* Disable PMSNEVFR_EL1 read and write traps */
- orr x0, x0, #(1 << 62)
+ orr x0, x0, #HDFGRTR_EL2_nPMSNEVFR_EL1_MASK
+ orr x2, x2, #HDFGWTR_EL2_nPMSNEVFR_EL1_MASK
.Lskip_spe_fgt_\@:
+ mrs x1, id_aa64dfr0_el1
+ ubfx x1, x1, #ID_AA64DFR0_EL1_BRBE_SHIFT, #4
+ cbz x1, .Lskip_brbe_fgt_\@
+
+ /*
+ * Disable read traps for the following registers
+ *
+ * [BRBSRC|BRBTGT|RBINF]_EL1
+ * [BRBSRCINJ|BRBTGTINJ|BRBINFINJ|BRBTS]_EL1
+ */
+ orr x0, x0, #HDFGRTR_EL2_nBRBDATA_MASK
+
+ /*
+ * Disable write traps for the following registers
+ *
+ * [BRBSRCINJ|BRBTGTINJ|BRBINFINJ|BRBTS]_EL1
+ */
+ orr x2, x2, #HDFGWTR_EL2_nBRBDATA_MASK
+
+ /* Disable read and write traps for [BRBCR|BRBFCR]_EL1 */
+ orr x0, x0, #HDFGRTR_EL2_nBRBCTL_MASK
+ orr x2, x2, #HDFGWTR_EL2_nBRBCTL_MASK
+
+ /* Disable read traps for BRBIDR_EL1 */
+ orr x0, x0, #HDFGRTR_EL2_nBRBIDR_MASK
+
+.Lskip_brbe_fgt_\@:
.Lset_debug_fgt_\@:
msr_s SYS_HDFGRTR_EL2, x0
- msr_s SYS_HDFGWTR_EL2, x0
+ msr_s SYS_HDFGWTR_EL2, x2
mov x0, xzr
+ mov x2, xzr
+
+ mrs x1, id_aa64dfr0_el1
+ ubfx x1, x1, #ID_AA64DFR0_EL1_BRBE_SHIFT, #4
+ cbz x1, .Lskip_brbe_insn_fgt_\@
+
+ /* Disable traps for BRBIALL instruction */
+ orr x2, x2, #HFGITR_EL2_nBRBIALL_MASK
+
+ /* Disable traps for BRBINJ instruction */
+ orr x2, x2, #HFGITR_EL2_nBRBINJ_MASK
+
+.Lskip_brbe_insn_fgt_\@:
mrs x1, id_aa64pfr1_el1
ubfx x1, x1, #ID_AA64PFR1_EL1_SME_SHIFT, #4
cbz x1, .Lskip_sme_fgt_\@
@@ -250,7 +358,7 @@
.Lset_fgt_\@:
msr_s SYS_HFGRTR_EL2, x0
msr_s SYS_HFGWTR_EL2, x0
- msr_s SYS_HFGITR_EL2, xzr
+ msr_s SYS_HFGITR_EL2, x2
mrs x1, id_aa64pfr0_el1 // AMU traps UNDEF without AMU
ubfx x1, x1, #ID_AA64PFR0_EL1_AMU_SHIFT, #4
@@ -300,9 +408,11 @@
__init_el2_hcrx
__init_el2_timers
__init_el2_debug
+ __init_el2_brbe
__init_el2_lor
__init_el2_stage2
__init_el2_gicv3
+ __init_el2_gicv5
__init_el2_hstr
__init_el2_nvhe_idregs
__init_el2_cptr
diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
index d48fc16584cd..e3874c4fc399 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -59,8 +59,20 @@ void do_el0_bti(struct pt_regs *regs);
void do_el1_bti(struct pt_regs *regs, unsigned long esr);
void do_el0_gcs(struct pt_regs *regs, unsigned long esr);
void do_el1_gcs(struct pt_regs *regs, unsigned long esr);
-void do_debug_exception(unsigned long addr_if_watchpoint, unsigned long esr,
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+void do_breakpoint(unsigned long esr, struct pt_regs *regs);
+void do_watchpoint(unsigned long addr, unsigned long esr,
struct pt_regs *regs);
+#else
+static inline void do_breakpoint(unsigned long esr, struct pt_regs *regs) {}
+static inline void do_watchpoint(unsigned long addr, unsigned long esr,
+ struct pt_regs *regs) {}
+#endif /* CONFIG_HAVE_HW_BREAKPOINT */
+void do_el0_softstep(unsigned long esr, struct pt_regs *regs);
+void do_el1_softstep(unsigned long esr, struct pt_regs *regs);
+void do_el0_brk64(unsigned long esr, struct pt_regs *regs);
+void do_el1_brk64(unsigned long esr, struct pt_regs *regs);
+void do_bkpt32(unsigned long esr, struct pt_regs *regs);
void do_fpsimd_acc(unsigned long esr, struct pt_regs *regs);
void do_sve_acc(unsigned long esr, struct pt_regs *regs);
void do_sme_acc(unsigned long esr, struct pt_regs *regs);
diff --git a/arch/arm64/include/asm/gcs.h b/arch/arm64/include/asm/gcs.h
index f50660603ecf..5bc432234d3a 100644
--- a/arch/arm64/include/asm/gcs.h
+++ b/arch/arm64/include/asm/gcs.h
@@ -58,7 +58,7 @@ static inline u64 gcsss2(void)
static inline bool task_gcs_el0_enabled(struct task_struct *task)
{
- return current->thread.gcs_el0_mode & PR_SHADOW_STACK_ENABLE;
+ return task->thread.gcs_el0_mode & PR_SHADOW_STACK_ENABLE;
}
void gcs_set_el0_mode(struct task_struct *task);
diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h
index 1c3f9617d54f..13f94c8ddfc0 100644
--- a/arch/arm64/include/asm/hwcap.h
+++ b/arch/arm64/include/asm/hwcap.h
@@ -176,6 +176,8 @@
#define KERNEL_HWCAP_POE __khwcap2_feature(POE)
#define __khwcap3_feature(x) (const_ilog2(HWCAP3_ ## x) + 128)
+#define KERNEL_HWCAP_MTE_FAR __khwcap3_feature(MTE_FAR)
+#define KERNEL_HWCAP_MTE_STORE_ONLY __khwcap3_feature(MTE_STORE_ONLY)
/*
* This yields a mask that user programs can use to figure out what
diff --git a/arch/arm64/include/asm/kgdb.h b/arch/arm64/include/asm/kgdb.h
index 21fc85e9d2be..3184f5d1e3ae 100644
--- a/arch/arm64/include/asm/kgdb.h
+++ b/arch/arm64/include/asm/kgdb.h
@@ -24,6 +24,18 @@ static inline void arch_kgdb_breakpoint(void)
extern void kgdb_handle_bus_error(void);
extern int kgdb_fault_expected;
+int kgdb_brk_handler(struct pt_regs *regs, unsigned long esr);
+int kgdb_compiled_brk_handler(struct pt_regs *regs, unsigned long esr);
+#ifdef CONFIG_KGDB
+int kgdb_single_step_handler(struct pt_regs *regs, unsigned long esr);
+#else
+static inline int kgdb_single_step_handler(struct pt_regs *regs,
+ unsigned long esr)
+{
+ return DBG_HOOK_ERROR;
+}
+#endif
+
#endif /* !__ASSEMBLY__ */
/*
diff --git a/arch/arm64/include/asm/kprobes.h b/arch/arm64/include/asm/kprobes.h
index be7a3680dadf..f2782560647b 100644
--- a/arch/arm64/include/asm/kprobes.h
+++ b/arch/arm64/include/asm/kprobes.h
@@ -41,4 +41,12 @@ void __kretprobe_trampoline(void);
void __kprobes *trampoline_probe_handler(struct pt_regs *regs);
#endif /* CONFIG_KPROBES */
+
+int __kprobes kprobe_brk_handler(struct pt_regs *regs,
+ unsigned long esr);
+int __kprobes kprobe_ss_brk_handler(struct pt_regs *regs,
+ unsigned long esr);
+int __kprobes kretprobe_brk_handler(struct pt_regs *regs,
+ unsigned long esr);
+
#endif /* _ARM_KPROBES_H */
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 0720898f563e..fa8a08a1ccd5 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -45,16 +45,39 @@ bool kvm_condition_valid32(const struct kvm_vcpu *vcpu);
void kvm_skip_instr32(struct kvm_vcpu *vcpu);
void kvm_inject_undefined(struct kvm_vcpu *vcpu);
-void kvm_inject_vabt(struct kvm_vcpu *vcpu);
-void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr);
-void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr);
+int kvm_inject_serror_esr(struct kvm_vcpu *vcpu, u64 esr);
+int kvm_inject_sea(struct kvm_vcpu *vcpu, bool iabt, u64 addr);
void kvm_inject_size_fault(struct kvm_vcpu *vcpu);
+static inline int kvm_inject_sea_dabt(struct kvm_vcpu *vcpu, u64 addr)
+{
+ return kvm_inject_sea(vcpu, false, addr);
+}
+
+static inline int kvm_inject_sea_iabt(struct kvm_vcpu *vcpu, u64 addr)
+{
+ return kvm_inject_sea(vcpu, true, addr);
+}
+
+static inline int kvm_inject_serror(struct kvm_vcpu *vcpu)
+{
+ /*
+ * ESR_ELx.ISV (later renamed to IDS) indicates whether or not
+ * ESR_ELx.ISS contains IMPLEMENTATION DEFINED syndrome information.
+ *
+ * Set the bit when injecting an SError w/o an ESR to indicate ISS
+ * does not follow the architected format.
+ */
+ return kvm_inject_serror_esr(vcpu, ESR_ELx_ISV);
+}
+
void kvm_vcpu_wfi(struct kvm_vcpu *vcpu);
void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu);
int kvm_inject_nested_sync(struct kvm_vcpu *vcpu, u64 esr_el2);
int kvm_inject_nested_irq(struct kvm_vcpu *vcpu);
+int kvm_inject_nested_sea(struct kvm_vcpu *vcpu, bool iabt, u64 addr);
+int kvm_inject_nested_serror(struct kvm_vcpu *vcpu, u64 esr);
static inline void kvm_inject_nested_sve_trap(struct kvm_vcpu *vcpu)
{
@@ -195,6 +218,11 @@ static inline bool vcpu_el2_tge_is_set(const struct kvm_vcpu *vcpu)
return ctxt_sys_reg(&vcpu->arch.ctxt, HCR_EL2) & HCR_TGE;
}
+static inline bool vcpu_el2_amo_is_set(const struct kvm_vcpu *vcpu)
+{
+ return ctxt_sys_reg(&vcpu->arch.ctxt, HCR_EL2) & HCR_AMO;
+}
+
static inline bool is_hyp_ctxt(const struct kvm_vcpu *vcpu)
{
bool e2h, tge;
@@ -224,6 +252,20 @@ static inline bool vcpu_is_host_el0(const struct kvm_vcpu *vcpu)
return is_hyp_ctxt(vcpu) && !vcpu_is_el2(vcpu);
}
+static inline bool is_nested_ctxt(struct kvm_vcpu *vcpu)
+{
+ return vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu);
+}
+
+static inline bool vserror_state_is_nested(struct kvm_vcpu *vcpu)
+{
+ if (!is_nested_ctxt(vcpu))
+ return false;
+
+ return vcpu_el2_amo_is_set(vcpu) ||
+ (__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TMEA);
+}
+
/*
* The layout of SPSR for an AArch32 state is different when observed from an
* AArch64 SPSR_ELx or an AArch32 SPSR_*. This function generates the AArch32
@@ -627,6 +669,9 @@ static inline void vcpu_set_hcrx(struct kvm_vcpu *vcpu)
if (kvm_has_fpmr(kvm))
vcpu->arch.hcrx_el2 |= HCRX_EL2_EnFPM;
+
+ if (kvm_has_sctlr2(kvm))
+ vcpu->arch.hcrx_el2 |= HCRX_EL2_SCTLR2En;
}
}
#endif /* __ARM64_KVM_EMULATE_H__ */
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 3e41a880b062..2f2394cce24e 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -523,6 +523,7 @@ enum vcpu_sysreg {
/* Anything from this can be RES0/RES1 sanitised */
MARKER(__SANITISED_REG_START__),
TCR2_EL2, /* Extended Translation Control Register (EL2) */
+ SCTLR2_EL2, /* System Control Register 2 (EL2) */
MDCR_EL2, /* Monitor Debug Configuration Register (EL2) */
CNTHCTL_EL2, /* Counter-timer Hypervisor Control register */
@@ -537,6 +538,7 @@ enum vcpu_sysreg {
VNCR(TTBR1_EL1),/* Translation Table Base Register 1 */
VNCR(TCR_EL1), /* Translation Control Register */
VNCR(TCR2_EL1), /* Extended Translation Control Register */
+ VNCR(SCTLR2_EL1), /* System Control Register 2 */
VNCR(ESR_EL1), /* Exception Syndrome Register */
VNCR(AFSR0_EL1),/* Auxiliary Fault Status Register 0 */
VNCR(AFSR1_EL1),/* Auxiliary Fault Status Register 1 */
@@ -565,6 +567,10 @@ enum vcpu_sysreg {
VNCR(POR_EL1), /* Permission Overlay Register 1 (EL1) */
+ /* FEAT_RAS registers */
+ VNCR(VDISR_EL2),
+ VNCR(VSESR_EL2),
+
VNCR(HFGRTR_EL2),
VNCR(HFGWTR_EL2),
VNCR(HFGITR_EL2),
@@ -704,6 +710,7 @@ struct kvm_host_data {
#define KVM_HOST_DATA_FLAG_EL1_TRACING_CONFIGURED 5
#define KVM_HOST_DATA_FLAG_VCPU_IN_HYP_CONTEXT 6
#define KVM_HOST_DATA_FLAG_L1_VNCR_MAPPED 7
+#define KVM_HOST_DATA_FLAG_HAS_BRBE 8
unsigned long flags;
struct kvm_cpu_context host_ctxt;
@@ -737,6 +744,7 @@ struct kvm_host_data {
u64 trfcr_el1;
/* Values of trap registers for the host before guest entry. */
u64 mdcr_el2;
+ u64 brbcr_el1;
} host_debug_state;
/* Guest trace filter value */
@@ -817,7 +825,7 @@ struct kvm_vcpu_arch {
u8 iflags;
/* State flags for kernel bookkeeping, unused by the hypervisor code */
- u8 sflags;
+ u16 sflags;
/*
* Don't run the guest (internal implementation need).
@@ -953,9 +961,21 @@ struct kvm_vcpu_arch {
__vcpu_flags_preempt_enable(); \
} while (0)
+#define __vcpu_test_and_clear_flag(v, flagset, f, m) \
+ ({ \
+ typeof(v->arch.flagset) set; \
+ \
+ set = __vcpu_get_flag(v, flagset, f, m); \
+ __vcpu_clear_flag(v, flagset, f, m); \
+ \
+ set; \
+ })
+
#define vcpu_get_flag(v, ...) __vcpu_get_flag((v), __VA_ARGS__)
#define vcpu_set_flag(v, ...) __vcpu_set_flag((v), __VA_ARGS__)
#define vcpu_clear_flag(v, ...) __vcpu_clear_flag((v), __VA_ARGS__)
+#define vcpu_test_and_clear_flag(v, ...) \
+ __vcpu_test_and_clear_flag((v), __VA_ARGS__)
/* KVM_ARM_VCPU_INIT completed */
#define VCPU_INITIALIZED __vcpu_single_flag(cflags, BIT(0))
@@ -1015,6 +1035,8 @@ struct kvm_vcpu_arch {
#define IN_WFI __vcpu_single_flag(sflags, BIT(6))
/* KVM is currently emulating a nested ERET */
#define IN_NESTED_ERET __vcpu_single_flag(sflags, BIT(7))
+/* SError pending for nested guest */
+#define NESTED_SERROR_PENDING __vcpu_single_flag(sflags, BIT(8))
/* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */
@@ -1149,6 +1171,8 @@ static inline bool __vcpu_read_sys_reg_from_cpu(int reg, u64 *val)
* System registers listed in the switch are not saved on every
* exit from the guest but are only saved on vcpu_put.
*
+ * SYSREGS_ON_CPU *MUST* be checked before using this helper.
+ *
* Note that MPIDR_EL1 for the guest is set by KVM via VMPIDR_EL2 but
* should never be listed below, because the guest cannot modify its
* own MPIDR_EL1 and MPIDR_EL1 is accessed for VCPU A from VCPU B's
@@ -1186,6 +1210,7 @@ static inline bool __vcpu_read_sys_reg_from_cpu(int reg, u64 *val)
case IFSR32_EL2: *val = read_sysreg_s(SYS_IFSR32_EL2); break;
case DBGVCR32_EL2: *val = read_sysreg_s(SYS_DBGVCR32_EL2); break;
case ZCR_EL1: *val = read_sysreg_s(SYS_ZCR_EL12); break;
+ case SCTLR2_EL1: *val = read_sysreg_s(SYS_SCTLR2_EL12); break;
default: return false;
}
@@ -1200,6 +1225,8 @@ static inline bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg)
* System registers listed in the switch are not restored on every
* entry to the guest but are only restored on vcpu_load.
*
+ * SYSREGS_ON_CPU *MUST* be checked before using this helper.
+ *
* Note that MPIDR_EL1 for the guest is set by KVM via VMPIDR_EL2 but
* should never be listed below, because the MPIDR should only be set
* once, before running the VCPU, and never changed later.
@@ -1236,6 +1263,7 @@ static inline bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg)
case IFSR32_EL2: write_sysreg_s(val, SYS_IFSR32_EL2); break;
case DBGVCR32_EL2: write_sysreg_s(val, SYS_DBGVCR32_EL2); break;
case ZCR_EL1: write_sysreg_s(val, SYS_ZCR_EL12); break;
+ case SCTLR2_EL1: write_sysreg_s(val, SYS_SCTLR2_EL12); break;
default: return false;
}
@@ -1387,8 +1415,6 @@ static inline bool kvm_arm_is_pvtime_enabled(struct kvm_vcpu_arch *vcpu_arch)
return (vcpu_arch->steal.base != INVALID_GPA);
}
-void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 syndrome);
-
struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
DECLARE_KVM_HYP_PER_CPU(struct kvm_host_data, kvm_host_data);
@@ -1665,6 +1691,12 @@ void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val);
#define kvm_has_s1poe(k) \
(kvm_has_feat((k), ID_AA64MMFR3_EL1, S1POE, IMP))
+#define kvm_has_ras(k) \
+ (kvm_has_feat((k), ID_AA64PFR0_EL1, RAS, IMP))
+
+#define kvm_has_sctlr2(k) \
+ (kvm_has_feat((k), ID_AA64MMFR3_EL1, SCTLRX, IMP))
+
static inline bool kvm_arch_has_irq_bypass(void)
{
return true;
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index b98ac6aa631f..ae563ebd6aee 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -371,6 +371,24 @@ static inline void kvm_fault_unlock(struct kvm *kvm)
read_unlock(&kvm->mmu_lock);
}
+/*
+ * ARM64 KVM relies on a simple conversion from physaddr to a kernel
+ * virtual address (KVA) when it does cache maintenance as the CMO
+ * instructions work on virtual addresses. This is incompatible with
+ * VM_PFNMAP VMAs which may not have a kernel direct mapping to a
+ * virtual address.
+ *
+ * With S2FWB and CACHE DIC features, KVM need not do cache flushing
+ * and CMOs are NOP'd. This has the effect of no longer requiring a
+ * KVA for addresses mapped into the S2. The presence of these features
+ * are thus necessary to support cacheable S2 mapping of VM_PFNMAP.
+ */
+static inline bool kvm_supports_cacheable_pfnmap(void)
+{
+ return cpus_have_final_cap(ARM64_HAS_STAGE2_FWB) &&
+ cpus_have_final_cap(ARM64_HAS_CACHE_DIC);
+}
+
#ifdef CONFIG_PTDUMP_STAGE2_DEBUGFS
void kvm_s2_ptdump_create_debugfs(struct kvm *kvm);
#else
diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h
index 0bd07ea068a1..7fd76f41c296 100644
--- a/arch/arm64/include/asm/kvm_nested.h
+++ b/arch/arm64/include/asm/kvm_nested.h
@@ -80,6 +80,8 @@ extern void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu);
extern void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu);
extern void check_nested_vcpu_requests(struct kvm_vcpu *vcpu);
+extern void kvm_nested_flush_hwstate(struct kvm_vcpu *vcpu);
+extern void kvm_nested_sync_hwstate(struct kvm_vcpu *vcpu);
struct kvm_s2_trans {
phys_addr_t output;
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index 717829df294e..5213248e081b 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -118,7 +118,7 @@
* VMAP'd stacks are allocated at page granularity, so we must ensure that such
* stacks are a multiple of page size.
*/
-#if defined(CONFIG_VMAP_STACK) && (MIN_THREAD_SHIFT < PAGE_SHIFT)
+#if (MIN_THREAD_SHIFT < PAGE_SHIFT)
#define THREAD_SHIFT PAGE_SHIFT
#else
#define THREAD_SHIFT MIN_THREAD_SHIFT
@@ -135,11 +135,7 @@
* checking sp & (1 << THREAD_SHIFT), which we can do cheaply in the entry
* assembly.
*/
-#ifdef CONFIG_VMAP_STACK
#define THREAD_ALIGN (2 * THREAD_SIZE)
-#else
-#define THREAD_ALIGN THREAD_SIZE
-#endif
#define IRQ_STACK_SIZE THREAD_SIZE
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 1bf1a3b16e88..61d62bfd5a7b 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -23,6 +23,8 @@
#define MTE_CTRL_TCF_ASYNC (1UL << 17)
#define MTE_CTRL_TCF_ASYMM (1UL << 18)
+#define MTE_CTRL_STORE_ONLY (1UL << 19)
+
#ifndef __ASSEMBLY__
#include <linux/build_bug.h>
diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
index 2510eec026f7..d48ef6d5abcc 100644
--- a/arch/arm64/include/asm/smp.h
+++ b/arch/arm64/include/asm/smp.h
@@ -50,10 +50,32 @@ struct seq_file;
*/
extern void smp_init_cpus(void);
+enum ipi_msg_type {
+ IPI_RESCHEDULE,
+ IPI_CALL_FUNC,
+ IPI_CPU_STOP,
+ IPI_CPU_STOP_NMI,
+ IPI_TIMER,
+ IPI_IRQ_WORK,
+ NR_IPI,
+ /*
+ * Any enum >= NR_IPI and < MAX_IPI is special and not tracable
+ * with trace_ipi_*
+ */
+ IPI_CPU_BACKTRACE = NR_IPI,
+ IPI_KGDB_ROUNDUP,
+ MAX_IPI
+};
+
/*
* Register IPI interrupts with the arch SMP code
*/
-extern void set_smp_ipi_range(int ipi_base, int nr_ipi);
+extern void set_smp_ipi_range_percpu(int ipi_base, int nr_ipi, int ncpus);
+
+static inline void set_smp_ipi_range(int ipi_base, int n)
+{
+ set_smp_ipi_range_percpu(ipi_base, n, 0);
+}
/*
* Called from the secondary holding pen, this is the secondary CPU entry point.
diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h
index 66ec8caa6ac0..6d3280932bf5 100644
--- a/arch/arm64/include/asm/stacktrace.h
+++ b/arch/arm64/include/asm/stacktrace.h
@@ -59,7 +59,6 @@ static inline bool on_task_stack(const struct task_struct *tsk,
#define on_thread_stack() (on_task_stack(current, current_stack_pointer, 1))
-#ifdef CONFIG_VMAP_STACK
DECLARE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack);
static inline struct stack_info stackinfo_get_overflow(void)
@@ -72,11 +71,8 @@ static inline struct stack_info stackinfo_get_overflow(void)
.high = high,
};
}
-#else
-#define stackinfo_get_overflow() stackinfo_get_unknown()
-#endif
-#if defined(CONFIG_ARM_SDE_INTERFACE) && defined(CONFIG_VMAP_STACK)
+#if defined(CONFIG_ARM_SDE_INTERFACE)
DECLARE_PER_CPU(unsigned long *, sdei_stack_normal_ptr);
DECLARE_PER_CPU(unsigned long *, sdei_stack_critical_ptr);
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index f1bb0d10c39a..d5b5f2ae1afa 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -113,10 +113,14 @@
/* Register-based PAN access, for save/restore purposes */
#define SYS_PSTATE_PAN sys_reg(3, 0, 4, 2, 3)
-#define __SYS_BARRIER_INSN(CRm, op2, Rt) \
- __emit_inst(0xd5000000 | sys_insn(0, 3, 3, (CRm), (op2)) | ((Rt) & 0x1f))
+#define __SYS_BARRIER_INSN(op0, op1, CRn, CRm, op2, Rt) \
+ __emit_inst(0xd5000000 | \
+ sys_insn((op0), (op1), (CRn), (CRm), (op2)) | \
+ ((Rt) & 0x1f))
-#define SB_BARRIER_INSN __SYS_BARRIER_INSN(0, 7, 31)
+#define SB_BARRIER_INSN __SYS_BARRIER_INSN(0, 3, 3, 0, 7, 31)
+#define GSB_SYS_BARRIER_INSN __SYS_BARRIER_INSN(1, 0, 12, 0, 0, 31)
+#define GSB_ACK_BARRIER_INSN __SYS_BARRIER_INSN(1, 0, 12, 0, 1, 31)
/* Data cache zero operations */
#define SYS_DC_ISW sys_insn(1, 0, 7, 6, 2)
@@ -202,16 +206,8 @@
#define SYS_DBGVCR32_EL2 sys_reg(2, 4, 0, 7, 0)
#define SYS_BRBINF_EL1(n) sys_reg(2, 1, 8, (n & 15), (((n & 16) >> 2) | 0))
-#define SYS_BRBINFINJ_EL1 sys_reg(2, 1, 9, 1, 0)
#define SYS_BRBSRC_EL1(n) sys_reg(2, 1, 8, (n & 15), (((n & 16) >> 2) | 1))
-#define SYS_BRBSRCINJ_EL1 sys_reg(2, 1, 9, 1, 1)
#define SYS_BRBTGT_EL1(n) sys_reg(2, 1, 8, (n & 15), (((n & 16) >> 2) | 2))
-#define SYS_BRBTGTINJ_EL1 sys_reg(2, 1, 9, 1, 2)
-#define SYS_BRBTS_EL1 sys_reg(2, 1, 9, 0, 2)
-
-#define SYS_BRBCR_EL1 sys_reg(2, 1, 9, 0, 0)
-#define SYS_BRBFCR_EL1 sys_reg(2, 1, 9, 0, 1)
-#define SYS_BRBIDR0_EL1 sys_reg(2, 1, 9, 2, 0)
#define SYS_TRCITECR_EL1 sys_reg(3, 0, 1, 2, 3)
#define SYS_TRCACATR(m) sys_reg(2, 1, 2, ((m & 7) << 1), (2 | (m >> 3)))
@@ -277,8 +273,6 @@
/* ETM */
#define SYS_TRCOSLAR sys_reg(2, 1, 1, 0, 4)
-#define SYS_BRBCR_EL2 sys_reg(2, 4, 9, 0, 0)
-
#define SYS_MIDR_EL1 sys_reg(3, 0, 0, 0, 0)
#define SYS_MPIDR_EL1 sys_reg(3, 0, 0, 0, 5)
#define SYS_REVIDR_EL1 sys_reg(3, 0, 0, 0, 6)
@@ -821,6 +815,12 @@
#define OP_COSP_RCTX sys_insn(1, 3, 7, 3, 6)
#define OP_CPP_RCTX sys_insn(1, 3, 7, 3, 7)
+/*
+ * BRBE Instructions
+ */
+#define BRB_IALL_INSN __emit_inst(0xd5000000 | OP_BRB_IALL | (0x1f))
+#define BRB_INJ_INSN __emit_inst(0xd5000000 | OP_BRB_INJ | (0x1f))
+
/* Common SCTLR_ELx flags. */
#define SCTLR_ELx_ENTP2 (BIT(60))
#define SCTLR_ELx_DSSBS (BIT(44))
@@ -1078,6 +1078,67 @@
#define GCS_CAP(x) ((((unsigned long)x) & GCS_CAP_ADDR_MASK) | \
GCS_CAP_VALID_TOKEN)
+/*
+ * Definitions for GICv5 instructions
+ */
+#define GICV5_OP_GIC_CDAFF sys_insn(1, 0, 12, 1, 3)
+#define GICV5_OP_GIC_CDDI sys_insn(1, 0, 12, 2, 0)
+#define GICV5_OP_GIC_CDDIS sys_insn(1, 0, 12, 1, 0)
+#define GICV5_OP_GIC_CDHM sys_insn(1, 0, 12, 2, 1)
+#define GICV5_OP_GIC_CDEN sys_insn(1, 0, 12, 1, 1)
+#define GICV5_OP_GIC_CDEOI sys_insn(1, 0, 12, 1, 7)
+#define GICV5_OP_GIC_CDPEND sys_insn(1, 0, 12, 1, 4)
+#define GICV5_OP_GIC_CDPRI sys_insn(1, 0, 12, 1, 2)
+#define GICV5_OP_GIC_CDRCFG sys_insn(1, 0, 12, 1, 5)
+#define GICV5_OP_GICR_CDIA sys_insn(1, 0, 12, 3, 0)
+
+/* Definitions for GIC CDAFF */
+#define GICV5_GIC_CDAFF_IAFFID_MASK GENMASK_ULL(47, 32)
+#define GICV5_GIC_CDAFF_TYPE_MASK GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDAFF_IRM_MASK BIT_ULL(28)
+#define GICV5_GIC_CDAFF_ID_MASK GENMASK_ULL(23, 0)
+
+/* Definitions for GIC CDDI */
+#define GICV5_GIC_CDDI_TYPE_MASK GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDDI_ID_MASK GENMASK_ULL(23, 0)
+
+/* Definitions for GIC CDDIS */
+#define GICV5_GIC_CDDIS_TYPE_MASK GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDDIS_TYPE(r) FIELD_GET(GICV5_GIC_CDDIS_TYPE_MASK, r)
+#define GICV5_GIC_CDDIS_ID_MASK GENMASK_ULL(23, 0)
+#define GICV5_GIC_CDDIS_ID(r) FIELD_GET(GICV5_GIC_CDDIS_ID_MASK, r)
+
+/* Definitions for GIC CDEN */
+#define GICV5_GIC_CDEN_TYPE_MASK GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDEN_ID_MASK GENMASK_ULL(23, 0)
+
+/* Definitions for GIC CDHM */
+#define GICV5_GIC_CDHM_HM_MASK BIT_ULL(32)
+#define GICV5_GIC_CDHM_TYPE_MASK GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDHM_ID_MASK GENMASK_ULL(23, 0)
+
+/* Definitions for GIC CDPEND */
+#define GICV5_GIC_CDPEND_PENDING_MASK BIT_ULL(32)
+#define GICV5_GIC_CDPEND_TYPE_MASK GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDPEND_ID_MASK GENMASK_ULL(23, 0)
+
+/* Definitions for GIC CDPRI */
+#define GICV5_GIC_CDPRI_PRIORITY_MASK GENMASK_ULL(39, 35)
+#define GICV5_GIC_CDPRI_TYPE_MASK GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDPRI_ID_MASK GENMASK_ULL(23, 0)
+
+/* Definitions for GIC CDRCFG */
+#define GICV5_GIC_CDRCFG_TYPE_MASK GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDRCFG_ID_MASK GENMASK_ULL(23, 0)
+
+/* Definitions for GICR CDIA */
+#define GICV5_GIC_CDIA_VALID_MASK BIT_ULL(32)
+#define GICV5_GICR_CDIA_VALID(r) FIELD_GET(GICV5_GIC_CDIA_VALID_MASK, r)
+#define GICV5_GIC_CDIA_TYPE_MASK GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDIA_ID_MASK GENMASK_ULL(23, 0)
+
+#define gicr_insn(insn) read_sysreg_s(GICV5_OP_GICR_##insn)
+#define gic_insn(v, insn) write_sysreg_s(v, GICV5_OP_GIC_##insn)
#define ARM64_FEATURE_FIELD_BITS 4
diff --git a/arch/arm64/include/asm/system_misc.h b/arch/arm64/include/asm/system_misc.h
index c34344256762..344b1c1a4bbb 100644
--- a/arch/arm64/include/asm/system_misc.h
+++ b/arch/arm64/include/asm/system_misc.h
@@ -25,10 +25,6 @@ void arm64_notify_die(const char *str, struct pt_regs *regs,
int signo, int sicode, unsigned long far,
unsigned long err);
-void hook_debug_fault_code(int nr, int (*fn)(unsigned long, unsigned long,
- struct pt_regs *),
- int sig, int code, const char *name);
-
struct mm_struct;
extern void __show_regs(struct pt_regs *);
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index 1269c2487574..f241b8601ebd 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -70,6 +70,7 @@ void arch_setup_new_exec(void);
#define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */
#define TIF_SECCOMP 11 /* syscall secure computing */
#define TIF_SYSCALL_EMU 12 /* syscall emulation active */
+#define TIF_PATCH_PENDING 13 /* pending live patching update */
#define TIF_MEMDIE 18 /* is terminating due to OOM killer */
#define TIF_FREEZE 19
#define TIF_RESTORE_SIGMASK 20
@@ -96,6 +97,7 @@ void arch_setup_new_exec(void);
#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
+#define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING)
#define _TIF_UPROBE (1 << TIF_UPROBE)
#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP)
#define _TIF_32BIT (1 << TIF_32BIT)
@@ -107,7 +109,8 @@ void arch_setup_new_exec(void);
#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \
_TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
_TIF_UPROBE | _TIF_MTE_ASYNC_FAULT | \
- _TIF_NOTIFY_SIGNAL | _TIF_SIGPENDING)
+ _TIF_NOTIFY_SIGNAL | _TIF_SIGPENDING | \
+ _TIF_PATCH_PENDING)
#define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
_TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
diff --git a/arch/arm64/include/asm/traps.h b/arch/arm64/include/asm/traps.h
index 82cf1f879c61..e3e8944a71c3 100644
--- a/arch/arm64/include/asm/traps.h
+++ b/arch/arm64/include/asm/traps.h
@@ -29,6 +29,12 @@ void arm64_force_sig_fault_pkey(unsigned long far, const char *str, int pkey);
void arm64_force_sig_mceerr(int code, unsigned long far, short lsb, const char *str);
void arm64_force_sig_ptrace_errno_trap(int errno, unsigned long far, const char *str);
+int bug_brk_handler(struct pt_regs *regs, unsigned long esr);
+int cfi_brk_handler(struct pt_regs *regs, unsigned long esr);
+int reserved_fault_brk_handler(struct pt_regs *regs, unsigned long esr);
+int kasan_brk_handler(struct pt_regs *regs, unsigned long esr);
+int ubsan_brk_handler(struct pt_regs *regs, unsigned long esr);
+
int early_brk64(unsigned long addr, unsigned long esr, struct pt_regs *regs);
/*
diff --git a/arch/arm64/include/asm/uprobes.h b/arch/arm64/include/asm/uprobes.h
index 014b02897f8e..89bfb0213a50 100644
--- a/arch/arm64/include/asm/uprobes.h
+++ b/arch/arm64/include/asm/uprobes.h
@@ -28,4 +28,15 @@ struct arch_uprobe {
bool simulate;
};
+int uprobe_brk_handler(struct pt_regs *regs, unsigned long esr);
+#ifdef CONFIG_UPROBES
+int uprobe_single_step_handler(struct pt_regs *regs, unsigned long esr);
+#else
+static inline int uprobe_single_step_handler(struct pt_regs *regs,
+ unsigned long esr)
+{
+ return DBG_HOOK_ERROR;
+}
+#endif
+
#endif
diff --git a/arch/arm64/include/asm/vncr_mapping.h b/arch/arm64/include/asm/vncr_mapping.h
index 6f556e993644..f6ec500ad3fa 100644
--- a/arch/arm64/include/asm/vncr_mapping.h
+++ b/arch/arm64/include/asm/vncr_mapping.h
@@ -51,6 +51,7 @@
#define VNCR_SP_EL1 0x240
#define VNCR_VBAR_EL1 0x250
#define VNCR_TCR2_EL1 0x270
+#define VNCR_SCTLR2_EL1 0x278
#define VNCR_PIRE0_EL1 0x290
#define VNCR_PIR_EL1 0x2A0
#define VNCR_POR_EL1 0x2A8
@@ -84,6 +85,7 @@
#define VNCR_ICH_HCR_EL2 0x4C0
#define VNCR_ICH_VMCR_EL2 0x4C8
#define VNCR_VDISR_EL2 0x500
+#define VNCR_VSESR_EL2 0x508
#define VNCR_PMBLIMITR_EL1 0x800
#define VNCR_PMBPTR_EL1 0x810
#define VNCR_PMBSR_EL1 0x820
diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h
index 705a7afa8e58..72c78468b806 100644
--- a/arch/arm64/include/uapi/asm/hwcap.h
+++ b/arch/arm64/include/uapi/asm/hwcap.h
@@ -143,5 +143,7 @@
/*
* HWCAP3 flags - for AT_HWCAP3
*/
+#define HWCAP3_MTE_FAR (1UL << 0)
+#define HWCAP3_MTE_STORE_ONLY (1UL << 1)
#endif /* _UAPI__ASM_HWCAP_H */
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index a2faf0049dab..76f32e424065 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -80,7 +80,7 @@ obj-y += head.o
always-$(KBUILD_BUILTIN) += vmlinux.lds
ifeq ($(CONFIG_DEBUG_EFI),y)
-AFLAGS_head.o += -DVMLINUX_PATH="\"$(realpath $(objtree)/vmlinux)\""
+AFLAGS_head.o += -DVMLINUX_PATH="\"$(abspath vmlinux)\""
endif
# for cleaning
diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c
index b9a66fc146c9..4d529ff7ba51 100644
--- a/arch/arm64/kernel/acpi.c
+++ b/arch/arm64/kernel/acpi.c
@@ -197,6 +197,8 @@ out:
*/
void __init acpi_boot_table_init(void)
{
+ int ret;
+
/*
* Enable ACPI instead of device tree unless
* - ACPI has been disabled explicitly (acpi=off), or
@@ -250,10 +252,12 @@ done:
* behaviour, use acpi=nospcr to disable console in ACPI SPCR
* table as default serial console.
*/
- acpi_parse_spcr(earlycon_acpi_spcr_enable,
+ ret = acpi_parse_spcr(earlycon_acpi_spcr_enable,
!param_acpi_nospcr);
- pr_info("Use ACPI SPCR as default console: %s\n",
- param_acpi_nospcr ? "No" : "Yes");
+ if (!ret || param_acpi_nospcr || !IS_ENABLED(CONFIG_ACPI_SPCR_TABLE))
+ pr_info("Use ACPI SPCR as default console: No\n");
+ else
+ pr_info("Use ACPI SPCR as default console: Yes\n");
if (IS_ENABLED(CONFIG_ACPI_BGRT))
acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt);
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index e151585c6cca..9ad065f15f1d 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -303,6 +303,7 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = {
};
static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = {
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_DF2_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_GCS),
FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_GCS_SHIFT, 4, 0),
S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MTE_frac_SHIFT, 4, 0),
@@ -320,6 +321,8 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = {
static const struct arm64_ftr_bits ftr_id_aa64pfr2[] = {
ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR2_EL1_FPMR_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR2_EL1_MTEFAR_SHIFT, 4, ID_AA64PFR2_EL1_MTEFAR_NI),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR2_EL1_MTESTOREONLY_SHIFT, 4, ID_AA64PFR2_EL1_MTESTOREONLY_NI),
ARM64_FTR_END,
};
@@ -500,6 +503,7 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr3[] = {
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_POE),
FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_S1POE_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_S1PIE_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_SCTLRX_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_TCRX_SHIFT, 4, 0),
ARM64_FTR_END,
};
@@ -2213,6 +2217,38 @@ static bool hvhe_possible(const struct arm64_cpu_capabilities *entry,
return arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_HVHE);
}
+static bool has_bbml2_noabort(const struct arm64_cpu_capabilities *caps, int scope)
+{
+ /*
+ * We want to allow usage of BBML2 in as wide a range of kernel contexts
+ * as possible. This list is therefore an allow-list of known-good
+ * implementations that both support BBML2 and additionally, fulfill the
+ * extra constraint of never generating TLB conflict aborts when using
+ * the relaxed BBML2 semantics (such aborts make use of BBML2 in certain
+ * kernel contexts difficult to prove safe against recursive aborts).
+ *
+ * Note that implementations can only be considered "known-good" if their
+ * implementors attest to the fact that the implementation never raises
+ * TLB conflict aborts for BBML2 mapping granularity changes.
+ */
+ static const struct midr_range supports_bbml2_noabort_list[] = {
+ MIDR_REV_RANGE(MIDR_CORTEX_X4, 0, 3, 0xf),
+ MIDR_REV_RANGE(MIDR_NEOVERSE_V3, 0, 2, 0xf),
+ {}
+ };
+
+ /* Does our cpu guarantee to never raise TLB conflict aborts? */
+ if (!is_midr_in_range_list(supports_bbml2_noabort_list))
+ return false;
+
+ /*
+ * We currently ignore the ID_AA64MMFR2_EL1 register, and only care
+ * about whether the MIDR check passes.
+ */
+
+ return true;
+}
+
#ifdef CONFIG_ARM64_PAN
static void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused)
{
@@ -2296,11 +2332,11 @@ static bool can_use_gic_priorities(const struct arm64_cpu_capabilities *entry,
int scope)
{
/*
- * ARM64_HAS_GIC_CPUIF_SYSREGS has a lower index, and is a boot CPU
+ * ARM64_HAS_GICV3_CPUIF has a lower index, and is a boot CPU
* feature, so will be detected earlier.
*/
- BUILD_BUG_ON(ARM64_HAS_GIC_PRIO_MASKING <= ARM64_HAS_GIC_CPUIF_SYSREGS);
- if (!cpus_have_cap(ARM64_HAS_GIC_CPUIF_SYSREGS))
+ BUILD_BUG_ON(ARM64_HAS_GIC_PRIO_MASKING <= ARM64_HAS_GICV3_CPUIF);
+ if (!cpus_have_cap(ARM64_HAS_GICV3_CPUIF))
return false;
return enable_pseudo_nmi;
@@ -2496,8 +2532,8 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.matches = has_always,
},
{
- .desc = "GIC system register CPU interface",
- .capability = ARM64_HAS_GIC_CPUIF_SYSREGS,
+ .desc = "GICv3 CPU interface",
+ .capability = ARM64_HAS_GICV3_CPUIF,
.type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE,
.matches = has_useable_gicv3_cpuif,
ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, GIC, IMP)
@@ -2874,6 +2910,20 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.matches = has_cpuid_feature,
ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, MTE, MTE3)
},
+ {
+ .desc = "FAR on MTE Tag Check Fault",
+ .capability = ARM64_MTE_FAR,
+ .type = ARM64_CPUCAP_SYSTEM_FEATURE,
+ .matches = has_cpuid_feature,
+ ARM64_CPUID_FIELDS(ID_AA64PFR2_EL1, MTEFAR, IMP)
+ },
+ {
+ .desc = "Store Only MTE Tag Check",
+ .capability = ARM64_MTE_STORE_ONLY,
+ .type = ARM64_CPUCAP_SYSTEM_FEATURE,
+ .matches = has_cpuid_feature,
+ ARM64_CPUID_FIELDS(ID_AA64PFR2_EL1, MTESTOREONLY, IMP)
+ },
#endif /* CONFIG_ARM64_MTE */
{
.desc = "RCpc load-acquire (LDAPR)",
@@ -2981,6 +3031,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, EVT, IMP)
},
{
+ .desc = "BBM Level 2 without TLB conflict abort",
+ .capability = ARM64_HAS_BBML2_NOABORT,
+ .type = ARM64_CPUCAP_EARLY_LOCAL_CPU_FEATURE,
+ .matches = has_bbml2_noabort,
+ },
+ {
.desc = "52-bit Virtual Addressing for KVM (LPA2)",
.capability = ARM64_HAS_LPA2,
.type = ARM64_CPUCAP_SYSTEM_FEATURE,
@@ -3061,6 +3117,20 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.matches = has_pmuv3,
},
#endif
+ {
+ .desc = "SCTLR2",
+ .capability = ARM64_HAS_SCTLR2,
+ .type = ARM64_CPUCAP_SYSTEM_FEATURE,
+ .matches = has_cpuid_feature,
+ ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, SCTLRX, IMP)
+ },
+ {
+ .desc = "GICv5 CPU interface",
+ .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE,
+ .capability = ARM64_HAS_GICV5_CPUIF,
+ .matches = has_cpuid_feature,
+ ARM64_CPUID_FIELDS(ID_AA64PFR2_EL1, GCIE, IMP)
+ },
{},
};
@@ -3218,6 +3288,8 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
#ifdef CONFIG_ARM64_MTE
HWCAP_CAP(ID_AA64PFR1_EL1, MTE, MTE2, CAP_HWCAP, KERNEL_HWCAP_MTE),
HWCAP_CAP(ID_AA64PFR1_EL1, MTE, MTE3, CAP_HWCAP, KERNEL_HWCAP_MTE3),
+ HWCAP_CAP(ID_AA64PFR2_EL1, MTEFAR, IMP, CAP_HWCAP, KERNEL_HWCAP_MTE_FAR),
+ HWCAP_CAP(ID_AA64PFR2_EL1, MTESTOREONLY, IMP, CAP_HWCAP , KERNEL_HWCAP_MTE_STORE_ONLY),
#endif /* CONFIG_ARM64_MTE */
HWCAP_CAP(ID_AA64MMFR0_EL1, ECV, IMP, CAP_HWCAP, KERNEL_HWCAP_ECV),
HWCAP_CAP(ID_AA64MMFR1_EL1, AFP, IMP, CAP_HWCAP, KERNEL_HWCAP_AFP),
@@ -3377,18 +3449,49 @@ static void update_cpu_capabilities(u16 scope_mask)
scope_mask &= ARM64_CPUCAP_SCOPE_MASK;
for (i = 0; i < ARM64_NCAPS; i++) {
+ bool match_all = false;
+ bool caps_set = false;
+ bool boot_cpu = false;
+
caps = cpucap_ptrs[i];
- if (!caps || !(caps->type & scope_mask) ||
- cpus_have_cap(caps->capability) ||
- !caps->matches(caps, cpucap_default_scope(caps)))
+ if (!caps || !(caps->type & scope_mask))
continue;
- if (caps->desc && !caps->cpus)
+ match_all = cpucap_match_all_early_cpus(caps);
+ caps_set = cpus_have_cap(caps->capability);
+ boot_cpu = scope_mask & SCOPE_BOOT_CPU;
+
+ /*
+ * Unless it's a match-all CPUs feature, avoid probing if
+ * already detected.
+ */
+ if (!match_all && caps_set)
+ continue;
+
+ /*
+ * A match-all CPUs capability is only set when probing the
+ * boot CPU. It may be cleared subsequently if not detected on
+ * secondary ones.
+ */
+ if (match_all && !caps_set && !boot_cpu)
+ continue;
+
+ if (!caps->matches(caps, cpucap_default_scope(caps))) {
+ if (match_all)
+ __clear_bit(caps->capability, system_cpucaps);
+ continue;
+ }
+
+ /*
+ * Match-all CPUs capabilities are logged later when the
+ * system capabilities are finalised.
+ */
+ if (!match_all && caps->desc && !caps->cpus)
pr_info("detected: %s\n", caps->desc);
__set_bit(caps->capability, system_cpucaps);
- if ((scope_mask & SCOPE_BOOT_CPU) && (caps->type & SCOPE_BOOT_CPU))
+ if (boot_cpu && (caps->type & SCOPE_BOOT_CPU))
set_bit(caps->capability, boot_cpucaps);
}
}
@@ -3789,17 +3892,24 @@ static void __init setup_system_capabilities(void)
enable_cpu_capabilities(SCOPE_ALL & ~SCOPE_BOOT_CPU);
apply_alternatives_all();
- /*
- * Log any cpucaps with a cpumask as these aren't logged by
- * update_cpu_capabilities().
- */
for (int i = 0; i < ARM64_NCAPS; i++) {
const struct arm64_cpu_capabilities *caps = cpucap_ptrs[i];
- if (caps && caps->cpus && caps->desc &&
- cpumask_any(caps->cpus) < nr_cpu_ids)
+ if (!caps || !caps->desc)
+ continue;
+
+ /*
+ * Log any cpucaps with a cpumask as these aren't logged by
+ * update_cpu_capabilities().
+ */
+ if (caps->cpus && cpumask_any(caps->cpus) < nr_cpu_ids)
pr_info("detected: %s on CPU%*pbl\n",
caps->desc, cpumask_pr_args(caps->cpus));
+
+ /* Log match-all CPUs capabilities */
+ if (cpucap_match_all_early_cpus(caps) &&
+ cpus_have_cap(caps->capability))
+ pr_info("detected: %s\n", caps->desc);
}
/*
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index c1f2b6b04b41..ba834909a28b 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -160,6 +160,8 @@ static const char *const hwcap_str[] = {
[KERNEL_HWCAP_SME_SFEXPA] = "smesfexpa",
[KERNEL_HWCAP_SME_STMOP] = "smestmop",
[KERNEL_HWCAP_SME_SMOP4] = "smesmop4",
+ [KERNEL_HWCAP_MTE_FAR] = "mtefar",
+ [KERNEL_HWCAP_MTE_STORE_ONLY] = "mtestoreonly",
};
#ifdef CONFIG_COMPAT
diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c
index 58f047de3e1c..110d9ff54174 100644
--- a/arch/arm64/kernel/debug-monitors.c
+++ b/arch/arm64/kernel/debug-monitors.c
@@ -21,8 +21,12 @@
#include <asm/cputype.h>
#include <asm/daifflags.h>
#include <asm/debug-monitors.h>
+#include <asm/exception.h>
+#include <asm/kgdb.h>
+#include <asm/kprobes.h>
#include <asm/system_misc.h>
#include <asm/traps.h>
+#include <asm/uprobes.h>
/* Determine debug architecture. */
u8 debug_monitors_arch(void)
@@ -34,7 +38,7 @@ u8 debug_monitors_arch(void)
/*
* MDSCR access routines.
*/
-static void mdscr_write(u32 mdscr)
+static void mdscr_write(u64 mdscr)
{
unsigned long flags;
flags = local_daif_save();
@@ -43,7 +47,7 @@ static void mdscr_write(u32 mdscr)
}
NOKPROBE_SYMBOL(mdscr_write);
-static u32 mdscr_read(void)
+static u64 mdscr_read(void)
{
return read_sysreg(mdscr_el1);
}
@@ -79,16 +83,16 @@ static DEFINE_PER_CPU(int, kde_ref_count);
void enable_debug_monitors(enum dbg_active_el el)
{
- u32 mdscr, enable = 0;
+ u64 mdscr, enable = 0;
WARN_ON(preemptible());
if (this_cpu_inc_return(mde_ref_count) == 1)
- enable = DBG_MDSCR_MDE;
+ enable = MDSCR_EL1_MDE;
if (el == DBG_ACTIVE_EL1 &&
this_cpu_inc_return(kde_ref_count) == 1)
- enable |= DBG_MDSCR_KDE;
+ enable |= MDSCR_EL1_KDE;
if (enable && debug_enabled) {
mdscr = mdscr_read();
@@ -100,16 +104,16 @@ NOKPROBE_SYMBOL(enable_debug_monitors);
void disable_debug_monitors(enum dbg_active_el el)
{
- u32 mdscr, disable = 0;
+ u64 mdscr, disable = 0;
WARN_ON(preemptible());
if (this_cpu_dec_return(mde_ref_count) == 0)
- disable = ~DBG_MDSCR_MDE;
+ disable = ~MDSCR_EL1_MDE;
if (el == DBG_ACTIVE_EL1 &&
this_cpu_dec_return(kde_ref_count) == 0)
- disable &= ~DBG_MDSCR_KDE;
+ disable &= ~MDSCR_EL1_KDE;
if (disable) {
mdscr = mdscr_read();
@@ -156,74 +160,6 @@ NOKPROBE_SYMBOL(clear_user_regs_spsr_ss);
#define set_regs_spsr_ss(r) set_user_regs_spsr_ss(&(r)->user_regs)
#define clear_regs_spsr_ss(r) clear_user_regs_spsr_ss(&(r)->user_regs)
-static DEFINE_SPINLOCK(debug_hook_lock);
-static LIST_HEAD(user_step_hook);
-static LIST_HEAD(kernel_step_hook);
-
-static void register_debug_hook(struct list_head *node, struct list_head *list)
-{
- spin_lock(&debug_hook_lock);
- list_add_rcu(node, list);
- spin_unlock(&debug_hook_lock);
-
-}
-
-static void unregister_debug_hook(struct list_head *node)
-{
- spin_lock(&debug_hook_lock);
- list_del_rcu(node);
- spin_unlock(&debug_hook_lock);
- synchronize_rcu();
-}
-
-void register_user_step_hook(struct step_hook *hook)
-{
- register_debug_hook(&hook->node, &user_step_hook);
-}
-
-void unregister_user_step_hook(struct step_hook *hook)
-{
- unregister_debug_hook(&hook->node);
-}
-
-void register_kernel_step_hook(struct step_hook *hook)
-{
- register_debug_hook(&hook->node, &kernel_step_hook);
-}
-
-void unregister_kernel_step_hook(struct step_hook *hook)
-{
- unregister_debug_hook(&hook->node);
-}
-
-/*
- * Call registered single step handlers
- * There is no Syndrome info to check for determining the handler.
- * So we call all the registered handlers, until the right handler is
- * found which returns zero.
- */
-static int call_step_hook(struct pt_regs *regs, unsigned long esr)
-{
- struct step_hook *hook;
- struct list_head *list;
- int retval = DBG_HOOK_ERROR;
-
- list = user_mode(regs) ? &user_step_hook : &kernel_step_hook;
-
- /*
- * Since single-step exception disables interrupt, this function is
- * entirely not preemptible, and we can use rcu list safely here.
- */
- list_for_each_entry_rcu(hook, list, node) {
- retval = hook->fn(regs, esr);
- if (retval == DBG_HOOK_HANDLED)
- break;
- }
-
- return retval;
-}
-NOKPROBE_SYMBOL(call_step_hook);
-
static void send_user_sigtrap(int si_code)
{
struct pt_regs *regs = current_pt_regs();
@@ -238,105 +174,110 @@ static void send_user_sigtrap(int si_code)
"User debug trap");
}
-static int single_step_handler(unsigned long unused, unsigned long esr,
- struct pt_regs *regs)
+/*
+ * We have already unmasked interrupts and enabled preemption
+ * when calling do_el0_softstep() from entry-common.c.
+ */
+void do_el0_softstep(unsigned long esr, struct pt_regs *regs)
{
- bool handler_found = false;
+ if (uprobe_single_step_handler(regs, esr) == DBG_HOOK_HANDLED)
+ return;
+ send_user_sigtrap(TRAP_TRACE);
/*
- * If we are stepping a pending breakpoint, call the hw_breakpoint
- * handler first.
+ * ptrace will disable single step unless explicitly
+ * asked to re-enable it. For other clients, it makes
+ * sense to leave it enabled (i.e. rewind the controls
+ * to the active-not-pending state).
*/
- if (!reinstall_suspended_bps(regs))
- return 0;
-
- if (!handler_found && call_step_hook(regs, esr) == DBG_HOOK_HANDLED)
- handler_found = true;
-
- if (!handler_found && user_mode(regs)) {
- send_user_sigtrap(TRAP_TRACE);
-
- /*
- * ptrace will disable single step unless explicitly
- * asked to re-enable it. For other clients, it makes
- * sense to leave it enabled (i.e. rewind the controls
- * to the active-not-pending state).
- */
- user_rewind_single_step(current);
- } else if (!handler_found) {
- pr_warn("Unexpected kernel single-step exception at EL1\n");
- /*
- * Re-enable stepping since we know that we will be
- * returning to regs.
- */
- set_regs_spsr_ss(regs);
- }
-
- return 0;
+ user_rewind_single_step(current);
}
-NOKPROBE_SYMBOL(single_step_handler);
-
-static LIST_HEAD(user_break_hook);
-static LIST_HEAD(kernel_break_hook);
-void register_user_break_hook(struct break_hook *hook)
+void do_el1_softstep(unsigned long esr, struct pt_regs *regs)
{
- register_debug_hook(&hook->node, &user_break_hook);
-}
+ if (kgdb_single_step_handler(regs, esr) == DBG_HOOK_HANDLED)
+ return;
-void unregister_user_break_hook(struct break_hook *hook)
-{
- unregister_debug_hook(&hook->node);
+ pr_warn("Unexpected kernel single-step exception at EL1\n");
+ /*
+ * Re-enable stepping since we know that we will be
+ * returning to regs.
+ */
+ set_regs_spsr_ss(regs);
}
+NOKPROBE_SYMBOL(do_el1_softstep);
-void register_kernel_break_hook(struct break_hook *hook)
+static int call_el1_break_hook(struct pt_regs *regs, unsigned long esr)
{
- register_debug_hook(&hook->node, &kernel_break_hook);
-}
+ if (esr_brk_comment(esr) == BUG_BRK_IMM)
+ return bug_brk_handler(regs, esr);
-void unregister_kernel_break_hook(struct break_hook *hook)
-{
- unregister_debug_hook(&hook->node);
-}
+ if (IS_ENABLED(CONFIG_CFI_CLANG) && esr_is_cfi_brk(esr))
+ return cfi_brk_handler(regs, esr);
-static int call_break_hook(struct pt_regs *regs, unsigned long esr)
-{
- struct break_hook *hook;
- struct list_head *list;
+ if (esr_brk_comment(esr) == FAULT_BRK_IMM)
+ return reserved_fault_brk_handler(regs, esr);
- list = user_mode(regs) ? &user_break_hook : &kernel_break_hook;
+ if (IS_ENABLED(CONFIG_KASAN_SW_TAGS) &&
+ (esr_brk_comment(esr) & ~KASAN_BRK_MASK) == KASAN_BRK_IMM)
+ return kasan_brk_handler(regs, esr);
- /*
- * Since brk exception disables interrupt, this function is
- * entirely not preemptible, and we can use rcu list safely here.
- */
- list_for_each_entry_rcu(hook, list, node) {
- if ((esr_brk_comment(esr) & ~hook->mask) == hook->imm)
- return hook->fn(regs, esr);
+ if (IS_ENABLED(CONFIG_UBSAN_TRAP) && esr_is_ubsan_brk(esr))
+ return ubsan_brk_handler(regs, esr);
+
+ if (IS_ENABLED(CONFIG_KGDB)) {
+ if (esr_brk_comment(esr) == KGDB_DYN_DBG_BRK_IMM)
+ return kgdb_brk_handler(regs, esr);
+ if (esr_brk_comment(esr) == KGDB_COMPILED_DBG_BRK_IMM)
+ return kgdb_compiled_brk_handler(regs, esr);
+ }
+
+ if (IS_ENABLED(CONFIG_KPROBES)) {
+ if (esr_brk_comment(esr) == KPROBES_BRK_IMM)
+ return kprobe_brk_handler(regs, esr);
+ if (esr_brk_comment(esr) == KPROBES_BRK_SS_IMM)
+ return kprobe_ss_brk_handler(regs, esr);
}
+ if (IS_ENABLED(CONFIG_KRETPROBES) &&
+ esr_brk_comment(esr) == KRETPROBES_BRK_IMM)
+ return kretprobe_brk_handler(regs, esr);
+
return DBG_HOOK_ERROR;
}
-NOKPROBE_SYMBOL(call_break_hook);
+NOKPROBE_SYMBOL(call_el1_break_hook);
-static int brk_handler(unsigned long unused, unsigned long esr,
- struct pt_regs *regs)
+/*
+ * We have already unmasked interrupts and enabled preemption
+ * when calling do_el0_brk64() from entry-common.c.
+ */
+void do_el0_brk64(unsigned long esr, struct pt_regs *regs)
{
- if (call_break_hook(regs, esr) == DBG_HOOK_HANDLED)
- return 0;
+ if (IS_ENABLED(CONFIG_UPROBES) &&
+ esr_brk_comment(esr) == UPROBES_BRK_IMM &&
+ uprobe_brk_handler(regs, esr) == DBG_HOOK_HANDLED)
+ return;
- if (user_mode(regs)) {
- send_user_sigtrap(TRAP_BRKPT);
- } else {
- pr_warn("Unexpected kernel BRK exception at EL1\n");
- return -EFAULT;
- }
+ send_user_sigtrap(TRAP_BRKPT);
+}
- return 0;
+void do_el1_brk64(unsigned long esr, struct pt_regs *regs)
+{
+ if (call_el1_break_hook(regs, esr) == DBG_HOOK_HANDLED)
+ return;
+
+ die("Oops - BRK", regs, esr);
+}
+NOKPROBE_SYMBOL(do_el1_brk64);
+
+#ifdef CONFIG_COMPAT
+void do_bkpt32(unsigned long esr, struct pt_regs *regs)
+{
+ arm64_notify_die("aarch32 BKPT", regs, SIGTRAP, TRAP_BRKPT, regs->pc, esr);
}
-NOKPROBE_SYMBOL(brk_handler);
+#endif /* CONFIG_COMPAT */
-int aarch32_break_handler(struct pt_regs *regs)
+bool try_handle_aarch32_break(struct pt_regs *regs)
{
u32 arm_instr;
u16 thumb_instr;
@@ -344,7 +285,7 @@ int aarch32_break_handler(struct pt_regs *regs)
void __user *pc = (void __user *)instruction_pointer(regs);
if (!compat_user_mode(regs))
- return -EFAULT;
+ return false;
if (compat_thumb_mode(regs)) {
/* get 16-bit Thumb instruction */
@@ -368,20 +309,12 @@ int aarch32_break_handler(struct pt_regs *regs)
}
if (!bp)
- return -EFAULT;
+ return false;
send_user_sigtrap(TRAP_BRKPT);
- return 0;
-}
-NOKPROBE_SYMBOL(aarch32_break_handler);
-
-void __init debug_traps_init(void)
-{
- hook_debug_fault_code(DBG_ESR_EVT_HWSS, single_step_handler, SIGTRAP,
- TRAP_TRACE, "single-step handler");
- hook_debug_fault_code(DBG_ESR_EVT_BRK, brk_handler, SIGTRAP,
- TRAP_BRKPT, "BRK handler");
+ return true;
}
+NOKPROBE_SYMBOL(try_handle_aarch32_break);
/* Re-enable single step for syscall restarting. */
void user_rewind_single_step(struct task_struct *task)
@@ -415,7 +348,7 @@ void kernel_enable_single_step(struct pt_regs *regs)
{
WARN_ON(!irqs_disabled());
set_regs_spsr_ss(regs);
- mdscr_write(mdscr_read() | DBG_MDSCR_SS);
+ mdscr_write(mdscr_read() | MDSCR_EL1_SS);
enable_debug_monitors(DBG_ACTIVE_EL1);
}
NOKPROBE_SYMBOL(kernel_enable_single_step);
@@ -423,7 +356,7 @@ NOKPROBE_SYMBOL(kernel_enable_single_step);
void kernel_disable_single_step(void)
{
WARN_ON(!irqs_disabled());
- mdscr_write(mdscr_read() & ~DBG_MDSCR_SS);
+ mdscr_write(mdscr_read() & ~MDSCR_EL1_SS);
disable_debug_monitors(DBG_ACTIVE_EL1);
}
NOKPROBE_SYMBOL(kernel_disable_single_step);
@@ -431,7 +364,7 @@ NOKPROBE_SYMBOL(kernel_disable_single_step);
int kernel_active_single_step(void)
{
WARN_ON(!irqs_disabled());
- return mdscr_read() & DBG_MDSCR_SS;
+ return mdscr_read() & MDSCR_EL1_SS;
}
NOKPROBE_SYMBOL(kernel_active_single_step);
diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c
index 62230d6dd919..6c371b158b99 100644
--- a/arch/arm64/kernel/efi.c
+++ b/arch/arm64/kernel/efi.c
@@ -215,11 +215,6 @@ static int __init arm64_efi_rt_init(void)
if (!efi_enabled(EFI_RUNTIME_SERVICES))
return 0;
- if (!IS_ENABLED(CONFIG_VMAP_STACK)) {
- clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
- return -ENOMEM;
- }
-
p = arch_alloc_vmap_stack(THREAD_SIZE, NUMA_NO_NODE);
if (!p) {
pr_warn("Failed to allocate EFI runtime stack\n");
diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
index 7c1970b341b8..2b0c5925502e 100644
--- a/arch/arm64/kernel/entry-common.c
+++ b/arch/arm64/kernel/entry-common.c
@@ -8,6 +8,7 @@
#include <linux/context_tracking.h>
#include <linux/kasan.h>
#include <linux/linkage.h>
+#include <linux/livepatch.h>
#include <linux/lockdep.h>
#include <linux/ptrace.h>
#include <linux/resume_user_mode.h>
@@ -144,6 +145,9 @@ static void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags)
(void __user *)NULL, current);
}
+ if (thread_flags & _TIF_PATCH_PENDING)
+ klp_update_patch_state(current);
+
if (thread_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
do_signal(regs);
@@ -344,7 +348,7 @@ static DEFINE_PER_CPU(int, __in_cortex_a76_erratum_1463225_wa);
static void cortex_a76_erratum_1463225_svc_handler(void)
{
- u32 reg, val;
+ u64 reg, val;
if (!unlikely(test_thread_flag(TIF_SINGLESTEP)))
return;
@@ -354,7 +358,7 @@ static void cortex_a76_erratum_1463225_svc_handler(void)
__this_cpu_write(__in_cortex_a76_erratum_1463225_wa, 1);
reg = read_sysreg(mdscr_el1);
- val = reg | DBG_MDSCR_SS | DBG_MDSCR_KDE;
+ val = reg | MDSCR_EL1_SS | MDSCR_EL1_KDE;
write_sysreg(val, mdscr_el1);
asm volatile("msr daifclr, #8");
isb();
@@ -441,6 +445,28 @@ static __always_inline void fpsimd_syscall_exit(void)
__this_cpu_write(fpsimd_last_state.to_save, FP_STATE_CURRENT);
}
+/*
+ * In debug exception context, we explicitly disable preemption despite
+ * having interrupts disabled.
+ * This serves two purposes: it makes it much less likely that we would
+ * accidentally schedule in exception context and it will force a warning
+ * if we somehow manage to schedule by accident.
+ */
+static void debug_exception_enter(struct pt_regs *regs)
+{
+ preempt_disable();
+
+ /* This code is a bit fragile. Test it. */
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "exception_enter didn't work");
+}
+NOKPROBE_SYMBOL(debug_exception_enter);
+
+static void debug_exception_exit(struct pt_regs *regs)
+{
+ preempt_enable_no_resched();
+}
+NOKPROBE_SYMBOL(debug_exception_exit);
+
UNHANDLED(el1t, 64, sync)
UNHANDLED(el1t, 64, irq)
UNHANDLED(el1t, 64, fiq)
@@ -504,13 +530,51 @@ static void noinstr el1_mops(struct pt_regs *regs, unsigned long esr)
exit_to_kernel_mode(regs);
}
-static void noinstr el1_dbg(struct pt_regs *regs, unsigned long esr)
+static void noinstr el1_breakpt(struct pt_regs *regs, unsigned long esr)
+{
+ arm64_enter_el1_dbg(regs);
+ debug_exception_enter(regs);
+ do_breakpoint(esr, regs);
+ debug_exception_exit(regs);
+ arm64_exit_el1_dbg(regs);
+}
+
+static void noinstr el1_softstp(struct pt_regs *regs, unsigned long esr)
{
+ arm64_enter_el1_dbg(regs);
+ if (!cortex_a76_erratum_1463225_debug_handler(regs)) {
+ debug_exception_enter(regs);
+ /*
+ * After handling a breakpoint, we suspend the breakpoint
+ * and use single-step to move to the next instruction.
+ * If we are stepping a suspended breakpoint there's nothing more to do:
+ * the single-step is complete.
+ */
+ if (!try_step_suspended_breakpoints(regs))
+ do_el1_softstep(esr, regs);
+ debug_exception_exit(regs);
+ }
+ arm64_exit_el1_dbg(regs);
+}
+
+static void noinstr el1_watchpt(struct pt_regs *regs, unsigned long esr)
+{
+ /* Watchpoints are the only debug exception to write FAR_EL1 */
unsigned long far = read_sysreg(far_el1);
arm64_enter_el1_dbg(regs);
- if (!cortex_a76_erratum_1463225_debug_handler(regs))
- do_debug_exception(far, esr, regs);
+ debug_exception_enter(regs);
+ do_watchpoint(far, esr, regs);
+ debug_exception_exit(regs);
+ arm64_exit_el1_dbg(regs);
+}
+
+static void noinstr el1_brk64(struct pt_regs *regs, unsigned long esr)
+{
+ arm64_enter_el1_dbg(regs);
+ debug_exception_enter(regs);
+ do_el1_brk64(esr, regs);
+ debug_exception_exit(regs);
arm64_exit_el1_dbg(regs);
}
@@ -553,10 +617,16 @@ asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs)
el1_mops(regs, esr);
break;
case ESR_ELx_EC_BREAKPT_CUR:
+ el1_breakpt(regs, esr);
+ break;
case ESR_ELx_EC_SOFTSTP_CUR:
+ el1_softstp(regs, esr);
+ break;
case ESR_ELx_EC_WATCHPT_CUR:
+ el1_watchpt(regs, esr);
+ break;
case ESR_ELx_EC_BRK64:
- el1_dbg(regs, esr);
+ el1_brk64(regs, esr);
break;
case ESR_ELx_EC_FPAC:
el1_fpac(regs, esr);
@@ -747,14 +817,56 @@ static void noinstr el0_inv(struct pt_regs *regs, unsigned long esr)
exit_to_user_mode(regs);
}
-static void noinstr el0_dbg(struct pt_regs *regs, unsigned long esr)
+static void noinstr el0_breakpt(struct pt_regs *regs, unsigned long esr)
{
- /* Only watchpoints write FAR_EL1, otherwise its UNKNOWN */
+ if (!is_ttbr0_addr(regs->pc))
+ arm64_apply_bp_hardening();
+
+ enter_from_user_mode(regs);
+ debug_exception_enter(regs);
+ do_breakpoint(esr, regs);
+ debug_exception_exit(regs);
+ local_daif_restore(DAIF_PROCCTX);
+ exit_to_user_mode(regs);
+}
+
+static void noinstr el0_softstp(struct pt_regs *regs, unsigned long esr)
+{
+ if (!is_ttbr0_addr(regs->pc))
+ arm64_apply_bp_hardening();
+
+ enter_from_user_mode(regs);
+ /*
+ * After handling a breakpoint, we suspend the breakpoint
+ * and use single-step to move to the next instruction.
+ * If we are stepping a suspended breakpoint there's nothing more to do:
+ * the single-step is complete.
+ */
+ if (!try_step_suspended_breakpoints(regs)) {
+ local_daif_restore(DAIF_PROCCTX);
+ do_el0_softstep(esr, regs);
+ }
+ exit_to_user_mode(regs);
+}
+
+static void noinstr el0_watchpt(struct pt_regs *regs, unsigned long esr)
+{
+ /* Watchpoints are the only debug exception to write FAR_EL1 */
unsigned long far = read_sysreg(far_el1);
enter_from_user_mode(regs);
- do_debug_exception(far, esr, regs);
+ debug_exception_enter(regs);
+ do_watchpoint(far, esr, regs);
+ debug_exception_exit(regs);
+ local_daif_restore(DAIF_PROCCTX);
+ exit_to_user_mode(regs);
+}
+
+static void noinstr el0_brk64(struct pt_regs *regs, unsigned long esr)
+{
+ enter_from_user_mode(regs);
local_daif_restore(DAIF_PROCCTX);
+ do_el0_brk64(esr, regs);
exit_to_user_mode(regs);
}
@@ -826,10 +938,16 @@ asmlinkage void noinstr el0t_64_sync_handler(struct pt_regs *regs)
el0_gcs(regs, esr);
break;
case ESR_ELx_EC_BREAKPT_LOW:
+ el0_breakpt(regs, esr);
+ break;
case ESR_ELx_EC_SOFTSTP_LOW:
+ el0_softstp(regs, esr);
+ break;
case ESR_ELx_EC_WATCHPT_LOW:
+ el0_watchpt(regs, esr);
+ break;
case ESR_ELx_EC_BRK64:
- el0_dbg(regs, esr);
+ el0_brk64(regs, esr);
break;
case ESR_ELx_EC_FPAC:
el0_fpac(regs, esr);
@@ -912,6 +1030,14 @@ static void noinstr el0_svc_compat(struct pt_regs *regs)
exit_to_user_mode(regs);
}
+static void noinstr el0_bkpt32(struct pt_regs *regs, unsigned long esr)
+{
+ enter_from_user_mode(regs);
+ local_daif_restore(DAIF_PROCCTX);
+ do_bkpt32(esr, regs);
+ exit_to_user_mode(regs);
+}
+
asmlinkage void noinstr el0t_32_sync_handler(struct pt_regs *regs)
{
unsigned long esr = read_sysreg(esr_el1);
@@ -946,10 +1072,16 @@ asmlinkage void noinstr el0t_32_sync_handler(struct pt_regs *regs)
el0_cp15(regs, esr);
break;
case ESR_ELx_EC_BREAKPT_LOW:
+ el0_breakpt(regs, esr);
+ break;
case ESR_ELx_EC_SOFTSTP_LOW:
+ el0_softstp(regs, esr);
+ break;
case ESR_ELx_EC_WATCHPT_LOW:
+ el0_watchpt(regs, esr);
+ break;
case ESR_ELx_EC_BKPT32:
- el0_dbg(regs, esr);
+ el0_bkpt32(regs, esr);
break;
default:
el0_inv(regs, esr);
@@ -977,7 +1109,6 @@ UNHANDLED(el0t, 32, fiq)
UNHANDLED(el0t, 32, error)
#endif /* CONFIG_COMPAT */
-#ifdef CONFIG_VMAP_STACK
asmlinkage void noinstr __noreturn handle_bad_stack(struct pt_regs *regs)
{
unsigned long esr = read_sysreg(esr_el1);
@@ -986,7 +1117,6 @@ asmlinkage void noinstr __noreturn handle_bad_stack(struct pt_regs *regs)
arm64_enter_nmi(regs);
panic_bad_stack(regs, esr, far);
}
-#endif /* CONFIG_VMAP_STACK */
#ifdef CONFIG_ARM_SDE_INTERFACE
asmlinkage noinstr unsigned long
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index fe62218b8e0e..f8018b5c1f9a 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -55,7 +55,6 @@
.endif
sub sp, sp, #PT_REGS_SIZE
-#ifdef CONFIG_VMAP_STACK
/*
* Test whether the SP has overflowed, without corrupting a GPR.
* Task and IRQ stacks are aligned so that SP & (1 << THREAD_SHIFT)
@@ -97,7 +96,6 @@
/* We were already on the overflow stack. Restore sp/x0 and carry on. */
sub sp, sp, x0
mrs x0, tpidrro_el0
-#endif
b el\el\ht\()_\regsize\()_\label
.org .Lventry_start\@ + 128 // Did we overflow the ventry slot?
.endm
@@ -540,7 +538,6 @@ SYM_CODE_START(vectors)
kernel_ventry 0, t, 32, error // Error 32-bit EL0
SYM_CODE_END(vectors)
-#ifdef CONFIG_VMAP_STACK
SYM_CODE_START_LOCAL(__bad_stack)
/*
* We detected an overflow in kernel_ventry, which switched to the
@@ -568,7 +565,6 @@ SYM_CODE_START_LOCAL(__bad_stack)
bl handle_bad_stack
ASM_BUG()
SYM_CODE_END(__bad_stack)
-#endif /* CONFIG_VMAP_STACK */
.macro entry_handler el:req, ht:req, regsize:req, label:req
@@ -1009,7 +1005,6 @@ SYM_CODE_START(__sdei_asm_handler)
1: adr_this_cpu dst=x5, sym=sdei_active_critical_event, tmp=x6
2: str x19, [x5]
-#ifdef CONFIG_VMAP_STACK
/*
* entry.S may have been using sp as a scratch register, find whether
* this is a normal or critical event and switch to the appropriate
@@ -1022,7 +1017,6 @@ SYM_CODE_START(__sdei_asm_handler)
2: mov x6, #SDEI_STACK_SIZE
add x5, x5, x6
mov sp, x5
-#endif
#ifdef CONFIG_SHADOW_CALL_STACK
/* Use a separate shadow call stack for normal and critical events */
diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c
index 722ac45f9f7b..ab76b36dce82 100644
--- a/arch/arm64/kernel/hw_breakpoint.c
+++ b/arch/arm64/kernel/hw_breakpoint.c
@@ -22,6 +22,7 @@
#include <asm/current.h>
#include <asm/debug-monitors.h>
#include <asm/esr.h>
+#include <asm/exception.h>
#include <asm/hw_breakpoint.h>
#include <asm/traps.h>
#include <asm/cputype.h>
@@ -618,8 +619,7 @@ NOKPROBE_SYMBOL(toggle_bp_registers);
/*
* Debug exception handlers.
*/
-static int breakpoint_handler(unsigned long unused, unsigned long esr,
- struct pt_regs *regs)
+void do_breakpoint(unsigned long esr, struct pt_regs *regs)
{
int i, step = 0, *kernel_step;
u32 ctrl_reg;
@@ -662,7 +662,7 @@ unlock:
}
if (!step)
- return 0;
+ return;
if (user_mode(regs)) {
debug_info->bps_disabled = 1;
@@ -670,7 +670,7 @@ unlock:
/* If we're already stepping a watchpoint, just return. */
if (debug_info->wps_disabled)
- return 0;
+ return;
if (test_thread_flag(TIF_SINGLESTEP))
debug_info->suspended_step = 1;
@@ -681,7 +681,7 @@ unlock:
kernel_step = this_cpu_ptr(&stepping_kernel_bp);
if (*kernel_step != ARM_KERNEL_STEP_NONE)
- return 0;
+ return;
if (kernel_active_single_step()) {
*kernel_step = ARM_KERNEL_STEP_SUSPEND;
@@ -690,10 +690,8 @@ unlock:
kernel_enable_single_step(regs);
}
}
-
- return 0;
}
-NOKPROBE_SYMBOL(breakpoint_handler);
+NOKPROBE_SYMBOL(do_breakpoint);
/*
* Arm64 hardware does not always report a watchpoint hit address that matches
@@ -752,8 +750,7 @@ static int watchpoint_report(struct perf_event *wp, unsigned long addr,
return step;
}
-static int watchpoint_handler(unsigned long addr, unsigned long esr,
- struct pt_regs *regs)
+void do_watchpoint(unsigned long addr, unsigned long esr, struct pt_regs *regs)
{
int i, step = 0, *kernel_step, access, closest_match = 0;
u64 min_dist = -1, dist;
@@ -808,7 +805,7 @@ static int watchpoint_handler(unsigned long addr, unsigned long esr,
rcu_read_unlock();
if (!step)
- return 0;
+ return;
/*
* We always disable EL0 watchpoints because the kernel can
@@ -821,7 +818,7 @@ static int watchpoint_handler(unsigned long addr, unsigned long esr,
/* If we're already stepping a breakpoint, just return. */
if (debug_info->bps_disabled)
- return 0;
+ return;
if (test_thread_flag(TIF_SINGLESTEP))
debug_info->suspended_step = 1;
@@ -832,7 +829,7 @@ static int watchpoint_handler(unsigned long addr, unsigned long esr,
kernel_step = this_cpu_ptr(&stepping_kernel_bp);
if (*kernel_step != ARM_KERNEL_STEP_NONE)
- return 0;
+ return;
if (kernel_active_single_step()) {
*kernel_step = ARM_KERNEL_STEP_SUSPEND;
@@ -841,44 +838,41 @@ static int watchpoint_handler(unsigned long addr, unsigned long esr,
kernel_enable_single_step(regs);
}
}
-
- return 0;
}
-NOKPROBE_SYMBOL(watchpoint_handler);
+NOKPROBE_SYMBOL(do_watchpoint);
/*
* Handle single-step exception.
*/
-int reinstall_suspended_bps(struct pt_regs *regs)
+bool try_step_suspended_breakpoints(struct pt_regs *regs)
{
struct debug_info *debug_info = &current->thread.debug;
- int handled_exception = 0, *kernel_step;
-
- kernel_step = this_cpu_ptr(&stepping_kernel_bp);
+ int *kernel_step = this_cpu_ptr(&stepping_kernel_bp);
+ bool handled_exception = false;
/*
- * Called from single-step exception handler.
- * Return 0 if execution can resume, 1 if a SIGTRAP should be
- * reported.
+ * Called from single-step exception entry.
+ * Return true if we stepped a breakpoint and can resume execution,
+ * false if we need to handle a single-step.
*/
if (user_mode(regs)) {
if (debug_info->bps_disabled) {
debug_info->bps_disabled = 0;
toggle_bp_registers(AARCH64_DBG_REG_BCR, DBG_ACTIVE_EL0, 1);
- handled_exception = 1;
+ handled_exception = true;
}
if (debug_info->wps_disabled) {
debug_info->wps_disabled = 0;
toggle_bp_registers(AARCH64_DBG_REG_WCR, DBG_ACTIVE_EL0, 1);
- handled_exception = 1;
+ handled_exception = true;
}
if (handled_exception) {
if (debug_info->suspended_step) {
debug_info->suspended_step = 0;
/* Allow exception handling to fall-through. */
- handled_exception = 0;
+ handled_exception = false;
} else {
user_disable_single_step(current);
}
@@ -892,17 +886,17 @@ int reinstall_suspended_bps(struct pt_regs *regs)
if (*kernel_step != ARM_KERNEL_STEP_SUSPEND) {
kernel_disable_single_step();
- handled_exception = 1;
+ handled_exception = true;
} else {
- handled_exception = 0;
+ handled_exception = false;
}
*kernel_step = ARM_KERNEL_STEP_NONE;
}
- return !handled_exception;
+ return handled_exception;
}
-NOKPROBE_SYMBOL(reinstall_suspended_bps);
+NOKPROBE_SYMBOL(try_step_suspended_breakpoints);
/*
* Context-switcher for restoring suspended breakpoints.
@@ -987,12 +981,6 @@ static int __init arch_hw_breakpoint_init(void)
pr_info("found %d breakpoint and %d watchpoint registers.\n",
core_num_brps, core_num_wrps);
- /* Register debug fault handlers. */
- hook_debug_fault_code(DBG_ESR_EVT_HWBP, breakpoint_handler, SIGTRAP,
- TRAP_HWBKPT, "hw-breakpoint handler");
- hook_debug_fault_code(DBG_ESR_EVT_HWWP, watchpoint_handler, SIGTRAP,
- TRAP_HWBKPT, "hw-watchpoint handler");
-
/*
* Reset the breakpoint resources. We assume that a halting
* debugger will leave the world in a nice state for us.
diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c
index 85087e2df564..c0065a1d77cf 100644
--- a/arch/arm64/kernel/irq.c
+++ b/arch/arm64/kernel/irq.c
@@ -51,7 +51,6 @@ static void init_irq_scs(void)
scs_alloc(early_cpu_to_node(cpu));
}
-#ifdef CONFIG_VMAP_STACK
static void __init init_irq_stacks(void)
{
int cpu;
@@ -62,18 +61,6 @@ static void __init init_irq_stacks(void)
per_cpu(irq_stack_ptr, cpu) = p;
}
}
-#else
-/* irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned. */
-DEFINE_PER_CPU_ALIGNED(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack);
-
-static void init_irq_stacks(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- per_cpu(irq_stack_ptr, cpu) = per_cpu(irq_stack, cpu);
-}
-#endif
#ifndef CONFIG_PREEMPT_RT
static void ____do_softirq(struct pt_regs *regs)
diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c
index f3c4d3a8a20f..968324a79a89 100644
--- a/arch/arm64/kernel/kgdb.c
+++ b/arch/arm64/kernel/kgdb.c
@@ -234,23 +234,23 @@ int kgdb_arch_handle_exception(int exception_vector, int signo,
return err;
}
-static int kgdb_brk_fn(struct pt_regs *regs, unsigned long esr)
+int kgdb_brk_handler(struct pt_regs *regs, unsigned long esr)
{
kgdb_handle_exception(1, SIGTRAP, 0, regs);
return DBG_HOOK_HANDLED;
}
-NOKPROBE_SYMBOL(kgdb_brk_fn)
+NOKPROBE_SYMBOL(kgdb_brk_handler)
-static int kgdb_compiled_brk_fn(struct pt_regs *regs, unsigned long esr)
+int kgdb_compiled_brk_handler(struct pt_regs *regs, unsigned long esr)
{
compiled_break = 1;
kgdb_handle_exception(1, SIGTRAP, 0, regs);
return DBG_HOOK_HANDLED;
}
-NOKPROBE_SYMBOL(kgdb_compiled_brk_fn);
+NOKPROBE_SYMBOL(kgdb_compiled_brk_handler);
-static int kgdb_step_brk_fn(struct pt_regs *regs, unsigned long esr)
+int kgdb_single_step_handler(struct pt_regs *regs, unsigned long esr)
{
if (!kgdb_single_step)
return DBG_HOOK_ERROR;
@@ -258,21 +258,7 @@ static int kgdb_step_brk_fn(struct pt_regs *regs, unsigned long esr)
kgdb_handle_exception(0, SIGTRAP, 0, regs);
return DBG_HOOK_HANDLED;
}
-NOKPROBE_SYMBOL(kgdb_step_brk_fn);
-
-static struct break_hook kgdb_brkpt_hook = {
- .fn = kgdb_brk_fn,
- .imm = KGDB_DYN_DBG_BRK_IMM,
-};
-
-static struct break_hook kgdb_compiled_brkpt_hook = {
- .fn = kgdb_compiled_brk_fn,
- .imm = KGDB_COMPILED_DBG_BRK_IMM,
-};
-
-static struct step_hook kgdb_step_hook = {
- .fn = kgdb_step_brk_fn
-};
+NOKPROBE_SYMBOL(kgdb_single_step_handler);
static int __kgdb_notify(struct die_args *args, unsigned long cmd)
{
@@ -311,15 +297,7 @@ static struct notifier_block kgdb_notifier = {
*/
int kgdb_arch_init(void)
{
- int ret = register_die_notifier(&kgdb_notifier);
-
- if (ret != 0)
- return ret;
-
- register_kernel_break_hook(&kgdb_brkpt_hook);
- register_kernel_break_hook(&kgdb_compiled_brkpt_hook);
- register_kernel_step_hook(&kgdb_step_hook);
- return 0;
+ return register_die_notifier(&kgdb_notifier);
}
/*
@@ -329,9 +307,6 @@ int kgdb_arch_init(void)
*/
void kgdb_arch_exit(void)
{
- unregister_kernel_break_hook(&kgdb_brkpt_hook);
- unregister_kernel_break_hook(&kgdb_compiled_brkpt_hook);
- unregister_kernel_step_hook(&kgdb_step_hook);
unregister_die_notifier(&kgdb_notifier);
}
diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
index 06bb680bfe97..40148d2725ce 100644
--- a/arch/arm64/kernel/module.c
+++ b/arch/arm64/kernel/module.c
@@ -23,6 +23,7 @@
#include <asm/insn.h>
#include <asm/scs.h>
#include <asm/sections.h>
+#include <asm/text-patching.h>
enum aarch64_reloc_op {
RELOC_OP_NONE,
@@ -48,7 +49,17 @@ static u64 do_reloc(enum aarch64_reloc_op reloc_op, __le32 *place, u64 val)
return 0;
}
-static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len)
+#define WRITE_PLACE(place, val, mod) do { \
+ __typeof__(val) __val = (val); \
+ \
+ if (mod->state == MODULE_STATE_UNFORMED) \
+ *(place) = __val; \
+ else \
+ aarch64_insn_copy(place, &(__val), sizeof(*place)); \
+} while (0)
+
+static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len,
+ struct module *me)
{
s64 sval = do_reloc(op, place, val);
@@ -66,7 +77,7 @@ static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len)
switch (len) {
case 16:
- *(s16 *)place = sval;
+ WRITE_PLACE((s16 *)place, sval, me);
switch (op) {
case RELOC_OP_ABS:
if (sval < 0 || sval > U16_MAX)
@@ -82,7 +93,7 @@ static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len)
}
break;
case 32:
- *(s32 *)place = sval;
+ WRITE_PLACE((s32 *)place, sval, me);
switch (op) {
case RELOC_OP_ABS:
if (sval < 0 || sval > U32_MAX)
@@ -98,7 +109,7 @@ static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len)
}
break;
case 64:
- *(s64 *)place = sval;
+ WRITE_PLACE((s64 *)place, sval, me);
break;
default:
pr_err("Invalid length (%d) for data relocation\n", len);
@@ -113,7 +124,8 @@ enum aarch64_insn_movw_imm_type {
};
static int reloc_insn_movw(enum aarch64_reloc_op op, __le32 *place, u64 val,
- int lsb, enum aarch64_insn_movw_imm_type imm_type)
+ int lsb, enum aarch64_insn_movw_imm_type imm_type,
+ struct module *me)
{
u64 imm;
s64 sval;
@@ -145,7 +157,7 @@ static int reloc_insn_movw(enum aarch64_reloc_op op, __le32 *place, u64 val,
/* Update the instruction with the new encoding. */
insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_16, insn, imm);
- *place = cpu_to_le32(insn);
+ WRITE_PLACE(place, cpu_to_le32(insn), me);
if (imm > U16_MAX)
return -ERANGE;
@@ -154,7 +166,8 @@ static int reloc_insn_movw(enum aarch64_reloc_op op, __le32 *place, u64 val,
}
static int reloc_insn_imm(enum aarch64_reloc_op op, __le32 *place, u64 val,
- int lsb, int len, enum aarch64_insn_imm_type imm_type)
+ int lsb, int len, enum aarch64_insn_imm_type imm_type,
+ struct module *me)
{
u64 imm, imm_mask;
s64 sval;
@@ -170,7 +183,7 @@ static int reloc_insn_imm(enum aarch64_reloc_op op, __le32 *place, u64 val,
/* Update the instruction's immediate field. */
insn = aarch64_insn_encode_immediate(imm_type, insn, imm);
- *place = cpu_to_le32(insn);
+ WRITE_PLACE(place, cpu_to_le32(insn), me);
/*
* Extract the upper value bits (including the sign bit) and
@@ -189,17 +202,17 @@ static int reloc_insn_imm(enum aarch64_reloc_op op, __le32 *place, u64 val,
}
static int reloc_insn_adrp(struct module *mod, Elf64_Shdr *sechdrs,
- __le32 *place, u64 val)
+ __le32 *place, u64 val, struct module *me)
{
u32 insn;
if (!is_forbidden_offset_for_adrp(place))
return reloc_insn_imm(RELOC_OP_PAGE, place, val, 12, 21,
- AARCH64_INSN_IMM_ADR);
+ AARCH64_INSN_IMM_ADR, me);
/* patch ADRP to ADR if it is in range */
if (!reloc_insn_imm(RELOC_OP_PREL, place, val & ~0xfff, 0, 21,
- AARCH64_INSN_IMM_ADR)) {
+ AARCH64_INSN_IMM_ADR, me)) {
insn = le32_to_cpu(*place);
insn &= ~BIT(31);
} else {
@@ -211,7 +224,7 @@ static int reloc_insn_adrp(struct module *mod, Elf64_Shdr *sechdrs,
AARCH64_INSN_BRANCH_NOLINK);
}
- *place = cpu_to_le32(insn);
+ WRITE_PLACE(place, cpu_to_le32(insn), me);
return 0;
}
@@ -255,23 +268,23 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
/* Data relocations. */
case R_AARCH64_ABS64:
overflow_check = false;
- ovf = reloc_data(RELOC_OP_ABS, loc, val, 64);
+ ovf = reloc_data(RELOC_OP_ABS, loc, val, 64, me);
break;
case R_AARCH64_ABS32:
- ovf = reloc_data(RELOC_OP_ABS, loc, val, 32);
+ ovf = reloc_data(RELOC_OP_ABS, loc, val, 32, me);
break;
case R_AARCH64_ABS16:
- ovf = reloc_data(RELOC_OP_ABS, loc, val, 16);
+ ovf = reloc_data(RELOC_OP_ABS, loc, val, 16, me);
break;
case R_AARCH64_PREL64:
overflow_check = false;
- ovf = reloc_data(RELOC_OP_PREL, loc, val, 64);
+ ovf = reloc_data(RELOC_OP_PREL, loc, val, 64, me);
break;
case R_AARCH64_PREL32:
- ovf = reloc_data(RELOC_OP_PREL, loc, val, 32);
+ ovf = reloc_data(RELOC_OP_PREL, loc, val, 32, me);
break;
case R_AARCH64_PREL16:
- ovf = reloc_data(RELOC_OP_PREL, loc, val, 16);
+ ovf = reloc_data(RELOC_OP_PREL, loc, val, 16, me);
break;
/* MOVW instruction relocations. */
@@ -280,88 +293,88 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
fallthrough;
case R_AARCH64_MOVW_UABS_G0:
ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 0,
- AARCH64_INSN_IMM_MOVKZ);
+ AARCH64_INSN_IMM_MOVKZ, me);
break;
case R_AARCH64_MOVW_UABS_G1_NC:
overflow_check = false;
fallthrough;
case R_AARCH64_MOVW_UABS_G1:
ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 16,
- AARCH64_INSN_IMM_MOVKZ);
+ AARCH64_INSN_IMM_MOVKZ, me);
break;
case R_AARCH64_MOVW_UABS_G2_NC:
overflow_check = false;
fallthrough;
case R_AARCH64_MOVW_UABS_G2:
ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 32,
- AARCH64_INSN_IMM_MOVKZ);
+ AARCH64_INSN_IMM_MOVKZ, me);
break;
case R_AARCH64_MOVW_UABS_G3:
/* We're using the top bits so we can't overflow. */
overflow_check = false;
ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 48,
- AARCH64_INSN_IMM_MOVKZ);
+ AARCH64_INSN_IMM_MOVKZ, me);
break;
case R_AARCH64_MOVW_SABS_G0:
ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 0,
- AARCH64_INSN_IMM_MOVNZ);
+ AARCH64_INSN_IMM_MOVNZ, me);
break;
case R_AARCH64_MOVW_SABS_G1:
ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 16,
- AARCH64_INSN_IMM_MOVNZ);
+ AARCH64_INSN_IMM_MOVNZ, me);
break;
case R_AARCH64_MOVW_SABS_G2:
ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 32,
- AARCH64_INSN_IMM_MOVNZ);
+ AARCH64_INSN_IMM_MOVNZ, me);
break;
case R_AARCH64_MOVW_PREL_G0_NC:
overflow_check = false;
ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 0,
- AARCH64_INSN_IMM_MOVKZ);
+ AARCH64_INSN_IMM_MOVKZ, me);
break;
case R_AARCH64_MOVW_PREL_G0:
ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 0,
- AARCH64_INSN_IMM_MOVNZ);
+ AARCH64_INSN_IMM_MOVNZ, me);
break;
case R_AARCH64_MOVW_PREL_G1_NC:
overflow_check = false;
ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 16,
- AARCH64_INSN_IMM_MOVKZ);
+ AARCH64_INSN_IMM_MOVKZ, me);
break;
case R_AARCH64_MOVW_PREL_G1:
ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 16,
- AARCH64_INSN_IMM_MOVNZ);
+ AARCH64_INSN_IMM_MOVNZ, me);
break;
case R_AARCH64_MOVW_PREL_G2_NC:
overflow_check = false;
ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 32,
- AARCH64_INSN_IMM_MOVKZ);
+ AARCH64_INSN_IMM_MOVKZ, me);
break;
case R_AARCH64_MOVW_PREL_G2:
ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 32,
- AARCH64_INSN_IMM_MOVNZ);
+ AARCH64_INSN_IMM_MOVNZ, me);
break;
case R_AARCH64_MOVW_PREL_G3:
/* We're using the top bits so we can't overflow. */
overflow_check = false;
ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 48,
- AARCH64_INSN_IMM_MOVNZ);
+ AARCH64_INSN_IMM_MOVNZ, me);
break;
/* Immediate instruction relocations. */
case R_AARCH64_LD_PREL_LO19:
ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 19,
- AARCH64_INSN_IMM_19);
+ AARCH64_INSN_IMM_19, me);
break;
case R_AARCH64_ADR_PREL_LO21:
ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 0, 21,
- AARCH64_INSN_IMM_ADR);
+ AARCH64_INSN_IMM_ADR, me);
break;
case R_AARCH64_ADR_PREL_PG_HI21_NC:
overflow_check = false;
fallthrough;
case R_AARCH64_ADR_PREL_PG_HI21:
- ovf = reloc_insn_adrp(me, sechdrs, loc, val);
+ ovf = reloc_insn_adrp(me, sechdrs, loc, val, me);
if (ovf && ovf != -ERANGE)
return ovf;
break;
@@ -369,46 +382,46 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
case R_AARCH64_LDST8_ABS_LO12_NC:
overflow_check = false;
ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 0, 12,
- AARCH64_INSN_IMM_12);
+ AARCH64_INSN_IMM_12, me);
break;
case R_AARCH64_LDST16_ABS_LO12_NC:
overflow_check = false;
ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 1, 11,
- AARCH64_INSN_IMM_12);
+ AARCH64_INSN_IMM_12, me);
break;
case R_AARCH64_LDST32_ABS_LO12_NC:
overflow_check = false;
ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 2, 10,
- AARCH64_INSN_IMM_12);
+ AARCH64_INSN_IMM_12, me);
break;
case R_AARCH64_LDST64_ABS_LO12_NC:
overflow_check = false;
ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 3, 9,
- AARCH64_INSN_IMM_12);
+ AARCH64_INSN_IMM_12, me);
break;
case R_AARCH64_LDST128_ABS_LO12_NC:
overflow_check = false;
ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 4, 8,
- AARCH64_INSN_IMM_12);
+ AARCH64_INSN_IMM_12, me);
break;
case R_AARCH64_TSTBR14:
ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 14,
- AARCH64_INSN_IMM_14);
+ AARCH64_INSN_IMM_14, me);
break;
case R_AARCH64_CONDBR19:
ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 19,
- AARCH64_INSN_IMM_19);
+ AARCH64_INSN_IMM_19, me);
break;
case R_AARCH64_JUMP26:
case R_AARCH64_CALL26:
ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 26,
- AARCH64_INSN_IMM_26);
+ AARCH64_INSN_IMM_26, me);
if (ovf == -ERANGE) {
val = module_emit_plt_entry(me, sechdrs, loc, &rel[i], sym);
if (!val)
return -ENOEXEC;
ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2,
- 26, AARCH64_INSN_IMM_26);
+ 26, AARCH64_INSN_IMM_26, me);
}
break;
diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c
index 2fbfd27ff5f2..e5e773844889 100644
--- a/arch/arm64/kernel/mte.c
+++ b/arch/arm64/kernel/mte.c
@@ -200,7 +200,7 @@ static void mte_update_sctlr_user(struct task_struct *task)
* program requested values go with what was requested.
*/
resolved_mte_tcf = (mte_ctrl & pref) ? pref : mte_ctrl;
- sctlr &= ~SCTLR_EL1_TCF0_MASK;
+ sctlr &= ~(SCTLR_EL1_TCF0_MASK | SCTLR_EL1_TCSO0_MASK);
/*
* Pick an actual setting. The order in which we check for
* set bits and map into register values determines our
@@ -212,6 +212,10 @@ static void mte_update_sctlr_user(struct task_struct *task)
sctlr |= SYS_FIELD_PREP_ENUM(SCTLR_EL1, TCF0, ASYNC);
else if (resolved_mte_tcf & MTE_CTRL_TCF_SYNC)
sctlr |= SYS_FIELD_PREP_ENUM(SCTLR_EL1, TCF0, SYNC);
+
+ if (mte_ctrl & MTE_CTRL_STORE_ONLY)
+ sctlr |= SYS_FIELD_PREP(SCTLR_EL1, TCSO0, 1);
+
task->thread.sctlr_user = sctlr;
}
@@ -371,6 +375,9 @@ long set_mte_ctrl(struct task_struct *task, unsigned long arg)
(arg & PR_MTE_TCF_SYNC))
mte_ctrl |= MTE_CTRL_TCF_ASYMM;
+ if (arg & PR_MTE_STORE_ONLY)
+ mte_ctrl |= MTE_CTRL_STORE_ONLY;
+
task->thread.mte_ctrl = mte_ctrl;
if (task == current) {
preempt_disable();
@@ -398,6 +405,8 @@ long get_mte_ctrl(struct task_struct *task)
ret |= PR_MTE_TCF_ASYNC;
if (mte_ctrl & MTE_CTRL_TCF_SYNC)
ret |= PR_MTE_TCF_SYNC;
+ if (mte_ctrl & MTE_CTRL_STORE_ONLY)
+ ret |= PR_MTE_STORE_ONLY;
return ret;
}
diff --git a/arch/arm64/kernel/pi/Makefile b/arch/arm64/kernel/pi/Makefile
index f440bf57b1a5..be92d73c25b2 100644
--- a/arch/arm64/kernel/pi/Makefile
+++ b/arch/arm64/kernel/pi/Makefile
@@ -41,4 +41,4 @@ obj-y := idreg-override.pi.o \
obj-$(CONFIG_RELOCATABLE) += relocate.pi.o
obj-$(CONFIG_RANDOMIZE_BASE) += kaslr_early.pi.o
obj-$(CONFIG_UNWIND_PATCH_PAC_INTO_SCS) += patch-scs.pi.o
-extra-y := $(patsubst %.pi.o,%.o,$(obj-y))
+targets := $(patsubst %.pi.o,%.o,$(obj-y))
diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c
index d9e462eafb95..0c5d408afd95 100644
--- a/arch/arm64/kernel/probes/kprobes.c
+++ b/arch/arm64/kernel/probes/kprobes.c
@@ -292,8 +292,8 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned int fsr)
return 0;
}
-static int __kprobes
-kprobe_breakpoint_handler(struct pt_regs *regs, unsigned long esr)
+int __kprobes
+kprobe_brk_handler(struct pt_regs *regs, unsigned long esr)
{
struct kprobe *p, *cur_kprobe;
struct kprobe_ctlblk *kcb;
@@ -336,13 +336,8 @@ kprobe_breakpoint_handler(struct pt_regs *regs, unsigned long esr)
return DBG_HOOK_HANDLED;
}
-static struct break_hook kprobes_break_hook = {
- .imm = KPROBES_BRK_IMM,
- .fn = kprobe_breakpoint_handler,
-};
-
-static int __kprobes
-kprobe_breakpoint_ss_handler(struct pt_regs *regs, unsigned long esr)
+int __kprobes
+kprobe_ss_brk_handler(struct pt_regs *regs, unsigned long esr)
{
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
unsigned long addr = instruction_pointer(regs);
@@ -360,13 +355,8 @@ kprobe_breakpoint_ss_handler(struct pt_regs *regs, unsigned long esr)
return DBG_HOOK_ERROR;
}
-static struct break_hook kprobes_break_ss_hook = {
- .imm = KPROBES_BRK_SS_IMM,
- .fn = kprobe_breakpoint_ss_handler,
-};
-
-static int __kprobes
-kretprobe_breakpoint_handler(struct pt_regs *regs, unsigned long esr)
+int __kprobes
+kretprobe_brk_handler(struct pt_regs *regs, unsigned long esr)
{
if (regs->pc != (unsigned long)__kretprobe_trampoline)
return DBG_HOOK_ERROR;
@@ -375,11 +365,6 @@ kretprobe_breakpoint_handler(struct pt_regs *regs, unsigned long esr)
return DBG_HOOK_HANDLED;
}
-static struct break_hook kretprobes_break_hook = {
- .imm = KRETPROBES_BRK_IMM,
- .fn = kretprobe_breakpoint_handler,
-};
-
/*
* Provide a blacklist of symbols identifying ranges which cannot be kprobed.
* This blacklist is exposed to userspace via debugfs (kprobes/blacklist).
@@ -422,9 +407,5 @@ int __kprobes arch_trampoline_kprobe(struct kprobe *p)
int __init arch_init_kprobes(void)
{
- register_kernel_break_hook(&kprobes_break_hook);
- register_kernel_break_hook(&kprobes_break_ss_hook);
- register_kernel_break_hook(&kretprobes_break_hook);
-
return 0;
}
diff --git a/arch/arm64/kernel/probes/kprobes_trampoline.S b/arch/arm64/kernel/probes/kprobes_trampoline.S
index a362f3dbb3d1..b60739d3983f 100644
--- a/arch/arm64/kernel/probes/kprobes_trampoline.S
+++ b/arch/arm64/kernel/probes/kprobes_trampoline.S
@@ -12,7 +12,7 @@
SYM_CODE_START(__kretprobe_trampoline)
/*
* Trigger a breakpoint exception. The PC will be adjusted by
- * kretprobe_breakpoint_handler(), and no subsequent instructions will
+ * kretprobe_brk_handler(), and no subsequent instructions will
* be executed from the trampoline.
*/
brk #KRETPROBES_BRK_IMM
diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c
index cb3d05af36e3..1f91fd2a8187 100644
--- a/arch/arm64/kernel/probes/uprobes.c
+++ b/arch/arm64/kernel/probes/uprobes.c
@@ -173,7 +173,7 @@ int arch_uprobe_exception_notify(struct notifier_block *self,
return NOTIFY_DONE;
}
-static int uprobe_breakpoint_handler(struct pt_regs *regs,
+int uprobe_brk_handler(struct pt_regs *regs,
unsigned long esr)
{
if (uprobe_pre_sstep_notifier(regs))
@@ -182,7 +182,7 @@ static int uprobe_breakpoint_handler(struct pt_regs *regs,
return DBG_HOOK_ERROR;
}
-static int uprobe_single_step_handler(struct pt_regs *regs,
+int uprobe_single_step_handler(struct pt_regs *regs,
unsigned long esr)
{
struct uprobe_task *utask = current->utask;
@@ -194,23 +194,3 @@ static int uprobe_single_step_handler(struct pt_regs *regs,
return DBG_HOOK_ERROR;
}
-/* uprobe breakpoint handler hook */
-static struct break_hook uprobes_break_hook = {
- .imm = UPROBES_BRK_IMM,
- .fn = uprobe_breakpoint_handler,
-};
-
-/* uprobe single step handler hook */
-static struct step_hook uprobes_step_hook = {
- .fn = uprobe_single_step_handler,
-};
-
-static int __init arch_init_uprobes(void)
-{
- register_user_break_hook(&uprobes_break_hook);
- register_user_step_hook(&uprobes_step_hook);
-
- return 0;
-}
-
-device_initcall(arch_init_uprobes);
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 08b7042a2e2d..96482a1412c6 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -307,13 +307,13 @@ static int copy_thread_gcs(struct task_struct *p,
p->thread.gcs_base = 0;
p->thread.gcs_size = 0;
+ p->thread.gcs_el0_mode = current->thread.gcs_el0_mode;
+ p->thread.gcs_el0_locked = current->thread.gcs_el0_locked;
+
gcs = gcs_alloc_thread_stack(p, args);
if (IS_ERR_VALUE(gcs))
return PTR_ERR((void *)gcs);
- p->thread.gcs_el0_mode = current->thread.gcs_el0_mode;
- p->thread.gcs_el0_locked = current->thread.gcs_el0_locked;
-
return 0;
}
@@ -341,7 +341,6 @@ void flush_thread(void)
void arch_release_task_struct(struct task_struct *tsk)
{
fpsimd_release_task(tsk);
- gcs_free(tsk);
}
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
@@ -856,10 +855,14 @@ long set_tagged_addr_ctrl(struct task_struct *task, unsigned long arg)
if (is_compat_thread(ti))
return -EINVAL;
- if (system_supports_mte())
+ if (system_supports_mte()) {
valid_mask |= PR_MTE_TCF_SYNC | PR_MTE_TCF_ASYNC \
| PR_MTE_TAG_MASK;
+ if (cpus_have_cap(ARM64_MTE_STORE_ONLY))
+ valid_mask |= PR_MTE_STORE_ONLY;
+ }
+
if (arg & ~valid_mask)
return -EINVAL;
diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c
index 255d12f881c2..6f24a0251e18 100644
--- a/arch/arm64/kernel/sdei.c
+++ b/arch/arm64/kernel/sdei.c
@@ -34,10 +34,8 @@ unsigned long sdei_exit_mode;
DECLARE_PER_CPU(unsigned long *, sdei_stack_normal_ptr);
DECLARE_PER_CPU(unsigned long *, sdei_stack_critical_ptr);
-#ifdef CONFIG_VMAP_STACK
DEFINE_PER_CPU(unsigned long *, sdei_stack_normal_ptr);
DEFINE_PER_CPU(unsigned long *, sdei_stack_critical_ptr);
-#endif
DECLARE_PER_CPU(unsigned long *, sdei_shadow_call_stack_normal_ptr);
DECLARE_PER_CPU(unsigned long *, sdei_shadow_call_stack_critical_ptr);
@@ -65,8 +63,7 @@ static void free_sdei_stacks(void)
{
int cpu;
- if (!IS_ENABLED(CONFIG_VMAP_STACK))
- return;
+ BUILD_BUG_ON(!IS_ENABLED(CONFIG_VMAP_STACK));
for_each_possible_cpu(cpu) {
_free_sdei_stack(&sdei_stack_normal_ptr, cpu);
@@ -91,8 +88,7 @@ static int init_sdei_stacks(void)
int cpu;
int err = 0;
- if (!IS_ENABLED(CONFIG_VMAP_STACK))
- return 0;
+ BUILD_BUG_ON(!IS_ENABLED(CONFIG_VMAP_STACK));
for_each_possible_cpu(cpu) {
err = _init_sdei_stack(&sdei_stack_normal_ptr, cpu);
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 417140cd399b..db3f972f8cd9 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -95,8 +95,11 @@ static void save_reset_user_access_state(struct user_access_state *ua_state)
ua_state->por_el0 = read_sysreg_s(SYS_POR_EL0);
write_sysreg_s(por_enable_all, SYS_POR_EL0);
- /* Ensure that any subsequent uaccess observes the updated value */
- isb();
+ /*
+ * No ISB required as we can tolerate spurious Overlay faults -
+ * the fault handler will check again based on the new value
+ * of POR_EL0.
+ */
}
}
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 21a795303568..68cea3a4a35c 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -64,26 +64,18 @@ struct secondary_data secondary_data;
/* Number of CPUs which aren't online, but looping in kernel text. */
static int cpus_stuck_in_kernel;
-enum ipi_msg_type {
- IPI_RESCHEDULE,
- IPI_CALL_FUNC,
- IPI_CPU_STOP,
- IPI_CPU_STOP_NMI,
- IPI_TIMER,
- IPI_IRQ_WORK,
- NR_IPI,
- /*
- * Any enum >= NR_IPI and < MAX_IPI is special and not tracable
- * with trace_ipi_*
- */
- IPI_CPU_BACKTRACE = NR_IPI,
- IPI_KGDB_ROUNDUP,
- MAX_IPI
-};
-
static int ipi_irq_base __ro_after_init;
static int nr_ipi __ro_after_init = NR_IPI;
-static struct irq_desc *ipi_desc[MAX_IPI] __ro_after_init;
+
+struct ipi_descs {
+ struct irq_desc *descs[MAX_IPI];
+};
+
+static DEFINE_PER_CPU_READ_MOSTLY(struct ipi_descs, pcpu_ipi_desc);
+
+#define get_ipi_desc(__cpu, __ipi) (per_cpu_ptr(&pcpu_ipi_desc, __cpu)->descs[__ipi])
+
+static bool percpu_ipi_descs __ro_after_init;
static bool crash_stop;
@@ -844,7 +836,7 @@ int arch_show_interrupts(struct seq_file *p, int prec)
seq_printf(p, "%*s%u:%s", prec - 1, "IPI", i,
prec >= 4 ? " " : "");
for_each_online_cpu(cpu)
- seq_printf(p, "%10u ", irq_desc_kstat_cpu(ipi_desc[i], cpu));
+ seq_printf(p, "%10u ", irq_desc_kstat_cpu(get_ipi_desc(cpu, i), cpu));
seq_printf(p, " %s\n", ipi_types[i]);
}
@@ -917,9 +909,20 @@ static void __noreturn ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs
#endif
}
+static void arm64_send_ipi(const cpumask_t *mask, unsigned int nr)
+{
+ unsigned int cpu;
+
+ if (!percpu_ipi_descs)
+ __ipi_send_mask(get_ipi_desc(0, nr), mask);
+ else
+ for_each_cpu(cpu, mask)
+ __ipi_send_single(get_ipi_desc(cpu, nr), cpu);
+}
+
static void arm64_backtrace_ipi(cpumask_t *mask)
{
- __ipi_send_mask(ipi_desc[IPI_CPU_BACKTRACE], mask);
+ arm64_send_ipi(mask, IPI_CPU_BACKTRACE);
}
void arch_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu)
@@ -944,7 +947,7 @@ void kgdb_roundup_cpus(void)
if (cpu == this_cpu)
continue;
- __ipi_send_single(ipi_desc[IPI_KGDB_ROUNDUP], cpu);
+ __ipi_send_single(get_ipi_desc(cpu, IPI_KGDB_ROUNDUP), cpu);
}
}
#endif
@@ -1013,14 +1016,16 @@ static void do_handle_IPI(int ipinr)
static irqreturn_t ipi_handler(int irq, void *data)
{
- do_handle_IPI(irq - ipi_irq_base);
+ unsigned int ipi = (irq - ipi_irq_base) % nr_ipi;
+
+ do_handle_IPI(ipi);
return IRQ_HANDLED;
}
static void smp_cross_call(const struct cpumask *target, unsigned int ipinr)
{
trace_ipi_raise(target, ipi_types[ipinr]);
- __ipi_send_mask(ipi_desc[ipinr], target);
+ arm64_send_ipi(target, ipinr);
}
static bool ipi_should_be_nmi(enum ipi_msg_type ipi)
@@ -1046,11 +1051,15 @@ static void ipi_setup(int cpu)
return;
for (i = 0; i < nr_ipi; i++) {
- if (ipi_should_be_nmi(i)) {
- prepare_percpu_nmi(ipi_irq_base + i);
- enable_percpu_nmi(ipi_irq_base + i, 0);
+ if (!percpu_ipi_descs) {
+ if (ipi_should_be_nmi(i)) {
+ prepare_percpu_nmi(ipi_irq_base + i);
+ enable_percpu_nmi(ipi_irq_base + i, 0);
+ } else {
+ enable_percpu_irq(ipi_irq_base + i, 0);
+ }
} else {
- enable_percpu_irq(ipi_irq_base + i, 0);
+ enable_irq(irq_desc_get_irq(get_ipi_desc(cpu, i)));
}
}
}
@@ -1064,44 +1073,77 @@ static void ipi_teardown(int cpu)
return;
for (i = 0; i < nr_ipi; i++) {
- if (ipi_should_be_nmi(i)) {
- disable_percpu_nmi(ipi_irq_base + i);
- teardown_percpu_nmi(ipi_irq_base + i);
+ if (!percpu_ipi_descs) {
+ if (ipi_should_be_nmi(i)) {
+ disable_percpu_nmi(ipi_irq_base + i);
+ teardown_percpu_nmi(ipi_irq_base + i);
+ } else {
+ disable_percpu_irq(ipi_irq_base + i);
+ }
} else {
- disable_percpu_irq(ipi_irq_base + i);
+ disable_irq(irq_desc_get_irq(get_ipi_desc(cpu, i)));
}
}
}
#endif
-void __init set_smp_ipi_range(int ipi_base, int n)
+static void ipi_setup_sgi(int ipi)
{
- int i;
+ int err, irq, cpu;
- WARN_ON(n < MAX_IPI);
- nr_ipi = min(n, MAX_IPI);
+ irq = ipi_irq_base + ipi;
- for (i = 0; i < nr_ipi; i++) {
- int err;
+ if (ipi_should_be_nmi(ipi)) {
+ err = request_percpu_nmi(irq, ipi_handler, "IPI", &irq_stat);
+ WARN(err, "Could not request IRQ %d as NMI, err=%d\n", irq, err);
+ } else {
+ err = request_percpu_irq(irq, ipi_handler, "IPI", &irq_stat);
+ WARN(err, "Could not request IRQ %d as IRQ, err=%d\n", irq, err);
+ }
- if (ipi_should_be_nmi(i)) {
- err = request_percpu_nmi(ipi_base + i, ipi_handler,
- "IPI", &irq_stat);
- WARN(err, "Could not request IPI %d as NMI, err=%d\n",
- i, err);
- } else {
- err = request_percpu_irq(ipi_base + i, ipi_handler,
- "IPI", &irq_stat);
- WARN(err, "Could not request IPI %d as IRQ, err=%d\n",
- i, err);
- }
+ for_each_possible_cpu(cpu)
+ get_ipi_desc(cpu, ipi) = irq_to_desc(irq);
+
+ irq_set_status_flags(irq, IRQ_HIDDEN);
+}
+
+static void ipi_setup_lpi(int ipi, int ncpus)
+{
+ for (int cpu = 0; cpu < ncpus; cpu++) {
+ int err, irq;
+
+ irq = ipi_irq_base + (cpu * nr_ipi) + ipi;
+
+ err = irq_force_affinity(irq, cpumask_of(cpu));
+ WARN(err, "Could not force affinity IRQ %d, err=%d\n", irq, err);
+
+ err = request_irq(irq, ipi_handler, IRQF_NO_AUTOEN, "IPI",
+ NULL);
+ WARN(err, "Could not request IRQ %d, err=%d\n", irq, err);
+
+ irq_set_status_flags(irq, (IRQ_HIDDEN | IRQ_NO_BALANCING_MASK));
- ipi_desc[i] = irq_to_desc(ipi_base + i);
- irq_set_status_flags(ipi_base + i, IRQ_HIDDEN);
+ get_ipi_desc(cpu, ipi) = irq_to_desc(irq);
}
+}
+
+void __init set_smp_ipi_range_percpu(int ipi_base, int n, int ncpus)
+{
+ int i;
+
+ WARN_ON(n < MAX_IPI);
+ nr_ipi = min(n, MAX_IPI);
+ percpu_ipi_descs = !!ncpus;
ipi_irq_base = ipi_base;
+ for (i = 0; i < nr_ipi; i++) {
+ if (!percpu_ipi_descs)
+ ipi_setup_sgi(i);
+ else
+ ipi_setup_lpi(i, ncpus);
+ }
+
/* Setup the boot CPU immediately */
ipi_setup(smp_processor_id());
}
diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index 1d9d51d7627f..3ebcf8c53fb0 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -152,6 +152,8 @@ kunwind_recover_return_address(struct kunwind_state *state)
orig_pc = kretprobe_find_ret_addr(state->task,
(void *)state->common.fp,
&state->kr_cur);
+ if (!orig_pc)
+ return -EINVAL;
state->common.pc = orig_pc;
state->flags.kretprobe = 1;
}
@@ -277,21 +279,24 @@ kunwind_next(struct kunwind_state *state)
typedef bool (*kunwind_consume_fn)(const struct kunwind_state *state, void *cookie);
-static __always_inline void
+static __always_inline int
do_kunwind(struct kunwind_state *state, kunwind_consume_fn consume_state,
void *cookie)
{
- if (kunwind_recover_return_address(state))
- return;
+ int ret;
- while (1) {
- int ret;
+ ret = kunwind_recover_return_address(state);
+ if (ret)
+ return ret;
+ while (1) {
if (!consume_state(state, cookie))
- break;
+ return -EINVAL;
ret = kunwind_next(state);
+ if (ret == -ENOENT)
+ return 0;
if (ret < 0)
- break;
+ return ret;
}
}
@@ -324,7 +329,7 @@ do_kunwind(struct kunwind_state *state, kunwind_consume_fn consume_state,
: stackinfo_get_unknown(); \
})
-static __always_inline void
+static __always_inline int
kunwind_stack_walk(kunwind_consume_fn consume_state,
void *cookie, struct task_struct *task,
struct pt_regs *regs)
@@ -332,10 +337,8 @@ kunwind_stack_walk(kunwind_consume_fn consume_state,
struct stack_info stacks[] = {
stackinfo_get_task(task),
STACKINFO_CPU(irq),
-#if defined(CONFIG_VMAP_STACK)
STACKINFO_CPU(overflow),
-#endif
-#if defined(CONFIG_VMAP_STACK) && defined(CONFIG_ARM_SDE_INTERFACE)
+#if defined(CONFIG_ARM_SDE_INTERFACE)
STACKINFO_SDEI(normal),
STACKINFO_SDEI(critical),
#endif
@@ -352,7 +355,7 @@ kunwind_stack_walk(kunwind_consume_fn consume_state,
if (regs) {
if (task != current)
- return;
+ return -EINVAL;
kunwind_init_from_regs(&state, regs);
} else if (task == current) {
kunwind_init_from_caller(&state);
@@ -360,7 +363,7 @@ kunwind_stack_walk(kunwind_consume_fn consume_state,
kunwind_init_from_task(&state, task);
}
- do_kunwind(&state, consume_state, cookie);
+ return do_kunwind(&state, consume_state, cookie);
}
struct kunwind_consume_entry_data {
@@ -387,6 +390,36 @@ noinline noinstr void arch_stack_walk(stack_trace_consume_fn consume_entry,
kunwind_stack_walk(arch_kunwind_consume_entry, &data, task, regs);
}
+static __always_inline bool
+arch_reliable_kunwind_consume_entry(const struct kunwind_state *state, void *cookie)
+{
+ /*
+ * At an exception boundary we can reliably consume the saved PC. We do
+ * not know whether the LR was live when the exception was taken, and
+ * so we cannot perform the next unwind step reliably.
+ *
+ * All that matters is whether the *entire* unwind is reliable, so give
+ * up as soon as we hit an exception boundary.
+ */
+ if (state->source == KUNWIND_SOURCE_REGS_PC)
+ return false;
+
+ return arch_kunwind_consume_entry(state, cookie);
+}
+
+noinline noinstr int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry,
+ void *cookie,
+ struct task_struct *task)
+{
+ struct kunwind_consume_entry_data data = {
+ .consume_entry = consume_entry,
+ .cookie = cookie,
+ };
+
+ return kunwind_stack_walk(arch_reliable_kunwind_consume_entry, &data,
+ task, NULL);
+}
+
struct bpf_unwind_consume_entry_data {
bool (*consume_entry)(void *cookie, u64 ip, u64 sp, u64 fp);
void *cookie;
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 9bfa5c944379..f528b6041f6a 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -454,7 +454,7 @@ void do_el0_undef(struct pt_regs *regs, unsigned long esr)
u32 insn;
/* check for AArch32 breakpoint instructions */
- if (!aarch32_break_handler(regs))
+ if (try_handle_aarch32_break(regs))
return;
if (user_insn_read(regs, &insn))
@@ -894,8 +894,6 @@ void bad_el0_sync(struct pt_regs *regs, int reason, unsigned long esr)
"Bad EL0 synchronous exception");
}
-#ifdef CONFIG_VMAP_STACK
-
DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack)
__aligned(16);
@@ -927,10 +925,10 @@ void __noreturn panic_bad_stack(struct pt_regs *regs, unsigned long esr, unsigne
nmi_panic(NULL, "kernel stack overflow");
cpu_park_loop();
}
-#endif
void __noreturn arm64_serror_panic(struct pt_regs *regs, unsigned long esr)
{
+ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_STILL_OK);
console_verbose();
pr_crit("SError Interrupt on CPU%d, code 0x%016lx -- %s\n",
@@ -987,7 +985,7 @@ void do_serror(struct pt_regs *regs, unsigned long esr)
int is_valid_bugaddr(unsigned long addr)
{
/*
- * bug_handler() only called for BRK #BUG_BRK_IMM.
+ * bug_brk_handler() only called for BRK #BUG_BRK_IMM.
* So the answer is trivial -- any spurious instances with no
* bug table entry will be rejected by report_bug() and passed
* back to the debug-monitors code and handled as a fatal
@@ -997,7 +995,7 @@ int is_valid_bugaddr(unsigned long addr)
}
#endif
-static int bug_handler(struct pt_regs *regs, unsigned long esr)
+int bug_brk_handler(struct pt_regs *regs, unsigned long esr)
{
switch (report_bug(regs->pc, regs)) {
case BUG_TRAP_TYPE_BUG:
@@ -1017,13 +1015,8 @@ static int bug_handler(struct pt_regs *regs, unsigned long esr)
return DBG_HOOK_HANDLED;
}
-static struct break_hook bug_break_hook = {
- .fn = bug_handler,
- .imm = BUG_BRK_IMM,
-};
-
#ifdef CONFIG_CFI_CLANG
-static int cfi_handler(struct pt_regs *regs, unsigned long esr)
+int cfi_brk_handler(struct pt_regs *regs, unsigned long esr)
{
unsigned long target;
u32 type;
@@ -1046,15 +1039,9 @@ static int cfi_handler(struct pt_regs *regs, unsigned long esr)
arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE);
return DBG_HOOK_HANDLED;
}
-
-static struct break_hook cfi_break_hook = {
- .fn = cfi_handler,
- .imm = CFI_BRK_IMM_BASE,
- .mask = CFI_BRK_IMM_MASK,
-};
#endif /* CONFIG_CFI_CLANG */
-static int reserved_fault_handler(struct pt_regs *regs, unsigned long esr)
+int reserved_fault_brk_handler(struct pt_regs *regs, unsigned long esr)
{
pr_err("%s generated an invalid instruction at %pS!\n",
"Kernel text patching",
@@ -1064,11 +1051,6 @@ static int reserved_fault_handler(struct pt_regs *regs, unsigned long esr)
return DBG_HOOK_ERROR;
}
-static struct break_hook fault_break_hook = {
- .fn = reserved_fault_handler,
- .imm = FAULT_BRK_IMM,
-};
-
#ifdef CONFIG_KASAN_SW_TAGS
#define KASAN_ESR_RECOVER 0x20
@@ -1076,7 +1058,7 @@ static struct break_hook fault_break_hook = {
#define KASAN_ESR_SIZE_MASK 0x0f
#define KASAN_ESR_SIZE(esr) (1 << ((esr) & KASAN_ESR_SIZE_MASK))
-static int kasan_handler(struct pt_regs *regs, unsigned long esr)
+int kasan_brk_handler(struct pt_regs *regs, unsigned long esr)
{
bool recover = esr & KASAN_ESR_RECOVER;
bool write = esr & KASAN_ESR_WRITE;
@@ -1107,62 +1089,12 @@ static int kasan_handler(struct pt_regs *regs, unsigned long esr)
arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE);
return DBG_HOOK_HANDLED;
}
-
-static struct break_hook kasan_break_hook = {
- .fn = kasan_handler,
- .imm = KASAN_BRK_IMM,
- .mask = KASAN_BRK_MASK,
-};
#endif
#ifdef CONFIG_UBSAN_TRAP
-static int ubsan_handler(struct pt_regs *regs, unsigned long esr)
+int ubsan_brk_handler(struct pt_regs *regs, unsigned long esr)
{
die(report_ubsan_failure(esr & UBSAN_BRK_MASK), regs, esr);
return DBG_HOOK_HANDLED;
}
-
-static struct break_hook ubsan_break_hook = {
- .fn = ubsan_handler,
- .imm = UBSAN_BRK_IMM,
- .mask = UBSAN_BRK_MASK,
-};
-#endif
-
-/*
- * Initial handler for AArch64 BRK exceptions
- * This handler only used until debug_traps_init().
- */
-int __init early_brk64(unsigned long addr, unsigned long esr,
- struct pt_regs *regs)
-{
-#ifdef CONFIG_CFI_CLANG
- if (esr_is_cfi_brk(esr))
- return cfi_handler(regs, esr) != DBG_HOOK_HANDLED;
-#endif
-#ifdef CONFIG_KASAN_SW_TAGS
- if ((esr_brk_comment(esr) & ~KASAN_BRK_MASK) == KASAN_BRK_IMM)
- return kasan_handler(regs, esr) != DBG_HOOK_HANDLED;
-#endif
-#ifdef CONFIG_UBSAN_TRAP
- if (esr_is_ubsan_brk(esr))
- return ubsan_handler(regs, esr) != DBG_HOOK_HANDLED;
-#endif
- return bug_handler(regs, esr) != DBG_HOOK_HANDLED;
-}
-
-void __init trap_init(void)
-{
- register_kernel_break_hook(&bug_break_hook);
-#ifdef CONFIG_CFI_CLANG
- register_kernel_break_hook(&cfi_break_hook);
-#endif
- register_kernel_break_hook(&fault_break_hook);
-#ifdef CONFIG_KASAN_SW_TAGS
- register_kernel_break_hook(&kasan_break_hook);
-#endif
-#ifdef CONFIG_UBSAN_TRAP
- register_kernel_break_hook(&ubsan_break_hook);
#endif
- debug_traps_init();
-}
diff --git a/arch/arm64/kernel/watchdog_hld.c b/arch/arm64/kernel/watchdog_hld.c
index dcd25322127c..3093037dcb7b 100644
--- a/arch/arm64/kernel/watchdog_hld.c
+++ b/arch/arm64/kernel/watchdog_hld.c
@@ -34,3 +34,61 @@ bool __init arch_perf_nmi_is_available(void)
*/
return arm_pmu_irq_is_nmi();
}
+
+static int watchdog_perf_update_period(void *data)
+{
+ int cpu = smp_processor_id();
+ u64 max_cpu_freq, new_period;
+
+ max_cpu_freq = cpufreq_get_hw_max_freq(cpu) * 1000UL;
+ if (!max_cpu_freq)
+ return 0;
+
+ new_period = watchdog_thresh * max_cpu_freq;
+ hardlockup_detector_perf_adjust_period(new_period);
+
+ return 0;
+}
+
+static int watchdog_freq_notifier_callback(struct notifier_block *nb,
+ unsigned long val, void *data)
+{
+ struct cpufreq_policy *policy = data;
+ int cpu;
+
+ if (val != CPUFREQ_CREATE_POLICY)
+ return NOTIFY_DONE;
+
+ /*
+ * Let each online CPU related to the policy update the period by their
+ * own. This will serialize with the framework on start/stop the lockup
+ * detector (softlockup_{start,stop}_all) and avoid potential race
+ * condition. Otherwise we may have below theoretical race condition:
+ * (core 0/1 share the same policy)
+ * [core 0] [core 1]
+ * hardlockup_detector_event_create()
+ * hw_nmi_get_sample_period()
+ * (cpufreq registered, notifier callback invoked)
+ * watchdog_freq_notifier_callback()
+ * watchdog_perf_update_period()
+ * (since core 1's event's not yet created,
+ * the period is not set)
+ * perf_event_create_kernel_counter()
+ * (event's period is SAFE_MAX_CPU_FREQ)
+ */
+ for_each_cpu(cpu, policy->cpus)
+ smp_call_on_cpu(cpu, watchdog_perf_update_period, NULL, false);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block watchdog_freq_notifier = {
+ .notifier_call = watchdog_freq_notifier_callback,
+};
+
+static int __init init_watchdog_freq_notifier(void)
+{
+ return cpufreq_register_notifier(&watchdog_freq_notifier,
+ CPUFREQ_POLICY_NOTIFIER);
+}
+core_initcall(init_watchdog_freq_notifier);
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 7c329e01c557..3ebc0570345c 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -23,7 +23,8 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \
vgic/vgic-v3.o vgic/vgic-v4.o \
vgic/vgic-mmio.o vgic/vgic-mmio-v2.o \
vgic/vgic-mmio-v3.o vgic/vgic-kvm-device.o \
- vgic/vgic-its.o vgic/vgic-debug.o vgic/vgic-v3-nested.o
+ vgic/vgic-its.o vgic/vgic-debug.o vgic/vgic-v3-nested.o \
+ vgic/vgic-v5.o
kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o pmu.o
kvm-$(CONFIG_ARM64_PTR_AUTH) += pauth.o
diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index 701ea10a63f1..dbd74e4885e2 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -830,7 +830,7 @@ static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map)
* by the guest (either FEAT_VHE or FEAT_E2H0 is implemented, but
* not both). This simplifies the handling of the EL1NV* bits.
*/
- if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) {
+ if (is_nested_ctxt(vcpu)) {
u64 val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2);
/* Use the VHE format for mental sanity */
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 23dd3f3fc3eb..888f7c7abf54 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -408,6 +408,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES:
r = BIT(0);
break;
+ case KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED:
+ if (!kvm)
+ r = -EINVAL;
+ else
+ r = kvm_supports_cacheable_pfnmap();
+ break;
+
default:
r = 0;
}
@@ -521,7 +528,7 @@ static void vcpu_set_pauth_traps(struct kvm_vcpu *vcpu)
* Either we're running an L2 guest, and the API/APK bits come
* from L1's HCR_EL2, or API/APK are both set.
*/
- if (unlikely(vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu))) {
+ if (unlikely(is_nested_ctxt(vcpu))) {
u64 val;
val = __vcpu_sys_reg(vcpu, HCR_EL2);
@@ -740,7 +747,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
*/
int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
{
- bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF);
+ bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF | HCR_VSE);
+
return ((irq_lines || kvm_vgic_vcpu_pending_irq(v))
&& !kvm_arm_vcpu_stopped(v) && !v->arch.pause);
}
@@ -1183,6 +1191,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
*/
preempt_disable();
+ kvm_nested_flush_hwstate(vcpu);
+
if (kvm_vcpu_has_pmu(vcpu))
kvm_pmu_flush_hwstate(vcpu);
@@ -1282,6 +1292,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
/* Exit types that need handling before we can be preempted */
handle_exit_early(vcpu, ret);
+ kvm_nested_sync_hwstate(vcpu);
+
preempt_enable();
/*
@@ -2765,19 +2777,15 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq);
}
-bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
- struct kvm_kernel_irq_routing_entry *new)
+void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
+ struct kvm_kernel_irq_routing_entry *old,
+ struct kvm_kernel_irq_routing_entry *new)
{
- if (old->type != KVM_IRQ_ROUTING_MSI ||
- new->type != KVM_IRQ_ROUTING_MSI)
- return true;
-
- return memcmp(&old->msi, &new->msi, sizeof(new->msi));
-}
+ if (old->type == KVM_IRQ_ROUTING_MSI &&
+ new->type == KVM_IRQ_ROUTING_MSI &&
+ !memcmp(&old->msi, &new->msi, sizeof(new->msi)))
+ return;
-int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set)
-{
/*
* Remapping the vLPI requires taking the its_lock mutex to resolve
* the new translation. We're in spinlock land at this point, so no
@@ -2785,7 +2793,7 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
*
* Unmap the vLPI and fall back to software LPI injection.
*/
- return kvm_vgic_v4_unset_forwarding(kvm, host_irq);
+ return kvm_vgic_v4_unset_forwarding(irqfd->kvm, irqfd->producer->irq);
}
void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons)
diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
index a25be111cd8f..0e5610533949 100644
--- a/arch/arm64/kvm/at.c
+++ b/arch/arm64/kvm/at.c
@@ -1047,34 +1047,51 @@ static void compute_s1_overlay_permissions(struct kvm_vcpu *vcpu,
idx = FIELD_GET(PTE_PO_IDX_MASK, wr->desc);
- switch (wi->regime) {
- case TR_EL10:
- pov_perms = perm_idx(vcpu, POR_EL1, idx);
- uov_perms = perm_idx(vcpu, POR_EL0, idx);
- break;
- case TR_EL20:
- pov_perms = perm_idx(vcpu, POR_EL2, idx);
- uov_perms = perm_idx(vcpu, POR_EL0, idx);
- break;
- case TR_EL2:
- pov_perms = perm_idx(vcpu, POR_EL2, idx);
- uov_perms = 0;
- break;
- }
+ if (wr->pov) {
+ switch (wi->regime) {
+ case TR_EL10:
+ pov_perms = perm_idx(vcpu, POR_EL1, idx);
+ break;
+ case TR_EL20:
+ pov_perms = perm_idx(vcpu, POR_EL2, idx);
+ break;
+ case TR_EL2:
+ pov_perms = perm_idx(vcpu, POR_EL2, idx);
+ break;
+ }
+
+ if (pov_perms & ~POE_RWX)
+ pov_perms = POE_NONE;
- if (pov_perms & ~POE_RWX)
- pov_perms = POE_NONE;
+ /* R_QXXPC, S1PrivOverflow enabled */
+ if (wr->pwxn && (pov_perms & POE_X))
+ pov_perms &= ~POE_W;
- if (wi->poe && wr->pov) {
wr->pr &= pov_perms & POE_R;
wr->pw &= pov_perms & POE_W;
wr->px &= pov_perms & POE_X;
}
- if (uov_perms & ~POE_RWX)
- uov_perms = POE_NONE;
+ if (wr->uov) {
+ switch (wi->regime) {
+ case TR_EL10:
+ uov_perms = perm_idx(vcpu, POR_EL0, idx);
+ break;
+ case TR_EL20:
+ uov_perms = perm_idx(vcpu, POR_EL0, idx);
+ break;
+ case TR_EL2:
+ uov_perms = 0;
+ break;
+ }
+
+ if (uov_perms & ~POE_RWX)
+ uov_perms = POE_NONE;
+
+ /* R_NPBXC, S1UnprivOverlay enabled */
+ if (wr->uwxn && (uov_perms & POE_X))
+ uov_perms &= ~POE_W;
- if (wi->e0poe && wr->uov) {
wr->ur &= uov_perms & POE_R;
wr->uw &= uov_perms & POE_W;
wr->ux &= uov_perms & POE_X;
@@ -1095,24 +1112,15 @@ static void compute_s1_permissions(struct kvm_vcpu *vcpu,
if (!wi->hpd)
compute_s1_hierarchical_permissions(vcpu, wi, wr);
- if (wi->poe || wi->e0poe)
- compute_s1_overlay_permissions(vcpu, wi, wr);
+ compute_s1_overlay_permissions(vcpu, wi, wr);
- /* R_QXXPC */
- if (wr->pwxn) {
- if (!wr->pov && wr->pw)
- wr->px = false;
- if (wr->pov && wr->px)
- wr->pw = false;
- }
+ /* R_QXXPC, S1PrivOverlay disabled */
+ if (!wr->pov)
+ wr->px &= !(wr->pwxn && wr->pw);
- /* R_NPBXC */
- if (wr->uwxn) {
- if (!wr->uov && wr->uw)
- wr->ux = false;
- if (wr->uov && wr->ux)
- wr->uw = false;
- }
+ /* R_NPBXC, S1UnprivOverlay disabled */
+ if (!wr->uov)
+ wr->ux &= !(wr->uwxn && wr->uw);
pan = wi->pan && (wr->ur || wr->uw ||
(pan3_enabled(vcpu, wi->regime) && wr->ux));
diff --git a/arch/arm64/kvm/config.c b/arch/arm64/kvm/config.c
index 54911a93b001..da66c4a14775 100644
--- a/arch/arm64/kvm/config.c
+++ b/arch/arm64/kvm/config.c
@@ -66,7 +66,6 @@ struct reg_bits_to_feat_map {
#define FEAT_BRBE ID_AA64DFR0_EL1, BRBE, IMP
#define FEAT_TRC_SR ID_AA64DFR0_EL1, TraceVer, IMP
#define FEAT_PMUv3 ID_AA64DFR0_EL1, PMUVer, IMP
-#define FEAT_PMUv3p9 ID_AA64DFR0_EL1, PMUVer, V3P9
#define FEAT_TRBE ID_AA64DFR0_EL1, TraceBuffer, IMP
#define FEAT_TRBEv1p1 ID_AA64DFR0_EL1, TraceBuffer, TRBE_V1P1
#define FEAT_DoubleLock ID_AA64DFR0_EL1, DoubleLock, IMP
@@ -89,6 +88,7 @@ struct reg_bits_to_feat_map {
#define FEAT_RASv2 ID_AA64PFR0_EL1, RAS, V2
#define FEAT_GICv3 ID_AA64PFR0_EL1, GIC, IMP
#define FEAT_LOR ID_AA64MMFR1_EL1, LO, IMP
+#define FEAT_SPEv1p2 ID_AA64DFR0_EL1, PMSVer, V1P2
#define FEAT_SPEv1p4 ID_AA64DFR0_EL1, PMSVer, V1P4
#define FEAT_SPEv1p5 ID_AA64DFR0_EL1, PMSVer, V1P5
#define FEAT_ATS1A ID_AA64ISAR2_EL1, ATS1A, IMP
@@ -131,6 +131,27 @@ struct reg_bits_to_feat_map {
#define FEAT_SPMU ID_AA64DFR1_EL1, SPMU, IMP
#define FEAT_SPE_nVM ID_AA64DFR2_EL1, SPE_nVM, IMP
#define FEAT_STEP2 ID_AA64DFR2_EL1, STEP, IMP
+#define FEAT_SYSREG128 ID_AA64ISAR2_EL1, SYSREG_128, IMP
+#define FEAT_CPA2 ID_AA64ISAR3_EL1, CPA, CPA2
+#define FEAT_ASID2 ID_AA64MMFR4_EL1, ASID2, IMP
+#define FEAT_MEC ID_AA64MMFR3_EL1, MEC, IMP
+#define FEAT_HAFT ID_AA64MMFR1_EL1, HAFDBS, HAFT
+#define FEAT_BTI ID_AA64PFR1_EL1, BT, IMP
+#define FEAT_ExS ID_AA64MMFR0_EL1, EXS, IMP
+#define FEAT_IESB ID_AA64MMFR2_EL1, IESB, IMP
+#define FEAT_LSE2 ID_AA64MMFR2_EL1, AT, IMP
+#define FEAT_LSMAOC ID_AA64MMFR2_EL1, LSM, IMP
+#define FEAT_MixedEnd ID_AA64MMFR0_EL1, BIGEND, IMP
+#define FEAT_MixedEndEL0 ID_AA64MMFR0_EL1, BIGENDEL0, IMP
+#define FEAT_MTE2 ID_AA64PFR1_EL1, MTE, MTE2
+#define FEAT_MTE_ASYNC ID_AA64PFR1_EL1, MTE_frac, ASYNC
+#define FEAT_MTE_STORE_ONLY ID_AA64PFR2_EL1, MTESTOREONLY, IMP
+#define FEAT_PAN ID_AA64MMFR1_EL1, PAN, IMP
+#define FEAT_PAN3 ID_AA64MMFR1_EL1, PAN, PAN3
+#define FEAT_SSBS ID_AA64PFR1_EL1, SSBS, IMP
+#define FEAT_TIDCP1 ID_AA64MMFR1_EL1, TIDCP1, IMP
+#define FEAT_FGT ID_AA64MMFR0_EL1, FGT, IMP
+#define FEAT_MTPMU ID_AA64DFR0_EL1, MTPMU, IMP
static bool not_feat_aa64el3(struct kvm *kvm)
{
@@ -218,11 +239,62 @@ static bool feat_trbe_mpam(struct kvm *kvm)
(read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_EL1_MPAM));
}
+static bool feat_asid2_e2h1(struct kvm *kvm)
+{
+ return kvm_has_feat(kvm, FEAT_ASID2) && !kvm_has_feat(kvm, FEAT_E2H0);
+}
+
+static bool feat_d128_e2h1(struct kvm *kvm)
+{
+ return kvm_has_feat(kvm, FEAT_D128) && !kvm_has_feat(kvm, FEAT_E2H0);
+}
+
+static bool feat_mec_e2h1(struct kvm *kvm)
+{
+ return kvm_has_feat(kvm, FEAT_MEC) && !kvm_has_feat(kvm, FEAT_E2H0);
+}
+
static bool feat_ebep_pmuv3_ss(struct kvm *kvm)
{
return kvm_has_feat(kvm, FEAT_EBEP) || kvm_has_feat(kvm, FEAT_PMUv3_SS);
}
+static bool feat_mixedendel0(struct kvm *kvm)
+{
+ return kvm_has_feat(kvm, FEAT_MixedEnd) || kvm_has_feat(kvm, FEAT_MixedEndEL0);
+}
+
+static bool feat_mte_async(struct kvm *kvm)
+{
+ return kvm_has_feat(kvm, FEAT_MTE2) && kvm_has_feat_enum(kvm, FEAT_MTE_ASYNC);
+}
+
+#define check_pmu_revision(k, r) \
+ ({ \
+ (kvm_has_feat((k), ID_AA64DFR0_EL1, PMUVer, r) && \
+ !kvm_has_feat((k), ID_AA64DFR0_EL1, PMUVer, IMP_DEF)); \
+ })
+
+static bool feat_pmuv3p1(struct kvm *kvm)
+{
+ return check_pmu_revision(kvm, V3P1);
+}
+
+static bool feat_pmuv3p5(struct kvm *kvm)
+{
+ return check_pmu_revision(kvm, V3P5);
+}
+
+static bool feat_pmuv3p7(struct kvm *kvm)
+{
+ return check_pmu_revision(kvm, V3P7);
+}
+
+static bool feat_pmuv3p9(struct kvm *kvm)
+{
+ return check_pmu_revision(kvm, V3P9);
+}
+
static bool compute_hcr_rw(struct kvm *kvm, u64 *bits)
{
/* This is purely academic: AArch32 and NV are mutually exclusive */
@@ -681,7 +753,7 @@ static const struct reg_bits_to_feat_map hdfgrtr2_feat_map[] = {
NEEDS_FEAT(HDFGRTR2_EL2_nPMICFILTR_EL0 |
HDFGRTR2_EL2_nPMICNTR_EL0,
FEAT_PMUv3_ICNTR),
- NEEDS_FEAT(HDFGRTR2_EL2_nPMUACR_EL1, FEAT_PMUv3p9),
+ NEEDS_FEAT(HDFGRTR2_EL2_nPMUACR_EL1, feat_pmuv3p9),
NEEDS_FEAT(HDFGRTR2_EL2_nPMSSCR_EL1 |
HDFGRTR2_EL2_nPMSSDATA,
FEAT_PMUv3_SS),
@@ -713,7 +785,7 @@ static const struct reg_bits_to_feat_map hdfgwtr2_feat_map[] = {
FEAT_PMUv3_ICNTR),
NEEDS_FEAT(HDFGWTR2_EL2_nPMUACR_EL1 |
HDFGWTR2_EL2_nPMZR_EL0,
- FEAT_PMUv3p9),
+ feat_pmuv3p9),
NEEDS_FEAT(HDFGWTR2_EL2_nPMSSCR_EL1, FEAT_PMUv3_SS),
NEEDS_FEAT(HDFGWTR2_EL2_nPMIAR_EL1, FEAT_SEBEP),
NEEDS_FEAT(HDFGWTR2_EL2_nPMSDSFR_EL1, feat_spe_fds),
@@ -832,6 +904,150 @@ static const struct reg_bits_to_feat_map hcr_feat_map[] = {
NEEDS_FEAT_FIXED(HCR_EL2_E2H, compute_hcr_e2h),
};
+static const struct reg_bits_to_feat_map sctlr2_feat_map[] = {
+ NEEDS_FEAT(SCTLR2_EL1_NMEA |
+ SCTLR2_EL1_EASE,
+ FEAT_DoubleFault2),
+ NEEDS_FEAT(SCTLR2_EL1_EnADERR, feat_aderr),
+ NEEDS_FEAT(SCTLR2_EL1_EnANERR, feat_anerr),
+ NEEDS_FEAT(SCTLR2_EL1_EnIDCP128, FEAT_SYSREG128),
+ NEEDS_FEAT(SCTLR2_EL1_EnPACM |
+ SCTLR2_EL1_EnPACM0,
+ feat_pauth_lr),
+ NEEDS_FEAT(SCTLR2_EL1_CPTA |
+ SCTLR2_EL1_CPTA0 |
+ SCTLR2_EL1_CPTM |
+ SCTLR2_EL1_CPTM0,
+ FEAT_CPA2),
+};
+
+static const struct reg_bits_to_feat_map tcr2_el2_feat_map[] = {
+ NEEDS_FEAT(TCR2_EL2_FNG1 |
+ TCR2_EL2_FNG0 |
+ TCR2_EL2_A2,
+ feat_asid2_e2h1),
+ NEEDS_FEAT(TCR2_EL2_DisCH1 |
+ TCR2_EL2_DisCH0 |
+ TCR2_EL2_D128,
+ feat_d128_e2h1),
+ NEEDS_FEAT(TCR2_EL2_AMEC1, feat_mec_e2h1),
+ NEEDS_FEAT(TCR2_EL2_AMEC0, FEAT_MEC),
+ NEEDS_FEAT(TCR2_EL2_HAFT, FEAT_HAFT),
+ NEEDS_FEAT(TCR2_EL2_PTTWI |
+ TCR2_EL2_PnCH,
+ FEAT_THE),
+ NEEDS_FEAT(TCR2_EL2_AIE, FEAT_AIE),
+ NEEDS_FEAT(TCR2_EL2_POE |
+ TCR2_EL2_E0POE,
+ FEAT_S1POE),
+ NEEDS_FEAT(TCR2_EL2_PIE, FEAT_S1PIE),
+};
+
+static const struct reg_bits_to_feat_map sctlr_el1_feat_map[] = {
+ NEEDS_FEAT(SCTLR_EL1_CP15BEN |
+ SCTLR_EL1_ITD |
+ SCTLR_EL1_SED,
+ FEAT_AA32EL0),
+ NEEDS_FEAT(SCTLR_EL1_BT0 |
+ SCTLR_EL1_BT1,
+ FEAT_BTI),
+ NEEDS_FEAT(SCTLR_EL1_CMOW, FEAT_CMOW),
+ NEEDS_FEAT(SCTLR_EL1_TSCXT, feat_csv2_2_csv2_1p2),
+ NEEDS_FEAT(SCTLR_EL1_EIS |
+ SCTLR_EL1_EOS,
+ FEAT_ExS),
+ NEEDS_FEAT(SCTLR_EL1_EnFPM, FEAT_FPMR),
+ NEEDS_FEAT(SCTLR_EL1_IESB, FEAT_IESB),
+ NEEDS_FEAT(SCTLR_EL1_EnALS, FEAT_LS64),
+ NEEDS_FEAT(SCTLR_EL1_EnAS0, FEAT_LS64_ACCDATA),
+ NEEDS_FEAT(SCTLR_EL1_EnASR, FEAT_LS64_V),
+ NEEDS_FEAT(SCTLR_EL1_nAA, FEAT_LSE2),
+ NEEDS_FEAT(SCTLR_EL1_LSMAOE |
+ SCTLR_EL1_nTLSMD,
+ FEAT_LSMAOC),
+ NEEDS_FEAT(SCTLR_EL1_EE, FEAT_MixedEnd),
+ NEEDS_FEAT(SCTLR_EL1_E0E, feat_mixedendel0),
+ NEEDS_FEAT(SCTLR_EL1_MSCEn, FEAT_MOPS),
+ NEEDS_FEAT(SCTLR_EL1_ATA0 |
+ SCTLR_EL1_ATA |
+ SCTLR_EL1_TCF0 |
+ SCTLR_EL1_TCF,
+ FEAT_MTE2),
+ NEEDS_FEAT(SCTLR_EL1_ITFSB, feat_mte_async),
+ NEEDS_FEAT(SCTLR_EL1_TCSO0 |
+ SCTLR_EL1_TCSO,
+ FEAT_MTE_STORE_ONLY),
+ NEEDS_FEAT(SCTLR_EL1_NMI |
+ SCTLR_EL1_SPINTMASK,
+ FEAT_NMI),
+ NEEDS_FEAT(SCTLR_EL1_SPAN, FEAT_PAN),
+ NEEDS_FEAT(SCTLR_EL1_EPAN, FEAT_PAN3),
+ NEEDS_FEAT(SCTLR_EL1_EnDA |
+ SCTLR_EL1_EnDB |
+ SCTLR_EL1_EnIA |
+ SCTLR_EL1_EnIB,
+ feat_pauth),
+ NEEDS_FEAT(SCTLR_EL1_EnTP2, FEAT_SME),
+ NEEDS_FEAT(SCTLR_EL1_EnRCTX, FEAT_SPECRES),
+ NEEDS_FEAT(SCTLR_EL1_DSSBS, FEAT_SSBS),
+ NEEDS_FEAT(SCTLR_EL1_TIDCP, FEAT_TIDCP1),
+ NEEDS_FEAT(SCTLR_EL1_TME0 |
+ SCTLR_EL1_TME |
+ SCTLR_EL1_TMT0 |
+ SCTLR_EL1_TMT,
+ FEAT_TME),
+ NEEDS_FEAT(SCTLR_EL1_TWEDEL |
+ SCTLR_EL1_TWEDEn,
+ FEAT_TWED),
+ NEEDS_FEAT(SCTLR_EL1_UCI |
+ SCTLR_EL1_EE |
+ SCTLR_EL1_E0E |
+ SCTLR_EL1_WXN |
+ SCTLR_EL1_nTWE |
+ SCTLR_EL1_nTWI |
+ SCTLR_EL1_UCT |
+ SCTLR_EL1_DZE |
+ SCTLR_EL1_I |
+ SCTLR_EL1_UMA |
+ SCTLR_EL1_SA0 |
+ SCTLR_EL1_SA |
+ SCTLR_EL1_C |
+ SCTLR_EL1_A |
+ SCTLR_EL1_M,
+ FEAT_AA64EL1),
+};
+
+static const struct reg_bits_to_feat_map mdcr_el2_feat_map[] = {
+ NEEDS_FEAT(MDCR_EL2_EBWE, FEAT_Debugv8p9),
+ NEEDS_FEAT(MDCR_EL2_TDOSA, FEAT_DoubleLock),
+ NEEDS_FEAT(MDCR_EL2_PMEE, FEAT_EBEP),
+ NEEDS_FEAT(MDCR_EL2_TDCC, FEAT_FGT),
+ NEEDS_FEAT(MDCR_EL2_MTPME, FEAT_MTPMU),
+ NEEDS_FEAT(MDCR_EL2_HPME |
+ MDCR_EL2_HPMN |
+ MDCR_EL2_TPMCR |
+ MDCR_EL2_TPM,
+ FEAT_PMUv3),
+ NEEDS_FEAT(MDCR_EL2_HPMD, feat_pmuv3p1),
+ NEEDS_FEAT(MDCR_EL2_HCCD |
+ MDCR_EL2_HLP,
+ feat_pmuv3p5),
+ NEEDS_FEAT(MDCR_EL2_HPMFZO, feat_pmuv3p7),
+ NEEDS_FEAT(MDCR_EL2_PMSSE, FEAT_PMUv3_SS),
+ NEEDS_FEAT(MDCR_EL2_E2PB |
+ MDCR_EL2_TPMS,
+ FEAT_SPE),
+ NEEDS_FEAT(MDCR_EL2_HPMFZS, FEAT_SPEv1p2),
+ NEEDS_FEAT(MDCR_EL2_EnSPM, FEAT_SPMU),
+ NEEDS_FEAT(MDCR_EL2_EnSTEPOP, FEAT_STEP2),
+ NEEDS_FEAT(MDCR_EL2_E2TB, FEAT_TRBE),
+ NEEDS_FEAT(MDCR_EL2_TTRF, FEAT_TRF),
+ NEEDS_FEAT(MDCR_EL2_TDA |
+ MDCR_EL2_TDE |
+ MDCR_EL2_TDRA,
+ FEAT_AA64EL1),
+};
+
static void __init check_feat_map(const struct reg_bits_to_feat_map *map,
int map_size, u64 res0, const char *str)
{
@@ -863,6 +1079,14 @@ void __init check_feature_map(void)
__HCRX_EL2_RES0, "HCRX_EL2");
check_feat_map(hcr_feat_map, ARRAY_SIZE(hcr_feat_map),
HCR_EL2_RES0, "HCR_EL2");
+ check_feat_map(sctlr2_feat_map, ARRAY_SIZE(sctlr2_feat_map),
+ SCTLR2_EL1_RES0, "SCTLR2_EL1");
+ check_feat_map(tcr2_el2_feat_map, ARRAY_SIZE(tcr2_el2_feat_map),
+ TCR2_EL2_RES0, "TCR2_EL2");
+ check_feat_map(sctlr_el1_feat_map, ARRAY_SIZE(sctlr_el1_feat_map),
+ SCTLR_EL1_RES0, "SCTLR_EL1");
+ check_feat_map(mdcr_el2_feat_map, ARRAY_SIZE(mdcr_el2_feat_map),
+ MDCR_EL2_RES0, "MDCR_EL2");
}
static bool idreg_feat_match(struct kvm *kvm, const struct reg_bits_to_feat_map *map)
@@ -1077,6 +1301,31 @@ void get_reg_fixed_bits(struct kvm *kvm, enum vcpu_sysreg reg, u64 *res0, u64 *r
*res0 |= HCR_EL2_RES0 | (mask & ~fixed);
*res1 = HCR_EL2_RES1 | (mask & fixed);
break;
+ case SCTLR2_EL1:
+ case SCTLR2_EL2:
+ *res0 = compute_res0_bits(kvm, sctlr2_feat_map,
+ ARRAY_SIZE(sctlr2_feat_map), 0, 0);
+ *res0 |= SCTLR2_EL1_RES0;
+ *res1 = SCTLR2_EL1_RES1;
+ break;
+ case TCR2_EL2:
+ *res0 = compute_res0_bits(kvm, tcr2_el2_feat_map,
+ ARRAY_SIZE(tcr2_el2_feat_map), 0, 0);
+ *res0 |= TCR2_EL2_RES0;
+ *res1 = TCR2_EL2_RES1;
+ break;
+ case SCTLR_EL1:
+ *res0 = compute_res0_bits(kvm, sctlr_el1_feat_map,
+ ARRAY_SIZE(sctlr_el1_feat_map), 0, 0);
+ *res0 |= SCTLR_EL1_RES0;
+ *res1 = SCTLR_EL1_RES1;
+ break;
+ case MDCR_EL2:
+ *res0 = compute_res0_bits(kvm, mdcr_el2_feat_map,
+ ARRAY_SIZE(mdcr_el2_feat_map), 0, 0);
+ *res0 |= MDCR_EL2_RES0;
+ *res1 = MDCR_EL2_RES1;
+ break;
default:
WARN_ON_ONCE(1);
*res0 = *res1 = 0;
diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
index 1a7dab333f55..381382c19fe4 100644
--- a/arch/arm64/kvm/debug.c
+++ b/arch/arm64/kvm/debug.c
@@ -81,6 +81,10 @@ void kvm_init_host_debug_data(void)
!(read_sysreg_s(SYS_PMBIDR_EL1) & PMBIDR_EL1_P))
host_data_set_flag(HAS_SPE);
+ /* Check if we have BRBE implemented and available at the host */
+ if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_BRBE_SHIFT))
+ host_data_set_flag(HAS_BRBE);
+
if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_TraceFilt_SHIFT)) {
/* Force disable trace in protected mode in case of no TRBE */
if (is_protected_kvm_enabled())
diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c
index 3a384e9660b8..90cb4b7ae0ff 100644
--- a/arch/arm64/kvm/emulate-nested.c
+++ b/arch/arm64/kvm/emulate-nested.c
@@ -88,6 +88,7 @@ enum cgt_group_id {
CGT_HCRX_EnFPM,
CGT_HCRX_TCR2En,
+ CGT_HCRX_SCTLR2En,
CGT_CNTHCTL_EL1TVT,
CGT_CNTHCTL_EL1TVCT,
@@ -108,6 +109,7 @@ enum cgt_group_id {
CGT_HCR_TTLB_TTLBOS,
CGT_HCR_TVM_TRVM,
CGT_HCR_TVM_TRVM_HCRX_TCR2En,
+ CGT_HCR_TVM_TRVM_HCRX_SCTLR2En,
CGT_HCR_TPU_TICAB,
CGT_HCR_TPU_TOCU,
CGT_HCR_NV1_nNV2_ENSCXT,
@@ -398,6 +400,12 @@ static const struct trap_bits coarse_trap_bits[] = {
.mask = HCRX_EL2_TCR2En,
.behaviour = BEHAVE_FORWARD_RW,
},
+ [CGT_HCRX_SCTLR2En] = {
+ .index = HCRX_EL2,
+ .value = 0,
+ .mask = HCRX_EL2_SCTLR2En,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
[CGT_CNTHCTL_EL1TVT] = {
.index = CNTHCTL_EL2,
.value = CNTHCTL_EL1TVT,
@@ -449,6 +457,8 @@ static const enum cgt_group_id *coarse_control_combo[] = {
MCB(CGT_HCR_TVM_TRVM, CGT_HCR_TVM, CGT_HCR_TRVM),
MCB(CGT_HCR_TVM_TRVM_HCRX_TCR2En,
CGT_HCR_TVM, CGT_HCR_TRVM, CGT_HCRX_TCR2En),
+ MCB(CGT_HCR_TVM_TRVM_HCRX_SCTLR2En,
+ CGT_HCR_TVM, CGT_HCR_TRVM, CGT_HCRX_SCTLR2En),
MCB(CGT_HCR_TPU_TICAB, CGT_HCR_TPU, CGT_HCR_TICAB),
MCB(CGT_HCR_TPU_TOCU, CGT_HCR_TPU, CGT_HCR_TOCU),
MCB(CGT_HCR_NV1_nNV2_ENSCXT, CGT_HCR_NV1_nNV2, CGT_HCR_ENSCXT),
@@ -782,6 +792,7 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = {
SR_TRAP(OP_TLBI_RVALE1OSNXS, CGT_HCR_TTLB_TTLBOS),
SR_TRAP(OP_TLBI_RVAALE1OSNXS, CGT_HCR_TTLB_TTLBOS),
SR_TRAP(SYS_SCTLR_EL1, CGT_HCR_TVM_TRVM),
+ SR_TRAP(SYS_SCTLR2_EL1, CGT_HCR_TVM_TRVM_HCRX_SCTLR2En),
SR_TRAP(SYS_TTBR0_EL1, CGT_HCR_TVM_TRVM),
SR_TRAP(SYS_TTBR1_EL1, CGT_HCR_TVM_TRVM),
SR_TRAP(SYS_TCR_EL1, CGT_HCR_TVM_TRVM),
@@ -1354,6 +1365,7 @@ static const struct encoding_to_trap_config encoding_to_fgt[] __initconst = {
SR_FGT(SYS_SCXTNUM_EL0, HFGRTR, SCXTNUM_EL0, 1),
SR_FGT(SYS_SCXTNUM_EL1, HFGRTR, SCXTNUM_EL1, 1),
SR_FGT(SYS_SCTLR_EL1, HFGRTR, SCTLR_EL1, 1),
+ SR_FGT(SYS_SCTLR2_EL1, HFGRTR, SCTLR_EL1, 1),
SR_FGT(SYS_REVIDR_EL1, HFGRTR, REVIDR_EL1, 1),
SR_FGT(SYS_PAR_EL1, HFGRTR, PAR_EL1, 1),
SR_FGT(SYS_MPIDR_EL1, HFGRTR, MPIDR_EL1, 1),
@@ -2592,13 +2604,8 @@ inject:
static bool __forward_traps(struct kvm_vcpu *vcpu, unsigned int reg, u64 control_bit)
{
- bool control_bit_set;
-
- if (!vcpu_has_nv(vcpu))
- return false;
-
- control_bit_set = __vcpu_sys_reg(vcpu, reg) & control_bit;
- if (!is_hyp_ctxt(vcpu) && control_bit_set) {
+ if (is_nested_ctxt(vcpu) &&
+ (__vcpu_sys_reg(vcpu, reg) & control_bit)) {
kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));
return true;
}
@@ -2719,6 +2726,9 @@ static void kvm_inject_el2_exception(struct kvm_vcpu *vcpu, u64 esr_el2,
case except_type_irq:
kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_IRQ);
break;
+ case except_type_serror:
+ kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SERR);
+ break;
default:
WARN_ONCE(1, "Unsupported EL2 exception injection %d\n", type);
}
@@ -2816,3 +2826,28 @@ int kvm_inject_nested_irq(struct kvm_vcpu *vcpu)
/* esr_el2 value doesn't matter for exits due to irqs. */
return kvm_inject_nested(vcpu, 0, except_type_irq);
}
+
+int kvm_inject_nested_sea(struct kvm_vcpu *vcpu, bool iabt, u64 addr)
+{
+ u64 esr = FIELD_PREP(ESR_ELx_EC_MASK,
+ iabt ? ESR_ELx_EC_IABT_LOW : ESR_ELx_EC_DABT_LOW);
+ esr |= ESR_ELx_FSC_EXTABT | ESR_ELx_IL;
+
+ vcpu_write_sys_reg(vcpu, FAR_EL2, addr);
+
+ if (__vcpu_sys_reg(vcpu, SCTLR2_EL2) & SCTLR2_EL1_EASE)
+ return kvm_inject_nested(vcpu, esr, except_type_serror);
+
+ return kvm_inject_nested_sync(vcpu, esr);
+}
+
+int kvm_inject_nested_serror(struct kvm_vcpu *vcpu, u64 esr)
+{
+ /*
+ * Hardware sets up the EC field when propagating ESR as a result of
+ * vSError injection. Manually populate EC for an emulated SError
+ * exception.
+ */
+ esr |= FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_SERROR);
+ return kvm_inject_nested(vcpu, esr, except_type_serror);
+}
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 2196979a24a3..16ba5e9ac86c 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -818,8 +818,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
{
- events->exception.serror_pending = !!(vcpu->arch.hcr_el2 & HCR_VSE);
events->exception.serror_has_esr = cpus_have_final_cap(ARM64_HAS_RAS_EXTN);
+ events->exception.serror_pending = (vcpu->arch.hcr_el2 & HCR_VSE) ||
+ vcpu_get_flag(vcpu, NESTED_SERROR_PENDING);
if (events->exception.serror_pending && events->exception.serror_has_esr)
events->exception.serror_esr = vcpu_get_vsesr(vcpu);
@@ -833,29 +834,62 @@ int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
return 0;
}
+static void commit_pending_events(struct kvm_vcpu *vcpu)
+{
+ if (!vcpu_get_flag(vcpu, PENDING_EXCEPTION))
+ return;
+
+ /*
+ * Reset the MMIO emulation state to avoid stepping PC after emulating
+ * the exception entry.
+ */
+ vcpu->mmio_needed = false;
+ kvm_call_hyp(__kvm_adjust_pc, vcpu);
+}
+
int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
{
bool serror_pending = events->exception.serror_pending;
bool has_esr = events->exception.serror_has_esr;
bool ext_dabt_pending = events->exception.ext_dabt_pending;
+ u64 esr = events->exception.serror_esr;
+ int ret = 0;
- if (serror_pending && has_esr) {
- if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
- return -EINVAL;
-
- if (!((events->exception.serror_esr) & ~ESR_ELx_ISS_MASK))
- kvm_set_sei_esr(vcpu, events->exception.serror_esr);
- else
- return -EINVAL;
- } else if (serror_pending) {
- kvm_inject_vabt(vcpu);
+ /*
+ * Immediately commit the pending SEA to the vCPU's architectural
+ * state which is necessary since we do not return a pending SEA
+ * to userspace via KVM_GET_VCPU_EVENTS.
+ */
+ if (ext_dabt_pending) {
+ ret = kvm_inject_sea_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
+ commit_pending_events(vcpu);
}
- if (ext_dabt_pending)
- kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
+ if (ret < 0)
+ return ret;
- return 0;
+ if (!serror_pending)
+ return 0;
+
+ if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN) && has_esr)
+ return -EINVAL;
+
+ if (has_esr && (esr & ~ESR_ELx_ISS_MASK))
+ return -EINVAL;
+
+ if (has_esr)
+ ret = kvm_inject_serror_esr(vcpu, esr);
+ else
+ ret = kvm_inject_serror(vcpu);
+
+ /*
+ * We could've decided that the SError is due for immediate software
+ * injection; commit the exception in case userspace decides it wants
+ * to inject more exceptions for some strange reason.
+ */
+ commit_pending_events(vcpu);
+ return (ret < 0) ? ret : 0;
}
u32 __attribute_const__ kvm_target_cpu(void)
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index 453266c96481..a598072f36d2 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -32,7 +32,7 @@ typedef int (*exit_handle_fn)(struct kvm_vcpu *);
static void kvm_handle_guest_serror(struct kvm_vcpu *vcpu, u64 esr)
{
if (!arm64_is_ras_serror(esr) || arm64_is_fatal_ras_serror(NULL, esr))
- kvm_inject_vabt(vcpu);
+ kvm_inject_serror(vcpu);
}
static int handle_hvc(struct kvm_vcpu *vcpu)
@@ -252,7 +252,7 @@ static int kvm_handle_ptrauth(struct kvm_vcpu *vcpu)
return 1;
}
- if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) {
+ if (is_nested_ctxt(vcpu)) {
kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));
return 1;
}
@@ -311,12 +311,11 @@ static int kvm_handle_gcs(struct kvm_vcpu *vcpu)
static int handle_other(struct kvm_vcpu *vcpu)
{
- bool is_l2 = vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu);
+ bool allowed, fwd = is_nested_ctxt(vcpu);
u64 hcrx = __vcpu_sys_reg(vcpu, HCRX_EL2);
u64 esr = kvm_vcpu_get_esr(vcpu);
u64 iss = ESR_ELx_ISS(esr);
struct kvm *kvm = vcpu->kvm;
- bool allowed, fwd = false;
/*
* We only trap for two reasons:
@@ -335,28 +334,23 @@ static int handle_other(struct kvm_vcpu *vcpu)
switch (iss) {
case ESR_ELx_ISS_OTHER_ST64BV:
allowed = kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64_V);
- if (is_l2)
- fwd = !(hcrx & HCRX_EL2_EnASR);
+ fwd &= !(hcrx & HCRX_EL2_EnASR);
break;
case ESR_ELx_ISS_OTHER_ST64BV0:
allowed = kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64_ACCDATA);
- if (is_l2)
- fwd = !(hcrx & HCRX_EL2_EnAS0);
+ fwd &= !(hcrx & HCRX_EL2_EnAS0);
break;
case ESR_ELx_ISS_OTHER_LDST64B:
allowed = kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64);
- if (is_l2)
- fwd = !(hcrx & HCRX_EL2_EnALS);
+ fwd &= !(hcrx & HCRX_EL2_EnALS);
break;
case ESR_ELx_ISS_OTHER_TSBCSYNC:
allowed = kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceBuffer, TRBE_V1P1);
- if (is_l2)
- fwd = (__vcpu_sys_reg(vcpu, HFGITR2_EL2) & HFGITR2_EL2_TSBCSYNC);
+ fwd &= (__vcpu_sys_reg(vcpu, HFGITR2_EL2) & HFGITR2_EL2_TSBCSYNC);
break;
case ESR_ELx_ISS_OTHER_PSBCSYNC:
allowed = kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, V1P5);
- if (is_l2)
- fwd = (__vcpu_sys_reg(vcpu, HFGITR_EL2) & HFGITR_EL2_PSBCSYNC);
+ fwd &= (__vcpu_sys_reg(vcpu, HFGITR_EL2) & HFGITR_EL2_PSBCSYNC);
break;
default:
/* Clearly, we're missing something. */
@@ -496,7 +490,7 @@ void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index)
kvm_handle_guest_serror(vcpu, disr_to_esr(disr));
} else {
- kvm_inject_vabt(vcpu);
+ kvm_inject_serror(vcpu);
}
return;
diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c
index 6a2a899a344e..95d186e0bf54 100644
--- a/arch/arm64/kvm/hyp/exception.c
+++ b/arch/arm64/kvm/hyp/exception.c
@@ -26,7 +26,8 @@ static inline u64 __vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
if (unlikely(vcpu_has_nv(vcpu)))
return vcpu_read_sys_reg(vcpu, reg);
- else if (__vcpu_read_sys_reg_from_cpu(reg, &val))
+ else if (vcpu_get_flag(vcpu, SYSREGS_ON_CPU) &&
+ __vcpu_read_sys_reg_from_cpu(reg, &val))
return val;
return __vcpu_sys_reg(vcpu, reg);
@@ -36,7 +37,8 @@ static inline void __vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
{
if (unlikely(vcpu_has_nv(vcpu)))
vcpu_write_sys_reg(vcpu, val, reg);
- else if (!__vcpu_write_sys_reg_to_cpu(val, reg))
+ else if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU) ||
+ !__vcpu_write_sys_reg_to_cpu(val, reg))
__vcpu_assign_sys_reg(vcpu, reg, val);
}
@@ -339,6 +341,10 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu)
enter_exception64(vcpu, PSR_MODE_EL1h, except_type_sync);
break;
+ case unpack_vcpu_flag(EXCEPT_AA64_EL1_SERR):
+ enter_exception64(vcpu, PSR_MODE_EL1h, except_type_serror);
+ break;
+
case unpack_vcpu_flag(EXCEPT_AA64_EL2_SYNC):
enter_exception64(vcpu, PSR_MODE_EL2h, except_type_sync);
break;
@@ -347,9 +353,13 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu)
enter_exception64(vcpu, PSR_MODE_EL2h, except_type_irq);
break;
+ case unpack_vcpu_flag(EXCEPT_AA64_EL2_SERR):
+ enter_exception64(vcpu, PSR_MODE_EL2h, except_type_serror);
+ break;
+
default:
/*
- * Only EL1_SYNC and EL2_{SYNC,IRQ} makes
+ * Only EL1_{SYNC,SERR} and EL2_{SYNC,IRQ,SERR} makes
* sense so far. Everything else gets silently
* ignored.
*/
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 2ad57b117385..84ec4e100fbb 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -298,7 +298,7 @@ static inline void __deactivate_cptr_traps(struct kvm_vcpu *vcpu)
u64 val; \
\
ctxt_sys_reg(hctxt, reg) = read_sysreg_s(SYS_ ## reg); \
- if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) \
+ if (is_nested_ctxt(vcpu)) \
compute_clr_set(vcpu, reg, c, s); \
\
compute_undef_clr_set(vcpu, kvm, reg, c, s); \
@@ -436,7 +436,7 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
if (cpus_have_final_cap(ARM64_HAS_HCX)) {
u64 hcrx = vcpu->arch.hcrx_el2;
- if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) {
+ if (is_nested_ctxt(vcpu)) {
u64 val = __vcpu_sys_reg(vcpu, HCRX_EL2);
hcrx |= val & __HCRX_EL2_MASK;
hcrx &= ~(~val & __HCRX_EL2_nMASK);
@@ -476,21 +476,56 @@ static inline void ___activate_traps(struct kvm_vcpu *vcpu, u64 hcr)
write_sysreg_hcr(hcr);
- if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN) && (hcr & HCR_VSE))
- write_sysreg_s(vcpu->arch.vsesr_el2, SYS_VSESR_EL2);
+ if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN) && (hcr & HCR_VSE)) {
+ u64 vsesr;
+
+ /*
+ * When HCR_EL2.AMO is set, physical SErrors are taken to EL2
+ * and vSError injection is enabled for EL1. Conveniently, for
+ * NV this means that it is never the case where a 'physical'
+ * SError (injected by KVM or userspace) and vSError are
+ * deliverable to the same context.
+ *
+ * As such, we can trivially select between the host or guest's
+ * VSESR_EL2. Except for the case that FEAT_RAS hasn't been
+ * exposed to the guest, where ESR propagation in hardware
+ * occurs unconditionally.
+ *
+ * Paper over the architectural wart and use an IMPLEMENTATION
+ * DEFINED ESR value in case FEAT_RAS is hidden from the guest.
+ */
+ if (!vserror_state_is_nested(vcpu))
+ vsesr = vcpu->arch.vsesr_el2;
+ else if (kvm_has_ras(kern_hyp_va(vcpu->kvm)))
+ vsesr = __vcpu_sys_reg(vcpu, VSESR_EL2);
+ else
+ vsesr = ESR_ELx_ISV;
+
+ write_sysreg_s(vsesr, SYS_VSESR_EL2);
+ }
}
static inline void ___deactivate_traps(struct kvm_vcpu *vcpu)
{
+ u64 *hcr;
+
+ if (vserror_state_is_nested(vcpu))
+ hcr = __ctxt_sys_reg(&vcpu->arch.ctxt, HCR_EL2);
+ else
+ hcr = &vcpu->arch.hcr_el2;
+
/*
* If we pended a virtual abort, preserve it until it gets
* cleared. See D1.14.3 (Virtual Interrupts) for details, but
* the crucial bit is "On taking a vSError interrupt,
* HCR_EL2.VSE is cleared to 0."
+ *
+ * Additionally, when in a nested context we need to propagate the
+ * updated state to the guest hypervisor's HCR_EL2.
*/
- if (vcpu->arch.hcr_el2 & HCR_VSE) {
- vcpu->arch.hcr_el2 &= ~HCR_VSE;
- vcpu->arch.hcr_el2 |= read_sysreg(hcr_el2) & HCR_VSE;
+ if (*hcr & HCR_VSE) {
+ *hcr &= ~HCR_VSE;
+ *hcr |= read_sysreg(hcr_el2) & HCR_VSE;
}
}
@@ -531,7 +566,7 @@ static inline void __hyp_sve_restore_guest(struct kvm_vcpu *vcpu)
* nested guest, as the guest hypervisor could select a smaller VL. Slap
* that into hardware before wrapping up.
*/
- if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu))
+ if (is_nested_ctxt(vcpu))
sve_cond_update_zcr_vq(__vcpu_sys_reg(vcpu, ZCR_EL2), SYS_ZCR_EL2);
write_sysreg_el1(__vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)), SYS_ZCR);
@@ -557,7 +592,7 @@ static inline void fpsimd_lazy_switch_to_guest(struct kvm_vcpu *vcpu)
if (vcpu_has_sve(vcpu)) {
/* A guest hypervisor may restrict the effective max VL. */
- if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu))
+ if (is_nested_ctxt(vcpu))
zcr_el2 = __vcpu_sys_reg(vcpu, ZCR_EL2);
else
zcr_el2 = vcpu_sve_max_vq(vcpu) - 1;
diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
index 4d0dbea4c56f..a17cbe7582de 100644
--- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
+++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
@@ -109,6 +109,28 @@ static inline bool ctxt_has_s1poe(struct kvm_cpu_context *ctxt)
return kvm_has_s1poe(kern_hyp_va(vcpu->kvm));
}
+static inline bool ctxt_has_ras(struct kvm_cpu_context *ctxt)
+{
+ struct kvm_vcpu *vcpu;
+
+ if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
+ return false;
+
+ vcpu = ctxt_to_vcpu(ctxt);
+ return kvm_has_ras(kern_hyp_va(vcpu->kvm));
+}
+
+static inline bool ctxt_has_sctlr2(struct kvm_cpu_context *ctxt)
+{
+ struct kvm_vcpu *vcpu;
+
+ if (!cpus_have_final_cap(ARM64_HAS_SCTLR2))
+ return false;
+
+ vcpu = ctxt_to_vcpu(ctxt);
+ return kvm_has_sctlr2(kern_hyp_va(vcpu->kvm));
+}
+
static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
{
ctxt_sys_reg(ctxt, SCTLR_EL1) = read_sysreg_el1(SYS_SCTLR);
@@ -147,6 +169,9 @@ static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
ctxt_sys_reg(ctxt, SP_EL1) = read_sysreg(sp_el1);
ctxt_sys_reg(ctxt, ELR_EL1) = read_sysreg_el1(SYS_ELR);
ctxt_sys_reg(ctxt, SPSR_EL1) = read_sysreg_el1(SYS_SPSR);
+
+ if (ctxt_has_sctlr2(ctxt))
+ ctxt_sys_reg(ctxt, SCTLR2_EL1) = read_sysreg_el1(SYS_SCTLR2);
}
static inline void __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt)
@@ -159,8 +184,13 @@ static inline void __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt)
if (!has_vhe() && ctxt->__hyp_running_vcpu)
ctxt->regs.pstate = read_sysreg_el2(SYS_SPSR);
- if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
+ if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
+ return;
+
+ if (!vserror_state_is_nested(ctxt_to_vcpu(ctxt)))
ctxt_sys_reg(ctxt, DISR_EL1) = read_sysreg_s(SYS_VDISR_EL2);
+ else if (ctxt_has_ras(ctxt))
+ ctxt_sys_reg(ctxt, VDISR_EL2) = read_sysreg_s(SYS_VDISR_EL2);
}
static inline void __sysreg_restore_common_state(struct kvm_cpu_context *ctxt)
@@ -252,6 +282,9 @@ static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt,
write_sysreg(ctxt_sys_reg(ctxt, SP_EL1), sp_el1);
write_sysreg_el1(ctxt_sys_reg(ctxt, ELR_EL1), SYS_ELR);
write_sysreg_el1(ctxt_sys_reg(ctxt, SPSR_EL1), SYS_SPSR);
+
+ if (ctxt_has_sctlr2(ctxt))
+ write_sysreg_el1(ctxt_sys_reg(ctxt, SCTLR2_EL1), SYS_SCTLR2);
}
/* Read the VCPU state's PSTATE, but translate (v)EL2 to EL1. */
@@ -275,6 +308,7 @@ static inline void __sysreg_restore_el2_return_state(struct kvm_cpu_context *ctx
{
u64 pstate = to_hw_pstate(ctxt);
u64 mode = pstate & PSR_AA32_MODE_MASK;
+ u64 vdisr;
/*
* Safety check to ensure we're setting the CPU up to enter the guest
@@ -293,8 +327,17 @@ static inline void __sysreg_restore_el2_return_state(struct kvm_cpu_context *ctx
write_sysreg_el2(ctxt->regs.pc, SYS_ELR);
write_sysreg_el2(pstate, SYS_SPSR);
- if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
- write_sysreg_s(ctxt_sys_reg(ctxt, DISR_EL1), SYS_VDISR_EL2);
+ if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
+ return;
+
+ if (!vserror_state_is_nested(ctxt_to_vcpu(ctxt)))
+ vdisr = ctxt_sys_reg(ctxt, DISR_EL1);
+ else if (ctxt_has_ras(ctxt))
+ vdisr = ctxt_sys_reg(ctxt, VDISR_EL2);
+ else
+ vdisr = 0;
+
+ write_sysreg_s(vdisr, SYS_VDISR_EL2);
}
static inline void __sysreg32_save_state(struct kvm_vcpu *vcpu)
diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c
index 2f4a4f5036bb..2a1c0f49792b 100644
--- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c
+++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c
@@ -92,12 +92,42 @@ static void __trace_switch_to_host(void)
*host_data_ptr(host_debug_state.trfcr_el1));
}
+static void __debug_save_brbe(u64 *brbcr_el1)
+{
+ *brbcr_el1 = 0;
+
+ /* Check if the BRBE is enabled */
+ if (!(read_sysreg_el1(SYS_BRBCR) & (BRBCR_ELx_E0BRE | BRBCR_ELx_ExBRE)))
+ return;
+
+ /*
+ * Prohibit branch record generation while we are in guest.
+ * Since access to BRBCR_EL1 is trapped, the guest can't
+ * modify the filtering set by the host.
+ */
+ *brbcr_el1 = read_sysreg_el1(SYS_BRBCR);
+ write_sysreg_el1(0, SYS_BRBCR);
+}
+
+static void __debug_restore_brbe(u64 brbcr_el1)
+{
+ if (!brbcr_el1)
+ return;
+
+ /* Restore BRBE controls */
+ write_sysreg_el1(brbcr_el1, SYS_BRBCR);
+}
+
void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu)
{
/* Disable and flush SPE data generation */
if (host_data_test_flag(HAS_SPE))
__debug_save_spe(host_data_ptr(host_debug_state.pmscr_el1));
+ /* Disable BRBE branch records */
+ if (host_data_test_flag(HAS_BRBE))
+ __debug_save_brbe(host_data_ptr(host_debug_state.brbcr_el1));
+
if (__trace_needs_switch())
__trace_switch_to_guest();
}
@@ -111,6 +141,8 @@ void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu)
{
if (host_data_test_flag(HAS_SPE))
__debug_restore_spe(*host_data_ptr(host_debug_state.pmscr_el1));
+ if (host_data_test_flag(HAS_BRBE))
+ __debug_restore_brbe(*host_data_ptr(host_debug_state.brbcr_el1));
if (__trace_needs_switch())
__trace_switch_to_host();
}
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index 0e752b515d0f..ccd575d5f6de 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -272,7 +272,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
* We're about to restore some new MMU state. Make sure
* ongoing page-table walks that have started before we
* trapped to EL2 have completed. This also synchronises the
- * above disabling of SPE and TRBE.
+ * above disabling of BRBE, SPE and TRBE.
*
* See DDI0487I.a D8.1.5 "Out-of-context translation regimes",
* rule R_LFHQG and subsequent information statements.
diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c
index f162b0df5cae..d81275790e69 100644
--- a/arch/arm64/kvm/hyp/vgic-v3-sr.c
+++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c
@@ -296,12 +296,19 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if)
}
/*
- * Prevent the guest from touching the ICC_SRE_EL1 system
- * register. Note that this may not have any effect, as
- * ICC_SRE_EL2.Enable being RAO/WI is a valid implementation.
+ * GICv5 BET0 FEAT_GCIE_LEGACY doesn't include ICC_SRE_EL2. This is due
+ * to be relaxed in a future spec release, at which point this in
+ * condition can be dropped.
*/
- write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE,
- ICC_SRE_EL2);
+ if (!cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF)) {
+ /*
+ * Prevent the guest from touching the ICC_SRE_EL1 system
+ * register. Note that this may not have any effect, as
+ * ICC_SRE_EL2.Enable being RAO/WI is a valid implementation.
+ */
+ write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE,
+ ICC_SRE_EL2);
+ }
/*
* If we need to trap system registers, we must write
@@ -322,8 +329,14 @@ void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if)
cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2);
}
- val = read_gicreg(ICC_SRE_EL2);
- write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2);
+ /*
+ * Can be dropped in the future when GICv5 spec is relaxed. See comment
+ * above.
+ */
+ if (!cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF)) {
+ val = read_gicreg(ICC_SRE_EL2);
+ write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2);
+ }
if (!cpu_if->vgic_sre) {
/* Make sure ENABLE is set at EL2 before setting SRE at EL1 */
@@ -423,10 +436,20 @@ void __vgic_v3_init_lrs(void)
*/
u64 __vgic_v3_get_gic_config(void)
{
- u64 val, sre = read_gicreg(ICC_SRE_EL1);
+ u64 val, sre;
unsigned long flags = 0;
/*
+ * In compat mode, we cannot access ICC_SRE_EL1 at any EL
+ * other than EL1 itself; just return the
+ * ICH_VTR_EL2. ICC_IDR0_EL1 is only implemented on a GICv5
+ * system, so we first check if we have GICv5 support.
+ */
+ if (cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF))
+ return read_gicreg(ICH_VTR_EL2);
+
+ sre = read_gicreg(ICC_SRE_EL1);
+ /*
* To check whether we have a MMIO-based (GICv2 compatible)
* CPU interface, we need to disable the system register
* view.
@@ -471,6 +494,16 @@ u64 __vgic_v3_get_gic_config(void)
return val;
}
+static void __vgic_v3_compat_mode_enable(void)
+{
+ if (!cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF))
+ return;
+
+ sysreg_clear_set_s(SYS_ICH_VCTLR_EL2, 0, ICH_VCTLR_EL2_V3);
+ /* Wait for V3 to become enabled */
+ isb();
+}
+
static u64 __vgic_v3_read_vmcr(void)
{
return read_gicreg(ICH_VMCR_EL2);
@@ -490,6 +523,8 @@ void __vgic_v3_save_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if)
void __vgic_v3_restore_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if)
{
+ __vgic_v3_compat_mode_enable();
+
/*
* If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen
* is dependent on ICC_SRE_EL1.SRE, and we have to perform the
@@ -1050,7 +1085,7 @@ static bool __vgic_v3_check_trap_forwarding(struct kvm_vcpu *vcpu,
{
u64 ich_hcr;
- if (!vcpu_has_nv(vcpu) || is_hyp_ctxt(vcpu))
+ if (!is_nested_ctxt(vcpu))
return false;
ich_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index 477f1580ffea..e482181c6632 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -48,8 +48,7 @@ DEFINE_PER_CPU(unsigned long, kvm_hyp_vector);
static u64 __compute_hcr(struct kvm_vcpu *vcpu)
{
- u64 guest_hcr = __vcpu_sys_reg(vcpu, HCR_EL2);
- u64 hcr = vcpu->arch.hcr_el2;
+ u64 guest_hcr, hcr = vcpu->arch.hcr_el2;
if (!vcpu_has_nv(vcpu))
return hcr;
@@ -68,10 +67,21 @@ static u64 __compute_hcr(struct kvm_vcpu *vcpu)
if (!vcpu_el2_e2h_is_set(vcpu))
hcr |= HCR_NV1;
+ /*
+ * Nothing in HCR_EL2 should impact running in hypervisor
+ * context, apart from bits we have defined as RESx (E2H,
+ * HCD and co), or that cannot be set directly (the EXCLUDE
+ * bits). Given that we OR the guest's view with the host's,
+ * we can use the 0 value as the starting point, and only
+ * use the config-driven RES1 bits.
+ */
+ guest_hcr = kvm_vcpu_apply_reg_masks(vcpu, HCR_EL2, 0);
+
write_sysreg_s(vcpu->arch.ctxt.vncr_array, SYS_VNCR_EL2);
} else {
host_data_clear_flag(VCPU_IN_HYP_CONTEXT);
+ guest_hcr = __vcpu_sys_reg(vcpu, HCR_EL2);
if (guest_hcr & HCR_NV) {
u64 va = __fix_to_virt(vncr_fixmap(smp_processor_id()));
diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
index 73e4bc7fde9e..f28c6cf4fe1b 100644
--- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
@@ -77,6 +77,9 @@ static void __sysreg_save_vel2_state(struct kvm_vcpu *vcpu)
__vcpu_assign_sys_reg(vcpu, SP_EL2, read_sysreg(sp_el1));
__vcpu_assign_sys_reg(vcpu, ELR_EL2, read_sysreg_el1(SYS_ELR));
__vcpu_assign_sys_reg(vcpu, SPSR_EL2, read_sysreg_el1(SYS_SPSR));
+
+ if (ctxt_has_sctlr2(&vcpu->arch.ctxt))
+ __vcpu_assign_sys_reg(vcpu, SCTLR2_EL2, read_sysreg_el1(SYS_SCTLR2));
}
static void __sysreg_restore_vel2_state(struct kvm_vcpu *vcpu)
@@ -139,6 +142,9 @@ static void __sysreg_restore_vel2_state(struct kvm_vcpu *vcpu)
write_sysreg(__vcpu_sys_reg(vcpu, SP_EL2), sp_el1);
write_sysreg_el1(__vcpu_sys_reg(vcpu, ELR_EL2), SYS_ELR);
write_sysreg_el1(__vcpu_sys_reg(vcpu, SPSR_EL2), SYS_SPSR);
+
+ if (ctxt_has_sctlr2(&vcpu->arch.ctxt))
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, SCTLR2_EL2), SYS_SCTLR2);
}
/*
diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c
index a640e839848e..6745f38b64f9 100644
--- a/arch/arm64/kvm/inject_fault.c
+++ b/arch/arm64/kvm/inject_fault.c
@@ -15,13 +15,11 @@
#include <asm/kvm_nested.h>
#include <asm/esr.h>
-static void pend_sync_exception(struct kvm_vcpu *vcpu)
+static unsigned int exception_target_el(struct kvm_vcpu *vcpu)
{
/* If not nesting, EL1 is the only possible exception target */
- if (likely(!vcpu_has_nv(vcpu))) {
- kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC);
- return;
- }
+ if (likely(!vcpu_has_nv(vcpu)))
+ return PSR_MODE_EL1h;
/*
* With NV, we need to pick between EL1 and EL2. Note that we
@@ -32,26 +30,76 @@ static void pend_sync_exception(struct kvm_vcpu *vcpu)
switch(*vcpu_cpsr(vcpu) & PSR_MODE_MASK) {
case PSR_MODE_EL2h:
case PSR_MODE_EL2t:
- kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SYNC);
- break;
+ return PSR_MODE_EL2h;
case PSR_MODE_EL1h:
case PSR_MODE_EL1t:
- kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC);
- break;
+ return PSR_MODE_EL1h;
case PSR_MODE_EL0t:
- if (vcpu_el2_tge_is_set(vcpu))
- kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SYNC);
- else
- kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC);
- break;
+ return vcpu_el2_tge_is_set(vcpu) ? PSR_MODE_EL2h : PSR_MODE_EL1h;
default:
BUG();
}
}
-static bool match_target_el(struct kvm_vcpu *vcpu, unsigned long target)
+static enum vcpu_sysreg exception_esr_elx(struct kvm_vcpu *vcpu)
+{
+ if (exception_target_el(vcpu) == PSR_MODE_EL2h)
+ return ESR_EL2;
+
+ return ESR_EL1;
+}
+
+static enum vcpu_sysreg exception_far_elx(struct kvm_vcpu *vcpu)
+{
+ if (exception_target_el(vcpu) == PSR_MODE_EL2h)
+ return FAR_EL2;
+
+ return FAR_EL1;
+}
+
+static void pend_sync_exception(struct kvm_vcpu *vcpu)
+{
+ if (exception_target_el(vcpu) == PSR_MODE_EL1h)
+ kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC);
+ else
+ kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SYNC);
+}
+
+static void pend_serror_exception(struct kvm_vcpu *vcpu)
{
- return (vcpu_get_flag(vcpu, EXCEPT_MASK) == target);
+ if (exception_target_el(vcpu) == PSR_MODE_EL1h)
+ kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SERR);
+ else
+ kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SERR);
+}
+
+static bool __effective_sctlr2_bit(struct kvm_vcpu *vcpu, unsigned int idx)
+{
+ u64 sctlr2;
+
+ if (!kvm_has_sctlr2(vcpu->kvm))
+ return false;
+
+ if (is_nested_ctxt(vcpu) &&
+ !(__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_SCTLR2En))
+ return false;
+
+ if (exception_target_el(vcpu) == PSR_MODE_EL1h)
+ sctlr2 = vcpu_read_sys_reg(vcpu, SCTLR2_EL1);
+ else
+ sctlr2 = vcpu_read_sys_reg(vcpu, SCTLR2_EL2);
+
+ return sctlr2 & BIT(idx);
+}
+
+static bool effective_sctlr2_ease(struct kvm_vcpu *vcpu)
+{
+ return __effective_sctlr2_bit(vcpu, SCTLR2_EL1_EASE_SHIFT);
+}
+
+static bool effective_sctlr2_nmea(struct kvm_vcpu *vcpu)
+{
+ return __effective_sctlr2_bit(vcpu, SCTLR2_EL1_NMEA_SHIFT);
}
static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr)
@@ -60,7 +108,11 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr
bool is_aarch32 = vcpu_mode_is_32bit(vcpu);
u64 esr = 0;
- pend_sync_exception(vcpu);
+ /* This delight is brought to you by FEAT_DoubleFault2. */
+ if (effective_sctlr2_ease(vcpu))
+ pend_serror_exception(vcpu);
+ else
+ pend_sync_exception(vcpu);
/*
* Build an {i,d}abort, depending on the level and the
@@ -83,13 +135,8 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr
esr |= ESR_ELx_FSC_EXTABT;
- if (match_target_el(vcpu, unpack_vcpu_flag(EXCEPT_AA64_EL1_SYNC))) {
- vcpu_write_sys_reg(vcpu, addr, FAR_EL1);
- vcpu_write_sys_reg(vcpu, esr, ESR_EL1);
- } else {
- vcpu_write_sys_reg(vcpu, addr, FAR_EL2);
- vcpu_write_sys_reg(vcpu, esr, ESR_EL2);
- }
+ vcpu_write_sys_reg(vcpu, addr, exception_far_elx(vcpu));
+ vcpu_write_sys_reg(vcpu, esr, exception_esr_elx(vcpu));
}
static void inject_undef64(struct kvm_vcpu *vcpu)
@@ -105,10 +152,7 @@ static void inject_undef64(struct kvm_vcpu *vcpu)
if (kvm_vcpu_trap_il_is32bit(vcpu))
esr |= ESR_ELx_IL;
- if (match_target_el(vcpu, unpack_vcpu_flag(EXCEPT_AA64_EL1_SYNC)))
- vcpu_write_sys_reg(vcpu, esr, ESR_EL1);
- else
- vcpu_write_sys_reg(vcpu, esr, ESR_EL2);
+ vcpu_write_sys_reg(vcpu, esr, exception_esr_elx(vcpu));
}
#define DFSR_FSC_EXTABT_LPAE 0x10
@@ -155,36 +199,35 @@ static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt, u32 addr)
vcpu_write_sys_reg(vcpu, far, FAR_EL1);
}
-/**
- * kvm_inject_dabt - inject a data abort into the guest
- * @vcpu: The VCPU to receive the data abort
- * @addr: The address to report in the DFAR
- *
- * It is assumed that this code is called from the VCPU thread and that the
- * VCPU therefore is not currently executing guest code.
- */
-void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr)
+static void __kvm_inject_sea(struct kvm_vcpu *vcpu, bool iabt, u64 addr)
{
if (vcpu_el1_is_32bit(vcpu))
- inject_abt32(vcpu, false, addr);
+ inject_abt32(vcpu, iabt, addr);
else
- inject_abt64(vcpu, false, addr);
+ inject_abt64(vcpu, iabt, addr);
}
-/**
- * kvm_inject_pabt - inject a prefetch abort into the guest
- * @vcpu: The VCPU to receive the prefetch abort
- * @addr: The address to report in the DFAR
- *
- * It is assumed that this code is called from the VCPU thread and that the
- * VCPU therefore is not currently executing guest code.
- */
-void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr)
+static bool kvm_sea_target_is_el2(struct kvm_vcpu *vcpu)
{
- if (vcpu_el1_is_32bit(vcpu))
- inject_abt32(vcpu, true, addr);
- else
- inject_abt64(vcpu, true, addr);
+ if (__vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_TGE | HCR_TEA))
+ return true;
+
+ if (!vcpu_mode_priv(vcpu))
+ return false;
+
+ return (*vcpu_cpsr(vcpu) & PSR_A_BIT) &&
+ (__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TMEA);
+}
+
+int kvm_inject_sea(struct kvm_vcpu *vcpu, bool iabt, u64 addr)
+{
+ lockdep_assert_held(&vcpu->mutex);
+
+ if (is_nested_ctxt(vcpu) && kvm_sea_target_is_el2(vcpu))
+ return kvm_inject_nested_sea(vcpu, iabt, addr);
+
+ __kvm_inject_sea(vcpu, iabt, addr);
+ return 1;
}
void kvm_inject_size_fault(struct kvm_vcpu *vcpu)
@@ -194,10 +237,7 @@ void kvm_inject_size_fault(struct kvm_vcpu *vcpu)
addr = kvm_vcpu_get_fault_ipa(vcpu);
addr |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
- if (kvm_vcpu_trap_is_iabt(vcpu))
- kvm_inject_pabt(vcpu, addr);
- else
- kvm_inject_dabt(vcpu, addr);
+ __kvm_inject_sea(vcpu, kvm_vcpu_trap_is_iabt(vcpu), addr);
/*
* If AArch64 or LPAE, set FSC to 0 to indicate an Address
@@ -210,9 +250,9 @@ void kvm_inject_size_fault(struct kvm_vcpu *vcpu)
!(vcpu_read_sys_reg(vcpu, TCR_EL1) & TTBCR_EAE))
return;
- esr = vcpu_read_sys_reg(vcpu, ESR_EL1);
+ esr = vcpu_read_sys_reg(vcpu, exception_esr_elx(vcpu));
esr &= ~GENMASK_ULL(5, 0);
- vcpu_write_sys_reg(vcpu, esr, ESR_EL1);
+ vcpu_write_sys_reg(vcpu, esr, exception_esr_elx(vcpu));
}
/**
@@ -230,25 +270,70 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu)
inject_undef64(vcpu);
}
-void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 esr)
+static bool serror_is_masked(struct kvm_vcpu *vcpu)
{
- vcpu_set_vsesr(vcpu, esr & ESR_ELx_ISS_MASK);
- *vcpu_hcr(vcpu) |= HCR_VSE;
+ return (*vcpu_cpsr(vcpu) & PSR_A_BIT) && !effective_sctlr2_nmea(vcpu);
}
-/**
- * kvm_inject_vabt - inject an async abort / SError into the guest
- * @vcpu: The VCPU to receive the exception
- *
- * It is assumed that this code is called from the VCPU thread and that the
- * VCPU therefore is not currently executing guest code.
- *
- * Systems with the RAS Extensions specify an imp-def ESR (ISV/IDS = 1) with
- * the remaining ISS all-zeros so that this error is not interpreted as an
- * uncategorized RAS error. Without the RAS Extensions we can't specify an ESR
- * value, so the CPU generates an imp-def value.
- */
-void kvm_inject_vabt(struct kvm_vcpu *vcpu)
+static bool kvm_serror_target_is_el2(struct kvm_vcpu *vcpu)
+{
+ if (is_hyp_ctxt(vcpu) || vcpu_el2_amo_is_set(vcpu))
+ return true;
+
+ if (!(__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TMEA))
+ return false;
+
+ /*
+ * In another example where FEAT_DoubleFault2 is entirely backwards,
+ * "masked" as it relates to the routing effects of HCRX_EL2.TMEA
+ * doesn't consider SCTLR2_EL1.NMEA. That is to say, even if EL1 asked
+ * for non-maskable SErrors, the EL2 bit takes priority if A is set.
+ */
+ if (vcpu_mode_priv(vcpu))
+ return *vcpu_cpsr(vcpu) & PSR_A_BIT;
+
+ /*
+ * Otherwise SErrors are considered unmasked when taken from EL0 and
+ * NMEA is set.
+ */
+ return serror_is_masked(vcpu);
+}
+
+static bool kvm_serror_undeliverable_at_el2(struct kvm_vcpu *vcpu)
+{
+ return !(vcpu_el2_tge_is_set(vcpu) || vcpu_el2_amo_is_set(vcpu));
+}
+
+int kvm_inject_serror_esr(struct kvm_vcpu *vcpu, u64 esr)
{
- kvm_set_sei_esr(vcpu, ESR_ELx_ISV);
+ lockdep_assert_held(&vcpu->mutex);
+
+ if (is_nested_ctxt(vcpu) && kvm_serror_target_is_el2(vcpu))
+ return kvm_inject_nested_serror(vcpu, esr);
+
+ if (vcpu_is_el2(vcpu) && kvm_serror_undeliverable_at_el2(vcpu)) {
+ vcpu_set_vsesr(vcpu, esr);
+ vcpu_set_flag(vcpu, NESTED_SERROR_PENDING);
+ return 1;
+ }
+
+ /*
+ * Emulate the exception entry if SErrors are unmasked. This is useful if
+ * the vCPU is in a nested context w/ vSErrors enabled then we've already
+ * delegated he hardware vSError context (i.e. HCR_EL2.VSE, VSESR_EL2,
+ * VDISR_EL2) to the guest hypervisor.
+ *
+ * As we're emulating the SError injection we need to explicitly populate
+ * ESR_ELx.EC because hardware will not do it on our behalf.
+ */
+ if (!serror_is_masked(vcpu)) {
+ pend_serror_exception(vcpu);
+ esr |= FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_SERROR);
+ vcpu_write_sys_reg(vcpu, esr, exception_esr_elx(vcpu));
+ return 1;
+ }
+
+ vcpu_set_vsesr(vcpu, esr & ESR_ELx_ISS_MASK);
+ *vcpu_hcr(vcpu) |= HCR_VSE;
+ return 1;
}
diff --git a/arch/arm64/kvm/mmio.c b/arch/arm64/kvm/mmio.c
index ab365e839874..54f9358c9e0e 100644
--- a/arch/arm64/kvm/mmio.c
+++ b/arch/arm64/kvm/mmio.c
@@ -72,7 +72,7 @@ unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len)
return data;
}
-static bool kvm_pending_sync_exception(struct kvm_vcpu *vcpu)
+static bool kvm_pending_external_abort(struct kvm_vcpu *vcpu)
{
if (!vcpu_get_flag(vcpu, PENDING_EXCEPTION))
return false;
@@ -90,6 +90,8 @@ static bool kvm_pending_sync_exception(struct kvm_vcpu *vcpu)
switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) {
case unpack_vcpu_flag(EXCEPT_AA64_EL1_SYNC):
case unpack_vcpu_flag(EXCEPT_AA64_EL2_SYNC):
+ case unpack_vcpu_flag(EXCEPT_AA64_EL1_SERR):
+ case unpack_vcpu_flag(EXCEPT_AA64_EL2_SERR):
return true;
default:
return false;
@@ -113,7 +115,7 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu)
* Detect if the MMIO return was already handled or if userspace aborted
* the MMIO access.
*/
- if (unlikely(!vcpu->mmio_needed || kvm_pending_sync_exception(vcpu)))
+ if (unlikely(!vcpu->mmio_needed || kvm_pending_external_abort(vcpu)))
return 1;
vcpu->mmio_needed = 0;
@@ -169,10 +171,8 @@ int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
trace_kvm_mmio_nisv(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
kvm_vcpu_get_hfar(vcpu), fault_ipa);
- if (vcpu_is_protected(vcpu)) {
- kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
- return 1;
- }
+ if (vcpu_is_protected(vcpu))
+ return kvm_inject_sea_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
if (test_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER,
&vcpu->kvm->arch.flags)) {
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 2942ec92c5a4..1c78864767c5 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -193,11 +193,6 @@ int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm,
return 0;
}
-static bool kvm_is_device_pfn(unsigned long pfn)
-{
- return !pfn_is_map_memory(pfn);
-}
-
static void *stage2_memcache_zalloc_page(void *arg)
{
struct kvm_mmu_memory_cache *mc = arg;
@@ -1470,6 +1465,18 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
return vma->vm_flags & VM_MTE_ALLOWED;
}
+static bool kvm_vma_is_cacheable(struct vm_area_struct *vma)
+{
+ switch (FIELD_GET(PTE_ATTRINDX_MASK, pgprot_val(vma->vm_page_prot))) {
+ case MT_NORMAL_NC:
+ case MT_DEVICE_nGnRnE:
+ case MT_DEVICE_nGnRE:
+ return false;
+ default:
+ return true;
+ }
+}
+
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
struct kvm_s2_trans *nested,
struct kvm_memory_slot *memslot, unsigned long hva,
@@ -1477,8 +1484,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
{
int ret = 0;
bool write_fault, writable, force_pte = false;
- bool exec_fault, mte_allowed;
- bool device = false, vfio_allow_any_uc = false;
+ bool exec_fault, mte_allowed, is_vma_cacheable;
+ bool s2_force_noncacheable = false, vfio_allow_any_uc = false;
unsigned long mmu_seq;
phys_addr_t ipa = fault_ipa;
struct kvm *kvm = vcpu->kvm;
@@ -1492,6 +1499,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
struct kvm_pgtable *pgt;
struct page *page;
+ vm_flags_t vm_flags;
enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;
if (fault_is_perm)
@@ -1619,6 +1627,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
+ vm_flags = vma->vm_flags;
+
+ is_vma_cacheable = kvm_vma_is_cacheable(vma);
+
/* Don't use the VMA after the unlock -- it may have vanished */
vma = NULL;
@@ -1642,18 +1654,39 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (is_error_noslot_pfn(pfn))
return -EFAULT;
- if (kvm_is_device_pfn(pfn)) {
- /*
- * If the page was identified as device early by looking at
- * the VMA flags, vma_pagesize is already representing the
- * largest quantity we can map. If instead it was mapped
- * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE
- * and must not be upgraded.
- *
- * In both cases, we don't let transparent_hugepage_adjust()
- * change things at the last minute.
- */
- device = true;
+ /*
+ * Check if this is non-struct page memory PFN, and cannot support
+ * CMOs. It could potentially be unsafe to access as cachable.
+ */
+ if (vm_flags & (VM_PFNMAP | VM_MIXEDMAP) && !pfn_is_map_memory(pfn)) {
+ if (is_vma_cacheable) {
+ /*
+ * Whilst the VMA owner expects cacheable mapping to this
+ * PFN, hardware also has to support the FWB and CACHE DIC
+ * features.
+ *
+ * ARM64 KVM relies on kernel VA mapping to the PFN to
+ * perform cache maintenance as the CMO instructions work on
+ * virtual addresses. VM_PFNMAP region are not necessarily
+ * mapped to a KVA and hence the presence of hardware features
+ * S2FWB and CACHE DIC are mandatory to avoid the need for
+ * cache maintenance.
+ */
+ if (!kvm_supports_cacheable_pfnmap())
+ return -EFAULT;
+ } else {
+ /*
+ * If the page was identified as device early by looking at
+ * the VMA flags, vma_pagesize is already representing the
+ * largest quantity we can map. If instead it was mapped
+ * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE
+ * and must not be upgraded.
+ *
+ * In both cases, we don't let transparent_hugepage_adjust()
+ * change things at the last minute.
+ */
+ s2_force_noncacheable = true;
+ }
} else if (logging_active && !write_fault) {
/*
* Only actually map the page as writable if this was a write
@@ -1662,7 +1695,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
writable = false;
}
- if (exec_fault && device)
+ if (exec_fault && s2_force_noncacheable)
return -ENOEXEC;
/*
@@ -1695,7 +1728,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* If we are not forced to use page mapping, check if we are
* backed by a THP and thus use block mapping if possible.
*/
- if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) {
+ if (vma_pagesize == PAGE_SIZE && !(force_pte || s2_force_noncacheable)) {
if (fault_is_perm && fault_granule > PAGE_SIZE)
vma_pagesize = fault_granule;
else
@@ -1709,7 +1742,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
}
}
- if (!fault_is_perm && !device && kvm_has_mte(kvm)) {
+ if (!fault_is_perm && !s2_force_noncacheable && kvm_has_mte(kvm)) {
/* Check the VMM hasn't introduced a new disallowed VMA */
if (mte_allowed) {
sanitise_mte_tags(kvm, pfn, vma_pagesize);
@@ -1725,7 +1758,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (exec_fault)
prot |= KVM_PGTABLE_PROT_X;
- if (device) {
+ if (s2_force_noncacheable) {
if (vfio_allow_any_uc)
prot |= KVM_PGTABLE_PROT_NORMAL_NC;
else
@@ -1808,7 +1841,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
* There is no need to pass the error into the guest.
*/
if (kvm_handle_guest_sea())
- kvm_inject_vabt(vcpu);
+ return kvm_inject_serror(vcpu);
return 1;
}
@@ -1836,11 +1869,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
if (fault_ipa >= BIT_ULL(VTCR_EL2_IPA(vcpu->arch.hw_mmu->vtcr))) {
fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
- if (is_iabt)
- kvm_inject_pabt(vcpu, fault_ipa);
- else
- kvm_inject_dabt(vcpu, fault_ipa);
- return 1;
+ return kvm_inject_sea(vcpu, is_iabt, fault_ipa);
}
}
@@ -1912,8 +1941,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
}
if (kvm_vcpu_abt_iss1tw(vcpu)) {
- kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
- ret = 1;
+ ret = kvm_inject_sea_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
goto out_unlock;
}
@@ -1958,10 +1986,8 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
if (ret == 0)
ret = 1;
out:
- if (ret == -ENOEXEC) {
- kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
- ret = 1;
- }
+ if (ret == -ENOEXEC)
+ ret = kvm_inject_sea_iabt(vcpu, kvm_vcpu_get_hfar(vcpu));
out_unlock:
srcu_read_unlock(&vcpu->kvm->srcu, idx);
return ret;
@@ -2221,6 +2247,15 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
ret = -EINVAL;
break;
}
+
+ /*
+ * Cacheable PFNMAP is allowed only if the hardware
+ * supports it.
+ */
+ if (kvm_vma_is_cacheable(vma) && !kvm_supports_cacheable_pfnmap()) {
+ ret = -EINVAL;
+ break;
+ }
}
hva = min(reg_end, vma->vm_end);
} while (hva < reg_end);
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index dc1d26559bfa..153b3e11b115 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -1441,12 +1441,11 @@ u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val)
break;
case SYS_ID_AA64PFR0_EL1:
- /* No RME, AMU, MPAM, S-EL2, or RAS */
+ /* No RME, AMU, MPAM, or S-EL2 */
val &= ~(ID_AA64PFR0_EL1_RME |
ID_AA64PFR0_EL1_AMU |
ID_AA64PFR0_EL1_MPAM |
ID_AA64PFR0_EL1_SEL2 |
- ID_AA64PFR0_EL1_RAS |
ID_AA64PFR0_EL1_EL3 |
ID_AA64PFR0_EL1_EL2 |
ID_AA64PFR0_EL1_EL1 |
@@ -1683,69 +1682,21 @@ int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu)
set_sysreg_masks(kvm, HFGITR2_EL2, res0, res1);
/* TCR2_EL2 */
- res0 = TCR2_EL2_RES0;
- res1 = TCR2_EL2_RES1;
- if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, D128, IMP))
- res0 |= (TCR2_EL2_DisCH0 | TCR2_EL2_DisCH1 | TCR2_EL2_D128);
- if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, MEC, IMP))
- res0 |= TCR2_EL2_AMEC1 | TCR2_EL2_AMEC0;
- if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, HAFDBS, HAFT))
- res0 |= TCR2_EL2_HAFT;
- if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, THE, IMP))
- res0 |= TCR2_EL2_PTTWI | TCR2_EL2_PnCH;
- if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, AIE, IMP))
- res0 |= TCR2_EL2_AIE;
- if (!kvm_has_s1poe(kvm))
- res0 |= TCR2_EL2_POE | TCR2_EL2_E0POE;
- if (!kvm_has_s1pie(kvm))
- res0 |= TCR2_EL2_PIE;
- if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, VH, IMP))
- res0 |= (TCR2_EL2_E0POE | TCR2_EL2_D128 |
- TCR2_EL2_AMEC1 | TCR2_EL2_DisCH0 | TCR2_EL2_DisCH1);
+ get_reg_fixed_bits(kvm, TCR2_EL2, &res0, &res1);
set_sysreg_masks(kvm, TCR2_EL2, res0, res1);
/* SCTLR_EL1 */
- res0 = SCTLR_EL1_RES0;
- res1 = SCTLR_EL1_RES1;
- if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, PAN, PAN3))
- res0 |= SCTLR_EL1_EPAN;
+ get_reg_fixed_bits(kvm, SCTLR_EL1, &res0, &res1);
set_sysreg_masks(kvm, SCTLR_EL1, res0, res1);
+ /* SCTLR2_ELx */
+ get_reg_fixed_bits(kvm, SCTLR2_EL1, &res0, &res1);
+ set_sysreg_masks(kvm, SCTLR2_EL1, res0, res1);
+ get_reg_fixed_bits(kvm, SCTLR2_EL2, &res0, &res1);
+ set_sysreg_masks(kvm, SCTLR2_EL2, res0, res1);
+
/* MDCR_EL2 */
- res0 = MDCR_EL2_RES0;
- res1 = MDCR_EL2_RES1;
- if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, IMP))
- res0 |= (MDCR_EL2_HPMN | MDCR_EL2_TPMCR |
- MDCR_EL2_TPM | MDCR_EL2_HPME);
- if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, IMP))
- res0 |= MDCR_EL2_E2PB | MDCR_EL2_TPMS;
- if (!kvm_has_feat(kvm, ID_AA64DFR1_EL1, SPMU, IMP))
- res0 |= MDCR_EL2_EnSPM;
- if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, V3P1))
- res0 |= MDCR_EL2_HPMD;
- if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceFilt, IMP))
- res0 |= MDCR_EL2_TTRF;
- if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, V3P5))
- res0 |= MDCR_EL2_HCCD | MDCR_EL2_HLP;
- if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceBuffer, IMP))
- res0 |= MDCR_EL2_E2TB;
- if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, FGT, IMP))
- res0 |= MDCR_EL2_TDCC;
- if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, MTPMU, IMP) ||
- kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL3, IMP))
- res0 |= MDCR_EL2_MTPME;
- if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, V3P7))
- res0 |= MDCR_EL2_HPMFZO;
- if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSS, IMP))
- res0 |= MDCR_EL2_PMSSE;
- if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, V1P2))
- res0 |= MDCR_EL2_HPMFZS;
- if (!kvm_has_feat(kvm, ID_AA64DFR1_EL1, EBEP, IMP))
- res0 |= MDCR_EL2_PMEE;
- if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, DebugVer, V8P9))
- res0 |= MDCR_EL2_EBWE;
- if (!kvm_has_feat(kvm, ID_AA64DFR2_EL1, STEP, IMP))
- res0 |= MDCR_EL2_EnSTEPOP;
+ get_reg_fixed_bits(kvm, MDCR_EL2, &res0, &res1);
set_sysreg_masks(kvm, MDCR_EL2, res0, res1);
/* CNTHCTL_EL2 */
@@ -1802,3 +1753,43 @@ void check_nested_vcpu_requests(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_GUEST_HYP_IRQ_PENDING, vcpu))
kvm_inject_nested_irq(vcpu);
}
+
+/*
+ * One of the many architectural bugs in FEAT_NV2 is that the guest hypervisor
+ * can write to HCR_EL2 behind our back, potentially changing the exception
+ * routing / masking for even the host context.
+ *
+ * What follows is some slop to (1) react to exception routing / masking and (2)
+ * preserve the pending SError state across translation regimes.
+ */
+void kvm_nested_flush_hwstate(struct kvm_vcpu *vcpu)
+{
+ if (!vcpu_has_nv(vcpu))
+ return;
+
+ if (unlikely(vcpu_test_and_clear_flag(vcpu, NESTED_SERROR_PENDING)))
+ kvm_inject_serror_esr(vcpu, vcpu_get_vsesr(vcpu));
+}
+
+void kvm_nested_sync_hwstate(struct kvm_vcpu *vcpu)
+{
+ unsigned long *hcr = vcpu_hcr(vcpu);
+
+ if (!vcpu_has_nv(vcpu))
+ return;
+
+ /*
+ * We previously decided that an SError was deliverable to the guest.
+ * Reap the pending state from HCR_EL2 and...
+ */
+ if (unlikely(__test_and_clear_bit(__ffs(HCR_VSE), hcr)))
+ vcpu_set_flag(vcpu, NESTED_SERROR_PENDING);
+
+ /*
+ * Re-attempt SError injection in case the deliverability has changed,
+ * which is necessary to faithfully emulate WFI the case of a pending
+ * SError being a wakeup condition.
+ */
+ if (unlikely(vcpu_test_and_clear_flag(vcpu, NESTED_SERROR_PENDING)))
+ kvm_inject_serror_esr(vcpu, vcpu_get_vsesr(vcpu));
+}
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index c20bd6f21e60..82ffb3b3b3cf 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -108,7 +108,6 @@ static bool get_el2_to_el1_mapping(unsigned int reg,
PURE_EL2_SYSREG( HACR_EL2 );
PURE_EL2_SYSREG( VTTBR_EL2 );
PURE_EL2_SYSREG( VTCR_EL2 );
- PURE_EL2_SYSREG( RVBAR_EL2 );
PURE_EL2_SYSREG( TPIDR_EL2 );
PURE_EL2_SYSREG( HPFAR_EL2 );
PURE_EL2_SYSREG( HCRX_EL2 );
@@ -144,6 +143,7 @@ static bool get_el2_to_el1_mapping(unsigned int reg,
MAPPED_EL2_SYSREG(SPSR_EL2, SPSR_EL1, NULL );
MAPPED_EL2_SYSREG(ZCR_EL2, ZCR_EL1, NULL );
MAPPED_EL2_SYSREG(CONTEXTIDR_EL2, CONTEXTIDR_EL1, NULL );
+ MAPPED_EL2_SYSREG(SCTLR2_EL2, SCTLR2_EL1, NULL );
default:
return false;
}
@@ -533,8 +533,7 @@ static bool access_gic_sre(struct kvm_vcpu *vcpu,
return ignore_write(vcpu, p);
if (p->Op1 == 4) { /* ICC_SRE_EL2 */
- p->regval = (ICC_SRE_EL2_ENABLE | ICC_SRE_EL2_SRE |
- ICC_SRE_EL1_DIB | ICC_SRE_EL1_DFB);
+ p->regval = KVM_ICC_SRE_EL2;
} else { /* ICC_SRE_EL1 */
p->regval = vcpu->arch.vgic_cpu.vgic_v3.vgic_sre;
}
@@ -773,6 +772,12 @@ static u64 reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
return mpidr;
}
+static unsigned int hidden_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *r)
+{
+ return REG_HIDDEN;
+}
+
static unsigned int pmu_visibility(const struct kvm_vcpu *vcpu,
const struct sys_reg_desc *r)
{
@@ -1612,13 +1617,14 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_GCS);
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_THE);
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTEX);
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_DF2);
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_PFAR);
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MPAM_frac);
break;
case SYS_ID_AA64PFR2_EL1:
- /* We only expose FPMR */
- val &= ID_AA64PFR2_EL1_FPMR;
+ val &= ID_AA64PFR2_EL1_FPMR |
+ (kvm_has_mte(vcpu->kvm) ?
+ ID_AA64PFR2_EL1_MTEFAR | ID_AA64PFR2_EL1_MTESTOREONLY :
+ 0);
break;
case SYS_ID_AA64ISAR1_EL1:
if (!vcpu_has_ptrauth(vcpu))
@@ -1643,8 +1649,10 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
val &= ~ID_AA64MMFR2_EL1_NV;
break;
case SYS_ID_AA64MMFR3_EL1:
- val &= ID_AA64MMFR3_EL1_TCRX | ID_AA64MMFR3_EL1_S1POE |
- ID_AA64MMFR3_EL1_S1PIE;
+ val &= ID_AA64MMFR3_EL1_TCRX |
+ ID_AA64MMFR3_EL1_SCTLRX |
+ ID_AA64MMFR3_EL1_S1POE |
+ ID_AA64MMFR3_EL1_S1PIE;
break;
case SYS_ID_MMFR4_EL1:
val &= ~ARM64_FEATURE_MASK(ID_MMFR4_EL1_CCIDX);
@@ -1811,7 +1819,7 @@ static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val)
val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, CSV3, IMP);
}
- if (kvm_vgic_global_state.type == VGIC_V3) {
+ if (vgic_is_v3(vcpu->kvm)) {
val &= ~ID_AA64PFR0_EL1_GIC_MASK;
val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, GIC, IMP);
}
@@ -1953,6 +1961,14 @@ static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu,
(vcpu_has_nv(vcpu) && !FIELD_GET(ID_AA64PFR0_EL1_EL2, user_val)))
return -EINVAL;
+ /*
+ * If we are running on a GICv5 host and support FEAT_GCIE_LEGACY, then
+ * we support GICv3. Fail attempts to do anything but set that to IMP.
+ */
+ if (vgic_is_v3_compat(vcpu->kvm) &&
+ FIELD_GET(ID_AA64PFR0_EL1_GIC_MASK, user_val) != ID_AA64PFR0_EL1_GIC_IMP)
+ return -EINVAL;
+
return set_id_reg(vcpu, rd, user_val);
}
@@ -2325,6 +2341,10 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu,
EL2_REG_FILTERED(name, acc, rst, v, el2_visibility)
#define EL2_REG_VNCR(name, rst, v) EL2_REG(name, bad_vncr_trap, rst, v)
+#define EL2_REG_VNCR_FILT(name, vis) \
+ EL2_REG_FILTERED(name, bad_vncr_trap, reset_val, 0, vis)
+#define EL2_REG_VNCR_GICv3(name) \
+ EL2_REG_VNCR_FILT(name, hidden_visibility)
#define EL2_REG_REDIR(name, rst, v) EL2_REG(name, bad_redir_trap, rst, v)
/*
@@ -2483,6 +2503,21 @@ static unsigned int vncr_el2_visibility(const struct kvm_vcpu *vcpu,
return REG_HIDDEN;
}
+static unsigned int sctlr2_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ if (kvm_has_sctlr2(vcpu->kvm))
+ return 0;
+
+ return REG_HIDDEN;
+}
+
+static unsigned int sctlr2_el2_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ return __el2_visibility(vcpu, rd, sctlr2_visibility);
+}
+
static bool access_zcr_el2(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *r)
@@ -2513,11 +2548,7 @@ static bool access_gic_vtr(struct kvm_vcpu *vcpu,
if (p->is_write)
return write_to_read_only(vcpu, p, r);
- p->regval = kvm_vgic_global_state.ich_vtr_el2;
- p->regval &= ~(ICH_VTR_EL2_DVIM |
- ICH_VTR_EL2_A3V |
- ICH_VTR_EL2_IDbits);
- p->regval |= ICH_VTR_EL2_nV4;
+ p->regval = kvm_get_guest_vtr_el2();
return true;
}
@@ -2588,6 +2619,26 @@ static unsigned int tcr2_el2_visibility(const struct kvm_vcpu *vcpu,
return __el2_visibility(vcpu, rd, tcr2_visibility);
}
+static unsigned int fgt2_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ if (el2_visibility(vcpu, rd) == 0 &&
+ kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, FGT, FGT2))
+ return 0;
+
+ return REG_HIDDEN;
+}
+
+static unsigned int fgt_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ if (el2_visibility(vcpu, rd) == 0 &&
+ kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, FGT, IMP))
+ return 0;
+
+ return REG_HIDDEN;
+}
+
static unsigned int s1pie_visibility(const struct kvm_vcpu *vcpu,
const struct sys_reg_desc *rd)
{
@@ -2639,6 +2690,23 @@ static bool access_mdcr(struct kvm_vcpu *vcpu,
return true;
}
+static bool access_ras(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ switch(reg_to_encoding(r)) {
+ default:
+ if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, IMP)) {
+ kvm_inject_undefined(vcpu);
+ return false;
+ }
+ }
+
+ return trap_raz_wi(vcpu, p, r);
+}
+
/*
* For historical (ahem ABI) reasons, KVM treated MIDR_EL1, REVIDR_EL1, and
* AIDR_EL1 as "invariant" registers, meaning userspace cannot change them.
@@ -2866,7 +2934,6 @@ static const struct sys_reg_desc sys_reg_descs[] = {
ID_AA64PFR0_EL1_FP)),
ID_FILTERED(ID_AA64PFR1_EL1, id_aa64pfr1_el1,
~(ID_AA64PFR1_EL1_PFAR |
- ID_AA64PFR1_EL1_DF2 |
ID_AA64PFR1_EL1_MTEX |
ID_AA64PFR1_EL1_THE |
ID_AA64PFR1_EL1_GCS |
@@ -2878,7 +2945,10 @@ static const struct sys_reg_desc sys_reg_descs[] = {
ID_AA64PFR1_EL1_MPAM_frac |
ID_AA64PFR1_EL1_RAS_frac |
ID_AA64PFR1_EL1_MTE)),
- ID_WRITABLE(ID_AA64PFR2_EL1, ID_AA64PFR2_EL1_FPMR),
+ ID_WRITABLE(ID_AA64PFR2_EL1,
+ ID_AA64PFR2_EL1_FPMR |
+ ID_AA64PFR2_EL1_MTEFAR |
+ ID_AA64PFR2_EL1_MTESTOREONLY),
ID_UNALLOCATED(4,3),
ID_WRITABLE(ID_AA64ZFR0_EL1, ~ID_AA64ZFR0_EL1_RES0),
ID_HIDDEN(ID_AA64SMFR0_EL1),
@@ -2945,6 +3015,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
ID_AA64MMFR2_EL1_NV |
ID_AA64MMFR2_EL1_CCIDX)),
ID_WRITABLE(ID_AA64MMFR3_EL1, (ID_AA64MMFR3_EL1_TCRX |
+ ID_AA64MMFR3_EL1_SCTLRX |
ID_AA64MMFR3_EL1_S1PIE |
ID_AA64MMFR3_EL1_S1POE)),
ID_WRITABLE(ID_AA64MMFR4_EL1, ID_AA64MMFR4_EL1_NV_frac),
@@ -2955,6 +3026,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_SCTLR_EL1), access_vm_reg, reset_val, SCTLR_EL1, 0x00C50078 },
{ SYS_DESC(SYS_ACTLR_EL1), access_actlr, reset_actlr, ACTLR_EL1 },
{ SYS_DESC(SYS_CPACR_EL1), NULL, reset_val, CPACR_EL1, 0 },
+ { SYS_DESC(SYS_SCTLR2_EL1), access_vm_reg, reset_val, SCTLR2_EL1, 0,
+ .visibility = sctlr2_visibility },
MTE_REG(RGSR_EL1),
MTE_REG(GCR_EL1),
@@ -2984,14 +3057,14 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_AFSR1_EL1), access_vm_reg, reset_unknown, AFSR1_EL1 },
{ SYS_DESC(SYS_ESR_EL1), access_vm_reg, reset_unknown, ESR_EL1 },
- { SYS_DESC(SYS_ERRIDR_EL1), trap_raz_wi },
- { SYS_DESC(SYS_ERRSELR_EL1), trap_raz_wi },
- { SYS_DESC(SYS_ERXFR_EL1), trap_raz_wi },
- { SYS_DESC(SYS_ERXCTLR_EL1), trap_raz_wi },
- { SYS_DESC(SYS_ERXSTATUS_EL1), trap_raz_wi },
- { SYS_DESC(SYS_ERXADDR_EL1), trap_raz_wi },
- { SYS_DESC(SYS_ERXMISC0_EL1), trap_raz_wi },
- { SYS_DESC(SYS_ERXMISC1_EL1), trap_raz_wi },
+ { SYS_DESC(SYS_ERRIDR_EL1), access_ras },
+ { SYS_DESC(SYS_ERRSELR_EL1), access_ras },
+ { SYS_DESC(SYS_ERXFR_EL1), access_ras },
+ { SYS_DESC(SYS_ERXCTLR_EL1), access_ras },
+ { SYS_DESC(SYS_ERXSTATUS_EL1), access_ras },
+ { SYS_DESC(SYS_ERXADDR_EL1), access_ras },
+ { SYS_DESC(SYS_ERXMISC0_EL1), access_ras },
+ { SYS_DESC(SYS_ERXMISC1_EL1), access_ras },
MTE_REG(TFSR_EL1),
MTE_REG(TFSRE0_EL1),
@@ -3302,12 +3375,14 @@ static const struct sys_reg_desc sys_reg_descs[] = {
EL2_REG_VNCR(VMPIDR_EL2, reset_unknown, 0),
EL2_REG(SCTLR_EL2, access_rw, reset_val, SCTLR_EL2_RES1),
EL2_REG(ACTLR_EL2, access_rw, reset_val, 0),
+ EL2_REG_FILTERED(SCTLR2_EL2, access_vm_reg, reset_val, 0,
+ sctlr2_el2_visibility),
EL2_REG_VNCR(HCR_EL2, reset_hcr, 0),
EL2_REG(MDCR_EL2, access_mdcr, reset_mdcr, 0),
EL2_REG(CPTR_EL2, access_rw, reset_val, CPTR_NVHE_EL2_RES1),
EL2_REG_VNCR(HSTR_EL2, reset_val, 0),
- EL2_REG_VNCR(HFGRTR_EL2, reset_val, 0),
- EL2_REG_VNCR(HFGWTR_EL2, reset_val, 0),
+ EL2_REG_VNCR_FILT(HFGRTR_EL2, fgt_visibility),
+ EL2_REG_VNCR_FILT(HFGWTR_EL2, fgt_visibility),
EL2_REG_VNCR(HFGITR_EL2, reset_val, 0),
EL2_REG_VNCR(HACR_EL2, reset_val, 0),
@@ -3327,9 +3402,14 @@ static const struct sys_reg_desc sys_reg_descs[] = {
vncr_el2_visibility),
{ SYS_DESC(SYS_DACR32_EL2), undef_access, reset_unknown, DACR32_EL2 },
- EL2_REG_VNCR(HDFGRTR_EL2, reset_val, 0),
- EL2_REG_VNCR(HDFGWTR_EL2, reset_val, 0),
- EL2_REG_VNCR(HAFGRTR_EL2, reset_val, 0),
+ EL2_REG_VNCR_FILT(HDFGRTR2_EL2, fgt2_visibility),
+ EL2_REG_VNCR_FILT(HDFGWTR2_EL2, fgt2_visibility),
+ EL2_REG_VNCR_FILT(HFGRTR2_EL2, fgt2_visibility),
+ EL2_REG_VNCR_FILT(HFGWTR2_EL2, fgt2_visibility),
+ EL2_REG_VNCR_FILT(HDFGRTR_EL2, fgt_visibility),
+ EL2_REG_VNCR_FILT(HDFGWTR_EL2, fgt_visibility),
+ EL2_REG_VNCR_FILT(HAFGRTR_EL2, fgt_visibility),
+ EL2_REG_VNCR_FILT(HFGITR2_EL2, fgt2_visibility),
EL2_REG_REDIR(SPSR_EL2, reset_val, 0),
EL2_REG_REDIR(ELR_EL2, reset_val, 0),
{ SYS_DESC(SYS_SP_EL1), access_sp_el1},
@@ -3344,6 +3424,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
EL2_REG(AFSR0_EL2, access_rw, reset_val, 0),
EL2_REG(AFSR1_EL2, access_rw, reset_val, 0),
EL2_REG_REDIR(ESR_EL2, reset_val, 0),
+ EL2_REG_VNCR(VSESR_EL2, reset_unknown, 0),
{ SYS_DESC(SYS_FPEXC32_EL2), undef_access, reset_val, FPEXC32_EL2, 0x700 },
EL2_REG_REDIR(FAR_EL2, reset_val, 0),
@@ -3370,43 +3451,44 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_MPAMVPM7_EL2), undef_access },
EL2_REG(VBAR_EL2, access_rw, reset_val, 0),
- EL2_REG(RVBAR_EL2, access_rw, reset_val, 0),
+ { SYS_DESC(SYS_RVBAR_EL2), undef_access },
{ SYS_DESC(SYS_RMR_EL2), undef_access },
+ EL2_REG_VNCR(VDISR_EL2, reset_unknown, 0),
- EL2_REG_VNCR(ICH_AP0R0_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_AP0R1_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_AP0R2_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_AP0R3_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_AP1R0_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_AP1R1_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_AP1R2_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_AP1R3_EL2, reset_val, 0),
+ EL2_REG_VNCR_GICv3(ICH_AP0R0_EL2),
+ EL2_REG_VNCR_GICv3(ICH_AP0R1_EL2),
+ EL2_REG_VNCR_GICv3(ICH_AP0R2_EL2),
+ EL2_REG_VNCR_GICv3(ICH_AP0R3_EL2),
+ EL2_REG_VNCR_GICv3(ICH_AP1R0_EL2),
+ EL2_REG_VNCR_GICv3(ICH_AP1R1_EL2),
+ EL2_REG_VNCR_GICv3(ICH_AP1R2_EL2),
+ EL2_REG_VNCR_GICv3(ICH_AP1R3_EL2),
{ SYS_DESC(SYS_ICC_SRE_EL2), access_gic_sre },
- EL2_REG_VNCR(ICH_HCR_EL2, reset_val, 0),
+ EL2_REG_VNCR_GICv3(ICH_HCR_EL2),
{ SYS_DESC(SYS_ICH_VTR_EL2), access_gic_vtr },
{ SYS_DESC(SYS_ICH_MISR_EL2), access_gic_misr },
{ SYS_DESC(SYS_ICH_EISR_EL2), access_gic_eisr },
{ SYS_DESC(SYS_ICH_ELRSR_EL2), access_gic_elrsr },
- EL2_REG_VNCR(ICH_VMCR_EL2, reset_val, 0),
-
- EL2_REG_VNCR(ICH_LR0_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_LR1_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_LR2_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_LR3_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_LR4_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_LR5_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_LR6_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_LR7_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_LR8_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_LR9_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_LR10_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_LR11_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_LR12_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_LR13_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_LR14_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_LR15_EL2, reset_val, 0),
+ EL2_REG_VNCR_GICv3(ICH_VMCR_EL2),
+
+ EL2_REG_VNCR_GICv3(ICH_LR0_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR1_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR2_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR3_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR4_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR5_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR6_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR7_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR8_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR9_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR10_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR11_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR12_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR13_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR14_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR15_EL2),
EL2_REG(CONTEXTIDR_EL2, access_rw, reset_val, 0),
EL2_REG(TPIDR_EL2, access_rw, reset_val, 0),
@@ -4275,12 +4357,12 @@ static const struct sys_reg_desc cp15_64_regs[] = {
};
static bool check_sysreg_table(const struct sys_reg_desc *table, unsigned int n,
- bool is_32)
+ bool reset_check)
{
unsigned int i;
for (i = 0; i < n; i++) {
- if (!is_32 && table[i].reg && !table[i].reset) {
+ if (reset_check && table[i].reg && !table[i].reset) {
kvm_err("sys_reg table %pS entry %d (%s) lacks reset\n",
&table[i], i, table[i].name);
return false;
@@ -4475,7 +4557,7 @@ static bool kvm_esr_cp10_id_to_sys64(u64 esr, struct sys_reg_params *params)
return true;
kvm_pr_unimpl("Unhandled cp10 register %s: %u\n",
- params->is_write ? "write" : "read", reg_id);
+ str_write_read(params->is_write), reg_id);
return false;
}
@@ -5269,18 +5351,22 @@ int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu)
int __init kvm_sys_reg_table_init(void)
{
+ const struct sys_reg_desc *gicv3_regs;
bool valid = true;
- unsigned int i;
+ unsigned int i, sz;
int ret = 0;
/* Make sure tables are unique and in order. */
- valid &= check_sysreg_table(sys_reg_descs, ARRAY_SIZE(sys_reg_descs), false);
- valid &= check_sysreg_table(cp14_regs, ARRAY_SIZE(cp14_regs), true);
- valid &= check_sysreg_table(cp14_64_regs, ARRAY_SIZE(cp14_64_regs), true);
- valid &= check_sysreg_table(cp15_regs, ARRAY_SIZE(cp15_regs), true);
- valid &= check_sysreg_table(cp15_64_regs, ARRAY_SIZE(cp15_64_regs), true);
+ valid &= check_sysreg_table(sys_reg_descs, ARRAY_SIZE(sys_reg_descs), true);
+ valid &= check_sysreg_table(cp14_regs, ARRAY_SIZE(cp14_regs), false);
+ valid &= check_sysreg_table(cp14_64_regs, ARRAY_SIZE(cp14_64_regs), false);
+ valid &= check_sysreg_table(cp15_regs, ARRAY_SIZE(cp15_regs), false);
+ valid &= check_sysreg_table(cp15_64_regs, ARRAY_SIZE(cp15_64_regs), false);
valid &= check_sysreg_table(sys_insn_descs, ARRAY_SIZE(sys_insn_descs), false);
+ gicv3_regs = vgic_v3_get_sysreg_table(&sz);
+ valid &= check_sysreg_table(gicv3_regs, sz, false);
+
if (!valid)
return -EINVAL;
diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h
index ef97d9fc67cc..317abc490368 100644
--- a/arch/arm64/kvm/sys_regs.h
+++ b/arch/arm64/kvm/sys_regs.h
@@ -108,7 +108,7 @@ inline void print_sys_reg_msg(const struct sys_reg_params *p,
/* Look, we even formatted it for you to paste into the table! */
kvm_pr_unimpl("%pV { Op0(%2u), Op1(%2u), CRn(%2u), CRm(%2u), Op2(%2u), func_%s },\n",
&(struct va_format){ fmt, &va },
- p->Op0, p->Op1, p->CRn, p->CRm, p->Op2, p->is_write ? "write" : "read");
+ p->Op0, p->Op1, p->CRn, p->CRm, p->Op2, str_write_read(p->is_write));
va_end(va);
}
diff --git a/arch/arm64/kvm/trace_handle_exit.h b/arch/arm64/kvm/trace_handle_exit.h
index f85415db7713..a7ab9a3bbed0 100644
--- a/arch/arm64/kvm/trace_handle_exit.h
+++ b/arch/arm64/kvm/trace_handle_exit.h
@@ -113,7 +113,7 @@ TRACE_EVENT(kvm_sys_access,
__entry->vcpu_pc, __entry->name ?: "UNKN",
__entry->Op0, __entry->Op1, __entry->CRn,
__entry->CRm, __entry->Op2,
- __entry->is_write ? "write" : "read")
+ str_write_read(__entry->is_write))
);
TRACE_EVENT(kvm_set_guest_debug,
diff --git a/arch/arm64/kvm/vgic-sys-reg-v3.c b/arch/arm64/kvm/vgic-sys-reg-v3.c
index 5eacb4b3250a..bdc2d57370b2 100644
--- a/arch/arm64/kvm/vgic-sys-reg-v3.c
+++ b/arch/arm64/kvm/vgic-sys-reg-v3.c
@@ -297,6 +297,91 @@ static int get_gic_sre(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
return 0;
}
+static int set_gic_ich_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+ u64 val)
+{
+ __vcpu_assign_sys_reg(vcpu, r->reg, val);
+ return 0;
+}
+
+static int get_gic_ich_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+ u64 *val)
+{
+ *val = __vcpu_sys_reg(vcpu, r->reg);
+ return 0;
+}
+
+static int set_gic_ich_apr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+ u64 val)
+{
+ u8 idx = r->Op2 & 3;
+
+ if (idx > vgic_v3_max_apr_idx(vcpu))
+ return -EINVAL;
+
+ return set_gic_ich_reg(vcpu, r, val);
+}
+
+static int get_gic_ich_apr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+ u64 *val)
+{
+ u8 idx = r->Op2 & 3;
+
+ if (idx > vgic_v3_max_apr_idx(vcpu))
+ return -EINVAL;
+
+ return get_gic_ich_reg(vcpu, r, val);
+}
+
+static int set_gic_icc_sre(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+ u64 val)
+{
+ if (val != KVM_ICC_SRE_EL2)
+ return -EINVAL;
+ return 0;
+}
+
+static int get_gic_icc_sre(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+ u64 *val)
+{
+ *val = KVM_ICC_SRE_EL2;
+ return 0;
+}
+
+static int set_gic_ich_vtr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+ u64 val)
+{
+ if (val != kvm_get_guest_vtr_el2())
+ return -EINVAL;
+ return 0;
+}
+
+static int get_gic_ich_vtr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+ u64 *val)
+{
+ *val = kvm_get_guest_vtr_el2();
+ return 0;
+}
+
+static unsigned int el2_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ return vcpu_has_nv(vcpu) ? 0 : REG_HIDDEN;
+}
+
+#define __EL2_REG(r, acc, i) \
+ { \
+ SYS_DESC(SYS_ ## r), \
+ .get_user = get_gic_ ## acc, \
+ .set_user = set_gic_ ## acc, \
+ .reg = i, \
+ .visibility = el2_visibility, \
+ }
+
+#define EL2_REG(r, acc) __EL2_REG(r, acc, r)
+
+#define EL2_REG_RO(r, acc) __EL2_REG(r, acc, 0)
+
static const struct sys_reg_desc gic_v3_icc_reg_descs[] = {
{ SYS_DESC(SYS_ICC_PMR_EL1),
.set_user = set_gic_pmr, .get_user = get_gic_pmr, },
@@ -328,8 +413,42 @@ static const struct sys_reg_desc gic_v3_icc_reg_descs[] = {
.set_user = set_gic_grpen0, .get_user = get_gic_grpen0, },
{ SYS_DESC(SYS_ICC_IGRPEN1_EL1),
.set_user = set_gic_grpen1, .get_user = get_gic_grpen1, },
+ EL2_REG(ICH_AP0R0_EL2, ich_apr),
+ EL2_REG(ICH_AP0R1_EL2, ich_apr),
+ EL2_REG(ICH_AP0R2_EL2, ich_apr),
+ EL2_REG(ICH_AP0R3_EL2, ich_apr),
+ EL2_REG(ICH_AP1R0_EL2, ich_apr),
+ EL2_REG(ICH_AP1R1_EL2, ich_apr),
+ EL2_REG(ICH_AP1R2_EL2, ich_apr),
+ EL2_REG(ICH_AP1R3_EL2, ich_apr),
+ EL2_REG_RO(ICC_SRE_EL2, icc_sre),
+ EL2_REG(ICH_HCR_EL2, ich_reg),
+ EL2_REG_RO(ICH_VTR_EL2, ich_vtr),
+ EL2_REG(ICH_VMCR_EL2, ich_reg),
+ EL2_REG(ICH_LR0_EL2, ich_reg),
+ EL2_REG(ICH_LR1_EL2, ich_reg),
+ EL2_REG(ICH_LR2_EL2, ich_reg),
+ EL2_REG(ICH_LR3_EL2, ich_reg),
+ EL2_REG(ICH_LR4_EL2, ich_reg),
+ EL2_REG(ICH_LR5_EL2, ich_reg),
+ EL2_REG(ICH_LR6_EL2, ich_reg),
+ EL2_REG(ICH_LR7_EL2, ich_reg),
+ EL2_REG(ICH_LR8_EL2, ich_reg),
+ EL2_REG(ICH_LR9_EL2, ich_reg),
+ EL2_REG(ICH_LR10_EL2, ich_reg),
+ EL2_REG(ICH_LR11_EL2, ich_reg),
+ EL2_REG(ICH_LR12_EL2, ich_reg),
+ EL2_REG(ICH_LR13_EL2, ich_reg),
+ EL2_REG(ICH_LR14_EL2, ich_reg),
+ EL2_REG(ICH_LR15_EL2, ich_reg),
};
+const struct sys_reg_desc *vgic_v3_get_sysreg_table(unsigned int *sz)
+{
+ *sz = ARRAY_SIZE(gic_v3_icc_reg_descs);
+ return gic_v3_icc_reg_descs;
+}
+
static u64 attr_to_id(u64 attr)
{
return ARM64_SYS_REG(FIELD_GET(KVM_REG_ARM_VGIC_SYSREG_OP0_MASK, attr),
@@ -341,8 +460,12 @@ static u64 attr_to_id(u64 attr)
int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
{
- if (get_reg_by_id(attr_to_id(attr->attr), gic_v3_icc_reg_descs,
- ARRAY_SIZE(gic_v3_icc_reg_descs)))
+ const struct sys_reg_desc *r;
+
+ r = get_reg_by_id(attr_to_id(attr->attr), gic_v3_icc_reg_descs,
+ ARRAY_SIZE(gic_v3_icc_reg_descs));
+
+ if (r && !sysreg_hidden(vcpu, r))
return 0;
return -ENXIO;
diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index eb1205654ac8..1e680ad6e863 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -157,6 +157,7 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
kvm->arch.vgic.in_kernel = true;
kvm->arch.vgic.vgic_model = type;
+ kvm->arch.vgic.implementation_rev = KVM_VGIC_IMP_REV_LATEST;
kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
@@ -165,6 +166,9 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
else
INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions);
+ if (type == KVM_DEV_TYPE_ARM_VGIC_V3)
+ kvm->arch.vgic.nassgicap = system_supports_direct_sgis();
+
out_unlock:
mutex_unlock(&kvm->arch.config_lock);
kvm_unlock_all_vcpus(kvm);
@@ -391,11 +395,10 @@ int vgic_init(struct kvm *kvm)
goto out;
/*
- * If we have GICv4.1 enabled, unconditionally request enable the
- * v4 support so that we get HW-accelerated vSGIs. Otherwise, only
- * enable it if we present a virtual ITS to the guest.
+ * Ensure vPEs are allocated if direct IRQ injection (e.g. vSGIs,
+ * vLPIs) is supported.
*/
- if (vgic_supports_direct_msis(kvm)) {
+ if (vgic_supports_direct_irqs(kvm)) {
ret = vgic_v4_init(kvm);
if (ret)
goto out;
@@ -409,15 +412,7 @@ int vgic_init(struct kvm *kvm)
goto out;
vgic_debug_init(kvm);
-
- /*
- * If userspace didn't set the GIC implementation revision,
- * default to the latest and greatest. You know want it.
- */
- if (!dist->implementation_rev)
- dist->implementation_rev = KVM_VGIC_IMP_REV_LATEST;
dist->initialized = true;
-
out:
return ret;
}
@@ -443,7 +438,7 @@ static void kvm_vgic_dist_destroy(struct kvm *kvm)
dist->vgic_cpu_base = VGIC_ADDR_UNDEF;
}
- if (vgic_supports_direct_msis(kvm))
+ if (vgic_supports_direct_irqs(kvm))
vgic_v4_teardown(kvm);
xa_destroy(&dist->lpi_xa);
@@ -674,10 +669,12 @@ void kvm_vgic_init_cpu_hardware(void)
* We want to make sure the list registers start out clear so that we
* only have the program the used registers.
*/
- if (kvm_vgic_global_state.type == VGIC_V2)
+ if (kvm_vgic_global_state.type == VGIC_V2) {
vgic_v2_init_lrs();
- else
+ } else if (kvm_vgic_global_state.type == VGIC_V3 ||
+ kvm_vgic_global_state.has_gcie_v3_compat) {
kvm_call_hyp(__vgic_v3_init_lrs);
+ }
}
/**
@@ -722,6 +719,9 @@ int kvm_vgic_hyp_init(void)
kvm_info("GIC system register CPU interface enabled\n");
}
break;
+ case GIC_V5:
+ ret = vgic_v5_probe(gic_kvm_info);
+ break;
default:
ret = -ENODEV;
}
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index 534049c7c94b..7368c13f16b7 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -758,7 +758,7 @@ static void its_free_ite(struct kvm *kvm, struct its_ite *ite)
if (irq) {
scoped_guard(raw_spinlock_irqsave, &irq->irq_lock) {
if (irq->hw)
- WARN_ON(its_unmap_vlpi(ite->irq->host_irq));
+ its_unmap_vlpi(ite->irq->host_irq);
irq->hw = false;
}
@@ -2694,6 +2694,9 @@ static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr)
case KVM_DEV_ARM_ITS_RESTORE_TABLES:
ret = abi->restore_tables(its);
break;
+ default:
+ ret = -ENXIO;
+ break;
}
mutex_unlock(&its->its_lock);
diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c
index f9ae790163fb..3d1a776b716d 100644
--- a/arch/arm64/kvm/vgic/vgic-kvm-device.c
+++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c
@@ -5,6 +5,7 @@
* Copyright (C) 2015 ARM Ltd.
* Author: Marc Zyngier <marc.zyngier@arm.com>
*/
+#include <linux/irqchip/arm-gic-v3.h>
#include <linux/kvm_host.h>
#include <kvm/arm_vgic.h>
#include <linux/uaccess.h>
@@ -303,12 +304,6 @@ static int vgic_get_common_attr(struct kvm_device *dev,
VGIC_NR_PRIVATE_IRQS, uaddr);
break;
}
- case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: {
- u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-
- r = put_user(dev->kvm->arch.vgic.mi_intid, uaddr);
- break;
- }
}
return r;
@@ -510,6 +505,24 @@ int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
}
/*
+ * Allow access to certain ID-like registers prior to VGIC initialization,
+ * thereby allowing the VMM to provision the features / sizing of the VGIC.
+ */
+static bool reg_allowed_pre_init(struct kvm_device_attr *attr)
+{
+ if (attr->group != KVM_DEV_ARM_VGIC_GRP_DIST_REGS)
+ return false;
+
+ switch (attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK) {
+ case GICD_IIDR:
+ case GICD_TYPER2:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/*
* vgic_v3_attr_regs_access - allows user space to access VGIC v3 state
*
* @dev: kvm device handle
@@ -523,7 +536,7 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev,
struct vgic_reg_attr reg_attr;
gpa_t addr;
struct kvm_vcpu *vcpu;
- bool uaccess, post_init = true;
+ bool uaccess;
u32 val;
int ret;
@@ -539,9 +552,6 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev,
/* Sysregs uaccess is performed by the sysreg handling code */
uaccess = false;
break;
- case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ:
- post_init = false;
- fallthrough;
default:
uaccess = true;
}
@@ -561,7 +571,7 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev,
mutex_lock(&dev->kvm->arch.config_lock);
- if (post_init != vgic_initialized(dev->kvm)) {
+ if (!(vgic_initialized(dev->kvm) || reg_allowed_pre_init(attr))) {
ret = -EBUSY;
goto out;
}
@@ -591,19 +601,6 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev,
}
break;
}
- case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ:
- if (!is_write) {
- val = dev->kvm->arch.vgic.mi_intid;
- ret = 0;
- break;
- }
-
- ret = -EINVAL;
- if ((val < VGIC_NR_PRIVATE_IRQS) && (val >= VGIC_NR_SGIS)) {
- dev->kvm->arch.vgic.mi_intid = val;
- ret = 0;
- }
- break;
default:
ret = -EINVAL;
break;
@@ -630,8 +627,24 @@ static int vgic_v3_set_attr(struct kvm_device *dev,
case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:
case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS:
case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO:
- case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ:
return vgic_v3_attr_regs_access(dev, attr, true);
+ case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: {
+ u32 __user *uaddr = (u32 __user *)attr->addr;
+ u32 val;
+
+ if (get_user(val, uaddr))
+ return -EFAULT;
+
+ guard(mutex)(&dev->kvm->arch.config_lock);
+ if (vgic_initialized(dev->kvm))
+ return -EBUSY;
+
+ if (!irq_is_ppi(val))
+ return -EINVAL;
+
+ dev->kvm->arch.vgic.mi_intid = val;
+ return 0;
+ }
default:
return vgic_set_common_attr(dev, attr);
}
@@ -645,8 +658,13 @@ static int vgic_v3_get_attr(struct kvm_device *dev,
case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:
case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS:
case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO:
- case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ:
return vgic_v3_attr_regs_access(dev, attr, false);
+ case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: {
+ u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+
+ guard(mutex)(&dev->kvm->arch.config_lock);
+ return put_user(dev->kvm->arch.vgic.mi_intid, uaddr);
+ }
default:
return vgic_get_common_attr(dev, attr);
}
diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
index ae4c0593d114..a3ef185209e9 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
@@ -50,8 +50,17 @@ bool vgic_has_its(struct kvm *kvm)
bool vgic_supports_direct_msis(struct kvm *kvm)
{
- return (kvm_vgic_global_state.has_gicv4_1 ||
- (kvm_vgic_global_state.has_gicv4 && vgic_has_its(kvm)));
+ return kvm_vgic_global_state.has_gicv4 && vgic_has_its(kvm);
+}
+
+bool system_supports_direct_sgis(void)
+{
+ return kvm_vgic_global_state.has_gicv4_1 && gic_cpuif_has_vsgi();
+}
+
+bool vgic_supports_direct_sgis(struct kvm *kvm)
+{
+ return kvm->arch.vgic.nassgicap;
}
/*
@@ -86,7 +95,7 @@ static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu,
}
break;
case GICD_TYPER2:
- if (kvm_vgic_global_state.has_gicv4_1 && gic_cpuif_has_vsgi())
+ if (vgic_supports_direct_sgis(vcpu->kvm))
value = GICD_TYPER2_nASSGIcap;
break;
case GICD_IIDR:
@@ -119,7 +128,7 @@ static void vgic_mmio_write_v3_misc(struct kvm_vcpu *vcpu,
dist->enabled = val & GICD_CTLR_ENABLE_SS_G1;
/* Not a GICv4.1? No HW SGIs */
- if (!kvm_vgic_global_state.has_gicv4_1 || !gic_cpuif_has_vsgi())
+ if (!vgic_supports_direct_sgis(vcpu->kvm))
val &= ~GICD_CTLR_nASSGIreq;
/* Dist stays enabled? nASSGIreq is RO */
@@ -133,7 +142,7 @@ static void vgic_mmio_write_v3_misc(struct kvm_vcpu *vcpu,
if (is_hwsgi != dist->nassgireq)
vgic_v4_configure_vsgis(vcpu->kvm);
- if (kvm_vgic_global_state.has_gicv4_1 &&
+ if (vgic_supports_direct_sgis(vcpu->kvm) &&
was_enabled != dist->enabled)
kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_RELOAD_GICv4);
else if (!was_enabled && dist->enabled)
@@ -159,8 +168,18 @@ static int vgic_mmio_uaccess_write_v3_misc(struct kvm_vcpu *vcpu,
switch (addr & 0x0c) {
case GICD_TYPER2:
- if (val != vgic_mmio_read_v3_misc(vcpu, addr, len))
+ reg = vgic_mmio_read_v3_misc(vcpu, addr, len);
+
+ if (reg == val)
+ return 0;
+ if (vgic_initialized(vcpu->kvm))
+ return -EBUSY;
+ if ((reg ^ val) & ~GICD_TYPER2_nASSGIcap)
return -EINVAL;
+ if (!system_supports_direct_sgis() && val)
+ return -EINVAL;
+
+ dist->nassgicap = val & GICD_TYPER2_nASSGIcap;
return 0;
case GICD_IIDR:
reg = vgic_mmio_read_v3_misc(vcpu, addr, len);
@@ -178,7 +197,7 @@ static int vgic_mmio_uaccess_write_v3_misc(struct kvm_vcpu *vcpu,
}
case GICD_CTLR:
/* Not a GICv4.1? No HW SGIs */
- if (!kvm_vgic_global_state.has_gicv4_1)
+ if (!vgic_supports_direct_sgis(vcpu->kvm))
val &= ~GICD_CTLR_nASSGIreq;
dist->enabled = val & GICD_CTLR_ENABLE_SS_G1;
diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c
index 679aafe77de2..7f1259b49c50 100644
--- a/arch/arm64/kvm/vgic/vgic-v3-nested.c
+++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c
@@ -116,7 +116,7 @@ bool vgic_state_is_nested(struct kvm_vcpu *vcpu)
{
u64 xmo;
- if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) {
+ if (is_nested_ctxt(vcpu)) {
xmo = __vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_IMO | HCR_FMO);
WARN_ONCE(xmo && xmo != (HCR_IMO | HCR_FMO),
"Separate virtual IRQ/FIQ settings not supported\n");
diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
index 193946108192..4d9343d2b0b1 100644
--- a/arch/arm64/kvm/vgic/vgic-v4.c
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -356,7 +356,7 @@ int vgic_v4_put(struct kvm_vcpu *vcpu)
{
struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
- if (!vgic_supports_direct_msis(vcpu->kvm) || !vpe->resident)
+ if (!vgic_supports_direct_irqs(vcpu->kvm) || !vpe->resident)
return 0;
return its_make_vpe_non_resident(vpe, vgic_v4_want_doorbell(vcpu));
@@ -367,7 +367,7 @@ int vgic_v4_load(struct kvm_vcpu *vcpu)
struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
int err;
- if (!vgic_supports_direct_msis(vcpu->kvm) || vpe->resident)
+ if (!vgic_supports_direct_irqs(vcpu->kvm) || vpe->resident)
return 0;
if (vcpu_get_flag(vcpu, IN_WFI))
@@ -527,28 +527,26 @@ static struct vgic_irq *__vgic_host_irq_get_vlpi(struct kvm *kvm, int host_irq)
return NULL;
}
-int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq)
+void kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq)
{
struct vgic_irq *irq;
unsigned long flags;
- int ret = 0;
if (!vgic_supports_direct_msis(kvm))
- return 0;
+ return;
irq = __vgic_host_irq_get_vlpi(kvm, host_irq);
if (!irq)
- return 0;
+ return;
raw_spin_lock_irqsave(&irq->irq_lock, flags);
WARN_ON(irq->hw && irq->host_irq != host_irq);
if (irq->hw) {
atomic_dec(&irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count);
irq->hw = false;
- ret = its_unmap_vlpi(host_irq);
+ its_unmap_vlpi(host_irq);
}
raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
vgic_put_irq(kvm, irq);
- return ret;
}
diff --git a/arch/arm64/kvm/vgic/vgic-v5.c b/arch/arm64/kvm/vgic/vgic-v5.c
new file mode 100644
index 000000000000..6bdbb221bcde
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic-v5.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <kvm/arm_vgic.h>
+#include <linux/irqchip/arm-vgic-info.h>
+
+#include "vgic.h"
+
+/*
+ * Probe for a vGICv5 compatible interrupt controller, returning 0 on success.
+ * Currently only supports GICv3-based VMs on a GICv5 host, and hence only
+ * registers a VGIC_V3 device.
+ */
+int vgic_v5_probe(const struct gic_kvm_info *info)
+{
+ u64 ich_vtr_el2;
+ int ret;
+
+ if (!info->has_gcie_v3_compat)
+ return -ENODEV;
+
+ kvm_vgic_global_state.type = VGIC_V5;
+ kvm_vgic_global_state.has_gcie_v3_compat = true;
+
+ /* We only support v3 compat mode - use vGICv3 limits */
+ kvm_vgic_global_state.max_gic_vcpus = VGIC_V3_MAX_CPUS;
+
+ kvm_vgic_global_state.vcpu_base = 0;
+ kvm_vgic_global_state.vctrl_base = NULL;
+ kvm_vgic_global_state.can_emulate_gicv2 = false;
+ kvm_vgic_global_state.has_gicv4 = false;
+ kvm_vgic_global_state.has_gicv4_1 = false;
+
+ ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_gic_config);
+ kvm_vgic_global_state.ich_vtr_el2 = (u32)ich_vtr_el2;
+
+ /*
+ * The ListRegs field is 5 bits, but there is an architectural
+ * maximum of 16 list registers. Just ignore bit 4...
+ */
+ kvm_vgic_global_state.nr_lr = (ich_vtr_el2 & 0xf) + 1;
+
+ ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3);
+ if (ret) {
+ kvm_err("Cannot register GICv3-legacy KVM device.\n");
+ return ret;
+ }
+
+ static_branch_enable(&kvm_vgic_global_state.gicv3_cpuif);
+ kvm_info("GCIE legacy system register CPU interface\n");
+
+ return 0;
+}
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index 8f8096d48925..f5148b38120a 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -951,7 +951,7 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
* can be directly injected (GICv4).
*/
if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head) &&
- !vgic_supports_direct_msis(vcpu->kvm))
+ !vgic_supports_direct_irqs(vcpu->kvm))
return;
DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
@@ -965,7 +965,7 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
if (can_access_vgic_from_kernel())
vgic_restore_state(vcpu);
- if (vgic_supports_direct_msis(vcpu->kvm))
+ if (vgic_supports_direct_irqs(vcpu->kvm))
vgic_v4_commit(vcpu);
}
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index 4349084cb9a6..1384a04c0784 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -64,6 +64,24 @@
KVM_REG_ARM_VGIC_SYSREG_CRM_MASK | \
KVM_REG_ARM_VGIC_SYSREG_OP2_MASK)
+#define KVM_ICC_SRE_EL2 (ICC_SRE_EL2_ENABLE | ICC_SRE_EL2_SRE | \
+ ICC_SRE_EL1_DIB | ICC_SRE_EL1_DFB)
+#define KVM_ICH_VTR_EL2_RES0 (ICH_VTR_EL2_DVIM | \
+ ICH_VTR_EL2_A3V | \
+ ICH_VTR_EL2_IDbits)
+#define KVM_ICH_VTR_EL2_RES1 ICH_VTR_EL2_nV4
+
+static inline u64 kvm_get_guest_vtr_el2(void)
+{
+ u64 vtr;
+
+ vtr = kvm_vgic_global_state.ich_vtr_el2;
+ vtr &= ~KVM_ICH_VTR_EL2_RES0;
+ vtr |= KVM_ICH_VTR_EL2_RES1;
+
+ return vtr;
+}
+
/*
* As per Documentation/virt/kvm/devices/arm-vgic-its.rst,
* below macros are defined for ITS table entry encoding.
@@ -297,6 +315,7 @@ int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr, bool is_write);
int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr);
+const struct sys_reg_desc *vgic_v3_get_sysreg_table(unsigned int *sz);
int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write,
u32 intid, u32 *val);
int kvm_register_vgic_device(unsigned long type);
@@ -308,6 +327,8 @@ int vgic_init(struct kvm *kvm);
void vgic_debug_init(struct kvm *kvm);
void vgic_debug_destroy(struct kvm *kvm);
+int vgic_v5_probe(const struct gic_kvm_info *info);
+
static inline int vgic_v3_max_apr_idx(struct kvm_vcpu *vcpu)
{
struct vgic_cpu *cpu_if = &vcpu->arch.vgic_cpu;
@@ -369,7 +390,23 @@ void vgic_its_invalidate_all_caches(struct kvm *kvm);
int vgic_its_inv_lpi(struct kvm *kvm, struct vgic_irq *irq);
int vgic_its_invall(struct kvm_vcpu *vcpu);
+bool system_supports_direct_sgis(void);
bool vgic_supports_direct_msis(struct kvm *kvm);
+bool vgic_supports_direct_sgis(struct kvm *kvm);
+
+static inline bool vgic_supports_direct_irqs(struct kvm *kvm)
+{
+ /*
+ * Deliberately conflate vLPI and vSGI support on GICv4.1 hardware,
+ * indirectly allowing userspace to control whether or not vPEs are
+ * allocated for the VM.
+ */
+ if (system_supports_direct_sgis())
+ return vgic_supports_direct_sgis(kvm);
+
+ return vgic_supports_direct_msis(kvm);
+}
+
int vgic_v4_init(struct kvm *kvm);
void vgic_v4_teardown(struct kvm *kvm);
void vgic_v4_configure_vsgis(struct kvm *kvm);
@@ -389,6 +426,17 @@ void vgic_v3_put_nested(struct kvm_vcpu *vcpu);
void vgic_v3_handle_nested_maint_irq(struct kvm_vcpu *vcpu);
void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu);
+static inline bool vgic_is_v3_compat(struct kvm *kvm)
+{
+ return cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF) &&
+ kvm_vgic_global_state.has_gcie_v3_compat;
+}
+
+static inline bool vgic_is_v3(struct kvm *kvm)
+{
+ return kvm_vgic_global_state.type == VGIC_V3 || vgic_is_v3_compat(kvm);
+}
+
int vgic_its_debug_init(struct kvm_device *dev);
void vgic_its_debug_destroy(struct kvm_device *dev);
diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
index bcac4f55f9c1..c0557945939c 100644
--- a/arch/arm64/mm/contpte.c
+++ b/arch/arm64/mm/contpte.c
@@ -68,7 +68,144 @@ static void contpte_convert(struct mm_struct *mm, unsigned long addr,
pte = pte_mkyoung(pte);
}
- __flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, true, 3);
+ /*
+ * On eliding the __tlb_flush_range() under BBML2+noabort:
+ *
+ * NOTE: Instead of using N=16 as the contiguous block length, we use
+ * N=4 for clarity.
+ *
+ * NOTE: 'n' and 'c' are used to denote the "contiguous bit" being
+ * unset and set, respectively.
+ *
+ * We worry about two cases where contiguous bit is used:
+ * - When folding N smaller non-contiguous ptes as 1 contiguous block.
+ * - When unfolding a contiguous block into N smaller non-contiguous ptes.
+ *
+ * Currently, the BBML0 folding case looks as follows:
+ *
+ * 0) Initial page-table layout:
+ *
+ * +----+----+----+----+
+ * |RO,n|RO,n|RO,n|RW,n| <--- last page being set as RO
+ * +----+----+----+----+
+ *
+ * 1) Aggregate AF + dirty flags using __ptep_get_and_clear():
+ *
+ * +----+----+----+----+
+ * | 0 | 0 | 0 | 0 |
+ * +----+----+----+----+
+ *
+ * 2) __flush_tlb_range():
+ *
+ * |____ tlbi + dsb ____|
+ *
+ * 3) __set_ptes() to repaint contiguous block:
+ *
+ * +----+----+----+----+
+ * |RO,c|RO,c|RO,c|RO,c|
+ * +----+----+----+----+
+ *
+ * 4) The kernel will eventually __flush_tlb() for changed page:
+ *
+ * |____| <--- tlbi + dsb
+ *
+ * As expected, the intermediate tlbi+dsb ensures that other PEs
+ * only ever see an invalid (0) entry, or the new contiguous TLB entry.
+ * The final tlbi+dsb will always throw away the newly installed
+ * contiguous TLB entry, which is a micro-optimisation opportunity,
+ * but does not affect correctness.
+ *
+ * In the BBML2 case, the change is avoiding the intermediate tlbi+dsb.
+ * This means a few things, but notably other PEs will still "see" any
+ * stale cached TLB entries. This could lead to a "contiguous bit
+ * misprogramming" issue until the final tlbi+dsb of the changed page,
+ * which would clear out both the stale (RW,n) entry and the new (RO,c)
+ * contiguous entry installed in its place.
+ *
+ * What this is saying, is the following:
+ *
+ * +----+----+----+----+
+ * |RO,n|RO,n|RO,n|RW,n| <--- old page tables, all non-contiguous
+ * +----+----+----+----+
+ *
+ * +----+----+----+----+
+ * |RO,c|RO,c|RO,c|RO,c| <--- new page tables, all contiguous
+ * +----+----+----+----+
+ * /\
+ * ||
+ *
+ * If both the old single (RW,n) and new contiguous (RO,c) TLB entries
+ * are present, and a write is made to this address, do we fault or
+ * is the write permitted (via amalgamation)?
+ *
+ * The relevant Arm ARM DDI 0487L.a requirements are RNGLXZ and RJQQTC,
+ * and together state that when BBML1 or BBML2 are implemented, either
+ * a TLB conflict abort is raised (which we expressly forbid), or will
+ * "produce an OA, access permissions, and memory attributes that are
+ * consistent with any of the programmed translation table values".
+ *
+ * That is to say, will either raise a TLB conflict, or produce one of
+ * the cached TLB entries, but never amalgamate.
+ *
+ * Thus, as the page tables are only considered "consistent" after
+ * the final tlbi+dsb (which evicts both the single stale (RW,n) TLB
+ * entry as well as the new contiguous (RO,c) TLB entry), omitting the
+ * initial tlbi+dsb is correct.
+ *
+ * It is also important to note that at the end of the BBML2 folding
+ * case, we are still left with potentially all N TLB entries still
+ * cached (the N-1 non-contiguous ptes, and the single contiguous
+ * block). However, over time, natural TLB pressure will cause the
+ * non-contiguous pte TLB entries to be flushed, leaving only the
+ * contiguous block TLB entry. This means that omitting the tlbi+dsb is
+ * not only correct, but also keeps our eventual performance benefits.
+ *
+ * For the unfolding case, BBML0 looks as follows:
+ *
+ * 0) Initial page-table layout:
+ *
+ * +----+----+----+----+
+ * |RW,c|RW,c|RW,c|RW,c| <--- last page being set as RO
+ * +----+----+----+----+
+ *
+ * 1) Aggregate AF + dirty flags using __ptep_get_and_clear():
+ *
+ * +----+----+----+----+
+ * | 0 | 0 | 0 | 0 |
+ * +----+----+----+----+
+ *
+ * 2) __flush_tlb_range():
+ *
+ * |____ tlbi + dsb ____|
+ *
+ * 3) __set_ptes() to repaint as non-contiguous:
+ *
+ * +----+----+----+----+
+ * |RW,n|RW,n|RW,n|RW,n|
+ * +----+----+----+----+
+ *
+ * 4) Update changed page permissions:
+ *
+ * +----+----+----+----+
+ * |RW,n|RW,n|RW,n|RO,n| <--- last page permissions set
+ * +----+----+----+----+
+ *
+ * 5) The kernel will eventually __flush_tlb() for changed page:
+ *
+ * |____| <--- tlbi + dsb
+ *
+ * For BBML2, we again remove the intermediate tlbi+dsb. Here, there
+ * are no issues, as the final tlbi+dsb covering the changed page is
+ * guaranteed to remove the original large contiguous (RW,c) TLB entry,
+ * as well as the intermediate (RW,n) TLB entry; the next access will
+ * install the new (RO,n) TLB entry and the page tables are only
+ * considered "consistent" after the final tlbi+dsb, so software must
+ * be prepared for this inconsistency prior to finishing the mm dance
+ * regardless.
+ */
+
+ if (!system_supports_bbml2_noabort())
+ __flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, true, 3);
__set_ptes(mm, start_addr, start_ptep, pte, CONT_PTES);
}
@@ -169,17 +306,46 @@ pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte)
for (i = 0; i < CONT_PTES; i++, ptep++) {
pte = __ptep_get(ptep);
- if (pte_dirty(pte))
+ if (pte_dirty(pte)) {
orig_pte = pte_mkdirty(orig_pte);
-
- if (pte_young(pte))
+ for (; i < CONT_PTES; i++, ptep++) {
+ pte = __ptep_get(ptep);
+ if (pte_young(pte)) {
+ orig_pte = pte_mkyoung(orig_pte);
+ break;
+ }
+ }
+ break;
+ }
+
+ if (pte_young(pte)) {
orig_pte = pte_mkyoung(orig_pte);
+ i++;
+ ptep++;
+ for (; i < CONT_PTES; i++, ptep++) {
+ pte = __ptep_get(ptep);
+ if (pte_dirty(pte)) {
+ orig_pte = pte_mkdirty(orig_pte);
+ break;
+ }
+ }
+ break;
+ }
}
return orig_pte;
}
EXPORT_SYMBOL_GPL(contpte_ptep_get);
+static inline bool contpte_is_consistent(pte_t pte, unsigned long pfn,
+ pgprot_t orig_prot)
+{
+ pgprot_t prot = pte_pgprot(pte_mkold(pte_mkclean(pte)));
+
+ return pte_valid_cont(pte) && pte_pfn(pte) == pfn &&
+ pgprot_val(prot) == pgprot_val(orig_prot);
+}
+
pte_t contpte_ptep_get_lockless(pte_t *orig_ptep)
{
/*
@@ -202,7 +368,6 @@ pte_t contpte_ptep_get_lockless(pte_t *orig_ptep)
pgprot_t orig_prot;
unsigned long pfn;
pte_t orig_pte;
- pgprot_t prot;
pte_t *ptep;
pte_t pte;
int i;
@@ -219,18 +384,44 @@ retry:
for (i = 0; i < CONT_PTES; i++, ptep++, pfn++) {
pte = __ptep_get(ptep);
- prot = pte_pgprot(pte_mkold(pte_mkclean(pte)));
- if (!pte_valid_cont(pte) ||
- pte_pfn(pte) != pfn ||
- pgprot_val(prot) != pgprot_val(orig_prot))
+ if (!contpte_is_consistent(pte, pfn, orig_prot))
goto retry;
- if (pte_dirty(pte))
+ if (pte_dirty(pte)) {
orig_pte = pte_mkdirty(orig_pte);
+ for (; i < CONT_PTES; i++, ptep++, pfn++) {
+ pte = __ptep_get(ptep);
+
+ if (!contpte_is_consistent(pte, pfn, orig_prot))
+ goto retry;
+
+ if (pte_young(pte)) {
+ orig_pte = pte_mkyoung(orig_pte);
+ break;
+ }
+ }
+ break;
+ }
- if (pte_young(pte))
+ if (pte_young(pte)) {
orig_pte = pte_mkyoung(orig_pte);
+ i++;
+ ptep++;
+ pfn++;
+ for (; i < CONT_PTES; i++, ptep++, pfn++) {
+ pte = __ptep_get(ptep);
+
+ if (!contpte_is_consistent(pte, pfn, orig_prot))
+ goto retry;
+
+ if (pte_dirty(pte)) {
+ orig_pte = pte_mkdirty(orig_pte);
+ break;
+ }
+ }
+ break;
+ }
}
return orig_pte;
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 11eb8d1adc84..fcc783e8e9bb 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -53,18 +53,12 @@ struct fault_info {
};
static const struct fault_info fault_info[];
-static struct fault_info debug_fault_info[];
static inline const struct fault_info *esr_to_fault_info(unsigned long esr)
{
return fault_info + (esr & ESR_ELx_FSC);
}
-static inline const struct fault_info *esr_to_debug_fault_info(unsigned long esr)
-{
- return debug_fault_info + DBG_ESR_EVT(esr);
-}
-
static void data_abort_decode(unsigned long esr)
{
unsigned long iss2 = ESR_ELx_ISS2(esr);
@@ -838,6 +832,7 @@ static int do_sea(unsigned long far, unsigned long esr, struct pt_regs *regs)
*/
siaddr = untagged_addr(far);
}
+ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_STILL_OK);
arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr);
return 0;
@@ -849,9 +844,12 @@ static int do_tag_check_fault(unsigned long far, unsigned long esr,
/*
* The architecture specifies that bits 63:60 of FAR_EL1 are UNKNOWN
* for tag check faults. Set them to corresponding bits in the untagged
- * address.
+ * address if ARM64_MTE_FAR isn't supported.
+ * Otherwise, bits 63:60 of FAR_EL1 are not UNKNOWN.
*/
- far = (__untagged_addr(far) & ~MTE_TAG_MASK) | (far & MTE_TAG_MASK);
+ if (!cpus_have_cap(ARM64_MTE_FAR))
+ far = (__untagged_addr(far) & ~MTE_TAG_MASK) | (far & MTE_TAG_MASK);
+
do_bad_area(far, esr, regs);
return 0;
}
@@ -951,75 +949,6 @@ void do_sp_pc_abort(unsigned long addr, unsigned long esr, struct pt_regs *regs)
NOKPROBE_SYMBOL(do_sp_pc_abort);
/*
- * __refdata because early_brk64 is __init, but the reference to it is
- * clobbered at arch_initcall time.
- * See traps.c and debug-monitors.c:debug_traps_init().
- */
-static struct fault_info __refdata debug_fault_info[] = {
- { do_bad, SIGTRAP, TRAP_HWBKPT, "hardware breakpoint" },
- { do_bad, SIGTRAP, TRAP_HWBKPT, "hardware single-step" },
- { do_bad, SIGTRAP, TRAP_HWBKPT, "hardware watchpoint" },
- { do_bad, SIGKILL, SI_KERNEL, "unknown 3" },
- { do_bad, SIGTRAP, TRAP_BRKPT, "aarch32 BKPT" },
- { do_bad, SIGKILL, SI_KERNEL, "aarch32 vector catch" },
- { early_brk64, SIGTRAP, TRAP_BRKPT, "aarch64 BRK" },
- { do_bad, SIGKILL, SI_KERNEL, "unknown 7" },
-};
-
-void __init hook_debug_fault_code(int nr,
- int (*fn)(unsigned long, unsigned long, struct pt_regs *),
- int sig, int code, const char *name)
-{
- BUG_ON(nr < 0 || nr >= ARRAY_SIZE(debug_fault_info));
-
- debug_fault_info[nr].fn = fn;
- debug_fault_info[nr].sig = sig;
- debug_fault_info[nr].code = code;
- debug_fault_info[nr].name = name;
-}
-
-/*
- * In debug exception context, we explicitly disable preemption despite
- * having interrupts disabled.
- * This serves two purposes: it makes it much less likely that we would
- * accidentally schedule in exception context and it will force a warning
- * if we somehow manage to schedule by accident.
- */
-static void debug_exception_enter(struct pt_regs *regs)
-{
- preempt_disable();
-
- /* This code is a bit fragile. Test it. */
- RCU_LOCKDEP_WARN(!rcu_is_watching(), "exception_enter didn't work");
-}
-NOKPROBE_SYMBOL(debug_exception_enter);
-
-static void debug_exception_exit(struct pt_regs *regs)
-{
- preempt_enable_no_resched();
-}
-NOKPROBE_SYMBOL(debug_exception_exit);
-
-void do_debug_exception(unsigned long addr_if_watchpoint, unsigned long esr,
- struct pt_regs *regs)
-{
- const struct fault_info *inf = esr_to_debug_fault_info(esr);
- unsigned long pc = instruction_pointer(regs);
-
- debug_exception_enter(regs);
-
- if (user_mode(regs) && !is_ttbr0_addr(pc))
- arm64_apply_bp_hardening();
-
- if (inf->fn(addr_if_watchpoint, esr, regs)) {
- arm64_notify_die(inf->name, regs, inf->sig, inf->code, pc, esr);
- }
-
- debug_exception_exit(regs);
-}
-NOKPROBE_SYMBOL(do_debug_exception);
-
-/*
* Used during anonymous page fault handling.
*/
struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
diff --git a/arch/arm64/mm/gcs.c b/arch/arm64/mm/gcs.c
index 5c46ec527b1c..6e93f78de79b 100644
--- a/arch/arm64/mm/gcs.c
+++ b/arch/arm64/mm/gcs.c
@@ -157,12 +157,6 @@ void gcs_free(struct task_struct *task)
if (!system_supports_gcs())
return;
- /*
- * When fork() with CLONE_VM fails, the child (tsk) already
- * has a GCS allocated, and exit_thread() calls this function
- * to free it. In this case the parent (current) and the
- * child share the same mm struct.
- */
if (!task->mm || task->mm != current->mm)
return;
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 0c8737f4f2ce..1d90a7e75333 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -225,7 +225,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
ncontig = num_contig_ptes(sz, &pgsize);
if (!pte_present(pte)) {
- for (i = 0; i < ncontig; i++, ptep++, addr += pgsize)
+ for (i = 0; i < ncontig; i++, ptep++)
__set_ptes_anysz(mm, ptep, pte, 1, pgsize);
return;
}
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 54dccfd6aa11..8c75965afc9e 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -454,7 +454,7 @@ SYM_FUNC_START(__cpu_setup)
dsb nsh
msr cpacr_el1, xzr // Reset cpacr_el1
- mov x1, #1 << 12 // Reset mdscr_el1 and disable
+ mov x1, MDSCR_EL1_TDCC // Reset mdscr_el1 and disable
msr mdscr_el1, x1 // access to the DCC from EL0
reset_pmuserenr_el0 x1 // Disable PMU access from EL0
reset_amuserenr_el0 x1 // Disable AMU access from EL0
diff --git a/arch/arm64/net/bpf_jit.h b/arch/arm64/net/bpf_jit.h
index a3b0e693a125..bbea4f36f9f2 100644
--- a/arch/arm64/net/bpf_jit.h
+++ b/arch/arm64/net/bpf_jit.h
@@ -325,4 +325,9 @@
#define A64_MRS_SP_EL0(Rt) \
aarch64_insn_gen_mrs(Rt, AARCH64_INSN_SYSREG_SP_EL0)
+/* Barriers */
+#define A64_SB aarch64_insn_get_sb_value()
+#define A64_DSB_NSH (aarch64_insn_get_dsb_base_value() | 0x7 << 8)
+#define A64_ISB aarch64_insn_get_isb_value()
+
#endif /* _BPF_JIT_H */
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index da8b89dd2910..97dfd5432809 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -30,6 +30,7 @@
#define TMP_REG_2 (MAX_BPF_JIT_REG + 1)
#define TCCNT_PTR (MAX_BPF_JIT_REG + 2)
#define TMP_REG_3 (MAX_BPF_JIT_REG + 3)
+#define PRIVATE_SP (MAX_BPF_JIT_REG + 4)
#define ARENA_VM_START (MAX_BPF_JIT_REG + 5)
#define check_imm(bits, imm) do { \
@@ -68,6 +69,8 @@ static const int bpf2a64[] = {
[TCCNT_PTR] = A64_R(26),
/* temporary register for blinding constants */
[BPF_REG_AX] = A64_R(9),
+ /* callee saved register for private stack pointer */
+ [PRIVATE_SP] = A64_R(27),
/* callee saved register for kern_vm_start address */
[ARENA_VM_START] = A64_R(28),
};
@@ -86,6 +89,7 @@ struct jit_ctx {
u64 user_vm_start;
u64 arena_vm_start;
bool fp_used;
+ bool priv_sp_used;
bool write;
};
@@ -98,6 +102,10 @@ struct bpf_plt {
#define PLT_TARGET_SIZE sizeof_field(struct bpf_plt, target)
#define PLT_TARGET_OFFSET offsetof(struct bpf_plt, target)
+/* Memory size/value to protect private stack overflow/underflow */
+#define PRIV_STACK_GUARD_SZ 16
+#define PRIV_STACK_GUARD_VAL 0xEB9F12345678eb9fULL
+
static inline void emit(const u32 insn, struct jit_ctx *ctx)
{
if (ctx->image != NULL && ctx->write)
@@ -387,8 +395,11 @@ static void find_used_callee_regs(struct jit_ctx *ctx)
if (reg_used & 8)
ctx->used_callee_reg[i++] = bpf2a64[BPF_REG_9];
- if (reg_used & 16)
+ if (reg_used & 16) {
ctx->used_callee_reg[i++] = bpf2a64[BPF_REG_FP];
+ if (ctx->priv_sp_used)
+ ctx->used_callee_reg[i++] = bpf2a64[PRIVATE_SP];
+ }
if (ctx->arena_vm_start)
ctx->used_callee_reg[i++] = bpf2a64[ARENA_VM_START];
@@ -412,6 +423,7 @@ static void push_callee_regs(struct jit_ctx *ctx)
emit(A64_PUSH(A64_R(23), A64_R(24), A64_SP), ctx);
emit(A64_PUSH(A64_R(25), A64_R(26), A64_SP), ctx);
emit(A64_PUSH(A64_R(27), A64_R(28), A64_SP), ctx);
+ ctx->fp_used = true;
} else {
find_used_callee_regs(ctx);
for (i = 0; i + 1 < ctx->nr_used_callee_reg; i += 2) {
@@ -461,6 +473,19 @@ static void pop_callee_regs(struct jit_ctx *ctx)
}
}
+static void emit_percpu_ptr(const u8 dst_reg, void __percpu *ptr,
+ struct jit_ctx *ctx)
+{
+ const u8 tmp = bpf2a64[TMP_REG_1];
+
+ emit_a64_mov_i64(dst_reg, (__force const u64)ptr, ctx);
+ if (cpus_have_cap(ARM64_HAS_VIRT_HOST_EXTN))
+ emit(A64_MRS_TPIDR_EL2(tmp), ctx);
+ else
+ emit(A64_MRS_TPIDR_EL1(tmp), ctx);
+ emit(A64_ADD(1, dst_reg, dst_reg, tmp), ctx);
+}
+
#define BTI_INSNS (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) ? 1 : 0)
#define PAC_INSNS (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL) ? 1 : 0)
@@ -476,6 +501,8 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf)
const bool is_main_prog = !bpf_is_subprog(prog);
const u8 fp = bpf2a64[BPF_REG_FP];
const u8 arena_vm_base = bpf2a64[ARENA_VM_START];
+ const u8 priv_sp = bpf2a64[PRIVATE_SP];
+ void __percpu *priv_stack_ptr;
const int idx0 = ctx->idx;
int cur_offset;
@@ -551,15 +578,23 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf)
emit(A64_SUB_I(1, A64_SP, A64_FP, 96), ctx);
}
- if (ctx->fp_used)
- /* Set up BPF prog stack base register */
- emit(A64_MOV(1, fp, A64_SP), ctx);
-
/* Stack must be multiples of 16B */
ctx->stack_size = round_up(prog->aux->stack_depth, 16);
+ if (ctx->fp_used) {
+ if (ctx->priv_sp_used) {
+ /* Set up private stack pointer */
+ priv_stack_ptr = prog->aux->priv_stack_ptr + PRIV_STACK_GUARD_SZ;
+ emit_percpu_ptr(priv_sp, priv_stack_ptr, ctx);
+ emit(A64_ADD_I(1, fp, priv_sp, ctx->stack_size), ctx);
+ } else {
+ /* Set up BPF prog stack base register */
+ emit(A64_MOV(1, fp, A64_SP), ctx);
+ }
+ }
+
/* Set up function call stack */
- if (ctx->stack_size)
+ if (ctx->stack_size && !ctx->priv_sp_used)
emit(A64_SUB_I(1, A64_SP, A64_SP, ctx->stack_size), ctx);
if (ctx->arena_vm_start)
@@ -623,7 +658,7 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx)
emit(A64_STR64I(tcc, ptr, 0), ctx);
/* restore SP */
- if (ctx->stack_size)
+ if (ctx->stack_size && !ctx->priv_sp_used)
emit(A64_ADD_I(1, A64_SP, A64_SP, ctx->stack_size), ctx);
pop_callee_regs(ctx);
@@ -991,7 +1026,7 @@ static void build_epilogue(struct jit_ctx *ctx, bool was_classic)
const u8 ptr = bpf2a64[TCCNT_PTR];
/* We're done with BPF stack */
- if (ctx->stack_size)
+ if (ctx->stack_size && !ctx->priv_sp_used)
emit(A64_ADD_I(1, A64_SP, A64_SP, ctx->stack_size), ctx);
pop_callee_regs(ctx);
@@ -1120,6 +1155,7 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
const u8 tmp2 = bpf2a64[TMP_REG_2];
const u8 fp = bpf2a64[BPF_REG_FP];
const u8 arena_vm_base = bpf2a64[ARENA_VM_START];
+ const u8 priv_sp = bpf2a64[PRIVATE_SP];
const s16 off = insn->off;
const s32 imm = insn->imm;
const int i = insn - ctx->prog->insnsi;
@@ -1564,7 +1600,7 @@ emit_cond_jmp:
src = tmp2;
}
if (src == fp) {
- src_adj = A64_SP;
+ src_adj = ctx->priv_sp_used ? priv_sp : A64_SP;
off_adj = off + ctx->stack_size;
} else {
src_adj = src;
@@ -1630,17 +1666,14 @@ emit_cond_jmp:
return ret;
break;
- /* speculation barrier */
+ /* speculation barrier against v1 and v4 */
case BPF_ST | BPF_NOSPEC:
- /*
- * Nothing required here.
- *
- * In case of arm64, we rely on the firmware mitigation of
- * Speculative Store Bypass as controlled via the ssbd kernel
- * parameter. Whenever the mitigation is enabled, it works
- * for all of the kernel code with no need to provide any
- * additional instructions.
- */
+ if (alternative_has_cap_likely(ARM64_HAS_SB)) {
+ emit(A64_SB, ctx);
+ } else {
+ emit(A64_DSB_NSH, ctx);
+ emit(A64_ISB, ctx);
+ }
break;
/* ST: *(size *)(dst + off) = imm */
@@ -1657,7 +1690,7 @@ emit_cond_jmp:
dst = tmp2;
}
if (dst == fp) {
- dst_adj = A64_SP;
+ dst_adj = ctx->priv_sp_used ? priv_sp : A64_SP;
off_adj = off + ctx->stack_size;
} else {
dst_adj = dst;
@@ -1719,7 +1752,7 @@ emit_cond_jmp:
dst = tmp2;
}
if (dst == fp) {
- dst_adj = A64_SP;
+ dst_adj = ctx->priv_sp_used ? priv_sp : A64_SP;
off_adj = off + ctx->stack_size;
} else {
dst_adj = dst;
@@ -1862,6 +1895,39 @@ static inline void bpf_flush_icache(void *start, void *end)
flush_icache_range((unsigned long)start, (unsigned long)end);
}
+static void priv_stack_init_guard(void __percpu *priv_stack_ptr, int alloc_size)
+{
+ int cpu, underflow_idx = (alloc_size - PRIV_STACK_GUARD_SZ) >> 3;
+ u64 *stack_ptr;
+
+ for_each_possible_cpu(cpu) {
+ stack_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
+ stack_ptr[0] = PRIV_STACK_GUARD_VAL;
+ stack_ptr[1] = PRIV_STACK_GUARD_VAL;
+ stack_ptr[underflow_idx] = PRIV_STACK_GUARD_VAL;
+ stack_ptr[underflow_idx + 1] = PRIV_STACK_GUARD_VAL;
+ }
+}
+
+static void priv_stack_check_guard(void __percpu *priv_stack_ptr, int alloc_size,
+ struct bpf_prog *prog)
+{
+ int cpu, underflow_idx = (alloc_size - PRIV_STACK_GUARD_SZ) >> 3;
+ u64 *stack_ptr;
+
+ for_each_possible_cpu(cpu) {
+ stack_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
+ if (stack_ptr[0] != PRIV_STACK_GUARD_VAL ||
+ stack_ptr[1] != PRIV_STACK_GUARD_VAL ||
+ stack_ptr[underflow_idx] != PRIV_STACK_GUARD_VAL ||
+ stack_ptr[underflow_idx + 1] != PRIV_STACK_GUARD_VAL) {
+ pr_err("BPF private stack overflow/underflow detected for prog %sx\n",
+ bpf_jit_get_prog_name(prog));
+ break;
+ }
+ }
+}
+
struct arm64_jit_data {
struct bpf_binary_header *header;
u8 *ro_image;
@@ -1874,9 +1940,11 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
int image_size, prog_size, extable_size, extable_align, extable_offset;
struct bpf_prog *tmp, *orig_prog = prog;
struct bpf_binary_header *header;
- struct bpf_binary_header *ro_header;
+ struct bpf_binary_header *ro_header = NULL;
struct arm64_jit_data *jit_data;
+ void __percpu *priv_stack_ptr = NULL;
bool was_classic = bpf_prog_was_classic(prog);
+ int priv_stack_alloc_sz;
bool tmp_blinded = false;
bool extra_pass = false;
struct jit_ctx ctx;
@@ -1908,6 +1976,23 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
}
prog->aux->jit_data = jit_data;
}
+ priv_stack_ptr = prog->aux->priv_stack_ptr;
+ if (!priv_stack_ptr && prog->aux->jits_use_priv_stack) {
+ /* Allocate actual private stack size with verifier-calculated
+ * stack size plus two memory guards to protect overflow and
+ * underflow.
+ */
+ priv_stack_alloc_sz = round_up(prog->aux->stack_depth, 16) +
+ 2 * PRIV_STACK_GUARD_SZ;
+ priv_stack_ptr = __alloc_percpu_gfp(priv_stack_alloc_sz, 16, GFP_KERNEL);
+ if (!priv_stack_ptr) {
+ prog = orig_prog;
+ goto out_priv_stack;
+ }
+
+ priv_stack_init_guard(priv_stack_ptr, priv_stack_alloc_sz);
+ prog->aux->priv_stack_ptr = priv_stack_ptr;
+ }
if (jit_data->ctx.offset) {
ctx = jit_data->ctx;
ro_image_ptr = jit_data->ro_image;
@@ -1931,6 +2016,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
ctx.user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena);
ctx.arena_vm_start = bpf_arena_get_kern_vm_start(prog->aux->arena);
+ if (priv_stack_ptr)
+ ctx.priv_sp_used = true;
+
/* Pass 1: Estimate the maximum image size.
*
* BPF line info needs ctx->offset[i] to be the offset of
@@ -2070,7 +2158,12 @@ skip_init_ctx:
ctx.offset[i] *= AARCH64_INSN_SIZE;
bpf_prog_fill_jited_linfo(prog, ctx.offset + 1);
out_off:
+ if (!ro_header && priv_stack_ptr) {
+ free_percpu(priv_stack_ptr);
+ prog->aux->priv_stack_ptr = NULL;
+ }
kvfree(ctx.offset);
+out_priv_stack:
kfree(jit_data);
prog->aux->jit_data = NULL;
}
@@ -2089,6 +2182,11 @@ out_free_hdr:
goto out_off;
}
+bool bpf_jit_supports_private_stack(void)
+{
+ return true;
+}
+
bool bpf_jit_supports_kfunc_call(void)
{
return true;
@@ -2243,11 +2341,6 @@ static int calc_arg_aux(const struct btf_func_model *m,
/* the rest arguments are passed through stack */
for (; i < m->nr_args; i++) {
- /* We can not know for sure about exact alignment needs for
- * struct passed on stack, so deny those
- */
- if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG)
- return -ENOTSUPP;
stack_slots = (m->arg_size[i] + 7) / 8;
a->bstack_for_args += stack_slots * 8;
a->ostack_for_args = a->ostack_for_args + stack_slots * 8;
@@ -2911,6 +3004,17 @@ bool bpf_jit_supports_percpu_insn(void)
return true;
}
+bool bpf_jit_bypass_spec_v4(void)
+{
+ /* In case of arm64, we rely on the firmware mitigation of Speculative
+ * Store Bypass as controlled via the ssbd kernel parameter. Whenever
+ * the mitigation is enabled, it works for all of the kernel code with
+ * no need to provide any additional instructions. Therefore, skip
+ * inserting nospec insns against Spectre v4.
+ */
+ return true;
+}
+
bool bpf_jit_inlines_helper_call(s32 imm)
{
switch (imm) {
@@ -2928,6 +3032,8 @@ void bpf_jit_free(struct bpf_prog *prog)
if (prog->jited) {
struct arm64_jit_data *jit_data = prog->aux->jit_data;
struct bpf_binary_header *hdr;
+ void __percpu *priv_stack_ptr;
+ int priv_stack_alloc_sz;
/*
* If we fail the final pass of JIT (from jit_subprogs),
@@ -2941,6 +3047,13 @@ void bpf_jit_free(struct bpf_prog *prog)
}
hdr = bpf_jit_binary_pack_hdr(prog);
bpf_jit_binary_pack_free(hdr, NULL);
+ priv_stack_ptr = prog->aux->priv_stack_ptr;
+ if (priv_stack_ptr) {
+ priv_stack_alloc_sz = round_up(prog->aux->stack_depth, 16) +
+ 2 * PRIV_STACK_GUARD_SZ;
+ priv_stack_check_guard(priv_stack_ptr, priv_stack_alloc_sz, prog);
+ free_percpu(prog->aux->priv_stack_ptr);
+ }
WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog));
}
diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
index 10effd4cff6b..ef0b7946f5a4 100644
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -35,7 +35,8 @@ HAS_GENERIC_AUTH
HAS_GENERIC_AUTH_ARCH_QARMA3
HAS_GENERIC_AUTH_ARCH_QARMA5
HAS_GENERIC_AUTH_IMP_DEF
-HAS_GIC_CPUIF_SYSREGS
+HAS_GICV3_CPUIF
+HAS_GICV5_CPUIF
HAS_GIC_PRIO_MASKING
HAS_GIC_PRIO_RELAXED_SYNC
HAS_HCR_NV1
@@ -45,10 +46,12 @@ HAS_LPA2
HAS_LSE_ATOMICS
HAS_MOPS
HAS_NESTED_VIRT
+HAS_BBML2_NOABORT
HAS_PAN
HAS_PMUV3
HAS_S1PIE
HAS_S1POE
+HAS_SCTLR2
HAS_RAS_EXTN
HAS_RNG
HAS_SB
@@ -68,6 +71,8 @@ MPAM
MPAM_HCR
MTE
MTE_ASYMM
+MTE_FAR
+MTE_STORE_ONLY
SME
SME_FA64
SME2
diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg
index 8a8cf6874298..696ab1f32a67 100644
--- a/arch/arm64/tools/sysreg
+++ b/arch/arm64/tools/sysreg
@@ -1314,7 +1314,10 @@ UnsignedEnum 19:16 UINJ
0b0000 NI
0b0001 IMP
EndEnum
-Res0 15:12
+UnsignedEnum 15:12 GCIE
+ 0b0000 NI
+ 0b0001 IMP
+EndEnum
UnsignedEnum 11:8 MTEFAR
0b0000 NI
0b0001 IMP
@@ -1329,6 +1332,138 @@ UnsignedEnum 3:0 MTEPERM
EndEnum
EndSysreg
+
+SysregFields BRBINFx_EL1
+Res0 63:47
+Field 46 CCU
+Field 45:40 CC_EXP
+Field 39:32 CC_MANT
+Res0 31:18
+Field 17 LASTFAILED
+Field 16 T
+Res0 15:14
+Enum 13:8 TYPE
+ 0b000000 DIRECT_UNCOND
+ 0b000001 INDIRECT
+ 0b000010 DIRECT_LINK
+ 0b000011 INDIRECT_LINK
+ 0b000101 RET
+ 0b000111 ERET
+ 0b001000 DIRECT_COND
+ 0b100001 DEBUG_HALT
+ 0b100010 CALL
+ 0b100011 TRAP
+ 0b100100 SERROR
+ 0b100110 INSN_DEBUG
+ 0b100111 DATA_DEBUG
+ 0b101010 ALIGN_FAULT
+ 0b101011 INSN_FAULT
+ 0b101100 DATA_FAULT
+ 0b101110 IRQ
+ 0b101111 FIQ
+ 0b110000 IMPDEF_TRAP_EL3
+ 0b111001 DEBUG_EXIT
+EndEnum
+Enum 7:6 EL
+ 0b00 EL0
+ 0b01 EL1
+ 0b10 EL2
+ 0b11 EL3
+EndEnum
+Field 5 MPRED
+Res0 4:2
+Enum 1:0 VALID
+ 0b00 NONE
+ 0b01 TARGET
+ 0b10 SOURCE
+ 0b11 FULL
+EndEnum
+EndSysregFields
+
+SysregFields BRBCR_ELx
+Res0 63:24
+Field 23 EXCEPTION
+Field 22 ERTN
+Res0 21:10
+Field 9 FZPSS
+Field 8 FZP
+Res0 7
+Enum 6:5 TS
+ 0b01 VIRTUAL
+ 0b10 GUEST_PHYSICAL
+ 0b11 PHYSICAL
+EndEnum
+Field 4 MPRED
+Field 3 CC
+Res0 2
+Field 1 ExBRE
+Field 0 E0BRE
+EndSysregFields
+
+Sysreg BRBCR_EL1 2 1 9 0 0
+Fields BRBCR_ELx
+EndSysreg
+
+Sysreg BRBFCR_EL1 2 1 9 0 1
+Res0 63:30
+Enum 29:28 BANK
+ 0b00 BANK_0
+ 0b01 BANK_1
+EndEnum
+Res0 27:23
+Field 22 CONDDIR
+Field 21 DIRCALL
+Field 20 INDCALL
+Field 19 RTN
+Field 18 INDIRECT
+Field 17 DIRECT
+Field 16 EnI
+Res0 15:8
+Field 7 PAUSED
+Field 6 LASTFAILED
+Res0 5:0
+EndSysreg
+
+Sysreg BRBTS_EL1 2 1 9 0 2
+Field 63:0 TS
+EndSysreg
+
+Sysreg BRBINFINJ_EL1 2 1 9 1 0
+Fields BRBINFx_EL1
+EndSysreg
+
+Sysreg BRBSRCINJ_EL1 2 1 9 1 1
+Field 63:0 ADDRESS
+EndSysreg
+
+Sysreg BRBTGTINJ_EL1 2 1 9 1 2
+Field 63:0 ADDRESS
+EndSysreg
+
+Sysreg BRBIDR0_EL1 2 1 9 2 0
+Res0 63:16
+Enum 15:12 CC
+ 0b0101 20_BIT
+EndEnum
+Enum 11:8 FORMAT
+ 0b0000 FORMAT_0
+EndEnum
+Enum 7:0 NUMREC
+ 0b00001000 8
+ 0b00010000 16
+ 0b00100000 32
+ 0b01000000 64
+EndEnum
+EndSysreg
+
+Sysreg BRBCR_EL2 2 4 9 0 0
+Fields BRBCR_ELx
+EndSysreg
+
+Sysreg BRBCR_EL12 2 5 9 0 0
+Fields BRBCR_ELx
+EndSysreg
+
Sysreg ID_AA64ZFR0_EL1 3 0 0 4 4
Res0 63:60
UnsignedEnum 59:56 F64MM
@@ -3021,6 +3156,435 @@ Sysreg PMIAR_EL1 3 0 9 14 7
Field 63:0 ADDRESS
EndSysreg
+SysregFields ICC_PPI_HMRx_EL1
+Field 63 HM63
+Field 62 HM62
+Field 61 HM61
+Field 60 HM60
+Field 59 HM59
+Field 58 HM58
+Field 57 HM57
+Field 56 HM56
+Field 55 HM55
+Field 54 HM54
+Field 53 HM53
+Field 52 HM52
+Field 51 HM51
+Field 50 HM50
+Field 49 HM49
+Field 48 HM48
+Field 47 HM47
+Field 46 HM46
+Field 45 HM45
+Field 44 HM44
+Field 43 HM43
+Field 42 HM42
+Field 41 HM41
+Field 40 HM40
+Field 39 HM39
+Field 38 HM38
+Field 37 HM37
+Field 36 HM36
+Field 35 HM35
+Field 34 HM34
+Field 33 HM33
+Field 32 HM32
+Field 31 HM31
+Field 30 HM30
+Field 29 HM29
+Field 28 HM28
+Field 27 HM27
+Field 26 HM26
+Field 25 HM25
+Field 24 HM24
+Field 23 HM23
+Field 22 HM22
+Field 21 HM21
+Field 20 HM20
+Field 19 HM19
+Field 18 HM18
+Field 17 HM17
+Field 16 HM16
+Field 15 HM15
+Field 14 HM14
+Field 13 HM13
+Field 12 HM12
+Field 11 HM11
+Field 10 HM10
+Field 9 HM9
+Field 8 HM8
+Field 7 HM7
+Field 6 HM6
+Field 5 HM5
+Field 4 HM4
+Field 3 HM3
+Field 2 HM2
+Field 1 HM1
+Field 0 HM0
+EndSysregFields
+
+Sysreg ICC_PPI_HMR0_EL1 3 0 12 10 0
+Fields ICC_PPI_HMRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_HMR1_EL1 3 0 12 10 1
+Fields ICC_PPI_HMRx_EL1
+EndSysreg
+
+Sysreg ICC_IDR0_EL1 3 0 12 10 2
+Res0 63:12
+UnsignedEnum 11:8 GCIE_LEGACY
+ 0b0000 NI
+ 0b0001 IMP
+EndEnum
+UnsignedEnum 7:4 PRI_BITS
+ 0b0011 4BITS
+ 0b0100 5BITS
+EndEnum
+UnsignedEnum 3:0 ID_BITS
+ 0b0000 16BITS
+ 0b0001 24BITS
+EndEnum
+EndSysreg
+
+Sysreg ICC_ICSR_EL1 3 0 12 10 4
+Res0 63:48
+Field 47:32 IAFFID
+Res0 31:16
+Field 15:11 Priority
+Res0 10:6
+Field 5 HM
+Field 4 Active
+Field 3 IRM
+Field 2 Pending
+Field 1 Enabled
+Field 0 F
+EndSysreg
+
+SysregFields ICC_PPI_ENABLERx_EL1
+Field 63 EN63
+Field 62 EN62
+Field 61 EN61
+Field 60 EN60
+Field 59 EN59
+Field 58 EN58
+Field 57 EN57
+Field 56 EN56
+Field 55 EN55
+Field 54 EN54
+Field 53 EN53
+Field 52 EN52
+Field 51 EN51
+Field 50 EN50
+Field 49 EN49
+Field 48 EN48
+Field 47 EN47
+Field 46 EN46
+Field 45 EN45
+Field 44 EN44
+Field 43 EN43
+Field 42 EN42
+Field 41 EN41
+Field 40 EN40
+Field 39 EN39
+Field 38 EN38
+Field 37 EN37
+Field 36 EN36
+Field 35 EN35
+Field 34 EN34
+Field 33 EN33
+Field 32 EN32
+Field 31 EN31
+Field 30 EN30
+Field 29 EN29
+Field 28 EN28
+Field 27 EN27
+Field 26 EN26
+Field 25 EN25
+Field 24 EN24
+Field 23 EN23
+Field 22 EN22
+Field 21 EN21
+Field 20 EN20
+Field 19 EN19
+Field 18 EN18
+Field 17 EN17
+Field 16 EN16
+Field 15 EN15
+Field 14 EN14
+Field 13 EN13
+Field 12 EN12
+Field 11 EN11
+Field 10 EN10
+Field 9 EN9
+Field 8 EN8
+Field 7 EN7
+Field 6 EN6
+Field 5 EN5
+Field 4 EN4
+Field 3 EN3
+Field 2 EN2
+Field 1 EN1
+Field 0 EN0
+EndSysregFields
+
+Sysreg ICC_PPI_ENABLER0_EL1 3 0 12 10 6
+Fields ICC_PPI_ENABLERx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_ENABLER1_EL1 3 0 12 10 7
+Fields ICC_PPI_ENABLERx_EL1
+EndSysreg
+
+SysregFields ICC_PPI_ACTIVERx_EL1
+Field 63 Active63
+Field 62 Active62
+Field 61 Active61
+Field 60 Active60
+Field 59 Active59
+Field 58 Active58
+Field 57 Active57
+Field 56 Active56
+Field 55 Active55
+Field 54 Active54
+Field 53 Active53
+Field 52 Active52
+Field 51 Active51
+Field 50 Active50
+Field 49 Active49
+Field 48 Active48
+Field 47 Active47
+Field 46 Active46
+Field 45 Active45
+Field 44 Active44
+Field 43 Active43
+Field 42 Active42
+Field 41 Active41
+Field 40 Active40
+Field 39 Active39
+Field 38 Active38
+Field 37 Active37
+Field 36 Active36
+Field 35 Active35
+Field 34 Active34
+Field 33 Active33
+Field 32 Active32
+Field 31 Active31
+Field 30 Active30
+Field 29 Active29
+Field 28 Active28
+Field 27 Active27
+Field 26 Active26
+Field 25 Active25
+Field 24 Active24
+Field 23 Active23
+Field 22 Active22
+Field 21 Active21
+Field 20 Active20
+Field 19 Active19
+Field 18 Active18
+Field 17 Active17
+Field 16 Active16
+Field 15 Active15
+Field 14 Active14
+Field 13 Active13
+Field 12 Active12
+Field 11 Active11
+Field 10 Active10
+Field 9 Active9
+Field 8 Active8
+Field 7 Active7
+Field 6 Active6
+Field 5 Active5
+Field 4 Active4
+Field 3 Active3
+Field 2 Active2
+Field 1 Active1
+Field 0 Active0
+EndSysregFields
+
+Sysreg ICC_PPI_CACTIVER0_EL1 3 0 12 13 0
+Fields ICC_PPI_ACTIVERx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_CACTIVER1_EL1 3 0 12 13 1
+Fields ICC_PPI_ACTIVERx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_SACTIVER0_EL1 3 0 12 13 2
+Fields ICC_PPI_ACTIVERx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_SACTIVER1_EL1 3 0 12 13 3
+Fields ICC_PPI_ACTIVERx_EL1
+EndSysreg
+
+SysregFields ICC_PPI_PENDRx_EL1
+Field 63 Pend63
+Field 62 Pend62
+Field 61 Pend61
+Field 60 Pend60
+Field 59 Pend59
+Field 58 Pend58
+Field 57 Pend57
+Field 56 Pend56
+Field 55 Pend55
+Field 54 Pend54
+Field 53 Pend53
+Field 52 Pend52
+Field 51 Pend51
+Field 50 Pend50
+Field 49 Pend49
+Field 48 Pend48
+Field 47 Pend47
+Field 46 Pend46
+Field 45 Pend45
+Field 44 Pend44
+Field 43 Pend43
+Field 42 Pend42
+Field 41 Pend41
+Field 40 Pend40
+Field 39 Pend39
+Field 38 Pend38
+Field 37 Pend37
+Field 36 Pend36
+Field 35 Pend35
+Field 34 Pend34
+Field 33 Pend33
+Field 32 Pend32
+Field 31 Pend31
+Field 30 Pend30
+Field 29 Pend29
+Field 28 Pend28
+Field 27 Pend27
+Field 26 Pend26
+Field 25 Pend25
+Field 24 Pend24
+Field 23 Pend23
+Field 22 Pend22
+Field 21 Pend21
+Field 20 Pend20
+Field 19 Pend19
+Field 18 Pend18
+Field 17 Pend17
+Field 16 Pend16
+Field 15 Pend15
+Field 14 Pend14
+Field 13 Pend13
+Field 12 Pend12
+Field 11 Pend11
+Field 10 Pend10
+Field 9 Pend9
+Field 8 Pend8
+Field 7 Pend7
+Field 6 Pend6
+Field 5 Pend5
+Field 4 Pend4
+Field 3 Pend3
+Field 2 Pend2
+Field 1 Pend1
+Field 0 Pend0
+EndSysregFields
+
+Sysreg ICC_PPI_CPENDR0_EL1 3 0 12 13 4
+Fields ICC_PPI_PENDRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_CPENDR1_EL1 3 0 12 13 5
+Fields ICC_PPI_PENDRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_SPENDR0_EL1 3 0 12 13 6
+Fields ICC_PPI_PENDRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_SPENDR1_EL1 3 0 12 13 7
+Fields ICC_PPI_PENDRx_EL1
+EndSysreg
+
+SysregFields ICC_PPI_PRIORITYRx_EL1
+Res0 63:61
+Field 60:56 Priority7
+Res0 55:53
+Field 52:48 Priority6
+Res0 47:45
+Field 44:40 Priority5
+Res0 39:37
+Field 36:32 Priority4
+Res0 31:29
+Field 28:24 Priority3
+Res0 23:21
+Field 20:16 Priority2
+Res0 15:13
+Field 12:8 Priority1
+Res0 7:5
+Field 4:0 Priority0
+EndSysregFields
+
+Sysreg ICC_PPI_PRIORITYR0_EL1 3 0 12 14 0
+Fields ICC_PPI_PRIORITYRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_PRIORITYR1_EL1 3 0 12 14 1
+Fields ICC_PPI_PRIORITYRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_PRIORITYR2_EL1 3 0 12 14 2
+Fields ICC_PPI_PRIORITYRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_PRIORITYR3_EL1 3 0 12 14 3
+Fields ICC_PPI_PRIORITYRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_PRIORITYR4_EL1 3 0 12 14 4
+Fields ICC_PPI_PRIORITYRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_PRIORITYR5_EL1 3 0 12 14 5
+Fields ICC_PPI_PRIORITYRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_PRIORITYR6_EL1 3 0 12 14 6
+Fields ICC_PPI_PRIORITYRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_PRIORITYR7_EL1 3 0 12 14 7
+Fields ICC_PPI_PRIORITYRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_PRIORITYR8_EL1 3 0 12 15 0
+Fields ICC_PPI_PRIORITYRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_PRIORITYR9_EL1 3 0 12 15 1
+Fields ICC_PPI_PRIORITYRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_PRIORITYR10_EL1 3 0 12 15 2
+Fields ICC_PPI_PRIORITYRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_PRIORITYR11_EL1 3 0 12 15 3
+Fields ICC_PPI_PRIORITYRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_PRIORITYR12_EL1 3 0 12 15 4
+Fields ICC_PPI_PRIORITYRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_PRIORITYR13_EL1 3 0 12 15 5
+Fields ICC_PPI_PRIORITYRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_PRIORITYR14_EL1 3 0 12 15 6
+Fields ICC_PPI_PRIORITYRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_PRIORITYR15_EL1 3 0 12 15 7
+Fields ICC_PPI_PRIORITYRx_EL1
+EndSysreg
+
Sysreg PMSELR_EL0 3 3 9 12 5
Res0 63:5
Field 4:0 SEL
@@ -3103,6 +3667,19 @@ Res0 14:12
Field 11:0 AFFINITY
EndSysreg
+Sysreg ICC_CR0_EL1 3 1 12 0 1
+Res0 63:39
+Field 38 PID
+Field 37:32 IPPT
+Res0 31:1
+Field 0 EN
+EndSysreg
+
+Sysreg ICC_PCR_EL1 3 1 12 0 2
+Res0 63:5
+Field 4:0 PRIORITY
+EndSysreg
+
Sysreg CSSELR_EL1 3 2 0 0 0
Res0 63:5
Field 4 TnD
@@ -3989,6 +4566,54 @@ Field 31:16 PhyPARTID29
Field 15:0 PhyPARTID28
EndSysreg
+Sysreg ICH_HFGRTR_EL2 3 4 12 9 4
+Res0 63:21
+Field 20 ICC_PPI_ACTIVERn_EL1
+Field 19 ICC_PPI_PRIORITYRn_EL1
+Field 18 ICC_PPI_PENDRn_EL1
+Field 17 ICC_PPI_ENABLERn_EL1
+Field 16 ICC_PPI_HMRn_EL1
+Res0 15:8
+Field 7 ICC_IAFFIDR_EL1
+Field 6 ICC_ICSR_EL1
+Field 5 ICC_PCR_EL1
+Field 4 ICC_HPPIR_EL1
+Field 3 ICC_HAPR_EL1
+Field 2 ICC_CR0_EL1
+Field 1 ICC_IDRn_EL1
+Field 0 ICC_APR_EL1
+EndSysreg
+
+Sysreg ICH_HFGWTR_EL2 3 4 12 9 6
+Res0 63:21
+Field 20 ICC_PPI_ACTIVERn_EL1
+Field 19 ICC_PPI_PRIORITYRn_EL1
+Field 18 ICC_PPI_PENDRn_EL1
+Field 17 ICC_PPI_ENABLERn_EL1
+Res0 16:7
+Field 6 ICC_ICSR_EL1
+Field 5 ICC_PCR_EL1
+Res0 4:3
+Field 2 ICC_CR0_EL1
+Res0 1
+Field 0 ICC_APR_EL1
+EndSysreg
+
+Sysreg ICH_HFGITR_EL2 3 4 12 9 7
+Res0 63:11
+Field 10 GICRCDNMIA
+Field 9 GICRCDIA
+Field 8 GICCDDI
+Field 7 GICCDEOI
+Field 6 GICCDHM
+Field 5 GICCDRCFG
+Field 4 GICCDPEND
+Field 3 GICCDAFF
+Field 2 GICCDPRI
+Field 1 GICCDDIS
+Field 0 GICCDEN
+EndSysreg
+
Sysreg ICH_HCR_EL2 3 4 12 11 0
Res0 63:32
Field 31:27 EOIcount
@@ -4037,6 +4662,12 @@ Field 1 U
Field 0 EOI
EndSysreg
+Sysreg ICH_VCTLR_EL2 3 4 12 11 4
+Res0 63:2
+Field 1 V3
+Field 0 En
+EndSysreg
+
Sysreg CONTEXTIDR_EL2 3 4 13 0 1
Fields CONTEXTIDR_ELx
EndSysreg
@@ -4150,7 +4781,13 @@ Mapping TCR_EL1
EndSysreg
Sysreg TCR2_EL1 3 0 2 0 3
-Res0 63:16
+Res0 63:22
+Field 21 FNGNA1
+Field 20 FNGNA0
+Res0 19
+Field 18 FNG1
+Field 17 FNG0
+Field 16 A2
Field 15 DisCH1
Field 14 DisCH0
Res0 13:12
@@ -4174,7 +4811,10 @@ Mapping TCR2_EL1
EndSysreg
Sysreg TCR2_EL2 3 4 2 0 3
-Res0 63:16
+Res0 63:19
+Field 18 FNG1
+Field 17 FNG0
+Field 16 A2
Field 15 DisCH1
Field 14 DisCH0
Field 13 AMEC1
diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index acc431c331b0..4331313a42ff 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -80,7 +80,6 @@ config CSKY
select HAVE_FUNCTION_TRACER
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_ERROR_INJECTION
- select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_LZO
select HAVE_KERNEL_LZMA
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index fa3eb87f0999..03ee9c2fd3d5 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -143,7 +143,6 @@ config LOONGARCH
select HAVE_EXIT_THREAD
select HAVE_GUP_FAST
select HAVE_FTRACE_GRAPH_FUNC
- select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_ARG_ACCESS_API
select HAVE_FUNCTION_ERROR_INJECTION
select HAVE_FUNCTION_GRAPH_FREGS
diff --git a/arch/loongarch/configs/loongson3_defconfig b/arch/loongarch/configs/loongson3_defconfig
index 0d59af6007b7..68e337aed2bb 100644
--- a/arch/loongarch/configs/loongson3_defconfig
+++ b/arch/loongarch/configs/loongson3_defconfig
@@ -225,7 +225,6 @@ CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
CONFIG_NETFILTER_XT_MATCH_CPU=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m
CONFIG_NETFILTER_XT_MATCH_DSCP=m
CONFIG_NETFILTER_XT_MATCH_ESP=m
diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h
index a3c4cc46c892..0cecbd038bb3 100644
--- a/arch/loongarch/include/asm/kvm_host.h
+++ b/arch/loongarch/include/asm/kvm_host.h
@@ -50,12 +50,6 @@ struct kvm_vm_stat {
struct kvm_vm_stat_generic generic;
u64 pages;
u64 hugepages;
- u64 ipi_read_exits;
- u64 ipi_write_exits;
- u64 eiointc_read_exits;
- u64 eiointc_write_exits;
- u64 pch_pic_read_exits;
- u64 pch_pic_write_exits;
};
struct kvm_vcpu_stat {
@@ -65,6 +59,12 @@ struct kvm_vcpu_stat {
u64 cpucfg_exits;
u64 signal_exits;
u64 hypercall_exits;
+ u64 ipi_read_exits;
+ u64 ipi_write_exits;
+ u64 eiointc_read_exits;
+ u64 eiointc_write_exits;
+ u64 pch_pic_read_exits;
+ u64 pch_pic_write_exits;
};
#define KVM_MEM_HUGEPAGE_CAPABLE (1UL << 0)
diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c
index fa52251b3bf1..2ce41f93b2a4 100644
--- a/arch/loongarch/kvm/exit.c
+++ b/arch/loongarch/kvm/exit.c
@@ -289,9 +289,11 @@ static int kvm_trap_handle_gspr(struct kvm_vcpu *vcpu)
er = EMULATE_FAIL;
switch (((inst.word >> 24) & 0xff)) {
case 0x0: /* CPUCFG GSPR */
+ trace_kvm_exit_cpucfg(vcpu, KVM_TRACE_EXIT_CPUCFG);
er = kvm_emu_cpucfg(vcpu, inst);
break;
case 0x4: /* CSR{RD,WR,XCHG} GSPR */
+ trace_kvm_exit_csr(vcpu, KVM_TRACE_EXIT_CSR);
er = kvm_handle_csr(vcpu, inst);
break;
case 0x6: /* Cache, Idle and IOCSR GSPR */
@@ -821,32 +823,25 @@ static int kvm_handle_lbt_disabled(struct kvm_vcpu *vcpu, int ecode)
return RESUME_GUEST;
}
-static int kvm_send_pv_ipi(struct kvm_vcpu *vcpu)
+static void kvm_send_pv_ipi(struct kvm_vcpu *vcpu)
{
- unsigned int min, cpu, i;
- unsigned long ipi_bitmap;
+ unsigned int min, cpu;
struct kvm_vcpu *dest;
+ DECLARE_BITMAP(ipi_bitmap, BITS_PER_LONG * 2) = {
+ kvm_read_reg(vcpu, LOONGARCH_GPR_A1),
+ kvm_read_reg(vcpu, LOONGARCH_GPR_A2)
+ };
min = kvm_read_reg(vcpu, LOONGARCH_GPR_A3);
- for (i = 0; i < 2; i++, min += BITS_PER_LONG) {
- ipi_bitmap = kvm_read_reg(vcpu, LOONGARCH_GPR_A1 + i);
- if (!ipi_bitmap)
+ for_each_set_bit(cpu, ipi_bitmap, BITS_PER_LONG * 2) {
+ dest = kvm_get_vcpu_by_cpuid(vcpu->kvm, cpu + min);
+ if (!dest)
continue;
- cpu = find_first_bit((void *)&ipi_bitmap, BITS_PER_LONG);
- while (cpu < BITS_PER_LONG) {
- dest = kvm_get_vcpu_by_cpuid(vcpu->kvm, cpu + min);
- cpu = find_next_bit((void *)&ipi_bitmap, BITS_PER_LONG, cpu + 1);
- if (!dest)
- continue;
-
- /* Send SWI0 to dest vcpu to emulate IPI interrupt */
- kvm_queue_irq(dest, INT_SWI0);
- kvm_vcpu_kick(dest);
- }
+ /* Send SWI0 to dest vcpu to emulate IPI interrupt */
+ kvm_queue_irq(dest, INT_SWI0);
+ kvm_vcpu_kick(dest);
}
-
- return 0;
}
/*
diff --git a/arch/loongarch/kvm/intc/eiointc.c b/arch/loongarch/kvm/intc/eiointc.c
index a75f865d6fb9..a3a12af9ecbf 100644
--- a/arch/loongarch/kvm/intc/eiointc.c
+++ b/arch/loongarch/kvm/intc/eiointc.c
@@ -9,7 +9,7 @@
static void eiointc_set_sw_coreisr(struct loongarch_eiointc *s)
{
- int ipnum, cpu, cpuid, irq_index, irq_mask, irq;
+ int ipnum, cpu, cpuid, irq;
struct kvm_vcpu *vcpu;
for (irq = 0; irq < EIOINTC_IRQS; irq++) {
@@ -18,8 +18,6 @@ static void eiointc_set_sw_coreisr(struct loongarch_eiointc *s)
ipnum = count_trailing_zeros(ipnum);
ipnum = (ipnum >= 0 && ipnum < 4) ? ipnum : 0;
}
- irq_index = irq / 32;
- irq_mask = BIT(irq & 0x1f);
cpuid = s->coremap.reg_u8[irq];
vcpu = kvm_get_vcpu_by_cpuid(s->kvm, cpuid);
@@ -27,16 +25,16 @@ static void eiointc_set_sw_coreisr(struct loongarch_eiointc *s)
continue;
cpu = vcpu->vcpu_id;
- if (!!(s->coreisr.reg_u32[cpu][irq_index] & irq_mask))
- set_bit(irq, s->sw_coreisr[cpu][ipnum]);
+ if (test_bit(irq, (unsigned long *)s->coreisr.reg_u32[cpu]))
+ __set_bit(irq, s->sw_coreisr[cpu][ipnum]);
else
- clear_bit(irq, s->sw_coreisr[cpu][ipnum]);
+ __clear_bit(irq, s->sw_coreisr[cpu][ipnum]);
}
}
static void eiointc_update_irq(struct loongarch_eiointc *s, int irq, int level)
{
- int ipnum, cpu, found, irq_index, irq_mask;
+ int ipnum, cpu, found;
struct kvm_vcpu *vcpu;
struct kvm_interrupt vcpu_irq;
@@ -48,19 +46,16 @@ static void eiointc_update_irq(struct loongarch_eiointc *s, int irq, int level)
cpu = s->sw_coremap[irq];
vcpu = kvm_get_vcpu(s->kvm, cpu);
- irq_index = irq / 32;
- irq_mask = BIT(irq & 0x1f);
-
if (level) {
/* if not enable return false */
- if (((s->enable.reg_u32[irq_index]) & irq_mask) == 0)
+ if (!test_bit(irq, (unsigned long *)s->enable.reg_u32))
return;
- s->coreisr.reg_u32[cpu][irq_index] |= irq_mask;
+ __set_bit(irq, (unsigned long *)s->coreisr.reg_u32[cpu]);
found = find_first_bit(s->sw_coreisr[cpu][ipnum], EIOINTC_IRQS);
- set_bit(irq, s->sw_coreisr[cpu][ipnum]);
+ __set_bit(irq, s->sw_coreisr[cpu][ipnum]);
} else {
- s->coreisr.reg_u32[cpu][irq_index] &= ~irq_mask;
- clear_bit(irq, s->sw_coreisr[cpu][ipnum]);
+ __clear_bit(irq, (unsigned long *)s->coreisr.reg_u32[cpu]);
+ __clear_bit(irq, s->sw_coreisr[cpu][ipnum]);
found = find_first_bit(s->sw_coreisr[cpu][ipnum], EIOINTC_IRQS);
}
@@ -110,159 +105,14 @@ void eiointc_set_irq(struct loongarch_eiointc *s, int irq, int level)
unsigned long flags;
unsigned long *isr = (unsigned long *)s->isr.reg_u8;
- level ? set_bit(irq, isr) : clear_bit(irq, isr);
spin_lock_irqsave(&s->lock, flags);
+ level ? __set_bit(irq, isr) : __clear_bit(irq, isr);
eiointc_update_irq(s, irq, level);
spin_unlock_irqrestore(&s->lock, flags);
}
-static inline void eiointc_enable_irq(struct kvm_vcpu *vcpu,
- struct loongarch_eiointc *s, int index, u8 mask, int level)
-{
- u8 val;
- int irq;
-
- val = mask & s->isr.reg_u8[index];
- irq = ffs(val);
- while (irq != 0) {
- /*
- * enable bit change from 0 to 1,
- * need to update irq by pending bits
- */
- eiointc_update_irq(s, irq - 1 + index * 8, level);
- val &= ~BIT(irq - 1);
- irq = ffs(val);
- }
-}
-
-static int loongarch_eiointc_readb(struct kvm_vcpu *vcpu, struct loongarch_eiointc *s,
- gpa_t addr, int len, void *val)
-{
- int index, ret = 0;
- u8 data = 0;
- gpa_t offset;
-
- offset = addr - EIOINTC_BASE;
- switch (offset) {
- case EIOINTC_NODETYPE_START ... EIOINTC_NODETYPE_END:
- index = offset - EIOINTC_NODETYPE_START;
- data = s->nodetype.reg_u8[index];
- break;
- case EIOINTC_IPMAP_START ... EIOINTC_IPMAP_END:
- index = offset - EIOINTC_IPMAP_START;
- data = s->ipmap.reg_u8[index];
- break;
- case EIOINTC_ENABLE_START ... EIOINTC_ENABLE_END:
- index = offset - EIOINTC_ENABLE_START;
- data = s->enable.reg_u8[index];
- break;
- case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END:
- index = offset - EIOINTC_BOUNCE_START;
- data = s->bounce.reg_u8[index];
- break;
- case EIOINTC_COREISR_START ... EIOINTC_COREISR_END:
- index = offset - EIOINTC_COREISR_START;
- data = s->coreisr.reg_u8[vcpu->vcpu_id][index];
- break;
- case EIOINTC_COREMAP_START ... EIOINTC_COREMAP_END:
- index = offset - EIOINTC_COREMAP_START;
- data = s->coremap.reg_u8[index];
- break;
- default:
- ret = -EINVAL;
- break;
- }
- *(u8 *)val = data;
-
- return ret;
-}
-
-static int loongarch_eiointc_readw(struct kvm_vcpu *vcpu, struct loongarch_eiointc *s,
- gpa_t addr, int len, void *val)
-{
- int index, ret = 0;
- u16 data = 0;
- gpa_t offset;
-
- offset = addr - EIOINTC_BASE;
- switch (offset) {
- case EIOINTC_NODETYPE_START ... EIOINTC_NODETYPE_END:
- index = (offset - EIOINTC_NODETYPE_START) >> 1;
- data = s->nodetype.reg_u16[index];
- break;
- case EIOINTC_IPMAP_START ... EIOINTC_IPMAP_END:
- index = (offset - EIOINTC_IPMAP_START) >> 1;
- data = s->ipmap.reg_u16[index];
- break;
- case EIOINTC_ENABLE_START ... EIOINTC_ENABLE_END:
- index = (offset - EIOINTC_ENABLE_START) >> 1;
- data = s->enable.reg_u16[index];
- break;
- case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END:
- index = (offset - EIOINTC_BOUNCE_START) >> 1;
- data = s->bounce.reg_u16[index];
- break;
- case EIOINTC_COREISR_START ... EIOINTC_COREISR_END:
- index = (offset - EIOINTC_COREISR_START) >> 1;
- data = s->coreisr.reg_u16[vcpu->vcpu_id][index];
- break;
- case EIOINTC_COREMAP_START ... EIOINTC_COREMAP_END:
- index = (offset - EIOINTC_COREMAP_START) >> 1;
- data = s->coremap.reg_u16[index];
- break;
- default:
- ret = -EINVAL;
- break;
- }
- *(u16 *)val = data;
-
- return ret;
-}
-
-static int loongarch_eiointc_readl(struct kvm_vcpu *vcpu, struct loongarch_eiointc *s,
- gpa_t addr, int len, void *val)
-{
- int index, ret = 0;
- u32 data = 0;
- gpa_t offset;
-
- offset = addr - EIOINTC_BASE;
- switch (offset) {
- case EIOINTC_NODETYPE_START ... EIOINTC_NODETYPE_END:
- index = (offset - EIOINTC_NODETYPE_START) >> 2;
- data = s->nodetype.reg_u32[index];
- break;
- case EIOINTC_IPMAP_START ... EIOINTC_IPMAP_END:
- index = (offset - EIOINTC_IPMAP_START) >> 2;
- data = s->ipmap.reg_u32[index];
- break;
- case EIOINTC_ENABLE_START ... EIOINTC_ENABLE_END:
- index = (offset - EIOINTC_ENABLE_START) >> 2;
- data = s->enable.reg_u32[index];
- break;
- case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END:
- index = (offset - EIOINTC_BOUNCE_START) >> 2;
- data = s->bounce.reg_u32[index];
- break;
- case EIOINTC_COREISR_START ... EIOINTC_COREISR_END:
- index = (offset - EIOINTC_COREISR_START) >> 2;
- data = s->coreisr.reg_u32[vcpu->vcpu_id][index];
- break;
- case EIOINTC_COREMAP_START ... EIOINTC_COREMAP_END:
- index = (offset - EIOINTC_COREMAP_START) >> 2;
- data = s->coremap.reg_u32[index];
- break;
- default:
- ret = -EINVAL;
- break;
- }
- *(u32 *)val = data;
-
- return ret;
-}
-
-static int loongarch_eiointc_readq(struct kvm_vcpu *vcpu, struct loongarch_eiointc *s,
- gpa_t addr, int len, void *val)
+static int loongarch_eiointc_read(struct kvm_vcpu *vcpu, struct loongarch_eiointc *s,
+ gpa_t addr, unsigned long *val)
{
int index, ret = 0;
u64 data = 0;
@@ -298,7 +148,7 @@ static int loongarch_eiointc_readq(struct kvm_vcpu *vcpu, struct loongarch_eioin
ret = -EINVAL;
break;
}
- *(u64 *)val = data;
+ *val = data;
return ret;
}
@@ -308,7 +158,7 @@ static int kvm_eiointc_read(struct kvm_vcpu *vcpu,
gpa_t addr, int len, void *val)
{
int ret = -EINVAL;
- unsigned long flags;
+ unsigned long flags, data, offset;
struct loongarch_eiointc *eiointc = vcpu->kvm->arch.eiointc;
if (!eiointc) {
@@ -321,355 +171,115 @@ static int kvm_eiointc_read(struct kvm_vcpu *vcpu,
return -EINVAL;
}
- vcpu->kvm->stat.eiointc_read_exits++;
+ offset = addr & 0x7;
+ addr -= offset;
+ vcpu->stat.eiointc_read_exits++;
spin_lock_irqsave(&eiointc->lock, flags);
+ ret = loongarch_eiointc_read(vcpu, eiointc, addr, &data);
+ spin_unlock_irqrestore(&eiointc->lock, flags);
+ if (ret)
+ return ret;
+
+ data = data >> (offset * 8);
switch (len) {
case 1:
- ret = loongarch_eiointc_readb(vcpu, eiointc, addr, len, val);
+ *(long *)val = (s8)data;
break;
case 2:
- ret = loongarch_eiointc_readw(vcpu, eiointc, addr, len, val);
+ *(long *)val = (s16)data;
break;
case 4:
- ret = loongarch_eiointc_readl(vcpu, eiointc, addr, len, val);
- break;
- case 8:
- ret = loongarch_eiointc_readq(vcpu, eiointc, addr, len, val);
+ *(long *)val = (s32)data;
break;
default:
- WARN_ONCE(1, "%s: Abnormal address access: addr 0x%llx, size %d\n",
- __func__, addr, len);
- }
- spin_unlock_irqrestore(&eiointc->lock, flags);
-
- return ret;
-}
-
-static int loongarch_eiointc_writeb(struct kvm_vcpu *vcpu,
- struct loongarch_eiointc *s,
- gpa_t addr, int len, const void *val)
-{
- int index, irq, bits, ret = 0;
- u8 cpu;
- u8 data, old_data;
- u8 coreisr, old_coreisr;
- gpa_t offset;
-
- data = *(u8 *)val;
- offset = addr - EIOINTC_BASE;
-
- switch (offset) {
- case EIOINTC_NODETYPE_START ... EIOINTC_NODETYPE_END:
- index = (offset - EIOINTC_NODETYPE_START);
- s->nodetype.reg_u8[index] = data;
- break;
- case EIOINTC_IPMAP_START ... EIOINTC_IPMAP_END:
- /*
- * ipmap cannot be set at runtime, can be set only at the beginning
- * of irqchip driver, need not update upper irq level
- */
- index = (offset - EIOINTC_IPMAP_START);
- s->ipmap.reg_u8[index] = data;
- break;
- case EIOINTC_ENABLE_START ... EIOINTC_ENABLE_END:
- index = (offset - EIOINTC_ENABLE_START);
- old_data = s->enable.reg_u8[index];
- s->enable.reg_u8[index] = data;
- /*
- * 1: enable irq.
- * update irq when isr is set.
- */
- data = s->enable.reg_u8[index] & ~old_data & s->isr.reg_u8[index];
- eiointc_enable_irq(vcpu, s, index, data, 1);
- /*
- * 0: disable irq.
- * update irq when isr is set.
- */
- data = ~s->enable.reg_u8[index] & old_data & s->isr.reg_u8[index];
- eiointc_enable_irq(vcpu, s, index, data, 0);
- break;
- case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END:
- /* do not emulate hw bounced irq routing */
- index = offset - EIOINTC_BOUNCE_START;
- s->bounce.reg_u8[index] = data;
- break;
- case EIOINTC_COREISR_START ... EIOINTC_COREISR_END:
- index = (offset - EIOINTC_COREISR_START);
- /* use attrs to get current cpu index */
- cpu = vcpu->vcpu_id;
- coreisr = data;
- old_coreisr = s->coreisr.reg_u8[cpu][index];
- /* write 1 to clear interrupt */
- s->coreisr.reg_u8[cpu][index] = old_coreisr & ~coreisr;
- coreisr &= old_coreisr;
- bits = sizeof(data) * 8;
- irq = find_first_bit((void *)&coreisr, bits);
- while (irq < bits) {
- eiointc_update_irq(s, irq + index * bits, 0);
- bitmap_clear((void *)&coreisr, irq, 1);
- irq = find_first_bit((void *)&coreisr, bits);
- }
- break;
- case EIOINTC_COREMAP_START ... EIOINTC_COREMAP_END:
- irq = offset - EIOINTC_COREMAP_START;
- index = irq;
- s->coremap.reg_u8[index] = data;
- eiointc_update_sw_coremap(s, irq, data, sizeof(data), true);
- break;
- default:
- ret = -EINVAL;
- break;
- }
-
- return ret;
-}
-
-static int loongarch_eiointc_writew(struct kvm_vcpu *vcpu,
- struct loongarch_eiointc *s,
- gpa_t addr, int len, const void *val)
-{
- int i, index, irq, bits, ret = 0;
- u8 cpu;
- u16 data, old_data;
- u16 coreisr, old_coreisr;
- gpa_t offset;
-
- data = *(u16 *)val;
- offset = addr - EIOINTC_BASE;
-
- switch (offset) {
- case EIOINTC_NODETYPE_START ... EIOINTC_NODETYPE_END:
- index = (offset - EIOINTC_NODETYPE_START) >> 1;
- s->nodetype.reg_u16[index] = data;
- break;
- case EIOINTC_IPMAP_START ... EIOINTC_IPMAP_END:
- /*
- * ipmap cannot be set at runtime, can be set only at the beginning
- * of irqchip driver, need not update upper irq level
- */
- index = (offset - EIOINTC_IPMAP_START) >> 1;
- s->ipmap.reg_u16[index] = data;
- break;
- case EIOINTC_ENABLE_START ... EIOINTC_ENABLE_END:
- index = (offset - EIOINTC_ENABLE_START) >> 1;
- old_data = s->enable.reg_u16[index];
- s->enable.reg_u16[index] = data;
- /*
- * 1: enable irq.
- * update irq when isr is set.
- */
- data = s->enable.reg_u16[index] & ~old_data & s->isr.reg_u16[index];
- for (i = 0; i < sizeof(data); i++) {
- u8 mask = (data >> (i * 8)) & 0xff;
- eiointc_enable_irq(vcpu, s, index * 2 + i, mask, 1);
- }
- /*
- * 0: disable irq.
- * update irq when isr is set.
- */
- data = ~s->enable.reg_u16[index] & old_data & s->isr.reg_u16[index];
- for (i = 0; i < sizeof(data); i++) {
- u8 mask = (data >> (i * 8)) & 0xff;
- eiointc_enable_irq(vcpu, s, index * 2 + i, mask, 0);
- }
- break;
- case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END:
- /* do not emulate hw bounced irq routing */
- index = (offset - EIOINTC_BOUNCE_START) >> 1;
- s->bounce.reg_u16[index] = data;
- break;
- case EIOINTC_COREISR_START ... EIOINTC_COREISR_END:
- index = (offset - EIOINTC_COREISR_START) >> 1;
- /* use attrs to get current cpu index */
- cpu = vcpu->vcpu_id;
- coreisr = data;
- old_coreisr = s->coreisr.reg_u16[cpu][index];
- /* write 1 to clear interrupt */
- s->coreisr.reg_u16[cpu][index] = old_coreisr & ~coreisr;
- coreisr &= old_coreisr;
- bits = sizeof(data) * 8;
- irq = find_first_bit((void *)&coreisr, bits);
- while (irq < bits) {
- eiointc_update_irq(s, irq + index * bits, 0);
- bitmap_clear((void *)&coreisr, irq, 1);
- irq = find_first_bit((void *)&coreisr, bits);
- }
- break;
- case EIOINTC_COREMAP_START ... EIOINTC_COREMAP_END:
- irq = offset - EIOINTC_COREMAP_START;
- index = irq >> 1;
- s->coremap.reg_u16[index] = data;
- eiointc_update_sw_coremap(s, irq, data, sizeof(data), true);
- break;
- default:
- ret = -EINVAL;
+ *(long *)val = (long)data;
break;
}
- return ret;
+ return 0;
}
-static int loongarch_eiointc_writel(struct kvm_vcpu *vcpu,
+static int loongarch_eiointc_write(struct kvm_vcpu *vcpu,
struct loongarch_eiointc *s,
- gpa_t addr, int len, const void *val)
+ gpa_t addr, u64 value, u64 field_mask)
{
- int i, index, irq, bits, ret = 0;
+ int index, irq, ret = 0;
u8 cpu;
- u32 data, old_data;
- u32 coreisr, old_coreisr;
+ u64 data, old, mask;
gpa_t offset;
- data = *(u32 *)val;
- offset = addr - EIOINTC_BASE;
+ offset = addr & 7;
+ mask = field_mask << (offset * 8);
+ data = (value & field_mask) << (offset * 8);
- switch (offset) {
- case EIOINTC_NODETYPE_START ... EIOINTC_NODETYPE_END:
- index = (offset - EIOINTC_NODETYPE_START) >> 2;
- s->nodetype.reg_u32[index] = data;
- break;
- case EIOINTC_IPMAP_START ... EIOINTC_IPMAP_END:
- /*
- * ipmap cannot be set at runtime, can be set only at the beginning
- * of irqchip driver, need not update upper irq level
- */
- index = (offset - EIOINTC_IPMAP_START) >> 2;
- s->ipmap.reg_u32[index] = data;
- break;
- case EIOINTC_ENABLE_START ... EIOINTC_ENABLE_END:
- index = (offset - EIOINTC_ENABLE_START) >> 2;
- old_data = s->enable.reg_u32[index];
- s->enable.reg_u32[index] = data;
- /*
- * 1: enable irq.
- * update irq when isr is set.
- */
- data = s->enable.reg_u32[index] & ~old_data & s->isr.reg_u32[index];
- for (i = 0; i < sizeof(data); i++) {
- u8 mask = (data >> (i * 8)) & 0xff;
- eiointc_enable_irq(vcpu, s, index * 4 + i, mask, 1);
- }
- /*
- * 0: disable irq.
- * update irq when isr is set.
- */
- data = ~s->enable.reg_u32[index] & old_data & s->isr.reg_u32[index];
- for (i = 0; i < sizeof(data); i++) {
- u8 mask = (data >> (i * 8)) & 0xff;
- eiointc_enable_irq(vcpu, s, index * 4 + i, mask, 0);
- }
- break;
- case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END:
- /* do not emulate hw bounced irq routing */
- index = (offset - EIOINTC_BOUNCE_START) >> 2;
- s->bounce.reg_u32[index] = data;
- break;
- case EIOINTC_COREISR_START ... EIOINTC_COREISR_END:
- index = (offset - EIOINTC_COREISR_START) >> 2;
- /* use attrs to get current cpu index */
- cpu = vcpu->vcpu_id;
- coreisr = data;
- old_coreisr = s->coreisr.reg_u32[cpu][index];
- /* write 1 to clear interrupt */
- s->coreisr.reg_u32[cpu][index] = old_coreisr & ~coreisr;
- coreisr &= old_coreisr;
- bits = sizeof(data) * 8;
- irq = find_first_bit((void *)&coreisr, bits);
- while (irq < bits) {
- eiointc_update_irq(s, irq + index * bits, 0);
- bitmap_clear((void *)&coreisr, irq, 1);
- irq = find_first_bit((void *)&coreisr, bits);
- }
- break;
- case EIOINTC_COREMAP_START ... EIOINTC_COREMAP_END:
- irq = offset - EIOINTC_COREMAP_START;
- index = irq >> 2;
- s->coremap.reg_u32[index] = data;
- eiointc_update_sw_coremap(s, irq, data, sizeof(data), true);
- break;
- default:
- ret = -EINVAL;
- break;
- }
-
- return ret;
-}
-
-static int loongarch_eiointc_writeq(struct kvm_vcpu *vcpu,
- struct loongarch_eiointc *s,
- gpa_t addr, int len, const void *val)
-{
- int i, index, irq, bits, ret = 0;
- u8 cpu;
- u64 data, old_data;
- u64 coreisr, old_coreisr;
- gpa_t offset;
-
- data = *(u64 *)val;
+ addr -= offset;
offset = addr - EIOINTC_BASE;
switch (offset) {
case EIOINTC_NODETYPE_START ... EIOINTC_NODETYPE_END:
index = (offset - EIOINTC_NODETYPE_START) >> 3;
- s->nodetype.reg_u64[index] = data;
+ old = s->nodetype.reg_u64[index];
+ s->nodetype.reg_u64[index] = (old & ~mask) | data;
break;
case EIOINTC_IPMAP_START ... EIOINTC_IPMAP_END:
/*
* ipmap cannot be set at runtime, can be set only at the beginning
* of irqchip driver, need not update upper irq level
*/
- index = (offset - EIOINTC_IPMAP_START) >> 3;
- s->ipmap.reg_u64 = data;
+ old = s->ipmap.reg_u64;
+ s->ipmap.reg_u64 = (old & ~mask) | data;
break;
case EIOINTC_ENABLE_START ... EIOINTC_ENABLE_END:
index = (offset - EIOINTC_ENABLE_START) >> 3;
- old_data = s->enable.reg_u64[index];
- s->enable.reg_u64[index] = data;
+ old = s->enable.reg_u64[index];
+ s->enable.reg_u64[index] = (old & ~mask) | data;
/*
* 1: enable irq.
* update irq when isr is set.
*/
- data = s->enable.reg_u64[index] & ~old_data & s->isr.reg_u64[index];
- for (i = 0; i < sizeof(data); i++) {
- u8 mask = (data >> (i * 8)) & 0xff;
- eiointc_enable_irq(vcpu, s, index * 8 + i, mask, 1);
+ data = s->enable.reg_u64[index] & ~old & s->isr.reg_u64[index];
+ while (data) {
+ irq = __ffs(data);
+ eiointc_update_irq(s, irq + index * 64, 1);
+ data &= ~BIT_ULL(irq);
}
/*
* 0: disable irq.
* update irq when isr is set.
*/
- data = ~s->enable.reg_u64[index] & old_data & s->isr.reg_u64[index];
- for (i = 0; i < sizeof(data); i++) {
- u8 mask = (data >> (i * 8)) & 0xff;
- eiointc_enable_irq(vcpu, s, index * 8 + i, mask, 0);
+ data = ~s->enable.reg_u64[index] & old & s->isr.reg_u64[index];
+ while (data) {
+ irq = __ffs(data);
+ eiointc_update_irq(s, irq + index * 64, 0);
+ data &= ~BIT_ULL(irq);
}
break;
case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END:
/* do not emulate hw bounced irq routing */
index = (offset - EIOINTC_BOUNCE_START) >> 3;
- s->bounce.reg_u64[index] = data;
+ old = s->bounce.reg_u64[index];
+ s->bounce.reg_u64[index] = (old & ~mask) | data;
break;
case EIOINTC_COREISR_START ... EIOINTC_COREISR_END:
index = (offset - EIOINTC_COREISR_START) >> 3;
/* use attrs to get current cpu index */
cpu = vcpu->vcpu_id;
- coreisr = data;
- old_coreisr = s->coreisr.reg_u64[cpu][index];
+ old = s->coreisr.reg_u64[cpu][index];
/* write 1 to clear interrupt */
- s->coreisr.reg_u64[cpu][index] = old_coreisr & ~coreisr;
- coreisr &= old_coreisr;
- bits = sizeof(data) * 8;
- irq = find_first_bit((void *)&coreisr, bits);
- while (irq < bits) {
- eiointc_update_irq(s, irq + index * bits, 0);
- bitmap_clear((void *)&coreisr, irq, 1);
- irq = find_first_bit((void *)&coreisr, bits);
+ s->coreisr.reg_u64[cpu][index] = old & ~data;
+ data &= old;
+ while (data) {
+ irq = __ffs(data);
+ eiointc_update_irq(s, irq + index * 64, 0);
+ data &= ~BIT_ULL(irq);
}
break;
case EIOINTC_COREMAP_START ... EIOINTC_COREMAP_END:
- irq = offset - EIOINTC_COREMAP_START;
- index = irq >> 3;
- s->coremap.reg_u64[index] = data;
- eiointc_update_sw_coremap(s, irq, data, sizeof(data), true);
+ index = (offset - EIOINTC_COREMAP_START) >> 3;
+ old = s->coremap.reg_u64[index];
+ s->coremap.reg_u64[index] = (old & ~mask) | data;
+ data = s->coremap.reg_u64[index];
+ eiointc_update_sw_coremap(s, index * 8, data, sizeof(data), true);
break;
default:
ret = -EINVAL;
@@ -684,7 +294,7 @@ static int kvm_eiointc_write(struct kvm_vcpu *vcpu,
gpa_t addr, int len, const void *val)
{
int ret = -EINVAL;
- unsigned long flags;
+ unsigned long flags, value;
struct loongarch_eiointc *eiointc = vcpu->kvm->arch.eiointc;
if (!eiointc) {
@@ -697,24 +307,25 @@ static int kvm_eiointc_write(struct kvm_vcpu *vcpu,
return -EINVAL;
}
- vcpu->kvm->stat.eiointc_write_exits++;
+ vcpu->stat.eiointc_write_exits++;
spin_lock_irqsave(&eiointc->lock, flags);
switch (len) {
case 1:
- ret = loongarch_eiointc_writeb(vcpu, eiointc, addr, len, val);
+ value = *(unsigned char *)val;
+ ret = loongarch_eiointc_write(vcpu, eiointc, addr, value, 0xFF);
break;
case 2:
- ret = loongarch_eiointc_writew(vcpu, eiointc, addr, len, val);
+ value = *(unsigned short *)val;
+ ret = loongarch_eiointc_write(vcpu, eiointc, addr, value, USHRT_MAX);
break;
case 4:
- ret = loongarch_eiointc_writel(vcpu, eiointc, addr, len, val);
- break;
- case 8:
- ret = loongarch_eiointc_writeq(vcpu, eiointc, addr, len, val);
+ value = *(unsigned int *)val;
+ ret = loongarch_eiointc_write(vcpu, eiointc, addr, value, UINT_MAX);
break;
default:
- WARN_ONCE(1, "%s: Abnormal address access: addr 0x%llx, size %d\n",
- __func__, addr, len);
+ value = *(unsigned long *)val;
+ ret = loongarch_eiointc_write(vcpu, eiointc, addr, value, ULONG_MAX);
+ break;
}
spin_unlock_irqrestore(&eiointc->lock, flags);
@@ -989,7 +600,7 @@ static int kvm_eiointc_create(struct kvm_device *dev, u32 type)
{
int ret;
struct loongarch_eiointc *s;
- struct kvm_io_device *device, *device1;
+ struct kvm_io_device *device;
struct kvm *kvm = dev->kvm;
/* eiointc has been created */
@@ -1017,10 +628,10 @@ static int kvm_eiointc_create(struct kvm_device *dev, u32 type)
return ret;
}
- device1 = &s->device_vext;
- kvm_iodevice_init(device1, &kvm_eiointc_virt_ops);
+ device = &s->device_vext;
+ kvm_iodevice_init(device, &kvm_eiointc_virt_ops);
ret = kvm_io_bus_register_dev(kvm, KVM_IOCSR_BUS,
- EIOINTC_VIRT_BASE, EIOINTC_VIRT_SIZE, device1);
+ EIOINTC_VIRT_BASE, EIOINTC_VIRT_SIZE, device);
if (ret < 0) {
kvm_io_bus_unregister_dev(kvm, KVM_IOCSR_BUS, &s->device);
kfree(s);
diff --git a/arch/loongarch/kvm/intc/ipi.c b/arch/loongarch/kvm/intc/ipi.c
index fe734dc062ed..e658d5b37c04 100644
--- a/arch/loongarch/kvm/intc/ipi.c
+++ b/arch/loongarch/kvm/intc/ipi.c
@@ -268,36 +268,16 @@ static int kvm_ipi_read(struct kvm_vcpu *vcpu,
struct kvm_io_device *dev,
gpa_t addr, int len, void *val)
{
- int ret;
- struct loongarch_ipi *ipi;
-
- ipi = vcpu->kvm->arch.ipi;
- if (!ipi) {
- kvm_err("%s: ipi irqchip not valid!\n", __func__);
- return -EINVAL;
- }
- ipi->kvm->stat.ipi_read_exits++;
- ret = loongarch_ipi_readl(vcpu, addr, len, val);
-
- return ret;
+ vcpu->stat.ipi_read_exits++;
+ return loongarch_ipi_readl(vcpu, addr, len, val);
}
static int kvm_ipi_write(struct kvm_vcpu *vcpu,
struct kvm_io_device *dev,
gpa_t addr, int len, const void *val)
{
- int ret;
- struct loongarch_ipi *ipi;
-
- ipi = vcpu->kvm->arch.ipi;
- if (!ipi) {
- kvm_err("%s: ipi irqchip not valid!\n", __func__);
- return -EINVAL;
- }
- ipi->kvm->stat.ipi_write_exits++;
- ret = loongarch_ipi_writel(vcpu, addr, len, val);
-
- return ret;
+ vcpu->stat.ipi_write_exits++;
+ return loongarch_ipi_writel(vcpu, addr, len, val);
}
static const struct kvm_io_device_ops kvm_ipi_ops = {
diff --git a/arch/loongarch/kvm/intc/pch_pic.c b/arch/loongarch/kvm/intc/pch_pic.c
index 08fce845f668..6f00ffe05c54 100644
--- a/arch/loongarch/kvm/intc/pch_pic.c
+++ b/arch/loongarch/kvm/intc/pch_pic.c
@@ -196,7 +196,7 @@ static int kvm_pch_pic_read(struct kvm_vcpu *vcpu,
}
/* statistics of pch pic reading */
- vcpu->kvm->stat.pch_pic_read_exits++;
+ vcpu->stat.pch_pic_read_exits++;
ret = loongarch_pch_pic_read(s, addr, len, val);
return ret;
@@ -303,7 +303,7 @@ static int kvm_pch_pic_write(struct kvm_vcpu *vcpu,
}
/* statistics of pch pic writing */
- vcpu->kvm->stat.pch_pic_write_exits++;
+ vcpu->stat.pch_pic_write_exits++;
ret = loongarch_pch_pic_write(s, addr, len, val);
return ret;
diff --git a/arch/loongarch/kvm/interrupt.c b/arch/loongarch/kvm/interrupt.c
index 4c3f22de4b40..8462083f0301 100644
--- a/arch/loongarch/kvm/interrupt.c
+++ b/arch/loongarch/kvm/interrupt.c
@@ -83,28 +83,11 @@ void kvm_deliver_intr(struct kvm_vcpu *vcpu)
unsigned long *pending = &vcpu->arch.irq_pending;
unsigned long *pending_clr = &vcpu->arch.irq_clear;
- if (!(*pending) && !(*pending_clr))
- return;
-
- if (*pending_clr) {
- priority = __ffs(*pending_clr);
- while (priority <= INT_IPI) {
- kvm_irq_clear(vcpu, priority);
- priority = find_next_bit(pending_clr,
- BITS_PER_BYTE * sizeof(*pending_clr),
- priority + 1);
- }
- }
+ for_each_set_bit(priority, pending_clr, INT_IPI + 1)
+ kvm_irq_clear(vcpu, priority);
- if (*pending) {
- priority = __ffs(*pending);
- while (priority <= INT_IPI) {
- kvm_irq_deliver(vcpu, priority);
- priority = find_next_bit(pending,
- BITS_PER_BYTE * sizeof(*pending),
- priority + 1);
- }
- }
+ for_each_set_bit(priority, pending, INT_IPI + 1)
+ kvm_irq_deliver(vcpu, priority);
}
int kvm_pending_timer(struct kvm_vcpu *vcpu)
diff --git a/arch/loongarch/kvm/trace.h b/arch/loongarch/kvm/trace.h
index 1783397b1bc8..145514dab6d5 100644
--- a/arch/loongarch/kvm/trace.h
+++ b/arch/loongarch/kvm/trace.h
@@ -46,11 +46,15 @@ DEFINE_EVENT(kvm_transition, kvm_out,
/* Further exit reasons */
#define KVM_TRACE_EXIT_IDLE 64
#define KVM_TRACE_EXIT_CACHE 65
+#define KVM_TRACE_EXIT_CPUCFG 66
+#define KVM_TRACE_EXIT_CSR 67
/* Tracepoints for VM exits */
#define kvm_trace_symbol_exit_types \
{ KVM_TRACE_EXIT_IDLE, "IDLE" }, \
- { KVM_TRACE_EXIT_CACHE, "CACHE" }
+ { KVM_TRACE_EXIT_CACHE, "CACHE" }, \
+ { KVM_TRACE_EXIT_CPUCFG, "CPUCFG" }, \
+ { KVM_TRACE_EXIT_CSR, "CSR" }
DECLARE_EVENT_CLASS(kvm_exit,
TP_PROTO(struct kvm_vcpu *vcpu, unsigned int reason),
@@ -82,6 +86,14 @@ DEFINE_EVENT(kvm_exit, kvm_exit_cache,
TP_PROTO(struct kvm_vcpu *vcpu, unsigned int reason),
TP_ARGS(vcpu, reason));
+DEFINE_EVENT(kvm_exit, kvm_exit_cpucfg,
+ TP_PROTO(struct kvm_vcpu *vcpu, unsigned int reason),
+ TP_ARGS(vcpu, reason));
+
+DEFINE_EVENT(kvm_exit, kvm_exit_csr,
+ TP_PROTO(struct kvm_vcpu *vcpu, unsigned int reason),
+ TP_ARGS(vcpu, reason));
+
DEFINE_EVENT(kvm_exit, kvm_exit,
TP_PROTO(struct kvm_vcpu *vcpu, unsigned int reason),
TP_ARGS(vcpu, reason));
diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c
index 5af32ec62cb1..d1b8c50941ca 100644
--- a/arch/loongarch/kvm/vcpu.c
+++ b/arch/loongarch/kvm/vcpu.c
@@ -20,7 +20,13 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
STATS_DESC_COUNTER(VCPU, idle_exits),
STATS_DESC_COUNTER(VCPU, cpucfg_exits),
STATS_DESC_COUNTER(VCPU, signal_exits),
- STATS_DESC_COUNTER(VCPU, hypercall_exits)
+ STATS_DESC_COUNTER(VCPU, hypercall_exits),
+ STATS_DESC_COUNTER(VCPU, ipi_read_exits),
+ STATS_DESC_COUNTER(VCPU, ipi_write_exits),
+ STATS_DESC_COUNTER(VCPU, eiointc_read_exits),
+ STATS_DESC_COUNTER(VCPU, eiointc_write_exits),
+ STATS_DESC_COUNTER(VCPU, pch_pic_read_exits),
+ STATS_DESC_COUNTER(VCPU, pch_pic_write_exits)
};
const struct kvm_stats_header kvm_vcpu_stats_header = {
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index eb5bb6d36899..11835eb59d94 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -32,6 +32,7 @@ config M68K
select HAVE_ASM_MODVERSIONS
select HAVE_DEBUG_BUGVERBOSE
select HAVE_EFFICIENT_UNALIGNED_ACCESS if !CPU_HAS_NO_UNALIGNED
+ select HAVE_LD_DEAD_CODE_DATA_ELIMINATION
select HAVE_MOD_ARCH_SPECIFIC
select HAVE_UID16
select MMU_GATHER_NO_RANGE if MMU
diff --git a/arch/m68k/Kconfig.debug b/arch/m68k/Kconfig.debug
index 30638a6e8edc..d036f903864c 100644
--- a/arch/m68k/Kconfig.debug
+++ b/arch/m68k/Kconfig.debug
@@ -10,7 +10,7 @@ config BOOTPARAM_STRING
config EARLY_PRINTK
bool "Early printk"
- depends on !(SUN3 || M68000 || COLDFIRE)
+ depends on MMU_MOTOROLA
help
Write kernel log output directly to a serial port.
Where implemented, output goes to the framebuffer as well.
diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig
index d05690289e33..5171bb183967 100644
--- a/arch/m68k/configs/amiga_defconfig
+++ b/arch/m68k/configs/amiga_defconfig
@@ -85,7 +85,6 @@ CONFIG_NETFILTER=y
CONFIG_NETFILTER_NETLINK_HOOK=m
CONFIG_NF_CONNTRACK=m
CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
CONFIG_NF_CONNTRACK_AMANDA=m
CONFIG_NF_CONNTRACK_FTP=m
CONFIG_NF_CONNTRACK_H323=m
@@ -267,6 +266,7 @@ CONFIG_BRIDGE_EBT_REDIRECT=m
CONFIG_BRIDGE_EBT_SNAT=m
CONFIG_BRIDGE_EBT_LOG=m
CONFIG_BRIDGE_EBT_NFLOG=m
+CONFIG_IP_SCTP=m
CONFIG_SCTP_COOKIE_HMAC_SHA1=y
CONFIG_RDS=m
CONFIG_RDS_TCP=m
@@ -356,6 +356,7 @@ CONFIG_TCM_PSCSI=m
CONFIG_NETDEVICES=y
CONFIG_DUMMY=m
CONFIG_WIREGUARD=m
+CONFIG_OVPN=m
CONFIG_EQUALIZER=m
CONFIG_NET_TEAM=m
CONFIG_NET_TEAM_MODE_BROADCAST=m
@@ -375,6 +376,7 @@ CONFIG_PFCP=m
CONFIG_MACSEC=m
CONFIG_NETCONSOLE=m
CONFIG_NETCONSOLE_DYNAMIC=y
+CONFIG_TUN=m
CONFIG_VETH=m
CONFIG_A2065=y
CONFIG_ARIADNE=y
@@ -448,8 +450,10 @@ CONFIG_RTC_DRV_RP5C01=m
CONFIG_DAX=m
CONFIG_EXT4_FS=y
CONFIG_JFS_FS=m
+CONFIG_XFS_FS=m
CONFIG_OCFS2_FS=m
# CONFIG_OCFS2_DEBUG_MASKLOG is not set
+CONFIG_BTRFS_FS=m
CONFIG_BCACHEFS_FS=m
CONFIG_FANOTIFY=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
@@ -548,6 +552,7 @@ CONFIG_DLM=m
CONFIG_ENCRYPTED_KEYS=m
CONFIG_HARDENED_USERCOPY=y
CONFIG_CRYPTO_USER=m
+CONFIG_CRYPTO_NULL=m
CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_BENCHMARK=m
CONFIG_CRYPTO_RSA=m
@@ -580,7 +585,6 @@ CONFIG_CRYPTO_LRW=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_XTS=m
CONFIG_CRYPTO_AEGIS128=m
-CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
@@ -600,6 +604,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
# CONFIG_CRYPTO_HW is not set
+CONFIG_PRIME_NUMBERS=m
CONFIG_XZ_DEC_TEST=m
CONFIG_GLOB_SELFTEST=m
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
@@ -631,6 +636,7 @@ CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
CONFIG_LINEAR_RANGES_TEST=m
+CONFIG_CRC_BENCHMARK=y
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig
index a1747fbe23fb..16f343ae48c6 100644
--- a/arch/m68k/configs/apollo_defconfig
+++ b/arch/m68k/configs/apollo_defconfig
@@ -81,7 +81,6 @@ CONFIG_NETFILTER=y
CONFIG_NETFILTER_NETLINK_HOOK=m
CONFIG_NF_CONNTRACK=m
CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
CONFIG_NF_CONNTRACK_AMANDA=m
CONFIG_NF_CONNTRACK_FTP=m
CONFIG_NF_CONNTRACK_H323=m
@@ -263,6 +262,7 @@ CONFIG_BRIDGE_EBT_REDIRECT=m
CONFIG_BRIDGE_EBT_SNAT=m
CONFIG_BRIDGE_EBT_LOG=m
CONFIG_BRIDGE_EBT_NFLOG=m
+CONFIG_IP_SCTP=m
CONFIG_SCTP_COOKIE_HMAC_SHA1=y
CONFIG_RDS=m
CONFIG_RDS_TCP=m
@@ -336,6 +336,7 @@ CONFIG_TCM_PSCSI=m
CONFIG_NETDEVICES=y
CONFIG_DUMMY=m
CONFIG_WIREGUARD=m
+CONFIG_OVPN=m
CONFIG_EQUALIZER=m
CONFIG_NET_TEAM=m
CONFIG_NET_TEAM_MODE_BROADCAST=m
@@ -355,6 +356,7 @@ CONFIG_PFCP=m
CONFIG_MACSEC=m
CONFIG_NETCONSOLE=m
CONFIG_NETCONSOLE_DYNAMIC=y
+CONFIG_TUN=m
CONFIG_VETH=m
CONFIG_PPP=m
CONFIG_PPP_BSDCOMP=m
@@ -405,8 +407,10 @@ CONFIG_RTC_DRV_GENERIC=m
CONFIG_DAX=m
CONFIG_EXT4_FS=y
CONFIG_JFS_FS=m
+CONFIG_XFS_FS=m
CONFIG_OCFS2_FS=m
# CONFIG_OCFS2_DEBUG_MASKLOG is not set
+CONFIG_BTRFS_FS=m
CONFIG_BCACHEFS_FS=m
CONFIG_FANOTIFY=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
@@ -505,6 +509,7 @@ CONFIG_DLM=m
CONFIG_ENCRYPTED_KEYS=m
CONFIG_HARDENED_USERCOPY=y
CONFIG_CRYPTO_USER=m
+CONFIG_CRYPTO_NULL=m
CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_BENCHMARK=m
CONFIG_CRYPTO_RSA=m
@@ -537,7 +542,6 @@ CONFIG_CRYPTO_LRW=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_XTS=m
CONFIG_CRYPTO_AEGIS128=m
-CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
@@ -557,6 +561,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
# CONFIG_CRYPTO_HW is not set
+CONFIG_PRIME_NUMBERS=m
CONFIG_XZ_DEC_TEST=m
CONFIG_GLOB_SELFTEST=m
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
@@ -588,6 +593,7 @@ CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
CONFIG_LINEAR_RANGES_TEST=m
+CONFIG_CRC_BENCHMARK=y
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig
index 74293551f66b..c08788728ea9 100644
--- a/arch/m68k/configs/atari_defconfig
+++ b/arch/m68k/configs/atari_defconfig
@@ -88,7 +88,6 @@ CONFIG_NETFILTER=y
CONFIG_NETFILTER_NETLINK_HOOK=m
CONFIG_NF_CONNTRACK=m
CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
CONFIG_NF_CONNTRACK_AMANDA=m
CONFIG_NF_CONNTRACK_FTP=m
CONFIG_NF_CONNTRACK_H323=m
@@ -270,6 +269,7 @@ CONFIG_BRIDGE_EBT_REDIRECT=m
CONFIG_BRIDGE_EBT_SNAT=m
CONFIG_BRIDGE_EBT_LOG=m
CONFIG_BRIDGE_EBT_NFLOG=m
+CONFIG_IP_SCTP=m
CONFIG_SCTP_COOKIE_HMAC_SHA1=y
CONFIG_RDS=m
CONFIG_RDS_TCP=m
@@ -351,6 +351,7 @@ CONFIG_TCM_PSCSI=m
CONFIG_NETDEVICES=y
CONFIG_DUMMY=m
CONFIG_WIREGUARD=m
+CONFIG_OVPN=m
CONFIG_EQUALIZER=m
CONFIG_NET_TEAM=m
CONFIG_NET_TEAM_MODE_BROADCAST=m
@@ -370,6 +371,7 @@ CONFIG_PFCP=m
CONFIG_MACSEC=m
CONFIG_NETCONSOLE=m
CONFIG_NETCONSOLE_DYNAMIC=y
+CONFIG_TUN=m
CONFIG_VETH=m
CONFIG_ATARILANCE=y
CONFIG_NE2000=y
@@ -425,8 +427,10 @@ CONFIG_RTC_DRV_GENERIC=m
CONFIG_DAX=m
CONFIG_EXT4_FS=y
CONFIG_JFS_FS=m
+CONFIG_XFS_FS=m
CONFIG_OCFS2_FS=m
# CONFIG_OCFS2_DEBUG_MASKLOG is not set
+CONFIG_BTRFS_FS=m
CONFIG_BCACHEFS_FS=m
CONFIG_FANOTIFY=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
@@ -525,6 +529,7 @@ CONFIG_DLM=m
CONFIG_ENCRYPTED_KEYS=m
CONFIG_HARDENED_USERCOPY=y
CONFIG_CRYPTO_USER=m
+CONFIG_CRYPTO_NULL=m
CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_BENCHMARK=m
CONFIG_CRYPTO_RSA=m
@@ -557,7 +562,6 @@ CONFIG_CRYPTO_LRW=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_XTS=m
CONFIG_CRYPTO_AEGIS128=m
-CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
@@ -577,6 +581,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
# CONFIG_CRYPTO_HW is not set
+CONFIG_PRIME_NUMBERS=m
CONFIG_XZ_DEC_TEST=m
CONFIG_GLOB_SELFTEST=m
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
@@ -608,6 +613,7 @@ CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
CONFIG_LINEAR_RANGES_TEST=m
+CONFIG_CRC_BENCHMARK=y
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig
index 419b13ae950a..962497e7c53f 100644
--- a/arch/m68k/configs/bvme6000_defconfig
+++ b/arch/m68k/configs/bvme6000_defconfig
@@ -78,7 +78,6 @@ CONFIG_NETFILTER=y
CONFIG_NETFILTER_NETLINK_HOOK=m
CONFIG_NF_CONNTRACK=m
CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
CONFIG_NF_CONNTRACK_AMANDA=m
CONFIG_NF_CONNTRACK_FTP=m
CONFIG_NF_CONNTRACK_H323=m
@@ -260,6 +259,7 @@ CONFIG_BRIDGE_EBT_REDIRECT=m
CONFIG_BRIDGE_EBT_SNAT=m
CONFIG_BRIDGE_EBT_LOG=m
CONFIG_BRIDGE_EBT_NFLOG=m
+CONFIG_IP_SCTP=m
CONFIG_SCTP_COOKIE_HMAC_SHA1=y
CONFIG_RDS=m
CONFIG_RDS_TCP=m
@@ -334,6 +334,7 @@ CONFIG_TCM_PSCSI=m
CONFIG_NETDEVICES=y
CONFIG_DUMMY=m
CONFIG_WIREGUARD=m
+CONFIG_OVPN=m
CONFIG_EQUALIZER=m
CONFIG_NET_TEAM=m
CONFIG_NET_TEAM_MODE_BROADCAST=m
@@ -353,6 +354,7 @@ CONFIG_PFCP=m
CONFIG_MACSEC=m
CONFIG_NETCONSOLE=m
CONFIG_NETCONSOLE_DYNAMIC=y
+CONFIG_TUN=m
CONFIG_VETH=m
CONFIG_BVME6000_NET=y
CONFIG_PPP=m
@@ -397,8 +399,10 @@ CONFIG_RTC_DRV_GENERIC=m
CONFIG_DAX=m
CONFIG_EXT4_FS=y
CONFIG_JFS_FS=m
+CONFIG_XFS_FS=m
CONFIG_OCFS2_FS=m
# CONFIG_OCFS2_DEBUG_MASKLOG is not set
+CONFIG_BTRFS_FS=m
CONFIG_BCACHEFS_FS=m
CONFIG_FANOTIFY=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
@@ -497,6 +501,7 @@ CONFIG_DLM=m
CONFIG_ENCRYPTED_KEYS=m
CONFIG_HARDENED_USERCOPY=y
CONFIG_CRYPTO_USER=m
+CONFIG_CRYPTO_NULL=m
CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_BENCHMARK=m
CONFIG_CRYPTO_RSA=m
@@ -529,7 +534,6 @@ CONFIG_CRYPTO_LRW=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_XTS=m
CONFIG_CRYPTO_AEGIS128=m
-CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
@@ -549,6 +553,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
# CONFIG_CRYPTO_HW is not set
+CONFIG_PRIME_NUMBERS=m
CONFIG_XZ_DEC_TEST=m
CONFIG_GLOB_SELFTEST=m
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
@@ -580,6 +585,7 @@ CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
CONFIG_LINEAR_RANGES_TEST=m
+CONFIG_CRC_BENCHMARK=y
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig
index 4c81d756587c..ec28650189e4 100644
--- a/arch/m68k/configs/hp300_defconfig
+++ b/arch/m68k/configs/hp300_defconfig
@@ -80,7 +80,6 @@ CONFIG_NETFILTER=y
CONFIG_NETFILTER_NETLINK_HOOK=m
CONFIG_NF_CONNTRACK=m
CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
CONFIG_NF_CONNTRACK_AMANDA=m
CONFIG_NF_CONNTRACK_FTP=m
CONFIG_NF_CONNTRACK_H323=m
@@ -262,6 +261,7 @@ CONFIG_BRIDGE_EBT_REDIRECT=m
CONFIG_BRIDGE_EBT_SNAT=m
CONFIG_BRIDGE_EBT_LOG=m
CONFIG_BRIDGE_EBT_NFLOG=m
+CONFIG_IP_SCTP=m
CONFIG_SCTP_COOKIE_HMAC_SHA1=y
CONFIG_RDS=m
CONFIG_RDS_TCP=m
@@ -335,6 +335,7 @@ CONFIG_TCM_PSCSI=m
CONFIG_NETDEVICES=y
CONFIG_DUMMY=m
CONFIG_WIREGUARD=m
+CONFIG_OVPN=m
CONFIG_EQUALIZER=m
CONFIG_NET_TEAM=m
CONFIG_NET_TEAM_MODE_BROADCAST=m
@@ -354,6 +355,7 @@ CONFIG_PFCP=m
CONFIG_MACSEC=m
CONFIG_NETCONSOLE=m
CONFIG_NETCONSOLE_DYNAMIC=y
+CONFIG_TUN=m
CONFIG_VETH=m
CONFIG_HPLANCE=y
CONFIG_PPP=m
@@ -407,8 +409,10 @@ CONFIG_RTC_DRV_GENERIC=m
CONFIG_DAX=m
CONFIG_EXT4_FS=y
CONFIG_JFS_FS=m
+CONFIG_XFS_FS=m
CONFIG_OCFS2_FS=m
# CONFIG_OCFS2_DEBUG_MASKLOG is not set
+CONFIG_BTRFS_FS=m
CONFIG_BCACHEFS_FS=m
CONFIG_FANOTIFY=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
@@ -507,6 +511,7 @@ CONFIG_DLM=m
CONFIG_ENCRYPTED_KEYS=m
CONFIG_HARDENED_USERCOPY=y
CONFIG_CRYPTO_USER=m
+CONFIG_CRYPTO_NULL=m
CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_BENCHMARK=m
CONFIG_CRYPTO_RSA=m
@@ -539,7 +544,6 @@ CONFIG_CRYPTO_LRW=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_XTS=m
CONFIG_CRYPTO_AEGIS128=m
-CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
@@ -559,6 +563,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
# CONFIG_CRYPTO_HW is not set
+CONFIG_PRIME_NUMBERS=m
CONFIG_XZ_DEC_TEST=m
CONFIG_GLOB_SELFTEST=m
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
@@ -590,6 +595,7 @@ CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
CONFIG_LINEAR_RANGES_TEST=m
+CONFIG_CRC_BENCHMARK=y
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig
index daa01d7fb462..0afb3ad180de 100644
--- a/arch/m68k/configs/mac_defconfig
+++ b/arch/m68k/configs/mac_defconfig
@@ -79,7 +79,6 @@ CONFIG_NETFILTER=y
CONFIG_NETFILTER_NETLINK_HOOK=m
CONFIG_NF_CONNTRACK=m
CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
CONFIG_NF_CONNTRACK_AMANDA=m
CONFIG_NF_CONNTRACK_FTP=m
CONFIG_NF_CONNTRACK_H323=m
@@ -261,6 +260,7 @@ CONFIG_BRIDGE_EBT_REDIRECT=m
CONFIG_BRIDGE_EBT_SNAT=m
CONFIG_BRIDGE_EBT_LOG=m
CONFIG_BRIDGE_EBT_NFLOG=m
+CONFIG_IP_SCTP=m
CONFIG_SCTP_COOKIE_HMAC_SHA1=y
CONFIG_RDS=m
CONFIG_RDS_TCP=m
@@ -347,6 +347,7 @@ CONFIG_MAC_EMUMOUSEBTN=y
CONFIG_NETDEVICES=y
CONFIG_DUMMY=m
CONFIG_WIREGUARD=m
+CONFIG_OVPN=m
CONFIG_EQUALIZER=m
CONFIG_NET_TEAM=m
CONFIG_NET_TEAM_MODE_BROADCAST=m
@@ -366,6 +367,7 @@ CONFIG_PFCP=m
CONFIG_MACSEC=m
CONFIG_NETCONSOLE=m
CONFIG_NETCONSOLE_DYNAMIC=y
+CONFIG_TUN=m
CONFIG_VETH=m
CONFIG_MACMACE=y
CONFIG_MAC89x0=y
@@ -424,8 +426,10 @@ CONFIG_RTC_DRV_GENERIC=m
CONFIG_DAX=m
CONFIG_EXT4_FS=y
CONFIG_JFS_FS=m
+CONFIG_XFS_FS=m
CONFIG_OCFS2_FS=m
# CONFIG_OCFS2_DEBUG_MASKLOG is not set
+CONFIG_BTRFS_FS=m
CONFIG_BCACHEFS_FS=m
CONFIG_FANOTIFY=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
@@ -524,6 +528,7 @@ CONFIG_DLM=m
CONFIG_ENCRYPTED_KEYS=m
CONFIG_HARDENED_USERCOPY=y
CONFIG_CRYPTO_USER=m
+CONFIG_CRYPTO_NULL=m
CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_BENCHMARK=m
CONFIG_CRYPTO_RSA=m
@@ -556,7 +561,6 @@ CONFIG_CRYPTO_LRW=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_XTS=m
CONFIG_CRYPTO_AEGIS128=m
-CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
@@ -576,6 +580,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
# CONFIG_CRYPTO_HW is not set
+CONFIG_PRIME_NUMBERS=m
CONFIG_XZ_DEC_TEST=m
CONFIG_GLOB_SELFTEST=m
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
@@ -607,6 +612,7 @@ CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
CONFIG_LINEAR_RANGES_TEST=m
+CONFIG_CRC_BENCHMARK=y
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig
index 641ca22eb3b2..b311e953995d 100644
--- a/arch/m68k/configs/multi_defconfig
+++ b/arch/m68k/configs/multi_defconfig
@@ -99,7 +99,6 @@ CONFIG_NETFILTER=y
CONFIG_NETFILTER_NETLINK_HOOK=m
CONFIG_NF_CONNTRACK=m
CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
CONFIG_NF_CONNTRACK_AMANDA=m
CONFIG_NF_CONNTRACK_FTP=m
CONFIG_NF_CONNTRACK_H323=m
@@ -281,6 +280,7 @@ CONFIG_BRIDGE_EBT_REDIRECT=m
CONFIG_BRIDGE_EBT_SNAT=m
CONFIG_BRIDGE_EBT_LOG=m
CONFIG_BRIDGE_EBT_NFLOG=m
+CONFIG_IP_SCTP=m
CONFIG_SCTP_COOKIE_HMAC_SHA1=y
CONFIG_RDS=m
CONFIG_RDS_TCP=m
@@ -390,6 +390,7 @@ CONFIG_MAC_EMUMOUSEBTN=y
CONFIG_NETDEVICES=y
CONFIG_DUMMY=m
CONFIG_WIREGUARD=m
+CONFIG_OVPN=m
CONFIG_EQUALIZER=m
CONFIG_NET_TEAM=m
CONFIG_NET_TEAM_MODE_BROADCAST=m
@@ -409,6 +410,7 @@ CONFIG_PFCP=m
CONFIG_MACSEC=m
CONFIG_NETCONSOLE=m
CONFIG_NETCONSOLE_DYNAMIC=y
+CONFIG_TUN=m
CONFIG_VETH=m
CONFIG_A2065=y
CONFIG_ARIADNE=y
@@ -511,8 +513,10 @@ CONFIG_RTC_DRV_GENERIC=m
CONFIG_DAX=m
CONFIG_EXT4_FS=y
CONFIG_JFS_FS=m
+CONFIG_XFS_FS=m
CONFIG_OCFS2_FS=m
# CONFIG_OCFS2_DEBUG_MASKLOG is not set
+CONFIG_BTRFS_FS=m
CONFIG_BCACHEFS_FS=m
CONFIG_FANOTIFY=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
@@ -611,6 +615,7 @@ CONFIG_DLM=m
CONFIG_ENCRYPTED_KEYS=m
CONFIG_HARDENED_USERCOPY=y
CONFIG_CRYPTO_USER=m
+CONFIG_CRYPTO_NULL=m
CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_BENCHMARK=m
CONFIG_CRYPTO_RSA=m
@@ -643,7 +648,6 @@ CONFIG_CRYPTO_LRW=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_XTS=m
CONFIG_CRYPTO_AEGIS128=m
-CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
@@ -663,6 +667,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
# CONFIG_CRYPTO_HW is not set
+CONFIG_PRIME_NUMBERS=m
CONFIG_XZ_DEC_TEST=m
CONFIG_GLOB_SELFTEST=m
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
@@ -694,6 +699,7 @@ CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
CONFIG_LINEAR_RANGES_TEST=m
+CONFIG_CRC_BENCHMARK=y
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig
index f98ffa7a1640..f4e6224f137f 100644
--- a/arch/m68k/configs/mvme147_defconfig
+++ b/arch/m68k/configs/mvme147_defconfig
@@ -77,7 +77,6 @@ CONFIG_NETFILTER=y
CONFIG_NETFILTER_NETLINK_HOOK=m
CONFIG_NF_CONNTRACK=m
CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
CONFIG_NF_CONNTRACK_AMANDA=m
CONFIG_NF_CONNTRACK_FTP=m
CONFIG_NF_CONNTRACK_H323=m
@@ -259,6 +258,7 @@ CONFIG_BRIDGE_EBT_REDIRECT=m
CONFIG_BRIDGE_EBT_SNAT=m
CONFIG_BRIDGE_EBT_LOG=m
CONFIG_BRIDGE_EBT_NFLOG=m
+CONFIG_IP_SCTP=m
CONFIG_SCTP_COOKIE_HMAC_SHA1=y
CONFIG_RDS=m
CONFIG_RDS_TCP=m
@@ -333,6 +333,7 @@ CONFIG_TCM_PSCSI=m
CONFIG_NETDEVICES=y
CONFIG_DUMMY=m
CONFIG_WIREGUARD=m
+CONFIG_OVPN=m
CONFIG_EQUALIZER=m
CONFIG_NET_TEAM=m
CONFIG_NET_TEAM_MODE_BROADCAST=m
@@ -352,6 +353,7 @@ CONFIG_PFCP=m
CONFIG_MACSEC=m
CONFIG_NETCONSOLE=m
CONFIG_NETCONSOLE_DYNAMIC=y
+CONFIG_TUN=m
CONFIG_VETH=m
CONFIG_MVME147_NET=y
CONFIG_PPP=m
@@ -397,8 +399,10 @@ CONFIG_RTC_DRV_GENERIC=m
CONFIG_DAX=m
CONFIG_EXT4_FS=y
CONFIG_JFS_FS=m
+CONFIG_XFS_FS=m
CONFIG_OCFS2_FS=m
# CONFIG_OCFS2_DEBUG_MASKLOG is not set
+CONFIG_BTRFS_FS=m
CONFIG_BCACHEFS_FS=m
CONFIG_FANOTIFY=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
@@ -497,6 +501,7 @@ CONFIG_DLM=m
CONFIG_ENCRYPTED_KEYS=m
CONFIG_HARDENED_USERCOPY=y
CONFIG_CRYPTO_USER=m
+CONFIG_CRYPTO_NULL=m
CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_BENCHMARK=m
CONFIG_CRYPTO_RSA=m
@@ -529,7 +534,6 @@ CONFIG_CRYPTO_LRW=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_XTS=m
CONFIG_CRYPTO_AEGIS128=m
-CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
@@ -549,6 +553,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
# CONFIG_CRYPTO_HW is not set
+CONFIG_PRIME_NUMBERS=m
CONFIG_XZ_DEC_TEST=m
CONFIG_GLOB_SELFTEST=m
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
@@ -580,6 +585,7 @@ CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
CONFIG_LINEAR_RANGES_TEST=m
+CONFIG_CRC_BENCHMARK=y
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig
index 2bfc3f4b48f9..498e167222f1 100644
--- a/arch/m68k/configs/mvme16x_defconfig
+++ b/arch/m68k/configs/mvme16x_defconfig
@@ -78,7 +78,6 @@ CONFIG_NETFILTER=y
CONFIG_NETFILTER_NETLINK_HOOK=m
CONFIG_NF_CONNTRACK=m
CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
CONFIG_NF_CONNTRACK_AMANDA=m
CONFIG_NF_CONNTRACK_FTP=m
CONFIG_NF_CONNTRACK_H323=m
@@ -260,6 +259,7 @@ CONFIG_BRIDGE_EBT_REDIRECT=m
CONFIG_BRIDGE_EBT_SNAT=m
CONFIG_BRIDGE_EBT_LOG=m
CONFIG_BRIDGE_EBT_NFLOG=m
+CONFIG_IP_SCTP=m
CONFIG_SCTP_COOKIE_HMAC_SHA1=y
CONFIG_RDS=m
CONFIG_RDS_TCP=m
@@ -334,6 +334,7 @@ CONFIG_TCM_PSCSI=m
CONFIG_NETDEVICES=y
CONFIG_DUMMY=m
CONFIG_WIREGUARD=m
+CONFIG_OVPN=m
CONFIG_EQUALIZER=m
CONFIG_NET_TEAM=m
CONFIG_NET_TEAM_MODE_BROADCAST=m
@@ -353,6 +354,7 @@ CONFIG_PFCP=m
CONFIG_MACSEC=m
CONFIG_NETCONSOLE=m
CONFIG_NETCONSOLE_DYNAMIC=y
+CONFIG_TUN=m
CONFIG_VETH=m
CONFIG_MVME16x_NET=y
CONFIG_PPP=m
@@ -398,8 +400,10 @@ CONFIG_RTC_DRV_GENERIC=m
CONFIG_DAX=m
CONFIG_EXT4_FS=y
CONFIG_JFS_FS=m
+CONFIG_XFS_FS=m
CONFIG_OCFS2_FS=m
# CONFIG_OCFS2_DEBUG_MASKLOG is not set
+CONFIG_BTRFS_FS=m
CONFIG_BCACHEFS_FS=m
CONFIG_FANOTIFY=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
@@ -498,6 +502,7 @@ CONFIG_DLM=m
CONFIG_ENCRYPTED_KEYS=m
CONFIG_HARDENED_USERCOPY=y
CONFIG_CRYPTO_USER=m
+CONFIG_CRYPTO_NULL=m
CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_BENCHMARK=m
CONFIG_CRYPTO_RSA=m
@@ -530,7 +535,6 @@ CONFIG_CRYPTO_LRW=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_XTS=m
CONFIG_CRYPTO_AEGIS128=m
-CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
@@ -550,6 +554,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
# CONFIG_CRYPTO_HW is not set
+CONFIG_PRIME_NUMBERS=m
CONFIG_XZ_DEC_TEST=m
CONFIG_GLOB_SELFTEST=m
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
@@ -581,6 +586,7 @@ CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
CONFIG_LINEAR_RANGES_TEST=m
+CONFIG_CRC_BENCHMARK=y
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig
index 2bd46cbcca2a..8c6b1eef8534 100644
--- a/arch/m68k/configs/q40_defconfig
+++ b/arch/m68k/configs/q40_defconfig
@@ -79,7 +79,6 @@ CONFIG_NETFILTER=y
CONFIG_NETFILTER_NETLINK_HOOK=m
CONFIG_NF_CONNTRACK=m
CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
CONFIG_NF_CONNTRACK_AMANDA=m
CONFIG_NF_CONNTRACK_FTP=m
CONFIG_NF_CONNTRACK_H323=m
@@ -261,6 +260,7 @@ CONFIG_BRIDGE_EBT_REDIRECT=m
CONFIG_BRIDGE_EBT_SNAT=m
CONFIG_BRIDGE_EBT_LOG=m
CONFIG_BRIDGE_EBT_NFLOG=m
+CONFIG_IP_SCTP=m
CONFIG_SCTP_COOKIE_HMAC_SHA1=y
CONFIG_RDS=m
CONFIG_RDS_TCP=m
@@ -340,6 +340,7 @@ CONFIG_TCM_PSCSI=m
CONFIG_NETDEVICES=y
CONFIG_DUMMY=m
CONFIG_WIREGUARD=m
+CONFIG_OVPN=m
CONFIG_EQUALIZER=m
CONFIG_NET_TEAM=m
CONFIG_NET_TEAM_MODE_BROADCAST=m
@@ -359,6 +360,7 @@ CONFIG_PFCP=m
CONFIG_MACSEC=m
CONFIG_NETCONSOLE=m
CONFIG_NETCONSOLE_DYNAMIC=y
+CONFIG_TUN=m
CONFIG_VETH=m
CONFIG_NE2000=y
CONFIG_PLIP=m
@@ -414,8 +416,10 @@ CONFIG_RTC_DRV_GENERIC=m
CONFIG_DAX=m
CONFIG_EXT4_FS=y
CONFIG_JFS_FS=m
+CONFIG_XFS_FS=m
CONFIG_OCFS2_FS=m
# CONFIG_OCFS2_DEBUG_MASKLOG is not set
+CONFIG_BTRFS_FS=m
CONFIG_BCACHEFS_FS=m
CONFIG_FANOTIFY=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
@@ -514,6 +518,7 @@ CONFIG_DLM=m
CONFIG_ENCRYPTED_KEYS=m
CONFIG_HARDENED_USERCOPY=y
CONFIG_CRYPTO_USER=m
+CONFIG_CRYPTO_NULL=m
CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_BENCHMARK=m
CONFIG_CRYPTO_RSA=m
@@ -546,7 +551,6 @@ CONFIG_CRYPTO_LRW=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_XTS=m
CONFIG_CRYPTO_AEGIS128=m
-CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
@@ -566,6 +570,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
# CONFIG_CRYPTO_HW is not set
+CONFIG_PRIME_NUMBERS=m
CONFIG_XZ_DEC_TEST=m
CONFIG_GLOB_SELFTEST=m
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
@@ -597,6 +602,7 @@ CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
CONFIG_LINEAR_RANGES_TEST=m
+CONFIG_CRC_BENCHMARK=y
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig
index dc7fc94fc669..c34648f299ef 100644
--- a/arch/m68k/configs/sun3_defconfig
+++ b/arch/m68k/configs/sun3_defconfig
@@ -74,7 +74,6 @@ CONFIG_NETFILTER=y
CONFIG_NETFILTER_NETLINK_HOOK=m
CONFIG_NF_CONNTRACK=m
CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
CONFIG_NF_CONNTRACK_AMANDA=m
CONFIG_NF_CONNTRACK_FTP=m
CONFIG_NF_CONNTRACK_H323=m
@@ -256,6 +255,7 @@ CONFIG_BRIDGE_EBT_REDIRECT=m
CONFIG_BRIDGE_EBT_SNAT=m
CONFIG_BRIDGE_EBT_LOG=m
CONFIG_BRIDGE_EBT_NFLOG=m
+CONFIG_IP_SCTP=m
CONFIG_SCTP_COOKIE_HMAC_SHA1=y
CONFIG_RDS=m
CONFIG_RDS_TCP=m
@@ -330,6 +330,7 @@ CONFIG_TCM_PSCSI=m
CONFIG_NETDEVICES=y
CONFIG_DUMMY=m
CONFIG_WIREGUARD=m
+CONFIG_OVPN=m
CONFIG_EQUALIZER=m
CONFIG_NET_TEAM=m
CONFIG_NET_TEAM_MODE_BROADCAST=m
@@ -349,6 +350,7 @@ CONFIG_PFCP=m
CONFIG_MACSEC=m
CONFIG_NETCONSOLE=m
CONFIG_NETCONSOLE_DYNAMIC=y
+CONFIG_TUN=m
CONFIG_VETH=m
CONFIG_SUN3LANCE=y
CONFIG_SUN3_82586=y
@@ -395,8 +397,10 @@ CONFIG_RTC_DRV_GENERIC=m
CONFIG_DAX=m
CONFIG_EXT4_FS=y
CONFIG_JFS_FS=m
+CONFIG_XFS_FS=m
CONFIG_OCFS2_FS=m
# CONFIG_OCFS2_DEBUG_MASKLOG is not set
+CONFIG_BTRFS_FS=m
CONFIG_BCACHEFS_FS=m
CONFIG_FANOTIFY=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
@@ -495,6 +499,7 @@ CONFIG_DLM=m
CONFIG_ENCRYPTED_KEYS=m
CONFIG_HARDENED_USERCOPY=y
CONFIG_CRYPTO_USER=m
+CONFIG_CRYPTO_NULL=m
CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_BENCHMARK=m
CONFIG_CRYPTO_RSA=m
@@ -527,7 +532,6 @@ CONFIG_CRYPTO_LRW=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_XTS=m
CONFIG_CRYPTO_AEGIS128=m
-CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
@@ -547,6 +551,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
# CONFIG_CRYPTO_HW is not set
+CONFIG_PRIME_NUMBERS=m
CONFIG_XZ_DEC_TEST=m
CONFIG_GLOB_SELFTEST=m
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
@@ -577,6 +582,7 @@ CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
CONFIG_LINEAR_RANGES_TEST=m
+CONFIG_CRC_BENCHMARK=y
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig
index b026a54867f5..73810d14660f 100644
--- a/arch/m68k/configs/sun3x_defconfig
+++ b/arch/m68k/configs/sun3x_defconfig
@@ -75,7 +75,6 @@ CONFIG_NETFILTER=y
CONFIG_NETFILTER_NETLINK_HOOK=m
CONFIG_NF_CONNTRACK=m
CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
CONFIG_NF_CONNTRACK_AMANDA=m
CONFIG_NF_CONNTRACK_FTP=m
CONFIG_NF_CONNTRACK_H323=m
@@ -257,6 +256,7 @@ CONFIG_BRIDGE_EBT_REDIRECT=m
CONFIG_BRIDGE_EBT_SNAT=m
CONFIG_BRIDGE_EBT_LOG=m
CONFIG_BRIDGE_EBT_NFLOG=m
+CONFIG_IP_SCTP=m
CONFIG_SCTP_COOKIE_HMAC_SHA1=y
CONFIG_RDS=m
CONFIG_RDS_TCP=m
@@ -331,6 +331,7 @@ CONFIG_TCM_PSCSI=m
CONFIG_NETDEVICES=y
CONFIG_DUMMY=m
CONFIG_WIREGUARD=m
+CONFIG_OVPN=m
CONFIG_EQUALIZER=m
CONFIG_NET_TEAM=m
CONFIG_NET_TEAM_MODE_BROADCAST=m
@@ -350,6 +351,7 @@ CONFIG_PFCP=m
CONFIG_MACSEC=m
CONFIG_NETCONSOLE=m
CONFIG_NETCONSOLE_DYNAMIC=y
+CONFIG_TUN=m
CONFIG_VETH=m
CONFIG_SUN3LANCE=y
CONFIG_PPP=m
@@ -395,8 +397,10 @@ CONFIG_RTC_DRV_GENERIC=m
CONFIG_DAX=m
CONFIG_EXT4_FS=y
CONFIG_JFS_FS=m
+CONFIG_XFS_FS=m
CONFIG_OCFS2_FS=m
# CONFIG_OCFS2_DEBUG_MASKLOG is not set
+CONFIG_BTRFS_FS=m
CONFIG_BCACHEFS_FS=m
CONFIG_FANOTIFY=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
@@ -495,6 +499,7 @@ CONFIG_DLM=m
CONFIG_ENCRYPTED_KEYS=m
CONFIG_HARDENED_USERCOPY=y
CONFIG_CRYPTO_USER=m
+CONFIG_CRYPTO_NULL=m
CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_BENCHMARK=m
CONFIG_CRYPTO_RSA=m
@@ -527,7 +532,6 @@ CONFIG_CRYPTO_LRW=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_XTS=m
CONFIG_CRYPTO_AEGIS128=m
-CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
@@ -547,6 +551,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
# CONFIG_CRYPTO_HW is not set
+CONFIG_PRIME_NUMBERS=m
CONFIG_XZ_DEC_TEST=m
CONFIG_GLOB_SELFTEST=m
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
@@ -578,6 +583,7 @@ CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
CONFIG_LINEAR_RANGES_TEST=m
+CONFIG_CRC_BENCHMARK=y
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/include/asm/adb_iop.h b/arch/m68k/include/asm/adb_iop.h
index 6aecd020e2fc..ca10b1ec0c78 100644
--- a/arch/m68k/include/asm/adb_iop.h
+++ b/arch/m68k/include/asm/adb_iop.h
@@ -33,7 +33,7 @@
#define ADB_IOP_SRQ 0x04 /* SRQ detected */
#define ADB_IOP_TIMEOUT 0x02 /* nonzero if timeout */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct adb_iopmsg {
__u8 flags; /* ADB flags */
@@ -43,4 +43,4 @@ struct adb_iopmsg {
__u8 spare[21]; /* spare */
};
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
diff --git a/arch/m68k/include/asm/bootinfo.h b/arch/m68k/include/asm/bootinfo.h
index 81c91af8ec6c..267272b436e2 100644
--- a/arch/m68k/include/asm/bootinfo.h
+++ b/arch/m68k/include/asm/bootinfo.h
@@ -14,7 +14,7 @@
#include <uapi/asm/bootinfo.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#ifdef CONFIG_BOOTINFO_PROC
extern void save_bootinfo(const struct bi_record *bi);
@@ -28,7 +28,7 @@ void process_uboot_commandline(char *commandp, int size);
static inline void process_uboot_commandline(char *commandp, int size) {}
#endif
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _M68K_BOOTINFO_H */
diff --git a/arch/m68k/include/asm/entry.h b/arch/m68k/include/asm/entry.h
index 9b52b060c76a..86cba7c19e67 100644
--- a/arch/m68k/include/asm/entry.h
+++ b/arch/m68k/include/asm/entry.h
@@ -4,7 +4,7 @@
#include <asm/setup.h>
#include <asm/page.h>
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#include <asm/thread_info.h>
#endif
@@ -41,7 +41,7 @@
#define ALLOWINT (~0x700)
#endif /* machine compilation types */
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
/*
* This defines the normal kernel pt-regs layout.
*
diff --git a/arch/m68k/include/asm/kexec.h b/arch/m68k/include/asm/kexec.h
index 3b0b64f0a353..f79427bd6487 100644
--- a/arch/m68k/include/asm/kexec.h
+++ b/arch/m68k/include/asm/kexec.h
@@ -15,7 +15,7 @@
#define KEXEC_ARCH KEXEC_ARCH_68K
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
static inline void crash_setup_regs(struct pt_regs *newregs,
struct pt_regs *oldregs)
@@ -23,7 +23,7 @@ static inline void crash_setup_regs(struct pt_regs *newregs,
/* Dummy implementation for now */
}
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* CONFIG_KEXEC_CORE */
diff --git a/arch/m68k/include/asm/mac_baboon.h b/arch/m68k/include/asm/mac_baboon.h
index 08d9b8829a1a..ed5b5b48bdf8 100644
--- a/arch/m68k/include/asm/mac_baboon.h
+++ b/arch/m68k/include/asm/mac_baboon.h
@@ -5,7 +5,7 @@
#define BABOON_BASE (0x50F1A000) /* same as IDE controller base */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct baboon {
char pad1[208]; /* generic IDE registers, not used here */
@@ -36,4 +36,4 @@ extern void baboon_register_interrupts(void);
extern void baboon_irq_enable(int);
extern void baboon_irq_disable(int);
-#endif /* __ASSEMBLY **/
+#endif /* __ASSEMBLER__ */
diff --git a/arch/m68k/include/asm/mac_iop.h b/arch/m68k/include/asm/mac_iop.h
index 32f1c79c818f..a6753eb16ba4 100644
--- a/arch/m68k/include/asm/mac_iop.h
+++ b/arch/m68k/include/asm/mac_iop.h
@@ -66,7 +66,7 @@
#define IOP_ADDR_ALIVE 0x031F
#define IOP_ADDR_RECV_MSG 0x0320
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/*
* IOP Control registers, staggered because in usual Apple style they were
@@ -163,4 +163,4 @@ extern void iop_ism_irq_poll(uint);
extern void iop_register_interrupts(void);
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
diff --git a/arch/m68k/include/asm/mac_oss.h b/arch/m68k/include/asm/mac_oss.h
index 56ef986c0a9b..a6e86e443155 100644
--- a/arch/m68k/include/asm/mac_oss.h
+++ b/arch/m68k/include/asm/mac_oss.h
@@ -59,7 +59,7 @@
#define OSS_POWEROFF 0x80
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct mac_oss {
__u8 irq_level[0x10]; /* [0x000-0x00f] Interrupt levels */
@@ -77,4 +77,4 @@ extern void oss_register_interrupts(void);
extern void oss_irq_enable(int);
extern void oss_irq_disable(int);
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
diff --git a/arch/m68k/include/asm/mac_psc.h b/arch/m68k/include/asm/mac_psc.h
index 86a5a5eab89e..6587dbd54476 100644
--- a/arch/m68k/include/asm/mac_psc.h
+++ b/arch/m68k/include/asm/mac_psc.h
@@ -207,7 +207,7 @@
* Unknown, always 0x0000.
*/
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
extern volatile __u8 *psc;
@@ -249,4 +249,4 @@ static inline u32 psc_read_long(int offset)
return *((volatile __u32 *)(psc + offset));
}
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
diff --git a/arch/m68k/include/asm/mac_via.h b/arch/m68k/include/asm/mac_via.h
index a9ef1e9ba6c4..b065cd8e5071 100644
--- a/arch/m68k/include/asm/mac_via.h
+++ b/arch/m68k/include/asm/mac_via.h
@@ -250,7 +250,7 @@
#define IER_SET_BIT(b) (0x80 | (1<<(b)) )
#define IER_CLR_BIT(b) (0x7F & (1<<(b)) )
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
extern volatile __u8 *via1,*via2;
extern int rbv_present,via_alt_mapping;
@@ -267,6 +267,6 @@ extern void via1_irq(struct irq_desc *desc);
extern void via1_set_head(int);
extern int via2_scsi_drq_pending(void);
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_MAC_VIA_H_ */
diff --git a/arch/m68k/include/asm/math-emu.h b/arch/m68k/include/asm/math-emu.h
index eefaa3a2b596..91074ade14ad 100644
--- a/arch/m68k/include/asm/math-emu.h
+++ b/arch/m68k/include/asm/math-emu.h
@@ -67,7 +67,7 @@
#define PMUNIMPL (1<<PUNIMPL)
#define PMMOVEM (1<<PMOVEM)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/kernel.h>
#include <linux/sched.h>
@@ -127,7 +127,7 @@ extern unsigned int fp_debugprint;
#define FPDATA ((struct fp_data *)current->thread.fp)
-#else /* __ASSEMBLY__ */
+#else /* __ASSEMBLER__ */
#define FPDATA %a2
@@ -311,6 +311,6 @@ old_gas=old_gas+1
.endm
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_M68K_SETUP_H */
diff --git a/arch/m68k/include/asm/mcf_pgtable.h b/arch/m68k/include/asm/mcf_pgtable.h
index d79fef609194..189bb7b1e663 100644
--- a/arch/m68k/include/asm/mcf_pgtable.h
+++ b/arch/m68k/include/asm/mcf_pgtable.h
@@ -92,7 +92,7 @@
#define PTE_MASK PAGE_MASK
#define CF_PAGE_CHG_MASK (PTE_MASK | CF_PAGE_ACCESSED | CF_PAGE_DIRTY)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#define pmd_pgtable(pmd) pfn_to_virt(pmd_val(pmd) >> PAGE_SHIFT)
@@ -292,5 +292,5 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte)
#define pfn_pte(pfn, prot) __pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
#define pte_pfn(pte) (pte_val(pte) >> PAGE_SHIFT)
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _MCF_PGTABLE_H */
diff --git a/arch/m68k/include/asm/mcfmmu.h b/arch/m68k/include/asm/mcfmmu.h
index 283352ab0d5d..db16ea1057f7 100644
--- a/arch/m68k/include/asm/mcfmmu.h
+++ b/arch/m68k/include/asm/mcfmmu.h
@@ -88,7 +88,7 @@
#define MMUDR_PAN 10 /* Physical address */
#define MMUDR_PAMASK 0xfffffc00 /* PA mask */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/*
* Simple access functions for the MMU registers. Nothing fancy
diff --git a/arch/m68k/include/asm/motorola_pgtable.h b/arch/m68k/include/asm/motorola_pgtable.h
index 14fee64d3e60..dcf6829b3eab 100644
--- a/arch/m68k/include/asm/motorola_pgtable.h
+++ b/arch/m68k/include/asm/motorola_pgtable.h
@@ -44,7 +44,7 @@
/* We borrow bit 11 to store the exclusive marker in swap PTEs. */
#define _PAGE_SWP_EXCLUSIVE 0x800
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/* This is the cache mode to be used for pages containing page descriptors for
* processors >= '040. It is in pte_mknocache(), and the variable is defined
@@ -202,5 +202,5 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte)
return pte;
}
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _MOTOROLA_PGTABLE_H */
diff --git a/arch/m68k/include/asm/nettel.h b/arch/m68k/include/asm/nettel.h
index 3bd4b7a4613f..9bf55cef119e 100644
--- a/arch/m68k/include/asm/nettel.h
+++ b/arch/m68k/include/asm/nettel.h
@@ -38,7 +38,7 @@
#define NETtel_LEDADDR 0x30400000
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
extern volatile unsigned short ppdata;
@@ -80,7 +80,7 @@ static __inline__ void mcf_setppdata(unsigned int mask, unsigned int bits)
#define MCFPP_DTR0 0x0040
#define MCFPP_DTR1 0x0000 /* Port 1 no DTR support */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/*
* These functions defined to give quasi generic access to the
* PPIO bits used for DTR/DCD.
diff --git a/arch/m68k/include/asm/openprom.h b/arch/m68k/include/asm/openprom.h
index dd22e649f5c5..6456ba40a946 100644
--- a/arch/m68k/include/asm/openprom.h
+++ b/arch/m68k/include/asm/openprom.h
@@ -21,7 +21,7 @@
#define LINUX_OPPROM_MAGIC 0x10010407
#endif
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/* V0 prom device operations. */
struct linux_dev_v0_funcs {
int (*v0_devopen)(char *device_str);
@@ -308,6 +308,6 @@ struct linux_prom_ranges {
unsigned int or_size;
};
-#endif /* !(__ASSEMBLY__) */
+#endif /* !(__ASSEMBLER__) */
#endif /* !(__SPARC_OPENPROM_H) */
diff --git a/arch/m68k/include/asm/page.h b/arch/m68k/include/asm/page.h
index b173ba27d36f..d30f8b2f1592 100644
--- a/arch/m68k/include/asm/page.h
+++ b/arch/m68k/include/asm/page.h
@@ -10,7 +10,7 @@
#define PAGE_OFFSET (PAGE_OFFSET_RAW)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/*
* These are used to make use of C type-checking..
@@ -48,7 +48,7 @@ extern unsigned long _rambase;
extern unsigned long _ramstart;
extern unsigned long _ramend;
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#ifdef CONFIG_MMU
#include <asm/page_mm.h>
diff --git a/arch/m68k/include/asm/page_mm.h b/arch/m68k/include/asm/page_mm.h
index e0ae4d5fc985..ed782609ca41 100644
--- a/arch/m68k/include/asm/page_mm.h
+++ b/arch/m68k/include/asm/page_mm.h
@@ -2,7 +2,7 @@
#ifndef _M68K_PAGE_MM_H
#define _M68K_PAGE_MM_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/compiler.h>
#include <asm/module.h>
@@ -144,6 +144,6 @@ extern int m68k_virt_to_node_shift;
#define virt_addr_valid(kaddr) ((unsigned long)(kaddr) >= PAGE_OFFSET && (unsigned long)(kaddr) < (unsigned long)high_memory)
#define pfn_valid(pfn) virt_addr_valid(pfn_to_virt(pfn))
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _M68K_PAGE_MM_H */
diff --git a/arch/m68k/include/asm/page_no.h b/arch/m68k/include/asm/page_no.h
index 63c0e706084b..39db2026a4b4 100644
--- a/arch/m68k/include/asm/page_no.h
+++ b/arch/m68k/include/asm/page_no.h
@@ -2,7 +2,7 @@
#ifndef _M68K_PAGE_NO_H
#define _M68K_PAGE_NO_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
extern unsigned long memory_start;
extern unsigned long memory_end;
@@ -37,6 +37,6 @@ static inline void *pfn_to_virt(unsigned long pfn)
#define ARCH_PFN_OFFSET PHYS_PFN(PAGE_OFFSET_RAW)
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _M68K_PAGE_NO_H */
diff --git a/arch/m68k/include/asm/pgtable.h b/arch/m68k/include/asm/pgtable.h
index 49fcfd734860..02f1a4601379 100644
--- a/arch/m68k/include/asm/pgtable.h
+++ b/arch/m68k/include/asm/pgtable.h
@@ -10,7 +10,7 @@
#include <asm/pgtable_mm.h>
#endif
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
extern void paging_init(void);
#endif
diff --git a/arch/m68k/include/asm/pgtable_mm.h b/arch/m68k/include/asm/pgtable_mm.h
index dbdf1c2b2f66..62f2ff4e6799 100644
--- a/arch/m68k/include/asm/pgtable_mm.h
+++ b/arch/m68k/include/asm/pgtable_mm.h
@@ -11,7 +11,7 @@
#include <asm/setup.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/processor.h>
#include <linux/sched.h>
#include <linux/threads.h>
@@ -145,7 +145,7 @@ static inline void update_mmu_cache_range(struct vm_fault *vmf,
#define update_mmu_cache(vma, addr, ptep) \
update_mmu_cache_range(NULL, vma, addr, ptep, 1)
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
/* MMU-specific headers */
@@ -157,7 +157,7 @@ static inline void update_mmu_cache_range(struct vm_fault *vmf,
#include <asm/motorola_pgtable.h>
#endif
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/*
* Macro to mark a page protection value as "uncacheable".
*/
@@ -182,6 +182,6 @@ pgprot_t pgprot_dmacoherent(pgprot_t prot);
#define pgprot_dmacoherent(prot) pgprot_dmacoherent(prot)
#endif /* CONFIG_COLDFIRE */
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _M68K_PGTABLE_H */
diff --git a/arch/m68k/include/asm/ptrace.h b/arch/m68k/include/asm/ptrace.h
index ea5a80ca1ab3..bc86ce012025 100644
--- a/arch/m68k/include/asm/ptrace.h
+++ b/arch/m68k/include/asm/ptrace.h
@@ -4,7 +4,7 @@
#include <uapi/asm/ptrace.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#ifndef PS_S
#define PS_S (0x2000)
@@ -24,5 +24,5 @@
#define arch_has_block_step() (1)
#endif
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _M68K_PTRACE_H */
diff --git a/arch/m68k/include/asm/setup.h b/arch/m68k/include/asm/setup.h
index 2c99477aaf89..e4ec169f5c7d 100644
--- a/arch/m68k/include/asm/setup.h
+++ b/arch/m68k/include/asm/setup.h
@@ -28,9 +28,9 @@
#define CL_SIZE COMMAND_LINE_SIZE
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
extern unsigned long m68k_machtype;
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#if !defined(CONFIG_AMIGA)
# define MACH_IS_AMIGA (0)
@@ -199,7 +199,7 @@ extern unsigned long m68k_machtype;
#endif
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
extern unsigned long m68k_cputype;
extern unsigned long m68k_fputype;
extern unsigned long m68k_mmutype;
@@ -213,7 +213,7 @@ extern unsigned long vme_brdtype;
*/
extern int m68k_is040or060;
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#if !defined(CONFIG_M68020)
# define CPU_IS_020 (0)
@@ -321,7 +321,7 @@ extern int m68k_is040or060;
#define NUM_MEMINFO 4
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct m68k_mem_info {
unsigned long addr; /* physical address of memory chunk */
unsigned long size; /* length of memory chunk (in bytes) */
diff --git a/arch/m68k/include/asm/sun3_pgtable.h b/arch/m68k/include/asm/sun3_pgtable.h
index 858cbe936f5b..80ca185a18a1 100644
--- a/arch/m68k/include/asm/sun3_pgtable.h
+++ b/arch/m68k/include/asm/sun3_pgtable.h
@@ -4,7 +4,7 @@
#include <asm/sun3mmu.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/virtconvert.h>
#include <linux/linkage.h>
@@ -19,7 +19,7 @@
#define PTOV(addr) __va(addr)
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
/* These need to be defined for compatibility although the sun3 doesn't use them */
#define _PAGE_NOCACHE030 0x040
@@ -74,7 +74,7 @@
/* We borrow bit 6 to store the exclusive marker in swap PTEs. */
#define _PAGE_SWP_EXCLUSIVE 0x040
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
@@ -186,5 +186,5 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte)
return pte;
}
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* !_SUN3_PGTABLE_H */
diff --git a/arch/m68k/include/asm/sun3mmu.h b/arch/m68k/include/asm/sun3mmu.h
index 21a75daa278f..fee05cd2ce5b 100644
--- a/arch/m68k/include/asm/sun3mmu.h
+++ b/arch/m68k/include/asm/sun3mmu.h
@@ -67,7 +67,7 @@
#define SUN3_BUSERR_PROTERR (0x40)
#define SUN3_BUSERR_INVALID (0x80)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/* Read bus error status register (implicitly clearing it). */
static inline unsigned char sun3_get_buserr(void)
@@ -167,6 +167,6 @@ extern void __iomem *sun3_ioremap(unsigned long phys, unsigned long size,
extern int sun3_map_test(unsigned long addr, char *val);
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* !__SUN3_MMU_H__ */
diff --git a/arch/m68k/include/asm/thread_info.h b/arch/m68k/include/asm/thread_info.h
index 3e31adbddc75..5cb3ace55622 100644
--- a/arch/m68k/include/asm/thread_info.h
+++ b/arch/m68k/include/asm/thread_info.h
@@ -22,7 +22,7 @@
#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct thread_info {
struct task_struct *task; /* main task structure */
@@ -31,7 +31,7 @@ struct thread_info {
__u32 cpu; /* should always be 0 on m68k */
unsigned long tp_value; /* thread pointer */
};
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#define INIT_THREAD_INFO(tsk) \
{ \
@@ -39,7 +39,7 @@ struct thread_info {
.preempt_count = INIT_PREEMPT_COUNT, \
}
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/* how to get the thread information struct from C */
static inline struct thread_info *current_thread_info(void)
{
diff --git a/arch/m68k/include/asm/traps.h b/arch/m68k/include/asm/traps.h
index a9d5c1c870d3..c7b3989bd4b2 100644
--- a/arch/m68k/include/asm/traps.h
+++ b/arch/m68k/include/asm/traps.h
@@ -11,7 +11,7 @@
#ifndef _M68K_TRAPS_H
#define _M68K_TRAPS_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/linkage.h>
#include <asm/ptrace.h>
@@ -94,7 +94,7 @@ asmlinkage void bad_inthandler(void);
#define VECOFF(vec) ((vec)<<2)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/* Status register bits */
#define PS_T (0x8000)
@@ -271,6 +271,6 @@ struct frame {
asmlinkage void berr_040cleanup(struct frame *fp);
#endif
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _M68K_TRAPS_H */
diff --git a/arch/m68k/include/uapi/asm/bootinfo-vme.h b/arch/m68k/include/uapi/asm/bootinfo-vme.h
index f36a09ab5e79..b8139eb39352 100644
--- a/arch/m68k/include/uapi/asm/bootinfo-vme.h
+++ b/arch/m68k/include/uapi/asm/bootinfo-vme.h
@@ -33,7 +33,7 @@
#define VME_TYPE_BVME6000 0x6000 /* BVM Ltd. BVME6000 */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/*
* Board ID data structure - pointer to this retrieved from Bug by head.S
@@ -56,7 +56,7 @@ typedef struct {
__be32 option2;
} t_bdid, *p_bdid;
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
/*
diff --git a/arch/m68k/include/uapi/asm/bootinfo.h b/arch/m68k/include/uapi/asm/bootinfo.h
index 024e87d7095f..28d2d44c08d0 100644
--- a/arch/m68k/include/uapi/asm/bootinfo.h
+++ b/arch/m68k/include/uapi/asm/bootinfo.h
@@ -16,7 +16,7 @@
#include <linux/types.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/*
* Bootinfo definitions
@@ -43,7 +43,7 @@ struct mem_info {
__be32 size; /* length of memory chunk (in bytes) */
};
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
/*
@@ -167,7 +167,7 @@ struct mem_info {
#define BI_VERSION_MAJOR(v) (((v) >> 16) & 0xffff)
#define BI_VERSION_MINOR(v) ((v) & 0xffff)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct bootversion {
__be16 branch;
@@ -178,7 +178,7 @@ struct bootversion {
} machversions[];
} __packed;
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _UAPI_ASM_M68K_BOOTINFO_H */
diff --git a/arch/m68k/include/uapi/asm/ptrace.h b/arch/m68k/include/uapi/asm/ptrace.h
index ebd9fccb3d11..d70f771399b4 100644
--- a/arch/m68k/include/uapi/asm/ptrace.h
+++ b/arch/m68k/include/uapi/asm/ptrace.h
@@ -22,7 +22,7 @@
#define PT_SR 17
#define PT_PC 18
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/* this struct defines the way the registers are stored on the
stack during a system call. */
@@ -81,5 +81,5 @@ struct switch_stack {
#define PTRACE_GETFDPIC_EXEC 0
#define PTRACE_GETFDPIC_INTERP 1
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _UAPI_M68K_PTRACE_H */
diff --git a/arch/m68k/kernel/early_printk.c b/arch/m68k/kernel/early_printk.c
index f11ef9f1f56f..521cbb8a150c 100644
--- a/arch/m68k/kernel/early_printk.c
+++ b/arch/m68k/kernel/early_printk.c
@@ -16,25 +16,10 @@
#include "../mvme147/mvme147.h"
#include "../mvme16x/mvme16x.h"
-asmlinkage void __init debug_cons_nputs(const char *s, unsigned n);
-
-static void __ref debug_cons_write(struct console *c,
- const char *s, unsigned n)
-{
-#if !(defined(CONFIG_SUN3) || defined(CONFIG_M68000) || \
- defined(CONFIG_COLDFIRE))
- if (MACH_IS_MVME147)
- mvme147_scc_write(c, s, n);
- else if (MACH_IS_MVME16x)
- mvme16x_cons_write(c, s, n);
- else
- debug_cons_nputs(s, n);
-#endif
-}
+asmlinkage void __init debug_cons_nputs(struct console *c, const char *s, unsigned int n);
static struct console early_console_instance = {
.name = "debug",
- .write = debug_cons_write,
.flags = CON_PRINTBUFFER | CON_BOOT,
.index = -1
};
@@ -44,6 +29,12 @@ static int __init setup_early_printk(char *buf)
if (early_console || buf)
return 0;
+ if (MACH_IS_MVME147)
+ early_console_instance.write = mvme147_scc_write;
+ else if (MACH_IS_MVME16x)
+ early_console_instance.write = mvme16x_cons_write;
+ else
+ early_console_instance.write = debug_cons_nputs;
early_console = &early_console_instance;
register_console(early_console);
@@ -51,20 +42,15 @@ static int __init setup_early_printk(char *buf)
}
early_param("earlyprintk", setup_early_printk);
-/*
- * debug_cons_nputs() defined in arch/m68k/kernel/head.S cannot be called
- * after init sections are discarded (for platforms that use it).
- */
-#if !(defined(CONFIG_SUN3) || defined(CONFIG_M68000) || \
- defined(CONFIG_COLDFIRE))
-
static int __init unregister_early_console(void)
{
- if (!early_console || MACH_IS_MVME16x)
- return 0;
+ /*
+ * debug_cons_nputs() defined in arch/m68k/kernel/head.S cannot be
+ * called after init sections are discarded (for platforms that use it).
+ */
+ if (early_console && early_console->write == debug_cons_nputs)
+ return unregister_console(early_console);
- return unregister_console(early_console);
+ return 0;
}
late_initcall(unregister_early_console);
-
-#endif
diff --git a/arch/m68k/kernel/head.S b/arch/m68k/kernel/head.S
index 852255cf60de..2e4ef0358887 100644
--- a/arch/m68k/kernel/head.S
+++ b/arch/m68k/kernel/head.S
@@ -3263,8 +3263,8 @@ func_return putn
* turns around and calls the internal routines. This routine
* is used by the boot console.
*
- * The calling parameters are:
- * void debug_cons_nputs(const char *str, unsigned length)
+ * The function signature is -
+ * void debug_cons_nputs(struct console *c, const char *s, unsigned int n)
*
* This routine does NOT understand variable arguments only
* simple strings!
@@ -3273,8 +3273,8 @@ ENTRY(debug_cons_nputs)
moveml %d0/%d1/%a0,%sp@-
movew %sr,%sp@-
ori #0x0700,%sr
- movel %sp@(18),%a0 /* fetch parameter */
- movel %sp@(22),%d1 /* fetch parameter */
+ movel %sp@(22),%a0 /* char *s */
+ movel %sp@(26),%d1 /* unsigned int n */
jra 2f
1:
#ifdef CONSOLE_DEBUG
@@ -3400,6 +3400,7 @@ L(console_clear_loop):
movel %d4,%d1 /* screen height in pixels */
divul %a0@(FONT_DESC_HEIGHT),%d1 /* d1 = max num rows */
+ subql #1,%d1 /* row range is 0 to num - 1 */
movel %d0,%a2@(Lconsole_struct_num_columns)
movel %d1,%a2@(Lconsole_struct_num_rows)
@@ -3532,61 +3533,44 @@ func_start console_putc,%a0/%a1/%d0-%d7
tstl %pc@(L(console_font))
jeq L(console_exit)
+ lea %pc@(L(console_globals)),%a0
+
/* Output character in d7 on console.
*/
movel ARG1,%d7
cmpib #'\n',%d7
- jbne 1f
+ jne L(console_not_lf)
- /* A little safe recursion is good for the soul */
- console_putc #'\r'
-1:
- lea %pc@(L(console_globals)),%a0
+ clrl %a0@(Lconsole_struct_cur_column) /* implicit \r */
- cmpib #10,%d7
- jne L(console_not_lf)
movel %a0@(Lconsole_struct_cur_row),%d0
- addil #1,%d0
- movel %d0,%a0@(Lconsole_struct_cur_row)
movel %a0@(Lconsole_struct_num_rows),%d1
cmpl %d1,%d0
jcs 1f
- subil #1,%d0
- movel %d0,%a0@(Lconsole_struct_cur_row)
console_scroll
+ jra L(console_exit)
1:
+ addql #1,%d0
+ movel %d0,%a0@(Lconsole_struct_cur_row)
jra L(console_exit)
L(console_not_lf):
- cmpib #13,%d7
- jne L(console_not_cr)
+ cmpib #'\r',%d7
+ jne L(console_not_lf_not_cr)
clrl %a0@(Lconsole_struct_cur_column)
jra L(console_exit)
-L(console_not_cr):
- cmpib #1,%d7
- jne L(console_not_home)
- clrl %a0@(Lconsole_struct_cur_row)
- clrl %a0@(Lconsole_struct_cur_column)
- jra L(console_exit)
-
-/*
- * At this point we know that the %d7 character is going to be
- * rendered on the screen. Register usage is -
- * a0 = pointer to console globals
- * a1 = font data
- * d0 = cursor column
- * d1 = cursor row to draw the character
- * d7 = character number
- */
-L(console_not_home):
+ /*
+ * At this point we know that the %d7 character is going to be
+ * rendered on the screen. Register usage is -
+ * a0 = pointer to console globals
+ * a1 = font data
+ * d0 = cursor column
+ * d1 = cursor row to draw the character
+ * d7 = character number
+ */
+L(console_not_lf_not_cr):
movel %a0@(Lconsole_struct_cur_column),%d0
- addql #1,%a0@(Lconsole_struct_cur_column)
- movel %a0@(Lconsole_struct_num_columns),%d1
- cmpl %d1,%d0
- jcs 1f
- console_putc #'\n' /* recursion is OK! */
-1:
movel %a0@(Lconsole_struct_cur_row),%d1
/*
@@ -3633,6 +3617,23 @@ L(console_do_font_scanline):
addq #1,%d1
dbra %d7,L(console_read_char_scanline)
+ /*
+ * Register usage in the code below:
+ * a0 = pointer to console globals
+ * d0 = cursor column
+ * d1 = cursor column limit
+ */
+
+ lea %pc@(L(console_globals)),%a0
+
+ movel %a0@(Lconsole_struct_cur_column),%d0
+ addql #1,%d0
+ movel %d0,%a0@(Lconsole_struct_cur_column) /* Update cursor pos */
+ movel %a0@(Lconsole_struct_num_columns),%d1
+ cmpl %d1,%d0
+ jcs L(console_exit)
+ console_putc #'\n' /* Line wrap using tail recursion */
+
L(console_exit):
func_return console_putc
diff --git a/arch/m68k/mac/via.c b/arch/m68k/mac/via.c
index 01e6b0e37f8d..9cb813eda4fd 100644
--- a/arch/m68k/mac/via.c
+++ b/arch/m68k/mac/via.c
@@ -621,6 +621,22 @@ static u64 mac_read_clk(struct clocksource *cs)
* These problems are avoided by ignoring the low byte. Clock accuracy
* is 256 times worse (error can reach 0.327 ms) but CPU overhead is
* reduced by avoiding slow VIA register accesses.
+ *
+ * The VIA timer counter observably decrements to 0xFFFF before the
+ * counter reload interrupt gets raised. That complicates things a bit.
+ *
+ * State | vT1CH | VIA_TIMER_1_INT | inference drawn
+ * ------+------------+-----------------+-----------------------------
+ * i | FE thru 00 | false | counter is decrementing
+ * ii | FF | false | counter wrapped
+ * iii | FF | true | wrapped, interrupt raised
+ * iv | FF | false | wrapped, interrupt handled
+ * v | FE thru 00 | true | wrapped, interrupt unhandled
+ *
+ * State iv is never observed because handling the interrupt involves
+ * a 6522 register access and every access consumes a "phi 2" clock
+ * cycle. So 0xFF implies either state ii or state iii, depending on
+ * the value of the VIA_TIMER_1_INT bit.
*/
local_irq_save(flags);
diff --git a/arch/m68k/math-emu/fp_emu.h b/arch/m68k/math-emu/fp_emu.h
index c1ecfef7886a..6ac811c31ca4 100644
--- a/arch/m68k/math-emu/fp_emu.h
+++ b/arch/m68k/math-emu/fp_emu.h
@@ -38,12 +38,12 @@
#ifndef _FP_EMU_H
#define _FP_EMU_H
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#include <asm/asm-offsets.h>
#endif
#include <asm/math-emu.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#define IS_INF(a) ((a)->exp == 0x7fff)
#define IS_ZERO(a) ((a)->mant.m64 == 0)
@@ -124,7 +124,7 @@ extern const struct fp_ext fp_Inf;
: "a1", "d1", "d2", "memory"); \
})
-#else /* __ASSEMBLY__ */
+#else /* __ASSEMBLER__ */
/*
* set, reset or clear a bit in the fp status register
@@ -141,6 +141,6 @@ extern const struct fp_ext fp_Inf;
btst #(\bit&7),(FPD_FPSR+3-(\bit/8),FPDATA)
.endm
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _FP_EMU_H */
diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c
index 745bd575dcfa..62283bc2ed79 100644
--- a/arch/m68k/mm/motorola.c
+++ b/arch/m68k/mm/motorola.c
@@ -92,7 +92,7 @@ void mmu_page_dtor(void *page)
}
/* ++andreas: {get,free}_pointer_table rewritten to use unused fields from
- struct page instead of separately kmalloced struct. Stolen from
+ struct ptdesc instead of separately kmalloced struct. Stolen from
arch/sparc/mm/srmmu.c ... */
typedef struct list_head ptable_desc;
@@ -103,8 +103,7 @@ static struct list_head ptable_list[3] = {
LIST_HEAD_INIT(ptable_list[2]),
};
-#define PD_PTABLE(page) ((ptable_desc *)&(virt_to_page((void *)(page))->lru))
-#define PD_PAGE(ptable) (list_entry(ptable, struct page, lru))
+#define PD_PTABLE(ptdesc) ((ptable_desc *)&(virt_to_ptdesc((void *)(ptdesc))->pt_list))
#define PD_PTDESC(ptable) (list_entry(ptable, struct ptdesc, pt_list))
#define PD_MARKBITS(dp) (*(unsigned int *)&PD_PTDESC(dp)->pt_index)
@@ -121,10 +120,10 @@ void __init init_pointer_table(void *table, int type)
{
ptable_desc *dp;
unsigned long ptable = (unsigned long)table;
- unsigned long page = ptable & PAGE_MASK;
- unsigned int mask = 1U << ((ptable - page)/ptable_size(type));
+ unsigned long pt_addr = ptable & PAGE_MASK;
+ unsigned int mask = 1U << ((ptable - pt_addr)/ptable_size(type));
- dp = PD_PTABLE(page);
+ dp = PD_PTABLE(pt_addr);
if (!(PD_MARKBITS(dp) & mask)) {
PD_MARKBITS(dp) = ptable_mask(type);
list_add(dp, &ptable_list[type]);
@@ -133,9 +132,9 @@ void __init init_pointer_table(void *table, int type)
PD_MARKBITS(dp) &= ~mask;
pr_debug("init_pointer_table: %lx, %x\n", ptable, PD_MARKBITS(dp));
- /* unreserve the page so it's possible to free that page */
- __ClearPageReserved(PD_PAGE(dp));
- init_page_count(PD_PAGE(dp));
+ /* unreserve the ptdesc so it's possible to free that ptdesc */
+ __ClearPageReserved(ptdesc_page(PD_PTDESC(dp)));
+ init_page_count(ptdesc_page(PD_PTDESC(dp)));
return;
}
@@ -148,40 +147,44 @@ void *get_pointer_table(struct mm_struct *mm, int type)
/*
* For a pointer table for a user process address space, a
- * table is taken from a page allocated for the purpose. Each
- * page can hold 8 pointer tables. The page is remapped in
+ * table is taken from a ptdesc allocated for the purpose. Each
+ * ptdesc can hold 8 pointer tables. The ptdesc is remapped in
* virtual address space to be noncacheable.
*/
if (mask == 0) {
- void *page;
+ struct ptdesc *ptdesc;
ptable_desc *new;
+ void *pt_addr;
- if (!(page = (void *)get_zeroed_page(GFP_KERNEL)))
+ ptdesc = pagetable_alloc(GFP_KERNEL | __GFP_ZERO, 0);
+ if (!ptdesc)
return NULL;
+ pt_addr = ptdesc_address(ptdesc);
+
switch (type) {
case TABLE_PTE:
/*
* m68k doesn't have SPLIT_PTE_PTLOCKS for not having
* SMP.
*/
- pagetable_pte_ctor(mm, virt_to_ptdesc(page));
+ pagetable_pte_ctor(mm, ptdesc);
break;
case TABLE_PMD:
- pagetable_pmd_ctor(mm, virt_to_ptdesc(page));
+ pagetable_pmd_ctor(mm, ptdesc);
break;
case TABLE_PGD:
- pagetable_pgd_ctor(virt_to_ptdesc(page));
+ pagetable_pgd_ctor(ptdesc);
break;
}
- mmu_page_ctor(page);
+ mmu_page_ctor(pt_addr);
- new = PD_PTABLE(page);
+ new = PD_PTABLE(pt_addr);
PD_MARKBITS(new) = ptable_mask(type) - 1;
list_add_tail(new, dp);
- return (pmd_t *)page;
+ return (pmd_t *)pt_addr;
}
for (tmp = 1, off = 0; (mask & tmp) == 0; tmp <<= 1, off += ptable_size(type))
@@ -191,28 +194,27 @@ void *get_pointer_table(struct mm_struct *mm, int type)
/* move to end of list */
list_move_tail(dp, &ptable_list[type]);
}
- return page_address(PD_PAGE(dp)) + off;
+ return ptdesc_address(PD_PTDESC(dp)) + off;
}
int free_pointer_table(void *table, int type)
{
ptable_desc *dp;
unsigned long ptable = (unsigned long)table;
- unsigned long page = ptable & PAGE_MASK;
- unsigned int mask = 1U << ((ptable - page)/ptable_size(type));
+ unsigned long pt_addr = ptable & PAGE_MASK;
+ unsigned int mask = 1U << ((ptable - pt_addr)/ptable_size(type));
- dp = PD_PTABLE(page);
+ dp = PD_PTABLE(pt_addr);
if (PD_MARKBITS (dp) & mask)
panic ("table already free!");
PD_MARKBITS (dp) |= mask;
if (PD_MARKBITS(dp) == ptable_mask(type)) {
- /* all tables in page are free, free page */
+ /* all tables in ptdesc are free, free ptdesc */
list_del(dp);
- mmu_page_dtor((void *)page);
- pagetable_dtor(virt_to_ptdesc((void *)page));
- free_page (page);
+ mmu_page_dtor((void *)pt_addr);
+ pagetable_dtor_free(virt_to_ptdesc((void *)pt_addr));
return 1;
} else if (ptable_list[type].next != dp) {
/*
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index f18ec02ddeb2..484ebb3baedf 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -28,7 +28,6 @@ config MICROBLAZE
select HAVE_DEBUG_KMEMLEAK
select HAVE_DMA_CONTIGUOUS
select HAVE_DYNAMIC_FTRACE
- select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER
select HAVE_PAGE_SIZE_4KB
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 934eb961bd0d..f4a4f82eed98 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -73,7 +73,6 @@ config MIPS
select HAVE_EBPF_JIT if !CPU_MICROMIPS
select HAVE_EXIT_THREAD
select HAVE_GUP_FAST
- select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER
select HAVE_GCC_PLUGINS
diff --git a/arch/mips/boot/dts/ralink/mt7620a.dtsi b/arch/mips/boot/dts/ralink/mt7620a.dtsi
index d66045948a83..460164bdd430 100644
--- a/arch/mips/boot/dts/ralink/mt7620a.dtsi
+++ b/arch/mips/boot/dts/ralink/mt7620a.dtsi
@@ -62,4 +62,14 @@
reg-shift = <2>;
};
};
+
+ wmac: wifi@10180000 {
+ compatible = "ralink,rt2880-wifi";
+ reg = <0x10180000 0x40000>;
+
+ clocks = <&sysc 16>;
+
+ interrupt-parent = <&cpuintc>;
+ interrupts = <6>;
+ };
};
diff --git a/arch/mips/configs/fuloong2e_defconfig b/arch/mips/configs/fuloong2e_defconfig
index 114fcd67898d..cdedbb8a8f53 100644
--- a/arch/mips/configs/fuloong2e_defconfig
+++ b/arch/mips/configs/fuloong2e_defconfig
@@ -44,7 +44,6 @@ CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
CONFIG_NETFILTER_XT_TARGET_TRACE=m
CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m
CONFIG_NETFILTER_XT_MATCH_COMMENT=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
CONFIG_NETFILTER_XT_MATCH_ESP=m
CONFIG_NETFILTER_XT_MATCH_IPRANGE=m
CONFIG_NETFILTER_XT_MATCH_LENGTH=m
diff --git a/arch/mips/configs/ip22_defconfig b/arch/mips/configs/ip22_defconfig
index f1a8ccf2c459..2decf8b98d31 100644
--- a/arch/mips/configs/ip22_defconfig
+++ b/arch/mips/configs/ip22_defconfig
@@ -79,7 +79,6 @@ CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m
CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
CONFIG_NETFILTER_XT_MATCH_DSCP=m
CONFIG_NETFILTER_XT_MATCH_ESP=m
CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m
diff --git a/arch/mips/configs/loongson2k_defconfig b/arch/mips/configs/loongson2k_defconfig
index 4b7f914d01d0..6aea6a5b1b66 100644
--- a/arch/mips/configs/loongson2k_defconfig
+++ b/arch/mips/configs/loongson2k_defconfig
@@ -52,7 +52,6 @@ CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
CONFIG_NETFILTER_XT_TARGET_MARK=m
CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
CONFIG_NETFILTER_XT_MATCH_COMMENT=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
CONFIG_NETFILTER_XT_MATCH_ESP=m
CONFIG_NETFILTER_XT_MATCH_LENGTH=m
CONFIG_NETFILTER_XT_MATCH_LIMIT=m
diff --git a/arch/mips/configs/loongson3_defconfig b/arch/mips/configs/loongson3_defconfig
index 5ff0c1554168..b00344e0d899 100644
--- a/arch/mips/configs/loongson3_defconfig
+++ b/arch/mips/configs/loongson3_defconfig
@@ -72,7 +72,6 @@ CONFIG_NETFILTER_XT_TARGET_MARK=m
CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
CONFIG_NETFILTER_XT_MATCH_COMMENT=m
CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
CONFIG_NETFILTER_XT_MATCH_ESP=m
CONFIG_NETFILTER_XT_MATCH_LENGTH=m
CONFIG_NETFILTER_XT_MATCH_LIMIT=m
diff --git a/arch/mips/configs/malta_defconfig b/arch/mips/configs/malta_defconfig
index 869a14b3184f..9fcbac829920 100644
--- a/arch/mips/configs/malta_defconfig
+++ b/arch/mips/configs/malta_defconfig
@@ -80,7 +80,6 @@ CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m
CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
CONFIG_NETFILTER_XT_MATCH_ESP=m
CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m
CONFIG_NETFILTER_XT_MATCH_HELPER=m
diff --git a/arch/mips/configs/malta_kvm_defconfig b/arch/mips/configs/malta_kvm_defconfig
index 41e1fea303ea..19102386a81c 100644
--- a/arch/mips/configs/malta_kvm_defconfig
+++ b/arch/mips/configs/malta_kvm_defconfig
@@ -84,7 +84,6 @@ CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m
CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
CONFIG_NETFILTER_XT_MATCH_ESP=m
CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m
CONFIG_NETFILTER_XT_MATCH_HELPER=m
diff --git a/arch/mips/configs/maltaup_xpa_defconfig b/arch/mips/configs/maltaup_xpa_defconfig
index 13ff1877e26e..1dd07c9d1812 100644
--- a/arch/mips/configs/maltaup_xpa_defconfig
+++ b/arch/mips/configs/maltaup_xpa_defconfig
@@ -82,7 +82,6 @@ CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m
CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
CONFIG_NETFILTER_XT_MATCH_ESP=m
CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m
CONFIG_NETFILTER_XT_MATCH_HELPER=m
diff --git a/arch/mips/configs/rb532_defconfig b/arch/mips/configs/rb532_defconfig
index 9fb114ef5e2d..30d18b084cda 100644
--- a/arch/mips/configs/rb532_defconfig
+++ b/arch/mips/configs/rb532_defconfig
@@ -56,7 +56,6 @@ CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
CONFIG_NETFILTER_XT_TARGET_TRACE=m
CONFIG_NETFILTER_XT_MATCH_COMMENT=m
CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m
CONFIG_NETFILTER_XT_MATCH_LIMIT=y
CONFIG_NETFILTER_XT_MATCH_MULTIPORT=y
diff --git a/arch/mips/configs/rm200_defconfig b/arch/mips/configs/rm200_defconfig
index 7b5a5591ccc9..39a2419e1f3e 100644
--- a/arch/mips/configs/rm200_defconfig
+++ b/arch/mips/configs/rm200_defconfig
@@ -64,7 +64,6 @@ CONFIG_NETFILTER_XT_MATCH_COMMENT=m
CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m
CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
CONFIG_NETFILTER_XT_MATCH_DSCP=m
CONFIG_NETFILTER_XT_MATCH_ESP=m
CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index 31ac655b7837..72fb1b006da9 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -163,6 +163,9 @@
#define SO_PASSRIGHTS 83
+#define SO_INQ 84
+#define SCM_INQ SO_INQ
+
#if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index fcc5973f7519..2efa4b08b7b8 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -81,7 +81,6 @@ config PARISC
select HAVE_KPROBES
select HAVE_KRETPROBES
select HAVE_DYNAMIC_FTRACE if $(cc-option,-fpatchable-function-entry=1,1)
- select HAVE_FTRACE_MCOUNT_RECORD if HAVE_DYNAMIC_FTRACE
select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY if DYNAMIC_FTRACE
select HAVE_KPROBES_ON_FTRACE
select HAVE_DYNAMIC_FTRACE_WITH_REGS
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index 1f2d5b7a7f5d..c16ec36dfee6 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -144,6 +144,9 @@
#define SO_PASSRIGHTS 0x4051
+#define SO_INQ 0x4052
+#define SCM_INQ SO_INQ
+
#if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 45b4fa7b9b02..514d0e92b336 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -244,7 +244,6 @@ config PPC
select HAVE_EFFICIENT_UNALIGNED_ACCESS
select HAVE_GUP_FAST
select HAVE_FTRACE_GRAPH_FUNC
- select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_ARG_ACCESS_API
select HAVE_FUNCTION_DESCRIPTORS if PPC64_ELF_ABI_V1
select HAVE_FUNCTION_ERROR_INJECTION
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index f3804103c56c..9753fb87217c 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -101,7 +101,7 @@ KBUILD_LDFLAGS += -m elf$(BITS)$(LDEMULATION)
endif
LDFLAGS_vmlinux-y := -Bstatic
-LDFLAGS_vmlinux-$(CONFIG_RELOCATABLE) := -pie
+LDFLAGS_vmlinux-$(CONFIG_RELOCATABLE) := -pie --no-dynamic-linker
LDFLAGS_vmlinux-$(CONFIG_RELOCATABLE) += -z notext
LDFLAGS_vmlinux := $(LDFLAGS_vmlinux-y)
diff --git a/arch/powerpc/boot/dts/microwatt.dts b/arch/powerpc/boot/dts/microwatt.dts
index b7eac4e56019..292b909ca9ce 100644
--- a/arch/powerpc/boot/dts/microwatt.dts
+++ b/arch/powerpc/boot/dts/microwatt.dts
@@ -37,7 +37,7 @@
ibm,powerpc-cpu-features {
display-name = "Microwatt";
- isa = <3010>;
+ isa = <3100>;
device_type = "cpu-features";
compatible = "ibm,powerpc-cpu-features";
diff --git a/arch/powerpc/configs/cell_defconfig b/arch/powerpc/configs/cell_defconfig
index 3347192b77b8..7a31b52e92e1 100644
--- a/arch/powerpc/configs/cell_defconfig
+++ b/arch/powerpc/configs/cell_defconfig
@@ -62,7 +62,6 @@ CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
CONFIG_NETFILTER_XT_TARGET_TCPMSS=m
CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m
CONFIG_NETFILTER_XT_MATCH_COMMENT=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
CONFIG_NETFILTER_XT_MATCH_DSCP=m
CONFIG_NETFILTER_XT_MATCH_ESP=m
CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m
diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig
index 98f56e63ad21..d06388b0f66e 100644
--- a/arch/powerpc/configs/powernv_defconfig
+++ b/arch/powerpc/configs/powernv_defconfig
@@ -46,7 +46,7 @@ CONFIG_CPU_FREQ_GOV_POWERSAVE=y
CONFIG_CPU_FREQ_GOV_USERSPACE=y
CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y
CONFIG_CPU_IDLE=y
-CONFIG_HZ_100=y
+CONFIG_HZ_1000=y
CONFIG_BINFMT_MISC=m
CONFIG_PPC_TRANSACTIONAL_MEM=y
CONFIG_PPC_UV=y
@@ -340,3 +340,4 @@ CONFIG_KVM_BOOK3S_64_HV=m
CONFIG_VHOST_NET=m
CONFIG_PRINTK_TIME=y
CONFIG_PRINTK_CALLER=y
+CONFIG_KALLSYMS_ALL=y
diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig
index dca67aae5da3..ce34597e9f3e 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -57,7 +57,7 @@ CONFIG_CPU_FREQ_GOV_POWERSAVE=y
CONFIG_CPU_FREQ_GOV_USERSPACE=y
CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y
CONFIG_CPU_FREQ_PMAC64=y
-CONFIG_HZ_100=y
+CONFIG_HZ_1000=y
CONFIG_PPC_TRANSACTIONAL_MEM=y
CONFIG_KEXEC=y
CONFIG_KEXEC_FILE=y
@@ -465,3 +465,4 @@ CONFIG_TEST_MEMCAT_P=m
CONFIG_TEST_MEMINIT=m
CONFIG_TEST_FREE_PAGES=m
CONFIG_MEMTEST=y
+CONFIG_KALLSYMS_ALL=y
diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig
index f96f8ed9856c..bb359643ddc1 100644
--- a/arch/powerpc/configs/ppc6xx_defconfig
+++ b/arch/powerpc/configs/ppc6xx_defconfig
@@ -252,7 +252,6 @@ CONFIG_NET_SCH_DSMARK=m
CONFIG_NET_SCH_NETEM=m
CONFIG_NET_SCH_INGRESS=m
CONFIG_NET_CLS_BASIC=m
-CONFIG_NET_CLS_TCINDEX=m
CONFIG_NET_CLS_ROUTE4=m
CONFIG_NET_CLS_FW=m
CONFIG_NET_CLS_U32=m
diff --git a/arch/powerpc/include/asm/floppy.h b/arch/powerpc/include/asm/floppy.h
index f8ce178b43b7..34abf8bea2cc 100644
--- a/arch/powerpc/include/asm/floppy.h
+++ b/arch/powerpc/include/asm/floppy.h
@@ -144,9 +144,12 @@ static int hard_dma_setup(char *addr, unsigned long size, int mode, int io)
bus_addr = 0;
}
- if (!bus_addr) /* need to map it */
+ if (!bus_addr) { /* need to map it */
bus_addr = dma_map_single(&isa_bridge_pcidev->dev, addr, size,
dir);
+ if (dma_mapping_error(&isa_bridge_pcidev->dev, bus_addr))
+ return -ENOMEM;
+ }
/* remember this one as prev */
prev_addr = addr;
diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index 6df6dbbe1e7c..ea6c8dc400d2 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -270,6 +270,7 @@
#define H_QUERY_INT_STATE 0x1E4
#define H_POLL_PENDING 0x1D8
#define H_ILLAN_ATTRIBUTES 0x244
+#define H_ADD_LOGICAL_LAN_BUFFERS 0x248
#define H_MODIFY_HEA_QP 0x250
#define H_QUERY_HEA_QP 0x254
#define H_QUERY_HEA 0x258
diff --git a/arch/powerpc/include/uapi/asm/eeh.h b/arch/powerpc/include/uapi/asm/eeh.h
index 28186071fafc..3b5c47ff3fc4 100644
--- a/arch/powerpc/include/uapi/asm/eeh.h
+++ b/arch/powerpc/include/uapi/asm/eeh.h
@@ -1,18 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
* Copyright IBM Corp. 2015
*
* Authors: Gavin Shan <gwshan@linux.vnet.ibm.com>
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index eaeda001784e..077c5437f521 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -1,18 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
* Copyright IBM Corp. 2007
*
* Authors: Hollis Blanchard <hollisb@us.ibm.com>
diff --git a/arch/powerpc/include/uapi/asm/kvm_para.h b/arch/powerpc/include/uapi/asm/kvm_para.h
index a809b1b44ddf..ac596064d4c7 100644
--- a/arch/powerpc/include/uapi/asm/kvm_para.h
+++ b/arch/powerpc/include/uapi/asm/kvm_para.h
@@ -1,18 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
* Copyright IBM Corp. 2008
*
* Authors: Hollis Blanchard <hollisb@us.ibm.com>
diff --git a/arch/powerpc/include/uapi/asm/ps3fb.h b/arch/powerpc/include/uapi/asm/ps3fb.h
index fd7e3a0d35d5..b1c6b0cd9e80 100644
--- a/arch/powerpc/include/uapi/asm/ps3fb.h
+++ b/arch/powerpc/include/uapi/asm/ps3fb.h
@@ -2,19 +2,6 @@
/*
* Copyright (C) 2006 Sony Computer Entertainment Inc.
* Copyright 2006, 2007 Sony Corporation
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published
- * by the Free Software Foundation; version 2 of the License.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#ifndef _ASM_POWERPC_PS3FB_H_
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index ca7f7bb2b478..13578f4db254 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1208,16 +1208,16 @@ int eeh_dev_open(struct pci_dev *pdev)
struct eeh_dev *edev;
int ret = -ENODEV;
- mutex_lock(&eeh_dev_mutex);
+ guard(mutex)(&eeh_dev_mutex);
/* No PCI device ? */
if (!pdev)
- goto out;
+ return ret;
/* No EEH device or PE ? */
edev = pci_dev_to_eeh_dev(pdev);
if (!edev || !edev->pe)
- goto out;
+ return ret;
/*
* The PE might have been put into frozen state, but we
@@ -1227,16 +1227,12 @@ int eeh_dev_open(struct pci_dev *pdev)
*/
ret = eeh_pe_change_owner(edev->pe);
if (ret)
- goto out;
+ return ret;
/* Increase PE's pass through count */
atomic_inc(&edev->pe->pass_dev_cnt);
- mutex_unlock(&eeh_dev_mutex);
return 0;
-out:
- mutex_unlock(&eeh_dev_mutex);
- return ret;
}
EXPORT_SYMBOL_GPL(eeh_dev_open);
@@ -1252,22 +1248,20 @@ void eeh_dev_release(struct pci_dev *pdev)
{
struct eeh_dev *edev;
- mutex_lock(&eeh_dev_mutex);
+ guard(mutex)(&eeh_dev_mutex);
/* No PCI device ? */
if (!pdev)
- goto out;
+ return;
/* No EEH device ? */
edev = pci_dev_to_eeh_dev(pdev);
if (!edev || !edev->pe || !eeh_pe_passed(edev->pe))
- goto out;
+ return;
/* Decrease PE's pass through count */
WARN_ON(atomic_dec_if_positive(&edev->pe->pass_dev_cnt) < 0);
eeh_pe_change_owner(edev->pe);
-out:
- mutex_unlock(&eeh_dev_mutex);
}
EXPORT_SYMBOL(eeh_dev_release);
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 7efe04c68f0f..10ce6b3bd3b7 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -907,7 +907,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
/* FIXME: Use the same format as dump_stack() */
pr_err("EEH: Call Trace:\n");
for (i = 0; i < pe->trace_entries; i++)
- pr_err("EEH: [%pK] %pS\n", ptrs[i], ptrs[i]);
+ pr_err("EEH: [%p] %pS\n", ptrs[i], ptrs[i]);
pe->trace_entries = 0;
}
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 8ca49e40c473..8a050f30e6d9 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -1373,15 +1373,12 @@ static void fadump_free_elfcorehdr_buf(void)
static void fadump_invalidate_release_mem(void)
{
- mutex_lock(&fadump_mutex);
- if (!fw_dump.dump_active) {
- mutex_unlock(&fadump_mutex);
- return;
+ scoped_guard(mutex, &fadump_mutex) {
+ if (!fw_dump.dump_active)
+ return;
+ fadump_cleanup();
}
- fadump_cleanup();
- mutex_unlock(&fadump_mutex);
-
fadump_free_elfcorehdr_buf();
fadump_release_memory(fw_dump.boot_mem_top, memblock_end_of_DRAM());
fadump_free_cpu_notes_buf();
diff --git a/arch/powerpc/kernel/rtas_flash.c b/arch/powerpc/kernel/rtas_flash.c
index 5407024881e5..583dc16e9d3c 100644
--- a/arch/powerpc/kernel/rtas_flash.c
+++ b/arch/powerpc/kernel/rtas_flash.c
@@ -312,13 +312,13 @@ static ssize_t rtas_flash_write(struct file *file, const char __user *buffer,
{
struct rtas_update_flash_t *const uf = &rtas_update_flash_data;
char *p;
- int next_free, rc;
+ int next_free;
struct flash_block_list *fl;
- mutex_lock(&rtas_update_flash_mutex);
+ guard(mutex)(&rtas_update_flash_mutex);
if (uf->status == FLASH_AUTH || count == 0)
- goto out; /* discard data */
+ return count; /* discard data */
/* In the case that the image is not ready for flashing, the memory
* allocated for the block list will be freed upon the release of the
@@ -327,7 +327,7 @@ static ssize_t rtas_flash_write(struct file *file, const char __user *buffer,
if (uf->flist == NULL) {
uf->flist = kmem_cache_zalloc(flash_block_cache, GFP_KERNEL);
if (!uf->flist)
- goto nomem;
+ return -ENOMEM;
}
fl = uf->flist;
@@ -338,7 +338,7 @@ static ssize_t rtas_flash_write(struct file *file, const char __user *buffer,
/* Need to allocate another block_list */
fl->next = kmem_cache_zalloc(flash_block_cache, GFP_KERNEL);
if (!fl->next)
- goto nomem;
+ return -ENOMEM;
fl = fl->next;
next_free = 0;
}
@@ -347,25 +347,17 @@ static ssize_t rtas_flash_write(struct file *file, const char __user *buffer,
count = RTAS_BLK_SIZE;
p = kmem_cache_zalloc(flash_block_cache, GFP_KERNEL);
if (!p)
- goto nomem;
+ return -ENOMEM;
if(copy_from_user(p, buffer, count)) {
kmem_cache_free(flash_block_cache, p);
- rc = -EFAULT;
- goto error;
+ return -EFAULT;
}
fl->blocks[next_free].data = p;
fl->blocks[next_free].length = count;
fl->num_blocks++;
-out:
- mutex_unlock(&rtas_update_flash_mutex);
- return count;
-nomem:
- rc = -ENOMEM;
-error:
- mutex_unlock(&rtas_update_flash_mutex);
- return rc;
+ return count;
}
/*
@@ -405,19 +397,18 @@ static ssize_t manage_flash_write(struct file *file, const char __user *buf,
static const char reject_str[] = "0";
static const char commit_str[] = "1";
char stkbuf[10];
- int op, rc;
+ int op;
- mutex_lock(&rtas_manage_flash_mutex);
+ guard(mutex)(&rtas_manage_flash_mutex);
if ((args_buf->status == MANAGE_AUTH) || (count == 0))
- goto out;
+ return count;
op = -1;
if (buf) {
if (count > 9) count = 9;
- rc = -EFAULT;
if (copy_from_user (stkbuf, buf, count))
- goto error;
+ return -EFAULT;
if (strncmp(stkbuf, reject_str, strlen(reject_str)) == 0)
op = RTAS_REJECT_TMP_IMG;
else if (strncmp(stkbuf, commit_str, strlen(commit_str)) == 0)
@@ -425,18 +416,11 @@ static ssize_t manage_flash_write(struct file *file, const char __user *buf,
}
if (op == -1) { /* buf is empty, or contains invalid string */
- rc = -EINVAL;
- goto error;
+ return -EINVAL;
}
manage_flash(args_buf, op);
-out:
- mutex_unlock(&rtas_manage_flash_mutex);
return count;
-
-error:
- mutex_unlock(&rtas_manage_flash_mutex);
- return rc;
}
/*
@@ -499,16 +483,14 @@ static ssize_t validate_flash_write(struct file *file, const char __user *buf,
{
struct rtas_validate_flash_t *const args_buf =
&rtas_validate_flash_data;
- int rc;
- mutex_lock(&rtas_validate_flash_mutex);
+ guard(mutex)(&rtas_validate_flash_mutex);
/* We are only interested in the first 4K of the
* candidate image */
if ((*off >= VALIDATE_BUF_SIZE) ||
(args_buf->status == VALIDATE_AUTH)) {
*off += count;
- mutex_unlock(&rtas_validate_flash_mutex);
return count;
}
@@ -519,20 +501,14 @@ static ssize_t validate_flash_write(struct file *file, const char __user *buf,
args_buf->status = VALIDATE_INCOMPLETE;
}
- if (!access_ok(buf, count)) {
- rc = -EFAULT;
- goto done;
- }
- if (copy_from_user(args_buf->buf + *off, buf, count)) {
- rc = -EFAULT;
- goto done;
- }
+ if (!access_ok(buf, count))
+ return -EFAULT;
+
+ if (copy_from_user(args_buf->buf + *off, buf, count))
+ return -EFAULT;
*off += count;
- rc = count;
-done:
- mutex_unlock(&rtas_validate_flash_mutex);
- return rc;
+ return count;
}
static int validate_flash_release(struct inode *inode, struct file *file)
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 5ac7084eebc0..f59e4b9cc207 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1700,28 +1700,23 @@ static void __init build_sched_topology(void)
#ifdef CONFIG_SCHED_SMT
if (has_big_cores) {
pr_info("Big cores detected but using small core scheduling\n");
- powerpc_topology[i++] = (struct sched_domain_topology_level){
- smallcore_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT)
- };
+ powerpc_topology[i++] =
+ SDTL_INIT(smallcore_smt_mask, powerpc_smt_flags, SMT);
} else {
- powerpc_topology[i++] = (struct sched_domain_topology_level){
- cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT)
- };
+ powerpc_topology[i++] = SDTL_INIT(cpu_smt_mask, powerpc_smt_flags, SMT);
}
#endif
if (shared_caches) {
- powerpc_topology[i++] = (struct sched_domain_topology_level){
- shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE)
- };
+ powerpc_topology[i++] =
+ SDTL_INIT(shared_cache_mask, powerpc_shared_cache_flags, CACHE);
}
+
if (has_coregroup_support()) {
- powerpc_topology[i++] = (struct sched_domain_topology_level){
- cpu_mc_mask, powerpc_shared_proc_flags, SD_INIT_NAME(MC)
- };
+ powerpc_topology[i++] =
+ SDTL_INIT(cpu_mc_mask, powerpc_shared_proc_flags, MC);
}
- powerpc_topology[i++] = (struct sched_domain_topology_level){
- cpu_cpu_mask, powerpc_shared_proc_flags, SD_INIT_NAME(PKG)
- };
+
+ powerpc_topology[i++] = SDTL_INIT(cpu_cpu_mask, powerpc_shared_proc_flags, PKG);
/* There must be one trailing NULL entry left. */
BUG_ON(i >= ARRAY_SIZE(powerpc_topology) - 1);
diff --git a/arch/powerpc/kvm/trace_book3s.h b/arch/powerpc/kvm/trace_book3s.h
index 372a82fa2de3..9260ddbd557f 100644
--- a/arch/powerpc/kvm/trace_book3s.h
+++ b/arch/powerpc/kvm/trace_book3s.h
@@ -25,6 +25,7 @@
{0xe00, "H_DATA_STORAGE"}, \
{0xe20, "H_INST_STORAGE"}, \
{0xe40, "H_EMUL_ASSIST"}, \
+ {0xea0, "H_VIRT"}, \
{0xf00, "PERFMON"}, \
{0xf20, "ALTIVEC"}, \
{0xf40, "VSX"}
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 376dba5992f2..d865ec915f9d 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -1122,18 +1122,25 @@ int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, in
pte_t *pte;
/*
- * Make sure we align the start vmemmap addr so that we calculate
- * the correct start_pfn in altmap boundary check to decided whether
- * we should use altmap or RAM based backing memory allocation. Also
- * the address need to be aligned for set_pte operation.
-
- * If the start addr is already PMD_SIZE aligned we will try to use
- * a pmd mapping. We don't want to be too aggressive here beacause
- * that will cause more allocations in RAM. So only if the namespace
- * vmemmap start addr is PMD_SIZE aligned we will use PMD mapping.
+ * If altmap is present, Make sure we align the start vmemmap addr
+ * to PAGE_SIZE so that we calculate the correct start_pfn in
+ * altmap boundary check to decide whether we should use altmap or
+ * RAM based backing memory allocation. Also the address need to be
+ * aligned for set_pte operation. If the start addr is already
+ * PMD_SIZE aligned and with in the altmap boundary then we will
+ * try to use a pmd size altmap mapping else we go for page size
+ * mapping.
+ *
+ * If altmap is not present, align the vmemmap addr to PMD_SIZE and
+ * always allocate a PMD size page for vmemmap backing.
+ *
*/
- start = ALIGN_DOWN(start, PAGE_SIZE);
+ if (altmap)
+ start = ALIGN_DOWN(start, PAGE_SIZE);
+ else
+ start = ALIGN_DOWN(start, PMD_SIZE);
+
for (addr = start; addr < end; addr = next) {
next = pmd_addr_end(addr, end);
@@ -1159,7 +1166,7 @@ int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, in
* in altmap block allocation failures, in which case
* we fallback to RAM for vmemmap allocation.
*/
- if (!IS_ALIGNED(addr, PMD_SIZE) || (altmap &&
+ if (altmap && (!IS_ALIGNED(addr, PMD_SIZE) ||
altmap_cross_boundary(altmap, addr, PMD_SIZE))) {
/*
* make sure we don't create altmap mappings
@@ -1173,7 +1180,7 @@ int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, in
vmemmap_set_pmd(pmd, p, node, addr, next);
pr_debug("PMD_SIZE vmemmap mapping\n");
continue;
- } else if (altmap) {
+ } else {
/*
* A vmemmap block allocation can fail due to
* alignment requirements and we trying to align
diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index 5daa77aee7f7..a25a6ffe7d7c 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -370,6 +370,23 @@ static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 o
return 0;
}
+bool bpf_jit_bypass_spec_v1(void)
+{
+#if defined(CONFIG_PPC_E500) || defined(CONFIG_PPC_BOOK3S_64)
+ return !(security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) &&
+ security_ftr_enabled(SEC_FTR_BNDS_CHK_SPEC_BAR));
+#else
+ return true;
+#endif
+}
+
+bool bpf_jit_bypass_spec_v4(void)
+{
+ return !(security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) &&
+ security_ftr_enabled(SEC_FTR_STF_BARRIER) &&
+ stf_barrier_type_get() != STF_BARRIER_NONE);
+}
+
/*
* We spill into the redzone always, even if the bpf program has its own stackframe.
* Offsets hardcoded based on BPF_PPC_STACK_SAVE -- see bpf_jit_stack_local()
@@ -397,6 +414,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code
u32 *addrs, int pass, bool extra_pass)
{
enum stf_barrier_type stf_barrier = stf_barrier_type_get();
+ bool sync_emitted, ori31_emitted;
const struct bpf_insn *insn = fp->insnsi;
int flen = fp->len;
int i, ret;
@@ -789,30 +807,51 @@ emit_clear:
/*
* BPF_ST NOSPEC (speculation barrier)
+ *
+ * The following must act as a barrier against both Spectre v1
+ * and v4 if we requested both mitigations. Therefore, also emit
+ * 'isync; sync' on E500 or 'ori31' on BOOK3S_64 in addition to
+ * the insns needed for a Spectre v4 barrier.
+ *
+ * If we requested only !bypass_spec_v1 OR only !bypass_spec_v4,
+ * we can skip the respective other barrier type as an
+ * optimization.
*/
case BPF_ST | BPF_NOSPEC:
- if (!security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) ||
- !security_ftr_enabled(SEC_FTR_STF_BARRIER))
- break;
-
- switch (stf_barrier) {
- case STF_BARRIER_EIEIO:
- EMIT(PPC_RAW_EIEIO() | 0x02000000);
- break;
- case STF_BARRIER_SYNC_ORI:
+ sync_emitted = false;
+ ori31_emitted = false;
+ if (IS_ENABLED(CONFIG_PPC_E500) &&
+ !bpf_jit_bypass_spec_v1()) {
+ EMIT(PPC_RAW_ISYNC());
EMIT(PPC_RAW_SYNC());
- EMIT(PPC_RAW_LD(tmp1_reg, _R13, 0));
- EMIT(PPC_RAW_ORI(_R31, _R31, 0));
- break;
- case STF_BARRIER_FALLBACK:
- ctx->seen |= SEEN_FUNC;
- PPC_LI64(_R12, dereference_kernel_function_descriptor(bpf_stf_barrier));
- EMIT(PPC_RAW_MTCTR(_R12));
- EMIT(PPC_RAW_BCTRL());
- break;
- case STF_BARRIER_NONE:
- break;
+ sync_emitted = true;
}
+ if (!bpf_jit_bypass_spec_v4()) {
+ switch (stf_barrier) {
+ case STF_BARRIER_EIEIO:
+ EMIT(PPC_RAW_EIEIO() | 0x02000000);
+ break;
+ case STF_BARRIER_SYNC_ORI:
+ if (!sync_emitted)
+ EMIT(PPC_RAW_SYNC());
+ EMIT(PPC_RAW_LD(tmp1_reg, _R13, 0));
+ EMIT(PPC_RAW_ORI(_R31, _R31, 0));
+ ori31_emitted = true;
+ break;
+ case STF_BARRIER_FALLBACK:
+ ctx->seen |= SEEN_FUNC;
+ PPC_LI64(_R12, dereference_kernel_function_descriptor(bpf_stf_barrier));
+ EMIT(PPC_RAW_MTCTR(_R12));
+ EMIT(PPC_RAW_BCTRL());
+ break;
+ case STF_BARRIER_NONE:
+ break;
+ }
+ }
+ if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) &&
+ !bpf_jit_bypass_spec_v1() &&
+ !ori31_emitted)
+ EMIT(PPC_RAW_ORI(_R31, _R31, 0));
break;
/*
diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c
index 681cf85af2b3..e42677cc254a 100644
--- a/arch/powerpc/perf/hv-24x7.c
+++ b/arch/powerpc/perf/hv-24x7.c
@@ -713,12 +713,12 @@ static ssize_t catalog_event_len_validate(struct hv_24x7_event_data *event,
ev_len = be16_to_cpu(event->length);
if (ev_len % 16)
- pr_info("event %zu has length %zu not divisible by 16: event=%pK\n",
+ pr_info("event %zu has length %zu not divisible by 16: event=%p\n",
event_idx, ev_len, event);
ev_end = (__u8 *)event + ev_len;
if (ev_end > end) {
- pr_warn("event %zu has .length=%zu, ends after buffer end: ev_end=%pK > end=%pK, offset=%zu\n",
+ pr_warn("event %zu has .length=%zu, ends after buffer end: ev_end=%p > end=%p, offset=%zu\n",
event_idx, ev_len, ev_end, end,
offset);
return -1;
@@ -726,14 +726,14 @@ static ssize_t catalog_event_len_validate(struct hv_24x7_event_data *event,
calc_ev_end = event_end(event, end);
if (!calc_ev_end) {
- pr_warn("event %zu has a calculated length which exceeds buffer length %zu: event=%pK end=%pK, offset=%zu\n",
+ pr_warn("event %zu has a calculated length which exceeds buffer length %zu: event=%p end=%p, offset=%zu\n",
event_idx, event_data_bytes, event, end,
offset);
return -1;
}
if (calc_ev_end > ev_end) {
- pr_warn("event %zu exceeds its own length: event=%pK, end=%pK, offset=%zu, calc_ev_end=%pK\n",
+ pr_warn("event %zu exceeds its own length: event=%p, end=%p, offset=%zu, calc_ev_end=%p\n",
event_idx, event, ev_end, offset, calc_ev_end);
return -1;
}
diff --git a/arch/powerpc/platforms/512x/mpc512x_lpbfifo.c b/arch/powerpc/platforms/512x/mpc512x_lpbfifo.c
index 9668b052cd4b..f251e0f68262 100644
--- a/arch/powerpc/platforms/512x/mpc512x_lpbfifo.c
+++ b/arch/powerpc/platforms/512x/mpc512x_lpbfifo.c
@@ -240,10 +240,8 @@ static int mpc512x_lpbfifo_kick(void)
dma_conf.src_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES;
/* Make DMA channel work with LPB FIFO data register */
- if (dma_dev->device_config(lpbfifo.chan, &dma_conf)) {
- ret = -EINVAL;
- goto err_dma_prep;
- }
+ if (dma_dev->device_config(lpbfifo.chan, &dma_conf))
+ return -EINVAL;
sg_init_table(&sg, 1);
diff --git a/arch/powerpc/platforms/book3s/vas-api.c b/arch/powerpc/platforms/book3s/vas-api.c
index dc6f75d3ac6e..49b15e7a8265 100644
--- a/arch/powerpc/platforms/book3s/vas-api.c
+++ b/arch/powerpc/platforms/book3s/vas-api.c
@@ -425,23 +425,22 @@ static vm_fault_t vas_mmap_fault(struct vm_fault *vmf)
return VM_FAULT_SIGBUS;
}
- mutex_lock(&txwin->task_ref.mmap_mutex);
/*
* The window may be inactive due to lost credit (Ex: core
* removal with DLPAR). If the window is active again when
* the credit is available, map the new paste address at the
* window virtual address.
*/
- if (txwin->status == VAS_WIN_ACTIVE) {
- paste_addr = cp_inst->coproc->vops->paste_addr(txwin);
- if (paste_addr) {
- fault = vmf_insert_pfn(vma, vma->vm_start,
- (paste_addr >> PAGE_SHIFT));
- mutex_unlock(&txwin->task_ref.mmap_mutex);
- return fault;
+ scoped_guard(mutex, &txwin->task_ref.mmap_mutex) {
+ if (txwin->status == VAS_WIN_ACTIVE) {
+ paste_addr = cp_inst->coproc->vops->paste_addr(txwin);
+ if (paste_addr) {
+ fault = vmf_insert_pfn(vma, vma->vm_start,
+ (paste_addr >> PAGE_SHIFT));
+ return fault;
+ }
}
}
- mutex_unlock(&txwin->task_ref.mmap_mutex);
/*
* Received this fault due to closing the actual window.
@@ -494,9 +493,8 @@ static void vas_mmap_close(struct vm_area_struct *vma)
return;
}
- mutex_lock(&txwin->task_ref.mmap_mutex);
- txwin->task_ref.vma = NULL;
- mutex_unlock(&txwin->task_ref.mmap_mutex);
+ scoped_guard(mutex, &txwin->task_ref.mmap_mutex)
+ txwin->task_ref.vma = NULL;
}
static const struct vm_operations_struct vas_vm_ops = {
@@ -552,18 +550,16 @@ static int coproc_mmap(struct file *fp, struct vm_area_struct *vma)
* close/open event and allows mmap() only when the window is
* active.
*/
- mutex_lock(&txwin->task_ref.mmap_mutex);
+ guard(mutex)(&txwin->task_ref.mmap_mutex);
if (txwin->status != VAS_WIN_ACTIVE) {
pr_err("Window is not active\n");
- rc = -EACCES;
- goto out;
+ return -EACCES;
}
paste_addr = cp_inst->coproc->vops->paste_addr(txwin);
if (!paste_addr) {
pr_err("Window paste address failed\n");
- rc = -EINVAL;
- goto out;
+ return -EINVAL;
}
pfn = paste_addr >> PAGE_SHIFT;
@@ -583,8 +579,6 @@ static int coproc_mmap(struct file *fp, struct vm_area_struct *vma)
txwin->task_ref.vma = vma;
vma->vm_ops = &vas_vm_ops;
-out:
- mutex_unlock(&txwin->task_ref.mmap_mutex);
return rc;
}
diff --git a/arch/powerpc/platforms/powernv/ocxl.c b/arch/powerpc/platforms/powernv/ocxl.c
index 64a9c7125c29..f8139948348e 100644
--- a/arch/powerpc/platforms/powernv/ocxl.c
+++ b/arch/powerpc/platforms/powernv/ocxl.c
@@ -172,12 +172,11 @@ static void pnv_ocxl_fixup_actag(struct pci_dev *dev)
if (phb->type != PNV_PHB_NPU_OCAPI)
return;
- mutex_lock(&links_list_lock);
+ guard(mutex)(&links_list_lock);
link = find_link(dev);
if (!link) {
dev_warn(&dev->dev, "couldn't update actag information\n");
- mutex_unlock(&links_list_lock);
return;
}
@@ -206,7 +205,6 @@ static void pnv_ocxl_fixup_actag(struct pci_dev *dev)
dev_dbg(&dev->dev, "total actags for function: %d\n",
link->fn_desired_actags[PCI_FUNC(dev->devfn)]);
- mutex_unlock(&links_list_lock);
}
DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pnv_ocxl_fixup_actag);
@@ -253,12 +251,11 @@ int pnv_ocxl_get_actag(struct pci_dev *dev, u16 *base, u16 *enabled,
{
struct npu_link *link;
- mutex_lock(&links_list_lock);
+ guard(mutex)(&links_list_lock);
link = find_link(dev);
if (!link) {
dev_err(&dev->dev, "actag information not found\n");
- mutex_unlock(&links_list_lock);
return -ENODEV;
}
/*
@@ -274,7 +271,6 @@ int pnv_ocxl_get_actag(struct pci_dev *dev, u16 *base, u16 *enabled,
*enabled = link->fn_actags[PCI_FUNC(dev->devfn)].count;
*supported = link->fn_desired_actags[PCI_FUNC(dev->devfn)];
- mutex_unlock(&links_list_lock);
return 0;
}
EXPORT_SYMBOL_GPL(pnv_ocxl_get_actag);
@@ -293,12 +289,11 @@ int pnv_ocxl_get_pasid_count(struct pci_dev *dev, int *count)
*
* We only support one AFU-carrying function for now.
*/
- mutex_lock(&links_list_lock);
+ guard(mutex)(&links_list_lock);
link = find_link(dev);
if (!link) {
dev_err(&dev->dev, "actag information not found\n");
- mutex_unlock(&links_list_lock);
return -ENODEV;
}
@@ -309,7 +304,6 @@ int pnv_ocxl_get_pasid_count(struct pci_dev *dev, int *count)
break;
}
- mutex_unlock(&links_list_lock);
dev_dbg(&dev->dev, "%d PASIDs available for function\n",
rc ? 0 : *count);
return rc;
diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c
index 213aa26dc8b3..979487da6522 100644
--- a/arch/powerpc/platforms/pseries/dlpar.c
+++ b/arch/powerpc/platforms/pseries/dlpar.c
@@ -404,6 +404,45 @@ get_device_node_with_drc_info(u32 index)
return NULL;
}
+static struct device_node *
+get_device_node_with_drc_indexes(u32 drc_index)
+{
+ struct device_node *np = NULL;
+ u32 nr_indexes, index;
+ int i, rc;
+
+ for_each_node_with_property(np, "ibm,drc-indexes") {
+ /*
+ * First element in the array is the total number of
+ * DRC indexes returned.
+ */
+ rc = of_property_read_u32_index(np, "ibm,drc-indexes",
+ 0, &nr_indexes);
+ if (rc)
+ goto out_put_np;
+
+ /*
+ * Retrieve DRC index from the list and return the
+ * device node if matched with the specified index.
+ */
+ for (i = 0; i < nr_indexes; i++) {
+ rc = of_property_read_u32_index(np, "ibm,drc-indexes",
+ i+1, &index);
+ if (rc)
+ goto out_put_np;
+
+ if (drc_index == index)
+ return np;
+ }
+ }
+
+ return NULL;
+
+out_put_np:
+ of_node_put(np);
+ return NULL;
+}
+
static int dlpar_hp_dt_add(u32 index)
{
struct device_node *np, *nodes;
@@ -423,10 +462,19 @@ static int dlpar_hp_dt_add(u32 index)
goto out;
}
+ /*
+ * Recent FW provides ibm,drc-info property. So search
+ * for the user specified DRC index from ibm,drc-info
+ * property. If this property is not available, search
+ * in the indexes array from ibm,drc-indexes property.
+ */
np = get_device_node_with_drc_info(index);
- if (!np)
- return -EIO;
+ if (!np) {
+ np = get_device_node_with_drc_indexes(index);
+ if (!np)
+ return -EIO;
+ }
/* Next, configure the connector. */
nodes = dlpar_configure_connector(cpu_to_be32(index), np);
diff --git a/arch/powerpc/platforms/pseries/plpks-secvar.c b/arch/powerpc/platforms/pseries/plpks-secvar.c
index 257fd1f8bc19..f9e9cc40c9d0 100644
--- a/arch/powerpc/platforms/pseries/plpks-secvar.c
+++ b/arch/powerpc/platforms/pseries/plpks-secvar.c
@@ -59,7 +59,14 @@ static u32 get_policy(const char *name)
return PLPKS_SIGNEDUPDATE;
}
-static const char * const plpks_var_names[] = {
+static const char * const plpks_var_names_static[] = {
+ "PK",
+ "moduledb",
+ "trustedcadb",
+ NULL,
+};
+
+static const char * const plpks_var_names_dynamic[] = {
"PK",
"KEK",
"db",
@@ -152,39 +159,55 @@ err:
return rc;
}
-// PLPKS dynamic secure boot doesn't give us a format string in the same way OPAL does.
-// Instead, report the format using the SB_VERSION variable in the keystore.
-// The string is made up by us, and takes the form "ibm,plpks-sb-v<n>" (or "ibm,plpks-sb-unknown"
-// if the SB_VERSION variable doesn't exist). Hypervisor defines the SB_VERSION variable as a
-// "1 byte unsigned integer value".
-static ssize_t plpks_secvar_format(char *buf, size_t bufsize)
+/*
+ * Return the key management mode.
+ *
+ * SB_VERSION is defined as a "1 byte unsigned integer value", taking values
+ * starting from 1. It is owned by the Partition Firmware and its presence
+ * indicates that the key management mode is dynamic. Any failure in
+ * reading SB_VERSION defaults the key management mode to static. The error
+ * codes -ENOENT or -EPERM are expected in static key management mode. An
+ * unexpected error code will have to be investigated. Only signed variables
+ * have null bytes in their names, SB_VERSION does not.
+ *
+ * Return 0 to indicate that the key management mode is static. Otherwise
+ * return the SB_VERSION value to indicate that the key management mode is
+ * dynamic.
+ */
+static u8 plpks_get_sb_keymgmt_mode(void)
{
- struct plpks_var var = {0};
- ssize_t ret;
- u8 version;
-
- var.component = NULL;
- // Only the signed variables have null bytes in their names, this one doesn't
- var.name = "SB_VERSION";
- var.namelen = strlen(var.name);
- var.datalen = 1;
- var.data = &version;
-
- // Unlike the other vars, SB_VERSION is owned by firmware instead of the OS
- ret = plpks_read_fw_var(&var);
- if (ret) {
- if (ret == -ENOENT) {
- ret = snprintf(buf, bufsize, "ibm,plpks-sb-unknown");
- } else {
- pr_err("Error %ld reading SB_VERSION from firmware\n", ret);
- ret = -EIO;
- }
- goto err;
+ u8 mode;
+ ssize_t rc;
+ struct plpks_var var = {
+ .component = NULL,
+ .name = "SB_VERSION",
+ .namelen = 10,
+ .datalen = 1,
+ .data = &mode,
+ };
+
+ rc = plpks_read_fw_var(&var);
+ if (rc) {
+ if (rc != -ENOENT && rc != -EPERM)
+ pr_info("Error %ld reading SB_VERSION from firmware\n", rc);
+ mode = 0;
}
+ return mode;
+}
- ret = snprintf(buf, bufsize, "ibm,plpks-sb-v%hhu", version);
-err:
- return ret;
+/*
+ * PLPKS dynamic secure boot doesn't give us a format string in the same way
+ * OPAL does. Instead, report the format using the SB_VERSION variable in the
+ * keystore. The string, made up by us, takes the form of either
+ * "ibm,plpks-sb-v<n>" or "ibm,plpks-sb-v0", based on the key management mode,
+ * and return the length of the secvar format property.
+ */
+static ssize_t plpks_secvar_format(char *buf, size_t bufsize)
+{
+ u8 mode;
+
+ mode = plpks_get_sb_keymgmt_mode();
+ return snprintf(buf, bufsize, "ibm,plpks-sb-v%hhu", mode);
}
static int plpks_max_size(u64 *max_size)
@@ -197,21 +220,34 @@ static int plpks_max_size(u64 *max_size)
return 0;
}
+static const struct secvar_operations plpks_secvar_ops_static = {
+ .get = plpks_get_variable,
+ .set = plpks_set_variable,
+ .format = plpks_secvar_format,
+ .max_size = plpks_max_size,
+ .config_attrs = config_attrs,
+ .var_names = plpks_var_names_static,
+};
-static const struct secvar_operations plpks_secvar_ops = {
+static const struct secvar_operations plpks_secvar_ops_dynamic = {
.get = plpks_get_variable,
.set = plpks_set_variable,
.format = plpks_secvar_format,
.max_size = plpks_max_size,
.config_attrs = config_attrs,
- .var_names = plpks_var_names,
+ .var_names = plpks_var_names_dynamic,
};
static int plpks_secvar_init(void)
{
+ u8 mode;
+
if (!plpks_is_available())
return -ENODEV;
- return set_secvar_ops(&plpks_secvar_ops);
+ mode = plpks_get_sb_keymgmt_mode();
+ if (mode)
+ return set_secvar_ops(&plpks_secvar_ops_dynamic);
+ return set_secvar_ops(&plpks_secvar_ops_static);
}
machine_device_initcall(pseries, plpks_secvar_init);
diff --git a/arch/powerpc/sysdev/fsl_mpic_timer_wakeup.c b/arch/powerpc/sysdev/fsl_mpic_timer_wakeup.c
index ce6c739c51e5..06d9101a5d49 100644
--- a/arch/powerpc/sysdev/fsl_mpic_timer_wakeup.c
+++ b/arch/powerpc/sysdev/fsl_mpic_timer_wakeup.c
@@ -75,7 +75,7 @@ static ssize_t fsl_timer_wakeup_store(struct device *dev,
if (kstrtoll(buf, 0, &interval))
return -EINVAL;
- mutex_lock(&sysfs_lock);
+ guard(mutex)(&sysfs_lock);
if (fsl_wakeup->timer) {
disable_irq_wake(fsl_wakeup->timer->irq);
@@ -83,31 +83,23 @@ static ssize_t fsl_timer_wakeup_store(struct device *dev,
fsl_wakeup->timer = NULL;
}
- if (!interval) {
- mutex_unlock(&sysfs_lock);
+ if (!interval)
return count;
- }
fsl_wakeup->timer = mpic_request_timer(fsl_mpic_timer_irq,
fsl_wakeup, interval);
- if (!fsl_wakeup->timer) {
- mutex_unlock(&sysfs_lock);
+ if (!fsl_wakeup->timer)
return -EINVAL;
- }
ret = enable_irq_wake(fsl_wakeup->timer->irq);
if (ret) {
mpic_free_timer(fsl_wakeup->timer);
fsl_wakeup->timer = NULL;
- mutex_unlock(&sysfs_lock);
-
return ret;
}
mpic_start_timer(fsl_wakeup->timer);
- mutex_unlock(&sysfs_lock);
-
return count;
}
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 5352932badd8..cbd6b505b2ff 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -157,7 +157,6 @@ config RISCV
select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS if (DYNAMIC_FTRACE_WITH_ARGS && !CFI_CLANG)
select HAVE_DYNAMIC_FTRACE_WITH_ARGS if HAVE_DYNAMIC_FTRACE
select HAVE_FTRACE_GRAPH_FUNC
- select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL
select HAVE_FUNCTION_GRAPH_TRACER if HAVE_DYNAMIC_FTRACE_WITH_ARGS
select HAVE_FUNCTION_GRAPH_FREGS
select HAVE_FUNCTION_TRACER if !XIP_KERNEL && HAVE_DYNAMIC_FTRACE
diff --git a/arch/riscv/include/asm/kvm_aia.h b/arch/riscv/include/asm/kvm_aia.h
index 5acce285e56e..b04ecdd1a860 100644
--- a/arch/riscv/include/asm/kvm_aia.h
+++ b/arch/riscv/include/asm/kvm_aia.h
@@ -150,7 +150,7 @@ int kvm_riscv_vcpu_aia_rmw_ireg(struct kvm_vcpu *vcpu, unsigned int csr_num,
int kvm_riscv_vcpu_aia_update(struct kvm_vcpu *vcpu);
void kvm_riscv_vcpu_aia_reset(struct kvm_vcpu *vcpu);
-int kvm_riscv_vcpu_aia_init(struct kvm_vcpu *vcpu);
+void kvm_riscv_vcpu_aia_init(struct kvm_vcpu *vcpu);
void kvm_riscv_vcpu_aia_deinit(struct kvm_vcpu *vcpu);
int kvm_riscv_aia_inject_msi_by_id(struct kvm *kvm, u32 hart_index,
diff --git a/arch/riscv/include/asm/kvm_gstage.h b/arch/riscv/include/asm/kvm_gstage.h
new file mode 100644
index 000000000000..595e2183173e
--- /dev/null
+++ b/arch/riscv/include/asm/kvm_gstage.h
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2019 Western Digital Corporation or its affiliates.
+ * Copyright (c) 2025 Ventana Micro Systems Inc.
+ */
+
+#ifndef __RISCV_KVM_GSTAGE_H_
+#define __RISCV_KVM_GSTAGE_H_
+
+#include <linux/kvm_types.h>
+
+struct kvm_gstage {
+ struct kvm *kvm;
+ unsigned long flags;
+#define KVM_GSTAGE_FLAGS_LOCAL BIT(0)
+ unsigned long vmid;
+ pgd_t *pgd;
+};
+
+struct kvm_gstage_mapping {
+ gpa_t addr;
+ pte_t pte;
+ u32 level;
+};
+
+#ifdef CONFIG_64BIT
+#define kvm_riscv_gstage_index_bits 9
+#else
+#define kvm_riscv_gstage_index_bits 10
+#endif
+
+extern unsigned long kvm_riscv_gstage_mode;
+extern unsigned long kvm_riscv_gstage_pgd_levels;
+
+#define kvm_riscv_gstage_pgd_xbits 2
+#define kvm_riscv_gstage_pgd_size (1UL << (HGATP_PAGE_SHIFT + kvm_riscv_gstage_pgd_xbits))
+#define kvm_riscv_gstage_gpa_bits (HGATP_PAGE_SHIFT + \
+ (kvm_riscv_gstage_pgd_levels * \
+ kvm_riscv_gstage_index_bits) + \
+ kvm_riscv_gstage_pgd_xbits)
+#define kvm_riscv_gstage_gpa_size ((gpa_t)(1ULL << kvm_riscv_gstage_gpa_bits))
+
+bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr,
+ pte_t **ptepp, u32 *ptep_level);
+
+int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage,
+ struct kvm_mmu_memory_cache *pcache,
+ const struct kvm_gstage_mapping *map);
+
+int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage,
+ struct kvm_mmu_memory_cache *pcache,
+ gpa_t gpa, phys_addr_t hpa, unsigned long page_size,
+ bool page_rdonly, bool page_exec,
+ struct kvm_gstage_mapping *out_map);
+
+enum kvm_riscv_gstage_op {
+ GSTAGE_OP_NOP = 0, /* Nothing */
+ GSTAGE_OP_CLEAR, /* Clear/Unmap */
+ GSTAGE_OP_WP, /* Write-protect */
+};
+
+void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
+ pte_t *ptep, u32 ptep_level, enum kvm_riscv_gstage_op op);
+
+void kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage,
+ gpa_t start, gpa_t size, bool may_block);
+
+void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end);
+
+void kvm_riscv_gstage_mode_detect(void);
+
+#endif
diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
index bcbf8b1ec115..d71d3299a335 100644
--- a/arch/riscv/include/asm/kvm_host.h
+++ b/arch/riscv/include/asm/kvm_host.h
@@ -16,6 +16,8 @@
#include <asm/hwcap.h>
#include <asm/kvm_aia.h>
#include <asm/ptrace.h>
+#include <asm/kvm_tlb.h>
+#include <asm/kvm_vmid.h>
#include <asm/kvm_vcpu_fp.h>
#include <asm/kvm_vcpu_insn.h>
#include <asm/kvm_vcpu_sbi.h>
@@ -36,14 +38,16 @@
#define KVM_REQ_UPDATE_HGATP KVM_ARCH_REQ(2)
#define KVM_REQ_FENCE_I \
KVM_ARCH_REQ_FLAGS(3, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
-#define KVM_REQ_HFENCE_GVMA_VMID_ALL KVM_REQ_TLB_FLUSH
#define KVM_REQ_HFENCE_VVMA_ALL \
KVM_ARCH_REQ_FLAGS(4, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_HFENCE \
KVM_ARCH_REQ_FLAGS(5, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_STEAL_UPDATE KVM_ARCH_REQ(6)
+#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE
+
#define KVM_HEDELEG_DEFAULT (BIT(EXC_INST_MISALIGNED) | \
+ BIT(EXC_INST_ILLEGAL) | \
BIT(EXC_BREAKPOINT) | \
BIT(EXC_SYSCALL) | \
BIT(EXC_INST_PAGE_FAULT) | \
@@ -54,24 +58,6 @@
BIT(IRQ_VS_TIMER) | \
BIT(IRQ_VS_EXT))
-enum kvm_riscv_hfence_type {
- KVM_RISCV_HFENCE_UNKNOWN = 0,
- KVM_RISCV_HFENCE_GVMA_VMID_GPA,
- KVM_RISCV_HFENCE_VVMA_ASID_GVA,
- KVM_RISCV_HFENCE_VVMA_ASID_ALL,
- KVM_RISCV_HFENCE_VVMA_GVA,
-};
-
-struct kvm_riscv_hfence {
- enum kvm_riscv_hfence_type type;
- unsigned long asid;
- unsigned long order;
- gpa_t addr;
- gpa_t size;
-};
-
-#define KVM_RISCV_VCPU_MAX_HFENCE 64
-
struct kvm_vm_stat {
struct kvm_vm_stat_generic generic;
};
@@ -97,15 +83,6 @@ struct kvm_vcpu_stat {
struct kvm_arch_memory_slot {
};
-struct kvm_vmid {
- /*
- * Writes to vmid_version and vmid happen with vmid_lock held
- * whereas reads happen without any lock held.
- */
- unsigned long vmid_version;
- unsigned long vmid;
-};
-
struct kvm_arch {
/* G-stage vmid */
struct kvm_vmid vmid;
@@ -309,77 +286,6 @@ static inline bool kvm_arch_pmi_in_guest(struct kvm_vcpu *vcpu)
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
-#define KVM_RISCV_GSTAGE_TLB_MIN_ORDER 12
-
-void kvm_riscv_local_hfence_gvma_vmid_gpa(unsigned long vmid,
- gpa_t gpa, gpa_t gpsz,
- unsigned long order);
-void kvm_riscv_local_hfence_gvma_vmid_all(unsigned long vmid);
-void kvm_riscv_local_hfence_gvma_gpa(gpa_t gpa, gpa_t gpsz,
- unsigned long order);
-void kvm_riscv_local_hfence_gvma_all(void);
-void kvm_riscv_local_hfence_vvma_asid_gva(unsigned long vmid,
- unsigned long asid,
- unsigned long gva,
- unsigned long gvsz,
- unsigned long order);
-void kvm_riscv_local_hfence_vvma_asid_all(unsigned long vmid,
- unsigned long asid);
-void kvm_riscv_local_hfence_vvma_gva(unsigned long vmid,
- unsigned long gva, unsigned long gvsz,
- unsigned long order);
-void kvm_riscv_local_hfence_vvma_all(unsigned long vmid);
-
-void kvm_riscv_local_tlb_sanitize(struct kvm_vcpu *vcpu);
-
-void kvm_riscv_fence_i_process(struct kvm_vcpu *vcpu);
-void kvm_riscv_hfence_gvma_vmid_all_process(struct kvm_vcpu *vcpu);
-void kvm_riscv_hfence_vvma_all_process(struct kvm_vcpu *vcpu);
-void kvm_riscv_hfence_process(struct kvm_vcpu *vcpu);
-
-void kvm_riscv_fence_i(struct kvm *kvm,
- unsigned long hbase, unsigned long hmask);
-void kvm_riscv_hfence_gvma_vmid_gpa(struct kvm *kvm,
- unsigned long hbase, unsigned long hmask,
- gpa_t gpa, gpa_t gpsz,
- unsigned long order);
-void kvm_riscv_hfence_gvma_vmid_all(struct kvm *kvm,
- unsigned long hbase, unsigned long hmask);
-void kvm_riscv_hfence_vvma_asid_gva(struct kvm *kvm,
- unsigned long hbase, unsigned long hmask,
- unsigned long gva, unsigned long gvsz,
- unsigned long order, unsigned long asid);
-void kvm_riscv_hfence_vvma_asid_all(struct kvm *kvm,
- unsigned long hbase, unsigned long hmask,
- unsigned long asid);
-void kvm_riscv_hfence_vvma_gva(struct kvm *kvm,
- unsigned long hbase, unsigned long hmask,
- unsigned long gva, unsigned long gvsz,
- unsigned long order);
-void kvm_riscv_hfence_vvma_all(struct kvm *kvm,
- unsigned long hbase, unsigned long hmask);
-
-int kvm_riscv_gstage_ioremap(struct kvm *kvm, gpa_t gpa,
- phys_addr_t hpa, unsigned long size,
- bool writable, bool in_atomic);
-void kvm_riscv_gstage_iounmap(struct kvm *kvm, gpa_t gpa,
- unsigned long size);
-int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu,
- struct kvm_memory_slot *memslot,
- gpa_t gpa, unsigned long hva, bool is_write);
-int kvm_riscv_gstage_alloc_pgd(struct kvm *kvm);
-void kvm_riscv_gstage_free_pgd(struct kvm *kvm);
-void kvm_riscv_gstage_update_hgatp(struct kvm_vcpu *vcpu);
-void __init kvm_riscv_gstage_mode_detect(void);
-unsigned long __init kvm_riscv_gstage_mode(void);
-int kvm_riscv_gstage_gpa_bits(void);
-
-void __init kvm_riscv_gstage_vmid_detect(void);
-unsigned long kvm_riscv_gstage_vmid_bits(void);
-int kvm_riscv_gstage_vmid_init(struct kvm *kvm);
-bool kvm_riscv_gstage_vmid_ver_changed(struct kvm_vmid *vmid);
-void kvm_riscv_gstage_vmid_update(struct kvm_vcpu *vcpu);
-
int kvm_riscv_setup_default_irq_routing(struct kvm *kvm, u32 lines);
void __kvm_riscv_unpriv_trap(void);
@@ -415,7 +321,6 @@ void __kvm_riscv_vcpu_power_on(struct kvm_vcpu *vcpu);
void kvm_riscv_vcpu_power_on(struct kvm_vcpu *vcpu);
bool kvm_riscv_vcpu_stopped(struct kvm_vcpu *vcpu);
-void kvm_riscv_vcpu_sbi_sta_reset(struct kvm_vcpu *vcpu);
void kvm_riscv_vcpu_record_steal_time(struct kvm_vcpu *vcpu);
#endif /* __RISCV_KVM_HOST_H__ */
diff --git a/arch/riscv/include/asm/kvm_mmu.h b/arch/riscv/include/asm/kvm_mmu.h
new file mode 100644
index 000000000000..5439e76f0a96
--- /dev/null
+++ b/arch/riscv/include/asm/kvm_mmu.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2025 Ventana Micro Systems Inc.
+ */
+
+#ifndef __RISCV_KVM_MMU_H_
+#define __RISCV_KVM_MMU_H_
+
+#include <asm/kvm_gstage.h>
+
+int kvm_riscv_mmu_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
+ unsigned long size, bool writable, bool in_atomic);
+void kvm_riscv_mmu_iounmap(struct kvm *kvm, gpa_t gpa, unsigned long size);
+int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
+ gpa_t gpa, unsigned long hva, bool is_write,
+ struct kvm_gstage_mapping *out_map);
+int kvm_riscv_mmu_alloc_pgd(struct kvm *kvm);
+void kvm_riscv_mmu_free_pgd(struct kvm *kvm);
+void kvm_riscv_mmu_update_hgatp(struct kvm_vcpu *vcpu);
+
+#endif
diff --git a/arch/riscv/include/asm/kvm_tlb.h b/arch/riscv/include/asm/kvm_tlb.h
new file mode 100644
index 000000000000..38a2f933ad3a
--- /dev/null
+++ b/arch/riscv/include/asm/kvm_tlb.h
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2025 Ventana Micro Systems Inc.
+ */
+
+#ifndef __RISCV_KVM_TLB_H_
+#define __RISCV_KVM_TLB_H_
+
+#include <linux/kvm_types.h>
+
+enum kvm_riscv_hfence_type {
+ KVM_RISCV_HFENCE_UNKNOWN = 0,
+ KVM_RISCV_HFENCE_GVMA_VMID_GPA,
+ KVM_RISCV_HFENCE_GVMA_VMID_ALL,
+ KVM_RISCV_HFENCE_VVMA_ASID_GVA,
+ KVM_RISCV_HFENCE_VVMA_ASID_ALL,
+ KVM_RISCV_HFENCE_VVMA_GVA,
+ KVM_RISCV_HFENCE_VVMA_ALL
+};
+
+struct kvm_riscv_hfence {
+ enum kvm_riscv_hfence_type type;
+ unsigned long asid;
+ unsigned long vmid;
+ unsigned long order;
+ gpa_t addr;
+ gpa_t size;
+};
+
+#define KVM_RISCV_VCPU_MAX_HFENCE 64
+
+#define KVM_RISCV_GSTAGE_TLB_MIN_ORDER 12
+
+void kvm_riscv_local_hfence_gvma_vmid_gpa(unsigned long vmid,
+ gpa_t gpa, gpa_t gpsz,
+ unsigned long order);
+void kvm_riscv_local_hfence_gvma_vmid_all(unsigned long vmid);
+void kvm_riscv_local_hfence_gvma_gpa(gpa_t gpa, gpa_t gpsz,
+ unsigned long order);
+void kvm_riscv_local_hfence_gvma_all(void);
+void kvm_riscv_local_hfence_vvma_asid_gva(unsigned long vmid,
+ unsigned long asid,
+ unsigned long gva,
+ unsigned long gvsz,
+ unsigned long order);
+void kvm_riscv_local_hfence_vvma_asid_all(unsigned long vmid,
+ unsigned long asid);
+void kvm_riscv_local_hfence_vvma_gva(unsigned long vmid,
+ unsigned long gva, unsigned long gvsz,
+ unsigned long order);
+void kvm_riscv_local_hfence_vvma_all(unsigned long vmid);
+
+void kvm_riscv_tlb_flush_process(struct kvm_vcpu *vcpu);
+
+void kvm_riscv_fence_i_process(struct kvm_vcpu *vcpu);
+void kvm_riscv_hfence_vvma_all_process(struct kvm_vcpu *vcpu);
+void kvm_riscv_hfence_process(struct kvm_vcpu *vcpu);
+
+void kvm_riscv_fence_i(struct kvm *kvm,
+ unsigned long hbase, unsigned long hmask);
+void kvm_riscv_hfence_gvma_vmid_gpa(struct kvm *kvm,
+ unsigned long hbase, unsigned long hmask,
+ gpa_t gpa, gpa_t gpsz,
+ unsigned long order, unsigned long vmid);
+void kvm_riscv_hfence_gvma_vmid_all(struct kvm *kvm,
+ unsigned long hbase, unsigned long hmask,
+ unsigned long vmid);
+void kvm_riscv_hfence_vvma_asid_gva(struct kvm *kvm,
+ unsigned long hbase, unsigned long hmask,
+ unsigned long gva, unsigned long gvsz,
+ unsigned long order, unsigned long asid,
+ unsigned long vmid);
+void kvm_riscv_hfence_vvma_asid_all(struct kvm *kvm,
+ unsigned long hbase, unsigned long hmask,
+ unsigned long asid, unsigned long vmid);
+void kvm_riscv_hfence_vvma_gva(struct kvm *kvm,
+ unsigned long hbase, unsigned long hmask,
+ unsigned long gva, unsigned long gvsz,
+ unsigned long order, unsigned long vmid);
+void kvm_riscv_hfence_vvma_all(struct kvm *kvm,
+ unsigned long hbase, unsigned long hmask,
+ unsigned long vmid);
+
+#endif
diff --git a/arch/riscv/include/asm/kvm_vcpu_sbi.h b/arch/riscv/include/asm/kvm_vcpu_sbi.h
index 439ab2b3534f..d678fd7e5973 100644
--- a/arch/riscv/include/asm/kvm_vcpu_sbi.h
+++ b/arch/riscv/include/asm/kvm_vcpu_sbi.h
@@ -49,6 +49,16 @@ struct kvm_vcpu_sbi_extension {
/* Extension specific probe function */
unsigned long (*probe)(struct kvm_vcpu *vcpu);
+
+ /*
+ * Init/deinit function called once during VCPU init/destroy. These
+ * might be use if the SBI extensions need to allocate or do specific
+ * init time only configuration.
+ */
+ int (*init)(struct kvm_vcpu *vcpu);
+ void (*deinit)(struct kvm_vcpu *vcpu);
+
+ void (*reset)(struct kvm_vcpu *vcpu);
};
void kvm_riscv_vcpu_sbi_forward(struct kvm_vcpu *vcpu, struct kvm_run *run);
@@ -72,6 +82,8 @@ const struct kvm_vcpu_sbi_extension *kvm_vcpu_sbi_find_ext(
bool riscv_vcpu_supports_sbi_ext(struct kvm_vcpu *vcpu, int idx);
int kvm_riscv_vcpu_sbi_ecall(struct kvm_vcpu *vcpu, struct kvm_run *run);
void kvm_riscv_vcpu_sbi_init(struct kvm_vcpu *vcpu);
+void kvm_riscv_vcpu_sbi_deinit(struct kvm_vcpu *vcpu);
+void kvm_riscv_vcpu_sbi_reset(struct kvm_vcpu *vcpu);
int kvm_riscv_vcpu_get_reg_sbi_sta(struct kvm_vcpu *vcpu, unsigned long reg_num,
unsigned long *reg_val);
diff --git a/arch/riscv/include/asm/kvm_vmid.h b/arch/riscv/include/asm/kvm_vmid.h
new file mode 100644
index 000000000000..ab98e1434fb7
--- /dev/null
+++ b/arch/riscv/include/asm/kvm_vmid.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2025 Ventana Micro Systems Inc.
+ */
+
+#ifndef __RISCV_KVM_VMID_H_
+#define __RISCV_KVM_VMID_H_
+
+#include <linux/kvm_types.h>
+
+struct kvm_vmid {
+ /*
+ * Writes to vmid_version and vmid happen with vmid_lock held
+ * whereas reads happen without any lock held.
+ */
+ unsigned long vmid_version;
+ unsigned long vmid;
+};
+
+void __init kvm_riscv_gstage_vmid_detect(void);
+unsigned long kvm_riscv_gstage_vmid_bits(void);
+int kvm_riscv_gstage_vmid_init(struct kvm *kvm);
+bool kvm_riscv_gstage_vmid_ver_changed(struct kvm_vmid *vmid);
+void kvm_riscv_gstage_vmid_update(struct kvm_vcpu *vcpu);
+void kvm_riscv_gstage_vmid_sanitize(struct kvm_vcpu *vcpu);
+
+#endif
diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h
index 5f59fd226cc5..ef27d4289da1 100644
--- a/arch/riscv/include/uapi/asm/kvm.h
+++ b/arch/riscv/include/uapi/asm/kvm.h
@@ -18,6 +18,7 @@
#define __KVM_HAVE_IRQ_LINE
#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
+#define KVM_DIRTY_LOG_PAGE_OFFSET 64
#define KVM_INTERRUPT_SET -1U
#define KVM_INTERRUPT_UNSET -2U
diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig
index 704c2899197e..5a62091b0809 100644
--- a/arch/riscv/kvm/Kconfig
+++ b/arch/riscv/kvm/Kconfig
@@ -25,6 +25,7 @@ config KVM
select HAVE_KVM_MSI
select HAVE_KVM_VCPU_ASYNC_IOCTL
select HAVE_KVM_READONLY_MEM
+ select HAVE_KVM_DIRTY_RING_ACQ_REL
select KVM_COMMON
select KVM_GENERIC_DIRTYLOG_READ_PROTECT
select KVM_GENERIC_HARDWARE_ENABLING
diff --git a/arch/riscv/kvm/Makefile b/arch/riscv/kvm/Makefile
index 4e0bba91d284..4b199dc3e58b 100644
--- a/arch/riscv/kvm/Makefile
+++ b/arch/riscv/kvm/Makefile
@@ -14,6 +14,7 @@ kvm-y += aia.o
kvm-y += aia_aplic.o
kvm-y += aia_device.o
kvm-y += aia_imsic.o
+kvm-y += gstage.o
kvm-y += main.o
kvm-y += mmu.o
kvm-y += nacl.o
diff --git a/arch/riscv/kvm/aia_device.c b/arch/riscv/kvm/aia_device.c
index 806c41931cde..b195a93add1c 100644
--- a/arch/riscv/kvm/aia_device.c
+++ b/arch/riscv/kvm/aia_device.c
@@ -509,12 +509,12 @@ void kvm_riscv_vcpu_aia_reset(struct kvm_vcpu *vcpu)
kvm_riscv_vcpu_aia_imsic_reset(vcpu);
}
-int kvm_riscv_vcpu_aia_init(struct kvm_vcpu *vcpu)
+void kvm_riscv_vcpu_aia_init(struct kvm_vcpu *vcpu)
{
struct kvm_vcpu_aia *vaia = &vcpu->arch.aia_context;
if (!kvm_riscv_aia_available())
- return 0;
+ return;
/*
* We don't do any memory allocations over here because these
@@ -526,8 +526,6 @@ int kvm_riscv_vcpu_aia_init(struct kvm_vcpu *vcpu)
/* Initialize default values in AIA vcpu context */
vaia->imsic_addr = KVM_RISCV_AIA_UNDEF_ADDR;
vaia->hart_index = vcpu->vcpu_idx;
-
- return 0;
}
void kvm_riscv_vcpu_aia_deinit(struct kvm_vcpu *vcpu)
diff --git a/arch/riscv/kvm/aia_imsic.c b/arch/riscv/kvm/aia_imsic.c
index 2ff865943ebb..fda0346f0ea1 100644
--- a/arch/riscv/kvm/aia_imsic.c
+++ b/arch/riscv/kvm/aia_imsic.c
@@ -16,6 +16,7 @@
#include <linux/swab.h>
#include <kvm/iodev.h>
#include <asm/csr.h>
+#include <asm/kvm_mmu.h>
#define IMSIC_MAX_EIX (IMSIC_MAX_ID / BITS_PER_TYPE(u64))
@@ -745,9 +746,8 @@ void kvm_riscv_vcpu_aia_imsic_release(struct kvm_vcpu *vcpu)
*/
/* Purge the G-stage mapping */
- kvm_riscv_gstage_iounmap(vcpu->kvm,
- vcpu->arch.aia_context.imsic_addr,
- IMSIC_MMIO_PAGE_SZ);
+ kvm_riscv_mmu_iounmap(vcpu->kvm, vcpu->arch.aia_context.imsic_addr,
+ IMSIC_MMIO_PAGE_SZ);
/* TODO: Purge the IOMMU mapping ??? */
@@ -830,9 +830,9 @@ int kvm_riscv_vcpu_aia_imsic_update(struct kvm_vcpu *vcpu)
imsic_vsfile_local_clear(new_vsfile_hgei, imsic->nr_hw_eix);
/* Update G-stage mapping for the new IMSIC VS-file */
- ret = kvm_riscv_gstage_ioremap(kvm, vcpu->arch.aia_context.imsic_addr,
- new_vsfile_pa, IMSIC_MMIO_PAGE_SZ,
- true, true);
+ ret = kvm_riscv_mmu_ioremap(kvm, vcpu->arch.aia_context.imsic_addr,
+ new_vsfile_pa, IMSIC_MMIO_PAGE_SZ,
+ true, true);
if (ret)
goto fail_free_vsfile_hgei;
diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c
new file mode 100644
index 000000000000..24c270d6d0e2
--- /dev/null
+++ b/arch/riscv/kvm/gstage.c
@@ -0,0 +1,338 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Western Digital Corporation or its affiliates.
+ * Copyright (c) 2025 Ventana Micro Systems Inc.
+ */
+
+#include <linux/bitops.h>
+#include <linux/errno.h>
+#include <linux/kvm_host.h>
+#include <linux/module.h>
+#include <linux/pgtable.h>
+#include <asm/kvm_gstage.h>
+
+#ifdef CONFIG_64BIT
+unsigned long kvm_riscv_gstage_mode __ro_after_init = HGATP_MODE_SV39X4;
+unsigned long kvm_riscv_gstage_pgd_levels __ro_after_init = 3;
+#else
+unsigned long kvm_riscv_gstage_mode __ro_after_init = HGATP_MODE_SV32X4;
+unsigned long kvm_riscv_gstage_pgd_levels __ro_after_init = 2;
+#endif
+
+#define gstage_pte_leaf(__ptep) \
+ (pte_val(*(__ptep)) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC))
+
+static inline unsigned long gstage_pte_index(gpa_t addr, u32 level)
+{
+ unsigned long mask;
+ unsigned long shift = HGATP_PAGE_SHIFT + (kvm_riscv_gstage_index_bits * level);
+
+ if (level == (kvm_riscv_gstage_pgd_levels - 1))
+ mask = (PTRS_PER_PTE * (1UL << kvm_riscv_gstage_pgd_xbits)) - 1;
+ else
+ mask = PTRS_PER_PTE - 1;
+
+ return (addr >> shift) & mask;
+}
+
+static inline unsigned long gstage_pte_page_vaddr(pte_t pte)
+{
+ return (unsigned long)pfn_to_virt(__page_val_to_pfn(pte_val(pte)));
+}
+
+static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level)
+{
+ u32 i;
+ unsigned long psz = 1UL << 12;
+
+ for (i = 0; i < kvm_riscv_gstage_pgd_levels; i++) {
+ if (page_size == (psz << (i * kvm_riscv_gstage_index_bits))) {
+ *out_level = i;
+ return 0;
+ }
+ }
+
+ return -EINVAL;
+}
+
+static int gstage_level_to_page_order(u32 level, unsigned long *out_pgorder)
+{
+ if (kvm_riscv_gstage_pgd_levels < level)
+ return -EINVAL;
+
+ *out_pgorder = 12 + (level * kvm_riscv_gstage_index_bits);
+ return 0;
+}
+
+static int gstage_level_to_page_size(u32 level, unsigned long *out_pgsize)
+{
+ int rc;
+ unsigned long page_order = PAGE_SHIFT;
+
+ rc = gstage_level_to_page_order(level, &page_order);
+ if (rc)
+ return rc;
+
+ *out_pgsize = BIT(page_order);
+ return 0;
+}
+
+bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr,
+ pte_t **ptepp, u32 *ptep_level)
+{
+ pte_t *ptep;
+ u32 current_level = kvm_riscv_gstage_pgd_levels - 1;
+
+ *ptep_level = current_level;
+ ptep = (pte_t *)gstage->pgd;
+ ptep = &ptep[gstage_pte_index(addr, current_level)];
+ while (ptep && pte_val(ptep_get(ptep))) {
+ if (gstage_pte_leaf(ptep)) {
+ *ptep_level = current_level;
+ *ptepp = ptep;
+ return true;
+ }
+
+ if (current_level) {
+ current_level--;
+ *ptep_level = current_level;
+ ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
+ ptep = &ptep[gstage_pte_index(addr, current_level)];
+ } else {
+ ptep = NULL;
+ }
+ }
+
+ return false;
+}
+
+static void gstage_tlb_flush(struct kvm_gstage *gstage, u32 level, gpa_t addr)
+{
+ unsigned long order = PAGE_SHIFT;
+
+ if (gstage_level_to_page_order(level, &order))
+ return;
+ addr &= ~(BIT(order) - 1);
+
+ if (gstage->flags & KVM_GSTAGE_FLAGS_LOCAL)
+ kvm_riscv_local_hfence_gvma_vmid_gpa(gstage->vmid, addr, BIT(order), order);
+ else
+ kvm_riscv_hfence_gvma_vmid_gpa(gstage->kvm, -1UL, 0, addr, BIT(order), order,
+ gstage->vmid);
+}
+
+int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage,
+ struct kvm_mmu_memory_cache *pcache,
+ const struct kvm_gstage_mapping *map)
+{
+ u32 current_level = kvm_riscv_gstage_pgd_levels - 1;
+ pte_t *next_ptep = (pte_t *)gstage->pgd;
+ pte_t *ptep = &next_ptep[gstage_pte_index(map->addr, current_level)];
+
+ if (current_level < map->level)
+ return -EINVAL;
+
+ while (current_level != map->level) {
+ if (gstage_pte_leaf(ptep))
+ return -EEXIST;
+
+ if (!pte_val(ptep_get(ptep))) {
+ if (!pcache)
+ return -ENOMEM;
+ next_ptep = kvm_mmu_memory_cache_alloc(pcache);
+ if (!next_ptep)
+ return -ENOMEM;
+ set_pte(ptep, pfn_pte(PFN_DOWN(__pa(next_ptep)),
+ __pgprot(_PAGE_TABLE)));
+ } else {
+ if (gstage_pte_leaf(ptep))
+ return -EEXIST;
+ next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
+ }
+
+ current_level--;
+ ptep = &next_ptep[gstage_pte_index(map->addr, current_level)];
+ }
+
+ if (pte_val(*ptep) != pte_val(map->pte)) {
+ set_pte(ptep, map->pte);
+ if (gstage_pte_leaf(ptep))
+ gstage_tlb_flush(gstage, current_level, map->addr);
+ }
+
+ return 0;
+}
+
+int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage,
+ struct kvm_mmu_memory_cache *pcache,
+ gpa_t gpa, phys_addr_t hpa, unsigned long page_size,
+ bool page_rdonly, bool page_exec,
+ struct kvm_gstage_mapping *out_map)
+{
+ pgprot_t prot;
+ int ret;
+
+ out_map->addr = gpa;
+ out_map->level = 0;
+
+ ret = gstage_page_size_to_level(page_size, &out_map->level);
+ if (ret)
+ return ret;
+
+ /*
+ * A RISC-V implementation can choose to either:
+ * 1) Update 'A' and 'D' PTE bits in hardware
+ * 2) Generate page fault when 'A' and/or 'D' bits are not set
+ * PTE so that software can update these bits.
+ *
+ * We support both options mentioned above. To achieve this, we
+ * always set 'A' and 'D' PTE bits at time of creating G-stage
+ * mapping. To support KVM dirty page logging with both options
+ * mentioned above, we will write-protect G-stage PTEs to track
+ * dirty pages.
+ */
+
+ if (page_exec) {
+ if (page_rdonly)
+ prot = PAGE_READ_EXEC;
+ else
+ prot = PAGE_WRITE_EXEC;
+ } else {
+ if (page_rdonly)
+ prot = PAGE_READ;
+ else
+ prot = PAGE_WRITE;
+ }
+ out_map->pte = pfn_pte(PFN_DOWN(hpa), prot);
+ out_map->pte = pte_mkdirty(out_map->pte);
+
+ return kvm_riscv_gstage_set_pte(gstage, pcache, out_map);
+}
+
+void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
+ pte_t *ptep, u32 ptep_level, enum kvm_riscv_gstage_op op)
+{
+ int i, ret;
+ pte_t old_pte, *next_ptep;
+ u32 next_ptep_level;
+ unsigned long next_page_size, page_size;
+
+ ret = gstage_level_to_page_size(ptep_level, &page_size);
+ if (ret)
+ return;
+
+ WARN_ON(addr & (page_size - 1));
+
+ if (!pte_val(ptep_get(ptep)))
+ return;
+
+ if (ptep_level && !gstage_pte_leaf(ptep)) {
+ next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
+ next_ptep_level = ptep_level - 1;
+ ret = gstage_level_to_page_size(next_ptep_level, &next_page_size);
+ if (ret)
+ return;
+
+ if (op == GSTAGE_OP_CLEAR)
+ set_pte(ptep, __pte(0));
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ kvm_riscv_gstage_op_pte(gstage, addr + i * next_page_size,
+ &next_ptep[i], next_ptep_level, op);
+ if (op == GSTAGE_OP_CLEAR)
+ put_page(virt_to_page(next_ptep));
+ } else {
+ old_pte = *ptep;
+ if (op == GSTAGE_OP_CLEAR)
+ set_pte(ptep, __pte(0));
+ else if (op == GSTAGE_OP_WP)
+ set_pte(ptep, __pte(pte_val(ptep_get(ptep)) & ~_PAGE_WRITE));
+ if (pte_val(*ptep) != pte_val(old_pte))
+ gstage_tlb_flush(gstage, ptep_level, addr);
+ }
+}
+
+void kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage,
+ gpa_t start, gpa_t size, bool may_block)
+{
+ int ret;
+ pte_t *ptep;
+ u32 ptep_level;
+ bool found_leaf;
+ unsigned long page_size;
+ gpa_t addr = start, end = start + size;
+
+ while (addr < end) {
+ found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level);
+ ret = gstage_level_to_page_size(ptep_level, &page_size);
+ if (ret)
+ break;
+
+ if (!found_leaf)
+ goto next;
+
+ if (!(addr & (page_size - 1)) && ((end - addr) >= page_size))
+ kvm_riscv_gstage_op_pte(gstage, addr, ptep,
+ ptep_level, GSTAGE_OP_CLEAR);
+
+next:
+ addr += page_size;
+
+ /*
+ * If the range is too large, release the kvm->mmu_lock
+ * to prevent starvation and lockup detector warnings.
+ */
+ if (!(gstage->flags & KVM_GSTAGE_FLAGS_LOCAL) && may_block && addr < end)
+ cond_resched_lock(&gstage->kvm->mmu_lock);
+ }
+}
+
+void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end)
+{
+ int ret;
+ pte_t *ptep;
+ u32 ptep_level;
+ bool found_leaf;
+ gpa_t addr = start;
+ unsigned long page_size;
+
+ while (addr < end) {
+ found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level);
+ ret = gstage_level_to_page_size(ptep_level, &page_size);
+ if (ret)
+ break;
+
+ if (!found_leaf)
+ goto next;
+
+ if (!(addr & (page_size - 1)) && ((end - addr) >= page_size))
+ kvm_riscv_gstage_op_pte(gstage, addr, ptep,
+ ptep_level, GSTAGE_OP_WP);
+
+next:
+ addr += page_size;
+ }
+}
+
+void __init kvm_riscv_gstage_mode_detect(void)
+{
+#ifdef CONFIG_64BIT
+ /* Try Sv57x4 G-stage mode */
+ csr_write(CSR_HGATP, HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT);
+ if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV57X4) {
+ kvm_riscv_gstage_mode = HGATP_MODE_SV57X4;
+ kvm_riscv_gstage_pgd_levels = 5;
+ goto skip_sv48x4_test;
+ }
+
+ /* Try Sv48x4 G-stage mode */
+ csr_write(CSR_HGATP, HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT);
+ if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV48X4) {
+ kvm_riscv_gstage_mode = HGATP_MODE_SV48X4;
+ kvm_riscv_gstage_pgd_levels = 4;
+ }
+skip_sv48x4_test:
+
+ csr_write(CSR_HGATP, 0);
+ kvm_riscv_local_hfence_gvma_all();
+#endif
+}
diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c
index 4b24705dc63a..67c876de74ef 100644
--- a/arch/riscv/kvm/main.c
+++ b/arch/riscv/kvm/main.c
@@ -11,6 +11,7 @@
#include <linux/module.h>
#include <linux/kvm_host.h>
#include <asm/cpufeature.h>
+#include <asm/kvm_mmu.h>
#include <asm/kvm_nacl.h>
#include <asm/sbi.h>
@@ -134,7 +135,7 @@ static int __init riscv_kvm_init(void)
(rc) ? slist : "no features");
}
- switch (kvm_riscv_gstage_mode()) {
+ switch (kvm_riscv_gstage_mode) {
case HGATP_MODE_SV32X4:
str = "Sv32x4";
break;
diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
index 1087ea74567b..a1c3b2ec1dde 100644
--- a/arch/riscv/kvm/mmu.c
+++ b/arch/riscv/kvm/mmu.c
@@ -6,348 +6,38 @@
* Anup Patel <anup.patel@wdc.com>
*/
-#include <linux/bitops.h>
#include <linux/errno.h>
-#include <linux/err.h>
#include <linux/hugetlb.h>
#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/kvm_host.h>
#include <linux/sched/signal.h>
+#include <asm/kvm_mmu.h>
#include <asm/kvm_nacl.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-
-#ifdef CONFIG_64BIT
-static unsigned long gstage_mode __ro_after_init = (HGATP_MODE_SV39X4 << HGATP_MODE_SHIFT);
-static unsigned long gstage_pgd_levels __ro_after_init = 3;
-#define gstage_index_bits 9
-#else
-static unsigned long gstage_mode __ro_after_init = (HGATP_MODE_SV32X4 << HGATP_MODE_SHIFT);
-static unsigned long gstage_pgd_levels __ro_after_init = 2;
-#define gstage_index_bits 10
-#endif
-
-#define gstage_pgd_xbits 2
-#define gstage_pgd_size (1UL << (HGATP_PAGE_SHIFT + gstage_pgd_xbits))
-#define gstage_gpa_bits (HGATP_PAGE_SHIFT + \
- (gstage_pgd_levels * gstage_index_bits) + \
- gstage_pgd_xbits)
-#define gstage_gpa_size ((gpa_t)(1ULL << gstage_gpa_bits))
-
-#define gstage_pte_leaf(__ptep) \
- (pte_val(*(__ptep)) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC))
-
-static inline unsigned long gstage_pte_index(gpa_t addr, u32 level)
-{
- unsigned long mask;
- unsigned long shift = HGATP_PAGE_SHIFT + (gstage_index_bits * level);
-
- if (level == (gstage_pgd_levels - 1))
- mask = (PTRS_PER_PTE * (1UL << gstage_pgd_xbits)) - 1;
- else
- mask = PTRS_PER_PTE - 1;
-
- return (addr >> shift) & mask;
-}
-
-static inline unsigned long gstage_pte_page_vaddr(pte_t pte)
-{
- return (unsigned long)pfn_to_virt(__page_val_to_pfn(pte_val(pte)));
-}
-
-static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level)
-{
- u32 i;
- unsigned long psz = 1UL << 12;
-
- for (i = 0; i < gstage_pgd_levels; i++) {
- if (page_size == (psz << (i * gstage_index_bits))) {
- *out_level = i;
- return 0;
- }
- }
-
- return -EINVAL;
-}
-
-static int gstage_level_to_page_order(u32 level, unsigned long *out_pgorder)
-{
- if (gstage_pgd_levels < level)
- return -EINVAL;
-
- *out_pgorder = 12 + (level * gstage_index_bits);
- return 0;
-}
-
-static int gstage_level_to_page_size(u32 level, unsigned long *out_pgsize)
-{
- int rc;
- unsigned long page_order = PAGE_SHIFT;
-
- rc = gstage_level_to_page_order(level, &page_order);
- if (rc)
- return rc;
-
- *out_pgsize = BIT(page_order);
- return 0;
-}
-
-static bool gstage_get_leaf_entry(struct kvm *kvm, gpa_t addr,
- pte_t **ptepp, u32 *ptep_level)
-{
- pte_t *ptep;
- u32 current_level = gstage_pgd_levels - 1;
-
- *ptep_level = current_level;
- ptep = (pte_t *)kvm->arch.pgd;
- ptep = &ptep[gstage_pte_index(addr, current_level)];
- while (ptep && pte_val(ptep_get(ptep))) {
- if (gstage_pte_leaf(ptep)) {
- *ptep_level = current_level;
- *ptepp = ptep;
- return true;
- }
-
- if (current_level) {
- current_level--;
- *ptep_level = current_level;
- ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
- ptep = &ptep[gstage_pte_index(addr, current_level)];
- } else {
- ptep = NULL;
- }
- }
-
- return false;
-}
-
-static void gstage_remote_tlb_flush(struct kvm *kvm, u32 level, gpa_t addr)
-{
- unsigned long order = PAGE_SHIFT;
-
- if (gstage_level_to_page_order(level, &order))
- return;
- addr &= ~(BIT(order) - 1);
-
- kvm_riscv_hfence_gvma_vmid_gpa(kvm, -1UL, 0, addr, BIT(order), order);
-}
-
-static int gstage_set_pte(struct kvm *kvm, u32 level,
- struct kvm_mmu_memory_cache *pcache,
- gpa_t addr, const pte_t *new_pte)
-{
- u32 current_level = gstage_pgd_levels - 1;
- pte_t *next_ptep = (pte_t *)kvm->arch.pgd;
- pte_t *ptep = &next_ptep[gstage_pte_index(addr, current_level)];
-
- if (current_level < level)
- return -EINVAL;
-
- while (current_level != level) {
- if (gstage_pte_leaf(ptep))
- return -EEXIST;
-
- if (!pte_val(ptep_get(ptep))) {
- if (!pcache)
- return -ENOMEM;
- next_ptep = kvm_mmu_memory_cache_alloc(pcache);
- if (!next_ptep)
- return -ENOMEM;
- set_pte(ptep, pfn_pte(PFN_DOWN(__pa(next_ptep)),
- __pgprot(_PAGE_TABLE)));
- } else {
- if (gstage_pte_leaf(ptep))
- return -EEXIST;
- next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
- }
-
- current_level--;
- ptep = &next_ptep[gstage_pte_index(addr, current_level)];
- }
-
- set_pte(ptep, *new_pte);
- if (gstage_pte_leaf(ptep))
- gstage_remote_tlb_flush(kvm, current_level, addr);
-
- return 0;
-}
-
-static int gstage_map_page(struct kvm *kvm,
- struct kvm_mmu_memory_cache *pcache,
- gpa_t gpa, phys_addr_t hpa,
- unsigned long page_size,
- bool page_rdonly, bool page_exec)
-{
- int ret;
- u32 level = 0;
- pte_t new_pte;
- pgprot_t prot;
-
- ret = gstage_page_size_to_level(page_size, &level);
- if (ret)
- return ret;
-
- /*
- * A RISC-V implementation can choose to either:
- * 1) Update 'A' and 'D' PTE bits in hardware
- * 2) Generate page fault when 'A' and/or 'D' bits are not set
- * PTE so that software can update these bits.
- *
- * We support both options mentioned above. To achieve this, we
- * always set 'A' and 'D' PTE bits at time of creating G-stage
- * mapping. To support KVM dirty page logging with both options
- * mentioned above, we will write-protect G-stage PTEs to track
- * dirty pages.
- */
- if (page_exec) {
- if (page_rdonly)
- prot = PAGE_READ_EXEC;
- else
- prot = PAGE_WRITE_EXEC;
- } else {
- if (page_rdonly)
- prot = PAGE_READ;
- else
- prot = PAGE_WRITE;
- }
- new_pte = pfn_pte(PFN_DOWN(hpa), prot);
- new_pte = pte_mkdirty(new_pte);
-
- return gstage_set_pte(kvm, level, pcache, gpa, &new_pte);
-}
-
-enum gstage_op {
- GSTAGE_OP_NOP = 0, /* Nothing */
- GSTAGE_OP_CLEAR, /* Clear/Unmap */
- GSTAGE_OP_WP, /* Write-protect */
-};
-
-static void gstage_op_pte(struct kvm *kvm, gpa_t addr,
- pte_t *ptep, u32 ptep_level, enum gstage_op op)
-{
- int i, ret;
- pte_t *next_ptep;
- u32 next_ptep_level;
- unsigned long next_page_size, page_size;
-
- ret = gstage_level_to_page_size(ptep_level, &page_size);
- if (ret)
- return;
-
- BUG_ON(addr & (page_size - 1));
-
- if (!pte_val(ptep_get(ptep)))
- return;
-
- if (ptep_level && !gstage_pte_leaf(ptep)) {
- next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
- next_ptep_level = ptep_level - 1;
- ret = gstage_level_to_page_size(next_ptep_level,
- &next_page_size);
- if (ret)
- return;
-
- if (op == GSTAGE_OP_CLEAR)
- set_pte(ptep, __pte(0));
- for (i = 0; i < PTRS_PER_PTE; i++)
- gstage_op_pte(kvm, addr + i * next_page_size,
- &next_ptep[i], next_ptep_level, op);
- if (op == GSTAGE_OP_CLEAR)
- put_page(virt_to_page(next_ptep));
- } else {
- if (op == GSTAGE_OP_CLEAR)
- set_pte(ptep, __pte(0));
- else if (op == GSTAGE_OP_WP)
- set_pte(ptep, __pte(pte_val(ptep_get(ptep)) & ~_PAGE_WRITE));
- gstage_remote_tlb_flush(kvm, ptep_level, addr);
- }
-}
-
-static void gstage_unmap_range(struct kvm *kvm, gpa_t start,
- gpa_t size, bool may_block)
-{
- int ret;
- pte_t *ptep;
- u32 ptep_level;
- bool found_leaf;
- unsigned long page_size;
- gpa_t addr = start, end = start + size;
-
- while (addr < end) {
- found_leaf = gstage_get_leaf_entry(kvm, addr,
- &ptep, &ptep_level);
- ret = gstage_level_to_page_size(ptep_level, &page_size);
- if (ret)
- break;
-
- if (!found_leaf)
- goto next;
-
- if (!(addr & (page_size - 1)) && ((end - addr) >= page_size))
- gstage_op_pte(kvm, addr, ptep,
- ptep_level, GSTAGE_OP_CLEAR);
-
-next:
- addr += page_size;
-
- /*
- * If the range is too large, release the kvm->mmu_lock
- * to prevent starvation and lockup detector warnings.
- */
- if (may_block && addr < end)
- cond_resched_lock(&kvm->mmu_lock);
- }
-}
-
-static void gstage_wp_range(struct kvm *kvm, gpa_t start, gpa_t end)
-{
- int ret;
- pte_t *ptep;
- u32 ptep_level;
- bool found_leaf;
- gpa_t addr = start;
- unsigned long page_size;
-
- while (addr < end) {
- found_leaf = gstage_get_leaf_entry(kvm, addr,
- &ptep, &ptep_level);
- ret = gstage_level_to_page_size(ptep_level, &page_size);
- if (ret)
- break;
-
- if (!found_leaf)
- goto next;
-
- if (!(addr & (page_size - 1)) && ((end - addr) >= page_size))
- gstage_op_pte(kvm, addr, ptep,
- ptep_level, GSTAGE_OP_WP);
-
-next:
- addr += page_size;
- }
-}
-
-static void gstage_wp_memory_region(struct kvm *kvm, int slot)
+static void mmu_wp_memory_region(struct kvm *kvm, int slot)
{
struct kvm_memslots *slots = kvm_memslots(kvm);
struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
+ struct kvm_gstage gstage;
+
+ gstage.kvm = kvm;
+ gstage.flags = 0;
+ gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
+ gstage.pgd = kvm->arch.pgd;
spin_lock(&kvm->mmu_lock);
- gstage_wp_range(kvm, start, end);
+ kvm_riscv_gstage_wp_range(&gstage, start, end);
spin_unlock(&kvm->mmu_lock);
- kvm_flush_remote_tlbs(kvm);
+ kvm_flush_remote_tlbs_memslot(kvm, memslot);
}
-int kvm_riscv_gstage_ioremap(struct kvm *kvm, gpa_t gpa,
- phys_addr_t hpa, unsigned long size,
- bool writable, bool in_atomic)
+int kvm_riscv_mmu_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
+ unsigned long size, bool writable, bool in_atomic)
{
- pte_t pte;
int ret = 0;
unsigned long pfn;
phys_addr_t addr, end;
@@ -355,22 +45,31 @@ int kvm_riscv_gstage_ioremap(struct kvm *kvm, gpa_t gpa,
.gfp_custom = (in_atomic) ? GFP_ATOMIC | __GFP_ACCOUNT : 0,
.gfp_zero = __GFP_ZERO,
};
+ struct kvm_gstage_mapping map;
+ struct kvm_gstage gstage;
+
+ gstage.kvm = kvm;
+ gstage.flags = 0;
+ gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
+ gstage.pgd = kvm->arch.pgd;
end = (gpa + size + PAGE_SIZE - 1) & PAGE_MASK;
pfn = __phys_to_pfn(hpa);
for (addr = gpa; addr < end; addr += PAGE_SIZE) {
- pte = pfn_pte(pfn, PAGE_KERNEL_IO);
+ map.addr = addr;
+ map.pte = pfn_pte(pfn, PAGE_KERNEL_IO);
+ map.level = 0;
if (!writable)
- pte = pte_wrprotect(pte);
+ map.pte = pte_wrprotect(map.pte);
- ret = kvm_mmu_topup_memory_cache(&pcache, gstage_pgd_levels);
+ ret = kvm_mmu_topup_memory_cache(&pcache, kvm_riscv_gstage_pgd_levels);
if (ret)
goto out;
spin_lock(&kvm->mmu_lock);
- ret = gstage_set_pte(kvm, 0, &pcache, addr, &pte);
+ ret = kvm_riscv_gstage_set_pte(&gstage, &pcache, &map);
spin_unlock(&kvm->mmu_lock);
if (ret)
goto out;
@@ -383,10 +82,17 @@ out:
return ret;
}
-void kvm_riscv_gstage_iounmap(struct kvm *kvm, gpa_t gpa, unsigned long size)
+void kvm_riscv_mmu_iounmap(struct kvm *kvm, gpa_t gpa, unsigned long size)
{
+ struct kvm_gstage gstage;
+
+ gstage.kvm = kvm;
+ gstage.flags = 0;
+ gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
+ gstage.pgd = kvm->arch.pgd;
+
spin_lock(&kvm->mmu_lock);
- gstage_unmap_range(kvm, gpa, size, false);
+ kvm_riscv_gstage_unmap_range(&gstage, gpa, size, false);
spin_unlock(&kvm->mmu_lock);
}
@@ -398,8 +104,14 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
+ struct kvm_gstage gstage;
- gstage_wp_range(kvm, start, end);
+ gstage.kvm = kvm;
+ gstage.flags = 0;
+ gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
+ gstage.pgd = kvm->arch.pgd;
+
+ kvm_riscv_gstage_wp_range(&gstage, start, end);
}
void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
@@ -416,7 +128,7 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
- kvm_riscv_gstage_free_pgd(kvm);
+ kvm_riscv_mmu_free_pgd(kvm);
}
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
@@ -424,9 +136,15 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
{
gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
phys_addr_t size = slot->npages << PAGE_SHIFT;
+ struct kvm_gstage gstage;
+
+ gstage.kvm = kvm;
+ gstage.flags = 0;
+ gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
+ gstage.pgd = kvm->arch.pgd;
spin_lock(&kvm->mmu_lock);
- gstage_unmap_range(kvm, gpa, size, false);
+ kvm_riscv_gstage_unmap_range(&gstage, gpa, size, false);
spin_unlock(&kvm->mmu_lock);
}
@@ -441,7 +159,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
* the memory slot is write protected.
*/
if (change != KVM_MR_DELETE && new->flags & KVM_MEM_LOG_DIRTY_PAGES)
- gstage_wp_memory_region(kvm, new->id);
+ mmu_wp_memory_region(kvm, new->id);
}
int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -463,7 +181,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
* space addressable by the KVM guest GPA space.
*/
if ((new->base_gfn + new->npages) >=
- (gstage_gpa_size >> PAGE_SHIFT))
+ (kvm_riscv_gstage_gpa_size >> PAGE_SHIFT))
return -EFAULT;
hva = new->userspace_addr;
@@ -487,10 +205,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
* +--------------------------------------------+
*/
do {
- struct vm_area_struct *vma = find_vma(current->mm, hva);
+ struct vm_area_struct *vma;
hva_t vm_start, vm_end;
- if (!vma || vma->vm_start >= reg_end)
+ vma = find_vma_intersection(current->mm, hva, reg_end);
+ if (!vma)
break;
/*
@@ -519,9 +238,8 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
goto out;
}
- ret = kvm_riscv_gstage_ioremap(kvm, gpa, pa,
- vm_end - vm_start,
- writable, false);
+ ret = kvm_riscv_mmu_ioremap(kvm, gpa, pa, vm_end - vm_start,
+ writable, false);
if (ret)
break;
}
@@ -532,7 +250,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
goto out;
if (ret)
- kvm_riscv_gstage_iounmap(kvm, base_gpa, size);
+ kvm_riscv_mmu_iounmap(kvm, base_gpa, size);
out:
mmap_read_unlock(current->mm);
@@ -541,12 +259,18 @@ out:
bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
+ struct kvm_gstage gstage;
+
if (!kvm->arch.pgd)
return false;
- gstage_unmap_range(kvm, range->start << PAGE_SHIFT,
- (range->end - range->start) << PAGE_SHIFT,
- range->may_block);
+ gstage.kvm = kvm;
+ gstage.flags = 0;
+ gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
+ gstage.pgd = kvm->arch.pgd;
+ kvm_riscv_gstage_unmap_range(&gstage, range->start << PAGE_SHIFT,
+ (range->end - range->start) << PAGE_SHIFT,
+ range->may_block);
return false;
}
@@ -555,14 +279,19 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
pte_t *ptep;
u32 ptep_level = 0;
u64 size = (range->end - range->start) << PAGE_SHIFT;
+ struct kvm_gstage gstage;
if (!kvm->arch.pgd)
return false;
WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
- if (!gstage_get_leaf_entry(kvm, range->start << PAGE_SHIFT,
- &ptep, &ptep_level))
+ gstage.kvm = kvm;
+ gstage.flags = 0;
+ gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
+ gstage.pgd = kvm->arch.pgd;
+ if (!kvm_riscv_gstage_get_leaf(&gstage, range->start << PAGE_SHIFT,
+ &ptep, &ptep_level))
return false;
return ptep_test_and_clear_young(NULL, 0, ptep);
@@ -573,22 +302,27 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
pte_t *ptep;
u32 ptep_level = 0;
u64 size = (range->end - range->start) << PAGE_SHIFT;
+ struct kvm_gstage gstage;
if (!kvm->arch.pgd)
return false;
WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
- if (!gstage_get_leaf_entry(kvm, range->start << PAGE_SHIFT,
- &ptep, &ptep_level))
+ gstage.kvm = kvm;
+ gstage.flags = 0;
+ gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
+ gstage.pgd = kvm->arch.pgd;
+ if (!kvm_riscv_gstage_get_leaf(&gstage, range->start << PAGE_SHIFT,
+ &ptep, &ptep_level))
return false;
return pte_young(ptep_get(ptep));
}
-int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu,
- struct kvm_memory_slot *memslot,
- gpa_t gpa, unsigned long hva, bool is_write)
+int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
+ gpa_t gpa, unsigned long hva, bool is_write,
+ struct kvm_gstage_mapping *out_map)
{
int ret;
kvm_pfn_t hfn;
@@ -601,10 +335,19 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu,
bool logging = (memslot->dirty_bitmap &&
!(memslot->flags & KVM_MEM_READONLY)) ? true : false;
unsigned long vma_pagesize, mmu_seq;
+ struct kvm_gstage gstage;
struct page *page;
+ gstage.kvm = kvm;
+ gstage.flags = 0;
+ gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
+ gstage.pgd = kvm->arch.pgd;
+
+ /* Setup initial state of output mapping */
+ memset(out_map, 0, sizeof(*out_map));
+
/* We need minimum second+third level pages */
- ret = kvm_mmu_topup_memory_cache(pcache, gstage_pgd_levels);
+ ret = kvm_mmu_topup_memory_cache(pcache, kvm_riscv_gstage_pgd_levels);
if (ret) {
kvm_err("Failed to topup G-stage cache\n");
return ret;
@@ -648,7 +391,8 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu,
return -EFAULT;
}
- hfn = kvm_faultin_pfn(vcpu, gfn, is_write, &writable, &page);
+ hfn = __kvm_faultin_pfn(memslot, gfn, is_write ? FOLL_WRITE : 0,
+ &writable, &page);
if (hfn == KVM_PFN_ERR_HWPOISON) {
send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva,
vma_pageshift, current);
@@ -670,12 +414,12 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu,
goto out_unlock;
if (writable) {
- mark_page_dirty(kvm, gfn);
- ret = gstage_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT,
- vma_pagesize, false, true);
+ mark_page_dirty_in_slot(kvm, memslot, gfn);
+ ret = kvm_riscv_gstage_map_page(&gstage, pcache, gpa, hfn << PAGE_SHIFT,
+ vma_pagesize, false, true, out_map);
} else {
- ret = gstage_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT,
- vma_pagesize, true, true);
+ ret = kvm_riscv_gstage_map_page(&gstage, pcache, gpa, hfn << PAGE_SHIFT,
+ vma_pagesize, true, true, out_map);
}
if (ret)
@@ -687,7 +431,7 @@ out_unlock:
return ret;
}
-int kvm_riscv_gstage_alloc_pgd(struct kvm *kvm)
+int kvm_riscv_mmu_alloc_pgd(struct kvm *kvm)
{
struct page *pgd_page;
@@ -697,7 +441,7 @@ int kvm_riscv_gstage_alloc_pgd(struct kvm *kvm)
}
pgd_page = alloc_pages(GFP_KERNEL | __GFP_ZERO,
- get_order(gstage_pgd_size));
+ get_order(kvm_riscv_gstage_pgd_size));
if (!pgd_page)
return -ENOMEM;
kvm->arch.pgd = page_to_virt(pgd_page);
@@ -706,13 +450,18 @@ int kvm_riscv_gstage_alloc_pgd(struct kvm *kvm)
return 0;
}
-void kvm_riscv_gstage_free_pgd(struct kvm *kvm)
+void kvm_riscv_mmu_free_pgd(struct kvm *kvm)
{
+ struct kvm_gstage gstage;
void *pgd = NULL;
spin_lock(&kvm->mmu_lock);
if (kvm->arch.pgd) {
- gstage_unmap_range(kvm, 0UL, gstage_gpa_size, false);
+ gstage.kvm = kvm;
+ gstage.flags = 0;
+ gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
+ gstage.pgd = kvm->arch.pgd;
+ kvm_riscv_gstage_unmap_range(&gstage, 0UL, kvm_riscv_gstage_gpa_size, false);
pgd = READ_ONCE(kvm->arch.pgd);
kvm->arch.pgd = NULL;
kvm->arch.pgd_phys = 0;
@@ -720,12 +469,12 @@ void kvm_riscv_gstage_free_pgd(struct kvm *kvm)
spin_unlock(&kvm->mmu_lock);
if (pgd)
- free_pages((unsigned long)pgd, get_order(gstage_pgd_size));
+ free_pages((unsigned long)pgd, get_order(kvm_riscv_gstage_pgd_size));
}
-void kvm_riscv_gstage_update_hgatp(struct kvm_vcpu *vcpu)
+void kvm_riscv_mmu_update_hgatp(struct kvm_vcpu *vcpu)
{
- unsigned long hgatp = gstage_mode;
+ unsigned long hgatp = kvm_riscv_gstage_mode << HGATP_MODE_SHIFT;
struct kvm_arch *k = &vcpu->kvm->arch;
hgatp |= (READ_ONCE(k->vmid.vmid) << HGATP_VMID_SHIFT) & HGATP_VMID;
@@ -736,37 +485,3 @@ void kvm_riscv_gstage_update_hgatp(struct kvm_vcpu *vcpu)
if (!kvm_riscv_gstage_vmid_bits())
kvm_riscv_local_hfence_gvma_all();
}
-
-void __init kvm_riscv_gstage_mode_detect(void)
-{
-#ifdef CONFIG_64BIT
- /* Try Sv57x4 G-stage mode */
- csr_write(CSR_HGATP, HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT);
- if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV57X4) {
- gstage_mode = (HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT);
- gstage_pgd_levels = 5;
- goto skip_sv48x4_test;
- }
-
- /* Try Sv48x4 G-stage mode */
- csr_write(CSR_HGATP, HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT);
- if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV48X4) {
- gstage_mode = (HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT);
- gstage_pgd_levels = 4;
- }
-skip_sv48x4_test:
-
- csr_write(CSR_HGATP, 0);
- kvm_riscv_local_hfence_gvma_all();
-#endif
-}
-
-unsigned long __init kvm_riscv_gstage_mode(void)
-{
- return gstage_mode >> HGATP_MODE_SHIFT;
-}
-
-int kvm_riscv_gstage_gpa_bits(void)
-{
- return gstage_gpa_bits;
-}
diff --git a/arch/riscv/kvm/tlb.c b/arch/riscv/kvm/tlb.c
index 2f91ea5f8493..3c5a70a2b927 100644
--- a/arch/riscv/kvm/tlb.c
+++ b/arch/riscv/kvm/tlb.c
@@ -15,6 +15,8 @@
#include <asm/cpufeature.h>
#include <asm/insn-def.h>
#include <asm/kvm_nacl.h>
+#include <asm/kvm_tlb.h>
+#include <asm/kvm_vmid.h>
#define has_svinval() riscv_has_extension_unlikely(RISCV_ISA_EXT_SVINVAL)
@@ -156,36 +158,13 @@ void kvm_riscv_local_hfence_vvma_all(unsigned long vmid)
csr_write(CSR_HGATP, hgatp);
}
-void kvm_riscv_local_tlb_sanitize(struct kvm_vcpu *vcpu)
-{
- unsigned long vmid;
-
- if (!kvm_riscv_gstage_vmid_bits() ||
- vcpu->arch.last_exit_cpu == vcpu->cpu)
- return;
-
- /*
- * On RISC-V platforms with hardware VMID support, we share same
- * VMID for all VCPUs of a particular Guest/VM. This means we might
- * have stale G-stage TLB entries on the current Host CPU due to
- * some other VCPU of the same Guest which ran previously on the
- * current Host CPU.
- *
- * To cleanup stale TLB entries, we simply flush all G-stage TLB
- * entries by VMID whenever underlying Host CPU changes for a VCPU.
- */
-
- vmid = READ_ONCE(vcpu->kvm->arch.vmid.vmid);
- kvm_riscv_local_hfence_gvma_vmid_all(vmid);
-}
-
void kvm_riscv_fence_i_process(struct kvm_vcpu *vcpu)
{
kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_FENCE_I_RCVD);
local_flush_icache_all();
}
-void kvm_riscv_hfence_gvma_vmid_all_process(struct kvm_vcpu *vcpu)
+void kvm_riscv_tlb_flush_process(struct kvm_vcpu *vcpu)
{
struct kvm_vmid *v = &vcpu->kvm->arch.vmid;
unsigned long vmid = READ_ONCE(v->vmid);
@@ -258,51 +237,58 @@ static bool vcpu_hfence_enqueue(struct kvm_vcpu *vcpu,
void kvm_riscv_hfence_process(struct kvm_vcpu *vcpu)
{
- unsigned long vmid;
struct kvm_riscv_hfence d = { 0 };
- struct kvm_vmid *v = &vcpu->kvm->arch.vmid;
while (vcpu_hfence_dequeue(vcpu, &d)) {
switch (d.type) {
case KVM_RISCV_HFENCE_UNKNOWN:
break;
case KVM_RISCV_HFENCE_GVMA_VMID_GPA:
- vmid = READ_ONCE(v->vmid);
if (kvm_riscv_nacl_available())
- nacl_hfence_gvma_vmid(nacl_shmem(), vmid,
+ nacl_hfence_gvma_vmid(nacl_shmem(), d.vmid,
d.addr, d.size, d.order);
else
- kvm_riscv_local_hfence_gvma_vmid_gpa(vmid, d.addr,
+ kvm_riscv_local_hfence_gvma_vmid_gpa(d.vmid, d.addr,
d.size, d.order);
break;
+ case KVM_RISCV_HFENCE_GVMA_VMID_ALL:
+ if (kvm_riscv_nacl_available())
+ nacl_hfence_gvma_vmid_all(nacl_shmem(), d.vmid);
+ else
+ kvm_riscv_local_hfence_gvma_vmid_all(d.vmid);
+ break;
case KVM_RISCV_HFENCE_VVMA_ASID_GVA:
kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_ASID_RCVD);
- vmid = READ_ONCE(v->vmid);
if (kvm_riscv_nacl_available())
- nacl_hfence_vvma_asid(nacl_shmem(), vmid, d.asid,
+ nacl_hfence_vvma_asid(nacl_shmem(), d.vmid, d.asid,
d.addr, d.size, d.order);
else
- kvm_riscv_local_hfence_vvma_asid_gva(vmid, d.asid, d.addr,
+ kvm_riscv_local_hfence_vvma_asid_gva(d.vmid, d.asid, d.addr,
d.size, d.order);
break;
case KVM_RISCV_HFENCE_VVMA_ASID_ALL:
kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_ASID_RCVD);
- vmid = READ_ONCE(v->vmid);
if (kvm_riscv_nacl_available())
- nacl_hfence_vvma_asid_all(nacl_shmem(), vmid, d.asid);
+ nacl_hfence_vvma_asid_all(nacl_shmem(), d.vmid, d.asid);
else
- kvm_riscv_local_hfence_vvma_asid_all(vmid, d.asid);
+ kvm_riscv_local_hfence_vvma_asid_all(d.vmid, d.asid);
break;
case KVM_RISCV_HFENCE_VVMA_GVA:
kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_RCVD);
- vmid = READ_ONCE(v->vmid);
if (kvm_riscv_nacl_available())
- nacl_hfence_vvma(nacl_shmem(), vmid,
+ nacl_hfence_vvma(nacl_shmem(), d.vmid,
d.addr, d.size, d.order);
else
- kvm_riscv_local_hfence_vvma_gva(vmid, d.addr,
+ kvm_riscv_local_hfence_vvma_gva(d.vmid, d.addr,
d.size, d.order);
break;
+ case KVM_RISCV_HFENCE_VVMA_ALL:
+ kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_RCVD);
+ if (kvm_riscv_nacl_available())
+ nacl_hfence_vvma_all(nacl_shmem(), d.vmid);
+ else
+ kvm_riscv_local_hfence_vvma_all(d.vmid);
+ break;
default:
break;
}
@@ -355,35 +341,43 @@ void kvm_riscv_fence_i(struct kvm *kvm,
void kvm_riscv_hfence_gvma_vmid_gpa(struct kvm *kvm,
unsigned long hbase, unsigned long hmask,
gpa_t gpa, gpa_t gpsz,
- unsigned long order)
+ unsigned long order, unsigned long vmid)
{
struct kvm_riscv_hfence data;
data.type = KVM_RISCV_HFENCE_GVMA_VMID_GPA;
data.asid = 0;
+ data.vmid = vmid;
data.addr = gpa;
data.size = gpsz;
data.order = order;
make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE,
- KVM_REQ_HFENCE_GVMA_VMID_ALL, &data);
+ KVM_REQ_TLB_FLUSH, &data);
}
void kvm_riscv_hfence_gvma_vmid_all(struct kvm *kvm,
- unsigned long hbase, unsigned long hmask)
+ unsigned long hbase, unsigned long hmask,
+ unsigned long vmid)
{
- make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE_GVMA_VMID_ALL,
- KVM_REQ_HFENCE_GVMA_VMID_ALL, NULL);
+ struct kvm_riscv_hfence data = {0};
+
+ data.type = KVM_RISCV_HFENCE_GVMA_VMID_ALL;
+ data.vmid = vmid;
+ make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE,
+ KVM_REQ_TLB_FLUSH, &data);
}
void kvm_riscv_hfence_vvma_asid_gva(struct kvm *kvm,
unsigned long hbase, unsigned long hmask,
unsigned long gva, unsigned long gvsz,
- unsigned long order, unsigned long asid)
+ unsigned long order, unsigned long asid,
+ unsigned long vmid)
{
struct kvm_riscv_hfence data;
data.type = KVM_RISCV_HFENCE_VVMA_ASID_GVA;
data.asid = asid;
+ data.vmid = vmid;
data.addr = gva;
data.size = gvsz;
data.order = order;
@@ -393,13 +387,13 @@ void kvm_riscv_hfence_vvma_asid_gva(struct kvm *kvm,
void kvm_riscv_hfence_vvma_asid_all(struct kvm *kvm,
unsigned long hbase, unsigned long hmask,
- unsigned long asid)
+ unsigned long asid, unsigned long vmid)
{
- struct kvm_riscv_hfence data;
+ struct kvm_riscv_hfence data = {0};
data.type = KVM_RISCV_HFENCE_VVMA_ASID_ALL;
data.asid = asid;
- data.addr = data.size = data.order = 0;
+ data.vmid = vmid;
make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE,
KVM_REQ_HFENCE_VVMA_ALL, &data);
}
@@ -407,12 +401,13 @@ void kvm_riscv_hfence_vvma_asid_all(struct kvm *kvm,
void kvm_riscv_hfence_vvma_gva(struct kvm *kvm,
unsigned long hbase, unsigned long hmask,
unsigned long gva, unsigned long gvsz,
- unsigned long order)
+ unsigned long order, unsigned long vmid)
{
struct kvm_riscv_hfence data;
data.type = KVM_RISCV_HFENCE_VVMA_GVA;
data.asid = 0;
+ data.vmid = vmid;
data.addr = gva;
data.size = gvsz;
data.order = order;
@@ -421,8 +416,21 @@ void kvm_riscv_hfence_vvma_gva(struct kvm *kvm,
}
void kvm_riscv_hfence_vvma_all(struct kvm *kvm,
- unsigned long hbase, unsigned long hmask)
+ unsigned long hbase, unsigned long hmask,
+ unsigned long vmid)
+{
+ struct kvm_riscv_hfence data = {0};
+
+ data.type = KVM_RISCV_HFENCE_VVMA_ALL;
+ data.vmid = vmid;
+ make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE,
+ KVM_REQ_HFENCE_VVMA_ALL, &data);
+}
+
+int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
{
- make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE_VVMA_ALL,
- KVM_REQ_HFENCE_VVMA_ALL, NULL);
+ kvm_riscv_hfence_gvma_vmid_gpa(kvm, -1UL, 0,
+ gfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT,
+ PAGE_SHIFT, READ_ONCE(kvm->arch.vmid.vmid));
+ return 0;
}
diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c
index 0462863206ca..f001e56403f9 100644
--- a/arch/riscv/kvm/vcpu.c
+++ b/arch/riscv/kvm/vcpu.c
@@ -18,6 +18,7 @@
#include <linux/fs.h>
#include <linux/kvm_host.h>
#include <asm/cacheflush.h>
+#include <asm/kvm_mmu.h>
#include <asm/kvm_nacl.h>
#include <asm/kvm_vcpu_vector.h>
@@ -111,7 +112,7 @@ static void kvm_riscv_reset_vcpu(struct kvm_vcpu *vcpu, bool kvm_sbi_reset)
vcpu->arch.hfence_tail = 0;
memset(vcpu->arch.hfence_queue, 0, sizeof(vcpu->arch.hfence_queue));
- kvm_riscv_vcpu_sbi_sta_reset(vcpu);
+ kvm_riscv_vcpu_sbi_reset(vcpu);
/* Reset the guest CSRs for hotplug usecase */
if (loaded)
@@ -148,8 +149,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
spin_lock_init(&vcpu->arch.reset_state.lock);
- if (kvm_riscv_vcpu_alloc_vector_context(vcpu))
- return -ENOMEM;
+ rc = kvm_riscv_vcpu_alloc_vector_context(vcpu);
+ if (rc)
+ return rc;
/* Setup VCPU timer */
kvm_riscv_vcpu_timer_init(vcpu);
@@ -158,9 +160,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
kvm_riscv_vcpu_pmu_init(vcpu);
/* Setup VCPU AIA */
- rc = kvm_riscv_vcpu_aia_init(vcpu);
- if (rc)
- return rc;
+ kvm_riscv_vcpu_aia_init(vcpu);
/*
* Setup SBI extensions
@@ -187,6 +187,8 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
+ kvm_riscv_vcpu_sbi_deinit(vcpu);
+
/* Cleanup VCPU AIA context */
kvm_riscv_vcpu_aia_deinit(vcpu);
@@ -620,7 +622,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
}
}
- kvm_riscv_gstage_update_hgatp(vcpu);
+ kvm_riscv_mmu_update_hgatp(vcpu);
kvm_riscv_vcpu_timer_restore(vcpu);
@@ -680,7 +682,14 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
}
}
-static void kvm_riscv_check_vcpu_requests(struct kvm_vcpu *vcpu)
+/**
+ * check_vcpu_requests - check and handle pending vCPU requests
+ * @vcpu: the VCPU pointer
+ *
+ * Return: 1 if we should enter the guest
+ * 0 if we should exit to userspace
+ */
+static int kvm_riscv_check_vcpu_requests(struct kvm_vcpu *vcpu)
{
struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
@@ -705,17 +714,13 @@ static void kvm_riscv_check_vcpu_requests(struct kvm_vcpu *vcpu)
kvm_riscv_reset_vcpu(vcpu, true);
if (kvm_check_request(KVM_REQ_UPDATE_HGATP, vcpu))
- kvm_riscv_gstage_update_hgatp(vcpu);
+ kvm_riscv_mmu_update_hgatp(vcpu);
if (kvm_check_request(KVM_REQ_FENCE_I, vcpu))
kvm_riscv_fence_i_process(vcpu);
- /*
- * The generic KVM_REQ_TLB_FLUSH is same as
- * KVM_REQ_HFENCE_GVMA_VMID_ALL
- */
- if (kvm_check_request(KVM_REQ_HFENCE_GVMA_VMID_ALL, vcpu))
- kvm_riscv_hfence_gvma_vmid_all_process(vcpu);
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
+ kvm_riscv_tlb_flush_process(vcpu);
if (kvm_check_request(KVM_REQ_HFENCE_VVMA_ALL, vcpu))
kvm_riscv_hfence_vvma_all_process(vcpu);
@@ -725,7 +730,12 @@ static void kvm_riscv_check_vcpu_requests(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
kvm_riscv_vcpu_record_steal_time(vcpu);
+
+ if (kvm_dirty_ring_check_request(vcpu))
+ return 0;
}
+
+ return 1;
}
static void kvm_riscv_update_hvip(struct kvm_vcpu *vcpu)
@@ -907,7 +917,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
kvm_riscv_gstage_vmid_update(vcpu);
- kvm_riscv_check_vcpu_requests(vcpu);
+ ret = kvm_riscv_check_vcpu_requests(vcpu);
+ if (ret <= 0)
+ continue;
preempt_disable();
@@ -951,12 +963,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
}
/*
- * Cleanup stale TLB enteries
+ * Sanitize VMID mappings cached (TLB) on current CPU
*
* Note: This should be done after G-stage VMID has been
* updated using kvm_riscv_gstage_vmid_ver_changed()
*/
- kvm_riscv_local_tlb_sanitize(vcpu);
+ kvm_riscv_gstage_vmid_sanitize(vcpu);
trace_kvm_entry(vcpu);
diff --git a/arch/riscv/kvm/vcpu_exit.c b/arch/riscv/kvm/vcpu_exit.c
index 6e0c18412795..0bb0c51e3c89 100644
--- a/arch/riscv/kvm/vcpu_exit.c
+++ b/arch/riscv/kvm/vcpu_exit.c
@@ -9,10 +9,13 @@
#include <linux/kvm_host.h>
#include <asm/csr.h>
#include <asm/insn-def.h>
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_nacl.h>
static int gstage_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run,
struct kvm_cpu_trap *trap)
{
+ struct kvm_gstage_mapping host_map;
struct kvm_memory_slot *memslot;
unsigned long hva, fault_addr;
bool writable;
@@ -40,8 +43,9 @@ static int gstage_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run,
};
}
- ret = kvm_riscv_gstage_map(vcpu, memslot, fault_addr, hva,
- (trap->scause == EXC_STORE_GUEST_PAGE_FAULT) ? true : false);
+ ret = kvm_riscv_mmu_map(vcpu, memslot, fault_addr, hva,
+ (trap->scause == EXC_STORE_GUEST_PAGE_FAULT) ? true : false,
+ &host_map);
if (ret < 0)
return ret;
@@ -135,7 +139,7 @@ unsigned long kvm_riscv_vcpu_unpriv_read(struct kvm_vcpu *vcpu,
void kvm_riscv_vcpu_trap_redirect(struct kvm_vcpu *vcpu,
struct kvm_cpu_trap *trap)
{
- unsigned long vsstatus = csr_read(CSR_VSSTATUS);
+ unsigned long vsstatus = ncsr_read(CSR_VSSTATUS);
/* Change Guest SSTATUS.SPP bit */
vsstatus &= ~SR_SPP;
@@ -151,15 +155,15 @@ void kvm_riscv_vcpu_trap_redirect(struct kvm_vcpu *vcpu,
vsstatus &= ~SR_SIE;
/* Update Guest SSTATUS */
- csr_write(CSR_VSSTATUS, vsstatus);
+ ncsr_write(CSR_VSSTATUS, vsstatus);
/* Update Guest SCAUSE, STVAL, and SEPC */
- csr_write(CSR_VSCAUSE, trap->scause);
- csr_write(CSR_VSTVAL, trap->stval);
- csr_write(CSR_VSEPC, trap->sepc);
+ ncsr_write(CSR_VSCAUSE, trap->scause);
+ ncsr_write(CSR_VSTVAL, trap->stval);
+ ncsr_write(CSR_VSEPC, trap->sepc);
/* Set Guest PC to Guest exception vector */
- vcpu->arch.guest_context.sepc = csr_read(CSR_VSTVEC);
+ vcpu->arch.guest_context.sepc = ncsr_read(CSR_VSTVEC);
/* Set Guest privilege mode to supervisor */
vcpu->arch.guest_context.sstatus |= SR_SPP;
diff --git a/arch/riscv/kvm/vcpu_onereg.c b/arch/riscv/kvm/vcpu_onereg.c
index 2e1b646f0d61..cce6a38ea54f 100644
--- a/arch/riscv/kvm/vcpu_onereg.c
+++ b/arch/riscv/kvm/vcpu_onereg.c
@@ -23,7 +23,7 @@
#define KVM_ISA_EXT_ARR(ext) \
[KVM_RISCV_ISA_EXT_##ext] = RISCV_ISA_EXT_##ext
-/* Mapping between KVM ISA Extension ID & Host ISA extension ID */
+/* Mapping between KVM ISA Extension ID & guest ISA extension ID */
static const unsigned long kvm_isa_ext_arr[] = {
/* Single letter extensions (alphabetically sorted) */
[KVM_RISCV_ISA_EXT_A] = RISCV_ISA_EXT_a,
@@ -35,7 +35,7 @@ static const unsigned long kvm_isa_ext_arr[] = {
[KVM_RISCV_ISA_EXT_M] = RISCV_ISA_EXT_m,
[KVM_RISCV_ISA_EXT_V] = RISCV_ISA_EXT_v,
/* Multi letter extensions (alphabetically sorted) */
- [KVM_RISCV_ISA_EXT_SMNPM] = RISCV_ISA_EXT_SSNPM,
+ KVM_ISA_EXT_ARR(SMNPM),
KVM_ISA_EXT_ARR(SMSTATEEN),
KVM_ISA_EXT_ARR(SSAIA),
KVM_ISA_EXT_ARR(SSCOFPMF),
@@ -112,6 +112,36 @@ static unsigned long kvm_riscv_vcpu_base2isa_ext(unsigned long base_ext)
return KVM_RISCV_ISA_EXT_MAX;
}
+static int kvm_riscv_vcpu_isa_check_host(unsigned long kvm_ext, unsigned long *guest_ext)
+{
+ unsigned long host_ext;
+
+ if (kvm_ext >= KVM_RISCV_ISA_EXT_MAX ||
+ kvm_ext >= ARRAY_SIZE(kvm_isa_ext_arr))
+ return -ENOENT;
+
+ *guest_ext = kvm_isa_ext_arr[kvm_ext];
+ switch (*guest_ext) {
+ case RISCV_ISA_EXT_SMNPM:
+ /*
+ * Pointer masking effective in (H)S-mode is provided by the
+ * Smnpm extension, so that extension is reported to the guest,
+ * even though the CSR bits for configuring VS-mode pointer
+ * masking on the host side are part of the Ssnpm extension.
+ */
+ host_ext = RISCV_ISA_EXT_SSNPM;
+ break;
+ default:
+ host_ext = *guest_ext;
+ break;
+ }
+
+ if (!__riscv_isa_extension_available(NULL, host_ext))
+ return -ENOENT;
+
+ return 0;
+}
+
static bool kvm_riscv_vcpu_isa_enable_allowed(unsigned long ext)
{
switch (ext) {
@@ -219,13 +249,13 @@ static bool kvm_riscv_vcpu_isa_disable_allowed(unsigned long ext)
void kvm_riscv_vcpu_setup_isa(struct kvm_vcpu *vcpu)
{
- unsigned long host_isa, i;
+ unsigned long guest_ext, i;
for (i = 0; i < ARRAY_SIZE(kvm_isa_ext_arr); i++) {
- host_isa = kvm_isa_ext_arr[i];
- if (__riscv_isa_extension_available(NULL, host_isa) &&
- kvm_riscv_vcpu_isa_enable_allowed(i))
- set_bit(host_isa, vcpu->arch.isa);
+ if (kvm_riscv_vcpu_isa_check_host(i, &guest_ext))
+ continue;
+ if (kvm_riscv_vcpu_isa_enable_allowed(i))
+ set_bit(guest_ext, vcpu->arch.isa);
}
}
@@ -607,18 +637,15 @@ static int riscv_vcpu_get_isa_ext_single(struct kvm_vcpu *vcpu,
unsigned long reg_num,
unsigned long *reg_val)
{
- unsigned long host_isa_ext;
-
- if (reg_num >= KVM_RISCV_ISA_EXT_MAX ||
- reg_num >= ARRAY_SIZE(kvm_isa_ext_arr))
- return -ENOENT;
+ unsigned long guest_ext;
+ int ret;
- host_isa_ext = kvm_isa_ext_arr[reg_num];
- if (!__riscv_isa_extension_available(NULL, host_isa_ext))
- return -ENOENT;
+ ret = kvm_riscv_vcpu_isa_check_host(reg_num, &guest_ext);
+ if (ret)
+ return ret;
*reg_val = 0;
- if (__riscv_isa_extension_available(vcpu->arch.isa, host_isa_ext))
+ if (__riscv_isa_extension_available(vcpu->arch.isa, guest_ext))
*reg_val = 1; /* Mark the given extension as available */
return 0;
@@ -628,17 +655,14 @@ static int riscv_vcpu_set_isa_ext_single(struct kvm_vcpu *vcpu,
unsigned long reg_num,
unsigned long reg_val)
{
- unsigned long host_isa_ext;
-
- if (reg_num >= KVM_RISCV_ISA_EXT_MAX ||
- reg_num >= ARRAY_SIZE(kvm_isa_ext_arr))
- return -ENOENT;
+ unsigned long guest_ext;
+ int ret;
- host_isa_ext = kvm_isa_ext_arr[reg_num];
- if (!__riscv_isa_extension_available(NULL, host_isa_ext))
- return -ENOENT;
+ ret = kvm_riscv_vcpu_isa_check_host(reg_num, &guest_ext);
+ if (ret)
+ return ret;
- if (reg_val == test_bit(host_isa_ext, vcpu->arch.isa))
+ if (reg_val == test_bit(guest_ext, vcpu->arch.isa))
return 0;
if (!vcpu->arch.ran_atleast_once) {
@@ -648,10 +672,10 @@ static int riscv_vcpu_set_isa_ext_single(struct kvm_vcpu *vcpu,
*/
if (reg_val == 1 &&
kvm_riscv_vcpu_isa_enable_allowed(reg_num))
- set_bit(host_isa_ext, vcpu->arch.isa);
+ set_bit(guest_ext, vcpu->arch.isa);
else if (!reg_val &&
kvm_riscv_vcpu_isa_disable_allowed(reg_num))
- clear_bit(host_isa_ext, vcpu->arch.isa);
+ clear_bit(guest_ext, vcpu->arch.isa);
else
return -EINVAL;
kvm_riscv_vcpu_fp_reset(vcpu);
@@ -1009,16 +1033,15 @@ static int copy_fp_d_reg_indices(const struct kvm_vcpu *vcpu,
static int copy_isa_ext_reg_indices(const struct kvm_vcpu *vcpu,
u64 __user *uindices)
{
+ unsigned long guest_ext;
unsigned int n = 0;
- unsigned long isa_ext;
for (int i = 0; i < KVM_RISCV_ISA_EXT_MAX; i++) {
u64 size = IS_ENABLED(CONFIG_32BIT) ?
KVM_REG_SIZE_U32 : KVM_REG_SIZE_U64;
u64 reg = KVM_REG_RISCV | size | KVM_REG_RISCV_ISA_EXT | i;
- isa_ext = kvm_isa_ext_arr[i];
- if (!__riscv_isa_extension_available(NULL, isa_ext))
+ if (kvm_riscv_vcpu_isa_check_host(i, &guest_ext))
continue;
if (uindices) {
diff --git a/arch/riscv/kvm/vcpu_sbi.c b/arch/riscv/kvm/vcpu_sbi.c
index 6e09b518a5d1..a56c4959f9ad 100644
--- a/arch/riscv/kvm/vcpu_sbi.c
+++ b/arch/riscv/kvm/vcpu_sbi.c
@@ -536,5 +536,54 @@ void kvm_riscv_vcpu_sbi_init(struct kvm_vcpu *vcpu)
scontext->ext_status[idx] = ext->default_disabled ?
KVM_RISCV_SBI_EXT_STATUS_DISABLED :
KVM_RISCV_SBI_EXT_STATUS_ENABLED;
+
+ if (ext->init && ext->init(vcpu) != 0)
+ scontext->ext_status[idx] = KVM_RISCV_SBI_EXT_STATUS_UNAVAILABLE;
+ }
+}
+
+void kvm_riscv_vcpu_sbi_deinit(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_sbi_context *scontext = &vcpu->arch.sbi_context;
+ const struct kvm_riscv_sbi_extension_entry *entry;
+ const struct kvm_vcpu_sbi_extension *ext;
+ int idx, i;
+
+ for (i = 0; i < ARRAY_SIZE(sbi_ext); i++) {
+ entry = &sbi_ext[i];
+ ext = entry->ext_ptr;
+ idx = entry->ext_idx;
+
+ if (idx < 0 || idx >= ARRAY_SIZE(scontext->ext_status))
+ continue;
+
+ if (scontext->ext_status[idx] == KVM_RISCV_SBI_EXT_STATUS_UNAVAILABLE ||
+ !ext->deinit)
+ continue;
+
+ ext->deinit(vcpu);
+ }
+}
+
+void kvm_riscv_vcpu_sbi_reset(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_sbi_context *scontext = &vcpu->arch.sbi_context;
+ const struct kvm_riscv_sbi_extension_entry *entry;
+ const struct kvm_vcpu_sbi_extension *ext;
+ int idx, i;
+
+ for (i = 0; i < ARRAY_SIZE(sbi_ext); i++) {
+ entry = &sbi_ext[i];
+ ext = entry->ext_ptr;
+ idx = entry->ext_idx;
+
+ if (idx < 0 || idx >= ARRAY_SIZE(scontext->ext_status))
+ continue;
+
+ if (scontext->ext_status[idx] != KVM_RISCV_SBI_EXT_STATUS_ENABLED ||
+ !ext->reset)
+ continue;
+
+ ext->reset(vcpu);
}
}
diff --git a/arch/riscv/kvm/vcpu_sbi_replace.c b/arch/riscv/kvm/vcpu_sbi_replace.c
index b17fad091bab..b490ed1428a6 100644
--- a/arch/riscv/kvm/vcpu_sbi_replace.c
+++ b/arch/riscv/kvm/vcpu_sbi_replace.c
@@ -96,6 +96,7 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run
unsigned long hmask = cp->a0;
unsigned long hbase = cp->a1;
unsigned long funcid = cp->a6;
+ unsigned long vmid;
switch (funcid) {
case SBI_EXT_RFENCE_REMOTE_FENCE_I:
@@ -103,22 +104,22 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run
kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_FENCE_I_SENT);
break;
case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA:
+ vmid = READ_ONCE(vcpu->kvm->arch.vmid.vmid);
if ((cp->a2 == 0 && cp->a3 == 0) || cp->a3 == -1UL)
- kvm_riscv_hfence_vvma_all(vcpu->kvm, hbase, hmask);
+ kvm_riscv_hfence_vvma_all(vcpu->kvm, hbase, hmask, vmid);
else
kvm_riscv_hfence_vvma_gva(vcpu->kvm, hbase, hmask,
- cp->a2, cp->a3, PAGE_SHIFT);
+ cp->a2, cp->a3, PAGE_SHIFT, vmid);
kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_SENT);
break;
case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID:
+ vmid = READ_ONCE(vcpu->kvm->arch.vmid.vmid);
if ((cp->a2 == 0 && cp->a3 == 0) || cp->a3 == -1UL)
- kvm_riscv_hfence_vvma_asid_all(vcpu->kvm,
- hbase, hmask, cp->a4);
+ kvm_riscv_hfence_vvma_asid_all(vcpu->kvm, hbase, hmask,
+ cp->a4, vmid);
else
- kvm_riscv_hfence_vvma_asid_gva(vcpu->kvm,
- hbase, hmask,
- cp->a2, cp->a3,
- PAGE_SHIFT, cp->a4);
+ kvm_riscv_hfence_vvma_asid_gva(vcpu->kvm, hbase, hmask, cp->a2,
+ cp->a3, PAGE_SHIFT, cp->a4, vmid);
kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_ASID_SENT);
break;
case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA:
diff --git a/arch/riscv/kvm/vcpu_sbi_sta.c b/arch/riscv/kvm/vcpu_sbi_sta.c
index 5f35427114c1..cc6cb7c8f0e4 100644
--- a/arch/riscv/kvm/vcpu_sbi_sta.c
+++ b/arch/riscv/kvm/vcpu_sbi_sta.c
@@ -16,7 +16,7 @@
#include <asm/sbi.h>
#include <asm/uaccess.h>
-void kvm_riscv_vcpu_sbi_sta_reset(struct kvm_vcpu *vcpu)
+static void kvm_riscv_vcpu_sbi_sta_reset(struct kvm_vcpu *vcpu)
{
vcpu->arch.sta.shmem = INVALID_GPA;
vcpu->arch.sta.last_steal = 0;
@@ -156,6 +156,7 @@ const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_sta = {
.extid_end = SBI_EXT_STA,
.handler = kvm_sbi_ext_sta_handler,
.probe = kvm_sbi_ext_sta_probe,
+ .reset = kvm_riscv_vcpu_sbi_sta_reset,
};
int kvm_riscv_vcpu_get_reg_sbi_sta(struct kvm_vcpu *vcpu,
diff --git a/arch/riscv/kvm/vcpu_sbi_v01.c b/arch/riscv/kvm/vcpu_sbi_v01.c
index 8f4c4fa16227..368dfddd23d9 100644
--- a/arch/riscv/kvm/vcpu_sbi_v01.c
+++ b/arch/riscv/kvm/vcpu_sbi_v01.c
@@ -23,6 +23,7 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
struct kvm *kvm = vcpu->kvm;
struct kvm_cpu_context *cp = &vcpu->arch.guest_context;
struct kvm_cpu_trap *utrap = retdata->utrap;
+ unsigned long vmid;
switch (cp->a7) {
case SBI_EXT_0_1_CONSOLE_GETCHAR:
@@ -78,25 +79,21 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
if (cp->a7 == SBI_EXT_0_1_REMOTE_FENCE_I)
kvm_riscv_fence_i(vcpu->kvm, 0, hmask);
else if (cp->a7 == SBI_EXT_0_1_REMOTE_SFENCE_VMA) {
+ vmid = READ_ONCE(vcpu->kvm->arch.vmid.vmid);
if (cp->a1 == 0 && cp->a2 == 0)
- kvm_riscv_hfence_vvma_all(vcpu->kvm,
- 0, hmask);
+ kvm_riscv_hfence_vvma_all(vcpu->kvm, 0, hmask, vmid);
else
- kvm_riscv_hfence_vvma_gva(vcpu->kvm,
- 0, hmask,
- cp->a1, cp->a2,
- PAGE_SHIFT);
+ kvm_riscv_hfence_vvma_gva(vcpu->kvm, 0, hmask, cp->a1,
+ cp->a2, PAGE_SHIFT, vmid);
} else {
+ vmid = READ_ONCE(vcpu->kvm->arch.vmid.vmid);
if (cp->a1 == 0 && cp->a2 == 0)
- kvm_riscv_hfence_vvma_asid_all(vcpu->kvm,
- 0, hmask,
- cp->a3);
+ kvm_riscv_hfence_vvma_asid_all(vcpu->kvm, 0, hmask,
+ cp->a3, vmid);
else
- kvm_riscv_hfence_vvma_asid_gva(vcpu->kvm,
- 0, hmask,
- cp->a1, cp->a2,
- PAGE_SHIFT,
- cp->a3);
+ kvm_riscv_hfence_vvma_asid_gva(vcpu->kvm, 0, hmask,
+ cp->a1, cp->a2, PAGE_SHIFT,
+ cp->a3, vmid);
}
break;
default:
diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c
index b27ec8f96697..66d91ae6e9b2 100644
--- a/arch/riscv/kvm/vm.c
+++ b/arch/riscv/kvm/vm.c
@@ -11,6 +11,7 @@
#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/kvm_host.h>
+#include <asm/kvm_mmu.h>
const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
KVM_GENERIC_VM_STATS()
@@ -31,13 +32,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
int r;
- r = kvm_riscv_gstage_alloc_pgd(kvm);
+ r = kvm_riscv_mmu_alloc_pgd(kvm);
if (r)
return r;
r = kvm_riscv_gstage_vmid_init(kvm);
if (r) {
- kvm_riscv_gstage_free_pgd(kvm);
+ kvm_riscv_mmu_free_pgd(kvm);
return r;
}
@@ -199,7 +200,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = KVM_USER_MEM_SLOTS;
break;
case KVM_CAP_VM_GPA_BITS:
- r = kvm_riscv_gstage_gpa_bits();
+ r = kvm_riscv_gstage_gpa_bits;
break;
default:
r = 0;
diff --git a/arch/riscv/kvm/vmid.c b/arch/riscv/kvm/vmid.c
index ddc98714ce8e..3b426c800480 100644
--- a/arch/riscv/kvm/vmid.c
+++ b/arch/riscv/kvm/vmid.c
@@ -14,6 +14,8 @@
#include <linux/smp.h>
#include <linux/kvm_host.h>
#include <asm/csr.h>
+#include <asm/kvm_tlb.h>
+#include <asm/kvm_vmid.h>
static unsigned long vmid_version = 1;
static unsigned long vmid_next;
@@ -122,3 +124,26 @@ void kvm_riscv_gstage_vmid_update(struct kvm_vcpu *vcpu)
kvm_for_each_vcpu(i, v, vcpu->kvm)
kvm_make_request(KVM_REQ_UPDATE_HGATP, v);
}
+
+void kvm_riscv_gstage_vmid_sanitize(struct kvm_vcpu *vcpu)
+{
+ unsigned long vmid;
+
+ if (!kvm_riscv_gstage_vmid_bits() ||
+ vcpu->arch.last_exit_cpu == vcpu->cpu)
+ return;
+
+ /*
+ * On RISC-V platforms with hardware VMID support, we share same
+ * VMID for all VCPUs of a particular Guest/VM. This means we might
+ * have stale G-stage TLB entries on the current Host CPU due to
+ * some other VCPU of the same Guest which ran previously on the
+ * current Host CPU.
+ *
+ * To cleanup stale TLB entries, we simply flush all G-stage TLB
+ * entries by VMID whenever underlying Host CPU changes for a VCPU.
+ */
+
+ vmid = READ_ONCE(vcpu->kvm->arch.vmid.vmid);
+ kvm_riscv_local_hfence_gvma_vmid_all(vmid);
+}
diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c
index 0194324a0c50..04ed6f8acae4 100644
--- a/arch/riscv/mm/fault.c
+++ b/arch/riscv/mm/fault.c
@@ -20,6 +20,9 @@
#include <asm/ptrace.h>
#include <asm/tlbflush.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/exceptions.h>
+
#include "../kernel/head.h"
static void show_pte(unsigned long addr)
@@ -291,6 +294,11 @@ void handle_page_fault(struct pt_regs *regs)
if (kprobe_page_fault(regs, cause))
return;
+ if (user_mode(regs))
+ trace_page_fault_user(addr, regs, cause);
+ else
+ trace_page_fault_kernel(addr, regs, cause);
+
/*
* Fault-in kernel-space virtual memory on-demand.
* The 'reference' page table is init_mm.pgd.
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 25a773e6596e..afc08b538f99 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -198,7 +198,6 @@ config S390
select HAVE_GUP_FAST
select HAVE_FENTRY
select HAVE_FTRACE_GRAPH_FUNC
- select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_ARG_ACCESS_API
select HAVE_FUNCTION_ERROR_INJECTION
select HAVE_FUNCTION_GRAPH_FREGS
diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c
index dd7ba7587dd5..ad2b0baa527c 100644
--- a/arch/s390/appldata/appldata_base.c
+++ b/arch/s390/appldata/appldata_base.c
@@ -12,6 +12,7 @@
#define KMSG_COMPONENT "appldata"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#include <linux/export.h>
#include <linux/module.h>
#include <linux/sched/stat.h>
#include <linux/init.h>
diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile
index bee49626be4b..02f2cf082748 100644
--- a/arch/s390/boot/Makefile
+++ b/arch/s390/boot/Makefile
@@ -19,15 +19,15 @@ CC_FLAGS_MARCH_MINIMUM := -march=z10
KBUILD_AFLAGS := $(filter-out $(CC_FLAGS_MARCH),$(KBUILD_AFLAGS_DECOMPRESSOR))
KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_MARCH),$(KBUILD_CFLAGS_DECOMPRESSOR))
-KBUILD_AFLAGS += $(CC_FLAGS_MARCH_MINIMUM)
-KBUILD_CFLAGS += $(CC_FLAGS_MARCH_MINIMUM)
+KBUILD_AFLAGS += $(CC_FLAGS_MARCH_MINIMUM) -D__DISABLE_EXPORTS
+KBUILD_CFLAGS += $(CC_FLAGS_MARCH_MINIMUM) -D__DISABLE_EXPORTS
CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char
obj-y := head.o als.o startup.o physmem_info.o ipl_parm.o ipl_report.o vmem.o
obj-y += string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o
obj-y += version.o pgm_check.o ctype.o ipl_data.o relocs.o alternative.o
-obj-y += uv.o printk.o
+obj-y += uv.o printk.o trampoline.o
obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
obj-y += $(if $(CONFIG_KERNEL_UNCOMPRESSED),,decompressor.o) info.o
obj-$(CONFIG_KERNEL_ZSTD) += clz_ctz.o
diff --git a/arch/s390/boot/als.c b/arch/s390/boot/als.c
index 79afb5fa7f1f..25a20986b96e 100644
--- a/arch/s390/boot/als.c
+++ b/arch/s390/boot/als.c
@@ -65,7 +65,7 @@ static void facility_mismatch(void)
boot_emerg("The Linux kernel requires more recent processor hardware\n");
boot_emerg("Detected machine-type number: %4x\n", id.machine);
print_missing_facilities();
- boot_emerg("See Principles of Operations for facility bits\n");
+ boot_emerg("See z/Architecture Principles of Operation - Facility Indications\n");
disabled_wait();
}
diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h
index e045cae6e80a..c0152db285f0 100644
--- a/arch/s390/boot/boot.h
+++ b/arch/s390/boot/boot.h
@@ -6,7 +6,7 @@
#define IPL_START 0x200
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/printk.h>
#include <asm/physmem_info.h>
@@ -74,6 +74,7 @@ void print_stacktrace(unsigned long sp);
void error(char *m);
int get_random(unsigned long limit, unsigned long *value);
void boot_rb_dump(void);
+void __noreturn jump_to_kernel(psw_t *psw);
#ifndef boot_fmt
#define boot_fmt(fmt) fmt
@@ -121,5 +122,5 @@ static inline bool intersects(unsigned long addr0, unsigned long size0,
{
return addr0 + size0 > addr1 && addr1 + size1 > addr0;
}
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* BOOT_BOOT_H */
diff --git a/arch/s390/boot/ipl_data.c b/arch/s390/boot/ipl_data.c
index 0846e2b249c6..c4130a80b058 100644
--- a/arch/s390/boot/ipl_data.c
+++ b/arch/s390/boot/ipl_data.c
@@ -16,7 +16,9 @@ struct ipl_lowcore {
struct ccw0 ccwpgm[2]; /* 0x0008 */
u8 fill[56]; /* 0x0018 */
struct ccw0 ccwpgmcc[20]; /* 0x0050 */
- u8 pad_0xf0[0x01a0-0x00f0]; /* 0x00f0 */
+ u8 pad_0xf0[0x0140-0x00f0]; /* 0x00f0 */
+ psw_t svc_old_psw; /* 0x0140 */
+ u8 pad_0x150[0x01a0-0x0150]; /* 0x0150 */
psw_t restart_psw; /* 0x01a0 */
psw_t external_new_psw; /* 0x01b0 */
psw_t svc_new_psw; /* 0x01c0 */
@@ -75,6 +77,11 @@ static struct ipl_lowcore ipl_lowcore __used __section(".ipldata") = {
[18] = CCW0(CCW_CMD_READ_IPL, 0x690, 0x50, CCW_FLAG_SLI | CCW_FLAG_CC),
[19] = CCW0(CCW_CMD_READ_IPL, 0x6e0, 0x50, CCW_FLAG_SLI),
},
+ /*
+ * Let the GDB's lx-symbols command find the jump_to_kernel symbol
+ * without having to load decompressor symbols.
+ */
+ .svc_old_psw = { .mask = 0, .addr = (unsigned long)jump_to_kernel },
.restart_psw = { .mask = 0, .addr = IPL_START, },
.external_new_psw = { .mask = PSW_MASK_DISABLED, .addr = __LC_EXT_NEW_PSW, },
.svc_new_psw = { .mask = PSW_MASK_DISABLED, .addr = __LC_SVC_NEW_PSW, },
diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c
index da8337e63a3e..305e6c791071 100644
--- a/arch/s390/boot/startup.c
+++ b/arch/s390/boot/startup.c
@@ -642,5 +642,5 @@ void startup_kernel(void)
psw.addr = __kaslr_offset + vmlinux.entry;
psw.mask = PSW_KERNEL_BITS;
boot_debug("Starting kernel at: 0x%016lx\n", psw.addr);
- __load_psw(psw);
+ jump_to_kernel(&psw);
}
diff --git a/arch/s390/boot/trampoline.S b/arch/s390/boot/trampoline.S
new file mode 100644
index 000000000000..1cb5adf005ea
--- /dev/null
+++ b/arch/s390/boot/trampoline.S
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/linkage.h>
+
+# This function is identical to __load_psw(), but the lx-symbols GDB command
+# puts a breakpoint on it, so it needs to be kept separate.
+SYM_CODE_START(jump_to_kernel)
+ lpswe 0(%r2)
+SYM_CODE_END(jump_to_kernel)
diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig
index a7db7ed28720..a8e84e80552d 100644
--- a/arch/s390/configs/debug_defconfig
+++ b/arch/s390/configs/debug_defconfig
@@ -248,7 +248,6 @@ CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
CONFIG_NETFILTER_XT_MATCH_CPU=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m
CONFIG_NETFILTER_XT_MATCH_DSCP=m
CONFIG_NETFILTER_XT_MATCH_ESP=m
diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig
index 0217ed5616bf..a209b6ae4791 100644
--- a/arch/s390/configs/defconfig
+++ b/arch/s390/configs/defconfig
@@ -239,7 +239,6 @@ CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
CONFIG_NETFILTER_XT_MATCH_CPU=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m
CONFIG_NETFILTER_XT_MATCH_DSCP=m
CONFIG_NETFILTER_XT_MATCH_ESP=m
diff --git a/arch/s390/crypto/arch_random.c b/arch/s390/crypto/arch_random.c
index a8a2407381af..083e8d5eada2 100644
--- a/arch/s390/crypto/arch_random.c
+++ b/arch/s390/crypto/arch_random.c
@@ -6,6 +6,7 @@
* Author(s): Harald Freudenberger
*/
+#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/atomic.h>
#include <linux/random.h>
diff --git a/arch/s390/crypto/sha_common.c b/arch/s390/crypto/sha_common.c
index b5e2c365ea05..d6f839618794 100644
--- a/arch/s390/crypto/sha_common.c
+++ b/arch/s390/crypto/sha_common.c
@@ -9,6 +9,7 @@
*/
#include <crypto/internal/hash.h>
+#include <linux/export.h>
#include <linux/module.h>
#include <asm/cpacf.h>
#include "sha.h"
diff --git a/arch/s390/include/asm/alternative.h b/arch/s390/include/asm/alternative.h
index c7bf60a541e9..1c56480def9e 100644
--- a/arch/s390/include/asm/alternative.h
+++ b/arch/s390/include/asm/alternative.h
@@ -51,7 +51,7 @@
ALT_TYPE_SPEC << ALT_TYPE_SHIFT | \
(facility) << ALT_DATA_SHIFT)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
#include <linux/stddef.h>
@@ -183,7 +183,7 @@ static inline void apply_alternatives(struct alt_instr *start, struct alt_instr
/* Use this macro if clobbers are needed without inputs. */
#define ASM_NO_INPUT_CLOBBER(clobber...) : clobber
-#else /* __ASSEMBLY__ */
+#else /* __ASSEMBLER__ */
/*
* Issue one struct alt_instr descriptor entry (need to put it into
@@ -233,6 +233,6 @@ static inline void apply_alternatives(struct alt_instr *start, struct alt_instr
.popsection
.endm
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_S390_ALTERNATIVE_H */
diff --git a/arch/s390/include/asm/asm-const.h b/arch/s390/include/asm/asm-const.h
index 11f615eb0066..1cfffad9eea0 100644
--- a/arch/s390/include/asm/asm-const.h
+++ b/arch/s390/include/asm/asm-const.h
@@ -2,7 +2,7 @@
#ifndef _ASM_S390_ASM_CONST_H
#define _ASM_S390_ASM_CONST_H
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
# define stringify_in_c(...) __VA_ARGS__
#else
/* This version of stringify will deal with commas... */
diff --git a/arch/s390/include/asm/cpu.h b/arch/s390/include/asm/cpu.h
index 26c710cd3485..5672e3fab52b 100644
--- a/arch/s390/include/asm/cpu.h
+++ b/arch/s390/include/asm/cpu.h
@@ -9,7 +9,7 @@
#ifndef _ASM_S390_CPU_H
#define _ASM_S390_CPU_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
#include <linux/jump_label.h>
@@ -24,5 +24,5 @@ struct cpuid
DECLARE_STATIC_KEY_FALSE(cpu_has_bear);
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_S390_CPU_H */
diff --git a/arch/s390/include/asm/cpu_mf-insn.h b/arch/s390/include/asm/cpu_mf-insn.h
index a68b362e0964..941663939cc7 100644
--- a/arch/s390/include/asm/cpu_mf-insn.h
+++ b/arch/s390/include/asm/cpu_mf-insn.h
@@ -8,7 +8,7 @@
#ifndef _ASM_S390_CPU_MF_INSN_H
#define _ASM_S390_CPU_MF_INSN_H
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
/* Macro to generate the STCCTM instruction with a customized
* M3 field designating the counter set.
@@ -17,6 +17,6 @@
.insn rsy,0xeb0000000017,\r1,\m3 & 0xf,\db2
.endm
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif
diff --git a/arch/s390/include/asm/ctlreg.h b/arch/s390/include/asm/ctlreg.h
index e6527f51ad0b..e93cc240a1ed 100644
--- a/arch/s390/include/asm/ctlreg.h
+++ b/arch/s390/include/asm/ctlreg.h
@@ -80,7 +80,7 @@
#define CR14_EXTERNAL_DAMAGE_SUBMASK BIT(CR14_EXTERNAL_DAMAGE_SUBMASK_BIT)
#define CR14_WARNING_SUBMASK BIT(CR14_WARNING_SUBMASK_BIT)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/bug.h>
@@ -252,5 +252,5 @@ union ctlreg15 {
};
};
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* __ASM_S390_CTLREG_H */
diff --git a/arch/s390/include/asm/dwarf.h b/arch/s390/include/asm/dwarf.h
index 390906b8e386..e3ad6798d0cd 100644
--- a/arch/s390/include/asm/dwarf.h
+++ b/arch/s390/include/asm/dwarf.h
@@ -2,7 +2,7 @@
#ifndef _ASM_S390_DWARF_H
#define _ASM_S390_DWARF_H
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#define CFI_STARTPROC .cfi_startproc
#define CFI_ENDPROC .cfi_endproc
@@ -33,6 +33,6 @@
.cfi_sections .eh_frame, .debug_frame
#endif
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_S390_DWARF_H */
diff --git a/arch/s390/include/asm/entry-common.h b/arch/s390/include/asm/entry-common.h
index 35555c944630..979af986a8fe 100644
--- a/arch/s390/include/asm/entry-common.h
+++ b/arch/s390/include/asm/entry-common.h
@@ -59,4 +59,14 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
#define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare
+static __always_inline bool arch_in_rcu_eqs(void)
+{
+ if (IS_ENABLED(CONFIG_KVM))
+ return current->flags & PF_VCPU;
+
+ return false;
+}
+
+#define arch_in_rcu_eqs arch_in_rcu_eqs
+
#endif
diff --git a/arch/s390/include/asm/extmem.h b/arch/s390/include/asm/extmem.h
index e0a06060afdd..225ee89c3f5e 100644
--- a/arch/s390/include/asm/extmem.h
+++ b/arch/s390/include/asm/extmem.h
@@ -6,7 +6,7 @@
#ifndef _ASM_S390X_DCSS_H
#define _ASM_S390X_DCSS_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/*
* DCSS segment is defined as a contiguous range of pages using DEFSEG command.
diff --git a/arch/s390/include/asm/fpu-insn-asm.h b/arch/s390/include/asm/fpu-insn-asm.h
index d296322be4bc..cc0468fdf2d0 100644
--- a/arch/s390/include/asm/fpu-insn-asm.h
+++ b/arch/s390/include/asm/fpu-insn-asm.h
@@ -16,7 +16,7 @@
#error only <asm/fpu-insn.h> can be included directly
#endif
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
/* Macros to generate vector instruction byte code */
@@ -750,5 +750,5 @@
MRXBOPC 0, 0x77, v1, v2, v3
.endm
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* __ASM_S390_FPU_INSN_ASM_H */
diff --git a/arch/s390/include/asm/fpu-insn.h b/arch/s390/include/asm/fpu-insn.h
index f668bffd6dd3..135bb89c0a89 100644
--- a/arch/s390/include/asm/fpu-insn.h
+++ b/arch/s390/include/asm/fpu-insn.h
@@ -9,7 +9,7 @@
#include <asm/fpu-insn-asm.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/instrumented.h>
#include <asm/asm-extable.h>
@@ -475,5 +475,5 @@ static __always_inline void fpu_vzero(u8 v)
: "memory");
}
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* __ASM_S390_FPU_INSN_H */
diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h
index 185331e91f83..bee2d16c2951 100644
--- a/arch/s390/include/asm/ftrace.h
+++ b/arch/s390/include/asm/ftrace.h
@@ -5,7 +5,7 @@
#define ARCH_SUPPORTS_FTRACE_OPS 1
#define MCOUNT_INSN_SIZE 6
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/stacktrace.h>
static __always_inline unsigned long return_address(unsigned int n)
@@ -134,7 +134,7 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct ftrace_regs *fregs);
#define ftrace_graph_func ftrace_graph_func
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#ifdef CONFIG_FUNCTION_TRACER
diff --git a/arch/s390/include/asm/irq.h b/arch/s390/include/asm/irq.h
index bde6a496df5f..697497e7d13e 100644
--- a/arch/s390/include/asm/irq.h
+++ b/arch/s390/include/asm/irq.h
@@ -25,7 +25,7 @@
#define EXT_IRQ_CP_SERVICE 0x2603
#define EXT_IRQ_IUCV 0x4000
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/hardirq.h>
#include <linux/percpu.h>
@@ -120,6 +120,6 @@ void irq_subclass_unregister(enum irq_subclass subclass);
#define irq_canonicalize(irq) (irq)
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_IRQ_H */
diff --git a/arch/s390/include/asm/jump_label.h b/arch/s390/include/asm/jump_label.h
index bf78cf381dfc..d9cbc18f6b2e 100644
--- a/arch/s390/include/asm/jump_label.h
+++ b/arch/s390/include/asm/jump_label.h
@@ -4,7 +4,7 @@
#define HAVE_JUMP_LABEL_BATCH
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
#include <linux/stringify.h>
@@ -51,5 +51,5 @@ label:
return true;
}
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index cb89e54ada25..f870d09515cc 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -716,6 +716,9 @@ extern char sie_exit;
bool kvm_s390_pv_is_protected(struct kvm *kvm);
bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu);
+extern int kvm_s390_enter_exit_sie(struct kvm_s390_sie_block *scb,
+ u64 *gprs, unsigned long gasce);
+
extern int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc);
extern int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc);
diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h
index e99e9c87b1ce..d9c853db9a40 100644
--- a/arch/s390/include/asm/lowcore.h
+++ b/arch/s390/include/asm/lowcore.h
@@ -22,7 +22,7 @@
#define LOWCORE_ALT_ADDRESS _AC(0x70000, UL)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct pgm_tdb {
u64 data[32];
@@ -237,7 +237,7 @@ static inline void set_prefix(__u32 address)
asm volatile("spx %0" : : "Q" (address) : "memory");
}
-#else /* __ASSEMBLY__ */
+#else /* __ASSEMBLER__ */
.macro GET_LC reg
ALTERNATIVE "lghi \reg,0", \
@@ -251,5 +251,5 @@ static inline void set_prefix(__u32 address)
ALT_FEATURE(MFEATURE_LOWCORE)
.endm
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_S390_LOWCORE_H */
diff --git a/arch/s390/include/asm/machine.h b/arch/s390/include/asm/machine.h
index 8abe5afdbfc4..9bd4a9dc7778 100644
--- a/arch/s390/include/asm/machine.h
+++ b/arch/s390/include/asm/machine.h
@@ -20,7 +20,7 @@
#define MFEATURE_LPAR 9
#define MFEATURE_DIAG288 10
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/bitops.h>
#include <asm/alternative.h>
@@ -100,5 +100,5 @@ DEFINE_MACHINE_HAS_FEATURE(lpar, MFEATURE_LPAR)
#define machine_is_kvm machine_has_kvm
#define machine_is_lpar machine_has_lpar
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* __ASM_S390_MACHINE_H */
diff --git a/arch/s390/include/asm/mem_encrypt.h b/arch/s390/include/asm/mem_encrypt.h
index b85e13505a0f..28c83ec1f243 100644
--- a/arch/s390/include/asm/mem_encrypt.h
+++ b/arch/s390/include/asm/mem_encrypt.h
@@ -2,11 +2,11 @@
#ifndef S390_MEM_ENCRYPT_H__
#define S390_MEM_ENCRYPT_H__
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
int set_memory_encrypted(unsigned long vaddr, int numpages);
int set_memory_decrypted(unsigned long vaddr, int numpages);
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* S390_MEM_ENCRYPT_H__ */
diff --git a/arch/s390/include/asm/nmi.h b/arch/s390/include/asm/nmi.h
index 227466ce9e41..6454c1531854 100644
--- a/arch/s390/include/asm/nmi.h
+++ b/arch/s390/include/asm/nmi.h
@@ -33,7 +33,7 @@
#define MCCK_CODE_FC_VALID BIT(63 - 43)
#define MCCK_CODE_CPU_TIMER_VALID BIT(63 - 46)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
union mci {
unsigned long val;
@@ -104,5 +104,5 @@ void nmi_free_mcesa(u64 *mcesad);
void s390_handle_mcck(void);
void s390_do_machine_check(struct pt_regs *regs);
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_S390_NMI_H */
diff --git a/arch/s390/include/asm/nospec-branch.h b/arch/s390/include/asm/nospec-branch.h
index c7c96282f011..81c4813cff18 100644
--- a/arch/s390/include/asm/nospec-branch.h
+++ b/arch/s390/include/asm/nospec-branch.h
@@ -2,7 +2,7 @@
#ifndef _ASM_S390_EXPOLINE_H
#define _ASM_S390_EXPOLINE_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
#include <asm/facility.h>
@@ -42,6 +42,6 @@ void __s390_indirect_jump_r13(void);
void __s390_indirect_jump_r14(void);
void __s390_indirect_jump_r15(void);
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_S390_EXPOLINE_H */
diff --git a/arch/s390/include/asm/nospec-insn.h b/arch/s390/include/asm/nospec-insn.h
index cb15dd25bf21..6ce6b56e282b 100644
--- a/arch/s390/include/asm/nospec-insn.h
+++ b/arch/s390/include/asm/nospec-insn.h
@@ -3,9 +3,10 @@
#define _ASM_S390_NOSPEC_ASM_H
#include <linux/linkage.h>
+#include <linux/export.h>
#include <asm/dwarf.h>
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#ifdef CC_USING_EXPOLINE
@@ -128,6 +129,6 @@
.endm
#endif /* CC_USING_EXPOLINE */
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_S390_NOSPEC_ASM_H */
diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
index 4e5dbabdf202..9240a363c893 100644
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -33,7 +33,7 @@
#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
#include <asm/setup.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
void __storage_key_init_range(unsigned long start, unsigned long end);
@@ -130,11 +130,19 @@ typedef pte_t *pgtable_t;
static inline void page_set_storage_key(unsigned long addr,
unsigned char skey, int mapped)
{
- if (!mapped)
- asm volatile(".insn rrf,0xb22b0000,%0,%1,8,0"
- : : "d" (skey), "a" (addr));
- else
- asm volatile("sske %0,%1" : : "d" (skey), "a" (addr));
+ if (!mapped) {
+ asm volatile(
+ " .insn rrf,0xb22b0000,%[skey],%[addr],8,0"
+ :
+ : [skey] "d" (skey), [addr] "a" (addr)
+ : "memory");
+ } else {
+ asm volatile(
+ " sske %[skey],%[addr]"
+ :
+ : [skey] "d" (skey), [addr] "a" (addr)
+ : "memory");
+ }
}
static inline unsigned char page_get_storage_key(unsigned long addr)
@@ -274,7 +282,7 @@ static inline unsigned long virt_to_pfn(const void *kaddr)
#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#include <asm-generic/memory_model.h>
#include <asm-generic/getorder.h>
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index 6c8063cb8fe7..6a9c08b80eda 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -26,7 +26,7 @@
#define RESTART_FLAG_CTLREGS _AC(1 << 0, U)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/cpumask.h>
#include <linux/linkage.h>
@@ -418,6 +418,6 @@ static __always_inline void bpon(void)
);
}
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* __ASM_S390_PROCESSOR_H */
diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h
index 0905fa99a31e..dfa770b15fad 100644
--- a/arch/s390/include/asm/ptrace.h
+++ b/arch/s390/include/asm/ptrace.h
@@ -54,7 +54,7 @@
PSW_DEFAULT_KEY | PSW_MASK_BASE | PSW_MASK_MCHECK | \
PSW_MASK_PSTATE | PSW_ASC_PRIMARY)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct psw_bits {
unsigned long : 1;
@@ -292,5 +292,5 @@ static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
regs->gprs[2] = rc;
}
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _S390_PTRACE_H */
diff --git a/arch/s390/include/asm/purgatory.h b/arch/s390/include/asm/purgatory.h
index e297bcfc476f..4c7a43bc43a1 100644
--- a/arch/s390/include/asm/purgatory.h
+++ b/arch/s390/include/asm/purgatory.h
@@ -7,11 +7,11 @@
#ifndef _S390_PURGATORY_H_
#define _S390_PURGATORY_H_
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/purgatory.h>
int verify_sha256_digest(void);
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _S390_PURGATORY_H_ */
diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h
index 1e62919bacf4..0f184dbdbe5e 100644
--- a/arch/s390/include/asm/sclp.h
+++ b/arch/s390/include/asm/sclp.h
@@ -21,7 +21,7 @@
#define SCLP_ERRNOTIFY_AQ_INFO_LOG 2
#define SCLP_ERRNOTIFY_AQ_OPTICS_DATA 3
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/uio.h>
#include <asm/chpid.h>
#include <asm/cpu.h>
@@ -199,5 +199,5 @@ static inline int sclp_get_core_info(struct sclp_core_info *info, int early)
return _sclp_get_core_info(info);
}
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_S390_SCLP_H */
diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h
index 031e881b4d88..7c57ac968bf6 100644
--- a/arch/s390/include/asm/setup.h
+++ b/arch/s390/include/asm/setup.h
@@ -24,7 +24,7 @@
#define LEGACY_COMMAND_LINE_SIZE 896
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/lowcore.h>
#include <asm/types.h>
@@ -41,6 +41,8 @@ struct parmarea {
char command_line[COMMAND_LINE_SIZE]; /* 0x10480 */
};
+extern char arch_hw_string[128];
+
extern struct parmarea parmarea;
extern unsigned int zlib_dfltcc_support;
@@ -100,5 +102,5 @@ static __always_inline u32 gen_lpswe(unsigned long addr)
BUILD_BUG_ON(addr > 0xfff);
return 0xb2b20000 | addr;
}
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_S390_SETUP_H */
diff --git a/arch/s390/include/asm/sigp.h b/arch/s390/include/asm/sigp.h
index 472943b77066..97d77868f83c 100644
--- a/arch/s390/include/asm/sigp.h
+++ b/arch/s390/include/asm/sigp.h
@@ -36,7 +36,7 @@
#define SIGP_STATUS_INCORRECT_STATE 0x00000200UL
#define SIGP_STATUS_NOT_RUNNING 0x00000400UL
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/asm.h>
@@ -68,6 +68,6 @@ static inline int __pcpu_sigp(u16 addr, u8 order, unsigned long parm,
return cc;
}
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* __S390_ASM_SIGP_H */
diff --git a/arch/s390/include/asm/skey.h b/arch/s390/include/asm/skey.h
new file mode 100644
index 000000000000..84e7cf28b712
--- /dev/null
+++ b/arch/s390/include/asm/skey.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_SKEY_H
+#define __ASM_SKEY_H
+
+#include <asm/rwonce.h>
+
+struct skey_region {
+ unsigned long start;
+ unsigned long end;
+};
+
+#define SKEY_REGION(_start, _end) \
+ stringify_in_c(.section .skey_region,"a";) \
+ stringify_in_c(.balign 8;) \
+ stringify_in_c(.quad (_start);) \
+ stringify_in_c(.quad (_end);) \
+ stringify_in_c(.previous)
+
+extern int skey_regions_initialized;
+extern struct skey_region __skey_region_start[];
+extern struct skey_region __skey_region_end[];
+
+void __skey_regions_initialize(void);
+
+static inline void skey_regions_initialize(void)
+{
+ if (READ_ONCE(skey_regions_initialized))
+ return;
+ __skey_regions_initialize();
+}
+
+#endif /* __ASM_SKEY_H */
diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h
index 391eb04d26d8..f6ed2c8192c8 100644
--- a/arch/s390/include/asm/thread_info.h
+++ b/arch/s390/include/asm/thread_info.h
@@ -24,7 +24,7 @@
#define STACK_INIT_OFFSET (THREAD_SIZE - STACK_FRAME_OVERHEAD - __PT_SIZE)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/*
* low level task data that entry.S needs immediate access to
diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h
index bed8d0b5a282..59dfb8780f62 100644
--- a/arch/s390/include/asm/timex.h
+++ b/arch/s390/include/asm/timex.h
@@ -196,13 +196,6 @@ static inline unsigned long get_tod_clock_fast(void)
asm volatile("stckf %0" : "=Q" (clk) : : "cc");
return clk;
}
-
-static inline cycles_t get_cycles(void)
-{
- return (cycles_t) get_tod_clock() >> 2;
-}
-#define get_cycles get_cycles
-
int get_phys_clock(unsigned long *clock);
void init_cpu_timer(void);
@@ -230,6 +223,12 @@ static inline unsigned long get_tod_clock_monotonic(void)
return tod;
}
+static inline cycles_t get_cycles(void)
+{
+ return (cycles_t)get_tod_clock_monotonic() >> 2;
+}
+#define get_cycles get_cycles
+
/**
* tod_to_ns - convert a TOD format value to nanoseconds
* @todval: to be converted TOD format value
diff --git a/arch/s390/include/asm/tpi.h b/arch/s390/include/asm/tpi.h
index f76e5fdff23a..71c8b6f76cdd 100644
--- a/arch/s390/include/asm/tpi.h
+++ b/arch/s390/include/asm/tpi.h
@@ -5,7 +5,7 @@
#include <linux/types.h>
#include <uapi/asm/schid.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/* I/O-Interruption Code as stored by TEST PENDING INTERRUPTION (TPI). */
struct tpi_info {
@@ -32,6 +32,6 @@ struct tpi_adapter_info {
u32 :27;
} __packed __aligned(4);
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_S390_TPI_H */
diff --git a/arch/s390/include/asm/types.h b/arch/s390/include/asm/types.h
index 0b5d550a0478..53695b2196f7 100644
--- a/arch/s390/include/asm/types.h
+++ b/arch/s390/include/asm/types.h
@@ -5,7 +5,7 @@
#include <uapi/asm/types.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
union register_pair {
unsigned __int128 pair;
@@ -15,5 +15,5 @@ union register_pair {
};
};
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_S390_TYPES_H */
diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h
index a43fc88c0050..3e5b8b677057 100644
--- a/arch/s390/include/asm/uaccess.h
+++ b/arch/s390/include/asm/uaccess.h
@@ -473,188 +473,30 @@ do { \
void __cmpxchg_user_key_called_with_bad_pointer(void);
-#define CMPXCHG_USER_KEY_MAX_LOOPS 128
-
-static __always_inline int __cmpxchg_user_key(unsigned long address, void *uval,
- __uint128_t old, __uint128_t new,
- unsigned long key, int size)
+int __cmpxchg_user_key1(unsigned long address, unsigned char *uval,
+ unsigned char old, unsigned char new, unsigned long key);
+int __cmpxchg_user_key2(unsigned long address, unsigned short *uval,
+ unsigned short old, unsigned short new, unsigned long key);
+int __cmpxchg_user_key4(unsigned long address, unsigned int *uval,
+ unsigned int old, unsigned int new, unsigned long key);
+int __cmpxchg_user_key8(unsigned long address, unsigned long *uval,
+ unsigned long old, unsigned long new, unsigned long key);
+int __cmpxchg_user_key16(unsigned long address, __uint128_t *uval,
+ __uint128_t old, __uint128_t new, unsigned long key);
+
+static __always_inline int _cmpxchg_user_key(unsigned long address, void *uval,
+ __uint128_t old, __uint128_t new,
+ unsigned long key, int size)
{
- bool sacf_flag;
- int rc = 0;
-
switch (size) {
- case 1: {
- unsigned int prev, shift, mask, _old, _new;
- unsigned long count;
-
- shift = (3 ^ (address & 3)) << 3;
- address ^= address & 3;
- _old = ((unsigned int)old & 0xff) << shift;
- _new = ((unsigned int)new & 0xff) << shift;
- mask = ~(0xff << shift);
- sacf_flag = enable_sacf_uaccess();
- asm_inline volatile(
- " spka 0(%[key])\n"
- " sacf 256\n"
- " llill %[count],%[max_loops]\n"
- "0: l %[prev],%[address]\n"
- "1: nr %[prev],%[mask]\n"
- " xilf %[mask],0xffffffff\n"
- " or %[new],%[prev]\n"
- " or %[prev],%[tmp]\n"
- "2: lr %[tmp],%[prev]\n"
- "3: cs %[prev],%[new],%[address]\n"
- "4: jnl 5f\n"
- " xr %[tmp],%[prev]\n"
- " xr %[new],%[tmp]\n"
- " nr %[tmp],%[mask]\n"
- " jnz 5f\n"
- " brct %[count],2b\n"
- "5: sacf 768\n"
- " spka %[default_key]\n"
- EX_TABLE_UA_LOAD_REG(0b, 5b, %[rc], %[prev])
- EX_TABLE_UA_LOAD_REG(1b, 5b, %[rc], %[prev])
- EX_TABLE_UA_LOAD_REG(3b, 5b, %[rc], %[prev])
- EX_TABLE_UA_LOAD_REG(4b, 5b, %[rc], %[prev])
- : [rc] "+&d" (rc),
- [prev] "=&d" (prev),
- [address] "+Q" (*(int *)address),
- [tmp] "+&d" (_old),
- [new] "+&d" (_new),
- [mask] "+&d" (mask),
- [count] "=a" (count)
- : [key] "%[count]" (key << 4),
- [default_key] "J" (PAGE_DEFAULT_KEY),
- [max_loops] "J" (CMPXCHG_USER_KEY_MAX_LOOPS)
- : "memory", "cc");
- disable_sacf_uaccess(sacf_flag);
- *(unsigned char *)uval = prev >> shift;
- if (!count)
- rc = -EAGAIN;
- return rc;
- }
- case 2: {
- unsigned int prev, shift, mask, _old, _new;
- unsigned long count;
-
- shift = (2 ^ (address & 2)) << 3;
- address ^= address & 2;
- _old = ((unsigned int)old & 0xffff) << shift;
- _new = ((unsigned int)new & 0xffff) << shift;
- mask = ~(0xffff << shift);
- sacf_flag = enable_sacf_uaccess();
- asm_inline volatile(
- " spka 0(%[key])\n"
- " sacf 256\n"
- " llill %[count],%[max_loops]\n"
- "0: l %[prev],%[address]\n"
- "1: nr %[prev],%[mask]\n"
- " xilf %[mask],0xffffffff\n"
- " or %[new],%[prev]\n"
- " or %[prev],%[tmp]\n"
- "2: lr %[tmp],%[prev]\n"
- "3: cs %[prev],%[new],%[address]\n"
- "4: jnl 5f\n"
- " xr %[tmp],%[prev]\n"
- " xr %[new],%[tmp]\n"
- " nr %[tmp],%[mask]\n"
- " jnz 5f\n"
- " brct %[count],2b\n"
- "5: sacf 768\n"
- " spka %[default_key]\n"
- EX_TABLE_UA_LOAD_REG(0b, 5b, %[rc], %[prev])
- EX_TABLE_UA_LOAD_REG(1b, 5b, %[rc], %[prev])
- EX_TABLE_UA_LOAD_REG(3b, 5b, %[rc], %[prev])
- EX_TABLE_UA_LOAD_REG(4b, 5b, %[rc], %[prev])
- : [rc] "+&d" (rc),
- [prev] "=&d" (prev),
- [address] "+Q" (*(int *)address),
- [tmp] "+&d" (_old),
- [new] "+&d" (_new),
- [mask] "+&d" (mask),
- [count] "=a" (count)
- : [key] "%[count]" (key << 4),
- [default_key] "J" (PAGE_DEFAULT_KEY),
- [max_loops] "J" (CMPXCHG_USER_KEY_MAX_LOOPS)
- : "memory", "cc");
- disable_sacf_uaccess(sacf_flag);
- *(unsigned short *)uval = prev >> shift;
- if (!count)
- rc = -EAGAIN;
- return rc;
- }
- case 4: {
- unsigned int prev = old;
-
- sacf_flag = enable_sacf_uaccess();
- asm_inline volatile(
- " spka 0(%[key])\n"
- " sacf 256\n"
- "0: cs %[prev],%[new],%[address]\n"
- "1: sacf 768\n"
- " spka %[default_key]\n"
- EX_TABLE_UA_LOAD_REG(0b, 1b, %[rc], %[prev])
- EX_TABLE_UA_LOAD_REG(1b, 1b, %[rc], %[prev])
- : [rc] "+&d" (rc),
- [prev] "+&d" (prev),
- [address] "+Q" (*(int *)address)
- : [new] "d" ((unsigned int)new),
- [key] "a" (key << 4),
- [default_key] "J" (PAGE_DEFAULT_KEY)
- : "memory", "cc");
- disable_sacf_uaccess(sacf_flag);
- *(unsigned int *)uval = prev;
- return rc;
- }
- case 8: {
- unsigned long prev = old;
-
- sacf_flag = enable_sacf_uaccess();
- asm_inline volatile(
- " spka 0(%[key])\n"
- " sacf 256\n"
- "0: csg %[prev],%[new],%[address]\n"
- "1: sacf 768\n"
- " spka %[default_key]\n"
- EX_TABLE_UA_LOAD_REG(0b, 1b, %[rc], %[prev])
- EX_TABLE_UA_LOAD_REG(1b, 1b, %[rc], %[prev])
- : [rc] "+&d" (rc),
- [prev] "+&d" (prev),
- [address] "+QS" (*(long *)address)
- : [new] "d" ((unsigned long)new),
- [key] "a" (key << 4),
- [default_key] "J" (PAGE_DEFAULT_KEY)
- : "memory", "cc");
- disable_sacf_uaccess(sacf_flag);
- *(unsigned long *)uval = prev;
- return rc;
- }
- case 16: {
- __uint128_t prev = old;
-
- sacf_flag = enable_sacf_uaccess();
- asm_inline volatile(
- " spka 0(%[key])\n"
- " sacf 256\n"
- "0: cdsg %[prev],%[new],%[address]\n"
- "1: sacf 768\n"
- " spka %[default_key]\n"
- EX_TABLE_UA_LOAD_REGPAIR(0b, 1b, %[rc], %[prev])
- EX_TABLE_UA_LOAD_REGPAIR(1b, 1b, %[rc], %[prev])
- : [rc] "+&d" (rc),
- [prev] "+&d" (prev),
- [address] "+QS" (*(__int128_t *)address)
- : [new] "d" (new),
- [key] "a" (key << 4),
- [default_key] "J" (PAGE_DEFAULT_KEY)
- : "memory", "cc");
- disable_sacf_uaccess(sacf_flag);
- *(__uint128_t *)uval = prev;
- return rc;
- }
+ case 1: return __cmpxchg_user_key1(address, uval, old, new, key);
+ case 2: return __cmpxchg_user_key2(address, uval, old, new, key);
+ case 4: return __cmpxchg_user_key4(address, uval, old, new, key);
+ case 8: return __cmpxchg_user_key8(address, uval, old, new, key);
+ case 16: return __cmpxchg_user_key16(address, uval, old, new, key);
+ default: __cmpxchg_user_key_called_with_bad_pointer();
}
- __cmpxchg_user_key_called_with_bad_pointer();
- return rc;
+ return 0;
}
/**
@@ -686,8 +528,8 @@ static __always_inline int __cmpxchg_user_key(unsigned long address, void *uval,
BUILD_BUG_ON(sizeof(*(__ptr)) != sizeof(*(__uval))); \
might_fault(); \
__chk_user_ptr(__ptr); \
- __cmpxchg_user_key((unsigned long)(__ptr), (void *)(__uval), \
- (old), (new), (key), sizeof(*(__ptr))); \
+ _cmpxchg_user_key((unsigned long)(__ptr), (void *)(__uval), \
+ (old), (new), (key), sizeof(*(__ptr))); \
})
#endif /* __S390_UACCESS_H */
diff --git a/arch/s390/include/asm/vdso.h b/arch/s390/include/asm/vdso.h
index 420a073fdde5..8e2fffa0ca68 100644
--- a/arch/s390/include/asm/vdso.h
+++ b/arch/s390/include/asm/vdso.h
@@ -4,11 +4,11 @@
#include <vdso/datapage.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
int vdso_getcpu_init(void);
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#define __VDSO_PAGES 4
diff --git a/arch/s390/include/asm/vdso/getrandom.h b/arch/s390/include/asm/vdso/getrandom.h
index f8713ce39bb2..6741a27199f8 100644
--- a/arch/s390/include/asm/vdso/getrandom.h
+++ b/arch/s390/include/asm/vdso/getrandom.h
@@ -3,7 +3,7 @@
#ifndef __ASM_VDSO_GETRANDOM_H
#define __ASM_VDSO_GETRANDOM_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <vdso/datapage.h>
#include <asm/vdso/vsyscall.h>
@@ -23,6 +23,6 @@ static __always_inline ssize_t getrandom_syscall(void *buffer, size_t len, unsig
return syscall3(__NR_getrandom, (long)buffer, (long)len, (long)flags);
}
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* __ASM_VDSO_GETRANDOM_H */
diff --git a/arch/s390/include/asm/vdso/gettimeofday.h b/arch/s390/include/asm/vdso/gettimeofday.h
index fb4564308e9d..c31ac5f61c83 100644
--- a/arch/s390/include/asm/vdso/gettimeofday.h
+++ b/arch/s390/include/asm/vdso/gettimeofday.h
@@ -16,13 +16,7 @@
static inline u64 __arch_get_hw_counter(s32 clock_mode, const struct vdso_time_data *vd)
{
- u64 adj, now;
-
- now = get_tod_clock();
- adj = vd->arch_data.tod_steering_end - now;
- if (unlikely((s64) adj > 0))
- now += (vd->arch_data.tod_steering_delta < 0) ? (adj >> 15) : -(adj >> 15);
- return now;
+ return get_tod_clock() - vd->arch_data.tod_delta;
}
static __always_inline
diff --git a/arch/s390/include/asm/vdso/time_data.h b/arch/s390/include/asm/vdso/time_data.h
index 8a08752422e6..25c4e0d9f596 100644
--- a/arch/s390/include/asm/vdso/time_data.h
+++ b/arch/s390/include/asm/vdso/time_data.h
@@ -5,8 +5,7 @@
#include <linux/types.h>
struct arch_vdso_time_data {
- __s64 tod_steering_delta;
- __u64 tod_steering_end;
+ __s64 tod_delta;
};
#endif /* __S390_ASM_VDSO_TIME_DATA_H */
diff --git a/arch/s390/include/asm/vdso/vsyscall.h b/arch/s390/include/asm/vdso/vsyscall.h
index d346ebe51301..b00acec8ddbc 100644
--- a/arch/s390/include/asm/vdso/vsyscall.h
+++ b/arch/s390/include/asm/vdso/vsyscall.h
@@ -2,7 +2,7 @@
#ifndef __ASM_VDSO_VSYSCALL_H
#define __ASM_VDSO_VSYSCALL_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/hrtimer.h>
#include <vdso/datapage.h>
@@ -11,6 +11,6 @@
/* The asm-generic header needs to be included after the definitions above */
#include <asm-generic/vdso/vsyscall.h>
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* __ASM_VDSO_VSYSCALL_H */
diff --git a/arch/s390/include/uapi/asm/ptrace.h b/arch/s390/include/uapi/asm/ptrace.h
index bb0826024bb9..ea202072f1ad 100644
--- a/arch/s390/include/uapi/asm/ptrace.h
+++ b/arch/s390/include/uapi/asm/ptrace.h
@@ -242,7 +242,8 @@
#define PTRACE_OLDSETOPTIONS 21
#define PTRACE_SYSEMU 31
#define PTRACE_SYSEMU_SINGLESTEP 32
-#ifndef __ASSEMBLY__
+
+#ifndef __ASSEMBLER__
#include <linux/stddef.h>
#include <linux/types.h>
@@ -450,6 +451,6 @@ struct user_regs_struct {
unsigned long ieee_instruction_pointer; /* obsolete, always 0 */
};
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _UAPI_S390_PTRACE_H */
diff --git a/arch/s390/include/uapi/asm/schid.h b/arch/s390/include/uapi/asm/schid.h
index a3e1cf168553..d804d1a5b1b3 100644
--- a/arch/s390/include/uapi/asm/schid.h
+++ b/arch/s390/include/uapi/asm/schid.h
@@ -4,7 +4,7 @@
#include <linux/types.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct subchannel_id {
__u32 cssid : 8;
@@ -15,6 +15,6 @@ struct subchannel_id {
__u32 sch_no : 16;
} __attribute__ ((packed, aligned(4)));
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _UAPIASM_SCHID_H */
diff --git a/arch/s390/include/uapi/asm/types.h b/arch/s390/include/uapi/asm/types.h
index 84457dbb26b4..4ab468c5032e 100644
--- a/arch/s390/include/uapi/asm/types.h
+++ b/arch/s390/include/uapi/asm/types.h
@@ -10,7 +10,7 @@
#include <asm-generic/int-ll64.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
typedef unsigned long addr_t;
typedef __signed__ long saddr_t;
@@ -25,6 +25,6 @@ typedef struct {
};
} __attribute__((packed, aligned(4))) __vector128;
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _UAPI_S390_TYPES_H */
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index ea5ed6654050..eb06ff888314 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -41,7 +41,7 @@ obj-y += processor.o syscall.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o
obj-y += debug.o irq.o ipl.o dis.o vdso.o cpufeature.o
obj-y += sysinfo.o lgr.o os_info.o ctlreg.o
obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o
-obj-y += entry.o reipl.o kdebugfs.o alternative.o
+obj-y += entry.o reipl.o kdebugfs.o alternative.o skey.o
obj-y += nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o
obj-y += smp.o text_amode31.o stacktrace.o abs_lowcore.o facility.o uv.o wti.o
obj-y += diag/
diff --git a/arch/s390/kernel/cpufeature.c b/arch/s390/kernel/cpufeature.c
index 76210f001028..c9eef9ed876b 100644
--- a/arch/s390/kernel/cpufeature.c
+++ b/arch/s390/kernel/cpufeature.c
@@ -4,6 +4,7 @@
*/
#include <linux/cpufeature.h>
+#include <linux/export.h>
#include <linux/bug.h>
#include <asm/machine.h>
#include <asm/elf.h>
diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c
index adb164223f8c..d4839de8ce9d 100644
--- a/arch/s390/kernel/crash_dump.c
+++ b/arch/s390/kernel/crash_dump.c
@@ -7,6 +7,7 @@
*/
#include <linux/crash_dump.h>
+#include <linux/export.h>
#include <asm/lowcore.h>
#include <linux/kernel.h>
#include <linux/init.h>
diff --git a/arch/s390/kernel/ctlreg.c b/arch/s390/kernel/ctlreg.c
index 8cc26cf2c64a..a0501f4c7e7a 100644
--- a/arch/s390/kernel/ctlreg.c
+++ b/arch/s390/kernel/ctlreg.c
@@ -5,6 +5,7 @@
#include <linux/irqflags.h>
#include <linux/spinlock.h>
+#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/smp.h>
diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c
index 94eb8168ea44..63a1d4226ff8 100644
--- a/arch/s390/kernel/dis.c
+++ b/arch/s390/kernel/dis.c
@@ -17,7 +17,6 @@
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
-#include <linux/export.h>
#include <linux/kallsyms.h>
#include <linux/reboot.h>
#include <linux/kprobes.h>
diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c
index 54cf0923050f..9adfbdd377dc 100644
--- a/arch/s390/kernel/early.c
+++ b/arch/s390/kernel/early.c
@@ -105,6 +105,8 @@ static inline void strim_all(char *str)
}
}
+char arch_hw_string[128];
+
static noinline __init void setup_arch_string(void)
{
struct sysinfo_1_1_1 *mach = (struct sysinfo_1_1_1 *)&sysinfo_page;
@@ -131,6 +133,7 @@ static noinline __init void setup_arch_string(void)
machine_is_vm() ? "z/VM" :
machine_is_kvm() ? "KVM" : "unknown");
}
+ sprintf(arch_hw_string, "HW: %s (%s)", mstr, hvstr);
dump_stack_set_arch_desc("%s (%s)", mstr, hvstr);
}
@@ -154,6 +157,7 @@ void __init __do_early_pgm_check(struct pt_regs *regs)
regs->int_code = lc->pgm_int_code;
regs->int_parm_long = lc->trans_exc_code;
+ regs->last_break = lc->pgm_last_break;
ip = __rewind_psw(regs->psw, regs->int_code >> 16);
/* Monitor Event? Might be a warning */
diff --git a/arch/s390/kernel/facility.c b/arch/s390/kernel/facility.c
index f02127219a27..d028b0be5c1d 100644
--- a/arch/s390/kernel/facility.c
+++ b/arch/s390/kernel/facility.c
@@ -3,6 +3,7 @@
* Copyright IBM Corp. 2023
*/
+#include <linux/export.h>
#include <asm/facility.h>
unsigned int stfle_size(void)
diff --git a/arch/s390/kernel/fpu.c b/arch/s390/kernel/fpu.c
index 6f2e87920288..03a8973aec3c 100644
--- a/arch/s390/kernel/fpu.c
+++ b/arch/s390/kernel/fpu.c
@@ -5,6 +5,8 @@
* Copyright IBM Corp. 2015
* Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
*/
+
+#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/cpu.h>
#include <linux/sched.h>
diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c
index 3da371c144eb..11f33243a23f 100644
--- a/arch/s390/kernel/nmi.c
+++ b/arch/s390/kernel/nmi.c
@@ -9,6 +9,7 @@
*/
#include <linux/kernel_stat.h>
+#include <linux/utsname.h>
#include <linux/cpufeature.h>
#include <linux/init.h>
#include <linux/errno.h>
@@ -21,7 +22,6 @@
#include <linux/module.h>
#include <linux/sched/signal.h>
#include <linux/kvm_host.h>
-#include <linux/export.h>
#include <asm/lowcore.h>
#include <asm/ctlreg.h>
#include <asm/fpu.h>
@@ -116,18 +116,82 @@ static __always_inline char *u64_to_hex(char *dest, u64 val)
return dest;
}
+static notrace void nmi_print_info(void)
+{
+ struct lowcore *lc = get_lowcore();
+ char message[100];
+ char *ptr;
+ int i;
+
+ ptr = nmi_puts(message, "Unrecoverable machine check, code: ");
+ ptr = u64_to_hex(ptr, lc->mcck_interruption_code);
+ ptr = nmi_puts(ptr, "\n");
+ sclp_emergency_printk(message);
+
+ ptr = nmi_puts(message, init_utsname()->release);
+ ptr = nmi_puts(ptr, "\n");
+ sclp_emergency_printk(message);
+
+ ptr = nmi_puts(message, arch_hw_string);
+ ptr = nmi_puts(ptr, "\n");
+ sclp_emergency_printk(message);
+
+ ptr = nmi_puts(message, "PSW: ");
+ ptr = u64_to_hex(ptr, lc->mcck_old_psw.mask);
+ ptr = nmi_puts(ptr, " ");
+ ptr = u64_to_hex(ptr, lc->mcck_old_psw.addr);
+ ptr = nmi_puts(ptr, " PFX: ");
+ ptr = u64_to_hex(ptr, (u64)get_lowcore());
+ ptr = nmi_puts(ptr, "\n");
+ sclp_emergency_printk(message);
+
+ ptr = nmi_puts(message, "LBA: ");
+ ptr = u64_to_hex(ptr, lc->last_break_save_area);
+ ptr = nmi_puts(ptr, " EDC: ");
+ ptr = u64_to_hex(ptr, lc->external_damage_code);
+ ptr = nmi_puts(ptr, " FSA: ");
+ ptr = u64_to_hex(ptr, lc->failing_storage_address);
+ ptr = nmi_puts(ptr, "\n");
+ sclp_emergency_printk(message);
+
+ ptr = nmi_puts(message, "CRS:\n");
+ sclp_emergency_printk(message);
+ ptr = message;
+ for (i = 0; i < 16; i++) {
+ ptr = u64_to_hex(ptr, lc->cregs_save_area[i].val);
+ ptr = nmi_puts(ptr, " ");
+ if ((i + 1) % 4 == 0) {
+ ptr = nmi_puts(ptr, "\n");
+ sclp_emergency_printk(message);
+ ptr = message;
+ }
+ }
+
+ ptr = nmi_puts(message, "GPRS:\n");
+ sclp_emergency_printk(message);
+ ptr = message;
+ for (i = 0; i < 16; i++) {
+ ptr = u64_to_hex(ptr, lc->gpregs_save_area[i]);
+ ptr = nmi_puts(ptr, " ");
+ if ((i + 1) % 4 == 0) {
+ ptr = nmi_puts(ptr, "\n");
+ sclp_emergency_printk(message);
+ ptr = message;
+ }
+ }
+
+ ptr = nmi_puts(message, "System stopped\n");
+ sclp_emergency_printk(message);
+}
+
static notrace void s390_handle_damage(void)
{
struct lowcore *lc = get_lowcore();
union ctlreg0 cr0, cr0_new;
- char message[100];
psw_t psw_save;
- char *ptr;
smp_emergency_stop();
diag_amode31_ops.diag308_reset();
- ptr = nmi_puts(message, "System stopped due to unrecoverable machine check, code: 0x");
- u64_to_hex(ptr, lc->mcck_interruption_code);
/*
* Disable low address protection and make machine check new PSW a
@@ -141,7 +205,7 @@ static notrace void s390_handle_damage(void)
psw_bits(lc->mcck_new_psw).io = 0;
psw_bits(lc->mcck_new_psw).ext = 0;
psw_bits(lc->mcck_new_psw).wait = 1;
- sclp_emergency_printk(message);
+ nmi_print_info();
/*
* Restore machine check new PSW and control register 0 to original
diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c
index 6a262e198e35..4d09954ebf49 100644
--- a/arch/s390/kernel/perf_cpum_cf.c
+++ b/arch/s390/kernel/perf_cpum_cf.c
@@ -14,7 +14,6 @@
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/init.h>
-#include <linux/export.h>
#include <linux/miscdevice.h>
#include <linux/perf_event.h>
diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c
index 91469401f2c9..f432869f8921 100644
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -14,7 +14,6 @@
#include <linux/percpu.h>
#include <linux/pid.h>
#include <linux/notifier.h>
-#include <linux/export.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/moduleparam.h>
diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c
index 2b9611c4718e..91b8716c883a 100644
--- a/arch/s390/kernel/perf_event.c
+++ b/arch/s390/kernel/perf_event.c
@@ -12,7 +12,6 @@
#include <linux/perf_event.h>
#include <linux/kvm_host.h>
#include <linux/percpu.h>
-#include <linux/export.h>
#include <linux/seq_file.h>
#include <linux/spinlock.h>
#include <linux/uaccess.h>
diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c
index 63875270941b..f373a1009c45 100644
--- a/arch/s390/kernel/perf_pai_crypto.c
+++ b/arch/s390/kernel/perf_pai_crypto.c
@@ -13,7 +13,6 @@
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/init.h>
-#include <linux/export.h>
#include <linux/io.h>
#include <linux/perf_event.h>
#include <asm/ctlreg.h>
@@ -696,7 +695,7 @@ static const char * const paicrypt_ctrnames[] = {
[111] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_256",
[112] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_128",
[113] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_192",
- [114] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_256A",
+ [114] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_256",
[115] = "PCC_COMPUTE_XTS_PARAMETER_USING_AES_128",
[116] = "PCC_COMPUTE_XTS_PARAMETER_USING_AES_256",
[117] = "PCC_COMPUTE_XTS_PARAMETER_USING_ENCRYPTED_AES_128",
diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c
index fd14d5ebccbc..d827473e7f87 100644
--- a/arch/s390/kernel/perf_pai_ext.c
+++ b/arch/s390/kernel/perf_pai_ext.c
@@ -14,7 +14,6 @@
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/init.h>
-#include <linux/export.h>
#include <linux/io.h>
#include <linux/perf_event.h>
#include <asm/ctlreg.h>
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 9637aee43c40..f55f09cda6f8 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -27,7 +27,6 @@
#include <linux/compat.h>
#include <linux/kprobes.h>
#include <linux/random.h>
-#include <linux/export.h>
#include <linux/init_task.h>
#include <linux/entry-common.h>
#include <linux/io.h>
diff --git a/arch/s390/kernel/skey.c b/arch/s390/kernel/skey.c
new file mode 100644
index 000000000000..ba049fd103c2
--- /dev/null
+++ b/arch/s390/kernel/skey.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <asm/rwonce.h>
+#include <asm/page.h>
+#include <asm/skey.h>
+
+int skey_regions_initialized;
+
+static inline unsigned long load_real_address(unsigned long address)
+{
+ unsigned long real;
+
+ asm volatile(
+ " lra %[real],0(%[address])\n"
+ : [real] "=d" (real)
+ : [address] "a" (address)
+ : "cc");
+ return real;
+}
+
+/*
+ * Initialize storage keys of registered memory regions with the
+ * default key. This is useful for code which is executed with a
+ * non-default access key.
+ */
+void __skey_regions_initialize(void)
+{
+ unsigned long address, real;
+ struct skey_region *r, *end;
+
+ r = __skey_region_start;
+ end = __skey_region_end;
+ while (r < end) {
+ address = r->start & PAGE_MASK;
+ do {
+ real = load_real_address(address);
+ page_set_storage_key(real, PAGE_DEFAULT_KEY, 1);
+ address += PAGE_SIZE;
+ } while (address < r->end);
+ r++;
+ }
+ /*
+ * Make sure storage keys are initialized before
+ * skey_regions_initialized is changed.
+ */
+ barrier();
+ WRITE_ONCE(skey_regions_initialized, 1);
+}
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 81f12bb77f62..e88ebe5339fc 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -175,13 +175,10 @@ static struct pcpu *pcpu_find_address(const struct cpumask *mask, u16 address)
static void pcpu_ec_call(struct pcpu *pcpu, int ec_bit)
{
- int order;
-
if (test_and_set_bit(ec_bit, &pcpu->ec_mask))
return;
- order = pcpu_running(pcpu) ? SIGP_EXTERNAL_CALL : SIGP_EMERGENCY_SIGNAL;
pcpu->ec_clk = get_tod_clock_fast();
- pcpu_sigp_retry(pcpu, order, 0);
+ pcpu_sigp_retry(pcpu, SIGP_EXTERNAL_CALL, 0);
}
static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu)
@@ -433,16 +430,16 @@ void notrace smp_emergency_stop(void)
cpumask_copy(&cpumask, cpu_online_mask);
cpumask_clear_cpu(smp_processor_id(), &cpumask);
- end = get_tod_clock() + (1000000UL << 12);
+ end = get_tod_clock_monotonic() + (1000000UL << 12);
for_each_cpu(cpu, &cpumask) {
struct pcpu *pcpu = per_cpu_ptr(&pcpu_devices, cpu);
set_bit(ec_stop_cpu, &pcpu->ec_mask);
while (__pcpu_sigp(pcpu->address, SIGP_EMERGENCY_SIGNAL,
0, NULL) == SIGP_CC_BUSY &&
- get_tod_clock() < end)
+ get_tod_clock_monotonic() < end)
cpu_relax();
}
- while (get_tod_clock() < end) {
+ while (get_tod_clock_monotonic() < end) {
for_each_cpu(cpu, &cpumask)
if (pcpu_stopped(per_cpu_ptr(&pcpu_devices, cpu)))
cpumask_clear_cpu(cpu, &cpumask);
diff --git a/arch/s390/kernel/sthyi.c b/arch/s390/kernel/sthyi.c
index d40f0b983e74..f4ccdbed4b89 100644
--- a/arch/s390/kernel/sthyi.c
+++ b/arch/s390/kernel/sthyi.c
@@ -5,6 +5,8 @@
* Copyright IBM Corp. 2016
* Author(s): Janosch Frank <frankja@linux.vnet.ibm.com>
*/
+
+#include <linux/export.h>
#include <linux/errno.h>
#include <linux/pagemap.h>
#include <linux/vmalloc.h>
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index fed17d407a44..63517b85f4c9 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -69,8 +69,6 @@ unsigned char ptff_function_mask[16];
static unsigned long lpar_offset;
static unsigned long initial_leap_seconds;
-static unsigned long tod_steering_end;
-static long tod_steering_delta;
/*
* Get time offsets with PTFF
@@ -80,9 +78,7 @@ void __init time_early_init(void)
struct ptff_qto qto;
struct ptff_qui qui;
- /* Initialize TOD steering parameters */
- tod_steering_end = tod_clock_base.tod;
- vdso_k_time_data->arch_data.tod_steering_end = tod_steering_end;
+ vdso_k_time_data->arch_data.tod_delta = tod_clock_base.tod;
if (!test_facility(28))
return;
@@ -226,21 +222,7 @@ void __init read_persistent_wall_and_boot_offset(struct timespec64 *wall_time,
static u64 read_tod_clock(struct clocksource *cs)
{
- unsigned long now, adj;
-
- preempt_disable(); /* protect from changes to steering parameters */
- now = get_tod_clock();
- adj = tod_steering_end - now;
- if (unlikely((s64) adj > 0))
- /*
- * manually steer by 1 cycle every 2^16 cycles. This
- * corresponds to shifting the tod delta by 15. 1s is
- * therefore steered in ~9h. The adjust will decrease
- * over time, until it finally reaches 0.
- */
- now += (tod_steering_delta < 0) ? (adj >> 15) : -(adj >> 15);
- preempt_enable();
- return now;
+ return get_tod_clock_monotonic();
}
static struct clocksource clocksource_tod = {
@@ -369,26 +351,11 @@ static inline int check_sync_clock(void)
*/
static void clock_sync_global(long delta)
{
- unsigned long now, adj;
struct ptff_qto qto;
/* Fixup the monotonic sched clock. */
tod_clock_base.eitod += delta;
- /* Adjust TOD steering parameters. */
- now = get_tod_clock();
- adj = tod_steering_end - now;
- if (unlikely((s64) adj >= 0))
- /* Calculate how much of the old adjustment is left. */
- tod_steering_delta = (tod_steering_delta < 0) ?
- -(adj >> 15) : (adj >> 15);
- tod_steering_delta += delta;
- if ((abs(tod_steering_delta) >> 48) != 0)
- panic("TOD clock sync offset %li is too large to drift\n",
- tod_steering_delta);
- tod_steering_end = now + (abs(tod_steering_delta) << 15);
- vdso_k_time_data->arch_data.tod_steering_end = tod_steering_end;
- vdso_k_time_data->arch_data.tod_steering_delta = tod_steering_delta;
-
+ vdso_k_time_data->arch_data.tod_delta = tod_clock_base.tod;
/* Update LPAR offset. */
if (ptff_query(PTFF_QTO) && ptff(&qto, sizeof(qto), PTFF_QTO) == 0)
lpar_offset = qto.tod_epoch_difference;
@@ -430,7 +397,7 @@ struct clock_sync_data {
/*
* Server Time Protocol (STP) code.
*/
-static bool stp_online;
+static bool stp_online = true;
static struct stp_sstpi stp_info;
static void *stp_page;
@@ -456,7 +423,6 @@ static void __init stp_reset(void)
if (rc == 0)
set_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags);
else if (stp_online) {
- pr_warn("The real or virtual hardware system does not provide an STP interface\n");
free_page((unsigned long) stp_page);
stp_page = NULL;
stp_online = false;
@@ -580,7 +546,7 @@ static int stp_sync_clock(void *data)
atomic_dec(&sync->cpus);
/* Wait for in_sync to be set. */
while (READ_ONCE(sync->in_sync) == 0)
- __udelay(1);
+ ;
}
if (sync->in_sync != 1)
/* Didn't work. Clear per-cpu in sync bit again. */
@@ -591,81 +557,6 @@ static int stp_sync_clock(void *data)
return 0;
}
-static int stp_clear_leap(void)
-{
- struct __kernel_timex txc;
- int ret;
-
- memset(&txc, 0, sizeof(txc));
-
- ret = do_adjtimex(&txc);
- if (ret < 0)
- return ret;
-
- txc.modes = ADJ_STATUS;
- txc.status &= ~(STA_INS|STA_DEL);
- return do_adjtimex(&txc);
-}
-
-static void stp_check_leap(void)
-{
- struct stp_stzi stzi;
- struct stp_lsoib *lsoib = &stzi.lsoib;
- struct __kernel_timex txc;
- int64_t timediff;
- int leapdiff, ret;
-
- if (!stp_info.lu || !check_sync_clock()) {
- /*
- * Either a scheduled leap second was removed by the operator,
- * or STP is out of sync. In both cases, clear the leap second
- * kernel flags.
- */
- if (stp_clear_leap() < 0)
- pr_err("failed to clear leap second flags\n");
- return;
- }
-
- if (chsc_stzi(stp_page, &stzi, sizeof(stzi))) {
- pr_err("stzi failed\n");
- return;
- }
-
- timediff = tod_to_ns(lsoib->nlsout - get_tod_clock()) / NSEC_PER_SEC;
- leapdiff = lsoib->nlso - lsoib->also;
-
- if (leapdiff != 1 && leapdiff != -1) {
- pr_err("Cannot schedule %d leap seconds\n", leapdiff);
- return;
- }
-
- if (timediff < 0) {
- if (stp_clear_leap() < 0)
- pr_err("failed to clear leap second flags\n");
- } else if (timediff < 7200) {
- memset(&txc, 0, sizeof(txc));
- ret = do_adjtimex(&txc);
- if (ret < 0)
- return;
-
- txc.modes = ADJ_STATUS;
- if (leapdiff > 0)
- txc.status |= STA_INS;
- else
- txc.status |= STA_DEL;
- ret = do_adjtimex(&txc);
- if (ret < 0)
- pr_err("failed to set leap second flags\n");
- /* arm Timer to clear leap second flags */
- mod_timer(&stp_timer, jiffies + secs_to_jiffies(14400));
- } else {
- /* The day the leap second is scheduled for hasn't been reached. Retry
- * in one hour.
- */
- mod_timer(&stp_timer, jiffies + secs_to_jiffies(3600));
- }
-}
-
/*
* STP work. Check for the STP state and take over the clock
* synchronization if the STP clock source is usable.
@@ -707,8 +598,6 @@ static void stp_work_fn(struct work_struct *work)
* Retry after a second.
*/
mod_timer(&stp_timer, jiffies + msecs_to_jiffies(MSEC_PER_SEC));
- else if (stp_info.lu)
- stp_check_leap();
out_unlock:
mutex_unlock(&stp_mutex);
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index 3df048e190b1..46569b8e47dd 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -531,11 +531,11 @@ static const struct cpumask *cpu_drawer_mask(int cpu)
}
static struct sched_domain_topology_level s390_topology[] = {
- { cpu_thread_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
- { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
- { cpu_book_mask, SD_INIT_NAME(BOOK) },
- { cpu_drawer_mask, SD_INIT_NAME(DRAWER) },
- { cpu_cpu_mask, SD_INIT_NAME(PKG) },
+ SDTL_INIT(cpu_thread_mask, cpu_smt_flags, SMT),
+ SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC),
+ SDTL_INIT(cpu_book_mask, NULL, BOOK),
+ SDTL_INIT(cpu_drawer_mask, NULL, DRAWER),
+ SDTL_INIT(cpu_cpu_mask, NULL, PKG),
{ NULL, },
};
diff --git a/arch/s390/kernel/unwind_bc.c b/arch/s390/kernel/unwind_bc.c
index cd44be2b6ce8..0f88caca4eaf 100644
--- a/arch/s390/kernel/unwind_bc.c
+++ b/arch/s390/kernel/unwind_bc.c
@@ -1,4 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/export.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c
index b99478e84da4..47f574cd1728 100644
--- a/arch/s390/kernel/uv.c
+++ b/arch/s390/kernel/uv.c
@@ -7,6 +7,7 @@
#define KMSG_COMPONENT "prot_virt"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/sizes.h>
diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index ff1ddba96352..1c606dfa595d 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -71,6 +71,13 @@ SECTIONS
. = ALIGN(PAGE_SIZE);
__end_ro_after_init = .;
+ . = ALIGN(8);
+ .skey_region_table : {
+ __skey_region_start = .;
+ KEEP(*(.skey_region))
+ __skey_region_end = .;
+ }
+
.data.rel.ro : {
*(.data.rel.ro .data.rel.ro.*)
}
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 60c360c18690..2a92a8b9e4c2 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -14,6 +14,7 @@
#include <linux/interrupt.h>
#include <linux/kvm_host.h>
#include <linux/hrtimer.h>
+#include <linux/export.h>
#include <linux/mmu_context.h>
#include <linux/nospec.h>
#include <linux/signal.h>
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index d5ad10791c25..bf6fa8b9ca73 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -14,6 +14,7 @@
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
#include <linux/compiler.h>
+#include <linux/export.h>
#include <linux/err.h>
#include <linux/fs.h>
#include <linux/hrtimer.h>
@@ -5062,6 +5063,30 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
return vcpu_post_run_handle_fault(vcpu);
}
+int noinstr kvm_s390_enter_exit_sie(struct kvm_s390_sie_block *scb,
+ u64 *gprs, unsigned long gasce)
+{
+ int ret;
+
+ guest_state_enter_irqoff();
+
+ /*
+ * The guest_state_{enter,exit}_irqoff() functions inform lockdep and
+ * tracing that entry to the guest will enable host IRQs, and exit from
+ * the guest will disable host IRQs.
+ *
+ * We must not use lockdep/tracing/RCU in this critical section, so we
+ * use the low-level arch_local_irq_*() helpers to enable/disable IRQs.
+ */
+ arch_local_irq_enable();
+ ret = sie64a(scb, gprs, gasce);
+ arch_local_irq_disable();
+
+ guest_state_exit_irqoff();
+
+ return ret;
+}
+
#define PSW_INT_MASK (PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_MCHECK)
static int __vcpu_run(struct kvm_vcpu *vcpu)
{
@@ -5082,20 +5107,27 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
kvm_vcpu_srcu_read_unlock(vcpu);
/*
* As PF_VCPU will be used in fault handler, between
- * guest_enter and guest_exit should be no uaccess.
+ * guest_timing_enter_irqoff and guest_timing_exit_irqoff
+ * should be no uaccess.
*/
- local_irq_disable();
- guest_enter_irqoff();
- __disable_cpu_timer_accounting(vcpu);
- local_irq_enable();
if (kvm_s390_pv_cpu_is_protected(vcpu)) {
memcpy(sie_page->pv_grregs,
vcpu->run->s.regs.gprs,
sizeof(sie_page->pv_grregs));
}
- exit_reason = sie64a(vcpu->arch.sie_block,
- vcpu->run->s.regs.gprs,
- vcpu->arch.gmap->asce);
+
+ local_irq_disable();
+ guest_timing_enter_irqoff();
+ __disable_cpu_timer_accounting(vcpu);
+
+ exit_reason = kvm_s390_enter_exit_sie(vcpu->arch.sie_block,
+ vcpu->run->s.regs.gprs,
+ vcpu->arch.gmap->asce);
+
+ __enable_cpu_timer_accounting(vcpu);
+ guest_timing_exit_irqoff();
+ local_irq_enable();
+
if (kvm_s390_pv_cpu_is_protected(vcpu)) {
memcpy(vcpu->run->s.regs.gprs,
sie_page->pv_grregs,
@@ -5111,10 +5143,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
vcpu->arch.sie_block->gpsw.mask &= ~PSW_INT_MASK;
}
}
- local_irq_disable();
- __enable_cpu_timer_accounting(vcpu);
- guest_exit_irqoff();
- local_irq_enable();
kvm_vcpu_srcu_read_lock(vcpu);
rc = vcpu_post_run(vcpu, exit_reason);
diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
index 14c330ec8ceb..25ede8354514 100644
--- a/arch/s390/kvm/pv.c
+++ b/arch/s390/kvm/pv.c
@@ -5,6 +5,8 @@
* Copyright IBM Corp. 2019, 2020
* Author(s): Janosch Frank <frankja@linux.ibm.com>
*/
+
+#include <linux/export.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/minmax.h>
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 13a9661d2b28..347268f89f2f 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -1170,10 +1170,6 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
vcpu->arch.sie_block->fpf & FPF_BPBC)
set_thread_flag(TIF_ISOLATE_BP_GUEST);
- local_irq_disable();
- guest_enter_irqoff();
- local_irq_enable();
-
/*
* Simulate a SIE entry of the VCPU (see sie64a), so VCPU blocking
* and VCPU requests also hinder the vSIE from running and lead
@@ -1183,15 +1179,16 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
vcpu->arch.sie_block->prog0c |= PROG_IN_SIE;
current->thread.gmap_int_code = 0;
barrier();
- if (!kvm_s390_vcpu_sie_inhibited(vcpu))
- rc = sie64a(scb_s, vcpu->run->s.regs.gprs, vsie_page->gmap->asce);
+ if (!kvm_s390_vcpu_sie_inhibited(vcpu)) {
+ local_irq_disable();
+ guest_timing_enter_irqoff();
+ rc = kvm_s390_enter_exit_sie(scb_s, vcpu->run->s.regs.gprs, vsie_page->gmap->asce);
+ guest_timing_exit_irqoff();
+ local_irq_enable();
+ }
barrier();
vcpu->arch.sie_block->prog0c &= ~PROG_IN_SIE;
- local_irq_disable();
- guest_exit_irqoff();
- local_irq_enable();
-
/* restore guest state for bp isolation override */
if (!guest_bp_isolation)
clear_thread_flag(TIF_ISOLATE_BP_GUEST);
diff --git a/arch/s390/lib/delay.c b/arch/s390/lib/delay.c
index be14c58cb989..c1ea14e3c927 100644
--- a/arch/s390/lib/delay.c
+++ b/arch/s390/lib/delay.c
@@ -7,6 +7,7 @@
*/
#include <linux/processor.h>
+#include <linux/export.h>
#include <linux/delay.h>
#include <asm/div64.h>
#include <asm/timex.h>
diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c
index fa7d98fa1320..1a6ba105e071 100644
--- a/arch/s390/lib/uaccess.c
+++ b/arch/s390/lib/uaccess.c
@@ -8,11 +8,13 @@
* Gerald Schaefer (gerald.schaefer@de.ibm.com)
*/
+#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/export.h>
#include <linux/mm.h>
#include <asm/asm-extable.h>
#include <asm/ctlreg.h>
+#include <asm/skey.h>
#ifdef CONFIG_DEBUG_ENTRY
void debug_user_asce(int exit)
@@ -145,3 +147,189 @@ unsigned long _copy_to_user_key(void __user *to, const void *from,
return raw_copy_to_user_key(to, from, n, key);
}
EXPORT_SYMBOL(_copy_to_user_key);
+
+#define CMPXCHG_USER_KEY_MAX_LOOPS 128
+
+static nokprobe_inline int __cmpxchg_user_key_small(unsigned long address, unsigned int *uval,
+ unsigned int old, unsigned int new,
+ unsigned int mask, unsigned long key)
+{
+ unsigned long count;
+ unsigned int prev;
+ bool sacf_flag;
+ int rc = 0;
+
+ skey_regions_initialize();
+ sacf_flag = enable_sacf_uaccess();
+ asm_inline volatile(
+ "20: spka 0(%[key])\n"
+ " sacf 256\n"
+ " llill %[count],%[max_loops]\n"
+ "0: l %[prev],%[address]\n"
+ "1: nr %[prev],%[mask]\n"
+ " xilf %[mask],0xffffffff\n"
+ " or %[new],%[prev]\n"
+ " or %[prev],%[tmp]\n"
+ "2: lr %[tmp],%[prev]\n"
+ "3: cs %[prev],%[new],%[address]\n"
+ "4: jnl 5f\n"
+ " xr %[tmp],%[prev]\n"
+ " xr %[new],%[tmp]\n"
+ " nr %[tmp],%[mask]\n"
+ " jnz 5f\n"
+ " brct %[count],2b\n"
+ "5: sacf 768\n"
+ " spka %[default_key]\n"
+ "21:\n"
+ EX_TABLE_UA_LOAD_REG(0b, 5b, %[rc], %[prev])
+ EX_TABLE_UA_LOAD_REG(1b, 5b, %[rc], %[prev])
+ EX_TABLE_UA_LOAD_REG(3b, 5b, %[rc], %[prev])
+ EX_TABLE_UA_LOAD_REG(4b, 5b, %[rc], %[prev])
+ SKEY_REGION(20b, 21b)
+ : [rc] "+&d" (rc),
+ [prev] "=&d" (prev),
+ [address] "+Q" (*(int *)address),
+ [tmp] "+&d" (old),
+ [new] "+&d" (new),
+ [mask] "+&d" (mask),
+ [count] "=a" (count)
+ : [key] "%[count]" (key << 4),
+ [default_key] "J" (PAGE_DEFAULT_KEY),
+ [max_loops] "J" (CMPXCHG_USER_KEY_MAX_LOOPS)
+ : "memory", "cc");
+ disable_sacf_uaccess(sacf_flag);
+ *uval = prev;
+ if (!count)
+ rc = -EAGAIN;
+ return rc;
+}
+
+int __kprobes __cmpxchg_user_key1(unsigned long address, unsigned char *uval,
+ unsigned char old, unsigned char new, unsigned long key)
+{
+ unsigned int prev, shift, mask, _old, _new;
+ int rc;
+
+ shift = (3 ^ (address & 3)) << 3;
+ address ^= address & 3;
+ _old = (unsigned int)old << shift;
+ _new = (unsigned int)new << shift;
+ mask = ~(0xff << shift);
+ rc = __cmpxchg_user_key_small(address, &prev, _old, _new, mask, key);
+ *uval = prev >> shift;
+ return rc;
+}
+EXPORT_SYMBOL(__cmpxchg_user_key1);
+
+int __kprobes __cmpxchg_user_key2(unsigned long address, unsigned short *uval,
+ unsigned short old, unsigned short new, unsigned long key)
+{
+ unsigned int prev, shift, mask, _old, _new;
+ int rc;
+
+ shift = (2 ^ (address & 2)) << 3;
+ address ^= address & 2;
+ _old = (unsigned int)old << shift;
+ _new = (unsigned int)new << shift;
+ mask = ~(0xffff << shift);
+ rc = __cmpxchg_user_key_small(address, &prev, _old, _new, mask, key);
+ *uval = prev >> shift;
+ return rc;
+}
+EXPORT_SYMBOL(__cmpxchg_user_key2);
+
+int __kprobes __cmpxchg_user_key4(unsigned long address, unsigned int *uval,
+ unsigned int old, unsigned int new, unsigned long key)
+{
+ unsigned int prev = old;
+ bool sacf_flag;
+ int rc = 0;
+
+ skey_regions_initialize();
+ sacf_flag = enable_sacf_uaccess();
+ asm_inline volatile(
+ "20: spka 0(%[key])\n"
+ " sacf 256\n"
+ "0: cs %[prev],%[new],%[address]\n"
+ "1: sacf 768\n"
+ " spka %[default_key]\n"
+ "21:\n"
+ EX_TABLE_UA_LOAD_REG(0b, 1b, %[rc], %[prev])
+ EX_TABLE_UA_LOAD_REG(1b, 1b, %[rc], %[prev])
+ SKEY_REGION(20b, 21b)
+ : [rc] "+&d" (rc),
+ [prev] "+&d" (prev),
+ [address] "+Q" (*(int *)address)
+ : [new] "d" (new),
+ [key] "a" (key << 4),
+ [default_key] "J" (PAGE_DEFAULT_KEY)
+ : "memory", "cc");
+ disable_sacf_uaccess(sacf_flag);
+ *uval = prev;
+ return rc;
+}
+EXPORT_SYMBOL(__cmpxchg_user_key4);
+
+int __kprobes __cmpxchg_user_key8(unsigned long address, unsigned long *uval,
+ unsigned long old, unsigned long new, unsigned long key)
+{
+ unsigned long prev = old;
+ bool sacf_flag;
+ int rc = 0;
+
+ skey_regions_initialize();
+ sacf_flag = enable_sacf_uaccess();
+ asm_inline volatile(
+ "20: spka 0(%[key])\n"
+ " sacf 256\n"
+ "0: csg %[prev],%[new],%[address]\n"
+ "1: sacf 768\n"
+ " spka %[default_key]\n"
+ "21:\n"
+ EX_TABLE_UA_LOAD_REG(0b, 1b, %[rc], %[prev])
+ EX_TABLE_UA_LOAD_REG(1b, 1b, %[rc], %[prev])
+ SKEY_REGION(20b, 21b)
+ : [rc] "+&d" (rc),
+ [prev] "+&d" (prev),
+ [address] "+QS" (*(long *)address)
+ : [new] "d" (new),
+ [key] "a" (key << 4),
+ [default_key] "J" (PAGE_DEFAULT_KEY)
+ : "memory", "cc");
+ disable_sacf_uaccess(sacf_flag);
+ *uval = prev;
+ return rc;
+}
+EXPORT_SYMBOL(__cmpxchg_user_key8);
+
+int __kprobes __cmpxchg_user_key16(unsigned long address, __uint128_t *uval,
+ __uint128_t old, __uint128_t new, unsigned long key)
+{
+ __uint128_t prev = old;
+ bool sacf_flag;
+ int rc = 0;
+
+ skey_regions_initialize();
+ sacf_flag = enable_sacf_uaccess();
+ asm_inline volatile(
+ "20: spka 0(%[key])\n"
+ " sacf 256\n"
+ "0: cdsg %[prev],%[new],%[address]\n"
+ "1: sacf 768\n"
+ " spka %[default_key]\n"
+ "21:\n"
+ EX_TABLE_UA_LOAD_REGPAIR(0b, 1b, %[rc], %[prev])
+ EX_TABLE_UA_LOAD_REGPAIR(1b, 1b, %[rc], %[prev])
+ SKEY_REGION(20b, 21b)
+ : [rc] "+&d" (rc),
+ [prev] "+&d" (prev),
+ [address] "+QS" (*(__int128_t *)address)
+ : [new] "d" (new),
+ [key] "a" (key << 4),
+ [default_key] "J" (PAGE_DEFAULT_KEY)
+ : "memory", "cc");
+ disable_sacf_uaccess(sacf_flag);
+ *uval = prev;
+ return rc;
+}
+EXPORT_SYMBOL(__cmpxchg_user_key16);
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 012a4366a2ad..c7defe4ed1f6 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -9,6 +9,7 @@
*/
#include <linux/cpufeature.h>
+#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c
index a45d417ad951..b63f427e7289 100644
--- a/arch/s390/mm/gmap_helpers.c
+++ b/arch/s390/mm/gmap_helpers.c
@@ -4,6 +4,8 @@
*
* Copyright IBM Corp. 2007, 2025
*/
+
+#include <linux/export.h>
#include <linux/mm_types.h>
#include <linux/mmap_lock.h>
#include <linux/mm.h>
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index b449fd2605b0..d2f6f1f6d2fc 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -173,11 +173,6 @@ void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
struct ptdesc *ptdesc = virt_to_ptdesc(pgtable);
call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
- /*
- * THPs are not allowed for KVM guests. Warn if pgste ever reaches here.
- * Turn to the generic pte_free_defer() version once gmap is removed.
- */
- WARN_ON_ONCE(mm_has_pgste(mm));
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 7df70cd8f739..60688be4e876 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -5,6 +5,7 @@
*/
#include <linux/cpufeature.h>
+#include <linux/export.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
diff --git a/arch/s390/net/bpf_jit.h b/arch/s390/net/bpf_jit.h
deleted file mode 100644
index 7822ea92e54a..000000000000
--- a/arch/s390/net/bpf_jit.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * BPF Jit compiler defines
- *
- * Copyright IBM Corp. 2012,2015
- *
- * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
- * Michael Holzheu <holzheu@linux.vnet.ibm.com>
- */
-
-#ifndef __ARCH_S390_NET_BPF_JIT_H
-#define __ARCH_S390_NET_BPF_JIT_H
-
-#ifndef __ASSEMBLY__
-
-#include <linux/filter.h>
-#include <linux/types.h>
-
-#endif /* __ASSEMBLY__ */
-
-/*
- * Stackframe layout (packed stack):
- *
- * ^ high
- * +---------------+ |
- * | old backchain | |
- * +---------------+ |
- * | r15 - r6 | |
- * +---------------+ |
- * | 4 byte align | |
- * | tail_call_cnt | |
- * BFP -> +===============+ |
- * | | |
- * | BPF stack | |
- * | | |
- * R15+160 -> +---------------+ |
- * | new backchain | |
- * R15+152 -> +---------------+ |
- * | + 152 byte SA | |
- * R15 -> +---------------+ + low
- *
- * We get 160 bytes stack space from calling function, but only use
- * 12 * 8 byte for old backchain, r15..r6, and tail_call_cnt.
- *
- * The stack size used by the BPF program ("BPF stack" above) is passed
- * via "aux->stack_depth".
- */
-#define STK_SPACE_ADD (160)
-#define STK_160_UNUSED (160 - 12 * 8)
-#define STK_OFF (STK_SPACE_ADD - STK_160_UNUSED)
-
-#define STK_OFF_R6 (160 - 11 * 8) /* Offset of r6 on stack */
-#define STK_OFF_TCCNT (160 - 12 * 8) /* Offset of tail_call_cnt on stack */
-
-#endif /* __ARCH_S390_NET_BPF_JIT_H */
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 0c9a35782c83..bb17efe29d65 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -32,7 +32,6 @@
#include <asm/set_memory.h>
#include <asm/text-patching.h>
#include <asm/unwind.h>
-#include "bpf_jit.h"
struct bpf_jit {
u32 seen; /* Flags to remember seen eBPF instructions */
@@ -54,6 +53,7 @@ struct bpf_jit {
int prologue_plt; /* Start of prologue hotpatch PLT */
int kern_arena; /* Pool offset of kernel arena address */
u64 user_arena; /* User arena address */
+ u32 frame_off; /* Offset of struct bpf_prog from %r15 */
};
#define SEEN_MEM BIT(0) /* use mem[] for temporary storage */
@@ -426,11 +426,25 @@ static void jit_fill_hole(void *area, unsigned int size)
}
/*
+ * Caller-allocated part of the frame.
+ * Thanks to packed stack, its otherwise unused initial part can be used for
+ * the BPF stack and for the next frame.
+ */
+struct prog_frame {
+ u64 unused[8];
+ /* BPF stack starts here and grows towards 0 */
+ u32 tail_call_cnt;
+ u32 pad;
+ u64 r6[10]; /* r6 - r15 */
+ u64 backchain;
+} __packed;
+
+/*
* Save registers from "rs" (register start) to "re" (register end) on stack
*/
static void save_regs(struct bpf_jit *jit, u32 rs, u32 re)
{
- u32 off = STK_OFF_R6 + (rs - 6) * 8;
+ u32 off = offsetof(struct prog_frame, r6) + (rs - 6) * 8;
if (rs == re)
/* stg %rs,off(%r15) */
@@ -443,12 +457,9 @@ static void save_regs(struct bpf_jit *jit, u32 rs, u32 re)
/*
* Restore registers from "rs" (register start) to "re" (register end) on stack
*/
-static void restore_regs(struct bpf_jit *jit, u32 rs, u32 re, u32 stack_depth)
+static void restore_regs(struct bpf_jit *jit, u32 rs, u32 re)
{
- u32 off = STK_OFF_R6 + (rs - 6) * 8;
-
- if (jit->seen & SEEN_STACK)
- off += STK_OFF + stack_depth;
+ u32 off = jit->frame_off + offsetof(struct prog_frame, r6) + (rs - 6) * 8;
if (rs == re)
/* lg %rs,off(%r15) */
@@ -492,8 +503,7 @@ static int get_end(u16 seen_regs, int start)
* Save and restore clobbered registers (6-15) on stack.
* We save/restore registers in chunks with gap >= 2 registers.
*/
-static void save_restore_regs(struct bpf_jit *jit, int op, u32 stack_depth,
- u16 extra_regs)
+static void save_restore_regs(struct bpf_jit *jit, int op, u16 extra_regs)
{
u16 seen_regs = jit->seen_regs | extra_regs;
const int last = 15, save_restore_size = 6;
@@ -516,7 +526,7 @@ static void save_restore_regs(struct bpf_jit *jit, int op, u32 stack_depth,
if (op == REGS_SAVE)
save_regs(jit, rs, re);
else
- restore_regs(jit, rs, re, stack_depth);
+ restore_regs(jit, rs, re);
re++;
} while (re <= last);
}
@@ -581,11 +591,12 @@ static void bpf_jit_plt(struct bpf_plt *plt, void *ret, void *target)
* Emit function prologue
*
* Save registers and create stack frame if necessary.
- * See stack frame layout description in "bpf_jit.h"!
+ * Stack frame layout is described by struct prog_frame.
*/
-static void bpf_jit_prologue(struct bpf_jit *jit, struct bpf_prog *fp,
- u32 stack_depth)
+static void bpf_jit_prologue(struct bpf_jit *jit, struct bpf_prog *fp)
{
+ BUILD_BUG_ON(sizeof(struct prog_frame) != STACK_FRAME_OVERHEAD);
+
/* No-op for hotpatching */
/* brcl 0,prologue_plt */
EMIT6_PCREL_RILC(0xc0040000, 0, jit->prologue_plt);
@@ -593,8 +604,9 @@ static void bpf_jit_prologue(struct bpf_jit *jit, struct bpf_prog *fp,
if (!bpf_is_subprog(fp)) {
/* Initialize the tail call counter in the main program. */
- /* xc STK_OFF_TCCNT(4,%r15),STK_OFF_TCCNT(%r15) */
- _EMIT6(0xd703f000 | STK_OFF_TCCNT, 0xf000 | STK_OFF_TCCNT);
+ /* xc tail_call_cnt(4,%r15),tail_call_cnt(%r15) */
+ _EMIT6(0xd703f000 | offsetof(struct prog_frame, tail_call_cnt),
+ 0xf000 | offsetof(struct prog_frame, tail_call_cnt));
} else {
/*
* Skip the tail call counter initialization in subprograms.
@@ -617,7 +629,7 @@ static void bpf_jit_prologue(struct bpf_jit *jit, struct bpf_prog *fp,
jit->seen_regs |= NVREGS;
} else {
/* Save registers */
- save_restore_regs(jit, REGS_SAVE, stack_depth,
+ save_restore_regs(jit, REGS_SAVE,
fp->aux->exception_boundary ? NVREGS : 0);
}
/* Setup literal pool */
@@ -637,13 +649,15 @@ static void bpf_jit_prologue(struct bpf_jit *jit, struct bpf_prog *fp,
if (is_first_pass(jit) || (jit->seen & SEEN_STACK)) {
/* lgr %w1,%r15 (backchain) */
EMIT4(0xb9040000, REG_W1, REG_15);
- /* la %bfp,STK_160_UNUSED(%r15) (BPF frame pointer) */
- EMIT4_DISP(0x41000000, BPF_REG_FP, REG_15, STK_160_UNUSED);
- /* aghi %r15,-STK_OFF */
- EMIT4_IMM(0xa70b0000, REG_15, -(STK_OFF + stack_depth));
- /* stg %w1,152(%r15) (backchain) */
+ /* la %bfp,unused_end(%r15) (BPF frame pointer) */
+ EMIT4_DISP(0x41000000, BPF_REG_FP, REG_15,
+ offsetofend(struct prog_frame, unused));
+ /* aghi %r15,-frame_off */
+ EMIT4_IMM(0xa70b0000, REG_15, -jit->frame_off);
+ /* stg %w1,backchain(%r15) */
EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0,
- REG_15, 152);
+ REG_15,
+ offsetof(struct prog_frame, backchain));
}
}
@@ -677,13 +691,13 @@ static void call_r1(struct bpf_jit *jit)
/*
* Function epilogue
*/
-static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth)
+static void bpf_jit_epilogue(struct bpf_jit *jit)
{
jit->exit_ip = jit->prg;
/* Load exit code: lgr %r2,%b0 */
EMIT4(0xb9040000, REG_2, BPF_REG_0);
/* Restore registers */
- save_restore_regs(jit, REGS_RESTORE, stack_depth, 0);
+ save_restore_regs(jit, REGS_RESTORE, 0);
EMIT_JUMP_REG(14);
jit->prg = ALIGN(jit->prg, 8);
@@ -865,7 +879,7 @@ static int sign_extend(struct bpf_jit *jit, int r, u8 size, u8 flags)
* stack space for the large switch statement.
*/
static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
- int i, bool extra_pass, u32 stack_depth)
+ int i, bool extra_pass)
{
struct bpf_insn *insn = &fp->insnsi[i];
s32 branch_oc_off = insn->off;
@@ -1786,9 +1800,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
* Note 2: We assume that the verifier does not let us call the
* main program, which clears the tail call counter on entry.
*/
- /* mvc STK_OFF_TCCNT(4,%r15),N(%r15) */
- _EMIT6(0xd203f000 | STK_OFF_TCCNT,
- 0xf000 | (STK_OFF_TCCNT + STK_OFF + stack_depth));
+ /* mvc tail_call_cnt(4,%r15),frame_off+tail_call_cnt(%r15) */
+ _EMIT6(0xd203f000 | offsetof(struct prog_frame, tail_call_cnt),
+ 0xf000 | (jit->frame_off +
+ offsetof(struct prog_frame, tail_call_cnt)));
/* Sign-extend the kfunc arguments. */
if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
@@ -1839,10 +1854,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
* goto out;
*/
- if (jit->seen & SEEN_STACK)
- off = STK_OFF_TCCNT + STK_OFF + stack_depth;
- else
- off = STK_OFF_TCCNT;
+ off = jit->frame_off +
+ offsetof(struct prog_frame, tail_call_cnt);
/* lhi %w0,1 */
EMIT4_IMM(0xa7080000, REG_W0, 1);
/* laal %w1,%w0,off(%r15) */
@@ -1872,7 +1885,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
/*
* Restore registers before calling function
*/
- save_restore_regs(jit, REGS_RESTORE, stack_depth, 0);
+ save_restore_regs(jit, REGS_RESTORE, 0);
/*
* goto *(prog->bpf_func + tail_call_start);
@@ -2165,7 +2178,7 @@ static int bpf_set_addr(struct bpf_jit *jit, int i)
* Compile eBPF program into s390x code
*/
static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp,
- bool extra_pass, u32 stack_depth)
+ bool extra_pass)
{
int i, insn_count, lit32_size, lit64_size;
u64 kern_arena;
@@ -2174,24 +2187,30 @@ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp,
jit->lit64 = jit->lit64_start;
jit->prg = 0;
jit->excnt = 0;
+ if (is_first_pass(jit) || (jit->seen & SEEN_STACK))
+ jit->frame_off = sizeof(struct prog_frame) -
+ offsetofend(struct prog_frame, unused) +
+ round_up(fp->aux->stack_depth, 8);
+ else
+ jit->frame_off = 0;
kern_arena = bpf_arena_get_kern_vm_start(fp->aux->arena);
if (kern_arena)
jit->kern_arena = _EMIT_CONST_U64(kern_arena);
jit->user_arena = bpf_arena_get_user_vm_start(fp->aux->arena);
- bpf_jit_prologue(jit, fp, stack_depth);
+ bpf_jit_prologue(jit, fp);
if (bpf_set_addr(jit, 0) < 0)
return -1;
for (i = 0; i < fp->len; i += insn_count) {
- insn_count = bpf_jit_insn(jit, fp, i, extra_pass, stack_depth);
+ insn_count = bpf_jit_insn(jit, fp, i, extra_pass);
if (insn_count < 0)
return -1;
/* Next instruction address */
if (bpf_set_addr(jit, i + insn_count) < 0)
return -1;
}
- bpf_jit_epilogue(jit, stack_depth);
+ bpf_jit_epilogue(jit);
lit32_size = jit->lit32 - jit->lit32_start;
lit64_size = jit->lit64 - jit->lit64_start;
@@ -2267,7 +2286,6 @@ static struct bpf_binary_header *bpf_jit_alloc(struct bpf_jit *jit,
*/
struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
{
- u32 stack_depth = round_up(fp->aux->stack_depth, 8);
struct bpf_prog *tmp, *orig_fp = fp;
struct bpf_binary_header *header;
struct s390_jit_data *jit_data;
@@ -2320,7 +2338,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
* - 3: Calculate program size and addrs array
*/
for (pass = 1; pass <= 3; pass++) {
- if (bpf_jit_prog(&jit, fp, extra_pass, stack_depth)) {
+ if (bpf_jit_prog(&jit, fp, extra_pass)) {
fp = orig_fp;
goto free_addrs;
}
@@ -2334,7 +2352,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
goto free_addrs;
}
skip_init_ctx:
- if (bpf_jit_prog(&jit, fp, extra_pass, stack_depth)) {
+ if (bpf_jit_prog(&jit, fp, extra_pass)) {
bpf_jit_binary_free(header);
fp = orig_fp;
goto free_addrs;
@@ -2654,9 +2672,10 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im,
/* stg %r1,backchain_off(%r15) */
EMIT6_DISP_LH(0xe3000000, 0x0024, REG_1, REG_0, REG_15,
tjit->backchain_off);
- /* mvc tccnt_off(4,%r15),stack_size+STK_OFF_TCCNT(%r15) */
+ /* mvc tccnt_off(4,%r15),stack_size+tail_call_cnt(%r15) */
_EMIT6(0xd203f000 | tjit->tccnt_off,
- 0xf000 | (tjit->stack_size + STK_OFF_TCCNT));
+ 0xf000 | (tjit->stack_size +
+ offsetof(struct prog_frame, tail_call_cnt)));
/* stmg %r2,%rN,fwd_reg_args_off(%r15) */
if (nr_reg_args)
EMIT6_DISP_LH(0xeb000000, 0x0024, REG_2,
@@ -2793,8 +2812,9 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im,
(nr_stack_args * sizeof(u64) - 1) << 16 |
tjit->stack_args_off,
0xf000 | tjit->orig_stack_args_off);
- /* mvc STK_OFF_TCCNT(4,%r15),tccnt_off(%r15) */
- _EMIT6(0xd203f000 | STK_OFF_TCCNT, 0xf000 | tjit->tccnt_off);
+ /* mvc tail_call_cnt(4,%r15),tccnt_off(%r15) */
+ _EMIT6(0xd203f000 | offsetof(struct prog_frame, tail_call_cnt),
+ 0xf000 | tjit->tccnt_off);
/* lgr %r1,%r8 */
EMIT4(0xb9040000, REG_1, REG_8);
/* %r1() */
@@ -2851,8 +2871,9 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im,
if (flags & (BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_RET_FENTRY_RET))
EMIT6_DISP_LH(0xe3000000, 0x0004, REG_2, REG_0, REG_15,
tjit->retval_off);
- /* mvc stack_size+STK_OFF_TCCNT(4,%r15),tccnt_off(%r15) */
- _EMIT6(0xd203f000 | (tjit->stack_size + STK_OFF_TCCNT),
+ /* mvc stack_size+tail_call_cnt(4,%r15),tccnt_off(%r15) */
+ _EMIT6(0xd203f000 | (tjit->stack_size +
+ offsetof(struct prog_frame, tail_call_cnt)),
0xf000 | tjit->tccnt_off);
/* aghi %r15,stack_size */
EMIT4_IMM(0xa70b0000, REG_15, tjit->stack_size);
diff --git a/arch/s390/net/pnet.c b/arch/s390/net/pnet.c
index 79211bec0fc8..03089ef479b2 100644
--- a/arch/s390/net/pnet.c
+++ b/arch/s390/net/pnet.c
@@ -6,6 +6,7 @@
*/
#include <linux/device.h>
+#include <linux/export.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/types.h>
diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c
index 81bdb54ad5e3..45a1c36c5a54 100644
--- a/arch/s390/pci/pci_bus.c
+++ b/arch/s390/pci/pci_bus.c
@@ -13,7 +13,6 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/err.h>
-#include <linux/export.h>
#include <linux/delay.h>
#include <linux/seq_file.h>
#include <linux/jump_label.h>
diff --git a/arch/s390/pci/pci_kvm_hook.c b/arch/s390/pci/pci_kvm_hook.c
index ff34baf50a3e..df5b25dbe9ca 100644
--- a/arch/s390/pci/pci_kvm_hook.c
+++ b/arch/s390/pci/pci_kvm_hook.c
@@ -5,7 +5,9 @@
* Copyright (C) IBM Corp. 2022. All rights reserved.
* Author(s): Pierre Morel <pmorel@linux.ibm.com>
*/
+
#include <linux/kvm_host.h>
+#include <linux/export.h>
struct zpci_kvm_hook zpci_kvm_hook;
EXPORT_SYMBOL_GPL(zpci_kvm_hook);
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 89185af7bcc9..d5795067befa 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -40,7 +40,6 @@ config SUPERH
select HAVE_GUP_FAST if MMU
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER
- select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_HW_BREAKPOINT
select HAVE_IOREMAP_PROT if MMU && !X2TLB
select HAVE_KERNEL_BZIP2
diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig
index f022ada363b5..8ef72b8dbcd3 100644
--- a/arch/sh/configs/titan_defconfig
+++ b/arch/sh/configs/titan_defconfig
@@ -61,7 +61,6 @@ CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
CONFIG_NETFILTER_XT_TARGET_MARK=m
CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
CONFIG_NETFILTER_XT_MATCH_COMMENT=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
CONFIG_NETFILTER_XT_MATCH_ESP=m
CONFIG_NETFILTER_XT_MATCH_LENGTH=m
CONFIG_NETFILTER_XT_MATCH_LIMIT=m
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index dcfdb7f1dae9..beda7f177c54 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -78,7 +78,6 @@ config SPARC64
select MMU_GATHER_NO_FLUSH_CACHE
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select HAVE_DYNAMIC_FTRACE
- select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_PAGE_SIZE_8KB
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_CONTEXT_TRACKING_USER
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index adcba7329386..71befa109e1c 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -145,6 +145,9 @@
#define SO_PASSRIGHTS 0x005c
+#define SO_INQ 0x005d
+#define SCM_INQ SO_INQ
+
#if !defined(__KERNEL__)
diff --git a/arch/um/Kconfig b/arch/um/Kconfig
index f08e8a7fac93..9083bfdb7735 100644
--- a/arch/um/Kconfig
+++ b/arch/um/Kconfig
@@ -6,6 +6,7 @@ config UML
bool
default y
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
+ select ARCH_HAS_CACHE_LINE_SIZE
select ARCH_HAS_CPU_FINALIZE_INIT
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
@@ -35,6 +36,7 @@ config UML
select HAVE_RUST
select ARCH_HAS_UBSAN
select HAVE_ARCH_TRACEHOOK
+ select HAVE_SYSCALL_TRACEPOINTS
select THREAD_INFO_IN_TASK
config MMU
@@ -82,9 +84,6 @@ config NR_CPUS
range 1 1
default 1
-config ARCH_HAS_CACHE_LINE_SIZE
- def_bool y
-
source "arch/$(HEADER_ARCH)/um/Kconfig"
config MAY_HAVE_RUNTIME_DEPS
diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig
index 34085bfc6d41..6a0354ca032f 100644
--- a/arch/um/drivers/Kconfig
+++ b/arch/um/drivers/Kconfig
@@ -160,6 +160,7 @@ config UML_RTC
config UML_PCI
bool
select FORCE_PCI
+ select IRQ_MSI_LIB
select UML_IOMEM_EMULATION
select UML_DMA_EMULATION
select PCI_MSI
diff --git a/arch/um/drivers/rtc_user.c b/arch/um/drivers/rtc_user.c
index 51e79f3148cd..67912fcf7b28 100644
--- a/arch/um/drivers/rtc_user.c
+++ b/arch/um/drivers/rtc_user.c
@@ -28,7 +28,7 @@ int uml_rtc_start(bool timetravel)
int err;
if (timetravel) {
- int err = os_pipe(uml_rtc_irq_fds, 1, 1);
+ err = os_pipe(uml_rtc_irq_fds, 1, 1);
if (err)
goto fail;
} else {
diff --git a/arch/um/drivers/vfio_kern.c b/arch/um/drivers/vfio_kern.c
index 13b971a2bd43..915812a79bfc 100644
--- a/arch/um/drivers/vfio_kern.c
+++ b/arch/um/drivers/vfio_kern.c
@@ -16,6 +16,7 @@
#include <init.h>
#include <os.h>
+#include "mconsole_kern.h"
#include "virt-pci.h"
#include "vfio_user.h"
@@ -60,6 +61,7 @@ static LIST_HEAD(uml_vfio_groups);
static DEFINE_MUTEX(uml_vfio_groups_mtx);
static LIST_HEAD(uml_vfio_devices);
+static DEFINE_MUTEX(uml_vfio_devices_mtx);
static int uml_vfio_set_container(int group_fd)
{
@@ -581,32 +583,44 @@ static struct uml_vfio_device *uml_vfio_find_device(const char *device)
return NULL;
}
-static int uml_vfio_cmdline_set(const char *device, const struct kernel_param *kp)
+static struct uml_vfio_device *uml_vfio_add_device(const char *device)
{
struct uml_vfio_device *dev;
int fd;
+ guard(mutex)(&uml_vfio_devices_mtx);
+
if (uml_vfio_container.fd < 0) {
fd = uml_vfio_user_open_container();
if (fd < 0)
- return fd;
+ return ERR_PTR(fd);
uml_vfio_container.fd = fd;
}
if (uml_vfio_find_device(device))
- return -EEXIST;
+ return ERR_PTR(-EEXIST);
dev = kzalloc(sizeof(*dev), GFP_KERNEL);
if (!dev)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
dev->name = kstrdup(device, GFP_KERNEL);
if (!dev->name) {
kfree(dev);
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
}
list_add_tail(&dev->list, &uml_vfio_devices);
+ return dev;
+}
+
+static int uml_vfio_cmdline_set(const char *device, const struct kernel_param *kp)
+{
+ struct uml_vfio_device *dev;
+
+ dev = uml_vfio_add_device(device);
+ if (IS_ERR(dev))
+ return PTR_ERR(dev);
return 0;
}
@@ -629,6 +643,42 @@ __uml_help(uml_vfio_cmdline_param_ops,
" through multiple PCI devices to UML.\n\n"
);
+static int uml_vfio_mc_config(char *str, char **error_out)
+{
+ struct uml_vfio_device *dev;
+
+ if (*str != '=') {
+ *error_out = "Invalid config";
+ return -EINVAL;
+ }
+ str += 1;
+
+ dev = uml_vfio_add_device(str);
+ if (IS_ERR(dev))
+ return PTR_ERR(dev);
+ uml_vfio_open_device(dev);
+ return 0;
+}
+
+static int uml_vfio_mc_id(char **str, int *start_out, int *end_out)
+{
+ return -EOPNOTSUPP;
+}
+
+static int uml_vfio_mc_remove(int n, char **error_out)
+{
+ return -EOPNOTSUPP;
+}
+
+static struct mc_device uml_vfio_mc = {
+ .list = LIST_HEAD_INIT(uml_vfio_mc.list),
+ .name = "vfio_uml.device",
+ .config = uml_vfio_mc_config,
+ .get_config = NULL,
+ .id = uml_vfio_mc_id,
+ .remove = uml_vfio_mc_remove,
+};
+
static int __init uml_vfio_init(void)
{
struct uml_vfio_device *dev, *n;
@@ -639,6 +689,8 @@ static int __init uml_vfio_init(void)
list_for_each_entry_safe(dev, n, &uml_vfio_devices, list)
uml_vfio_open_device(dev);
+ mconsole_register_dev(&uml_vfio_mc);
+
return 0;
}
late_initcall(uml_vfio_init);
diff --git a/arch/um/drivers/virt-pci.c b/arch/um/drivers/virt-pci.c
index 0fe207ca4b72..557d93aea00a 100644
--- a/arch/um/drivers/virt-pci.c
+++ b/arch/um/drivers/virt-pci.c
@@ -7,6 +7,7 @@
#include <linux/pci.h>
#include <linux/logic_iomem.h>
#include <linux/of_platform.h>
+#include <linux/irqchip/irq-msi-lib.h>
#include <linux/irqdomain.h>
#include <linux/msi.h>
#include <linux/unaligned.h>
@@ -29,7 +30,6 @@ static struct um_pci_device *um_pci_platform_device;
static struct um_pci_device_reg um_pci_devices[MAX_DEVICES];
static struct fwnode_handle *um_pci_fwnode;
static struct irq_domain *um_pci_inner_domain;
-static struct irq_domain *um_pci_msi_domain;
static unsigned long um_pci_msi_used[BITS_TO_LONGS(MAX_MSI_VECTORS)];
static unsigned long um_pci_cfgspace_read(void *priv, unsigned int offset,
@@ -400,21 +400,24 @@ static void um_pci_inner_domain_free(struct irq_domain *domain,
}
static const struct irq_domain_ops um_pci_inner_domain_ops = {
+ .select = msi_lib_irq_domain_select,
.alloc = um_pci_inner_domain_alloc,
.free = um_pci_inner_domain_free,
};
-static struct irq_chip um_pci_msi_irq_chip = {
- .name = "UM virtual PCIe MSI",
- .irq_mask = pci_msi_mask_irq,
- .irq_unmask = pci_msi_unmask_irq,
-};
-
-static struct msi_domain_info um_pci_msi_domain_info = {
- .flags = MSI_FLAG_USE_DEF_DOM_OPS |
- MSI_FLAG_USE_DEF_CHIP_OPS |
- MSI_FLAG_PCI_MSIX,
- .chip = &um_pci_msi_irq_chip,
+#define UM_PCI_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | \
+ MSI_FLAG_USE_DEF_CHIP_OPS | \
+ MSI_FLAG_NO_AFFINITY)
+#define UM_PCI_MSI_FLAGS_SUPPORTED (MSI_GENERIC_FLAGS_MASK | \
+ MSI_FLAG_PCI_MSIX)
+
+static const struct msi_parent_ops um_pci_msi_parent_ops = {
+ .required_flags = UM_PCI_MSI_FLAGS_REQUIRED,
+ .supported_flags = UM_PCI_MSI_FLAGS_SUPPORTED,
+ .bus_select_token = DOMAIN_BUS_NEXUS,
+ .bus_select_mask = MATCH_PCI_MSI,
+ .prefix = "UM-virtual-",
+ .init_dev_msi_info = msi_lib_init_dev_msi_info,
};
static struct resource busn_resource = {
@@ -559,17 +562,14 @@ static int __init um_pci_init(void)
goto free;
}
- um_pci_inner_domain = irq_domain_create_linear(um_pci_fwnode, MAX_MSI_VECTORS,
- &um_pci_inner_domain_ops, NULL);
- if (!um_pci_inner_domain) {
- err = -ENOMEM;
- goto free;
- }
+ struct irq_domain_info info = {
+ .fwnode = um_pci_fwnode,
+ .ops = &um_pci_inner_domain_ops,
+ .size = MAX_MSI_VECTORS,
+ };
- um_pci_msi_domain = pci_msi_create_irq_domain(um_pci_fwnode,
- &um_pci_msi_domain_info,
- um_pci_inner_domain);
- if (!um_pci_msi_domain) {
+ um_pci_inner_domain = msi_create_parent_irq_domain(&info, &um_pci_msi_parent_ops);
+ if (!um_pci_inner_domain) {
err = -ENOMEM;
goto free;
}
@@ -611,7 +611,6 @@ device_initcall(um_pci_init);
static void __exit um_pci_exit(void)
{
- irq_domain_remove(um_pci_msi_domain);
irq_domain_remove(um_pci_inner_domain);
pci_free_resource_list(&bridge->windows);
pci_free_host_bridge(bridge);
diff --git a/arch/um/drivers/virtio_pcidev.c b/arch/um/drivers/virtio_pcidev.c
index 3c4c4c928fdd..e9e23cc3f357 100644
--- a/arch/um/drivers/virtio_pcidev.c
+++ b/arch/um/drivers/virtio_pcidev.c
@@ -42,7 +42,7 @@ struct virtio_pcidev_device {
void *extra_ptrs[VIRTIO_PCIDEV_WRITE_BUFS + 1];
DECLARE_BITMAP(used_bufs, VIRTIO_PCIDEV_WRITE_BUFS);
-#define UM_PCI_STAT_WAITING 0
+#define VIRTIO_PCIDEV_STAT_WAITING 0
unsigned long status;
bool platform;
@@ -172,7 +172,7 @@ static int virtio_pcidev_send_cmd(struct virtio_pcidev_device *dev,
}
/* kick and poll for getting a response on the queue */
- set_bit(UM_PCI_STAT_WAITING, &dev->status);
+ set_bit(VIRTIO_PCIDEV_STAT_WAITING, &dev->status);
virtqueue_kick(dev->cmd_vq);
ret = 0;
@@ -193,7 +193,7 @@ static int virtio_pcidev_send_cmd(struct virtio_pcidev_device *dev,
}
udelay(1);
}
- clear_bit(UM_PCI_STAT_WAITING, &dev->status);
+ clear_bit(VIRTIO_PCIDEV_STAT_WAITING, &dev->status);
if (bounce_out)
memcpy(out, buf->data, out_size);
@@ -439,7 +439,7 @@ static void virtio_pcidev_cmd_vq_cb(struct virtqueue *vq)
void *cmd;
int len;
- if (test_bit(UM_PCI_STAT_WAITING, &dev->status))
+ if (test_bit(VIRTIO_PCIDEV_STAT_WAITING, &dev->status))
return;
while ((cmd = virtqueue_get_buf(vq, &len)))
diff --git a/arch/um/include/asm/cpufeature.h b/arch/um/include/asm/cpufeature.h
index 1eb8b834fbec..4354f6984271 100644
--- a/arch/um/include/asm/cpufeature.h
+++ b/arch/um/include/asm/cpufeature.h
@@ -4,7 +4,7 @@
#include <asm/processor.h>
-#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
+#if defined(__KERNEL__) && !defined(__ASSEMBLER__)
#include <asm/asm.h>
#include <linux/bitops.h>
@@ -137,5 +137,5 @@ t_no:
#define CPU_FEATURE_TYPEVAL boot_cpu_data.x86_vendor, boot_cpu_data.x86, \
boot_cpu_data.x86_model
-#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */
+#endif /* defined(__KERNEL__) && !defined(__ASSEMBLER__) */
#endif /* _ASM_UM_CPUFEATURE_H */
diff --git a/arch/um/include/asm/current.h b/arch/um/include/asm/current.h
index de64e032d66c..8accc6d6f502 100644
--- a/arch/um/include/asm/current.h
+++ b/arch/um/include/asm/current.h
@@ -5,7 +5,7 @@
#include <linux/compiler.h>
#include <linux/threads.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct task_struct;
extern struct task_struct *cpu_tasks[NR_CPUS];
@@ -18,6 +18,6 @@ static __always_inline struct task_struct *get_current(void)
#define current get_current()
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* __ASM_CURRENT_H */
diff --git a/arch/um/include/asm/mmu_context.h b/arch/um/include/asm/mmu_context.h
index 23dcc914d44e..0bbb24868557 100644
--- a/arch/um/include/asm/mmu_context.h
+++ b/arch/um/include/asm/mmu_context.h
@@ -16,11 +16,6 @@
#define activate_mm activate_mm
static inline void activate_mm(struct mm_struct *old, struct mm_struct *new)
{
- /*
- * This is called by fs/exec.c and sys_unshare()
- * when the new ->mm is used for the first time.
- */
- __switch_mm(&new->context.id);
}
static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
@@ -28,11 +23,9 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
{
unsigned cpu = smp_processor_id();
- if(prev != next){
+ if (prev != next) {
cpumask_clear_cpu(cpu, mm_cpumask(prev));
cpumask_set_cpu(cpu, mm_cpumask(next));
- if(next != &init_mm)
- __switch_mm(&next->context.id);
}
}
diff --git a/arch/um/include/asm/page.h b/arch/um/include/asm/page.h
index 3d516f3ca9c7..6f54254aaf44 100644
--- a/arch/um/include/asm/page.h
+++ b/arch/um/include/asm/page.h
@@ -11,7 +11,7 @@
#include <vdso/page.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct page;
@@ -94,7 +94,7 @@ extern unsigned long uml_physmem;
#include <asm-generic/memory_model.h>
#include <asm-generic/getorder.h>
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#ifdef CONFIG_X86_32
#define __HAVE_ARCH_GATE_AREA 1
diff --git a/arch/um/include/asm/ptrace-generic.h b/arch/um/include/asm/ptrace-generic.h
index 4696f24d1492..86d74f9d33cf 100644
--- a/arch/um/include/asm/ptrace-generic.h
+++ b/arch/um/include/asm/ptrace-generic.h
@@ -6,7 +6,7 @@
#ifndef __UM_PTRACE_GENERIC_H
#define __UM_PTRACE_GENERIC_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <sysdep/ptrace.h>
diff --git a/arch/um/include/asm/thread_info.h b/arch/um/include/asm/thread_info.h
index f9ad06fcc991..7a6f4dc99fa1 100644
--- a/arch/um/include/asm/thread_info.h
+++ b/arch/um/include/asm/thread_info.h
@@ -9,7 +9,7 @@
#define THREAD_SIZE_ORDER CONFIG_KERNEL_STACK_ORDER
#define THREAD_SIZE ((1 << CONFIG_KERNEL_STACK_ORDER) * PAGE_SIZE)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/types.h>
#include <asm/page.h>
@@ -43,6 +43,8 @@ struct thread_info {
#define TIF_NOTIFY_RESUME 8
#define TIF_SECCOMP 9 /* secure computing */
#define TIF_SINGLESTEP 10 /* single stepping userspace */
+#define TIF_SYSCALL_TRACEPOINT 11 /* syscall tracepoint instrumentation */
+
#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
#define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
@@ -50,7 +52,11 @@ struct thread_info {
#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL)
#define _TIF_MEMDIE (1 << TIF_MEMDIE)
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
+#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP)
+#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL | \
+ _TIF_NOTIFY_RESUME)
+
#endif
diff --git a/arch/um/include/shared/as-layout.h b/arch/um/include/shared/as-layout.h
index 4f44dcce8a7c..2f9bfd99460a 100644
--- a/arch/um/include/shared/as-layout.h
+++ b/arch/um/include/shared/as-layout.h
@@ -26,7 +26,7 @@
#define STUB_DATA_PAGES 2 /* must be a power of two */
#define STUB_END (STUB_DATA + STUB_DATA_PAGES * UM_KERN_PAGE_SIZE)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <sysdep/ptrace.h>
diff --git a/arch/um/include/shared/skas/mm_id.h b/arch/um/include/shared/skas/mm_id.h
index 89df9a55fbea..4f977ef5dda5 100644
--- a/arch/um/include/shared/skas/mm_id.h
+++ b/arch/um/include/shared/skas/mm_id.h
@@ -19,8 +19,6 @@ struct mm_id {
int syscall_fd_map[STUB_MAX_FDS];
};
-void __switch_mm(struct mm_id *mm_idp);
-
void notify_mm_kill(int pid);
#endif
diff --git a/arch/um/include/shared/skas/skas.h b/arch/um/include/shared/skas/skas.h
index 7d1de4cab551..807514e10538 100644
--- a/arch/um/include/shared/skas/skas.h
+++ b/arch/um/include/shared/skas/skas.h
@@ -9,7 +9,6 @@
#include <sysdep/ptrace.h>
extern int using_seccomp;
-extern int userspace_pid[];
extern void new_thread_handler(void);
extern void handle_syscall(struct uml_pt_regs *regs);
diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c
index cb8b5cd9285c..13812fa97eee 100644
--- a/arch/um/kernel/exec.c
+++ b/arch/um/kernel/exec.c
@@ -26,8 +26,6 @@ void flush_thread(void)
get_safe_registers(current_pt_regs()->regs.gp,
current_pt_regs()->regs.fp);
-
- __switch_mm(&current->mm->context.id);
}
void start_thread(struct pt_regs *regs, unsigned long eip, unsigned long esp)
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index 0cd6fad3d908..1be644de9e41 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -82,14 +82,18 @@ struct task_struct *__switch_to(struct task_struct *from, struct task_struct *to
void interrupt_end(void)
{
struct pt_regs *regs = &current->thread.regs;
-
- if (need_resched())
- schedule();
- if (test_thread_flag(TIF_SIGPENDING) ||
- test_thread_flag(TIF_NOTIFY_SIGNAL))
- do_signal(regs);
- if (test_thread_flag(TIF_NOTIFY_RESUME))
- resume_user_mode_work(regs);
+ unsigned long thread_flags;
+
+ thread_flags = read_thread_flags();
+ while (thread_flags & _TIF_WORK_MASK) {
+ if (thread_flags & _TIF_NEED_RESCHED)
+ schedule();
+ if (thread_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
+ do_signal(regs);
+ if (thread_flags & _TIF_NOTIFY_RESUME)
+ resume_user_mode_work(regs);
+ thread_flags = read_thread_flags();
+ }
}
int get_current_pid(void)
diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c
index 2124624b7817..fdbb37b5c399 100644
--- a/arch/um/kernel/ptrace.c
+++ b/arch/um/kernel/ptrace.c
@@ -9,6 +9,9 @@
#include <linux/uaccess.h>
#include <asm/ptrace-abi.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/syscalls.h>
+
void user_enable_single_step(struct task_struct *child)
{
set_tsk_thread_flag(child, TIF_SINGLESTEP);
@@ -126,6 +129,9 @@ int syscall_trace_enter(struct pt_regs *regs)
UPT_SYSCALL_ARG3(&regs->regs),
UPT_SYSCALL_ARG4(&regs->regs));
+ if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
+ trace_sys_enter(regs, UPT_SYSCALL_NR(&regs->regs));
+
if (!test_thread_flag(TIF_SYSCALL_TRACE))
return 0;
@@ -142,6 +148,9 @@ void syscall_trace_leave(struct pt_regs *regs)
if (test_thread_flag(TIF_SINGLESTEP))
send_sigtrap(&regs->regs, 0);
+ if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
+ trace_sys_exit(regs, PT_REGS_SYSCALL_RET(regs));
+
if (!test_thread_flag(TIF_SYSCALL_TRACE))
return;
diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c
index 849fafa4b54f..afe9a2f251ef 100644
--- a/arch/um/kernel/skas/mmu.c
+++ b/arch/um/kernel/skas/mmu.c
@@ -20,8 +20,8 @@
/* Ensure the stub_data struct covers the allocated area */
static_assert(sizeof(struct stub_data) == STUB_DATA_PAGES * UM_KERN_PAGE_SIZE);
-spinlock_t mm_list_lock;
-struct list_head mm_list;
+static spinlock_t mm_list_lock;
+static struct list_head mm_list;
int init_new_context(struct task_struct *task, struct mm_struct *mm)
{
diff --git a/arch/um/kernel/skas/process.c b/arch/um/kernel/skas/process.c
index 05dcdc057af9..5881b17eb987 100644
--- a/arch/um/kernel/skas/process.c
+++ b/arch/um/kernel/skas/process.c
@@ -26,8 +26,6 @@ static int __init start_kernel_proc(void *unused)
return 0;
}
-extern int userspace_pid[];
-
static char cpu0_irqstack[THREAD_SIZE] __aligned(THREAD_SIZE);
int __init start_uml(void)
diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c
index a5beaea2967e..ba7494f9bfe4 100644
--- a/arch/um/kernel/skas/syscall.c
+++ b/arch/um/kernel/skas/syscall.c
@@ -9,8 +9,8 @@
#include <kern_util.h>
#include <sysdep/ptrace.h>
#include <sysdep/ptrace_user.h>
-#include <sysdep/syscalls.h>
#include <linux/time-internal.h>
+#include <asm/syscall.h>
#include <asm/unistd.h>
#include <asm/delay.h>
@@ -43,7 +43,14 @@ void handle_syscall(struct uml_pt_regs *r)
tt_extra_sched_jiffies += 1;
if (syscall >= 0 && syscall < __NR_syscalls) {
- unsigned long ret = EXECUTE_SYSCALL(syscall, regs);
+ unsigned long ret;
+
+ ret = (*sys_call_table[syscall])(UPT_SYSCALL_ARG1(&regs->regs),
+ UPT_SYSCALL_ARG2(&regs->regs),
+ UPT_SYSCALL_ARG3(&regs->regs),
+ UPT_SYSCALL_ARG4(&regs->regs),
+ UPT_SYSCALL_ARG5(&regs->regs),
+ UPT_SYSCALL_ARG6(&regs->regs));
PT_REGS_SET_SYSCALL_RETURN(regs, ret);
diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c
index e42ffac23e3c..78f48fa9db8b 100644
--- a/arch/um/os-Linux/skas/process.c
+++ b/arch/um/os-Linux/skas/process.c
@@ -267,7 +267,7 @@ static void get_skas_faultinfo(int pid, struct faultinfo *fi)
memcpy(fi, (void *)current_stub_stack(), sizeof(*fi));
}
-static void handle_trap(int pid, struct uml_pt_regs *regs)
+static void handle_trap(struct uml_pt_regs *regs)
{
if ((UPT_IP(regs) >= STUB_START) && (UPT_IP(regs) < STUB_END))
fatal_sigsegv();
@@ -434,7 +434,6 @@ static int __init init_stub_exe_fd(void)
__initcall(init_stub_exe_fd);
int using_seccomp;
-int userspace_pid[NR_CPUS];
/**
* start_userspace() - prepare a new userspace process
@@ -548,12 +547,12 @@ out_close:
return err;
}
-int unscheduled_userspace_iterations;
+static int unscheduled_userspace_iterations;
extern unsigned long tt_extra_sched_jiffies;
void userspace(struct uml_pt_regs *regs)
{
- int err, status, op, pid = userspace_pid[0];
+ int err, status, op;
siginfo_t si_ptrace;
siginfo_t *si;
int sig;
@@ -562,6 +561,8 @@ void userspace(struct uml_pt_regs *regs)
interrupt_end();
while (1) {
+ struct mm_id *mm_id = current_mm_id();
+
/*
* When we are in time-travel mode, userspace can theoretically
* do a *lot* of work without being scheduled. The problem with
@@ -590,14 +591,12 @@ void userspace(struct uml_pt_regs *regs)
current_mm_sync();
if (using_seccomp) {
- struct mm_id *mm_id = current_mm_id();
struct stub_data *proc_data = (void *) mm_id->stack;
- int ret;
- ret = set_stub_state(regs, proc_data, singlestepping());
- if (ret) {
+ err = set_stub_state(regs, proc_data, singlestepping());
+ if (err) {
printk(UM_KERN_ERR "%s - failed to set regs: %d",
- __func__, ret);
+ __func__, err);
fatal_sigsegv();
}
@@ -623,10 +622,10 @@ void userspace(struct uml_pt_regs *regs)
mm_id->syscall_data_len = 0;
mm_id->syscall_fd_num = 0;
- ret = get_stub_state(regs, proc_data, NULL);
- if (ret) {
+ err = get_stub_state(regs, proc_data, NULL);
+ if (err) {
printk(UM_KERN_ERR "%s - failed to get regs: %d",
- __func__, ret);
+ __func__, err);
fatal_sigsegv();
}
@@ -645,8 +644,10 @@ void userspace(struct uml_pt_regs *regs)
GET_FAULTINFO_FROM_MC(regs->faultinfo, mcontext);
}
} else {
+ int pid = mm_id->pid;
+
/* Flush out any pending syscalls */
- err = syscall_stub_flush(current_mm_id());
+ err = syscall_stub_flush(mm_id);
if (err) {
if (err == -ENOMEM)
report_enomem();
@@ -756,7 +757,7 @@ void userspace(struct uml_pt_regs *regs)
handle_syscall(regs);
break;
case SIGTRAP + 0x80:
- handle_trap(pid, regs);
+ handle_trap(regs);
break;
case SIGTRAP:
relay_signal(SIGTRAP, (struct siginfo *)si, regs, NULL);
@@ -777,7 +778,6 @@ void userspace(struct uml_pt_regs *regs)
__func__, sig);
fatal_sigsegv();
}
- pid = userspace_pid[0];
interrupt_end();
/* Avoid -ERESTARTSYS handling in host */
@@ -902,8 +902,3 @@ void reboot_skas(void)
block_signals_trace();
UML_LONGJMP(&initial_jmpbuf, noreboot ? INIT_JMP_HALT : INIT_JMP_REBOOT);
}
-
-void __switch_mm(struct mm_id *mm_idp)
-{
- userspace_pid[0] = mm_idp->pid;
-}
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index edaab220d9c1..08e511657f05 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -75,6 +75,7 @@ config X86
select ARCH_ENABLE_SPLIT_PMD_PTLOCK if (PGTABLE_LEVELS > 2) && (X86_64 || X86_PAE)
select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE
select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
+ select ARCH_HAS_CPU_ATTACK_VECTORS if CPU_MITIGATIONS
select ARCH_HAS_CACHE_LINE_SIZE
select ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION
select ARCH_HAS_CPU_FINALIZE_INIT
@@ -241,7 +242,6 @@ config X86
select HAVE_GUP_FAST
select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE
select HAVE_FTRACE_GRAPH_FUNC if HAVE_FUNCTION_GRAPH_TRACER
- select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_GRAPH_FREGS if HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_GRAPH_TRACER if X86_32 || (X86_64 && DYNAMIC_FTRACE)
select HAVE_FUNCTION_TRACER
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 640fcac3af74..3f9fb3698d66 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -71,7 +71,7 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
SETUP_OBJS = $(addprefix $(obj)/,$(setup-y))
-sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [a-zA-Z] \(startup_32\|efi.._stub_entry\|efi\(32\)\?_pe_entry\|input_data\|kernel_info\|_end\|_ehead\|_text\|_e\?data\|z_.*\)$$/\#define ZO_\2 0x\1/p'
+sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [a-zA-Z] \(startup_32\|efi.._stub_entry\|efi\(32\)\?_pe_entry\|input_data\|kernel_info\|_end\|_ehead\|_text\|_e\?data\|_e\?sbat\|z_.*\)$$/\#define ZO_\2 0x\1/p'
quiet_cmd_zoffset = ZOFFSET $@
cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index f4f7b22d8113..3a38fdcdb9bd 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -106,6 +106,11 @@ vmlinux-objs-$(CONFIG_UNACCEPTED_MEMORY) += $(obj)/mem.o
vmlinux-objs-$(CONFIG_EFI) += $(obj)/efi.o
vmlinux-libs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a
vmlinux-libs-$(CONFIG_X86_64) += $(objtree)/arch/x86/boot/startup/lib.a
+vmlinux-objs-$(CONFIG_EFI_SBAT) += $(obj)/sbat.o
+
+ifdef CONFIG_EFI_SBAT
+$(obj)/sbat.o: $(CONFIG_EFI_SBAT_FILE)
+endif
$(obj)/vmlinux: $(vmlinux-objs-y) $(vmlinux-libs-y) FORCE
$(call if_changed,ld)
diff --git a/arch/x86/boot/compressed/sbat.S b/arch/x86/boot/compressed/sbat.S
new file mode 100644
index 000000000000..838f70a997dd
--- /dev/null
+++ b/arch/x86/boot/compressed/sbat.S
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Embed SBAT data in the kernel.
+ */
+ .pushsection ".sbat", "a", @progbits
+ .incbin CONFIG_EFI_SBAT_FILE
+ .popsection
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index 3b2bc61c9408..587ce3e7c504 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -43,6 +43,14 @@ SECTIONS
*(.rodata.*)
_erodata = . ;
}
+#ifdef CONFIG_EFI_SBAT
+ .sbat : ALIGN(0x1000) {
+ _sbat = . ;
+ *(.sbat)
+ _esbat = ALIGN(0x1000);
+ . = _esbat;
+ }
+#endif
.data : ALIGN(0x1000) {
_data = . ;
*(.data)
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index e1f4fd5bc8ee..9bea5a1e2c52 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -179,15 +179,11 @@ pecompat_fstart:
#else
.set pecompat_fstart, setup_size
#endif
- .ascii ".text"
- .byte 0
- .byte 0
- .byte 0
- .long ZO__data
- .long setup_size
- .long ZO__data # Size of initialized data
- # on disk
- .long setup_size
+ .ascii ".text\0\0\0"
+ .long textsize # VirtualSize
+ .long setup_size # VirtualAddress
+ .long textsize # SizeOfRawData
+ .long setup_size # PointerToRawData
.long 0 # PointerToRelocations
.long 0 # PointerToLineNumbers
.word 0 # NumberOfRelocations
@@ -196,6 +192,23 @@ pecompat_fstart:
IMAGE_SCN_MEM_READ | \
IMAGE_SCN_MEM_EXECUTE # Characteristics
+#ifdef CONFIG_EFI_SBAT
+ .ascii ".sbat\0\0\0"
+ .long ZO__esbat - ZO__sbat # VirtualSize
+ .long setup_size + ZO__sbat # VirtualAddress
+ .long ZO__esbat - ZO__sbat # SizeOfRawData
+ .long setup_size + ZO__sbat # PointerToRawData
+
+ .long 0, 0, 0
+ .long IMAGE_SCN_CNT_INITIALIZED_DATA | \
+ IMAGE_SCN_MEM_READ | \
+ IMAGE_SCN_MEM_DISCARDABLE # Characteristics
+
+ .set textsize, ZO__sbat
+#else
+ .set textsize, ZO__data
+#endif
+
.ascii ".data\0\0\0"
.long ZO__end - ZO__data # VirtualSize
.long setup_size + ZO__data # VirtualAddress
diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
index 7543a8b52c67..fc59ce78c477 100644
--- a/arch/x86/coco/sev/core.c
+++ b/arch/x86/coco/sev/core.c
@@ -1045,11 +1045,13 @@ int __init sev_es_setup_ap_jump_table(struct real_mode_header *rmh)
* This is needed by the OVMF UEFI firmware which will use whatever it finds in
* the GHCB MSR as its GHCB to talk to the hypervisor. So make sure the per-cpu
* runtime GHCBs used by the kernel are also mapped in the EFI page-table.
+ *
+ * When running under SVSM the CA page is needed too, so map it as well.
*/
-int __init sev_es_efi_map_ghcbs(pgd_t *pgd)
+int __init sev_es_efi_map_ghcbs_cas(pgd_t *pgd)
{
+ unsigned long address, pflags, pflags_enc;
struct sev_es_runtime_data *data;
- unsigned long address, pflags;
int cpu;
u64 pfn;
@@ -1057,6 +1059,7 @@ int __init sev_es_efi_map_ghcbs(pgd_t *pgd)
return 0;
pflags = _PAGE_NX | _PAGE_RW;
+ pflags_enc = cc_mkenc(pflags);
for_each_possible_cpu(cpu) {
data = per_cpu(runtime_data, cpu);
@@ -1066,6 +1069,16 @@ int __init sev_es_efi_map_ghcbs(pgd_t *pgd)
if (kernel_map_pages_in_pgd(pgd, pfn, address, 1, pflags))
return 1;
+
+ if (snp_vmpl) {
+ address = per_cpu(svsm_caa_pa, cpu);
+ if (!address)
+ return 1;
+
+ pfn = address >> PAGE_SHIFT;
+ if (kernel_map_pages_in_pgd(pgd, pfn, address, 1, pflags_enc))
+ return 1;
+ }
}
return 0;
@@ -1389,16 +1402,16 @@ int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call,
}
EXPORT_SYMBOL_GPL(snp_issue_svsm_attest_req);
-static int snp_issue_guest_request(struct snp_guest_req *req, struct snp_req_data *input,
- struct snp_guest_request_ioctl *rio)
+static int snp_issue_guest_request(struct snp_guest_req *req)
{
+ struct snp_req_data *input = &req->input;
struct ghcb_state state;
struct es_em_ctxt ctxt;
unsigned long flags;
struct ghcb *ghcb;
int ret;
- rio->exitinfo2 = SEV_RET_NO_FW_CALL;
+ req->exitinfo2 = SEV_RET_NO_FW_CALL;
/*
* __sev_get_ghcb() needs to run with IRQs disabled because it is using
@@ -1423,8 +1436,8 @@ static int snp_issue_guest_request(struct snp_guest_req *req, struct snp_req_dat
if (ret)
goto e_put;
- rio->exitinfo2 = ghcb->save.sw_exit_info_2;
- switch (rio->exitinfo2) {
+ req->exitinfo2 = ghcb->save.sw_exit_info_2;
+ switch (req->exitinfo2) {
case 0:
break;
@@ -1919,8 +1932,7 @@ static int enc_payload(struct snp_msg_desc *mdesc, u64 seqno, struct snp_guest_r
return 0;
}
-static int __handle_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req,
- struct snp_guest_request_ioctl *rio)
+static int __handle_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req)
{
unsigned long req_start = jiffies;
unsigned int override_npages = 0;
@@ -1934,7 +1946,7 @@ retry_request:
* sequence number must be incremented or the VMPCK must be deleted to
* prevent reuse of the IV.
*/
- rc = snp_issue_guest_request(req, &req->input, rio);
+ rc = snp_issue_guest_request(req);
switch (rc) {
case -ENOSPC:
/*
@@ -1987,7 +1999,7 @@ retry_request:
snp_inc_msg_seqno(mdesc);
if (override_err) {
- rio->exitinfo2 = override_err;
+ req->exitinfo2 = override_err;
/*
* If an extended guest request was issued and the supplied certificate
@@ -2005,12 +2017,20 @@ retry_request:
return rc;
}
-int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req,
- struct snp_guest_request_ioctl *rio)
+int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req)
{
u64 seqno;
int rc;
+ /*
+ * enc_payload() calls aesgcm_encrypt(), which can potentially offload to HW.
+ * The offload's DMA SG list of data to encrypt has to be in linear mapping.
+ */
+ if (!virt_addr_valid(req->req_buf) || !virt_addr_valid(req->resp_buf)) {
+ pr_warn("AES-GSM buffers must be in linear mapping");
+ return -EINVAL;
+ }
+
guard(mutex)(&snp_cmd_mutex);
/* Check if the VMPCK is not empty */
@@ -2043,14 +2063,14 @@ int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req
req->input.resp_gpa = __pa(mdesc->response);
req->input.data_gpa = req->certs_data ? __pa(req->certs_data) : 0;
- rc = __handle_guest_request(mdesc, req, rio);
+ rc = __handle_guest_request(mdesc, req);
if (rc) {
if (rc == -EIO &&
- rio->exitinfo2 == SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN))
+ req->exitinfo2 == SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN))
return rc;
pr_alert("Detected error from ASP request. rc: %d, exitinfo2: 0x%llx\n",
- rc, rio->exitinfo2);
+ rc, req->exitinfo2);
snp_disable_vmpck(mdesc);
return rc;
@@ -2069,11 +2089,10 @@ EXPORT_SYMBOL_GPL(snp_send_guest_request);
static int __init snp_get_tsc_info(void)
{
- struct snp_guest_request_ioctl *rio;
struct snp_tsc_info_resp *tsc_resp;
struct snp_tsc_info_req *tsc_req;
struct snp_msg_desc *mdesc;
- struct snp_guest_req *req;
+ struct snp_guest_req req = {};
int rc = -ENOMEM;
tsc_req = kzalloc(sizeof(*tsc_req), GFP_KERNEL);
@@ -2089,32 +2108,24 @@ static int __init snp_get_tsc_info(void)
if (!tsc_resp)
goto e_free_tsc_req;
- req = kzalloc(sizeof(*req), GFP_KERNEL);
- if (!req)
- goto e_free_tsc_resp;
-
- rio = kzalloc(sizeof(*rio), GFP_KERNEL);
- if (!rio)
- goto e_free_req;
-
mdesc = snp_msg_alloc();
if (IS_ERR_OR_NULL(mdesc))
- goto e_free_rio;
+ goto e_free_tsc_resp;
rc = snp_msg_init(mdesc, snp_vmpl);
if (rc)
goto e_free_mdesc;
- req->msg_version = MSG_HDR_VER;
- req->msg_type = SNP_MSG_TSC_INFO_REQ;
- req->vmpck_id = snp_vmpl;
- req->req_buf = tsc_req;
- req->req_sz = sizeof(*tsc_req);
- req->resp_buf = (void *)tsc_resp;
- req->resp_sz = sizeof(*tsc_resp) + AUTHTAG_LEN;
- req->exit_code = SVM_VMGEXIT_GUEST_REQUEST;
+ req.msg_version = MSG_HDR_VER;
+ req.msg_type = SNP_MSG_TSC_INFO_REQ;
+ req.vmpck_id = snp_vmpl;
+ req.req_buf = tsc_req;
+ req.req_sz = sizeof(*tsc_req);
+ req.resp_buf = (void *)tsc_resp;
+ req.resp_sz = sizeof(*tsc_resp) + AUTHTAG_LEN;
+ req.exit_code = SVM_VMGEXIT_GUEST_REQUEST;
- rc = snp_send_guest_request(mdesc, req, rio);
+ rc = snp_send_guest_request(mdesc, &req);
if (rc)
goto e_request;
@@ -2135,11 +2146,7 @@ e_request:
memzero_explicit(tsc_resp, sizeof(*tsc_resp) + AUTHTAG_LEN);
e_free_mdesc:
snp_msg_free(mdesc);
-e_free_rio:
- kfree(rio);
-e_free_req:
- kfree(req);
- e_free_tsc_resp:
+e_free_tsc_resp:
kfree(tsc_resp);
e_free_tsc_req:
kfree(tsc_req);
diff --git a/arch/x86/coco/sev/vc-handle.c b/arch/x86/coco/sev/vc-handle.c
index 0989d98da130..faf1fce89ed4 100644
--- a/arch/x86/coco/sev/vc-handle.c
+++ b/arch/x86/coco/sev/vc-handle.c
@@ -17,6 +17,7 @@
#include <linux/mm.h>
#include <linux/io.h>
#include <linux/psp-sev.h>
+#include <linux/efi.h>
#include <uapi/linux/sev-guest.h>
#include <asm/init.h>
@@ -178,9 +179,15 @@ static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt)
return ES_OK;
}
+/*
+ * User instruction decoding is also required for the EFI runtime. Even though
+ * the EFI runtime is running in kernel mode, it uses special EFI virtual
+ * address mappings that require the use of efi_mm to properly address and
+ * decode.
+ */
static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
{
- if (user_mode(ctxt->regs))
+ if (user_mode(ctxt->regs) || mm_is_efi(current->active_mm))
return __vc_decode_user_insn(ctxt);
else
return __vc_decode_kern_insn(ctxt);
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 7cd2f395f301..79fa38ca954d 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -27,10 +27,12 @@ CONFIG_CGROUP_DEBUG=y
CONFIG_BLK_DEV_INITRD=y
CONFIG_KALLSYMS_ALL=y
CONFIG_PROFILING=y
+CONFIG_KEXEC=y
+# Do not remove this as it results in non-bootable kernels
+# CONFIG_64BIT is not set
CONFIG_SMP=y
CONFIG_HYPERVISOR_GUEST=y
CONFIG_PARAVIRT=y
-CONFIG_NR_CPUS=8
CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y
CONFIG_X86_MSR=y
CONFIG_X86_CPUID=y
@@ -39,9 +41,6 @@ CONFIG_X86_CHECK_BIOS_CORRUPTION=y
CONFIG_EFI=y
CONFIG_EFI_STUB=y
CONFIG_HZ_1000=y
-CONFIG_KEXEC=y
-CONFIG_CRASH_DUMP=y
-# CONFIG_MITIGATION_RETHUNK is not set
CONFIG_HIBERNATION=y
CONFIG_PM_DEBUG=y
CONFIG_PM_TRACE_RTC=y
@@ -52,7 +51,6 @@ CONFIG_CPU_FREQ_GOV_ONDEMAND=y
CONFIG_X86_ACPI_CPUFREQ=y
CONFIG_KPROBES=y
CONFIG_JUMP_LABEL=y
-CONFIG_COMPAT_32BIT_TIME=y
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
CONFIG_MODULE_FORCE_UNLOAD=y
@@ -63,9 +61,7 @@ CONFIG_BINFMT_MISC=y
# CONFIG_COMPAT_BRK is not set
CONFIG_NET=y
CONFIG_PACKET=y
-CONFIG_UNIX=y
CONFIG_XFRM_USER=y
-CONFIG_INET=y
CONFIG_IP_MULTICAST=y
CONFIG_IP_ADVANCED_ROUTER=y
CONFIG_IP_MULTIPLE_TABLES=y
@@ -134,7 +130,6 @@ CONFIG_DEVTMPFS=y
CONFIG_DEVTMPFS_MOUNT=y
CONFIG_DEBUG_DEVRES=y
CONFIG_CONNECTOR=y
-CONFIG_EFI_CAPSULE_LOADER=y
CONFIG_BLK_DEV_LOOP=y
CONFIG_VIRTIO_BLK=y
CONFIG_BLK_DEV_SD=y
@@ -210,7 +205,6 @@ CONFIG_SND_HDA_INTEL=y
CONFIG_SND_HDA_HWDEP=y
CONFIG_HIDRAW=y
CONFIG_HID_GYRATION=y
-CONFIG_LOGITECH_FF=y
CONFIG_HID_NTRIG=y
CONFIG_HID_PANTHERLORD=y
CONFIG_PANTHERLORD_FF=y
@@ -241,7 +235,6 @@ CONFIG_EXT4_FS_POSIX_ACL=y
CONFIG_EXT4_FS_SECURITY=y
CONFIG_QUOTA=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
-# CONFIG_PRINT_QUOTA_WARNING is not set
CONFIG_QFMT_V2=y
CONFIG_AUTOFS_FS=y
CONFIG_ISO9660_FS=y
@@ -266,19 +259,13 @@ CONFIG_SECURITY=y
CONFIG_SECURITY_NETWORK=y
CONFIG_SECURITY_SELINUX=y
CONFIG_SECURITY_SELINUX_BOOTPARAM=y
-CONFIG_SECURITY_SELINUX_DISABLE=y
CONFIG_PRINTK_TIME=y
CONFIG_DEBUG_KERNEL=y
-CONFIG_FRAME_WARN=1024
CONFIG_MAGIC_SYSRQ=y
-CONFIG_DEBUG_WX=y
CONFIG_DEBUG_STACK_USAGE=y
-# CONFIG_SCHED_DEBUG is not set
CONFIG_SCHEDSTATS=y
CONFIG_BLK_DEV_IO_TRACE=y
CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
CONFIG_EARLY_PRINTK_DBGP=y
CONFIG_DEBUG_BOOT_PARAMS=y
-CONFIG_UNWINDER_FRAME_POINTER=y
CONFIG_DEBUG_ENTRY=y
-# CONFIG_64BIT is not set
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 61e25f6209ed..7d7310cdf8b0 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -27,6 +27,7 @@ CONFIG_CGROUP_DEBUG=y
CONFIG_BLK_DEV_INITRD=y
CONFIG_KALLSYMS_ALL=y
CONFIG_PROFILING=y
+CONFIG_KEXEC=y
CONFIG_SMP=y
CONFIG_HYPERVISOR_GUEST=y
CONFIG_PARAVIRT=y
@@ -40,8 +41,6 @@ CONFIG_EFI=y
CONFIG_EFI_STUB=y
CONFIG_EFI_MIXED=y
CONFIG_HZ_1000=y
-CONFIG_KEXEC=y
-CONFIG_CRASH_DUMP=y
CONFIG_HIBERNATION=y
CONFIG_PM_DEBUG=y
CONFIG_PM_TRACE_RTC=y
@@ -63,9 +62,7 @@ CONFIG_BINFMT_MISC=y
# CONFIG_COMPAT_BRK is not set
CONFIG_NET=y
CONFIG_PACKET=y
-CONFIG_UNIX=y
CONFIG_XFRM_USER=y
-CONFIG_INET=y
CONFIG_IP_MULTICAST=y
CONFIG_IP_ADVANCED_ROUTER=y
CONFIG_IP_MULTIPLE_TABLES=y
@@ -205,7 +202,6 @@ CONFIG_SND_HDA_INTEL=y
CONFIG_SND_HDA_HWDEP=y
CONFIG_HIDRAW=y
CONFIG_HID_GYRATION=y
-CONFIG_LOGITECH_FF=y
CONFIG_HID_NTRIG=y
CONFIG_HID_PANTHERLORD=y
CONFIG_PANTHERLORD_FF=y
@@ -239,7 +235,6 @@ CONFIG_EXT4_FS_POSIX_ACL=y
CONFIG_EXT4_FS_SECURITY=y
CONFIG_QUOTA=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
-# CONFIG_PRINT_QUOTA_WARNING is not set
CONFIG_QFMT_V2=y
CONFIG_AUTOFS_FS=y
CONFIG_ISO9660_FS=y
@@ -264,13 +259,11 @@ CONFIG_SECURITY=y
CONFIG_SECURITY_NETWORK=y
CONFIG_SECURITY_SELINUX=y
CONFIG_SECURITY_SELINUX_BOOTPARAM=y
-CONFIG_SECURITY_SELINUX_DISABLE=y
CONFIG_PRINTK_TIME=y
CONFIG_DEBUG_KERNEL=y
CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_WX=y
CONFIG_DEBUG_STACK_USAGE=y
-# CONFIG_SCHED_DEBUG is not set
CONFIG_SCHEDSTATS=y
CONFIG_BLK_DEV_IO_TRACE=y
CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index e0815a12db90..a762f7f5b161 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1807,6 +1807,12 @@ static const struct intel_uncore_init_fun lnl_uncore_init __initconst = {
.mmio_init = lnl_uncore_mmio_init,
};
+static const struct intel_uncore_init_fun ptl_uncore_init __initconst = {
+ .cpu_init = ptl_uncore_cpu_init,
+ .mmio_init = ptl_uncore_mmio_init,
+ .use_discovery = true,
+};
+
static const struct intel_uncore_init_fun icx_uncore_init __initconst = {
.cpu_init = icx_uncore_cpu_init,
.pci_init = icx_uncore_pci_init,
@@ -1888,6 +1894,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = {
X86_MATCH_VFM(INTEL_ARROWLAKE_U, &mtl_uncore_init),
X86_MATCH_VFM(INTEL_ARROWLAKE_H, &mtl_uncore_init),
X86_MATCH_VFM(INTEL_LUNARLAKE_M, &lnl_uncore_init),
+ X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &ptl_uncore_init),
X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &spr_uncore_init),
X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &spr_uncore_init),
X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, &gnr_uncore_init),
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 3dcb88c0ecfa..d8815fff7588 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -612,10 +612,12 @@ void tgl_uncore_cpu_init(void);
void adl_uncore_cpu_init(void);
void lnl_uncore_cpu_init(void);
void mtl_uncore_cpu_init(void);
+void ptl_uncore_cpu_init(void);
void tgl_uncore_mmio_init(void);
void tgl_l_uncore_mmio_init(void);
void adl_uncore_mmio_init(void);
void lnl_uncore_mmio_init(void);
+void ptl_uncore_mmio_init(void);
int snb_pci2phy_map_init(int devid);
/* uncore_snbep.c */
diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c
index 18a3022f26a0..7d57ce706feb 100644
--- a/arch/x86/events/intel/uncore_discovery.c
+++ b/arch/x86/events/intel/uncore_discovery.c
@@ -274,32 +274,15 @@ uncore_ignore_unit(struct uncore_unit_discovery *unit, int *ignore)
return false;
}
-static int parse_discovery_table(struct pci_dev *dev, int die,
- u32 bar_offset, bool *parsed,
- int *ignore)
+static int __parse_discovery_table(resource_size_t addr, int die,
+ bool *parsed, int *ignore)
{
struct uncore_global_discovery global;
struct uncore_unit_discovery unit;
void __iomem *io_addr;
- resource_size_t addr;
unsigned long size;
- u32 val;
int i;
- pci_read_config_dword(dev, bar_offset, &val);
-
- if (val & ~PCI_BASE_ADDRESS_MEM_MASK & ~PCI_BASE_ADDRESS_MEM_TYPE_64)
- return -EINVAL;
-
- addr = (resource_size_t)(val & PCI_BASE_ADDRESS_MEM_MASK);
-#ifdef CONFIG_PHYS_ADDR_T_64BIT
- if ((val & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == PCI_BASE_ADDRESS_MEM_TYPE_64) {
- u32 val2;
-
- pci_read_config_dword(dev, bar_offset + 4, &val2);
- addr |= ((resource_size_t)val2) << 32;
- }
-#endif
size = UNCORE_DISCOVERY_GLOBAL_MAP_SIZE;
io_addr = ioremap(addr, size);
if (!io_addr)
@@ -342,7 +325,32 @@ static int parse_discovery_table(struct pci_dev *dev, int die,
return 0;
}
-bool intel_uncore_has_discovery_tables(int *ignore)
+static int parse_discovery_table(struct pci_dev *dev, int die,
+ u32 bar_offset, bool *parsed,
+ int *ignore)
+{
+ resource_size_t addr;
+ u32 val;
+
+ pci_read_config_dword(dev, bar_offset, &val);
+
+ if (val & ~PCI_BASE_ADDRESS_MEM_MASK & ~PCI_BASE_ADDRESS_MEM_TYPE_64)
+ return -EINVAL;
+
+ addr = (resource_size_t)(val & PCI_BASE_ADDRESS_MEM_MASK);
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+ if ((val & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == PCI_BASE_ADDRESS_MEM_TYPE_64) {
+ u32 val2;
+
+ pci_read_config_dword(dev, bar_offset + 4, &val2);
+ addr |= ((resource_size_t)val2) << 32;
+ }
+#endif
+
+ return __parse_discovery_table(addr, die, parsed, ignore);
+}
+
+static bool intel_uncore_has_discovery_tables_pci(int *ignore)
{
u32 device, val, entry_id, bar_offset;
int die, dvsec = 0, ret = true;
@@ -391,6 +399,45 @@ err:
return ret;
}
+static bool intel_uncore_has_discovery_tables_msr(int *ignore)
+{
+ unsigned long *die_mask;
+ bool parsed = false;
+ int cpu, die;
+ u64 base;
+
+ die_mask = kcalloc(BITS_TO_LONGS(uncore_max_dies()),
+ sizeof(unsigned long), GFP_KERNEL);
+ if (!die_mask)
+ return false;
+
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ die = topology_logical_die_id(cpu);
+ if (__test_and_set_bit(die, die_mask))
+ continue;
+
+ if (rdmsrq_safe_on_cpu(cpu, UNCORE_DISCOVERY_MSR, &base))
+ continue;
+
+ if (!base)
+ continue;
+
+ __parse_discovery_table(base, die, &parsed, ignore);
+ }
+
+ cpus_read_unlock();
+
+ kfree(die_mask);
+ return parsed;
+}
+
+bool intel_uncore_has_discovery_tables(int *ignore)
+{
+ return intel_uncore_has_discovery_tables_msr(ignore) ||
+ intel_uncore_has_discovery_tables_pci(ignore);
+}
+
void intel_uncore_clear_discovery_tables(void)
{
struct intel_uncore_discovery_type *type, *next;
@@ -604,7 +651,7 @@ void intel_generic_uncore_mmio_init_box(struct intel_uncore_box *box)
}
addr = unit->addr;
- box->io_addr = ioremap(addr, UNCORE_GENERIC_MMIO_SIZE);
+ box->io_addr = ioremap(addr, type->mmio_map_size);
if (!box->io_addr) {
pr_warn("Uncore type %d box %d: ioremap error for 0x%llx.\n",
type->type_id, unit->id, (unsigned long long)addr);
diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h
index 0e94aa7db8e7..dff75c98e22f 100644
--- a/arch/x86/events/intel/uncore_discovery.h
+++ b/arch/x86/events/intel/uncore_discovery.h
@@ -1,5 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0-only */
+/* Store the full address of the global discovery table */
+#define UNCORE_DISCOVERY_MSR 0x201e
+
/* Generic device ID of a discovery table device */
#define UNCORE_DISCOVERY_TABLE_DEVICE 0x09a7
/* Capability ID for a discovery table device */
@@ -168,3 +171,7 @@ bool intel_generic_uncore_assign_hw_event(struct perf_event *event,
struct intel_uncore_box *box);
void uncore_find_add_unit(struct intel_uncore_discovery_unit *node,
struct rb_root *root, u16 *num_units);
+struct intel_uncore_type **
+uncore_get_uncores(enum uncore_access_type type_id, int num_extra,
+ struct intel_uncore_type **extra, int max_num_types,
+ struct intel_uncore_type **uncores);
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index a1a96833e30e..807e582b8f17 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -1855,3 +1855,82 @@ void lnl_uncore_mmio_init(void)
}
/* end of Lunar Lake MMIO uncore support */
+
+/* Panther Lake uncore support */
+
+#define UNCORE_PTL_MAX_NUM_UNCORE_TYPES 42
+#define UNCORE_PTL_TYPE_IMC 6
+#define UNCORE_PTL_TYPE_SNCU 34
+#define UNCORE_PTL_TYPE_HBO 41
+
+#define PTL_UNCORE_GLOBAL_CTL_OFFSET 0x380
+
+static struct intel_uncore_type ptl_uncore_imc = {
+ .name = "imc",
+ .mmio_map_size = 0xf00,
+};
+
+static void ptl_uncore_sncu_init_box(struct intel_uncore_box *box)
+{
+ intel_generic_uncore_mmio_init_box(box);
+
+ /* Clear the global freeze bit */
+ if (box->io_addr)
+ writel(0, box->io_addr + PTL_UNCORE_GLOBAL_CTL_OFFSET);
+}
+
+static struct intel_uncore_ops ptl_uncore_sncu_ops = {
+ .init_box = ptl_uncore_sncu_init_box,
+ .exit_box = uncore_mmio_exit_box,
+ .disable_box = intel_generic_uncore_mmio_disable_box,
+ .enable_box = intel_generic_uncore_mmio_enable_box,
+ .disable_event = intel_generic_uncore_mmio_disable_event,
+ .enable_event = intel_generic_uncore_mmio_enable_event,
+ .read_counter = uncore_mmio_read_counter,
+};
+
+static struct intel_uncore_type ptl_uncore_sncu = {
+ .name = "sncu",
+ .ops = &ptl_uncore_sncu_ops,
+ .mmio_map_size = 0xf00,
+};
+
+static struct intel_uncore_type ptl_uncore_hbo = {
+ .name = "hbo",
+ .mmio_map_size = 0xf00,
+};
+
+static struct intel_uncore_type *ptl_uncores[UNCORE_PTL_MAX_NUM_UNCORE_TYPES] = {
+ [UNCORE_PTL_TYPE_IMC] = &ptl_uncore_imc,
+ [UNCORE_PTL_TYPE_SNCU] = &ptl_uncore_sncu,
+ [UNCORE_PTL_TYPE_HBO] = &ptl_uncore_hbo,
+};
+
+#define UNCORE_PTL_MMIO_EXTRA_UNCORES 1
+
+static struct intel_uncore_type *ptl_mmio_extra_uncores[UNCORE_PTL_MMIO_EXTRA_UNCORES] = {
+ &adl_uncore_imc_free_running,
+};
+
+void ptl_uncore_mmio_init(void)
+{
+ uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO,
+ UNCORE_PTL_MMIO_EXTRA_UNCORES,
+ ptl_mmio_extra_uncores,
+ UNCORE_PTL_MAX_NUM_UNCORE_TYPES,
+ ptl_uncores);
+}
+
+static struct intel_uncore_type *ptl_msr_uncores[] = {
+ &mtl_uncore_cbox,
+ NULL
+};
+
+void ptl_uncore_cpu_init(void)
+{
+ mtl_uncore_cbox.num_boxes = 6;
+ mtl_uncore_cbox.ops = &lnl_uncore_msr_ops;
+ uncore_msr_uncores = ptl_msr_uncores;
+}
+
+/* end of Panther Lake uncore support */
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 2824dc9950be..e1f370b8d065 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -6409,9 +6409,11 @@ static void uncore_type_customized_copy(struct intel_uncore_type *to_type,
to_type->get_topology = from_type->get_topology;
if (from_type->cleanup_mapping)
to_type->cleanup_mapping = from_type->cleanup_mapping;
+ if (from_type->mmio_map_size)
+ to_type->mmio_map_size = from_type->mmio_map_size;
}
-static struct intel_uncore_type **
+struct intel_uncore_type **
uncore_get_uncores(enum uncore_access_type type_id, int num_extra,
struct intel_uncore_type **extra, int max_num_types,
struct intel_uncore_type **uncores)
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 23d86c9750b9..07ba4935e873 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -488,11 +488,14 @@ static inline void apic_setup_apic_calls(void) { }
extern void apic_ack_irq(struct irq_data *data);
+#define APIC_VECTOR_TO_BIT_NUMBER(v) ((unsigned int)(v) % 32)
+#define APIC_VECTOR_TO_REG_OFFSET(v) ((unsigned int)(v) / 32 * 0x10)
+
static inline bool lapic_vector_set_in_irr(unsigned int vector)
{
- u32 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
+ u32 irr = apic_read(APIC_IRR + APIC_VECTOR_TO_REG_OFFSET(vector));
- return !!(irr & (1U << (vector % 32)));
+ return !!(irr & (1U << APIC_VECTOR_TO_BIT_NUMBER(vector)));
}
static inline bool is_vector_pending(unsigned int vector)
@@ -500,6 +503,65 @@ static inline bool is_vector_pending(unsigned int vector)
return lapic_vector_set_in_irr(vector) || pi_pending_this_cpu(vector);
}
+#define MAX_APIC_VECTOR 256
+#define APIC_VECTORS_PER_REG 32
+
+/*
+ * Vector states are maintained by APIC in 32-bit registers that are
+ * 16 bytes aligned. The status of each vector is kept in a single
+ * bit.
+ */
+static inline int apic_find_highest_vector(void *bitmap)
+{
+ int vec;
+ u32 *reg;
+
+ for (vec = MAX_APIC_VECTOR - APIC_VECTORS_PER_REG; vec >= 0; vec -= APIC_VECTORS_PER_REG) {
+ reg = bitmap + APIC_VECTOR_TO_REG_OFFSET(vec);
+ if (*reg)
+ return __fls(*reg) + vec;
+ }
+
+ return -1;
+}
+
+static inline u32 apic_get_reg(void *regs, int reg)
+{
+ return *((u32 *) (regs + reg));
+}
+
+static inline void apic_set_reg(void *regs, int reg, u32 val)
+{
+ *((u32 *) (regs + reg)) = val;
+}
+
+static __always_inline u64 apic_get_reg64(void *regs, int reg)
+{
+ BUILD_BUG_ON(reg != APIC_ICR);
+ return *((u64 *) (regs + reg));
+}
+
+static __always_inline void apic_set_reg64(void *regs, int reg, u64 val)
+{
+ BUILD_BUG_ON(reg != APIC_ICR);
+ *((u64 *) (regs + reg)) = val;
+}
+
+static inline void apic_clear_vector(int vec, void *bitmap)
+{
+ clear_bit(APIC_VECTOR_TO_BIT_NUMBER(vec), bitmap + APIC_VECTOR_TO_REG_OFFSET(vec));
+}
+
+static inline void apic_set_vector(int vec, void *bitmap)
+{
+ set_bit(APIC_VECTOR_TO_BIT_NUMBER(vec), bitmap + APIC_VECTOR_TO_REG_OFFSET(vec));
+}
+
+static inline int apic_test_vector(int vec, void *bitmap)
+{
+ return test_bit(APIC_VECTOR_TO_BIT_NUMBER(vec), bitmap + APIC_VECTOR_TO_REG_OFFSET(vec));
+}
+
/*
* Warm reset vector position:
*/
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 286d509f9363..602957dd2609 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -458,9 +458,12 @@
#define X86_FEATURE_LFENCE_RDTSC (20*32+ 2) /* LFENCE always serializing / synchronizes RDTSC */
#define X86_FEATURE_VERW_CLEAR (20*32+ 5) /* The memory form of VERW mitigates TSA */
#define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* Null Selector Clears Base */
+
#define X86_FEATURE_AUTOIBRS (20*32+ 8) /* Automatic IBRS */
#define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* SMM_CTL MSR is not present */
+#define X86_FEATURE_GP_ON_USER_CPUID (20*32+17) /* User CPUID faulting */
+
#define X86_FEATURE_PREFETCHI (20*32+20) /* Prefetch Data/Instruction to Cache Level */
#define X86_FEATURE_SBPB (20*32+27) /* Selective Branch Prediction Barrier */
#define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* MSR_PRED_CMD[IBPB] flushes all branch type predictions */
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 1c94121acd3d..93e99d2583d6 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -118,7 +118,7 @@ enum xfeature {
XFEATURE_PKRU,
XFEATURE_PASID,
XFEATURE_CET_USER,
- XFEATURE_CET_KERNEL_UNUSED,
+ XFEATURE_CET_KERNEL,
XFEATURE_RSRVD_COMP_13,
XFEATURE_RSRVD_COMP_14,
XFEATURE_LBR,
@@ -142,7 +142,7 @@ enum xfeature {
#define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU)
#define XFEATURE_MASK_PASID (1 << XFEATURE_PASID)
#define XFEATURE_MASK_CET_USER (1 << XFEATURE_CET_USER)
-#define XFEATURE_MASK_CET_KERNEL (1 << XFEATURE_CET_KERNEL_UNUSED)
+#define XFEATURE_MASK_CET_KERNEL (1 << XFEATURE_CET_KERNEL)
#define XFEATURE_MASK_LBR (1 << XFEATURE_LBR)
#define XFEATURE_MASK_XTILE_CFG (1 << XFEATURE_XTILE_CFG)
#define XFEATURE_MASK_XTILE_DATA (1 << XFEATURE_XTILE_DATA)
@@ -269,6 +269,16 @@ struct cet_user_state {
};
/*
+ * State component 12 is Control-flow Enforcement supervisor states.
+ * This state includes SSP pointers for privilege levels 0 through 2.
+ */
+struct cet_supervisor_state {
+ u64 pl0_ssp;
+ u64 pl1_ssp;
+ u64 pl2_ssp;
+} __packed;
+
+/*
* State component 15: Architectural LBR configuration state.
* The size of Arch LBR state depends on the number of LBRs (lbr_depth).
*/
@@ -552,6 +562,31 @@ struct fpu_guest {
};
/*
+ * FPU state configuration data for fpu_guest.
+ * Initialized at boot time. Read only after init.
+ */
+struct vcpu_fpu_config {
+ /*
+ * @size:
+ *
+ * The default size of the register state buffer in guest FPUs.
+ * Includes all supported features except independent managed
+ * features and features which have to be requested by user space
+ * before usage.
+ */
+ unsigned int size;
+
+ /*
+ * @features:
+ *
+ * The default supported features bitmap in guest FPUs. Does not
+ * include independent managed features and features which have to
+ * be requested by user space before usage.
+ */
+ u64 features;
+};
+
+/*
* FPU state configuration data. Initialized at boot time. Read only after init.
*/
struct fpu_state_config {
@@ -567,8 +602,9 @@ struct fpu_state_config {
* @default_size:
*
* The default size of the register state buffer. Includes all
- * supported features except independent managed features and
- * features which have to be requested by user space before usage.
+ * supported features except independent managed features,
+ * guest-only features and features which have to be requested by
+ * user space before usage.
*/
unsigned int default_size;
@@ -584,8 +620,8 @@ struct fpu_state_config {
* @default_features:
*
* The default supported features bitmap. Does not include
- * independent managed features and features which have to
- * be requested by user space before usage.
+ * independent managed features, guest-only features and features
+ * which have to be requested by user space before usage.
*/
u64 default_features;
/*
@@ -606,5 +642,6 @@ struct fpu_state_config {
/* FPU state configuration information */
extern struct fpu_state_config fpu_kernel_cfg, fpu_user_cfg;
+extern struct vcpu_fpu_config guest_default_cfg;
#endif /* _ASM_X86_FPU_TYPES_H */
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index b308a76afbb7..7a7dc9d56027 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -46,9 +46,13 @@
/* Features which are dynamically enabled for a process on request */
#define XFEATURE_MASK_USER_DYNAMIC XFEATURE_MASK_XTILE_DATA
+/* Supervisor features which are enabled only in guest FPUs */
+#define XFEATURE_MASK_GUEST_SUPERVISOR XFEATURE_MASK_CET_KERNEL
+
/* All currently supported supervisor features */
#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID | \
- XFEATURE_MASK_CET_USER)
+ XFEATURE_MASK_CET_USER | \
+ XFEATURE_MASK_GUEST_SUPERVISOR)
/*
* A supervisor state component may not always contain valuable information,
@@ -75,8 +79,7 @@
* Unsupported supervisor features. When a supervisor feature in this mask is
* supported in the future, move it to the supported supervisor feature mask.
*/
-#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT | \
- XFEATURE_MASK_CET_KERNEL)
+#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT)
/* All supervisor states including supported and unsupported states. */
#define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 6bfdaeddbae8..5a68e9db6518 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -5,7 +5,7 @@
#if defined(CONFIG_CC_IS_CLANG) && CONFIG_CLANG_VERSION < 170000
#define __head __section(".head.text") __no_sanitize_undefined __no_stack_protector
#else
-#define __head __section(".head.text") __no_sanitize_undefined __no_sanitize_coverage
+#define __head __section(".head.text") __no_sanitize_undefined __no_kstack_erase
#endif
struct x86_mapping_info {
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 5036f13ab69f..5a0d42464d44 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -26,7 +26,22 @@ enum {
IRQ_REMAP_X2APIC_MODE,
};
-struct vcpu_data {
+/*
+ * This is mainly used to communicate information back-and-forth
+ * between SVM and IOMMU for setting up and tearing down posted
+ * interrupt
+ */
+struct amd_iommu_pi_data {
+ u64 vapic_addr; /* Physical address of the vCPU's vAPIC. */
+ u32 ga_tag;
+ u32 vector; /* Guest vector of the interrupt */
+ int cpu;
+ bool ga_log_intr;
+ bool is_guest_mode;
+ void *ir_data;
+};
+
+struct intel_iommu_pi_data {
u64 pi_desc_addr; /* Physical address of PI Descriptor */
u32 vector; /* Guest vector of the interrupt */
};
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 8d50e3e0a19b..18a5c3119e1a 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -49,7 +49,6 @@ KVM_X86_OP(set_idt)
KVM_X86_OP(get_gdt)
KVM_X86_OP(set_gdt)
KVM_X86_OP(sync_dirty_debug_regs)
-KVM_X86_OP(set_dr6)
KVM_X86_OP(set_dr7)
KVM_X86_OP(cache_reg)
KVM_X86_OP(get_rflags)
@@ -112,7 +111,7 @@ KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging)
KVM_X86_OP_OPTIONAL(vcpu_blocking)
KVM_X86_OP_OPTIONAL(vcpu_unblocking)
KVM_X86_OP_OPTIONAL(pi_update_irte)
-KVM_X86_OP_OPTIONAL(pi_start_assignment)
+KVM_X86_OP_OPTIONAL(pi_start_bypass)
KVM_X86_OP_OPTIONAL(apicv_pre_state_restore)
KVM_X86_OP_OPTIONAL(apicv_post_state_restore)
KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt)
@@ -139,7 +138,7 @@ KVM_X86_OP(check_emulate_instruction)
KVM_X86_OP(apic_init_signal_blocked)
KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush)
KVM_X86_OP_OPTIONAL(migrate_timers)
-KVM_X86_OP(msr_filter_changed)
+KVM_X86_OP(recalc_msr_intercepts)
KVM_X86_OP(complete_emulated_msr)
KVM_X86_OP(vcpu_deliver_sipi_vector)
KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f7af967aa16f..f19a76d3ca0e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -297,6 +297,7 @@ enum x86_intercept_stage;
*/
#define KVM_APIC_PV_EOI_PENDING 1
+struct kvm_kernel_irqfd;
struct kvm_kernel_irq_routing_entry;
/*
@@ -1320,6 +1321,12 @@ enum kvm_apicv_inhibit {
*/
APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED,
+ /*
+ * AVIC is disabled because the vCPU's APIC ID is beyond the max
+ * supported by AVIC/x2AVIC, i.e. the vCPU is unaddressable.
+ */
+ APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG,
+
NR_APICV_INHIBIT_REASONS,
};
@@ -1338,7 +1345,8 @@ enum kvm_apicv_inhibit {
__APICV_INHIBIT_REASON(IRQWIN), \
__APICV_INHIBIT_REASON(PIT_REINJ), \
__APICV_INHIBIT_REASON(SEV), \
- __APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED)
+ __APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED), \
+ __APICV_INHIBIT_REASON(PHYSICAL_ID_TOO_BIG)
struct kvm_arch {
unsigned long n_used_mmu_pages;
@@ -1350,7 +1358,7 @@ struct kvm_arch {
bool has_private_mem;
bool has_protected_state;
bool pre_fault_allowed;
- struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
+ struct hlist_head *mmu_page_hash;
struct list_head active_mmu_pages;
/*
* A list of kvm_mmu_page structs that, if zapped, could possibly be
@@ -1379,11 +1387,13 @@ struct kvm_arch {
#define __KVM_HAVE_ARCH_NONCOHERENT_DMA
atomic_t noncoherent_dma_count;
-#define __KVM_HAVE_ARCH_ASSIGNED_DEVICE
- atomic_t assigned_device_count;
+ unsigned long nr_possible_bypass_irqs;
+
+#ifdef CONFIG_KVM_IOAPIC
struct kvm_pic *vpic;
struct kvm_ioapic *vioapic;
struct kvm_pit *vpit;
+#endif
atomic_t vapics_in_nmi_mode;
struct mutex apic_map_lock;
struct kvm_apic_map __rcu *apic_map;
@@ -1398,12 +1408,8 @@ struct kvm_arch {
gpa_t wall_clock;
- bool mwait_in_guest;
- bool hlt_in_guest;
- bool pause_in_guest;
- bool cstate_in_guest;
+ u64 disabled_exits;
- unsigned long irq_sources_bitmap;
s64 kvmclock_offset;
/*
@@ -1432,9 +1438,6 @@ struct kvm_arch {
struct delayed_work kvmclock_update_work;
struct delayed_work kvmclock_sync_work;
- /* reads protected by irq_srcu, writes by irq_lock */
- struct hlist_head mask_notifier_list;
-
#ifdef CONFIG_KVM_HYPERV
struct kvm_hv hyperv;
#endif
@@ -1457,6 +1460,7 @@ struct kvm_arch {
bool x2apic_format;
bool x2apic_broadcast_quirk_disabled;
+ bool has_mapped_host_mmio;
bool guest_can_read_msr_platform_info;
bool exception_payload_enabled;
@@ -1680,6 +1684,12 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
return dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL;
}
+enum kvm_x86_run_flags {
+ KVM_RUN_FORCE_IMMEDIATE_EXIT = BIT(0),
+ KVM_RUN_LOAD_GUEST_DR6 = BIT(1),
+ KVM_RUN_LOAD_DEBUGCTL = BIT(2),
+};
+
struct kvm_x86_ops {
const char *name;
@@ -1708,6 +1718,12 @@ struct kvm_x86_ops {
void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
void (*vcpu_put)(struct kvm_vcpu *vcpu);
+ /*
+ * Mask of DEBUGCTL bits that are owned by the host, i.e. that need to
+ * match the host's value even while the guest is active.
+ */
+ const u64 HOST_OWNED_DEBUGCTL;
+
void (*update_exception_bitmap)(struct kvm_vcpu *vcpu);
int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
@@ -1730,7 +1746,6 @@ struct kvm_x86_ops {
void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu);
- void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value);
void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
@@ -1761,7 +1776,7 @@ struct kvm_x86_ops {
int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu,
- bool force_immediate_exit);
+ u64 run_flags);
int (*handle_exit)(struct kvm_vcpu *vcpu,
enum exit_fastpath_completion exit_fastpath);
int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
@@ -1853,9 +1868,10 @@ struct kvm_x86_ops {
void (*vcpu_blocking)(struct kvm_vcpu *vcpu);
void (*vcpu_unblocking)(struct kvm_vcpu *vcpu);
- int (*pi_update_irte)(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set);
- void (*pi_start_assignment)(struct kvm *kvm);
+ int (*pi_update_irte)(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+ unsigned int host_irq, uint32_t guest_irq,
+ struct kvm_vcpu *vcpu, u32 vector);
+ void (*pi_start_bypass)(struct kvm *kvm);
void (*apicv_pre_state_restore)(struct kvm_vcpu *vcpu);
void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu);
@@ -1892,7 +1908,7 @@ struct kvm_x86_ops {
int (*enable_l2_tlb_flush)(struct kvm_vcpu *vcpu);
void (*migrate_timers)(struct kvm_vcpu *vcpu);
- void (*msr_filter_changed)(struct kvm_vcpu *vcpu);
+ void (*recalc_msr_intercepts)(struct kvm_vcpu *vcpu);
int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err);
void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector);
@@ -1950,6 +1966,7 @@ struct kvm_arch_async_pf {
extern u32 __read_mostly kvm_nr_uret_msrs;
extern bool __read_mostly allow_smaller_maxphyaddr;
extern bool __read_mostly enable_apicv;
+extern bool __read_mostly enable_ipiv;
extern bool __read_mostly enable_device_posted_irqs;
extern struct kvm_x86_ops kvm_x86_ops;
@@ -1968,7 +1985,7 @@ void kvm_x86_vendor_exit(void);
#define __KVM_HAVE_ARCH_VM_ALLOC
static inline struct kvm *kvm_arch_alloc_vm(void)
{
- return __vmalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ return kvzalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT);
}
#define __KVM_HAVE_ARCH_VM_FREE
@@ -2013,7 +2030,7 @@ void kvm_mmu_vendor_module_exit(void);
void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
int kvm_mmu_create(struct kvm_vcpu *vcpu);
-void kvm_mmu_init_vm(struct kvm *kvm);
+int kvm_mmu_init_vm(struct kvm *kvm);
void kvm_mmu_uninit_vm(struct kvm *kvm);
void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm,
@@ -2044,19 +2061,6 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
const void *val, int bytes);
-struct kvm_irq_mask_notifier {
- void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked);
- int irq;
- struct hlist_node link;
-};
-
-void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
- struct kvm_irq_mask_notifier *kimn);
-void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
- struct kvm_irq_mask_notifier *kimn);
-void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
- bool mask);
-
extern bool tdp_enabled;
u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
@@ -2215,9 +2219,6 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state,
return !!(*irq_state);
}
-int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level);
-void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
-
void kvm_inject_nmi(struct kvm_vcpu *vcpu);
int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu);
@@ -2394,9 +2395,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_vcpu **dest_vcpu);
-void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
- struct kvm_lapic_irq *irq);
-
static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq)
{
/* We can only post Fixed and LowPrio IRQs */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 5cfb5d74dd5f..b65c3ba5fa14 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -419,6 +419,7 @@
#define DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI (1UL << 12)
#define DEBUGCTLMSR_FREEZE_IN_SMM_BIT 14
#define DEBUGCTLMSR_FREEZE_IN_SMM (1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT)
+#define DEBUGCTLMSR_RTM_DEBUG BIT(15)
#define MSR_PEBS_FRONTEND 0x000003f7
@@ -733,6 +734,11 @@
#define MSR_AMD64_PERF_CNTR_GLOBAL_CTL 0xc0000301
#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR 0xc0000302
+/* AMD Hardware Feedback Support MSRs */
+#define MSR_AMD_WORKLOAD_CLASS_CONFIG 0xc0000500
+#define MSR_AMD_WORKLOAD_CLASS_ID 0xc0000501
+#define MSR_AMD_WORKLOAD_HRST 0xc0000502
+
/* AMD Last Branch Record MSRs */
#define MSR_AMD64_LBR_SELECT 0xc000010e
@@ -831,6 +837,7 @@
#define MSR_K7_HWCR_SMMLOCK BIT_ULL(MSR_K7_HWCR_SMMLOCK_BIT)
#define MSR_K7_HWCR_IRPERF_EN_BIT 30
#define MSR_K7_HWCR_IRPERF_EN BIT_ULL(MSR_K7_HWCR_IRPERF_EN_BIT)
+#define MSR_K7_HWCR_CPUID_USER_DIS_BIT 35
#define MSR_K7_FID_VID_CTL 0xc0010041
#define MSR_K7_FID_VID_STATUS 0xc0010042
#define MSR_K7_HWCR_CPB_DIS_BIT 25
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index b74ec5c3643b..a5731fb1e9dd 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -214,9 +214,6 @@ enum page_cache_mode {
#define PAGE_READONLY __pg(__PP| 0|_USR|___A|__NX| 0| 0| 0)
#define PAGE_READONLY_EXEC __pg(__PP| 0|_USR|___A| 0| 0| 0| 0)
-#define __PAGE_KERNEL (__PP|__RW| 0|___A|__NX|___D| 0|___G)
-#define __PAGE_KERNEL_EXEC (__PP|__RW| 0|___A| 0|___D| 0|___G)
-
/*
* Page tables needs to have Write=1 in order for any lower PTEs to be
* writable. This includes shadow stack memory (Write=0, Dirty=1)
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index a631f7d7c0c0..89075ff19afa 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -243,6 +243,7 @@ struct snp_guest_req {
size_t resp_sz;
u64 exit_code;
+ u64 exitinfo2;
unsigned int vmpck_id;
u8 msg_version;
u8 msg_type;
@@ -460,7 +461,7 @@ static __always_inline void sev_es_nmi_complete(void)
cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
__sev_es_nmi_complete();
}
-extern int __init sev_es_efi_map_ghcbs(pgd_t *pgd);
+extern int __init sev_es_efi_map_ghcbs_cas(pgd_t *pgd);
extern void sev_enable(struct boot_params *bp);
/*
@@ -501,8 +502,6 @@ static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate)
return rc;
}
-struct snp_guest_request_ioctl;
-
void setup_ghcb(void);
void early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
unsigned long npages);
@@ -528,8 +527,7 @@ void snp_kexec_begin(void);
int snp_msg_init(struct snp_msg_desc *mdesc, int vmpck_id);
struct snp_msg_desc *snp_msg_alloc(void);
void snp_msg_free(struct snp_msg_desc *mdesc);
-int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req,
- struct snp_guest_request_ioctl *rio);
+int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req);
int snp_svsm_vtpm_send_command(u8 *buffer);
@@ -571,7 +569,7 @@ static inline void sev_es_ist_enter(struct pt_regs *regs) { }
static inline void sev_es_ist_exit(void) { }
static inline int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { return 0; }
static inline void sev_es_nmi_complete(void) { }
-static inline int sev_es_efi_map_ghcbs(pgd_t *pgd) { return 0; }
+static inline int sev_es_efi_map_ghcbs_cas(pgd_t *pgd) { return 0; }
static inline void sev_enable(struct boot_params *bp) { }
static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) { return 0; }
static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) { return 0; }
@@ -602,8 +600,8 @@ static inline void snp_kexec_begin(void) { }
static inline int snp_msg_init(struct snp_msg_desc *mdesc, int vmpck_id) { return -1; }
static inline struct snp_msg_desc *snp_msg_alloc(void) { return NULL; }
static inline void snp_msg_free(struct snp_msg_desc *mdesc) { }
-static inline int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req,
- struct snp_guest_request_ioctl *rio) { return -ENODEV; }
+static inline int snp_send_guest_request(struct snp_msg_desc *mdesc,
+ struct snp_guest_req *req) { return -ENODEV; }
static inline int snp_svsm_vtpm_send_command(u8 *buffer) { return -ENODEV; }
static inline void __init snp_secure_tsc_prepare(void) { }
static inline void __init snp_secure_tsc_init(void) { }
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 0c1c68039d6f..22bfebe6776d 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -112,7 +112,10 @@ void __noreturn hlt_play_dead(void);
void native_play_dead(void);
void play_dead_common(void);
void wbinvd_on_cpu(int cpu);
-int wbinvd_on_all_cpus(void);
+void wbinvd_on_all_cpus(void);
+void wbinvd_on_cpus_mask(struct cpumask *cpus);
+void wbnoinvd_on_all_cpus(void);
+void wbnoinvd_on_cpus_mask(struct cpumask *cpus);
void smp_kick_mwait_play_dead(void);
void __noreturn mwait_play_dead(unsigned int eax_hint);
@@ -148,10 +151,24 @@ static inline struct cpumask *cpu_l2c_shared_mask(int cpu)
#else /* !CONFIG_SMP */
#define wbinvd_on_cpu(cpu) wbinvd()
-static inline int wbinvd_on_all_cpus(void)
+static inline void wbinvd_on_all_cpus(void)
{
wbinvd();
- return 0;
+}
+
+static inline void wbinvd_on_cpus_mask(struct cpumask *cpus)
+{
+ wbinvd();
+}
+
+static inline void wbnoinvd_on_all_cpus(void)
+{
+ wbnoinvd();
+}
+
+static inline void wbnoinvd_on_cpus_mask(struct cpumask *cpus)
+{
+ wbnoinvd();
}
static inline struct cpumask *cpu_llc_shared_mask(int cpu)
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index ecda17efa042..fde2bd7af19e 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -104,9 +104,36 @@ static inline void wrpkru(u32 pkru)
}
#endif
+/*
+ * Write back all modified lines in all levels of cache associated with this
+ * logical processor to main memory, and then invalidate all caches. Depending
+ * on the micro-architecture, WBINVD (and WBNOINVD below) may or may not affect
+ * lower level caches associated with another logical processor that shares any
+ * level of this processor's cache hierarchy.
+ */
static __always_inline void wbinvd(void)
{
- asm volatile("wbinvd": : :"memory");
+ asm volatile("wbinvd" : : : "memory");
+}
+
+/* Instruction encoding provided for binutils backwards compatibility. */
+#define ASM_WBNOINVD _ASM_BYTES(0xf3,0x0f,0x09)
+
+/*
+ * Write back all modified lines in all levels of cache associated with this
+ * logical processor to main memory, but do NOT explicitly invalidate caches,
+ * i.e. leave all/most cache lines in the hierarchy in non-modified state.
+ */
+static __always_inline void wbnoinvd(void)
+{
+ /*
+ * Explicitly encode WBINVD if X86_FEATURE_WBNOINVD is unavailable even
+ * though WBNOINVD is backwards compatible (it's simply WBINVD with an
+ * ignored REP prefix), to guarantee that WBNOINVD isn't used if it
+ * needs to be avoided for any reason. For all supported usage in the
+ * kernel, WBINVD is functionally a superset of WBNOINVD.
+ */
+ alternative("wbinvd", ASM_WBNOINVD, X86_FEATURE_WBNOINVD);
}
static inline unsigned long __read_cr4(void)
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index ad954a1a6656..ffc27f676243 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -252,16 +252,21 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31
#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
+/*
+ * GA_LOG_INTR is a synthetic flag that's never propagated to hardware-visible
+ * tables. GA_LOG_INTR is set if the vCPU needs device posted IRQs to generate
+ * GA log interrupts to wake the vCPU (because it's blocking or about to block).
+ */
+#define AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR BIT_ULL(61)
+
#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK GENMASK_ULL(11, 0)
-#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12)
+#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK GENMASK_ULL(51, 12)
#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)
#define AVIC_PHYSICAL_ID_TABLE_SIZE_MASK (0xFFULL)
#define AVIC_DOORBELL_PHYSICAL_ID_MASK GENMASK_ULL(11, 0)
-#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
-
#define AVIC_UNACCEL_ACCESS_WRITE_MASK 1
#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0
#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF
@@ -290,8 +295,6 @@ enum avic_ipi_failure_cause {
static_assert((AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == AVIC_MAX_PHYSICAL_ID);
static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_MAX_PHYSICAL_ID);
-#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF)
-
#define SVM_SEV_FEAT_SNP_ACTIVE BIT(0)
#define SVM_SEV_FEAT_RESTRICTED_INJECTION BIT(3)
#define SVM_SEV_FEAT_ALTERNATE_INJECTION BIT(4)
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 93069b13d3af..a947b46a8b64 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -183,6 +183,7 @@ setnew:
apicd->cpu = newcpu;
BUG_ON(!IS_ERR_OR_NULL(per_cpu(vector_irq, newcpu)[newvec]));
per_cpu(vector_irq, newcpu)[newvec] = desc;
+ apic_update_irq_cfg(irqd, newvec, newcpu);
}
static void vector_assign_managed_shutdown(struct irq_data *irqd)
@@ -261,7 +262,6 @@ assign_vector_locked(struct irq_data *irqd, const struct cpumask *dest)
if (vector < 0)
return vector;
apic_update_vector(irqd, vector, cpu);
- apic_update_irq_cfg(irqd, vector, cpu);
return 0;
}
@@ -338,7 +338,7 @@ assign_managed_vector(struct irq_data *irqd, const struct cpumask *dest)
if (vector < 0)
return vector;
apic_update_vector(irqd, vector, cpu);
- apic_update_irq_cfg(irqd, vector, cpu);
+
return 0;
}
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 329ee185d8cc..a5ece6ebe8a7 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -530,9 +530,11 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
}
bsp_determine_snp(c);
-
tsa_init(c);
+ if (cpu_has(c, X86_FEATURE_GP_ON_USER_CPUID))
+ setup_force_cpu_cap(X86_FEATURE_CPUID_FAULT);
+
return;
warn:
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index f4d3abb12317..b74bf937cd9f 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -115,10 +115,9 @@ void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk;
static void __init set_return_thunk(void *thunk)
{
- if (x86_return_thunk != __x86_return_thunk)
- pr_warn("x86/bugs: return thunk changed\n");
-
x86_return_thunk = thunk;
+
+ pr_info("active return thunk: %ps\n", thunk);
}
/* Update SPEC_CTRL MSR and its cached copy unconditionally */
@@ -190,6 +189,39 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush);
DEFINE_STATIC_KEY_FALSE(cpu_buf_vm_clear);
EXPORT_SYMBOL_GPL(cpu_buf_vm_clear);
+#undef pr_fmt
+#define pr_fmt(fmt) "mitigations: " fmt
+
+static void __init cpu_print_attack_vectors(void)
+{
+ pr_info("Enabled attack vectors: ");
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL))
+ pr_cont("user_kernel, ");
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER))
+ pr_cont("user_user, ");
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST))
+ pr_cont("guest_host, ");
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST))
+ pr_cont("guest_guest, ");
+
+ pr_cont("SMT mitigations: ");
+
+ switch (smt_mitigations) {
+ case SMT_MITIGATIONS_OFF:
+ pr_cont("off\n");
+ break;
+ case SMT_MITIGATIONS_AUTO:
+ pr_cont("auto\n");
+ break;
+ case SMT_MITIGATIONS_ON:
+ pr_cont("on\n");
+ }
+}
+
void __init cpu_select_mitigations(void)
{
/*
@@ -210,6 +242,8 @@ void __init cpu_select_mitigations(void)
x86_arch_cap_msr = x86_read_arch_cap_msr();
+ cpu_print_attack_vectors();
+
/* Select the proper CPU mitigations before patching alternatives: */
spectre_v1_select_mitigation();
spectre_v2_select_mitigation();
@@ -333,6 +367,62 @@ static void x86_amd_ssb_disable(void)
#undef pr_fmt
#define pr_fmt(fmt) "MDS: " fmt
+/*
+ * Returns true if vulnerability should be mitigated based on the
+ * selected attack vector controls.
+ *
+ * See Documentation/admin-guide/hw-vuln/attack_vector_controls.rst
+ */
+static bool __init should_mitigate_vuln(unsigned int bug)
+{
+ switch (bug) {
+ /*
+ * The only runtime-selected spectre_v1 mitigations in the kernel are
+ * related to SWAPGS protection on kernel entry. Therefore, protection
+ * is only required for the user->kernel attack vector.
+ */
+ case X86_BUG_SPECTRE_V1:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL);
+
+ case X86_BUG_SPECTRE_V2:
+ case X86_BUG_RETBLEED:
+ case X86_BUG_SRSO:
+ case X86_BUG_L1TF:
+ case X86_BUG_ITS:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST);
+
+ case X86_BUG_SPECTRE_V2_USER:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST);
+
+ /*
+ * All the vulnerabilities below allow potentially leaking data
+ * across address spaces. Therefore, mitigation is required for
+ * any of these 4 attack vectors.
+ */
+ case X86_BUG_MDS:
+ case X86_BUG_TAA:
+ case X86_BUG_MMIO_STALE_DATA:
+ case X86_BUG_RFDS:
+ case X86_BUG_SRBDS:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST);
+
+ case X86_BUG_GDS:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST) ||
+ (smt_mitigations != SMT_MITIGATIONS_OFF);
+ default:
+ WARN(1, "Unknown bug %x\n", bug);
+ return false;
+ }
+}
+
/* Default mitigation for MDS-affected CPUs */
static enum mds_mitigations mds_mitigation __ro_after_init =
IS_ENABLED(CONFIG_MITIGATION_MDS) ? MDS_MITIGATION_AUTO : MDS_MITIGATION_OFF;
@@ -386,13 +476,17 @@ static bool verw_clear_cpu_buf_mitigation_selected __ro_after_init;
static void __init mds_select_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) {
+ if (!boot_cpu_has_bug(X86_BUG_MDS)) {
mds_mitigation = MDS_MITIGATION_OFF;
return;
}
- if (mds_mitigation == MDS_MITIGATION_AUTO)
- mds_mitigation = MDS_MITIGATION_FULL;
+ if (mds_mitigation == MDS_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_MDS))
+ mds_mitigation = MDS_MITIGATION_FULL;
+ else
+ mds_mitigation = MDS_MITIGATION_OFF;
+ }
if (mds_mitigation == MDS_MITIGATION_OFF)
return;
@@ -402,7 +496,7 @@ static void __init mds_select_mitigation(void)
static void __init mds_update_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off())
+ if (!boot_cpu_has_bug(X86_BUG_MDS))
return;
/* If TAA, MMIO, or RFDS are being mitigated, MDS gets mitigated too. */
@@ -423,7 +517,7 @@ static void __init mds_apply_mitigation(void)
mds_mitigation == MDS_MITIGATION_VMWERV) {
setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) &&
- (mds_nosmt || cpu_mitigations_auto_nosmt()))
+ (mds_nosmt || smt_mitigations == SMT_MITIGATIONS_ON))
cpu_smt_disable(false);
}
}
@@ -479,12 +573,13 @@ static void __init taa_select_mitigation(void)
return;
}
- if (cpu_mitigations_off())
- taa_mitigation = TAA_MITIGATION_OFF;
-
/* Microcode will be checked in taa_update_mitigation(). */
- if (taa_mitigation == TAA_MITIGATION_AUTO)
- taa_mitigation = TAA_MITIGATION_VERW;
+ if (taa_mitigation == TAA_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_TAA))
+ taa_mitigation = TAA_MITIGATION_VERW;
+ else
+ taa_mitigation = TAA_MITIGATION_OFF;
+ }
if (taa_mitigation != TAA_MITIGATION_OFF)
verw_clear_cpu_buf_mitigation_selected = true;
@@ -492,7 +587,7 @@ static void __init taa_select_mitigation(void)
static void __init taa_update_mitigation(void)
{
- if (!taa_vulnerable() || cpu_mitigations_off())
+ if (!taa_vulnerable())
return;
if (verw_clear_cpu_buf_mitigation_selected)
@@ -533,7 +628,7 @@ static void __init taa_apply_mitigation(void)
*/
setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
- if (taa_nosmt || cpu_mitigations_auto_nosmt())
+ if (taa_nosmt || smt_mitigations == SMT_MITIGATIONS_ON)
cpu_smt_disable(false);
}
}
@@ -579,8 +674,12 @@ static void __init mmio_select_mitigation(void)
}
/* Microcode will be checked in mmio_update_mitigation(). */
- if (mmio_mitigation == MMIO_MITIGATION_AUTO)
- mmio_mitigation = MMIO_MITIGATION_VERW;
+ if (mmio_mitigation == MMIO_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_MMIO_STALE_DATA))
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+ else
+ mmio_mitigation = MMIO_MITIGATION_OFF;
+ }
if (mmio_mitigation == MMIO_MITIGATION_OFF)
return;
@@ -595,7 +694,7 @@ static void __init mmio_select_mitigation(void)
static void __init mmio_update_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || cpu_mitigations_off())
+ if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))
return;
if (verw_clear_cpu_buf_mitigation_selected)
@@ -643,7 +742,7 @@ static void __init mmio_apply_mitigation(void)
if (!(x86_arch_cap_msr & ARCH_CAP_FBSDP_NO))
static_branch_enable(&cpu_buf_idle_clear);
- if (mmio_nosmt || cpu_mitigations_auto_nosmt())
+ if (mmio_nosmt || smt_mitigations == SMT_MITIGATIONS_ON)
cpu_smt_disable(false);
}
@@ -684,13 +783,17 @@ static inline bool __init verw_clears_cpu_reg_file(void)
static void __init rfds_select_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_RFDS) || cpu_mitigations_off()) {
+ if (!boot_cpu_has_bug(X86_BUG_RFDS)) {
rfds_mitigation = RFDS_MITIGATION_OFF;
return;
}
- if (rfds_mitigation == RFDS_MITIGATION_AUTO)
- rfds_mitigation = RFDS_MITIGATION_VERW;
+ if (rfds_mitigation == RFDS_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_RFDS))
+ rfds_mitigation = RFDS_MITIGATION_VERW;
+ else
+ rfds_mitigation = RFDS_MITIGATION_OFF;
+ }
if (rfds_mitigation == RFDS_MITIGATION_OFF)
return;
@@ -701,7 +804,7 @@ static void __init rfds_select_mitigation(void)
static void __init rfds_update_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_RFDS) || cpu_mitigations_off())
+ if (!boot_cpu_has_bug(X86_BUG_RFDS))
return;
if (verw_clear_cpu_buf_mitigation_selected)
@@ -802,13 +905,19 @@ void update_srbds_msr(void)
static void __init srbds_select_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_SRBDS) || cpu_mitigations_off()) {
+ if (!boot_cpu_has_bug(X86_BUG_SRBDS)) {
srbds_mitigation = SRBDS_MITIGATION_OFF;
return;
}
- if (srbds_mitigation == SRBDS_MITIGATION_AUTO)
- srbds_mitigation = SRBDS_MITIGATION_FULL;
+ if (srbds_mitigation == SRBDS_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_SRBDS))
+ srbds_mitigation = SRBDS_MITIGATION_FULL;
+ else {
+ srbds_mitigation = SRBDS_MITIGATION_OFF;
+ return;
+ }
+ }
/*
* Check to see if this is one of the MDS_NO systems supporting TSX that
@@ -956,12 +1065,15 @@ static void __init gds_select_mitigation(void)
return;
}
- if (cpu_mitigations_off())
- gds_mitigation = GDS_MITIGATION_OFF;
/* Will verify below that mitigation _can_ be disabled */
-
- if (gds_mitigation == GDS_MITIGATION_AUTO)
- gds_mitigation = GDS_MITIGATION_FULL;
+ if (gds_mitigation == GDS_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_GDS))
+ gds_mitigation = GDS_MITIGATION_FULL;
+ else {
+ gds_mitigation = GDS_MITIGATION_OFF;
+ return;
+ }
+ }
/* No microcode */
if (!(x86_arch_cap_msr & ARCH_CAP_GDS_CTRL)) {
@@ -1067,13 +1179,16 @@ static bool smap_works_speculatively(void)
static void __init spectre_v1_select_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off())
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1))
+ spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE;
+
+ if (!should_mitigate_vuln(X86_BUG_SPECTRE_V1))
spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE;
}
static void __init spectre_v1_apply_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off())
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1))
return;
if (spectre_v1_mitigation == SPECTRE_V1_MITIGATION_AUTO) {
@@ -1124,6 +1239,20 @@ early_param("nospectre_v1", nospectre_v1_cmdline);
enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = SPECTRE_V2_NONE;
+/* Depends on spectre_v2 mitigation selected already */
+static inline bool cdt_possible(enum spectre_v2_mitigation mode)
+{
+ if (!IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) ||
+ !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE))
+ return false;
+
+ if (mode == SPECTRE_V2_RETPOLINE ||
+ mode == SPECTRE_V2_EIBRS_RETPOLINE)
+ return true;
+
+ return false;
+}
+
#undef pr_fmt
#define pr_fmt(fmt) "RETBleed: " fmt
@@ -1162,6 +1291,21 @@ static enum retbleed_mitigation retbleed_mitigation __ro_after_init =
static int __ro_after_init retbleed_nosmt = false;
+enum srso_mitigation {
+ SRSO_MITIGATION_NONE,
+ SRSO_MITIGATION_AUTO,
+ SRSO_MITIGATION_UCODE_NEEDED,
+ SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED,
+ SRSO_MITIGATION_MICROCODE,
+ SRSO_MITIGATION_NOSMT,
+ SRSO_MITIGATION_SAFE_RET,
+ SRSO_MITIGATION_IBPB,
+ SRSO_MITIGATION_IBPB_ON_VMEXIT,
+ SRSO_MITIGATION_BP_SPEC_REDUCE,
+};
+
+static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_AUTO;
+
static int __init retbleed_parse_cmdline(char *str)
{
if (!str)
@@ -1204,7 +1348,7 @@ early_param("retbleed", retbleed_parse_cmdline);
static void __init retbleed_select_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) {
+ if (!boot_cpu_has_bug(X86_BUG_RETBLEED)) {
retbleed_mitigation = RETBLEED_MITIGATION_NONE;
return;
}
@@ -1241,6 +1385,11 @@ static void __init retbleed_select_mitigation(void)
if (retbleed_mitigation != RETBLEED_MITIGATION_AUTO)
return;
+ if (!should_mitigate_vuln(X86_BUG_RETBLEED)) {
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
+ return;
+ }
+
/* Intel mitigation selected in retbleed_update_mitigation() */
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) {
@@ -1251,35 +1400,36 @@ static void __init retbleed_select_mitigation(void)
retbleed_mitigation = RETBLEED_MITIGATION_IBPB;
else
retbleed_mitigation = RETBLEED_MITIGATION_NONE;
+ } else if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+ /* Final mitigation depends on spectre-v2 selection */
+ if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED))
+ retbleed_mitigation = RETBLEED_MITIGATION_EIBRS;
+ else if (boot_cpu_has(X86_FEATURE_IBRS))
+ retbleed_mitigation = RETBLEED_MITIGATION_IBRS;
+ else
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
}
}
static void __init retbleed_update_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off())
+ if (!boot_cpu_has_bug(X86_BUG_RETBLEED))
return;
- if (retbleed_mitigation == RETBLEED_MITIGATION_NONE)
- goto out;
+ /* ITS can also enable stuffing */
+ if (its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF)
+ retbleed_mitigation = RETBLEED_MITIGATION_STUFF;
- /*
- * retbleed=stuff is only allowed on Intel. If stuffing can't be used
- * then a different mitigation will be selected below.
- *
- * its=stuff will also attempt to enable stuffing.
- */
- if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF ||
- its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF) {
- if (spectre_v2_enabled != SPECTRE_V2_RETPOLINE) {
- pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n");
- retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
- } else {
- if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF)
- pr_info("Retbleed mitigation updated to stuffing\n");
+ /* If SRSO is using IBPB, that works for retbleed too */
+ if (srso_mitigation == SRSO_MITIGATION_IBPB)
+ retbleed_mitigation = RETBLEED_MITIGATION_IBPB;
- retbleed_mitigation = RETBLEED_MITIGATION_STUFF;
- }
+ if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF &&
+ !cdt_possible(spectre_v2_enabled)) {
+ pr_err("WARNING: retbleed=stuff depends on retpoline\n");
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
}
+
/*
* Let IBRS trump all on Intel without affecting the effects of the
* retbleed= cmdline option except for call depth based stuffing
@@ -1298,15 +1448,11 @@ static void __init retbleed_update_mitigation(void)
if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF)
pr_err(RETBLEED_INTEL_MSG);
}
- /* If nothing has set the mitigation yet, default to NONE. */
- if (retbleed_mitigation == RETBLEED_MITIGATION_AUTO)
- retbleed_mitigation = RETBLEED_MITIGATION_NONE;
}
-out:
+
pr_info("%s\n", retbleed_strings[retbleed_mitigation]);
}
-
static void __init retbleed_apply_mitigation(void)
{
bool mitigate_smt = false;
@@ -1362,7 +1508,7 @@ static void __init retbleed_apply_mitigation(void)
}
if (mitigate_smt && !boot_cpu_has(X86_FEATURE_STIBP) &&
- (retbleed_nosmt || cpu_mitigations_auto_nosmt()))
+ (retbleed_nosmt || smt_mitigations == SMT_MITIGATIONS_ON))
cpu_smt_disable(false);
}
@@ -1407,13 +1553,17 @@ early_param("indirect_target_selection", its_parse_cmdline);
static void __init its_select_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_ITS) || cpu_mitigations_off()) {
+ if (!boot_cpu_has_bug(X86_BUG_ITS)) {
its_mitigation = ITS_MITIGATION_OFF;
return;
}
- if (its_mitigation == ITS_MITIGATION_AUTO)
- its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+ if (its_mitigation == ITS_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_ITS))
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+ else
+ its_mitigation = ITS_MITIGATION_OFF;
+ }
if (its_mitigation == ITS_MITIGATION_OFF)
return;
@@ -1444,15 +1594,17 @@ static void __init its_select_mitigation(void)
static void __init its_update_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_ITS) || cpu_mitigations_off())
+ if (!boot_cpu_has_bug(X86_BUG_ITS))
return;
switch (spectre_v2_enabled) {
case SPECTRE_V2_NONE:
- pr_err("WARNING: Spectre-v2 mitigation is off, disabling ITS\n");
+ if (its_mitigation != ITS_MITIGATION_OFF)
+ pr_err("WARNING: Spectre-v2 mitigation is off, disabling ITS\n");
its_mitigation = ITS_MITIGATION_OFF;
break;
case SPECTRE_V2_RETPOLINE:
+ case SPECTRE_V2_EIBRS_RETPOLINE:
/* Retpoline+CDT mitigates ITS */
if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF)
its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF;
@@ -1466,13 +1618,8 @@ static void __init its_update_mitigation(void)
break;
}
- /*
- * retbleed_update_mitigation() will try to do stuffing if its=stuff.
- * If it can't, such as if spectre_v2!=retpoline, then fall back to
- * aligned thunks.
- */
if (its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF &&
- retbleed_mitigation != RETBLEED_MITIGATION_STUFF)
+ !cdt_possible(spectre_v2_enabled))
its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
pr_info("%s\n", its_strings[its_mitigation]);
@@ -1480,15 +1627,24 @@ static void __init its_update_mitigation(void)
static void __init its_apply_mitigation(void)
{
- /* its=stuff forces retbleed stuffing and is enabled there. */
- if (its_mitigation != ITS_MITIGATION_ALIGNED_THUNKS)
- return;
-
- if (!boot_cpu_has(X86_FEATURE_RETPOLINE))
- setup_force_cpu_cap(X86_FEATURE_INDIRECT_THUNK_ITS);
+ switch (its_mitigation) {
+ case ITS_MITIGATION_OFF:
+ case ITS_MITIGATION_AUTO:
+ case ITS_MITIGATION_VMEXIT_ONLY:
+ break;
+ case ITS_MITIGATION_ALIGNED_THUNKS:
+ if (!boot_cpu_has(X86_FEATURE_RETPOLINE))
+ setup_force_cpu_cap(X86_FEATURE_INDIRECT_THUNK_ITS);
- setup_force_cpu_cap(X86_FEATURE_RETHUNK);
- set_return_thunk(its_return_thunk);
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ set_return_thunk(its_return_thunk);
+ break;
+ case ITS_MITIGATION_RETPOLINE_STUFF:
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH);
+ set_return_thunk(call_depth_return_thunk);
+ break;
+ }
}
#undef pr_fmt
@@ -1536,28 +1692,43 @@ early_param("tsa", tsa_parse_cmdline);
static void __init tsa_select_mitigation(void)
{
- if (cpu_mitigations_off() || !boot_cpu_has_bug(X86_BUG_TSA)) {
+ if (!boot_cpu_has_bug(X86_BUG_TSA)) {
tsa_mitigation = TSA_MITIGATION_NONE;
return;
}
+ if (tsa_mitigation == TSA_MITIGATION_AUTO) {
+ bool vm = false, uk = false;
+
+ tsa_mitigation = TSA_MITIGATION_NONE;
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER)) {
+ tsa_mitigation = TSA_MITIGATION_USER_KERNEL;
+ uk = true;
+ }
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST)) {
+ tsa_mitigation = TSA_MITIGATION_VM;
+ vm = true;
+ }
+
+ if (uk && vm)
+ tsa_mitigation = TSA_MITIGATION_FULL;
+ }
+
if (tsa_mitigation == TSA_MITIGATION_NONE)
return;
- if (!boot_cpu_has(X86_FEATURE_VERW_CLEAR)) {
+ if (!boot_cpu_has(X86_FEATURE_VERW_CLEAR))
tsa_mitigation = TSA_MITIGATION_UCODE_NEEDED;
- goto out;
- }
-
- if (tsa_mitigation == TSA_MITIGATION_AUTO)
- tsa_mitigation = TSA_MITIGATION_FULL;
/*
* No need to set verw_clear_cpu_buf_mitigation_selected - it
* doesn't fit all cases here and it is not needed because this
* is the only VERW-based mitigation on AMD.
*/
-out:
pr_info("%s\n", tsa_strings[tsa_mitigation]);
}
@@ -1701,7 +1872,7 @@ static enum spectre_v2_user_cmd __init spectre_v2_parse_user_cmdline(void)
char arg[20];
int ret, i;
- if (cpu_mitigations_off() || !IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2))
+ if (!IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2))
return SPECTRE_V2_USER_CMD_NONE;
ret = cmdline_find_option(boot_command_line, "spectre_v2_user",
@@ -1739,6 +1910,13 @@ static void __init spectre_v2_user_select_mitigation(void)
spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT;
break;
case SPECTRE_V2_USER_CMD_AUTO:
+ if (!should_mitigate_vuln(X86_BUG_SPECTRE_V2_USER))
+ break;
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL;
+ if (smt_mitigations == SMT_MITIGATIONS_OFF)
+ break;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL;
+ break;
case SPECTRE_V2_USER_CMD_PRCTL:
spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL;
spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL;
@@ -1890,8 +2068,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
int ret, i;
cmd = IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_CMD_AUTO : SPECTRE_V2_CMD_NONE;
- if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") ||
- cpu_mitigations_off())
+ if (cmdline_find_option_bool(boot_command_line, "nospectre_v2"))
return SPECTRE_V2_CMD_NONE;
ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg));
@@ -2094,11 +2271,20 @@ early_param("spectre_bhi", spectre_bhi_parse_cmdline);
static void __init bhi_select_mitigation(void)
{
- if (!boot_cpu_has(X86_BUG_BHI) || cpu_mitigations_off())
+ if (!boot_cpu_has(X86_BUG_BHI))
bhi_mitigation = BHI_MITIGATION_OFF;
- if (bhi_mitigation == BHI_MITIGATION_AUTO)
- bhi_mitigation = BHI_MITIGATION_ON;
+ if (bhi_mitigation != BHI_MITIGATION_AUTO)
+ return;
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST)) {
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL))
+ bhi_mitigation = BHI_MITIGATION_ON;
+ else
+ bhi_mitigation = BHI_MITIGATION_VMEXIT_ONLY;
+ } else {
+ bhi_mitigation = BHI_MITIGATION_OFF;
+ }
}
static void __init bhi_update_mitigation(void)
@@ -2154,8 +2340,11 @@ static void __init spectre_v2_select_mitigation(void)
case SPECTRE_V2_CMD_NONE:
return;
- case SPECTRE_V2_CMD_FORCE:
case SPECTRE_V2_CMD_AUTO:
+ if (!should_mitigate_vuln(X86_BUG_SPECTRE_V2))
+ break;
+ fallthrough;
+ case SPECTRE_V2_CMD_FORCE:
if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) {
spectre_v2_enabled = SPECTRE_V2_EIBRS;
break;
@@ -2209,7 +2398,7 @@ static void __init spectre_v2_update_mitigation(void)
}
}
- if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && !cpu_mitigations_off())
+ if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
pr_info("%s\n", spectre_v2_strings[spectre_v2_enabled]);
}
@@ -2861,17 +3050,23 @@ static void override_cache_bits(struct cpuinfo_x86 *c)
static void __init l1tf_select_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_L1TF) || cpu_mitigations_off()) {
+ if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
l1tf_mitigation = L1TF_MITIGATION_OFF;
return;
}
- if (l1tf_mitigation == L1TF_MITIGATION_AUTO) {
- if (cpu_mitigations_auto_nosmt())
- l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT;
- else
- l1tf_mitigation = L1TF_MITIGATION_FLUSH;
+ if (l1tf_mitigation != L1TF_MITIGATION_AUTO)
+ return;
+
+ if (!should_mitigate_vuln(X86_BUG_L1TF)) {
+ l1tf_mitigation = L1TF_MITIGATION_OFF;
+ return;
}
+
+ if (smt_mitigations == SMT_MITIGATIONS_ON)
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT;
+ else
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH;
}
static void __init l1tf_apply_mitigation(void)
@@ -2945,31 +3140,18 @@ early_param("l1tf", l1tf_cmdline);
#undef pr_fmt
#define pr_fmt(fmt) "Speculative Return Stack Overflow: " fmt
-enum srso_mitigation {
- SRSO_MITIGATION_NONE,
- SRSO_MITIGATION_AUTO,
- SRSO_MITIGATION_UCODE_NEEDED,
- SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED,
- SRSO_MITIGATION_MICROCODE,
- SRSO_MITIGATION_SAFE_RET,
- SRSO_MITIGATION_IBPB,
- SRSO_MITIGATION_IBPB_ON_VMEXIT,
- SRSO_MITIGATION_BP_SPEC_REDUCE,
-};
-
static const char * const srso_strings[] = {
[SRSO_MITIGATION_NONE] = "Vulnerable",
[SRSO_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
[SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED] = "Vulnerable: Safe RET, no microcode",
[SRSO_MITIGATION_MICROCODE] = "Vulnerable: Microcode, no safe RET",
+ [SRSO_MITIGATION_NOSMT] = "Mitigation: SMT disabled",
[SRSO_MITIGATION_SAFE_RET] = "Mitigation: Safe RET",
[SRSO_MITIGATION_IBPB] = "Mitigation: IBPB",
[SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only",
[SRSO_MITIGATION_BP_SPEC_REDUCE] = "Mitigation: Reduced Speculation"
};
-static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_AUTO;
-
static int __init srso_parse_cmdline(char *str)
{
if (!str)
@@ -2996,35 +3178,44 @@ early_param("spec_rstack_overflow", srso_parse_cmdline);
static void __init srso_select_mitigation(void)
{
- bool has_microcode;
-
- if (!boot_cpu_has_bug(X86_BUG_SRSO) || cpu_mitigations_off())
+ if (!boot_cpu_has_bug(X86_BUG_SRSO)) {
srso_mitigation = SRSO_MITIGATION_NONE;
-
- if (srso_mitigation == SRSO_MITIGATION_NONE)
return;
+ }
- if (srso_mitigation == SRSO_MITIGATION_AUTO)
- srso_mitigation = SRSO_MITIGATION_SAFE_RET;
-
- has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE);
- if (has_microcode) {
- /*
- * Zen1/2 with SMT off aren't vulnerable after the right
- * IBPB microcode has been applied.
- */
- if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) {
- setup_force_cpu_cap(X86_FEATURE_SRSO_NO);
+ if (srso_mitigation == SRSO_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_SRSO)) {
+ srso_mitigation = SRSO_MITIGATION_SAFE_RET;
+ } else {
srso_mitigation = SRSO_MITIGATION_NONE;
return;
}
- } else {
+ }
+
+ /* Zen1/2 with SMT off aren't vulnerable to SRSO. */
+ if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) {
+ srso_mitigation = SRSO_MITIGATION_NOSMT;
+ return;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_IBPB_BRTYPE)) {
pr_warn("IBPB-extending microcode not applied!\n");
pr_warn(SRSO_NOTICE);
+
+ /*
+ * Safe-RET provides partial mitigation without microcode, but
+ * other mitigations require microcode to provide any
+ * mitigations.
+ */
+ if (srso_mitigation == SRSO_MITIGATION_SAFE_RET)
+ srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED;
+ else
+ srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED;
}
switch (srso_mitigation) {
case SRSO_MITIGATION_SAFE_RET:
+ case SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED:
if (boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO)) {
srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT;
goto ibpb_on_vmexit;
@@ -3034,9 +3225,6 @@ static void __init srso_select_mitigation(void)
pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n");
srso_mitigation = SRSO_MITIGATION_NONE;
}
-
- if (!has_microcode)
- srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED;
break;
ibpb_on_vmexit:
case SRSO_MITIGATION_IBPB_ON_VMEXIT:
@@ -3051,9 +3239,6 @@ ibpb_on_vmexit:
pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n");
srso_mitigation = SRSO_MITIGATION_NONE;
}
-
- if (!has_microcode)
- srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED;
break;
default:
break;
@@ -3068,8 +3253,7 @@ static void __init srso_update_mitigation(void)
srso_mitigation = SRSO_MITIGATION_IBPB;
if (boot_cpu_has_bug(X86_BUG_SRSO) &&
- !cpu_mitigations_off() &&
- !boot_cpu_has(X86_FEATURE_SRSO_NO))
+ !cpu_mitigations_off())
pr_info("%s\n", srso_strings[srso_mitigation]);
}
@@ -3365,9 +3549,6 @@ static ssize_t retbleed_show_state(char *buf)
static ssize_t srso_show_state(char *buf)
{
- if (boot_cpu_has(X86_FEATURE_SRSO_NO))
- return sysfs_emit(buf, "Mitigation: SMT disabled\n");
-
return sysfs_emit(buf, "%s\n", srso_strings[srso_mitigation]);
}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index fb50c1dd53ef..34a054181c4d 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -26,6 +26,7 @@
#include <linux/pgtable.h>
#include <linux/stackprotector.h>
#include <linux/utsname.h>
+#include <linux/efi.h>
#include <asm/alternative.h>
#include <asm/cmdline.h>
@@ -2538,6 +2539,12 @@ void __init arch_cpu_finalize_init(void)
fpu__init_cpu();
/*
+ * This needs to follow the FPU initializtion, since EFI depends on it.
+ */
+ if (efi_enabled(EFI_RUNTIME_SERVICES))
+ efi_enter_virtual_mode();
+
+ /*
* Ensure that access to the per CPU representation has the initial
* boot CPU configuration.
*/
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index fe50eb5b7c4a..b92e09a87c69 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -17,8 +17,8 @@
#define pr_fmt(fmt) "microcode: " fmt
-#include <linux/platform_device.h>
#include <linux/stop_machine.h>
+#include <linux/device/faux.h>
#include <linux/syscore_ops.h>
#include <linux/miscdevice.h>
#include <linux/capability.h>
@@ -249,7 +249,7 @@ static void reload_early_microcode(unsigned int cpu)
}
/* fake device for request_firmware */
-static struct platform_device *microcode_pdev;
+static struct faux_device *microcode_fdev;
#ifdef CONFIG_MICROCODE_LATE_LOADING
/*
@@ -690,7 +690,7 @@ static int load_late_locked(void)
if (!setup_cpus())
return -EBUSY;
- switch (microcode_ops->request_microcode_fw(0, &microcode_pdev->dev)) {
+ switch (microcode_ops->request_microcode_fw(0, &microcode_fdev->dev)) {
case UCODE_NEW:
return load_late_stop_cpus(false);
case UCODE_NEW_SAFE:
@@ -841,9 +841,9 @@ static int __init microcode_init(void)
if (early_data.new_rev)
pr_info_once("Updated early from: 0x%08x\n", early_data.old_rev);
- microcode_pdev = platform_device_register_simple("microcode", -1, NULL, 0);
- if (IS_ERR(microcode_pdev))
- return PTR_ERR(microcode_pdev);
+ microcode_fdev = faux_device_create("microcode", NULL, NULL);
+ if (!microcode_fdev)
+ return -ENODEV;
dev_root = bus_get_dev_root(&cpu_subsys);
if (dev_root) {
@@ -862,7 +862,7 @@ static int __init microcode_init(void)
return 0;
out_pdev:
- platform_device_unregister(microcode_pdev);
+ faux_device_destroy(microcode_fdev);
return error;
}
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index ea138583dd92..aefd412a23dc 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -37,6 +37,7 @@ DEFINE_PER_CPU(u64, xfd_state);
/* The FPU state configuration data for kernel and user space */
struct fpu_state_config fpu_kernel_cfg __ro_after_init;
struct fpu_state_config fpu_user_cfg __ro_after_init;
+struct vcpu_fpu_config guest_default_cfg __ro_after_init;
/*
* Represents the initial FPU state. It's mostly (but not completely) zeroes,
@@ -217,7 +218,7 @@ void fpu_reset_from_exception_fixup(void)
}
#if IS_ENABLED(CONFIG_KVM)
-static void __fpstate_reset(struct fpstate *fpstate, u64 xfd);
+static void __fpstate_reset(struct fpstate *fpstate);
static void fpu_lock_guest_permissions(void)
{
@@ -242,19 +243,21 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu)
struct fpstate *fpstate;
unsigned int size;
- size = fpu_kernel_cfg.default_size + ALIGN(offsetof(struct fpstate, regs), 64);
+ size = guest_default_cfg.size + ALIGN(offsetof(struct fpstate, regs), 64);
+
fpstate = vzalloc(size);
if (!fpstate)
return false;
- /* Leave xfd to 0 (the reset value defined by spec) */
- __fpstate_reset(fpstate, 0);
- fpstate_init_user(fpstate);
+ /* Initialize indicators to reflect properties of the fpstate */
fpstate->is_valloc = true;
fpstate->is_guest = true;
+ __fpstate_reset(fpstate);
+ fpstate_init_user(fpstate);
+
gfpu->fpstate = fpstate;
- gfpu->xfeatures = fpu_kernel_cfg.default_features;
+ gfpu->xfeatures = guest_default_cfg.features;
/*
* KVM sets the FP+SSE bits in the XSAVE header when copying FPU state
@@ -541,28 +544,50 @@ void fpstate_init_user(struct fpstate *fpstate)
fpstate_init_fstate(fpstate);
}
-static void __fpstate_reset(struct fpstate *fpstate, u64 xfd)
+static void __fpstate_reset(struct fpstate *fpstate)
{
- /* Initialize sizes and feature masks */
- fpstate->size = fpu_kernel_cfg.default_size;
+ /*
+ * Supervisor features (and thus sizes) may diverge between guest
+ * FPUs and host FPUs, as some supervisor features are supported
+ * for guests despite not being utilized by the host. User
+ * features and sizes are always identical, which allows for
+ * common guest and userspace ABI.
+ *
+ * For the host, set XFD to the kernel's desired initialization
+ * value. For guests, set XFD to its architectural RESET value.
+ */
+ if (fpstate->is_guest) {
+ fpstate->size = guest_default_cfg.size;
+ fpstate->xfeatures = guest_default_cfg.features;
+ fpstate->xfd = 0;
+ } else {
+ fpstate->size = fpu_kernel_cfg.default_size;
+ fpstate->xfeatures = fpu_kernel_cfg.default_features;
+ fpstate->xfd = init_fpstate.xfd;
+ }
+
fpstate->user_size = fpu_user_cfg.default_size;
- fpstate->xfeatures = fpu_kernel_cfg.default_features;
fpstate->user_xfeatures = fpu_user_cfg.default_features;
- fpstate->xfd = xfd;
}
void fpstate_reset(struct fpu *fpu)
{
/* Set the fpstate pointer to the default fpstate */
fpu->fpstate = &fpu->__fpstate;
- __fpstate_reset(fpu->fpstate, init_fpstate.xfd);
+ __fpstate_reset(fpu->fpstate);
/* Initialize the permission related info in fpu */
fpu->perm.__state_perm = fpu_kernel_cfg.default_features;
fpu->perm.__state_size = fpu_kernel_cfg.default_size;
fpu->perm.__user_state_size = fpu_user_cfg.default_size;
- /* Same defaults for guests */
- fpu->guest_perm = fpu->perm;
+
+ fpu->guest_perm.__state_perm = guest_default_cfg.features;
+ fpu->guest_perm.__state_size = guest_default_cfg.size;
+ /*
+ * User features and sizes are always identical between host and
+ * guest FPUs, which allows for common guest and userspace ABI.
+ */
+ fpu->guest_perm.__user_state_size = fpu_user_cfg.default_size;
}
static inline void fpu_inherit_perms(struct fpu *dst_fpu)
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 99db41bf9fa6..ff988b9ea39f 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -205,6 +205,7 @@ static void __init fpu__init_system_xstate_size_legacy(void)
fpu_kernel_cfg.default_size = size;
fpu_user_cfg.max_size = size;
fpu_user_cfg.default_size = size;
+ guest_default_cfg.size = size;
}
/*
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 9aa9ac8399ae..12ed75c1b567 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -57,7 +57,7 @@ static const char *xfeature_names[] =
"Protection Keys User registers",
"PASID state",
"Control-flow User registers",
- "Control-flow Kernel registers (unused)",
+ "Control-flow Kernel registers (KVM only)",
"unknown xstate feature",
"unknown xstate feature",
"unknown xstate feature",
@@ -81,6 +81,7 @@ static unsigned short xsave_cpuid_features[] __initdata = {
[XFEATURE_PKRU] = X86_FEATURE_OSPKE,
[XFEATURE_PASID] = X86_FEATURE_ENQCMD,
[XFEATURE_CET_USER] = X86_FEATURE_SHSTK,
+ [XFEATURE_CET_KERNEL] = X86_FEATURE_SHSTK,
[XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE,
[XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE,
[XFEATURE_APX] = X86_FEATURE_APX,
@@ -372,6 +373,7 @@ static __init void os_xrstor_booting(struct xregs_state *xstate)
XFEATURE_MASK_BNDCSR | \
XFEATURE_MASK_PASID | \
XFEATURE_MASK_CET_USER | \
+ XFEATURE_MASK_CET_KERNEL | \
XFEATURE_MASK_XTILE | \
XFEATURE_MASK_APX)
@@ -573,6 +575,7 @@ static bool __init check_xstate_against_struct(int nr)
case XFEATURE_PASID: return XCHECK_SZ(sz, nr, struct ia32_pasid_state);
case XFEATURE_XTILE_CFG: return XCHECK_SZ(sz, nr, struct xtile_cfg);
case XFEATURE_CET_USER: return XCHECK_SZ(sz, nr, struct cet_user_state);
+ case XFEATURE_CET_KERNEL: return XCHECK_SZ(sz, nr, struct cet_supervisor_state);
case XFEATURE_APX: return XCHECK_SZ(sz, nr, struct apx_state);
case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true;
default:
@@ -743,6 +746,9 @@ static int __init init_xstate_size(void)
fpu_user_cfg.default_size =
xstate_calculate_size(fpu_user_cfg.default_features, false);
+ guest_default_cfg.size =
+ xstate_calculate_size(guest_default_cfg.features, compacted);
+
return 0;
}
@@ -763,6 +769,7 @@ static void __init fpu__init_disable_system_xstate(unsigned int legacy_size)
fpu_kernel_cfg.default_size = legacy_size;
fpu_user_cfg.max_size = legacy_size;
fpu_user_cfg.default_size = legacy_size;
+ guest_default_cfg.size = legacy_size;
/*
* Prevent enabling the static branch which enables writes to the
@@ -773,6 +780,24 @@ static void __init fpu__init_disable_system_xstate(unsigned int legacy_size)
fpstate_reset(x86_task_fpu(current));
}
+static u64 __init host_default_mask(void)
+{
+ /*
+ * Exclude dynamic features (require userspace opt-in) and features
+ * that are supported only for KVM guests.
+ */
+ return ~((u64)XFEATURE_MASK_USER_DYNAMIC | XFEATURE_MASK_GUEST_SUPERVISOR);
+}
+
+static u64 __init guest_default_mask(void)
+{
+ /*
+ * Exclude dynamic features, which require userspace opt-in even
+ * for KVM guests.
+ */
+ return ~(u64)XFEATURE_MASK_USER_DYNAMIC;
+}
+
/*
* Enable and initialize the xsave feature.
* Called once per system bootup.
@@ -855,12 +880,13 @@ void __init fpu__init_system_xstate(unsigned int legacy_size)
fpu_user_cfg.max_features = fpu_kernel_cfg.max_features;
fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
- /* Clean out dynamic features from default */
- fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features;
- fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;
-
- fpu_user_cfg.default_features = fpu_user_cfg.max_features;
- fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;
+ /*
+ * Now, given maximum feature set, determine default values by
+ * applying default masks.
+ */
+ fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features & host_default_mask();
+ fpu_user_cfg.default_features = fpu_user_cfg.max_features & host_default_mask();
+ guest_default_cfg.features = fpu_kernel_cfg.max_features & guest_default_mask();
/* Store it for paranoia check at the end */
xfeatures = fpu_kernel_cfg.max_features;
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
index 9cea1fc36c18..243a769fdd97 100644
--- a/arch/x86/kernel/itmt.c
+++ b/arch/x86/kernel/itmt.c
@@ -59,6 +59,18 @@ static ssize_t sched_itmt_enabled_write(struct file *filp,
return result;
}
+static int sched_core_priority_show(struct seq_file *s, void *unused)
+{
+ int cpu;
+
+ seq_puts(s, "CPU #\tPriority\n");
+ for_each_possible_cpu(cpu)
+ seq_printf(s, "%d\t%d\n", cpu, arch_asym_cpu_priority(cpu));
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(sched_core_priority);
+
static const struct file_operations dfs_sched_itmt_fops = {
.read = debugfs_read_file_bool,
.write = sched_itmt_enabled_write,
@@ -67,6 +79,7 @@ static const struct file_operations dfs_sched_itmt_fops = {
};
static struct dentry *dfs_sched_itmt;
+static struct dentry *dfs_sched_core_prio;
/**
* sched_set_itmt_support() - Indicate platform supports ITMT
@@ -102,6 +115,14 @@ int sched_set_itmt_support(void)
return -ENOMEM;
}
+ dfs_sched_core_prio = debugfs_create_file("sched_core_priority", 0644,
+ arch_debugfs_dir, NULL,
+ &sched_core_priority_fops);
+ if (IS_ERR_OR_NULL(dfs_sched_core_prio)) {
+ dfs_sched_core_prio = NULL;
+ return -ENOMEM;
+ }
+
sched_itmt_capable = true;
sysctl_sched_itmt_enabled = 1;
@@ -133,6 +154,8 @@ void sched_clear_itmt_support(void)
debugfs_remove(dfs_sched_itmt);
dfs_sched_itmt = NULL;
+ debugfs_remove(dfs_sched_core_prio);
+ dfs_sched_core_prio = NULL;
if (sysctl_sched_itmt_enabled) {
/* disable sched_itmt if we are no longer ITMT capable */
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index a838be04f874..1b7960cf6eb0 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -334,13 +334,21 @@ DEFINE_PER_CPU(u64, msr_misc_features_shadow);
static void set_cpuid_faulting(bool on)
{
- u64 msrval;
- msrval = this_cpu_read(msr_misc_features_shadow);
- msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT;
- msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT);
- this_cpu_write(msr_misc_features_shadow, msrval);
- wrmsrq(MSR_MISC_FEATURES_ENABLES, msrval);
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+ u64 msrval;
+
+ msrval = this_cpu_read(msr_misc_features_shadow);
+ msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT;
+ msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT);
+ this_cpu_write(msr_misc_features_shadow, msrval);
+ wrmsrq(MSR_MISC_FEATURES_ENABLES, msrval);
+ } else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+ if (on)
+ msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_CPUID_USER_DIS_BIT);
+ else
+ msr_clear_bit(MSR_K7_HWCR, MSR_K7_HWCR_CPUID_USER_DIS_BIT);
+ }
}
static void disable_cpuid(void)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b972bf72fb8b..52a5c03c353c 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -707,6 +707,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/* Load the Intel cache allocation PQR MSR. */
resctrl_arch_sched_in(next_p);
+ /* Reset hw history on AMD CPUs */
+ if (cpu_feature_enabled(X86_FEATURE_AMD_WORKLOAD_CLASS))
+ wrmsrl(MSR_AMD_WORKLOAD_HRST, 0x1);
+
return prev_p;
}
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index fb27be697128..0792f31961ac 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -213,8 +213,10 @@ arch_initcall(init_x86_sysctl);
*/
struct screen_info screen_info;
EXPORT_SYMBOL(screen_info);
+#if defined(CONFIG_FIRMWARE_EDID)
struct edid_info edid_info;
EXPORT_SYMBOL_GPL(edid_info);
+#endif
extern int root_mountflags;
@@ -525,7 +527,9 @@ static void __init parse_boot_params(void)
{
ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
screen_info = boot_params.screen_info;
+#if defined(CONFIG_FIRMWARE_EDID)
edid_info = boot_params.edid_info;
+#endif
#ifdef CONFIG_X86_32
apm_info.bios = boot_params.apm_bios_info;
ist_info = boot_params.ist_info;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 58ede3fa6a75..33e166f6ab12 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -478,44 +478,41 @@ static int x86_cluster_flags(void)
*/
static bool x86_has_numa_in_package;
-static struct sched_domain_topology_level x86_topology[6];
-
-static void __init build_sched_topology(void)
-{
- int i = 0;
-
-#ifdef CONFIG_SCHED_SMT
- x86_topology[i++] = (struct sched_domain_topology_level){
- cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT)
- };
-#endif
+static struct sched_domain_topology_level x86_topology[] = {
+ SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT),
#ifdef CONFIG_SCHED_CLUSTER
- x86_topology[i++] = (struct sched_domain_topology_level){
- cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS)
- };
+ SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS),
#endif
#ifdef CONFIG_SCHED_MC
- x86_topology[i++] = (struct sched_domain_topology_level){
- cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC)
- };
+ SDTL_INIT(cpu_coregroup_mask, x86_core_flags, MC),
#endif
+ SDTL_INIT(cpu_cpu_mask, x86_sched_itmt_flags, PKG),
+ { NULL },
+};
+
+static void __init build_sched_topology(void)
+{
+ struct sched_domain_topology_level *topology = x86_topology;
+
/*
- * When there is NUMA topology inside the package skip the PKG domain
- * since the NUMA domains will auto-magically create the right spanning
- * domains based on the SLIT.
+ * When there is NUMA topology inside the package invalidate the
+ * PKG domain since the NUMA domains will auto-magically create the
+ * right spanning domains based on the SLIT.
*/
- if (!x86_has_numa_in_package) {
- x86_topology[i++] = (struct sched_domain_topology_level){
- cpu_cpu_mask, x86_sched_itmt_flags, SD_INIT_NAME(PKG)
- };
+ if (x86_has_numa_in_package) {
+ unsigned int pkgdom = ARRAY_SIZE(x86_topology) - 2;
+
+ memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom]));
}
/*
- * There must be one trailing NULL entry left.
+ * Drop the SMT domains if there is only one thread per-core
+ * since it'll get degenerated by the scheduler anyways.
*/
- BUG_ON(i >= ARRAY_SIZE(x86_topology)-1);
+ if (cpu_smt_num_threads <= 1)
+ ++topology;
- set_sched_topology(x86_topology);
+ set_sched_topology(topology);
}
void set_cpu_sibling_map(int cpu)
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 2eeffcec5382..2c86673155c9 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -166,6 +166,16 @@ config KVM_AMD_SEV
Encrypted State (SEV-ES), and Secure Encrypted Virtualization with
Secure Nested Paging (SEV-SNP) technologies on AMD processors.
+config KVM_IOAPIC
+ bool "I/O APIC, PIC, and PIT emulation"
+ default y
+ depends on KVM
+ help
+ Provides support for KVM to emulate an I/O APIC, PIC, and PIT, i.e.
+ for full in-kernel APIC emulation.
+
+ If unsure, say Y.
+
config KVM_SMM
bool "System Management Mode emulation"
default y
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index a5d362c7b504..c4b8950c7abe 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -5,12 +5,11 @@ ccflags-$(CONFIG_KVM_WERROR) += -Werror
include $(srctree)/virt/kvm/Makefile.kvm
-kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \
- i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
- debugfs.o mmu/mmu.o mmu/page_track.o \
- mmu/spte.o
+kvm-y += x86.o emulate.o irq.o lapic.o cpuid.o pmu.o mtrr.o \
+ debugfs.o mmu/mmu.o mmu/page_track.o mmu/spte.o
kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o
+kvm-$(CONFIG_KVM_IOAPIC) += i8259.o i8254.o ioapic.o
kvm-$(CONFIG_KVM_HYPERV) += hyperv.o
kvm-$(CONFIG_KVM_XEN) += xen.o
kvm-$(CONFIG_KVM_SMM) += smm.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index f84bc0569c9c..e2836a255b16 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -979,6 +979,7 @@ void kvm_set_cpu_caps(void)
F(FSRS),
F(FSRC),
F(WRMSRNS),
+ X86_64_F(LKGS),
F(AMX_FP16),
F(AVX_IFMA),
F(LAM),
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index ee27064dd72f..72b19a88a776 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -497,15 +497,19 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint)
return ret;
}
-int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vpidx, u32 sint)
+int kvm_hv_synic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status)
{
struct kvm_vcpu_hv_synic *synic;
- synic = synic_get(kvm, vpidx);
+ if (!level)
+ return -1;
+
+ synic = synic_get(kvm, e->hv_sint.vcpu);
if (!synic)
return -EINVAL;
- return synic_set_irq(synic, sint);
+ return synic_set_irq(synic, e->hv_sint.sint);
}
void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector)
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 913bfc96959c..6ce160ffa678 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -103,7 +103,8 @@ static inline bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu)
int kvm_hv_hypercall(struct kvm_vcpu *vcpu);
void kvm_hv_irq_routing_update(struct kvm *kvm);
-int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint);
+int kvm_hv_synic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status);
void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector);
int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages);
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 739aa6c0d0c3..d1b79b418c05 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -248,8 +248,8 @@ static void pit_do_work(struct kthread_work *work)
if (atomic_read(&ps->reinject) && !atomic_xchg(&ps->irq_ack, 0))
return;
- kvm_set_irq(kvm, pit->irq_source_id, 0, 1, false);
- kvm_set_irq(kvm, pit->irq_source_id, 0, 0, false);
+ kvm_set_irq(kvm, KVM_PIT_IRQ_SOURCE_ID, 0, 1, false);
+ kvm_set_irq(kvm, KVM_PIT_IRQ_SOURCE_ID, 0, 0, false);
/*
* Provides NMI watchdog support via Virtual Wire mode.
@@ -288,7 +288,7 @@ static inline void kvm_pit_reset_reinject(struct kvm_pit *pit)
atomic_set(&pit->pit_state.irq_ack, 1);
}
-void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject)
+static void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject)
{
struct kvm_kpit_state *ps = &pit->pit_state;
struct kvm *kvm = pit->kvm;
@@ -400,8 +400,8 @@ static void pit_load_count(struct kvm_pit *pit, int channel, u32 val)
}
}
-void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
- int hpet_legacy_start)
+static void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
+ int hpet_legacy_start)
{
u8 saved_mode;
@@ -649,6 +649,79 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
kvm_pit_reset_reinject(pit);
}
+int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
+{
+ struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
+
+ BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
+
+ mutex_lock(&kps->lock);
+ memcpy(ps, &kps->channels, sizeof(*ps));
+ mutex_unlock(&kps->lock);
+ return 0;
+}
+
+int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
+{
+ int i;
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ mutex_lock(&pit->pit_state.lock);
+ memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
+ for (i = 0; i < 3; i++)
+ kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
+ mutex_unlock(&pit->pit_state.lock);
+ return 0;
+}
+
+int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
+{
+ mutex_lock(&kvm->arch.vpit->pit_state.lock);
+ memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
+ sizeof(ps->channels));
+ ps->flags = kvm->arch.vpit->pit_state.flags;
+ mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ memset(&ps->reserved, 0, sizeof(ps->reserved));
+ return 0;
+}
+
+int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
+{
+ int start = 0;
+ int i;
+ u32 prev_legacy, cur_legacy;
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ mutex_lock(&pit->pit_state.lock);
+ prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
+ cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
+ if (!prev_legacy && cur_legacy)
+ start = 1;
+ memcpy(&pit->pit_state.channels, &ps->channels,
+ sizeof(pit->pit_state.channels));
+ pit->pit_state.flags = ps->flags;
+ for (i = 0; i < 3; i++)
+ kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
+ start && i == 0);
+ mutex_unlock(&pit->pit_state.lock);
+ return 0;
+}
+
+int kvm_vm_ioctl_reinject(struct kvm *kvm, struct kvm_reinject_control *control)
+{
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ /* pit->pit_state.lock was overloaded to prevent userspace from getting
+ * an inconsistent state after running multiple KVM_REINJECT_CONTROL
+ * ioctls in parallel. Use a separate lock if that ioctl isn't rare.
+ */
+ mutex_lock(&pit->pit_state.lock);
+ kvm_pit_set_reinject(pit, control->pit_reinject);
+ mutex_unlock(&pit->pit_state.lock);
+
+ return 0;
+}
+
static const struct kvm_io_device_ops pit_dev_ops = {
.read = pit_ioport_read,
.write = pit_ioport_write,
@@ -671,10 +744,6 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
if (!pit)
return NULL;
- pit->irq_source_id = kvm_request_irq_source_id(kvm);
- if (pit->irq_source_id < 0)
- goto fail_request;
-
mutex_init(&pit->pit_state.lock);
pid = get_pid(task_tgid(current));
@@ -726,8 +795,6 @@ fail_register_pit:
kvm_pit_set_reinject(pit, false);
kthread_destroy_worker(pit->worker);
fail_kthread:
- kvm_free_irq_source_id(kvm, pit->irq_source_id);
-fail_request:
kfree(pit);
return NULL;
}
@@ -744,7 +811,6 @@ void kvm_free_pit(struct kvm *kvm)
kvm_pit_set_reinject(pit, false);
hrtimer_cancel(&pit->pit_state.timer);
kthread_destroy_worker(pit->worker);
- kvm_free_irq_source_id(kvm, pit->irq_source_id);
kfree(pit);
}
}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index a768212ba821..60fa499d2f8a 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -6,6 +6,11 @@
#include <kvm/iodev.h>
+#include <uapi/asm/kvm.h>
+
+#include "ioapic.h"
+
+#ifdef CONFIG_KVM_IOAPIC
struct kvm_kpit_channel_state {
u32 count; /* can be 65536 */
u16 latched_count;
@@ -42,7 +47,6 @@ struct kvm_pit {
struct kvm_io_device speaker_dev;
struct kvm *kvm;
struct kvm_kpit_state pit_state;
- int irq_source_id;
struct kvm_irq_mask_notifier mask_notifier;
struct kthread_worker *worker;
struct kthread_work expired;
@@ -55,11 +59,14 @@ struct kvm_pit {
#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100
#define KVM_PIT_CHANNEL_MASK 0x3
+int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps);
+int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps);
+int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps);
+int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps);
+int kvm_vm_ioctl_reinject(struct kvm *kvm, struct kvm_reinject_control *control);
+
struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
void kvm_free_pit(struct kvm *kvm);
-
-void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
- int hpet_legacy_start);
-void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject);
+#endif /* CONFIG_KVM_IOAPIC */
#endif
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index a8fb19940975..2ac7f1678c46 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -31,6 +31,8 @@
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/bitops.h>
+
+#include "ioapic.h"
#include "irq.h"
#include <linux/kvm_host.h>
@@ -185,8 +187,11 @@ void kvm_pic_update_irq(struct kvm_pic *s)
pic_unlock(s);
}
-int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level)
+int kvm_pic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status)
{
+ struct kvm_pic *s = kvm->arch.vpic;
+ int irq = e->irqchip.pin;
int ret, irq_level;
BUG_ON(irq < 0 || irq >= PIC_NUM_PINS);
@@ -203,16 +208,6 @@ int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level)
return ret;
}
-void kvm_pic_clear_all(struct kvm_pic *s, int irq_source_id)
-{
- int i;
-
- pic_lock(s);
- for (i = 0; i < PIC_NUM_PINS; i++)
- __clear_bit(irq_source_id, &s->irq_states[i]);
- pic_unlock(s);
-}
-
/*
* acknowledge interrupt 'irq'
*/
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 45dae2d5d2f1..2b5d389bca5f 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -41,11 +41,11 @@
#include <asm/processor.h>
#include <asm/page.h>
#include <asm/current.h>
-#include <trace/events/kvm.h>
#include "ioapic.h"
#include "lapic.h"
#include "irq.h"
+#include "trace.h"
static int ioapic_service(struct kvm_ioapic *vioapic, int irq,
bool line_status);
@@ -310,6 +310,42 @@ void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm)
kvm_make_scan_ioapic_request(kvm);
}
+void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+ mutex_lock(&kvm->irq_lock);
+ kimn->irq = irq;
+ hlist_add_head_rcu(&kimn->link, &ioapic->mask_notifier_list);
+ mutex_unlock(&kvm->irq_lock);
+}
+
+void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn)
+{
+ mutex_lock(&kvm->irq_lock);
+ hlist_del_rcu(&kimn->link);
+ mutex_unlock(&kvm->irq_lock);
+ synchronize_srcu(&kvm->irq_srcu);
+}
+
+void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
+ bool mask)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+ struct kvm_irq_mask_notifier *kimn;
+ int idx, gsi;
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
+ if (gsi != -1)
+ hlist_for_each_entry_rcu(kimn, &ioapic->mask_notifier_list, link)
+ if (kimn->irq == gsi)
+ kimn->func(kimn, mask);
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+}
+
static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
{
unsigned index;
@@ -479,9 +515,11 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
return ret;
}
-int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
- int level, bool line_status)
+int kvm_ioapic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status)
{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+ int irq = e->irqchip.pin;
int ret, irq_level;
BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS);
@@ -496,16 +534,6 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
return ret;
}
-void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id)
-{
- int i;
-
- spin_lock(&ioapic->lock);
- for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++)
- __clear_bit(irq_source_id, &ioapic->irq_states[i]);
- spin_unlock(&ioapic->lock);
-}
-
static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
{
int i;
@@ -718,6 +746,7 @@ int kvm_ioapic_init(struct kvm *kvm)
return -ENOMEM;
spin_lock_init(&ioapic->lock);
INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work);
+ INIT_HLIST_HEAD(&ioapic->mask_notifier_list);
kvm->arch.vioapic = ioapic;
kvm_ioapic_reset(ioapic);
kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index aa8cb4ac0479..bf28dbc11ff6 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -86,8 +86,24 @@ struct kvm_ioapic {
struct delayed_work eoi_inject;
u32 irq_eoi[IOAPIC_NUM_PINS];
u32 irr_delivered;
+
+ /* reads protected by irq_srcu, writes by irq_lock */
+ struct hlist_head mask_notifier_list;
+};
+
+struct kvm_irq_mask_notifier {
+ void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked);
+ int irq;
+ struct hlist_node link;
};
+void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn);
+void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn);
+void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
+ bool mask);
+
#ifdef DEBUG
#define ASSERT(x) \
do { \
@@ -103,7 +119,7 @@ do { \
static inline int ioapic_in_kernel(struct kvm *kvm)
{
- return irqchip_kernel(kvm);
+ return irqchip_full(kvm);
}
void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
@@ -111,9 +127,9 @@ void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector,
int trigger_mode);
int kvm_ioapic_init(struct kvm *kvm);
void kvm_ioapic_destroy(struct kvm *kvm);
-int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
- int level, bool line_status);
-void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id);
+int kvm_ioapic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status);
+
void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 97d68d837929..16da89259011 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -11,9 +11,12 @@
#include <linux/export.h>
#include <linux/kvm_host.h>
+#include <linux/kvm_irqfd.h>
+#include "hyperv.h"
+#include "ioapic.h"
#include "irq.h"
-#include "i8254.h"
+#include "trace.h"
#include "x86.h"
#include "xen.h"
@@ -41,6 +44,14 @@ static int pending_userspace_extint(struct kvm_vcpu *v)
return v->arch.pending_external_vector != -1;
}
+static int get_userspace_extint(struct kvm_vcpu *vcpu)
+{
+ int vector = vcpu->arch.pending_external_vector;
+
+ vcpu->arch.pending_external_vector = -1;
+ return vector;
+}
+
/*
* check if there is pending interrupt from
* non-APIC source without intack.
@@ -67,10 +78,13 @@ int kvm_cpu_has_extint(struct kvm_vcpu *v)
if (!kvm_apic_accept_pic_intr(v))
return 0;
- if (irqchip_split(v->kvm))
- return pending_userspace_extint(v);
- else
+#ifdef CONFIG_KVM_IOAPIC
+ if (pic_in_kernel(v->kvm))
return v->kvm->arch.vpic->output;
+#endif
+
+ WARN_ON_ONCE(!irqchip_split(v->kvm));
+ return pending_userspace_extint(v);
}
/*
@@ -126,13 +140,13 @@ int kvm_cpu_get_extint(struct kvm_vcpu *v)
return v->kvm->arch.xen.upcall_vector;
#endif
- if (irqchip_split(v->kvm)) {
- int vector = v->arch.pending_external_vector;
-
- v->arch.pending_external_vector = -1;
- return vector;
- } else
+#ifdef CONFIG_KVM_IOAPIC
+ if (pic_in_kernel(v->kvm))
return kvm_pic_read_irq(v->kvm); /* PIC */
+#endif
+
+ WARN_ON_ONCE(!irqchip_split(v->kvm));
+ return get_userspace_extint(v);
}
EXPORT_SYMBOL_GPL(kvm_cpu_get_extint);
@@ -163,7 +177,9 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
{
__kvm_migrate_apic_timer(vcpu);
+#ifdef CONFIG_KVM_IOAPIC
__kvm_migrate_pit_timer(vcpu);
+#endif
kvm_x86_call(migrate_timers)(vcpu);
}
@@ -171,10 +187,532 @@ bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
{
bool resample = args->flags & KVM_IRQFD_FLAG_RESAMPLE;
- return resample ? irqchip_kernel(kvm) : irqchip_in_kernel(kvm);
+ return resample ? irqchip_full(kvm) : irqchip_in_kernel(kvm);
}
bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
{
return irqchip_in_kernel(kvm);
}
+
+int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq, struct dest_map *dest_map)
+{
+ int r = -1;
+ struct kvm_vcpu *vcpu, *lowest = NULL;
+ unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
+ unsigned int dest_vcpus = 0;
+
+ if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
+ return r;
+
+ if (irq->dest_mode == APIC_DEST_PHYSICAL &&
+ irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) {
+ pr_info("apic: phys broadcast and lowest prio\n");
+ irq->delivery_mode = APIC_DM_FIXED;
+ }
+
+ memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap));
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+
+ if (!kvm_apic_match_dest(vcpu, src, irq->shorthand,
+ irq->dest_id, irq->dest_mode))
+ continue;
+
+ if (!kvm_lowest_prio_delivery(irq)) {
+ if (r < 0)
+ r = 0;
+ r += kvm_apic_set_irq(vcpu, irq, dest_map);
+ } else if (kvm_apic_sw_enabled(vcpu->arch.apic)) {
+ if (!kvm_vector_hashing_enabled()) {
+ if (!lowest)
+ lowest = vcpu;
+ else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
+ lowest = vcpu;
+ } else {
+ __set_bit(i, dest_vcpu_bitmap);
+ dest_vcpus++;
+ }
+ }
+ }
+
+ if (dest_vcpus != 0) {
+ int idx = kvm_vector_to_index(irq->vector, dest_vcpus,
+ dest_vcpu_bitmap, KVM_MAX_VCPUS);
+
+ lowest = kvm_get_vcpu(kvm, idx);
+ }
+
+ if (lowest)
+ r = kvm_apic_set_irq(lowest, irq, dest_map);
+
+ return r;
+}
+
+static void kvm_msi_to_lapic_irq(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
+ struct kvm_lapic_irq *irq)
+{
+ struct msi_msg msg = { .address_lo = e->msi.address_lo,
+ .address_hi = e->msi.address_hi,
+ .data = e->msi.data };
+
+ trace_kvm_msi_set_irq(msg.address_lo | (kvm->arch.x2apic_format ?
+ (u64)msg.address_hi << 32 : 0), msg.data);
+
+ irq->dest_id = x86_msi_msg_get_destid(&msg, kvm->arch.x2apic_format);
+ irq->vector = msg.arch_data.vector;
+ irq->dest_mode = kvm_lapic_irq_dest_mode(msg.arch_addr_lo.dest_mode_logical);
+ irq->trig_mode = msg.arch_data.is_level;
+ irq->delivery_mode = msg.arch_data.delivery_mode << 8;
+ irq->msi_redir_hint = msg.arch_addr_lo.redirect_hint;
+ irq->level = 1;
+ irq->shorthand = APIC_DEST_NOSHORT;
+}
+
+static inline bool kvm_msi_route_invalid(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e)
+{
+ return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff);
+}
+
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level, bool line_status)
+{
+ struct kvm_lapic_irq irq;
+
+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+
+ if (!level)
+ return -1;
+
+ kvm_msi_to_lapic_irq(kvm, e, &irq);
+
+ return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);
+}
+
+int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level,
+ bool line_status)
+{
+ struct kvm_lapic_irq irq;
+ int r;
+
+ switch (e->type) {
+#ifdef CONFIG_KVM_HYPERV
+ case KVM_IRQ_ROUTING_HV_SINT:
+ return kvm_hv_synic_set_irq(e, kvm, irq_source_id, level,
+ line_status);
+#endif
+
+ case KVM_IRQ_ROUTING_MSI:
+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+
+ kvm_msi_to_lapic_irq(kvm, e, &irq);
+
+ if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
+ return r;
+ break;
+
+#ifdef CONFIG_KVM_XEN
+ case KVM_IRQ_ROUTING_XEN_EVTCHN:
+ if (!level)
+ return -1;
+
+ return kvm_xen_set_evtchn_fast(&e->xen_evtchn, kvm);
+#endif
+ default:
+ break;
+ }
+
+ return -EWOULDBLOCK;
+}
+
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
+ bool line_status)
+{
+ if (!irqchip_in_kernel(kvm))
+ return -ENXIO;
+
+ irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+ irq_event->irq, irq_event->level,
+ line_status);
+ return 0;
+}
+
+bool kvm_arch_can_set_irq_routing(struct kvm *kvm)
+{
+ return irqchip_in_kernel(kvm);
+}
+
+int kvm_set_routing_entry(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
+ const struct kvm_irq_routing_entry *ue)
+{
+ /* We can't check irqchip_in_kernel() here as some callers are
+ * currently initializing the irqchip. Other callers should therefore
+ * check kvm_arch_can_set_irq_routing() before calling this function.
+ */
+ switch (ue->type) {
+#ifdef CONFIG_KVM_IOAPIC
+ case KVM_IRQ_ROUTING_IRQCHIP:
+ if (irqchip_split(kvm))
+ return -EINVAL;
+ e->irqchip.pin = ue->u.irqchip.pin;
+ switch (ue->u.irqchip.irqchip) {
+ case KVM_IRQCHIP_PIC_SLAVE:
+ e->irqchip.pin += PIC_NUM_PINS / 2;
+ fallthrough;
+ case KVM_IRQCHIP_PIC_MASTER:
+ if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2)
+ return -EINVAL;
+ e->set = kvm_pic_set_irq;
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS)
+ return -EINVAL;
+ e->set = kvm_ioapic_set_irq;
+ break;
+ default:
+ return -EINVAL;
+ }
+ e->irqchip.irqchip = ue->u.irqchip.irqchip;
+ break;
+#endif
+ case KVM_IRQ_ROUTING_MSI:
+ e->set = kvm_set_msi;
+ e->msi.address_lo = ue->u.msi.address_lo;
+ e->msi.address_hi = ue->u.msi.address_hi;
+ e->msi.data = ue->u.msi.data;
+
+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+ break;
+#ifdef CONFIG_KVM_HYPERV
+ case KVM_IRQ_ROUTING_HV_SINT:
+ e->set = kvm_hv_synic_set_irq;
+ e->hv_sint.vcpu = ue->u.hv_sint.vcpu;
+ e->hv_sint.sint = ue->u.hv_sint.sint;
+ break;
+#endif
+#ifdef CONFIG_KVM_XEN
+ case KVM_IRQ_ROUTING_XEN_EVTCHN:
+ return kvm_xen_setup_evtchn(kvm, e, ue);
+#endif
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu)
+{
+ int r = 0;
+ unsigned long i;
+ struct kvm_vcpu *vcpu;
+
+ if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu))
+ return true;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+
+ if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand,
+ irq->dest_id, irq->dest_mode))
+ continue;
+
+ if (++r == 2)
+ return false;
+
+ *dest_vcpu = vcpu;
+ }
+
+ return r == 1;
+}
+EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu);
+
+void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode,
+ u8 vector, unsigned long *ioapic_handled_vectors)
+{
+ /*
+ * Intercept EOI if the vCPU is the target of the new IRQ routing, or
+ * the vCPU has a pending IRQ from the old routing, i.e. if the vCPU
+ * may receive a level-triggered IRQ in the future, or already received
+ * level-triggered IRQ. The EOI needs to be intercepted and forwarded
+ * to I/O APIC emulation so that the IRQ can be de-asserted.
+ */
+ if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, dest_id, dest_mode)) {
+ __set_bit(vector, ioapic_handled_vectors);
+ } else if (kvm_apic_pending_eoi(vcpu, vector)) {
+ __set_bit(vector, ioapic_handled_vectors);
+
+ /*
+ * Track the highest pending EOI for which the vCPU is NOT the
+ * target in the new routing. Only the EOI for the IRQ that is
+ * in-flight (for the old routing) needs to be intercepted, any
+ * future IRQs that arrive on this vCPU will be coincidental to
+ * the level-triggered routing and don't need to be intercepted.
+ */
+ if ((int)vector > vcpu->arch.highest_stale_pending_ioapic_eoi)
+ vcpu->arch.highest_stale_pending_ioapic_eoi = vector;
+ }
+}
+
+void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
+ ulong *ioapic_handled_vectors)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_kernel_irq_routing_entry *entry;
+ struct kvm_irq_routing_table *table;
+ u32 i, nr_ioapic_pins;
+ int idx;
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+ nr_ioapic_pins = min_t(u32, table->nr_rt_entries,
+ kvm->arch.nr_reserved_ioapic_pins);
+ for (i = 0; i < nr_ioapic_pins; ++i) {
+ hlist_for_each_entry(entry, &table->map[i], link) {
+ struct kvm_lapic_irq irq;
+
+ if (entry->type != KVM_IRQ_ROUTING_MSI)
+ continue;
+
+ kvm_msi_to_lapic_irq(vcpu->kvm, entry, &irq);
+
+ if (!irq.trig_mode)
+ continue;
+
+ kvm_scan_ioapic_irq(vcpu, irq.dest_id, irq.dest_mode,
+ irq.vector, ioapic_handled_vectors);
+ }
+ }
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+}
+
+void kvm_arch_irq_routing_update(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_HYPERV
+ kvm_hv_irq_routing_update(kvm);
+#endif
+
+ if (irqchip_split(kvm))
+ kvm_make_scan_ioapic_request(kvm);
+}
+
+static int kvm_pi_update_irte(struct kvm_kernel_irqfd *irqfd,
+ struct kvm_kernel_irq_routing_entry *entry)
+{
+ unsigned int host_irq = irqfd->producer->irq;
+ struct kvm *kvm = irqfd->kvm;
+ struct kvm_vcpu *vcpu = NULL;
+ struct kvm_lapic_irq irq;
+ int r;
+
+ if (WARN_ON_ONCE(!irqchip_in_kernel(kvm) || !kvm_arch_has_irq_bypass()))
+ return -EINVAL;
+
+ if (entry && entry->type == KVM_IRQ_ROUTING_MSI) {
+ kvm_msi_to_lapic_irq(kvm, entry, &irq);
+
+ /*
+ * Force remapped mode if hardware doesn't support posting the
+ * virtual interrupt to a vCPU. Only IRQs are postable (NMIs,
+ * SMIs, etc. are not), and neither AMD nor Intel IOMMUs support
+ * posting multicast/broadcast IRQs. If the interrupt can't be
+ * posted, the device MSI needs to be routed to the host so that
+ * the guest's desired interrupt can be synthesized by KVM.
+ *
+ * This means that KVM can only post lowest-priority interrupts
+ * if they have a single CPU as the destination, e.g. only if
+ * the guest has affined the interrupt to a single vCPU.
+ */
+ if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
+ !kvm_irq_is_postable(&irq))
+ vcpu = NULL;
+ }
+
+ if (!irqfd->irq_bypass_vcpu && !vcpu)
+ return 0;
+
+ r = kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, host_irq, irqfd->gsi,
+ vcpu, irq.vector);
+ if (r) {
+ WARN_ON_ONCE(irqfd->irq_bypass_vcpu && !vcpu);
+ irqfd->irq_bypass_vcpu = NULL;
+ return r;
+ }
+
+ irqfd->irq_bypass_vcpu = vcpu;
+
+ trace_kvm_pi_irte_update(host_irq, vcpu, irqfd->gsi, irq.vector, !!vcpu);
+ return 0;
+}
+
+int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+ struct kvm *kvm = irqfd->kvm;
+ int ret = 0;
+
+ spin_lock_irq(&kvm->irqfds.lock);
+ irqfd->producer = prod;
+
+ if (!kvm->arch.nr_possible_bypass_irqs++)
+ kvm_x86_call(pi_start_bypass)(kvm);
+
+ if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) {
+ ret = kvm_pi_update_irte(irqfd, &irqfd->irq_entry);
+ if (ret)
+ kvm->arch.nr_possible_bypass_irqs--;
+ }
+ spin_unlock_irq(&kvm->irqfds.lock);
+
+ return ret;
+}
+
+void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+ struct kvm *kvm = irqfd->kvm;
+ int ret;
+
+ WARN_ON(irqfd->producer != prod);
+
+ /*
+ * If the producer of an IRQ that is currently being posted to a vCPU
+ * is unregistered, change the associated IRTE back to remapped mode as
+ * the IRQ has been released (or repurposed) by the device driver, i.e.
+ * KVM must relinquish control of the IRTE.
+ */
+ spin_lock_irq(&kvm->irqfds.lock);
+
+ if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) {
+ ret = kvm_pi_update_irte(irqfd, NULL);
+ if (ret)
+ pr_info("irq bypass consumer (eventfd %p) unregistration fails: %d\n",
+ irqfd->consumer.eventfd, ret);
+ }
+ irqfd->producer = NULL;
+
+ kvm->arch.nr_possible_bypass_irqs--;
+
+ spin_unlock_irq(&kvm->irqfds.lock);
+}
+
+void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
+ struct kvm_kernel_irq_routing_entry *old,
+ struct kvm_kernel_irq_routing_entry *new)
+{
+ if (new->type != KVM_IRQ_ROUTING_MSI &&
+ old->type != KVM_IRQ_ROUTING_MSI)
+ return;
+
+ if (old->type == KVM_IRQ_ROUTING_MSI &&
+ new->type == KVM_IRQ_ROUTING_MSI &&
+ !memcmp(&old->msi, &new->msi, sizeof(new->msi)))
+ return;
+
+ kvm_pi_update_irte(irqfd, new);
+}
+
+#ifdef CONFIG_KVM_IOAPIC
+#define IOAPIC_ROUTING_ENTRY(irq) \
+ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
+ .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
+#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq)
+
+#define PIC_ROUTING_ENTRY(irq) \
+ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
+ .u.irqchip = { .irqchip = SELECT_PIC(irq), .pin = (irq) % 8 } }
+#define ROUTING_ENTRY2(irq) \
+ IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq)
+
+static const struct kvm_irq_routing_entry default_routing[] = {
+ ROUTING_ENTRY2(0), ROUTING_ENTRY2(1),
+ ROUTING_ENTRY2(2), ROUTING_ENTRY2(3),
+ ROUTING_ENTRY2(4), ROUTING_ENTRY2(5),
+ ROUTING_ENTRY2(6), ROUTING_ENTRY2(7),
+ ROUTING_ENTRY2(8), ROUTING_ENTRY2(9),
+ ROUTING_ENTRY2(10), ROUTING_ENTRY2(11),
+ ROUTING_ENTRY2(12), ROUTING_ENTRY2(13),
+ ROUTING_ENTRY2(14), ROUTING_ENTRY2(15),
+ ROUTING_ENTRY1(16), ROUTING_ENTRY1(17),
+ ROUTING_ENTRY1(18), ROUTING_ENTRY1(19),
+ ROUTING_ENTRY1(20), ROUTING_ENTRY1(21),
+ ROUTING_ENTRY1(22), ROUTING_ENTRY1(23),
+};
+
+int kvm_setup_default_ioapic_and_pic_routing(struct kvm *kvm)
+{
+ return kvm_set_irq_routing(kvm, default_routing,
+ ARRAY_SIZE(default_routing), 0);
+}
+
+int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+{
+ struct kvm_pic *pic = kvm->arch.vpic;
+ int r;
+
+ r = 0;
+ switch (chip->chip_id) {
+ case KVM_IRQCHIP_PIC_MASTER:
+ memcpy(&chip->chip.pic, &pic->pics[0],
+ sizeof(struct kvm_pic_state));
+ break;
+ case KVM_IRQCHIP_PIC_SLAVE:
+ memcpy(&chip->chip.pic, &pic->pics[1],
+ sizeof(struct kvm_pic_state));
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ kvm_get_ioapic(kvm, &chip->chip.ioapic);
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ return r;
+}
+
+int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+{
+ struct kvm_pic *pic = kvm->arch.vpic;
+ int r;
+
+ r = 0;
+ switch (chip->chip_id) {
+ case KVM_IRQCHIP_PIC_MASTER:
+ spin_lock(&pic->lock);
+ memcpy(&pic->pics[0], &chip->chip.pic,
+ sizeof(struct kvm_pic_state));
+ spin_unlock(&pic->lock);
+ break;
+ case KVM_IRQCHIP_PIC_SLAVE:
+ spin_lock(&pic->lock);
+ memcpy(&pic->pics[1], &chip->chip.pic,
+ sizeof(struct kvm_pic_state));
+ spin_unlock(&pic->lock);
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ kvm_set_ioapic(kvm, &chip->chip.ioapic);
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ kvm_pic_update_irq(pic);
+ return r;
+}
+#endif
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 76d46b2f41dd..5e62c1f79ce6 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -18,6 +18,8 @@
#include <kvm/iodev.h>
#include "lapic.h"
+#ifdef CONFIG_KVM_IOAPIC
+
#define PIC_NUM_PINS 16
#define SELECT_PIC(irq) \
((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE)
@@ -63,17 +65,15 @@ int kvm_pic_init(struct kvm *kvm);
void kvm_pic_destroy(struct kvm *kvm);
int kvm_pic_read_irq(struct kvm *kvm);
void kvm_pic_update_irq(struct kvm_pic *s);
+int kvm_pic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status);
-static inline int irqchip_split(struct kvm *kvm)
-{
- int mode = kvm->arch.irqchip_mode;
+int kvm_setup_default_ioapic_and_pic_routing(struct kvm *kvm);
- /* Matches smp_wmb() when setting irqchip_mode */
- smp_rmb();
- return mode == KVM_IRQCHIP_SPLIT;
-}
+int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip);
+int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip);
-static inline int irqchip_kernel(struct kvm *kvm)
+static inline int irqchip_full(struct kvm *kvm)
{
int mode = kvm->arch.irqchip_mode;
@@ -81,10 +81,26 @@ static inline int irqchip_kernel(struct kvm *kvm)
smp_rmb();
return mode == KVM_IRQCHIP_KERNEL;
}
+#else /* CONFIG_KVM_IOAPIC */
+static __always_inline int irqchip_full(struct kvm *kvm)
+{
+ return false;
+}
+#endif
static inline int pic_in_kernel(struct kvm *kvm)
{
- return irqchip_kernel(kvm);
+ return irqchip_full(kvm);
+}
+
+
+static inline int irqchip_split(struct kvm *kvm)
+{
+ int mode = kvm->arch.irqchip_mode;
+
+ /* Matches smp_wmb() when setting irqchip_mode */
+ smp_rmb();
+ return mode == KVM_IRQCHIP_SPLIT;
}
static inline int irqchip_in_kernel(struct kvm *kvm)
@@ -105,7 +121,6 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
int apic_has_pending_timer(struct kvm_vcpu *vcpu);
-int kvm_setup_default_irq_routing(struct kvm *kvm);
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq,
struct dest_map *dest_map);
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
deleted file mode 100644
index d6d792b5d1bd..000000000000
--- a/arch/x86/kvm/irq_comm.c
+++ /dev/null
@@ -1,469 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * irq_comm.c: Common API for in kernel interrupt controller
- * Copyright (c) 2007, Intel Corporation.
- *
- * Authors:
- * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
- *
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
- */
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/kvm_host.h>
-#include <linux/slab.h>
-#include <linux/export.h>
-#include <linux/rculist.h>
-
-#include <trace/events/kvm.h>
-
-#include "irq.h"
-
-#include "ioapic.h"
-
-#include "lapic.h"
-
-#include "hyperv.h"
-#include "x86.h"
-#include "xen.h"
-
-static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm, int irq_source_id, int level,
- bool line_status)
-{
- struct kvm_pic *pic = kvm->arch.vpic;
- return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level);
-}
-
-static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm, int irq_source_id, int level,
- bool line_status)
-{
- struct kvm_ioapic *ioapic = kvm->arch.vioapic;
- return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level,
- line_status);
-}
-
-int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
- struct kvm_lapic_irq *irq, struct dest_map *dest_map)
-{
- int r = -1;
- struct kvm_vcpu *vcpu, *lowest = NULL;
- unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
- unsigned int dest_vcpus = 0;
-
- if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
- return r;
-
- if (irq->dest_mode == APIC_DEST_PHYSICAL &&
- irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) {
- pr_info("apic: phys broadcast and lowest prio\n");
- irq->delivery_mode = APIC_DM_FIXED;
- }
-
- memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap));
-
- kvm_for_each_vcpu(i, vcpu, kvm) {
- if (!kvm_apic_present(vcpu))
- continue;
-
- if (!kvm_apic_match_dest(vcpu, src, irq->shorthand,
- irq->dest_id, irq->dest_mode))
- continue;
-
- if (!kvm_lowest_prio_delivery(irq)) {
- if (r < 0)
- r = 0;
- r += kvm_apic_set_irq(vcpu, irq, dest_map);
- } else if (kvm_apic_sw_enabled(vcpu->arch.apic)) {
- if (!kvm_vector_hashing_enabled()) {
- if (!lowest)
- lowest = vcpu;
- else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
- lowest = vcpu;
- } else {
- __set_bit(i, dest_vcpu_bitmap);
- dest_vcpus++;
- }
- }
- }
-
- if (dest_vcpus != 0) {
- int idx = kvm_vector_to_index(irq->vector, dest_vcpus,
- dest_vcpu_bitmap, KVM_MAX_VCPUS);
-
- lowest = kvm_get_vcpu(kvm, idx);
- }
-
- if (lowest)
- r = kvm_apic_set_irq(lowest, irq, dest_map);
-
- return r;
-}
-
-void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
- struct kvm_lapic_irq *irq)
-{
- struct msi_msg msg = { .address_lo = e->msi.address_lo,
- .address_hi = e->msi.address_hi,
- .data = e->msi.data };
-
- trace_kvm_msi_set_irq(msg.address_lo | (kvm->arch.x2apic_format ?
- (u64)msg.address_hi << 32 : 0), msg.data);
-
- irq->dest_id = x86_msi_msg_get_destid(&msg, kvm->arch.x2apic_format);
- irq->vector = msg.arch_data.vector;
- irq->dest_mode = kvm_lapic_irq_dest_mode(msg.arch_addr_lo.dest_mode_logical);
- irq->trig_mode = msg.arch_data.is_level;
- irq->delivery_mode = msg.arch_data.delivery_mode << 8;
- irq->msi_redir_hint = msg.arch_addr_lo.redirect_hint;
- irq->level = 1;
- irq->shorthand = APIC_DEST_NOSHORT;
-}
-EXPORT_SYMBOL_GPL(kvm_set_msi_irq);
-
-static inline bool kvm_msi_route_invalid(struct kvm *kvm,
- struct kvm_kernel_irq_routing_entry *e)
-{
- return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff);
-}
-
-int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm, int irq_source_id, int level, bool line_status)
-{
- struct kvm_lapic_irq irq;
-
- if (kvm_msi_route_invalid(kvm, e))
- return -EINVAL;
-
- if (!level)
- return -1;
-
- kvm_set_msi_irq(kvm, e, &irq);
-
- return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);
-}
-
-#ifdef CONFIG_KVM_HYPERV
-static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm, int irq_source_id, int level,
- bool line_status)
-{
- if (!level)
- return -1;
-
- return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint);
-}
-#endif
-
-int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm, int irq_source_id, int level,
- bool line_status)
-{
- struct kvm_lapic_irq irq;
- int r;
-
- switch (e->type) {
-#ifdef CONFIG_KVM_HYPERV
- case KVM_IRQ_ROUTING_HV_SINT:
- return kvm_hv_set_sint(e, kvm, irq_source_id, level,
- line_status);
-#endif
-
- case KVM_IRQ_ROUTING_MSI:
- if (kvm_msi_route_invalid(kvm, e))
- return -EINVAL;
-
- kvm_set_msi_irq(kvm, e, &irq);
-
- if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
- return r;
- break;
-
-#ifdef CONFIG_KVM_XEN
- case KVM_IRQ_ROUTING_XEN_EVTCHN:
- if (!level)
- return -1;
-
- return kvm_xen_set_evtchn_fast(&e->xen_evtchn, kvm);
-#endif
- default:
- break;
- }
-
- return -EWOULDBLOCK;
-}
-
-int kvm_request_irq_source_id(struct kvm *kvm)
-{
- unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
- int irq_source_id;
-
- mutex_lock(&kvm->irq_lock);
- irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG);
-
- if (irq_source_id >= BITS_PER_LONG) {
- pr_warn("exhausted allocatable IRQ sources!\n");
- irq_source_id = -EFAULT;
- goto unlock;
- }
-
- ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
- ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
- set_bit(irq_source_id, bitmap);
-unlock:
- mutex_unlock(&kvm->irq_lock);
-
- return irq_source_id;
-}
-
-void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
-{
- ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
- ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
-
- mutex_lock(&kvm->irq_lock);
- if (irq_source_id < 0 ||
- irq_source_id >= BITS_PER_LONG) {
- pr_err("IRQ source ID out of range!\n");
- goto unlock;
- }
- clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
- if (!irqchip_kernel(kvm))
- goto unlock;
-
- kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id);
- kvm_pic_clear_all(kvm->arch.vpic, irq_source_id);
-unlock:
- mutex_unlock(&kvm->irq_lock);
-}
-
-void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
- struct kvm_irq_mask_notifier *kimn)
-{
- mutex_lock(&kvm->irq_lock);
- kimn->irq = irq;
- hlist_add_head_rcu(&kimn->link, &kvm->arch.mask_notifier_list);
- mutex_unlock(&kvm->irq_lock);
-}
-
-void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
- struct kvm_irq_mask_notifier *kimn)
-{
- mutex_lock(&kvm->irq_lock);
- hlist_del_rcu(&kimn->link);
- mutex_unlock(&kvm->irq_lock);
- synchronize_srcu(&kvm->irq_srcu);
-}
-
-void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
- bool mask)
-{
- struct kvm_irq_mask_notifier *kimn;
- int idx, gsi;
-
- idx = srcu_read_lock(&kvm->irq_srcu);
- gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
- if (gsi != -1)
- hlist_for_each_entry_rcu(kimn, &kvm->arch.mask_notifier_list, link)
- if (kimn->irq == gsi)
- kimn->func(kimn, mask);
- srcu_read_unlock(&kvm->irq_srcu, idx);
-}
-
-bool kvm_arch_can_set_irq_routing(struct kvm *kvm)
-{
- return irqchip_in_kernel(kvm);
-}
-
-int kvm_set_routing_entry(struct kvm *kvm,
- struct kvm_kernel_irq_routing_entry *e,
- const struct kvm_irq_routing_entry *ue)
-{
- /* We can't check irqchip_in_kernel() here as some callers are
- * currently initializing the irqchip. Other callers should therefore
- * check kvm_arch_can_set_irq_routing() before calling this function.
- */
- switch (ue->type) {
- case KVM_IRQ_ROUTING_IRQCHIP:
- if (irqchip_split(kvm))
- return -EINVAL;
- e->irqchip.pin = ue->u.irqchip.pin;
- switch (ue->u.irqchip.irqchip) {
- case KVM_IRQCHIP_PIC_SLAVE:
- e->irqchip.pin += PIC_NUM_PINS / 2;
- fallthrough;
- case KVM_IRQCHIP_PIC_MASTER:
- if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2)
- return -EINVAL;
- e->set = kvm_set_pic_irq;
- break;
- case KVM_IRQCHIP_IOAPIC:
- if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS)
- return -EINVAL;
- e->set = kvm_set_ioapic_irq;
- break;
- default:
- return -EINVAL;
- }
- e->irqchip.irqchip = ue->u.irqchip.irqchip;
- break;
- case KVM_IRQ_ROUTING_MSI:
- e->set = kvm_set_msi;
- e->msi.address_lo = ue->u.msi.address_lo;
- e->msi.address_hi = ue->u.msi.address_hi;
- e->msi.data = ue->u.msi.data;
-
- if (kvm_msi_route_invalid(kvm, e))
- return -EINVAL;
- break;
-#ifdef CONFIG_KVM_HYPERV
- case KVM_IRQ_ROUTING_HV_SINT:
- e->set = kvm_hv_set_sint;
- e->hv_sint.vcpu = ue->u.hv_sint.vcpu;
- e->hv_sint.sint = ue->u.hv_sint.sint;
- break;
-#endif
-#ifdef CONFIG_KVM_XEN
- case KVM_IRQ_ROUTING_XEN_EVTCHN:
- return kvm_xen_setup_evtchn(kvm, e, ue);
-#endif
- default:
- return -EINVAL;
- }
-
- return 0;
-}
-
-bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
- struct kvm_vcpu **dest_vcpu)
-{
- int r = 0;
- unsigned long i;
- struct kvm_vcpu *vcpu;
-
- if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu))
- return true;
-
- kvm_for_each_vcpu(i, vcpu, kvm) {
- if (!kvm_apic_present(vcpu))
- continue;
-
- if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand,
- irq->dest_id, irq->dest_mode))
- continue;
-
- if (++r == 2)
- return false;
-
- *dest_vcpu = vcpu;
- }
-
- return r == 1;
-}
-EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu);
-
-#define IOAPIC_ROUTING_ENTRY(irq) \
- { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
- .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
-#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq)
-
-#define PIC_ROUTING_ENTRY(irq) \
- { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
- .u.irqchip = { .irqchip = SELECT_PIC(irq), .pin = (irq) % 8 } }
-#define ROUTING_ENTRY2(irq) \
- IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq)
-
-static const struct kvm_irq_routing_entry default_routing[] = {
- ROUTING_ENTRY2(0), ROUTING_ENTRY2(1),
- ROUTING_ENTRY2(2), ROUTING_ENTRY2(3),
- ROUTING_ENTRY2(4), ROUTING_ENTRY2(5),
- ROUTING_ENTRY2(6), ROUTING_ENTRY2(7),
- ROUTING_ENTRY2(8), ROUTING_ENTRY2(9),
- ROUTING_ENTRY2(10), ROUTING_ENTRY2(11),
- ROUTING_ENTRY2(12), ROUTING_ENTRY2(13),
- ROUTING_ENTRY2(14), ROUTING_ENTRY2(15),
- ROUTING_ENTRY1(16), ROUTING_ENTRY1(17),
- ROUTING_ENTRY1(18), ROUTING_ENTRY1(19),
- ROUTING_ENTRY1(20), ROUTING_ENTRY1(21),
- ROUTING_ENTRY1(22), ROUTING_ENTRY1(23),
-};
-
-int kvm_setup_default_irq_routing(struct kvm *kvm)
-{
- return kvm_set_irq_routing(kvm, default_routing,
- ARRAY_SIZE(default_routing), 0);
-}
-
-void kvm_arch_post_irq_routing_update(struct kvm *kvm)
-{
- if (!irqchip_split(kvm))
- return;
- kvm_make_scan_ioapic_request(kvm);
-}
-
-void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode,
- u8 vector, unsigned long *ioapic_handled_vectors)
-{
- /*
- * Intercept EOI if the vCPU is the target of the new IRQ routing, or
- * the vCPU has a pending IRQ from the old routing, i.e. if the vCPU
- * may receive a level-triggered IRQ in the future, or already received
- * level-triggered IRQ. The EOI needs to be intercepted and forwarded
- * to I/O APIC emulation so that the IRQ can be de-asserted.
- */
- if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, dest_id, dest_mode)) {
- __set_bit(vector, ioapic_handled_vectors);
- } else if (kvm_apic_pending_eoi(vcpu, vector)) {
- __set_bit(vector, ioapic_handled_vectors);
-
- /*
- * Track the highest pending EOI for which the vCPU is NOT the
- * target in the new routing. Only the EOI for the IRQ that is
- * in-flight (for the old routing) needs to be intercepted, any
- * future IRQs that arrive on this vCPU will be coincidental to
- * the level-triggered routing and don't need to be intercepted.
- */
- if ((int)vector > vcpu->arch.highest_stale_pending_ioapic_eoi)
- vcpu->arch.highest_stale_pending_ioapic_eoi = vector;
- }
-}
-
-void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
- ulong *ioapic_handled_vectors)
-{
- struct kvm *kvm = vcpu->kvm;
- struct kvm_kernel_irq_routing_entry *entry;
- struct kvm_irq_routing_table *table;
- u32 i, nr_ioapic_pins;
- int idx;
-
- idx = srcu_read_lock(&kvm->irq_srcu);
- table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
- nr_ioapic_pins = min_t(u32, table->nr_rt_entries,
- kvm->arch.nr_reserved_ioapic_pins);
- for (i = 0; i < nr_ioapic_pins; ++i) {
- hlist_for_each_entry(entry, &table->map[i], link) {
- struct kvm_lapic_irq irq;
-
- if (entry->type != KVM_IRQ_ROUTING_MSI)
- continue;
-
- kvm_set_msi_irq(vcpu->kvm, entry, &irq);
-
- if (!irq.trig_mode)
- continue;
-
- kvm_scan_ioapic_irq(vcpu, irq.dest_id, irq.dest_mode,
- irq.vector, ioapic_handled_vectors);
- }
- }
- srcu_read_unlock(&kvm->irq_srcu, idx);
-}
-
-void kvm_arch_irq_routing_update(struct kvm *kvm)
-{
-#ifdef CONFIG_KVM_HYPERV
- kvm_hv_irq_routing_update(kvm);
-#endif
-}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 73418dc0ebb2..8172c2042dd6 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -27,6 +27,7 @@
#include <linux/export.h>
#include <linux/math64.h>
#include <linux/slab.h>
+#include <asm/apic.h>
#include <asm/processor.h>
#include <asm/mce.h>
#include <asm/msr.h>
@@ -55,9 +56,6 @@
/* 14 is the version for Xeon and Pentium 8.4.8*/
#define APIC_VERSION 0x14UL
#define LAPIC_MMIO_LENGTH (1 << 12)
-/* followed define is not in apicdef.h */
-#define MAX_APIC_VECTOR 256
-#define APIC_VECTORS_PER_REG 32
/*
* Enable local APIC timer advancement (tscdeadline mode only) with adaptive
@@ -79,42 +77,20 @@ module_param(lapic_timer_advance, bool, 0444);
static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data);
static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data);
-static inline void __kvm_lapic_set_reg(char *regs, int reg_off, u32 val)
-{
- *((u32 *) (regs + reg_off)) = val;
-}
-
static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
{
- __kvm_lapic_set_reg(apic->regs, reg_off, val);
-}
-
-static __always_inline u64 __kvm_lapic_get_reg64(char *regs, int reg)
-{
- BUILD_BUG_ON(reg != APIC_ICR);
- return *((u64 *) (regs + reg));
+ apic_set_reg(apic->regs, reg_off, val);
}
static __always_inline u64 kvm_lapic_get_reg64(struct kvm_lapic *apic, int reg)
{
- return __kvm_lapic_get_reg64(apic->regs, reg);
-}
-
-static __always_inline void __kvm_lapic_set_reg64(char *regs, int reg, u64 val)
-{
- BUILD_BUG_ON(reg != APIC_ICR);
- *((u64 *) (regs + reg)) = val;
+ return apic_get_reg64(apic->regs, reg);
}
static __always_inline void kvm_lapic_set_reg64(struct kvm_lapic *apic,
int reg, u64 val)
{
- __kvm_lapic_set_reg64(apic->regs, reg, val);
-}
-
-static inline int apic_test_vector(int vec, void *bitmap)
-{
- return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+ apic_set_reg64(apic->regs, reg, val);
}
bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector)
@@ -125,16 +101,6 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector)
apic_test_vector(vector, apic->regs + APIC_IRR);
}
-static inline int __apic_test_and_set_vector(int vec, void *bitmap)
-{
- return __test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
-static inline int __apic_test_and_clear_vector(int vec, void *bitmap)
-{
- return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu);
@@ -626,21 +592,6 @@ static const unsigned int apic_lvt_mask[KVM_APIC_MAX_NR_LVT_ENTRIES] = {
[LVT_CMCI] = LVT_MASK | APIC_MODE_MASK
};
-static int find_highest_vector(void *bitmap)
-{
- int vec;
- u32 *reg;
-
- for (vec = MAX_APIC_VECTOR - APIC_VECTORS_PER_REG;
- vec >= 0; vec -= APIC_VECTORS_PER_REG) {
- reg = bitmap + REG_POS(vec);
- if (*reg)
- return __fls(*reg) + vec;
- }
-
- return -1;
-}
-
static u8 count_vectors(void *bitmap)
{
int vec;
@@ -648,7 +599,7 @@ static u8 count_vectors(void *bitmap)
u8 count = 0;
for (vec = 0; vec < MAX_APIC_VECTOR; vec += APIC_VECTORS_PER_REG) {
- reg = bitmap + REG_POS(vec);
+ reg = bitmap + APIC_VECTOR_TO_REG_OFFSET(vec);
count += hweight32(*reg);
}
@@ -706,7 +657,7 @@ EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
static inline int apic_search_irr(struct kvm_lapic *apic)
{
- return find_highest_vector(apic->regs + APIC_IRR);
+ return apic_find_highest_vector(apic->regs + APIC_IRR);
}
static inline int apic_find_highest_irr(struct kvm_lapic *apic)
@@ -729,10 +680,10 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
{
if (unlikely(apic->apicv_active)) {
- kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
+ apic_clear_vector(vec, apic->regs + APIC_IRR);
} else {
apic->irr_pending = false;
- kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
+ apic_clear_vector(vec, apic->regs + APIC_IRR);
if (apic_search_irr(apic) != -1)
apic->irr_pending = true;
}
@@ -744,9 +695,15 @@ void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec)
}
EXPORT_SYMBOL_GPL(kvm_apic_clear_irr);
+static void *apic_vector_to_isr(int vec, struct kvm_lapic *apic)
+{
+ return apic->regs + APIC_ISR + APIC_VECTOR_TO_REG_OFFSET(vec);
+}
+
static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
{
- if (__apic_test_and_set_vector(vec, apic->regs + APIC_ISR))
+ if (__test_and_set_bit(APIC_VECTOR_TO_BIT_NUMBER(vec),
+ apic_vector_to_isr(vec, apic)))
return;
/*
@@ -781,7 +738,7 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic)
if (likely(apic->highest_isr_cache != -1))
return apic->highest_isr_cache;
- result = find_highest_vector(apic->regs + APIC_ISR);
+ result = apic_find_highest_vector(apic->regs + APIC_ISR);
ASSERT(result == -1 || result >= 16);
return result;
@@ -789,7 +746,8 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic)
static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
{
- if (!__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR))
+ if (!__test_and_clear_bit(APIC_VECTOR_TO_BIT_NUMBER(vec),
+ apic_vector_to_isr(vec, apic)))
return;
/*
@@ -1332,11 +1290,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) {
if (trig_mode)
- kvm_lapic_set_vector(vector,
- apic->regs + APIC_TMR);
+ apic_set_vector(vector, apic->regs + APIC_TMR);
else
- kvm_lapic_clear_vector(vector,
- apic->regs + APIC_TMR);
+ apic_clear_vector(vector, apic->regs + APIC_TMR);
}
kvm_x86_call(deliver_interrupt)(apic, delivery_mode,
@@ -1455,7 +1411,7 @@ static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector)
static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
{
- int trigger_mode;
+ int __maybe_unused trigger_mode;
/* Eoi the ioapic only if the ioapic doesn't own the vector. */
if (!kvm_ioapic_handles_vector(apic, vector))
@@ -1476,12 +1432,14 @@ static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
return;
}
+#ifdef CONFIG_KVM_IOAPIC
if (apic_test_vector(vector, apic->regs + APIC_TMR))
trigger_mode = IOAPIC_LEVEL_TRIG;
else
trigger_mode = IOAPIC_EDGE_TRIG;
kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode);
+#endif
}
static int apic_set_eoi(struct kvm_lapic *apic)
@@ -3084,12 +3042,12 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
if (!kvm_x86_ops.x2apic_icr_is_split) {
if (set) {
- icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) |
- (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32;
- __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr);
+ icr = apic_get_reg(s->regs, APIC_ICR) |
+ (u64)apic_get_reg(s->regs, APIC_ICR2) << 32;
+ apic_set_reg64(s->regs, APIC_ICR, icr);
} else {
- icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
- __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
+ icr = apic_get_reg64(s->regs, APIC_ICR);
+ apic_set_reg(s->regs, APIC_ICR2, icr >> 32);
}
}
}
@@ -3105,8 +3063,7 @@ int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
* Get calculated timer current count for remaining timer period (if
* any) and store it in the returned register set.
*/
- __kvm_lapic_set_reg(s->regs, APIC_TMCCT,
- __apic_read(vcpu->arch.apic, APIC_TMCCT));
+ apic_set_reg(s->regs, APIC_TMCCT, __apic_read(vcpu->arch.apic, APIC_TMCCT));
return kvm_apic_state_fixup(vcpu, s, false);
}
@@ -3146,8 +3103,11 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
}
kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+#ifdef CONFIG_KVM_IOAPIC
if (ioapic_in_kernel(vcpu->kvm))
kvm_rtc_eoi_tracking_restore_one(vcpu);
+#endif
vcpu->arch.apic_arb_prio = 0;
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 4ce30db65828..72de14527698 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -4,6 +4,8 @@
#include <kvm/iodev.h>
+#include <asm/apic.h>
+
#include <linux/kvm_host.h>
#include "hyperv.h"
@@ -21,6 +23,8 @@
#define APIC_BROADCAST 0xFF
#define X2APIC_BROADCAST 0xFFFFFFFFul
+#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
+
enum lapic_mode {
LAPIC_MODE_DISABLED = 0,
LAPIC_MODE_INVALID = X2APIC_ENABLE,
@@ -145,22 +149,9 @@ void kvm_lapic_exit(void);
u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic);
-#define VEC_POS(v) ((v) & (32 - 1))
-#define REG_POS(v) (((v) >> 5) << 4)
-
-static inline void kvm_lapic_clear_vector(int vec, void *bitmap)
-{
- clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
-static inline void kvm_lapic_set_vector(int vec, void *bitmap)
-{
- set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
static inline void kvm_lapic_set_irr(int vec, struct kvm_lapic *apic)
{
- kvm_lapic_set_vector(vec, apic->regs + APIC_IRR);
+ apic_set_vector(vec, apic->regs + APIC_IRR);
/*
* irr_pending must be true if any interrupt is pending; set it after
* APIC_IRR to avoid race with apic_clear_irr
@@ -168,14 +159,9 @@ static inline void kvm_lapic_set_irr(int vec, struct kvm_lapic *apic)
apic->irr_pending = true;
}
-static inline u32 __kvm_lapic_get_reg(char *regs, int reg_off)
-{
- return *((u32 *) (regs + reg_off));
-}
-
static inline u32 kvm_lapic_get_reg(struct kvm_lapic *apic, int reg_off)
{
- return __kvm_lapic_get_reg(apic->regs, reg_off);
+ return apic_get_reg(apic->regs, reg_off);
}
DECLARE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 4e06e2e89a8f..6e838cb6c9e1 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1983,14 +1983,35 @@ static bool sp_has_gptes(struct kvm_mmu_page *sp)
return true;
}
+static __ro_after_init HLIST_HEAD(empty_page_hash);
+
+static struct hlist_head *kvm_get_mmu_page_hash(struct kvm *kvm, gfn_t gfn)
+{
+ /*
+ * Ensure the load of the hash table pointer itself is ordered before
+ * loads to walk the table. The pointer is set at runtime outside of
+ * mmu_lock when the TDP MMU is enabled, i.e. when the hash table of
+ * shadow pages becomes necessary only when KVM needs to shadow L1's
+ * TDP for an L2 guest. Pairs with the smp_store_release() in
+ * kvm_mmu_alloc_page_hash().
+ */
+ struct hlist_head *page_hash = smp_load_acquire(&kvm->arch.mmu_page_hash);
+
+ lockdep_assert_held(&kvm->mmu_lock);
+
+ if (!page_hash)
+ return &empty_page_hash;
+
+ return &page_hash[kvm_page_table_hashfn(gfn)];
+}
+
#define for_each_valid_sp(_kvm, _sp, _list) \
hlist_for_each_entry(_sp, _list, hash_link) \
if (is_obsolete_sp((_kvm), (_sp))) { \
} else
#define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn) \
- for_each_valid_sp(_kvm, _sp, \
- &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \
+ for_each_valid_sp(_kvm, _sp, kvm_get_mmu_page_hash(_kvm, _gfn)) \
if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else
static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
@@ -2358,6 +2379,12 @@ static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm,
struct kvm_mmu_page *sp;
bool created = false;
+ /*
+ * No need for memory barriers, unlike in kvm_get_mmu_page_hash(), as
+ * mmu_page_hash must be set prior to creating the first shadow root,
+ * i.e. reaching this point is fully serialized by slots_arch_lock.
+ */
+ BUG_ON(!kvm->arch.mmu_page_hash);
sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
sp = kvm_mmu_find_shadow_page(kvm, vcpu, gfn, sp_list, role);
@@ -3882,6 +3909,28 @@ out_unlock:
return r;
}
+static int kvm_mmu_alloc_page_hash(struct kvm *kvm)
+{
+ struct hlist_head *h;
+
+ if (kvm->arch.mmu_page_hash)
+ return 0;
+
+ h = kvcalloc(KVM_NUM_MMU_PAGES, sizeof(*h), GFP_KERNEL_ACCOUNT);
+ if (!h)
+ return -ENOMEM;
+
+ /*
+ * Ensure the hash table pointer is set only after all stores to zero
+ * the memory are retired. Pairs with the smp_load_acquire() in
+ * kvm_get_mmu_page_hash(). Note, mmu_lock must be held for write to
+ * add (or remove) shadow pages, and so readers are guaranteed to see
+ * an empty list for their current mmu_lock critical section.
+ */
+ smp_store_release(&kvm->arch.mmu_page_hash, h);
+ return 0;
+}
+
static int mmu_first_shadow_root_alloc(struct kvm *kvm)
{
struct kvm_memslots *slots;
@@ -3901,9 +3950,13 @@ static int mmu_first_shadow_root_alloc(struct kvm *kvm)
if (kvm_shadow_root_allocated(kvm))
goto out_unlock;
+ r = kvm_mmu_alloc_page_hash(kvm);
+ if (r)
+ goto out_unlock;
+
/*
- * Check if anything actually needs to be allocated, e.g. all metadata
- * will be allocated upfront if TDP is disabled.
+ * Check if memslot metadata actually needs to be allocated, e.g. all
+ * metadata will be allocated upfront if TDP is disabled.
*/
if (kvm_memslots_have_rmaps(kvm) &&
kvm_page_track_write_tracking_enabled(kvm))
@@ -6682,15 +6735,22 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
kvm_tdp_mmu_zap_invalidated_roots(kvm, true);
}
-void kvm_mmu_init_vm(struct kvm *kvm)
+int kvm_mmu_init_vm(struct kvm *kvm)
{
+ int r;
+
kvm->arch.shadow_mmio_value = shadow_mmio_value;
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
- if (tdp_mmu_enabled)
+ if (tdp_mmu_enabled) {
kvm_mmu_init_tdp_mmu(kvm);
+ } else {
+ r = kvm_mmu_alloc_page_hash(kvm);
+ if (r)
+ return r;
+ }
kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache;
kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO;
@@ -6699,6 +6759,7 @@ void kvm_mmu_init_vm(struct kvm *kvm)
kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache;
kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO;
+ return 0;
}
static void mmu_free_vm_memory_caches(struct kvm *kvm)
@@ -6710,6 +6771,8 @@ static void mmu_free_vm_memory_caches(struct kvm *kvm)
void kvm_mmu_uninit_vm(struct kvm *kvm)
{
+ kvfree(kvm->arch.mmu_page_hash);
+
if (tdp_mmu_enabled)
kvm_mmu_uninit_tdp_mmu(kvm);
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index db8f33e4de62..65f3c89d7c5d 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -103,6 +103,9 @@ struct kvm_mmu_page {
int root_count;
refcount_t tdp_mmu_root_count;
};
+
+ bool has_mapped_host_mmio;
+
union {
/* These two members aren't used for TDP MMU */
struct {
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 68e323568e95..ed762bb4b007 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -804,9 +804,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
if (r != RET_PF_CONTINUE)
return r;
+#if PTTYPE != PTTYPE_EPT
/*
- * Do not change pte_access if the pfn is a mmio page, otherwise
- * we will cache the incorrect access into mmio spte.
+ * Treat the guest PTE protections as writable, supervisor-only if this
+ * is a supervisor write fault and CR0.WP=0 (supervisor accesses ignore
+ * PTE.W if CR0.WP=0). Don't change the access type for emulated MMIO,
+ * otherwise KVM will cache incorrect access information in the SPTE.
*/
if (fault->write && !(walker.pte_access & ACC_WRITE_MASK) &&
!is_cr0_wp(vcpu->arch.mmu) && !fault->user && fault->slot) {
@@ -822,6 +825,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
if (is_cr4_smep(vcpu->arch.mmu))
walker.pte_access &= ~ACC_EXEC_MASK;
}
+#endif
r = RET_PF_RETRY;
write_lock(&vcpu->kvm->mmu_lock);
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index cfce03d8f123..df31039b5d63 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -104,7 +104,7 @@ u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
return spte;
}
-static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
+static bool __kvm_is_mmio_pfn(kvm_pfn_t pfn)
{
if (pfn_valid(pfn))
return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
@@ -125,6 +125,35 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
E820_TYPE_RAM);
}
+static bool kvm_is_mmio_pfn(kvm_pfn_t pfn, int *is_host_mmio)
+{
+ /*
+ * Determining if a PFN is host MMIO is relative expensive. Cache the
+ * result locally (in the sole caller) to avoid doing the full query
+ * multiple times when creating a single SPTE.
+ */
+ if (*is_host_mmio < 0)
+ *is_host_mmio = __kvm_is_mmio_pfn(pfn);
+
+ return *is_host_mmio;
+}
+
+static void kvm_track_host_mmio_mapping(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
+
+ if (root)
+ WRITE_ONCE(root->has_mapped_host_mmio, true);
+ else
+ WRITE_ONCE(vcpu->kvm->arch.has_mapped_host_mmio, true);
+
+ /*
+ * Force vCPUs to exit and flush CPU buffers if the vCPU is using the
+ * affected root(s).
+ */
+ kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
+}
+
/*
* Returns true if the SPTE needs to be updated atomically due to having bits
* that may be changed without holding mmu_lock, and for which KVM must not
@@ -162,6 +191,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
{
int level = sp->role.level;
u64 spte = SPTE_MMU_PRESENT_MASK;
+ int is_host_mmio = -1;
bool wrprot = false;
/*
@@ -209,13 +239,15 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
if (level > PG_LEVEL_4K)
spte |= PT_PAGE_SIZE_MASK;
- spte |= kvm_x86_call(get_mt_mask)(vcpu, gfn, kvm_is_mmio_pfn(pfn));
+ if (kvm_x86_ops.get_mt_mask)
+ spte |= kvm_x86_call(get_mt_mask)(vcpu, gfn,
+ kvm_is_mmio_pfn(pfn, &is_host_mmio));
if (host_writable)
spte |= shadow_host_writable_mask;
else
pte_access &= ~ACC_WRITE_MASK;
- if (shadow_me_value && !kvm_is_mmio_pfn(pfn))
+ if (shadow_me_value && !kvm_is_mmio_pfn(pfn, &is_host_mmio))
spte |= shadow_me_value;
spte |= (u64)pfn << PAGE_SHIFT;
@@ -260,6 +292,11 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
mark_page_dirty_in_slot(vcpu->kvm, slot, gfn);
}
+ if (static_branch_unlikely(&cpu_buf_vm_clear) &&
+ !kvm_vcpu_can_access_host_mmio(vcpu) &&
+ kvm_is_mmio_pfn(pfn, &is_host_mmio))
+ kvm_track_host_mmio_mapping(vcpu);
+
*new_spte = spte;
return wrprot;
}
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 1e94f081bdaf..3133f066927e 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -280,6 +280,16 @@ static inline bool is_mirror_sptep(tdp_ptep_t sptep)
return is_mirror_sp(sptep_to_sp(rcu_dereference(sptep)));
}
+static inline bool kvm_vcpu_can_access_host_mmio(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
+
+ if (root)
+ return READ_ONCE(root->has_mapped_host_mmio);
+
+ return READ_ONCE(vcpu->kvm->arch.has_mapped_host_mmio);
+}
+
static inline bool is_mmio_spte(struct kvm *kvm, u64 spte)
{
return (spte & shadow_mmio_mask) == kvm->arch.shadow_mmio_value &&
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 067f8e3f5a0d..a34c5c3b164e 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -18,6 +18,7 @@
#include <linux/hashtable.h>
#include <linux/amd-iommu.h>
#include <linux/kvm_host.h>
+#include <linux/kvm_irqfd.h>
#include <asm/irq_remapping.h>
#include <asm/msr.h>
@@ -29,36 +30,39 @@
#include "svm.h"
/*
- * Encode the arbitrary VM ID and the vCPU's default APIC ID, i.e the vCPU ID,
- * into the GATag so that KVM can retrieve the correct vCPU from a GALog entry
- * if an interrupt can't be delivered, e.g. because the vCPU isn't running.
+ * Encode the arbitrary VM ID and the vCPU's _index_ into the GATag so that
+ * KVM can retrieve the correct vCPU from a GALog entry if an interrupt can't
+ * be delivered, e.g. because the vCPU isn't running. Use the vCPU's index
+ * instead of its ID (a.k.a. its default APIC ID), as KVM is guaranteed a fast
+ * lookup on the index, where as vCPUs whose index doesn't match their ID need
+ * to walk the entire xarray of vCPUs in the worst case scenario.
*
- * For the vCPU ID, use however many bits are currently allowed for the max
+ * For the vCPU index, use however many bits are currently allowed for the max
* guest physical APIC ID (limited by the size of the physical ID table), and
* use whatever bits remain to assign arbitrary AVIC IDs to VMs. Note, the
* size of the GATag is defined by hardware (32 bits), but is an opaque value
* as far as hardware is concerned.
*/
-#define AVIC_VCPU_ID_MASK AVIC_PHYSICAL_MAX_INDEX_MASK
+#define AVIC_VCPU_IDX_MASK AVIC_PHYSICAL_MAX_INDEX_MASK
#define AVIC_VM_ID_SHIFT HWEIGHT32(AVIC_PHYSICAL_MAX_INDEX_MASK)
#define AVIC_VM_ID_MASK (GENMASK(31, AVIC_VM_ID_SHIFT) >> AVIC_VM_ID_SHIFT)
#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VM_ID_SHIFT) & AVIC_VM_ID_MASK)
-#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK)
+#define AVIC_GATAG_TO_VCPUIDX(x) (x & AVIC_VCPU_IDX_MASK)
-#define __AVIC_GATAG(vm_id, vcpu_id) ((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \
- ((vcpu_id) & AVIC_VCPU_ID_MASK))
-#define AVIC_GATAG(vm_id, vcpu_id) \
+#define __AVIC_GATAG(vm_id, vcpu_idx) ((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \
+ ((vcpu_idx) & AVIC_VCPU_IDX_MASK))
+#define AVIC_GATAG(vm_id, vcpu_idx) \
({ \
- u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_id); \
+ u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_idx); \
\
- WARN_ON_ONCE(AVIC_GATAG_TO_VCPUID(ga_tag) != (vcpu_id)); \
+ WARN_ON_ONCE(AVIC_GATAG_TO_VCPUIDX(ga_tag) != (vcpu_idx)); \
WARN_ON_ONCE(AVIC_GATAG_TO_VMID(ga_tag) != (vm_id)); \
ga_tag; \
})
-static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_ID_MASK) == -1u);
+static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_IDX_MASK) == -1u);
static bool force_avic;
module_param_unsafe(force_avic, bool, 0444);
@@ -75,14 +79,6 @@ static bool next_vm_id_wrapped = 0;
static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
bool x2avic_enabled;
-/*
- * This is a wrapper of struct amd_iommu_ir_data.
- */
-struct amd_svm_iommu_ir {
- struct list_head node; /* Used by SVM for per-vcpu ir_list */
- void *data; /* Storing pointer to struct amd_ir_data */
-};
-
static void avic_activate_vmcb(struct vcpu_svm *svm)
{
struct vmcb *vmcb = svm->vmcb01.ptr;
@@ -147,16 +143,16 @@ int avic_ga_log_notifier(u32 ga_tag)
struct kvm_svm *kvm_svm;
struct kvm_vcpu *vcpu = NULL;
u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
- u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
+ u32 vcpu_idx = AVIC_GATAG_TO_VCPUIDX(ga_tag);
- pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
- trace_kvm_avic_ga_log(vm_id, vcpu_id);
+ pr_debug("SVM: %s: vm_id=%#x, vcpu_idx=%#x\n", __func__, vm_id, vcpu_idx);
+ trace_kvm_avic_ga_log(vm_id, vcpu_idx);
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
if (kvm_svm->avic_vm_id != vm_id)
continue;
- vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id);
+ vcpu = kvm_get_vcpu(&kvm_svm->kvm, vcpu_idx);
break;
}
spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
@@ -180,10 +176,8 @@ void avic_vm_destroy(struct kvm *kvm)
if (!enable_apicv)
return;
- if (kvm_svm->avic_logical_id_table_page)
- __free_page(kvm_svm->avic_logical_id_table_page);
- if (kvm_svm->avic_physical_id_table_page)
- __free_page(kvm_svm->avic_physical_id_table_page);
+ free_page((unsigned long)kvm_svm->avic_logical_id_table);
+ free_page((unsigned long)kvm_svm->avic_physical_id_table);
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
hash_del(&kvm_svm->hnode);
@@ -196,27 +190,19 @@ int avic_vm_init(struct kvm *kvm)
int err = -ENOMEM;
struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
struct kvm_svm *k2;
- struct page *p_page;
- struct page *l_page;
u32 vm_id;
if (!enable_apicv)
return 0;
- /* Allocating physical APIC ID table (4KB) */
- p_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
- if (!p_page)
+ kvm_svm->avic_physical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!kvm_svm->avic_physical_id_table)
goto free_avic;
- kvm_svm->avic_physical_id_table_page = p_page;
-
- /* Allocating logical APIC ID table (4KB) */
- l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
- if (!l_page)
+ kvm_svm->avic_logical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!kvm_svm->avic_logical_id_table)
goto free_avic;
- kvm_svm->avic_logical_id_table_page = l_page;
-
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
again:
vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
@@ -242,17 +228,19 @@ free_avic:
return err;
}
+static phys_addr_t avic_get_backing_page_address(struct vcpu_svm *svm)
+{
+ return __sme_set(__pa(svm->vcpu.arch.apic->regs));
+}
+
void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
{
struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
- phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
- phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
- phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page));
- vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
- vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
- vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
- vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK;
+ vmcb->control.avic_backing_page = avic_get_backing_page_address(svm);
+ vmcb->control.avic_logical_id = __sme_set(__pa(kvm_svm->avic_logical_id_table));
+ vmcb->control.avic_physical_id = __sme_set(__pa(kvm_svm->avic_physical_id_table));
+ vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE;
if (kvm_apicv_activated(svm->vcpu.kvm))
avic_activate_vmcb(svm);
@@ -260,32 +248,31 @@ void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
avic_deactivate_vmcb(svm);
}
-static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
- unsigned int index)
-{
- u64 *avic_physical_id_table;
- struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
-
- if ((!x2avic_enabled && index > AVIC_MAX_PHYSICAL_ID) ||
- (index > X2AVIC_MAX_PHYSICAL_ID))
- return NULL;
-
- avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);
-
- return &avic_physical_id_table[index];
-}
-
static int avic_init_backing_page(struct kvm_vcpu *vcpu)
{
- u64 *entry, new_entry;
- int id = vcpu->vcpu_id;
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
struct vcpu_svm *svm = to_svm(vcpu);
+ u32 id = vcpu->vcpu_id;
+ u64 new_entry;
+ /*
+ * Inhibit AVIC if the vCPU ID is bigger than what is supported by AVIC
+ * hardware. Immediately clear apicv_active, i.e. don't wait until the
+ * KVM_REQ_APICV_UPDATE request is processed on the first KVM_RUN, as
+ * avic_vcpu_load() expects to be called if and only if the vCPU has
+ * fully initialized AVIC.
+ */
if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) ||
- (id > X2AVIC_MAX_PHYSICAL_ID))
- return -EINVAL;
+ (id > X2AVIC_MAX_PHYSICAL_ID)) {
+ kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG);
+ vcpu->arch.apic->apicv_active = false;
+ return 0;
+ }
+
+ BUILD_BUG_ON((AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE ||
+ (X2AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE);
- if (!vcpu->arch.apic->regs)
+ if (WARN_ON_ONCE(!vcpu->arch.apic->regs))
return -EINVAL;
if (kvm_apicv_activated(vcpu->kvm)) {
@@ -302,19 +289,21 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
return ret;
}
- svm->avic_backing_page = virt_to_page(vcpu->arch.apic->regs);
+ /* Note, fls64() returns the bit position, +1. */
+ BUILD_BUG_ON(__PHYSICAL_MASK_SHIFT >
+ fls64(AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK));
/* Setting AVIC backing page address in the phy APIC ID table */
- entry = avic_get_physical_id_entry(vcpu, id);
- if (!entry)
- return -EINVAL;
+ new_entry = avic_get_backing_page_address(svm) |
+ AVIC_PHYSICAL_ID_ENTRY_VALID_MASK;
+ svm->avic_physical_id_entry = new_entry;
- new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
- AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
- AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
- WRITE_ONCE(*entry, new_entry);
-
- svm->avic_physical_id_cache = entry;
+ /*
+ * Initialize the real table, as vCPUs must have a valid entry in order
+ * for broadcast IPIs to function correctly (broadcast IPIs ignore
+ * invalid entries, i.e. aren't guaranteed to generate a VM-Exit).
+ */
+ WRITE_ONCE(kvm_svm->avic_physical_id_table[id], new_entry);
return 0;
}
@@ -448,7 +437,7 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source
if (apic_x2apic_mode(source))
avic_logical_id_table = NULL;
else
- avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page);
+ avic_logical_id_table = kvm_svm->avic_logical_id_table;
/*
* AVIC is inhibited if vCPUs aren't mapped 1:1 with logical
@@ -550,7 +539,6 @@ unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
{
struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
- u32 *logical_apic_id_table;
u32 cluster, index;
ldr = GET_APIC_LOGICAL_ID(ldr);
@@ -571,9 +559,7 @@ static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
return NULL;
index += (cluster << 2);
- logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page);
-
- return &logical_apic_id_table[index];
+ return &kvm_svm->avic_logical_id_table[index];
}
static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
@@ -722,6 +708,9 @@ int avic_init_vcpu(struct vcpu_svm *svm)
int ret;
struct kvm_vcpu *vcpu = &svm->vcpu;
+ INIT_LIST_HEAD(&svm->ir_list);
+ spin_lock_init(&svm->ir_list_lock);
+
if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm))
return 0;
@@ -729,8 +718,6 @@ int avic_init_vcpu(struct vcpu_svm *svm)
if (ret)
return ret;
- INIT_LIST_HEAD(&svm->ir_list);
- spin_lock_init(&svm->ir_list_lock);
svm->dfr_reg = APIC_DFR_FLAT;
return ret;
@@ -742,316 +729,161 @@ void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
avic_handle_ldr_update(vcpu);
}
-static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
+static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd)
{
- int ret = 0;
+ struct kvm_vcpu *vcpu = irqfd->irq_bypass_vcpu;
unsigned long flags;
- struct amd_svm_iommu_ir *ir;
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (!kvm_arch_has_assigned_device(vcpu->kvm))
- return 0;
- /*
- * Here, we go through the per-vcpu ir_list to update all existing
- * interrupt remapping table entry targeting this vcpu.
- */
- spin_lock_irqsave(&svm->ir_list_lock, flags);
-
- if (list_empty(&svm->ir_list))
- goto out;
+ if (!vcpu)
+ return;
- list_for_each_entry(ir, &svm->ir_list, node) {
- if (activate)
- ret = amd_iommu_activate_guest_mode(ir->data);
- else
- ret = amd_iommu_deactivate_guest_mode(ir->data);
- if (ret)
- break;
- }
-out:
- spin_unlock_irqrestore(&svm->ir_list_lock, flags);
- return ret;
+ spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags);
+ list_del(&irqfd->vcpu_list);
+ spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags);
}
-static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+ unsigned int host_irq, uint32_t guest_irq,
+ struct kvm_vcpu *vcpu, u32 vector)
{
- unsigned long flags;
- struct amd_svm_iommu_ir *cur;
-
- spin_lock_irqsave(&svm->ir_list_lock, flags);
- list_for_each_entry(cur, &svm->ir_list, node) {
- if (cur->data != pi->ir_data)
- continue;
- list_del(&cur->node);
- kfree(cur);
- break;
- }
- spin_unlock_irqrestore(&svm->ir_list_lock, flags);
-}
-
-static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
-{
- int ret = 0;
- unsigned long flags;
- struct amd_svm_iommu_ir *ir;
- u64 entry;
-
- if (WARN_ON_ONCE(!pi->ir_data))
- return -EINVAL;
-
- /**
- * In some cases, the existing irte is updated and re-set,
- * so we need to check here if it's already been * added
- * to the ir_list.
- */
- if (pi->prev_ga_tag) {
- struct kvm *kvm = svm->vcpu.kvm;
- u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
- struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
- struct vcpu_svm *prev_svm;
-
- if (!prev_vcpu) {
- ret = -EINVAL;
- goto out;
- }
-
- prev_svm = to_svm(prev_vcpu);
- svm_ir_list_del(prev_svm, pi);
- }
-
- /**
- * Allocating new amd_iommu_pi_data, which will get
- * add to the per-vcpu ir_list.
- */
- ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_ATOMIC | __GFP_ACCOUNT);
- if (!ir) {
- ret = -ENOMEM;
- goto out;
- }
- ir->data = pi->ir_data;
-
- spin_lock_irqsave(&svm->ir_list_lock, flags);
-
/*
- * Update the target pCPU for IOMMU doorbells if the vCPU is running.
- * If the vCPU is NOT running, i.e. is blocking or scheduled out, KVM
- * will update the pCPU info when the vCPU awkened and/or scheduled in.
- * See also avic_vcpu_load().
+ * If the IRQ was affined to a different vCPU, remove the IRTE metadata
+ * from the *previous* vCPU's list.
*/
- entry = READ_ONCE(*(svm->avic_physical_id_cache));
- if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
- amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK,
- true, pi->ir_data);
-
- list_add(&ir->node, &svm->ir_list);
- spin_unlock_irqrestore(&svm->ir_list_lock, flags);
-out:
- return ret;
-}
+ svm_ir_list_del(irqfd);
-/*
- * Note:
- * The HW cannot support posting multicast/broadcast
- * interrupts to a vCPU. So, we still use legacy interrupt
- * remapping for these kind of interrupts.
- *
- * For lowest-priority interrupts, we only support
- * those with single CPU as the destination, e.g. user
- * configures the interrupts via /proc/irq or uses
- * irqbalance to make the interrupts single-CPU.
- */
-static int
-get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
- struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
-{
- struct kvm_lapic_irq irq;
- struct kvm_vcpu *vcpu = NULL;
-
- kvm_set_msi_irq(kvm, e, &irq);
-
- if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
- !kvm_irq_is_postable(&irq)) {
- pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
- __func__, irq.vector);
- return -1;
- }
-
- pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
- irq.vector);
- *svm = to_svm(vcpu);
- vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page));
- vcpu_info->vector = irq.vector;
-
- return 0;
-}
-
-/*
- * avic_pi_update_irte - set IRTE for Posted-Interrupts
- *
- * @kvm: kvm
- * @host_irq: host irq of the interrupt
- * @guest_irq: gsi of the interrupt
- * @set: set or unset PI
- * returns 0 on success, < 0 on failure
- */
-int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set)
-{
- struct kvm_kernel_irq_routing_entry *e;
- struct kvm_irq_routing_table *irq_rt;
- bool enable_remapped_mode = true;
- int idx, ret = 0;
-
- if (!kvm_arch_has_assigned_device(kvm) || !kvm_arch_has_irq_bypass())
- return 0;
-
- pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
- __func__, host_irq, guest_irq, set);
-
- idx = srcu_read_lock(&kvm->irq_srcu);
- irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
-
- if (guest_irq >= irq_rt->nr_rt_entries ||
- hlist_empty(&irq_rt->map[guest_irq])) {
- pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
- guest_irq, irq_rt->nr_rt_entries);
- goto out;
- }
-
- hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
- struct vcpu_data vcpu_info;
- struct vcpu_svm *svm = NULL;
+ if (vcpu) {
+ /*
+ * Try to enable guest_mode in IRTE, unless AVIC is inhibited,
+ * in which case configure the IRTE for legacy mode, but track
+ * the IRTE metadata so that it can be converted to guest mode
+ * if AVIC is enabled/uninhibited in the future.
+ */
+ struct amd_iommu_pi_data pi_data = {
+ .ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
+ vcpu->vcpu_idx),
+ .is_guest_mode = kvm_vcpu_apicv_active(vcpu),
+ .vapic_addr = avic_get_backing_page_address(to_svm(vcpu)),
+ .vector = vector,
+ };
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 entry;
+ int ret;
- if (e->type != KVM_IRQ_ROUTING_MSI)
- continue;
+ /*
+ * Prevent the vCPU from being scheduled out or migrated until
+ * the IRTE is updated and its metadata has been added to the
+ * list of IRQs being posted to the vCPU, to ensure the IRTE
+ * isn't programmed with stale pCPU/IsRunning information.
+ */
+ guard(spinlock_irqsave)(&svm->ir_list_lock);
- /**
- * Here, we setup with legacy mode in the following cases:
- * 1. When cannot target interrupt to a specific vcpu.
- * 2. Unsetting posted interrupt.
- * 3. APIC virtualization is disabled for the vcpu.
- * 4. IRQ has incompatible delivery mode (SMI, INIT, etc)
+ /*
+ * Update the target pCPU for IOMMU doorbells if the vCPU is
+ * running. If the vCPU is NOT running, i.e. is blocking or
+ * scheduled out, KVM will update the pCPU info when the vCPU
+ * is awakened and/or scheduled in. See also avic_vcpu_load().
*/
- if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
- kvm_vcpu_apicv_active(&svm->vcpu)) {
- struct amd_iommu_pi_data pi;
-
- enable_remapped_mode = false;
-
- /* Try to enable guest_mode in IRTE */
- pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
- AVIC_HPA_MASK);
- pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
- svm->vcpu.vcpu_id);
- pi.is_guest_mode = true;
- pi.vcpu_data = &vcpu_info;
- ret = irq_set_vcpu_affinity(host_irq, &pi);
-
- /**
- * Here, we successfully setting up vcpu affinity in
- * IOMMU guest mode. Now, we need to store the posted
- * interrupt information in a per-vcpu ir_list so that
- * we can reference to them directly when we update vcpu
- * scheduling information in IOMMU irte.
- */
- if (!ret && pi.is_guest_mode)
- svm_ir_list_add(svm, &pi);
+ entry = svm->avic_physical_id_entry;
+ if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) {
+ pi_data.cpu = entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
+ } else {
+ pi_data.cpu = -1;
+ pi_data.ga_log_intr = entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR;
}
- if (!ret && svm) {
- trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id,
- e->gsi, vcpu_info.vector,
- vcpu_info.pi_desc_addr, set);
- }
+ ret = irq_set_vcpu_affinity(host_irq, &pi_data);
+ if (ret)
+ return ret;
- if (ret < 0) {
- pr_err("%s: failed to update PI IRTE\n", __func__);
- goto out;
+ /*
+ * Revert to legacy mode if the IOMMU didn't provide metadata
+ * for the IRTE, which KVM needs to keep the IRTE up-to-date,
+ * e.g. if the vCPU is migrated or AVIC is disabled.
+ */
+ if (WARN_ON_ONCE(!pi_data.ir_data)) {
+ irq_set_vcpu_affinity(host_irq, NULL);
+ return -EIO;
}
- }
- ret = 0;
- if (enable_remapped_mode) {
- /* Use legacy mode in IRTE */
- struct amd_iommu_pi_data pi;
+ irqfd->irq_bypass_data = pi_data.ir_data;
+ list_add(&irqfd->vcpu_list, &svm->ir_list);
+ return 0;
+ }
+ return irq_set_vcpu_affinity(host_irq, NULL);
+}
- /**
- * Here, pi is used to:
- * - Tell IOMMU to use legacy mode for this interrupt.
- * - Retrieve ga_tag of prior interrupt remapping data.
- */
- pi.prev_ga_tag = 0;
- pi.is_guest_mode = false;
- ret = irq_set_vcpu_affinity(host_irq, &pi);
+enum avic_vcpu_action {
+ /*
+ * There is no need to differentiate between activate and deactivate,
+ * as KVM only refreshes AVIC state when the vCPU is scheduled in and
+ * isn't blocking, i.e. the pCPU must always be (in)valid when AVIC is
+ * being (de)activated.
+ */
+ AVIC_TOGGLE_ON_OFF = BIT(0),
+ AVIC_ACTIVATE = AVIC_TOGGLE_ON_OFF,
+ AVIC_DEACTIVATE = AVIC_TOGGLE_ON_OFF,
- /**
- * Check if the posted interrupt was previously
- * setup with the guest_mode by checking if the ga_tag
- * was cached. If so, we need to clean up the per-vcpu
- * ir_list.
- */
- if (!ret && pi.prev_ga_tag) {
- int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
- struct kvm_vcpu *vcpu;
+ /*
+ * No unique action is required to deal with a vCPU that stops/starts
+ * running. A vCPU that starts running by definition stops blocking as
+ * well, and a vCPU that stops running can't have been blocking, i.e.
+ * doesn't need to toggle GALogIntr.
+ */
+ AVIC_START_RUNNING = 0,
+ AVIC_STOP_RUNNING = 0,
- vcpu = kvm_get_vcpu_by_id(kvm, id);
- if (vcpu)
- svm_ir_list_del(to_svm(vcpu), &pi);
- }
- }
-out:
- srcu_read_unlock(&kvm->irq_srcu, idx);
- return ret;
-}
+ /*
+ * When a vCPU starts blocking, KVM needs to set the GALogIntr flag
+ * int all associated IRTEs so that KVM can wake the vCPU if an IRQ is
+ * sent to the vCPU.
+ */
+ AVIC_START_BLOCKING = BIT(1),
+};
-static inline int
-avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
+static void avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu,
+ enum avic_vcpu_action action)
{
- int ret = 0;
- struct amd_svm_iommu_ir *ir;
+ bool ga_log_intr = (action & AVIC_START_BLOCKING);
struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_kernel_irqfd *irqfd;
lockdep_assert_held(&svm->ir_list_lock);
- if (!kvm_arch_has_assigned_device(vcpu->kvm))
- return 0;
-
/*
* Here, we go through the per-vcpu ir_list to update all existing
* interrupt remapping table entry targeting this vcpu.
*/
if (list_empty(&svm->ir_list))
- return 0;
+ return;
- list_for_each_entry(ir, &svm->ir_list, node) {
- ret = amd_iommu_update_ga(cpu, r, ir->data);
- if (ret)
- return ret;
+ list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) {
+ void *data = irqfd->irq_bypass_data;
+
+ if (!(action & AVIC_TOGGLE_ON_OFF))
+ WARN_ON_ONCE(amd_iommu_update_ga(data, cpu, ga_log_intr));
+ else if (cpu >= 0)
+ WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, cpu, ga_log_intr));
+ else
+ WARN_ON_ONCE(amd_iommu_deactivate_guest_mode(data));
}
- return 0;
}
-void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu,
+ enum avic_vcpu_action action)
{
- u64 entry;
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
int h_physical_id = kvm_cpu_get_apicid(cpu);
struct vcpu_svm *svm = to_svm(vcpu);
unsigned long flags;
+ u64 entry;
lockdep_assert_preemption_disabled();
if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
return;
- /*
- * No need to update anything if the vCPU is blocking, i.e. if the vCPU
- * is being scheduled in after being preempted. The CPU entries in the
- * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
- * If the vCPU was migrated, its new CPU value will be stuffed when the
- * vCPU unblocks.
- */
- if (kvm_vcpu_is_blocking(vcpu))
+ if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE))
return;
/*
@@ -1063,38 +895,57 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
*/
spin_lock_irqsave(&svm->ir_list_lock, flags);
- entry = READ_ONCE(*(svm->avic_physical_id_cache));
+ entry = svm->avic_physical_id_entry;
WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
- entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
+ entry &= ~(AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK |
+ AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR);
entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
- WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
- avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
+ svm->avic_physical_id_entry = entry;
+
+ /*
+ * If IPI virtualization is disabled, clear IsRunning when updating the
+ * actual Physical ID table, so that the CPU never sees IsRunning=1.
+ * Keep the APIC ID up-to-date in the entry to minimize the chances of
+ * things going sideways if hardware peeks at the ID.
+ */
+ if (!enable_ipiv)
+ entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+
+ WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry);
+
+ avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, action);
spin_unlock_irqrestore(&svm->ir_list_lock, flags);
}
-void avic_vcpu_put(struct kvm_vcpu *vcpu)
+void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
- u64 entry;
+ /*
+ * No need to update anything if the vCPU is blocking, i.e. if the vCPU
+ * is being scheduled in after being preempted. The CPU entries in the
+ * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
+ * If the vCPU was migrated, its new CPU value will be stuffed when the
+ * vCPU unblocks.
+ */
+ if (kvm_vcpu_is_blocking(vcpu))
+ return;
+
+ __avic_vcpu_load(vcpu, cpu, AVIC_START_RUNNING);
+}
+
+static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action)
+{
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
struct vcpu_svm *svm = to_svm(vcpu);
unsigned long flags;
+ u64 entry = svm->avic_physical_id_entry;
lockdep_assert_preemption_disabled();
- /*
- * Note, reading the Physical ID entry outside of ir_list_lock is safe
- * as only the pCPU that has loaded (or is loading) the vCPU is allowed
- * to modify the entry, and preemption is disabled. I.e. the vCPU
- * can't be scheduled out and thus avic_vcpu_{put,load}() can't run
- * recursively.
- */
- entry = READ_ONCE(*(svm->avic_physical_id_cache));
-
- /* Nothing to do if IsRunning == '0' due to vCPU blocking. */
- if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK))
+ if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE))
return;
/*
@@ -1107,13 +958,62 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu)
*/
spin_lock_irqsave(&svm->ir_list_lock, flags);
- avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
+ avic_update_iommu_vcpu_affinity(vcpu, -1, action);
+
+ WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR);
+ /*
+ * Keep the previous APIC ID in the entry so that a rogue doorbell from
+ * hardware is at least restricted to a CPU associated with the vCPU.
+ */
entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
- WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+
+ if (enable_ipiv)
+ WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry);
+
+ /*
+ * Note! Don't set AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR in the table as
+ * it's a synthetic flag that usurps an unused should-be-zero bit.
+ */
+ if (action & AVIC_START_BLOCKING)
+ entry |= AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR;
+
+ svm->avic_physical_id_entry = entry;
spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+}
+
+void avic_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Note, reading the Physical ID entry outside of ir_list_lock is safe
+ * as only the pCPU that has loaded (or is loading) the vCPU is allowed
+ * to modify the entry, and preemption is disabled. I.e. the vCPU
+ * can't be scheduled out and thus avic_vcpu_{put,load}() can't run
+ * recursively.
+ */
+ u64 entry = to_svm(vcpu)->avic_physical_id_entry;
+
+ /*
+ * Nothing to do if IsRunning == '0' due to vCPU blocking, i.e. if the
+ * vCPU is preempted while its in the process of blocking. WARN if the
+ * vCPU wasn't running and isn't blocking, KVM shouldn't attempt to put
+ * the AVIC if it wasn't previously loaded.
+ */
+ if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) {
+ if (WARN_ON_ONCE(!kvm_vcpu_is_blocking(vcpu)))
+ return;
+ /*
+ * The vCPU was preempted while blocking, ensure its IRTEs are
+ * configured to generate GA Log Interrupts.
+ */
+ if (!(WARN_ON_ONCE(!(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR))))
+ return;
+ }
+
+ __avic_vcpu_put(vcpu, kvm_vcpu_is_blocking(vcpu) ? AVIC_START_BLOCKING :
+ AVIC_STOP_RUNNING);
}
void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
@@ -1142,19 +1042,18 @@ void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
- bool activated = kvm_vcpu_apicv_active(vcpu);
-
if (!enable_apicv)
return;
+ /* APICv should only be toggled on/off while the vCPU is running. */
+ WARN_ON_ONCE(kvm_vcpu_is_blocking(vcpu));
+
avic_refresh_virtual_apic_mode(vcpu);
- if (activated)
- avic_vcpu_load(vcpu, vcpu->cpu);
+ if (kvm_vcpu_apicv_active(vcpu))
+ __avic_vcpu_load(vcpu, vcpu->cpu, AVIC_ACTIVATE);
else
- avic_vcpu_put(vcpu);
-
- avic_set_pi_irte_mode(vcpu, activated);
+ __avic_vcpu_put(vcpu, AVIC_DEACTIVATE);
}
void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
@@ -1162,20 +1061,25 @@ void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
if (!kvm_vcpu_apicv_active(vcpu))
return;
- /*
- * Unload the AVIC when the vCPU is about to block, _before_
- * the vCPU actually blocks.
- *
- * Any IRQs that arrive before IsRunning=0 will not cause an
- * incomplete IPI vmexit on the source, therefore vIRR will also
- * be checked by kvm_vcpu_check_block() before blocking. The
- * memory barrier implicit in set_current_state orders writing
- * IsRunning=0 before reading the vIRR. The processor needs a
- * matching memory barrier on interrupt delivery between writing
- * IRR and reading IsRunning; the lack of this barrier might be
- * the cause of errata #1235).
- */
- avic_vcpu_put(vcpu);
+ /*
+ * Unload the AVIC when the vCPU is about to block, _before_ the vCPU
+ * actually blocks.
+ *
+ * Note, any IRQs that arrive before IsRunning=0 will not cause an
+ * incomplete IPI vmexit on the source; kvm_vcpu_check_block() handles
+ * this by checking vIRR one last time before blocking. The memory
+ * barrier implicit in set_current_state orders writing IsRunning=0
+ * before reading the vIRR. The processor needs a matching memory
+ * barrier on interrupt delivery between writing IRR and reading
+ * IsRunning; the lack of this barrier might be the cause of errata #1235).
+ *
+ * Clear IsRunning=0 even if guest IRQs are disabled, i.e. even if KVM
+ * doesn't need to detect events for scheduling purposes. The doorbell
+ * used to signal running vCPUs cannot be blocked, i.e. will perturb the
+ * CPU and cause noisy neighbor problems if the VM is sending interrupts
+ * to the vCPU while it's scheduled out.
+ */
+ __avic_vcpu_put(vcpu, AVIC_START_BLOCKING);
}
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
@@ -1228,6 +1132,14 @@ bool avic_hardware_setup(void)
if (x2avic_enabled)
pr_info("x2AVIC enabled\n");
+ /*
+ * Disable IPI virtualization for AMD Family 17h CPUs (Zen1 and Zen2)
+ * due to erratum 1235, which results in missed VM-Exits on the sender
+ * and thus missed wake events for blocking vCPUs due to the CPU
+ * failing to see a software update to clear IsRunning.
+ */
+ enable_ipiv = enable_ipiv && boot_cpu_data.x86 != 0x17;
+
amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
return true;
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 8427a48b8b7a..b7fd2e869998 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -185,12 +185,87 @@ void recalc_intercepts(struct vcpu_svm *svm)
}
/*
+ * This array (and its actual size) holds the set of offsets (indexing by chunk
+ * size) to process when merging vmcb12's MSRPM with vmcb01's MSRPM. Note, the
+ * set of MSRs for which interception is disabled in vmcb01 is per-vCPU, e.g.
+ * based on CPUID features. This array only tracks MSRs that *might* be passed
+ * through to the guest.
+ *
+ * Hardcode the capacity of the array based on the maximum number of _offsets_.
+ * MSRs are batched together, so there are fewer offsets than MSRs.
+ */
+static int nested_svm_msrpm_merge_offsets[7] __ro_after_init;
+static int nested_svm_nr_msrpm_merge_offsets __ro_after_init;
+typedef unsigned long nsvm_msrpm_merge_t;
+
+int __init nested_svm_init_msrpm_merge_offsets(void)
+{
+ static const u32 merge_msrs[] __initconst = {
+ MSR_STAR,
+ MSR_IA32_SYSENTER_CS,
+ MSR_IA32_SYSENTER_EIP,
+ MSR_IA32_SYSENTER_ESP,
+ #ifdef CONFIG_X86_64
+ MSR_GS_BASE,
+ MSR_FS_BASE,
+ MSR_KERNEL_GS_BASE,
+ MSR_LSTAR,
+ MSR_CSTAR,
+ MSR_SYSCALL_MASK,
+ #endif
+ MSR_IA32_SPEC_CTRL,
+ MSR_IA32_PRED_CMD,
+ MSR_IA32_FLUSH_CMD,
+ MSR_IA32_APERF,
+ MSR_IA32_MPERF,
+ MSR_IA32_LASTBRANCHFROMIP,
+ MSR_IA32_LASTBRANCHTOIP,
+ MSR_IA32_LASTINTFROMIP,
+ MSR_IA32_LASTINTTOIP,
+ };
+ int i, j;
+
+ for (i = 0; i < ARRAY_SIZE(merge_msrs); i++) {
+ int bit_nr = svm_msrpm_bit_nr(merge_msrs[i]);
+ u32 offset;
+
+ if (WARN_ON(bit_nr < 0))
+ return -EIO;
+
+ /*
+ * Merging is done in chunks to reduce the number of accesses
+ * to L1's bitmap.
+ */
+ offset = bit_nr / BITS_PER_BYTE / sizeof(nsvm_msrpm_merge_t);
+
+ for (j = 0; j < nested_svm_nr_msrpm_merge_offsets; j++) {
+ if (nested_svm_msrpm_merge_offsets[j] == offset)
+ break;
+ }
+
+ if (j < nested_svm_nr_msrpm_merge_offsets)
+ continue;
+
+ if (WARN_ON(j >= ARRAY_SIZE(nested_svm_msrpm_merge_offsets)))
+ return -EIO;
+
+ nested_svm_msrpm_merge_offsets[j] = offset;
+ nested_svm_nr_msrpm_merge_offsets++;
+ }
+
+ return 0;
+}
+
+/*
* Merge L0's (KVM) and L1's (Nested VMCB) MSR permission bitmaps. The function
* is optimized in that it only merges the parts where KVM MSR permission bitmap
* may contain zero bits.
*/
-static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
+static bool nested_svm_merge_msrpm(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ nsvm_msrpm_merge_t *msrpm02 = svm->nested.msrpm;
+ nsvm_msrpm_merge_t *msrpm01 = svm->msrpm;
int i;
/*
@@ -205,7 +280,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
if (!svm->nested.force_msr_bitmap_recalc) {
struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments;
- if (kvm_hv_hypercall_enabled(&svm->vcpu) &&
+ if (kvm_hv_hypercall_enabled(vcpu) &&
hve->hv_enlightenments_control.msr_bitmap &&
(svm->nested.ctl.clean & BIT(HV_VMCB_NESTED_ENLIGHTENMENTS)))
goto set_msrpm_base_pa;
@@ -215,25 +290,17 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
return true;
- for (i = 0; i < MSRPM_OFFSETS; i++) {
- u32 value, p;
- u64 offset;
+ for (i = 0; i < nested_svm_nr_msrpm_merge_offsets; i++) {
+ const int p = nested_svm_msrpm_merge_offsets[i];
+ nsvm_msrpm_merge_t l1_val;
+ gpa_t gpa;
- if (msrpm_offsets[i] == 0xffffffff)
- break;
+ gpa = svm->nested.ctl.msrpm_base_pa + (p * sizeof(l1_val));
- p = msrpm_offsets[i];
-
- /* x2apic msrs are intercepted always for the nested guest */
- if (is_x2apic_msrpm_offset(p))
- continue;
-
- offset = svm->nested.ctl.msrpm_base_pa + (p * 4);
-
- if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))
+ if (kvm_vcpu_read_guest(vcpu, gpa, &l1_val, sizeof(l1_val)))
return false;
- svm->nested.msrpm[p] = svm->msrpm[p] | value;
+ msrpm02[p] = msrpm01[p] | l1_val;
}
svm->nested.force_msr_bitmap_recalc = false;
@@ -937,7 +1004,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, true))
goto out_exit_err;
- if (nested_svm_vmrun_msrpm(svm))
+ if (nested_svm_merge_msrpm(vcpu))
goto out;
out_exit_err:
@@ -1230,7 +1297,6 @@ int svm_allocate_nested(struct vcpu_svm *svm)
svm->nested.msrpm = svm_vcpu_alloc_msrpm();
if (!svm->nested.msrpm)
goto err_free_vmcb02;
- svm_vcpu_init_msrpm(&svm->vcpu, svm->nested.msrpm);
svm->nested.initialized = true;
return 0;
@@ -1290,26 +1356,26 @@ void svm_leave_nested(struct kvm_vcpu *vcpu)
static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
{
- u32 offset, msr, value;
- int write, mask;
+ gpa_t base = svm->nested.ctl.msrpm_base_pa;
+ int write, bit_nr;
+ u8 value, mask;
+ u32 msr;
if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
return NESTED_EXIT_HOST;
msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
- offset = svm_msrpm_offset(msr);
+ bit_nr = svm_msrpm_bit_nr(msr);
write = svm->vmcb->control.exit_info_1 & 1;
- mask = 1 << ((2 * (msr & 0xf)) + write);
- if (offset == MSR_INVALID)
+ if (bit_nr < 0)
return NESTED_EXIT_DONE;
- /* Offset is in 32 bit units but need in 8 bit units */
- offset *= 4;
-
- if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.ctl.msrpm_base_pa + offset, &value, 4))
+ if (kvm_vcpu_read_guest(&svm->vcpu, base + bit_nr / BITS_PER_BYTE,
+ &value, sizeof(value)))
return NESTED_EXIT_DONE;
+ mask = BIT(write) << (bit_nr & (BITS_PER_BYTE - 1));
return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
}
@@ -1819,13 +1885,11 @@ out_free:
static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
{
- struct vcpu_svm *svm = to_svm(vcpu);
-
if (WARN_ON(!is_guest_mode(vcpu)))
return true;
if (!vcpu->arch.pdptrs_from_userspace &&
- !nested_npt_enabled(svm) && is_pae_paging(vcpu))
+ !nested_npt_enabled(to_svm(vcpu)) && is_pae_paging(vcpu))
/*
* Reload the guest's PDPTRs since after a migration
* the guest CR3 might be restored prior to setting the nested
@@ -1834,7 +1898,7 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
return false;
- if (!nested_svm_vmrun_msrpm(svm)) {
+ if (!nested_svm_merge_msrpm(vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror =
KVM_INTERNAL_ERROR_EMULATION;
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index b201f77fcd49..2fbdebf79fbb 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -117,6 +117,7 @@ static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid)
*/
down_write(&sev_deactivate_lock);
+ /* SNP firmware requires use of WBINVD for ASID recycling. */
wbinvd_on_all_cpus();
if (sev_snp_enabled)
@@ -446,7 +447,12 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
init_args.probe = false;
ret = sev_platform_init(&init_args);
if (ret)
- goto e_free;
+ goto e_free_asid;
+
+ if (!zalloc_cpumask_var(&sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
+ ret = -ENOMEM;
+ goto e_free_asid;
+ }
/* This needs to happen after SEV/SNP firmware initialization. */
if (vm_type == KVM_X86_SNP_VM) {
@@ -464,6 +470,8 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
return 0;
e_free:
+ free_cpumask_var(sev->have_run_cpus);
+e_free_asid:
argp->error = init_args.error;
sev_asid_free(sev);
sev->asid = 0;
@@ -708,6 +716,33 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages)
}
}
+static void sev_writeback_caches(struct kvm *kvm)
+{
+ /*
+ * Note, the caller is responsible for ensuring correctness if the mask
+ * can be modified, e.g. if a CPU could be doing VMRUN.
+ */
+ if (cpumask_empty(to_kvm_sev_info(kvm)->have_run_cpus))
+ return;
+
+ /*
+ * Ensure that all dirty guest tagged cache entries are written back
+ * before releasing the pages back to the system for use. CLFLUSH will
+ * not do this without SME_COHERENT, and flushing many cache lines
+ * individually is slower than blasting WBINVD for large VMs, so issue
+ * WBNOINVD (or WBINVD if the "no invalidate" variant is unsupported)
+ * on CPUs that have done VMRUN, i.e. may have dirtied data using the
+ * VM's ASID.
+ *
+ * For simplicity, never remove CPUs from the bitmap. Ideally, KVM
+ * would clear the mask when flushing caches, but doing so requires
+ * serializing multiple calls and having responding CPUs (to the IPI)
+ * mark themselves as still running if they are running (or about to
+ * run) a vCPU for the VM.
+ */
+ wbnoinvd_on_cpus_mask(to_kvm_sev_info(kvm)->have_run_cpus);
+}
+
static unsigned long get_num_contig_pages(unsigned long idx,
struct page **inpages, unsigned long npages)
{
@@ -2037,6 +2072,17 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
if (ret)
goto out_source_vcpu;
+ /*
+ * Allocate a new have_run_cpus for the destination, i.e. don't copy
+ * the set of CPUs from the source. If a CPU was used to run a vCPU in
+ * the source VM but is never used for the destination VM, then the CPU
+ * can only have cached memory that was accessible to the source VM.
+ */
+ if (!zalloc_cpumask_var(&dst_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
+ ret = -ENOMEM;
+ goto out_source_vcpu;
+ }
+
sev_migrate_from(kvm, source_kvm);
kvm_vm_dead(source_kvm);
cg_cleanup_sev = src_sev;
@@ -2135,11 +2181,7 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
return -EINVAL;
/* Check for policy bits that must be set */
- if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO) ||
- !(params.policy & SNP_POLICY_MASK_SMT))
- return -EINVAL;
-
- if (params.policy & SNP_POLICY_MASK_SINGLE_SOCKET)
+ if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO))
return -EINVAL;
sev->policy = params.policy;
@@ -2698,12 +2740,7 @@ int sev_mem_enc_unregister_region(struct kvm *kvm,
goto failed;
}
- /*
- * Ensure that all guest tagged cache entries are flushed before
- * releasing the pages back to the system for use. CLFLUSH will
- * not do this, so issue a WBINVD.
- */
- wbinvd_on_all_cpus();
+ sev_writeback_caches(kvm);
__unregister_enc_region_locked(kvm, region);
@@ -2745,13 +2782,18 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
goto e_unlock;
}
+ mirror_sev = to_kvm_sev_info(kvm);
+ if (!zalloc_cpumask_var(&mirror_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
+ ret = -ENOMEM;
+ goto e_unlock;
+ }
+
/*
* The mirror kvm holds an enc_context_owner ref so its asid can't
* disappear until we're done with it
*/
source_sev = to_kvm_sev_info(source_kvm);
kvm_get_kvm(source_kvm);
- mirror_sev = to_kvm_sev_info(kvm);
list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms);
/* Set enc_context_owner and copy its encryption context over */
@@ -2813,7 +2855,13 @@ void sev_vm_destroy(struct kvm *kvm)
WARN_ON(!list_empty(&sev->mirror_vms));
- /* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */
+ free_cpumask_var(sev->have_run_cpus);
+
+ /*
+ * If this is a mirror VM, remove it from the owner's list of a mirrors
+ * and skip ASID cleanup (the ASID is tied to the lifetime of the owner).
+ * Note, mirror VMs don't support registering encrypted regions.
+ */
if (is_mirroring_enc_context(kvm)) {
struct kvm *owner_kvm = sev->enc_context_owner;
@@ -2824,12 +2872,6 @@ void sev_vm_destroy(struct kvm *kvm)
return;
}
- /*
- * Ensure that all guest tagged cache entries are flushed before
- * releasing the pages back to the system for use. CLFLUSH will
- * not do this, so issue a WBINVD.
- */
- wbinvd_on_all_cpus();
/*
* if userspace was terminated before unregistering the memory regions
@@ -3099,30 +3141,29 @@ static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va)
/*
* VM Page Flush takes a host virtual address and a guest ASID. Fall
- * back to WBINVD if this faults so as not to make any problems worse
- * by leaving stale encrypted data in the cache.
+ * back to full writeback of caches if this faults so as not to make
+ * any problems worse by leaving stale encrypted data in the cache.
*/
if (WARN_ON_ONCE(wrmsrq_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid)))
- goto do_wbinvd;
+ goto do_sev_writeback_caches;
return;
-do_wbinvd:
- wbinvd_on_all_cpus();
+do_sev_writeback_caches:
+ sev_writeback_caches(vcpu->kvm);
}
void sev_guest_memory_reclaimed(struct kvm *kvm)
{
/*
* With SNP+gmem, private/encrypted memory is unreachable via the
- * hva-based mmu notifiers, so these events are only actually
- * pertaining to shared pages where there is no need to perform
- * the WBINVD to flush associated caches.
+ * hva-based mmu notifiers, i.e. these events are explicitly scoped to
+ * shared pages, where there's no need to flush caches.
*/
if (!sev_guest(kvm) || sev_snp_guest(kvm))
return;
- wbinvd_on_all_cpus();
+ sev_writeback_caches(kvm);
}
void sev_free_vcpu(struct kvm_vcpu *vcpu)
@@ -3454,6 +3495,15 @@ int pre_sev_run(struct vcpu_svm *svm, int cpu)
if (sev_es_guest(kvm) && !VALID_PAGE(svm->vmcb->control.vmsa_pa))
return -EINVAL;
+ /*
+ * To optimize cache flushes when memory is reclaimed from an SEV VM,
+ * track physical CPUs that enter the guest for SEV VMs and thus can
+ * have encrypted, dirty data in the cache, and flush caches only for
+ * CPUs that have entered the guest.
+ */
+ if (!cpumask_test_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus))
+ cpumask_set_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus);
+
/* Assign the asid allocated with this SEV guest */
svm->asid = asid;
@@ -3886,9 +3936,9 @@ void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu)
* From this point forward, the VMSA will always be a guest-mapped page
* rather than the initial one allocated by KVM in svm->sev_es.vmsa. In
* theory, svm->sev_es.vmsa could be free'd and cleaned up here, but
- * that involves cleanups like wbinvd_on_all_cpus() which would ideally
- * be handled during teardown rather than guest boot. Deferring that
- * also allows the existing logic for SEV-ES VMSAs to be re-used with
+ * that involves cleanups like flushing caches, which would ideally be
+ * handled during teardown rather than guest boot. Deferring that also
+ * allows the existing logic for SEV-ES VMSAs to be re-used with
* minimal SNP-specific changes.
*/
svm->sev_es.snp_has_guest_vmsa = true;
@@ -4390,16 +4440,17 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in)
count, in);
}
-static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm)
+void sev_es_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *vcpu = &svm->vcpu;
-
- if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) {
- bool v_tsc_aux = guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) ||
- guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID);
+ /* Clear intercepts on MSRs that are context switched by hardware. */
+ svm_disable_intercept_for_msr(vcpu, MSR_AMD64_SEV_ES_GHCB, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_EFER, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_IA32_CR_PAT, MSR_TYPE_RW);
- set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, v_tsc_aux, v_tsc_aux);
- }
+ if (boot_cpu_has(X86_FEATURE_V_TSC_AUX))
+ svm_set_intercept_for_msr(vcpu, MSR_TSC_AUX, MSR_TYPE_RW,
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID));
/*
* For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if
@@ -4413,11 +4464,9 @@ static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm)
* XSAVES being exposed to the guest so that KVM can at least honor
* guest CPUID for RDMSR and WRMSR.
*/
- if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) &&
- guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 1, 1);
- else
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 0, 0);
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_XSS, MSR_TYPE_RW,
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) ||
+ !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES));
}
void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm)
@@ -4429,16 +4478,12 @@ void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm)
best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
if (best)
vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
-
- if (sev_es_guest(svm->vcpu.kvm))
- sev_es_vcpu_after_set_cpuid(svm);
}
static void sev_es_init_vmcb(struct vcpu_svm *svm)
{
struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm);
struct vmcb *vmcb = svm->vmcb01.ptr;
- struct kvm_vcpu *vcpu = &svm->vcpu;
svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE;
@@ -4496,10 +4541,6 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm)
/* Can't intercept XSETBV, HV can't modify XCR0 directly */
svm_clr_intercept(svm, INTERCEPT_XSETBV);
-
- /* Clear intercepts on selected MSRs */
- set_msr_interception(vcpu, svm->msrpm, MSR_EFER, 1, 1);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_CR_PAT, 1, 1);
}
void sev_init_vmcb(struct vcpu_svm *svm)
@@ -4888,7 +4929,7 @@ void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
/*
* SEV-ES avoids host/guest cache coherency issues through
- * WBINVD hooks issued via MMU notifiers during run-time, and
+ * WBNOINVD hooks issued via MMU notifiers during run-time, and
* KVM's VM destroy path at shutdown. Those MMU notifier events
* don't cover gmem since there is no requirement to map pages
* to a HVA in order to use them for a running guest. While the
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index ab9b947dbf4f..d9931c6c4bc6 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -72,8 +72,6 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
static bool erratum_383_found __read_mostly;
-u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
-
/*
* Set osvw_len to higher value when updated Revision Guides
* are published and we know what the new status bits are
@@ -82,72 +80,6 @@ static uint64_t osvw_len = 4, osvw_status;
static DEFINE_PER_CPU(u64, current_tsc_ratio);
-#define X2APIC_MSR(x) (APIC_BASE_MSR + (x >> 4))
-
-static const struct svm_direct_access_msrs {
- u32 index; /* Index of the MSR */
- bool always; /* True if intercept is initially cleared */
-} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
- { .index = MSR_STAR, .always = true },
- { .index = MSR_IA32_SYSENTER_CS, .always = true },
- { .index = MSR_IA32_SYSENTER_EIP, .always = false },
- { .index = MSR_IA32_SYSENTER_ESP, .always = false },
-#ifdef CONFIG_X86_64
- { .index = MSR_GS_BASE, .always = true },
- { .index = MSR_FS_BASE, .always = true },
- { .index = MSR_KERNEL_GS_BASE, .always = true },
- { .index = MSR_LSTAR, .always = true },
- { .index = MSR_CSTAR, .always = true },
- { .index = MSR_SYSCALL_MASK, .always = true },
-#endif
- { .index = MSR_IA32_SPEC_CTRL, .always = false },
- { .index = MSR_IA32_PRED_CMD, .always = false },
- { .index = MSR_IA32_FLUSH_CMD, .always = false },
- { .index = MSR_IA32_DEBUGCTLMSR, .always = false },
- { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
- { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
- { .index = MSR_IA32_LASTINTFROMIP, .always = false },
- { .index = MSR_IA32_LASTINTTOIP, .always = false },
- { .index = MSR_IA32_XSS, .always = false },
- { .index = MSR_EFER, .always = false },
- { .index = MSR_IA32_CR_PAT, .always = false },
- { .index = MSR_AMD64_SEV_ES_GHCB, .always = true },
- { .index = MSR_TSC_AUX, .always = false },
- { .index = X2APIC_MSR(APIC_ID), .always = false },
- { .index = X2APIC_MSR(APIC_LVR), .always = false },
- { .index = X2APIC_MSR(APIC_TASKPRI), .always = false },
- { .index = X2APIC_MSR(APIC_ARBPRI), .always = false },
- { .index = X2APIC_MSR(APIC_PROCPRI), .always = false },
- { .index = X2APIC_MSR(APIC_EOI), .always = false },
- { .index = X2APIC_MSR(APIC_RRR), .always = false },
- { .index = X2APIC_MSR(APIC_LDR), .always = false },
- { .index = X2APIC_MSR(APIC_DFR), .always = false },
- { .index = X2APIC_MSR(APIC_SPIV), .always = false },
- { .index = X2APIC_MSR(APIC_ISR), .always = false },
- { .index = X2APIC_MSR(APIC_TMR), .always = false },
- { .index = X2APIC_MSR(APIC_IRR), .always = false },
- { .index = X2APIC_MSR(APIC_ESR), .always = false },
- { .index = X2APIC_MSR(APIC_ICR), .always = false },
- { .index = X2APIC_MSR(APIC_ICR2), .always = false },
-
- /*
- * Note:
- * AMD does not virtualize APIC TSC-deadline timer mode, but it is
- * emulated by KVM. When setting APIC LVTT (0x832) register bit 18,
- * the AVIC hardware would generate GP fault. Therefore, always
- * intercept the MSR 0x832, and do not setup direct_access_msr.
- */
- { .index = X2APIC_MSR(APIC_LVTTHMR), .always = false },
- { .index = X2APIC_MSR(APIC_LVTPC), .always = false },
- { .index = X2APIC_MSR(APIC_LVT0), .always = false },
- { .index = X2APIC_MSR(APIC_LVT1), .always = false },
- { .index = X2APIC_MSR(APIC_LVTERR), .always = false },
- { .index = X2APIC_MSR(APIC_TMICT), .always = false },
- { .index = X2APIC_MSR(APIC_TMCCT), .always = false },
- { .index = X2APIC_MSR(APIC_TDCR), .always = false },
- { .index = MSR_INVALID, .always = false },
-};
-
/*
* These 2 parameters are used to config the controls for Pause-Loop Exiting:
* pause_filter_count: On processors that support Pause filtering(indicated
@@ -232,6 +164,7 @@ module_param(tsc_scaling, int, 0444);
*/
static bool avic;
module_param(avic, bool, 0444);
+module_param(enable_ipiv, bool, 0444);
module_param(enable_device_posted_irqs, bool, 0444);
@@ -264,33 +197,6 @@ static DEFINE_MUTEX(vmcb_dump_mutex);
*/
static int tsc_aux_uret_slot __read_mostly = -1;
-static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
-
-#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
-#define MSRS_RANGE_SIZE 2048
-#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
-
-u32 svm_msrpm_offset(u32 msr)
-{
- u32 offset;
- int i;
-
- for (i = 0; i < NUM_MSR_MAPS; i++) {
- if (msr < msrpm_ranges[i] ||
- msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
- continue;
-
- offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
- offset += (i * MSRS_RANGE_SIZE); /* add range offset */
-
- /* Now we have the u8 offset - but need the u32 offset */
- return offset / 4;
- }
-
- /* MSR not in any range */
- return MSR_INVALID;
-}
-
static int get_npt_level(void)
{
#ifdef CONFIG_X86_64
@@ -757,50 +663,8 @@ static void clr_dr_intercepts(struct vcpu_svm *svm)
recalc_intercepts(svm);
}
-static int direct_access_msr_slot(u32 msr)
-{
- u32 i;
-
- for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
- if (direct_access_msrs[i].index == msr)
- return i;
-
- return -ENOENT;
-}
-
-static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read,
- int write)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- int slot = direct_access_msr_slot(msr);
-
- if (slot == -ENOENT)
- return;
-
- /* Set the shadow bitmaps to the desired intercept states */
- if (read)
- set_bit(slot, svm->shadow_msr_intercept.read);
- else
- clear_bit(slot, svm->shadow_msr_intercept.read);
-
- if (write)
- set_bit(slot, svm->shadow_msr_intercept.write);
- else
- clear_bit(slot, svm->shadow_msr_intercept.write);
-}
-
-static bool valid_msr_intercept(u32 index)
-{
- return direct_access_msr_slot(index) != -ENOENT;
-}
-
static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
{
- u8 bit_write;
- unsigned long tmp;
- u32 offset;
- u32 *msrpm;
-
/*
* For non-nested case:
* If the L01 MSR bitmap does not intercept the MSR, then we need to
@@ -810,90 +674,102 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
* If the L02 MSR bitmap does not intercept the MSR, then we need to
* save it.
*/
- msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
- to_svm(vcpu)->msrpm;
+ void *msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm :
+ to_svm(vcpu)->msrpm;
- offset = svm_msrpm_offset(msr);
- bit_write = 2 * (msr & 0x0f) + 1;
- tmp = msrpm[offset];
-
- BUG_ON(offset == MSR_INVALID);
-
- return test_bit(bit_write, &tmp);
+ return svm_test_msr_bitmap_write(msrpm, msr);
}
-static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
- u32 msr, int read, int write)
+void svm_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set)
{
struct vcpu_svm *svm = to_svm(vcpu);
- u8 bit_read, bit_write;
- unsigned long tmp;
- u32 offset;
+ void *msrpm = svm->msrpm;
- /*
- * If this warning triggers extend the direct_access_msrs list at the
- * beginning of the file
- */
- WARN_ON(!valid_msr_intercept(msr));
-
- /* Enforce non allowed MSRs to trap */
- if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
- read = 0;
-
- if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
- write = 0;
-
- offset = svm_msrpm_offset(msr);
- bit_read = 2 * (msr & 0x0f);
- bit_write = 2 * (msr & 0x0f) + 1;
- tmp = msrpm[offset];
-
- BUG_ON(offset == MSR_INVALID);
-
- read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp);
- write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
+ /* Don't disable interception for MSRs userspace wants to handle. */
+ if (type & MSR_TYPE_R) {
+ if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
+ svm_clear_msr_bitmap_read(msrpm, msr);
+ else
+ svm_set_msr_bitmap_read(msrpm, msr);
+ }
- msrpm[offset] = tmp;
+ if (type & MSR_TYPE_W) {
+ if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
+ svm_clear_msr_bitmap_write(msrpm, msr);
+ else
+ svm_set_msr_bitmap_write(msrpm, msr);
+ }
svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
svm->nested.force_msr_bitmap_recalc = true;
}
-void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
- int read, int write)
-{
- set_shadow_msr_intercept(vcpu, msr, read, write);
- set_msr_interception_bitmap(vcpu, msrpm, msr, read, write);
-}
-
-u32 *svm_vcpu_alloc_msrpm(void)
+void *svm_alloc_permissions_map(unsigned long size, gfp_t gfp_mask)
{
- unsigned int order = get_order(MSRPM_SIZE);
- struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order);
- u32 *msrpm;
+ unsigned int order = get_order(size);
+ struct page *pages = alloc_pages(gfp_mask, order);
+ void *pm;
if (!pages)
return NULL;
- msrpm = page_address(pages);
- memset(msrpm, 0xff, PAGE_SIZE * (1 << order));
+ /*
+ * Set all bits in the permissions map so that all MSR and I/O accesses
+ * are intercepted by default.
+ */
+ pm = page_address(pages);
+ memset(pm, 0xff, PAGE_SIZE * (1 << order));
- return msrpm;
+ return pm;
}
-void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
+static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu)
{
- int i;
+ bool intercept = !(to_svm(vcpu)->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK);
- for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
- if (!direct_access_msrs[i].always)
- continue;
- set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1);
- }
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHFROMIP, MSR_TYPE_RW, intercept);
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHTOIP, MSR_TYPE_RW, intercept);
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTFROMIP, MSR_TYPE_RW, intercept);
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTTOIP, MSR_TYPE_RW, intercept);
+
+ if (sev_es_guest(vcpu->kvm))
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_DEBUGCTLMSR, MSR_TYPE_RW, intercept);
}
void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept)
{
+ static const u32 x2avic_passthrough_msrs[] = {
+ X2APIC_MSR(APIC_ID),
+ X2APIC_MSR(APIC_LVR),
+ X2APIC_MSR(APIC_TASKPRI),
+ X2APIC_MSR(APIC_ARBPRI),
+ X2APIC_MSR(APIC_PROCPRI),
+ X2APIC_MSR(APIC_EOI),
+ X2APIC_MSR(APIC_RRR),
+ X2APIC_MSR(APIC_LDR),
+ X2APIC_MSR(APIC_DFR),
+ X2APIC_MSR(APIC_SPIV),
+ X2APIC_MSR(APIC_ISR),
+ X2APIC_MSR(APIC_TMR),
+ X2APIC_MSR(APIC_IRR),
+ X2APIC_MSR(APIC_ESR),
+ X2APIC_MSR(APIC_ICR),
+ X2APIC_MSR(APIC_ICR2),
+
+ /*
+ * Note! Always intercept LVTT, as TSC-deadline timer mode
+ * isn't virtualized by hardware, and the CPU will generate a
+ * #GP instead of a #VMEXIT.
+ */
+ X2APIC_MSR(APIC_LVTTHMR),
+ X2APIC_MSR(APIC_LVTPC),
+ X2APIC_MSR(APIC_LVT0),
+ X2APIC_MSR(APIC_LVT1),
+ X2APIC_MSR(APIC_LVTERR),
+ X2APIC_MSR(APIC_TMICT),
+ X2APIC_MSR(APIC_TMCCT),
+ X2APIC_MSR(APIC_TDCR),
+ };
int i;
if (intercept == svm->x2avic_msrs_intercepted)
@@ -902,84 +778,79 @@ void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept)
if (!x2avic_enabled)
return;
- for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) {
- int index = direct_access_msrs[i].index;
-
- if ((index < APIC_BASE_MSR) ||
- (index > APIC_BASE_MSR + 0xff))
- continue;
- set_msr_interception(&svm->vcpu, svm->msrpm, index,
- !intercept, !intercept);
- }
+ for (i = 0; i < ARRAY_SIZE(x2avic_passthrough_msrs); i++)
+ svm_set_intercept_for_msr(&svm->vcpu, x2avic_passthrough_msrs[i],
+ MSR_TYPE_RW, intercept);
svm->x2avic_msrs_intercepted = intercept;
}
-void svm_vcpu_free_msrpm(u32 *msrpm)
+void svm_vcpu_free_msrpm(void *msrpm)
{
__free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
}
-static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
+static void svm_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- u32 i;
- /*
- * Set intercept permissions for all direct access MSRs again. They
- * will automatically get filtered through the MSR filter, so we are
- * back in sync after this.
- */
- for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
- u32 msr = direct_access_msrs[i].index;
- u32 read = test_bit(i, svm->shadow_msr_intercept.read);
- u32 write = test_bit(i, svm->shadow_msr_intercept.write);
-
- set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write);
- }
-}
-
-static void add_msr_offset(u32 offset)
-{
- int i;
-
- for (i = 0; i < MSRPM_OFFSETS; ++i) {
+ svm_disable_intercept_for_msr(vcpu, MSR_STAR, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
- /* Offset already in list? */
- if (msrpm_offsets[i] == offset)
- return;
+#ifdef CONFIG_X86_64
+ svm_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_LSTAR, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_CSTAR, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_SYSCALL_MASK, MSR_TYPE_RW);
+#endif
- /* Slot used by another offset? */
- if (msrpm_offsets[i] != MSR_INVALID)
- continue;
+ if (lbrv)
+ svm_recalc_lbr_msr_intercepts(vcpu);
- /* Add offset to list */
- msrpm_offsets[i] = offset;
+ if (cpu_feature_enabled(X86_FEATURE_IBPB))
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
+ !guest_has_pred_cmd_msr(vcpu));
- return;
- }
+ if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D))
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
/*
- * If this BUG triggers the msrpm_offsets table has an overflow. Just
- * increase MSRPM_OFFSETS in this case.
+ * Disable interception of SPEC_CTRL if KVM doesn't need to manually
+ * context switch the MSR (SPEC_CTRL is virtualized by the CPU), or if
+ * the guest has a non-zero SPEC_CTRL value, i.e. is likely actively
+ * using SPEC_CTRL.
*/
- BUG();
-}
-
-static void init_msrpm_offsets(void)
-{
- int i;
-
- memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
+ if (cpu_feature_enabled(X86_FEATURE_V_SPEC_CTRL))
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
+ !guest_has_spec_ctrl_msr(vcpu));
+ else
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
+ !svm->spec_ctrl);
- for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
- u32 offset;
+ /*
+ * Intercept SYSENTER_EIP and SYSENTER_ESP when emulating an Intel CPU,
+ * as AMD hardware only store 32 bits, whereas Intel CPUs track 64 bits.
+ */
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW,
+ guest_cpuid_is_intel_compatible(vcpu));
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW,
+ guest_cpuid_is_intel_compatible(vcpu));
+
+ if (kvm_aperfmperf_in_guest(vcpu->kvm)) {
+ svm_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R);
+ svm_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R);
+ }
- offset = svm_msrpm_offset(direct_access_msrs[i].index);
- BUG_ON(offset == MSR_INVALID);
+ if (sev_es_guest(vcpu->kvm))
+ sev_es_recalc_msr_intercepts(vcpu);
- add_msr_offset(offset);
- }
+ /*
+ * x2APIC intercepts are modified on-demand and cannot be filtered by
+ * userspace.
+ */
}
void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
@@ -998,13 +869,7 @@ void svm_enable_lbrv(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
-
- if (sev_es_guest(vcpu->kvm))
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_DEBUGCTLMSR, 1, 1);
+ svm_recalc_lbr_msr_intercepts(vcpu);
/* Move the LBR msrs to the vmcb02 so that the guest can see them. */
if (is_guest_mode(vcpu))
@@ -1016,12 +881,8 @@ static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm);
-
svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
+ svm_recalc_lbr_msr_intercepts(vcpu);
/*
* Move the LBR msrs back to the vmcb01 to avoid copying them
@@ -1176,9 +1037,10 @@ void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu)
}
/* Evaluate instruction intercepts that depend on guest CPUID features. */
-static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
- struct vcpu_svm *svm)
+static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
/*
* Intercept INVPCID if shadow paging is enabled to sync/free shadow
* roots, or if INVPCID is disabled in the guest to inject #UD.
@@ -1197,24 +1059,11 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
else
svm_set_intercept(svm, INTERCEPT_RDTSCP);
}
-}
-
-static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
if (guest_cpuid_is_intel_compatible(vcpu)) {
- /*
- * We must intercept SYSENTER_EIP and SYSENTER_ESP
- * accesses because the processor only stores 32 bits.
- * For the same reason we cannot use virtual VMLOAD/VMSAVE.
- */
svm_set_intercept(svm, INTERCEPT_VMLOAD);
svm_set_intercept(svm, INTERCEPT_VMSAVE);
svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
-
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
} else {
/*
* If hardware supports Virtual VMLOAD VMSAVE then enable it
@@ -1225,12 +1074,15 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
svm_clr_intercept(svm, INTERCEPT_VMSAVE);
svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
}
- /* No need to intercept these MSRs */
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
}
}
+static void svm_recalc_intercepts_after_set_cpuid(struct kvm_vcpu *vcpu)
+{
+ svm_recalc_instruction_intercepts(vcpu);
+ svm_recalc_msr_intercepts(vcpu);
+}
+
static void init_vmcb(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1353,15 +1205,6 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
svm_clr_intercept(svm, INTERCEPT_PAUSE);
}
- svm_recalc_instruction_intercepts(vcpu, svm);
-
- /*
- * If the host supports V_SPEC_CTRL then disable the interception
- * of MSR_IA32_SPEC_CTRL.
- */
- if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
-
if (kvm_vcpu_apicv_active(vcpu))
avic_init_vmcb(svm, vmcb);
@@ -1381,7 +1224,8 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
sev_init_vmcb(svm);
svm_hv_init_vmcb(vmcb);
- init_vmcb_after_set_cpuid(vcpu);
+
+ svm_recalc_intercepts_after_set_cpuid(vcpu);
vmcb_mark_all_dirty(vmcb);
@@ -1392,8 +1236,6 @@ static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- svm_vcpu_init_msrpm(vcpu, svm->msrpm);
-
svm_init_osvw(vcpu);
if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
@@ -1490,13 +1332,15 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ WARN_ON_ONCE(!list_empty(&svm->ir_list));
+
svm_leave_nested(vcpu);
svm_free_nested(svm);
sev_free_vcpu(vcpu);
__free_page(__sme_pa_to_page(svm->vmcb01.pa));
- __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
+ svm_vcpu_free_msrpm(svm->msrpm);
}
#ifdef CONFIG_CPU_MITIGATIONS
@@ -2880,12 +2724,11 @@ static int svm_get_feature_msr(u32 msr, u64 *data)
return 0;
}
-static bool
-sev_es_prevent_msr_access(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+static bool sev_es_prevent_msr_access(struct kvm_vcpu *vcpu,
+ struct msr_data *msr_info)
{
return sev_es_guest(vcpu->kvm) &&
vcpu->arch.guest_state_protected &&
- svm_msrpm_offset(msr_info->index) != MSR_INVALID &&
!msr_write_intercepted(vcpu, msr_info->index);
}
@@ -3116,11 +2959,11 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
*
* For nested:
* The handling of the MSR bitmap for L2 guests is done in
- * nested_svm_vmrun_msrpm.
+ * nested_svm_merge_msrpm().
* We update the L1 MSR bit as well since it will end up
* touching the MSR anyway now.
*/
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
+ svm_disable_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);
break;
case MSR_AMD64_VIRT_SPEC_CTRL:
if (!msr->host_initiated &&
@@ -3186,8 +3029,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
/*
* TSC_AUX is usually changed only during boot and never read
- * directly. Intercept TSC_AUX instead of exposing it to the
- * guest via direct_access_msrs, and switch it via user return.
+ * directly. Intercept TSC_AUX and switch it via user return.
*/
preempt_disable();
ret = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
@@ -4389,9 +4231,9 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
guest_state_exit_irqoff();
}
-static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
- bool force_immediate_exit)
+static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
{
+ bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
struct vcpu_svm *svm = to_svm(vcpu);
bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
@@ -4438,10 +4280,13 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
svm_hv_update_vp_id(svm->vmcb, vcpu);
/*
- * Run with all-zero DR6 unless needed, so that we can get the exact cause
- * of a #DB.
+ * Run with all-zero DR6 unless the guest can write DR6 freely, so that
+ * KVM can get the exact cause of a #DB. Note, loading guest DR6 from
+ * KVM's snapshot is only necessary when DR accesses won't exit.
*/
- if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
+ if (unlikely(run_flags & KVM_RUN_LOAD_GUEST_DR6))
+ svm_set_dr6(vcpu, vcpu->arch.dr6);
+ else if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
svm_set_dr6(vcpu, DR6_ACTIVE_LOW);
clgi();
@@ -4621,20 +4466,10 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
if (guest_cpuid_is_intel_compatible(vcpu))
guest_cpu_cap_clear(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
- svm_recalc_instruction_intercepts(vcpu, svm);
-
- if (boot_cpu_has(X86_FEATURE_IBPB))
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0,
- !!guest_has_pred_cmd_msr(vcpu));
-
- if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0,
- !!guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
-
if (sev_guest(vcpu->kvm))
sev_vcpu_after_set_cpuid(svm);
- init_vmcb_after_set_cpuid(vcpu);
+ svm_recalc_intercepts_after_set_cpuid(vcpu);
}
static bool svm_has_wbinvd_exit(void)
@@ -5185,7 +5020,7 @@ static int svm_vm_init(struct kvm *kvm)
}
if (!pause_filter_count || !pause_filter_thresh)
- kvm->arch.pause_in_guest = true;
+ kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE);
if (enable_apicv) {
int ret = avic_vm_init(kvm);
@@ -5252,7 +5087,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.set_idt = svm_set_idt,
.get_gdt = svm_get_gdt,
.set_gdt = svm_set_gdt,
- .set_dr6 = svm_set_dr6,
.set_dr7 = svm_set_dr7,
.sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
.cache_reg = svm_cache_reg,
@@ -5337,7 +5171,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.apic_init_signal_blocked = svm_apic_init_signal_blocked,
- .msr_filter_changed = svm_msr_filter_changed,
+ .recalc_msr_intercepts = svm_recalc_msr_intercepts,
.complete_emulated_msr = svm_complete_emulated_msr,
.vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
@@ -5473,11 +5307,8 @@ static __init void svm_set_cpu_caps(void)
static __init int svm_hardware_setup(void)
{
- int cpu;
- struct page *iopm_pages;
void *iopm_va;
- int r;
- unsigned int order = get_order(IOPM_SIZE);
+ int cpu, r;
/*
* NX is required for shadow paging and for NPT if the NX huge pages
@@ -5489,17 +5320,6 @@ static __init int svm_hardware_setup(void)
}
kvm_enable_efer_bits(EFER_NX);
- iopm_pages = alloc_pages(GFP_KERNEL, order);
-
- if (!iopm_pages)
- return -ENOMEM;
-
- iopm_va = page_address(iopm_pages);
- memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
- iopm_base = __sme_page_pa(iopm_pages);
-
- init_msrpm_offsets();
-
kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
XFEATURE_MASK_BNDCSR);
@@ -5533,6 +5353,10 @@ static __init int svm_hardware_setup(void)
if (nested) {
pr_info("Nested Virtualization enabled\n");
kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
+
+ r = nested_svm_init_msrpm_merge_offsets();
+ if (r)
+ return r;
}
/*
@@ -5564,6 +5388,13 @@ static __init int svm_hardware_setup(void)
else
pr_info("LBR virtualization supported\n");
}
+
+ iopm_va = svm_alloc_permissions_map(IOPM_SIZE, GFP_KERNEL);
+ if (!iopm_va)
+ return -ENOMEM;
+
+ iopm_base = __sme_set(__pa(iopm_va));
+
/*
* Note, SEV setup consumes npt_enabled and enable_mmio_caching (which
* may be modified by svm_adjust_mmio_mask()), as well as nrips.
@@ -5581,6 +5412,7 @@ static __init int svm_hardware_setup(void)
enable_apicv = avic = avic && avic_hardware_setup();
if (!enable_apicv) {
+ enable_ipiv = false;
svm_x86_ops.vcpu_blocking = NULL;
svm_x86_ops.vcpu_unblocking = NULL;
svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
@@ -5662,6 +5494,8 @@ static int __init svm_init(void)
{
int r;
+ KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_svm);
+
__unused_size_checks();
if (!kvm_is_svm_supported())
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index e6f3c6a153a0..58b9d168e0c8 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -44,9 +44,6 @@ static inline struct page *__sme_pa_to_page(unsigned long pa)
#define IOPM_SIZE PAGE_SIZE * 3
#define MSRPM_SIZE PAGE_SIZE * 2
-#define MAX_DIRECT_ACCESS_MSRS 48
-#define MSRPM_OFFSETS 32
-extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
extern bool npt_enabled;
extern int nrips;
extern int vgif;
@@ -113,6 +110,7 @@ struct kvm_sev_info {
void *guest_req_buf; /* Bounce buffer for SNP Guest Request input */
void *guest_resp_buf; /* Bounce buffer for SNP Guest Request output */
struct mutex guest_req_mutex; /* Must acquire before using bounce buffers */
+ cpumask_var_t have_run_cpus; /* CPUs that have done VMRUN for this VM. */
};
#define SEV_POLICY_NODBG BIT_ULL(0)
@@ -123,8 +121,8 @@ struct kvm_svm {
/* Struct members for AVIC */
u32 avic_vm_id;
- struct page *avic_logical_id_table_page;
- struct page *avic_physical_id_table_page;
+ u32 *avic_logical_id_table;
+ u64 *avic_physical_id_table;
struct hlist_node hnode;
struct kvm_sev_info sev_info;
@@ -189,8 +187,11 @@ struct svm_nested_state {
u64 vmcb12_gpa;
u64 last_vmcb12_gpa;
- /* These are the merged vectors */
- u32 *msrpm;
+ /*
+ * The MSR permissions map used for vmcb02, which is the merge result
+ * of vmcb01 and vmcb12
+ */
+ void *msrpm;
/* A VMRUN has started but has not yet been performed, so
* we cannot inject a nested vmexit yet. */
@@ -271,7 +272,7 @@ struct vcpu_svm {
*/
u64 virt_spec_ctrl;
- u32 *msrpm;
+ void *msrpm;
ulong nmi_iret_rip;
@@ -306,24 +307,26 @@ struct vcpu_svm {
u32 ldr_reg;
u32 dfr_reg;
- struct page *avic_backing_page;
- u64 *avic_physical_id_cache;
+
+ /* This is essentially a shadow of the vCPU's actual entry in the
+ * Physical ID table that is programmed into the VMCB, i.e. that is
+ * seen by the CPU. If IPI virtualization is disabled, IsRunning is
+ * only ever set in the shadow, i.e. is never propagated to the "real"
+ * table, so that hardware never sees IsRunning=1.
+ */
+ u64 avic_physical_id_entry;
/*
- * Per-vcpu list of struct amd_svm_iommu_ir:
- * This is used mainly to store interrupt remapping information used
- * when update the vcpu affinity. This avoids the need to scan for
- * IRTE and try to match ga_tag in the IOMMU driver.
+ * Per-vCPU list of irqfds that are eligible to post IRQs directly to
+ * the vCPU (a.k.a. device posted IRQs, a.k.a. IRQ bypass). The list
+ * is used to reconfigure IRTEs when the vCPU is loaded/put (to set the
+ * target pCPU), when AVIC is toggled on/off (to (de)activate bypass),
+ * and if the irqfd becomes ineligible for posting (to put the IRTE
+ * back into remapped mode).
*/
struct list_head ir_list;
spinlock_t ir_list_lock;
- /* Save desired MSR intercept (read: pass-through) state */
- struct {
- DECLARE_BITMAP(read, MAX_DIRECT_ACCESS_MSRS);
- DECLARE_BITMAP(write, MAX_DIRECT_ACCESS_MSRS);
- } shadow_msr_intercept;
-
struct vcpu_sev_es_state sev_es;
bool guest_state_loaded;
@@ -613,17 +616,74 @@ static inline void svm_vmgexit_no_action(struct vcpu_svm *svm, u64 data)
svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_NO_ACTION, data);
}
-/* svm.c */
-#define MSR_INVALID 0xffffffffU
+/*
+ * The MSRPM is 8KiB in size, divided into four 2KiB ranges (the fourth range
+ * is reserved). Each MSR within a range is covered by two bits, one each for
+ * read (bit 0) and write (bit 1), where a bit value of '1' means intercepted.
+ */
+#define SVM_MSRPM_BYTES_PER_RANGE 2048
+#define SVM_BITS_PER_MSR 2
+#define SVM_MSRS_PER_BYTE (BITS_PER_BYTE / SVM_BITS_PER_MSR)
+#define SVM_MSRS_PER_RANGE (SVM_MSRPM_BYTES_PER_RANGE * SVM_MSRS_PER_BYTE)
+static_assert(SVM_MSRS_PER_RANGE == 8192);
+#define SVM_MSRPM_OFFSET_MASK (SVM_MSRS_PER_RANGE - 1)
+
+static __always_inline int svm_msrpm_bit_nr(u32 msr)
+{
+ int range_nr;
+
+ switch (msr & ~SVM_MSRPM_OFFSET_MASK) {
+ case 0:
+ range_nr = 0;
+ break;
+ case 0xc0000000:
+ range_nr = 1;
+ break;
+ case 0xc0010000:
+ range_nr = 2;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return range_nr * SVM_MSRPM_BYTES_PER_RANGE * BITS_PER_BYTE +
+ (msr & SVM_MSRPM_OFFSET_MASK) * SVM_BITS_PER_MSR;
+}
+
+#define __BUILD_SVM_MSR_BITMAP_HELPER(rtype, action, bitop, access, bit_rw) \
+static inline rtype svm_##action##_msr_bitmap_##access(unsigned long *bitmap, \
+ u32 msr) \
+{ \
+ int bit_nr; \
+ \
+ bit_nr = svm_msrpm_bit_nr(msr); \
+ if (bit_nr < 0) \
+ return (rtype)true; \
+ \
+ return bitop##_bit(bit_nr + bit_rw, bitmap); \
+}
+
+#define BUILD_SVM_MSR_BITMAP_HELPERS(ret_type, action, bitop) \
+ __BUILD_SVM_MSR_BITMAP_HELPER(ret_type, action, bitop, read, 0) \
+ __BUILD_SVM_MSR_BITMAP_HELPER(ret_type, action, bitop, write, 1)
+
+BUILD_SVM_MSR_BITMAP_HELPERS(bool, test, test)
+BUILD_SVM_MSR_BITMAP_HELPERS(void, clear, __clear)
+BUILD_SVM_MSR_BITMAP_HELPERS(void, set, __set)
#define DEBUGCTL_RESERVED_BITS (~DEBUGCTLMSR_LBR)
+/* svm.c */
extern bool dump_invalid_vmcb;
-u32 svm_msrpm_offset(u32 msr);
-u32 *svm_vcpu_alloc_msrpm(void);
-void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm);
-void svm_vcpu_free_msrpm(u32 *msrpm);
+void *svm_alloc_permissions_map(unsigned long size, gfp_t gfp_mask);
+
+static inline void *svm_vcpu_alloc_msrpm(void)
+{
+ return svm_alloc_permissions_map(MSRPM_SIZE, GFP_KERNEL_ACCOUNT);
+}
+
+void svm_vcpu_free_msrpm(void *msrpm);
void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb);
void svm_enable_lbrv(struct kvm_vcpu *vcpu);
void svm_update_lbrv(struct kvm_vcpu *vcpu);
@@ -643,6 +703,20 @@ void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool disable);
void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
int trig_mode, int vec);
+void svm_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set);
+
+static inline void svm_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
+ u32 msr, int type)
+{
+ svm_set_intercept_for_msr(vcpu, msr, type, false);
+}
+
+static inline void svm_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
+ u32 msr, int type)
+{
+ svm_set_intercept_for_msr(vcpu, msr, type, true);
+}
+
/* nested.c */
#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
@@ -671,6 +745,8 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_NMI);
}
+int __init nested_svm_init_msrpm_merge_offsets(void);
+
int enter_svm_guest_mode(struct kvm_vcpu *vcpu,
u64 vmcb_gpa, struct vmcb *vmcb12, bool from_vmrun);
void svm_leave_nested(struct kvm_vcpu *vcpu);
@@ -721,7 +797,8 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \
BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) | \
- BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED) \
+ BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED) | \
+ BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG) \
)
bool avic_hardware_setup(void);
@@ -736,8 +813,9 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
void avic_vcpu_put(struct kvm_vcpu *vcpu);
void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu);
void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
-int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set);
+int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+ unsigned int host_irq, uint32_t guest_irq,
+ struct kvm_vcpu *vcpu, u32 vector);
void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
void avic_ring_doorbell(struct kvm_vcpu *vcpu);
@@ -752,6 +830,7 @@ void sev_init_vmcb(struct vcpu_svm *svm);
void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm);
int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
void sev_es_vcpu_reset(struct vcpu_svm *svm);
+void sev_es_recalc_msr_intercepts(struct kvm_vcpu *vcpu);
void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa);
void sev_es_unmap_ghcb(struct vcpu_svm *svm);
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index ba736cbb0587..57d79fd31df0 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -260,6 +260,86 @@ TRACE_EVENT(kvm_cpuid,
__entry->used_max_basic ? ", used max basic" : "")
);
+#define kvm_deliver_mode \
+ {0x0, "Fixed"}, \
+ {0x1, "LowPrio"}, \
+ {0x2, "SMI"}, \
+ {0x3, "Res3"}, \
+ {0x4, "NMI"}, \
+ {0x5, "INIT"}, \
+ {0x6, "SIPI"}, \
+ {0x7, "ExtINT"}
+
+#ifdef CONFIG_KVM_IOAPIC
+TRACE_EVENT(kvm_ioapic_set_irq,
+ TP_PROTO(__u64 e, int pin, bool coalesced),
+ TP_ARGS(e, pin, coalesced),
+
+ TP_STRUCT__entry(
+ __field( __u64, e )
+ __field( int, pin )
+ __field( bool, coalesced )
+ ),
+
+ TP_fast_assign(
+ __entry->e = e;
+ __entry->pin = pin;
+ __entry->coalesced = coalesced;
+ ),
+
+ TP_printk("pin %u dst %x vec %u (%s|%s|%s%s)%s",
+ __entry->pin, (u8)(__entry->e >> 56), (u8)__entry->e,
+ __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
+ (__entry->e & (1<<11)) ? "logical" : "physical",
+ (__entry->e & (1<<15)) ? "level" : "edge",
+ (__entry->e & (1<<16)) ? "|masked" : "",
+ __entry->coalesced ? " (coalesced)" : "")
+);
+
+TRACE_EVENT(kvm_ioapic_delayed_eoi_inj,
+ TP_PROTO(__u64 e),
+ TP_ARGS(e),
+
+ TP_STRUCT__entry(
+ __field( __u64, e )
+ ),
+
+ TP_fast_assign(
+ __entry->e = e;
+ ),
+
+ TP_printk("dst %x vec %u (%s|%s|%s%s)",
+ (u8)(__entry->e >> 56), (u8)__entry->e,
+ __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
+ (__entry->e & (1<<11)) ? "logical" : "physical",
+ (__entry->e & (1<<15)) ? "level" : "edge",
+ (__entry->e & (1<<16)) ? "|masked" : "")
+);
+#endif
+
+TRACE_EVENT(kvm_msi_set_irq,
+ TP_PROTO(__u64 address, __u64 data),
+ TP_ARGS(address, data),
+
+ TP_STRUCT__entry(
+ __field( __u64, address )
+ __field( __u64, data )
+ ),
+
+ TP_fast_assign(
+ __entry->address = address;
+ __entry->data = data;
+ ),
+
+ TP_printk("dst %llx vec %u (%s|%s|%s%s)",
+ (u8)(__entry->address >> 12) | ((__entry->address >> 32) & 0xffffff00),
+ (u8)__entry->data,
+ __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode),
+ (__entry->address & (1<<2)) ? "logical" : "physical",
+ (__entry->data & (1<<15)) ? "level" : "edge",
+ (__entry->address & (1<<3)) ? "|rh" : "")
+);
+
#define AREG(x) { APIC_##x, "APIC_" #x }
#define kvm_trace_symbol_apic \
@@ -1096,37 +1176,32 @@ TRACE_EVENT(kvm_smm_transition,
* Tracepoint for VT-d posted-interrupts and AMD-Vi Guest Virtual APIC.
*/
TRACE_EVENT(kvm_pi_irte_update,
- TP_PROTO(unsigned int host_irq, unsigned int vcpu_id,
- unsigned int gsi, unsigned int gvec,
- u64 pi_desc_addr, bool set),
- TP_ARGS(host_irq, vcpu_id, gsi, gvec, pi_desc_addr, set),
+ TP_PROTO(unsigned int host_irq, struct kvm_vcpu *vcpu,
+ unsigned int gsi, unsigned int gvec, bool set),
+ TP_ARGS(host_irq, vcpu, gsi, gvec, set),
TP_STRUCT__entry(
__field( unsigned int, host_irq )
- __field( unsigned int, vcpu_id )
+ __field( int, vcpu_id )
__field( unsigned int, gsi )
__field( unsigned int, gvec )
- __field( u64, pi_desc_addr )
__field( bool, set )
),
TP_fast_assign(
__entry->host_irq = host_irq;
- __entry->vcpu_id = vcpu_id;
+ __entry->vcpu_id = vcpu ? vcpu->vcpu_id : -1;
__entry->gsi = gsi;
__entry->gvec = gvec;
- __entry->pi_desc_addr = pi_desc_addr;
__entry->set = set;
),
- TP_printk("PI is %s for irq %u, vcpu %u, gsi: 0x%x, "
- "gvec: 0x%x, pi_desc_addr: 0x%llx",
+ TP_printk("PI is %s for irq %u, vcpu %d, gsi: 0x%x, gvec: 0x%x",
__entry->set ? "enabled and being updated" : "disabled",
__entry->host_irq,
__entry->vcpu_id,
__entry->gsi,
- __entry->gvec,
- __entry->pi_desc_addr)
+ __entry->gvec)
);
/*
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index cb6588238f46..5316c27f6099 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -15,7 +15,6 @@ extern bool __read_mostly enable_ept;
extern bool __read_mostly enable_unrestricted_guest;
extern bool __read_mostly enable_ept_ad_bits;
extern bool __read_mostly enable_pml;
-extern bool __read_mostly enable_ipiv;
extern int __read_mostly pt_mode;
#define PT_MODE_SYSTEM 0
diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h
index a0c5e8781c33..bc5ece76533a 100644
--- a/arch/x86/kvm/vmx/common.h
+++ b/arch/x86/kvm/vmx/common.h
@@ -53,8 +53,6 @@ struct vcpu_vt {
#ifdef CONFIG_X86_64
u64 msr_host_kernel_gs_base;
#endif
-
- unsigned long host_debugctlmsr;
};
#ifdef CONFIG_KVM_INTEL_TDX
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index d1e02e567b57..dbab1c15b0cd 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -29,40 +29,8 @@ static __init int vt_hardware_setup(void)
if (ret)
return ret;
- /*
- * Update vt_x86_ops::vm_size here so it is ready before
- * kvm_ops_update() is called in kvm_x86_vendor_init().
- *
- * Note, the actual bringing up of TDX must be done after
- * kvm_ops_update() because enabling TDX requires enabling
- * hardware virtualization first, i.e., all online CPUs must
- * be in post-VMXON state. This means the @vm_size here
- * may be updated to TDX's size but TDX may fail to enable
- * at later time.
- *
- * The VMX/VT code could update kvm_x86_ops::vm_size again
- * after bringing up TDX, but this would require exporting
- * either kvm_x86_ops or kvm_ops_update() from the base KVM
- * module, which looks overkill. Anyway, the worst case here
- * is KVM may allocate couple of more bytes than needed for
- * each VM.
- */
- if (enable_tdx) {
- vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size,
- sizeof(struct kvm_tdx));
- /*
- * Note, TDX may fail to initialize in a later time in
- * vt_init(), in which case it is not necessary to setup
- * those callbacks. But making them valid here even
- * when TDX fails to init later is fine because those
- * callbacks won't be called if the VM isn't TDX guest.
- */
- vt_x86_ops.link_external_spt = tdx_sept_link_private_spt;
- vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
- vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
- vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
- vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt;
- }
+ if (enable_tdx)
+ tdx_hardware_setup();
return 0;
}
@@ -175,12 +143,12 @@ static int vt_vcpu_pre_run(struct kvm_vcpu *vcpu)
return vmx_vcpu_pre_run(vcpu);
}
-static fastpath_t vt_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+static fastpath_t vt_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
{
if (is_td_vcpu(vcpu))
- return tdx_vcpu_run(vcpu, force_immediate_exit);
+ return tdx_vcpu_run(vcpu, run_flags);
- return vmx_vcpu_run(vcpu, force_immediate_exit);
+ return vmx_vcpu_run(vcpu, run_flags);
}
static int vt_handle_exit(struct kvm_vcpu *vcpu,
@@ -220,7 +188,7 @@ static int vt_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return vmx_get_msr(vcpu, msr_info);
}
-static void vt_msr_filter_changed(struct kvm_vcpu *vcpu)
+static void vt_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
{
/*
* TDX doesn't allow VMM to configure interception of MSR accesses.
@@ -231,7 +199,7 @@ static void vt_msr_filter_changed(struct kvm_vcpu *vcpu)
if (is_td_vcpu(vcpu))
return;
- vmx_msr_filter_changed(vcpu);
+ vmx_recalc_msr_intercepts(vcpu);
}
static int vt_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
@@ -489,14 +457,6 @@ static void vt_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
vmx_set_gdt(vcpu, dt);
}
-static void vt_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
-{
- if (is_td_vcpu(vcpu))
- return;
-
- vmx_set_dr6(vcpu, val);
-}
-
static void vt_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
{
if (is_td_vcpu(vcpu))
@@ -923,6 +883,8 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.vcpu_load = vt_op(vcpu_load),
.vcpu_put = vt_op(vcpu_put),
+ .HOST_OWNED_DEBUGCTL = VMX_HOST_OWNED_DEBUGCTL_BITS,
+
.update_exception_bitmap = vt_op(update_exception_bitmap),
.get_feature_msr = vmx_get_feature_msr,
.get_msr = vt_op(get_msr),
@@ -943,7 +905,6 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.set_idt = vt_op(set_idt),
.get_gdt = vt_op(get_gdt),
.set_gdt = vt_op(set_gdt),
- .set_dr6 = vt_op(set_dr6),
.set_dr7 = vt_op(set_dr7),
.sync_dirty_debug_regs = vt_op(sync_dirty_debug_regs),
.cache_reg = vt_op(cache_reg),
@@ -1014,7 +975,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.nested_ops = &vmx_nested_ops,
.pi_update_irte = vmx_pi_update_irte,
- .pi_start_assignment = vmx_pi_start_assignment,
+ .pi_start_bypass = vmx_pi_start_bypass,
#ifdef CONFIG_X86_64
.set_hv_timer = vt_op(set_hv_timer),
@@ -1034,7 +995,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.apic_init_signal_blocked = vt_op(apic_init_signal_blocked),
.migrate_timers = vmx_migrate_timers,
- .msr_filter_changed = vt_op(msr_filter_changed),
+ .recalc_msr_intercepts = vt_op(recalc_msr_intercepts),
.complete_emulated_msr = vt_op(complete_emulated_msr),
.vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 7211c71d4241..b8ea1969113d 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -715,6 +715,12 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
MSR_IA32_FLUSH_CMD, MSR_TYPE_W);
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_APERF, MSR_TYPE_R);
+
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_MPERF, MSR_TYPE_R);
+
kvm_vcpu_unmap(vcpu, &map);
vmx->nested.force_msr_bitmap_recalc = false;
@@ -2663,10 +2669,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
if (vmx->nested.nested_run_pending &&
(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
- vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
+ vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl &
+ vmx_get_supported_debugctl(vcpu, false));
} else {
kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
- vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl);
+ vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl);
}
if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
@@ -3156,7 +3163,8 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
return -EINVAL;
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
- CC(!kvm_dr7_valid(vmcs12->guest_dr7)))
+ (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) ||
+ CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false))))
return -EINVAL;
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
@@ -3530,7 +3538,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
if (!vmx->nested.nested_run_pending ||
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
- vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read();
if (kvm_mpx_supported() &&
(!vmx->nested.nested_run_pending ||
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
@@ -4608,6 +4616,12 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
+ /*
+ * Note! Save DR7, but intentionally don't grab DEBUGCTL from vmcs02.
+ * Writes to DEBUGCTL that aren't intercepted by L1 are immediately
+ * propagated to vmcs12 (see vmx_set_msr()), as the value loaded into
+ * vmcs02 doesn't strictly track vmcs12.
+ */
if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
vmcs12->guest_dr7 = vcpu->arch.dr7;
@@ -4798,7 +4812,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
__vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR);
kvm_set_dr(vcpu, 7, 0x400);
- vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+ vmx_guest_debugctl_write(vcpu, 0);
if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
vmcs12->vm_exit_msr_load_count))
@@ -4853,6 +4867,9 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
}
+ /* Reload DEBUGCTL to ensure vmcs01 has a fresh FREEZE_IN_SMM value. */
+ vmx_reload_guest_debugctl(vcpu);
+
/*
* Note that calling vmx_set_{efer,cr0,cr4} is important as they
* handle a variety of side effects to KVM's software model.
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index bbf4509f32d0..0b173602821b 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -653,11 +653,11 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
*/
static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu)
{
- u64 data = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ u64 data = vmx_guest_debugctl_read();
if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) {
data &= ~DEBUGCTLMSR_LBR;
- vmcs_write64(GUEST_IA32_DEBUGCTL, data);
+ vmx_guest_debugctl_write(vcpu, data);
}
}
@@ -730,7 +730,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
if (!lbr_desc->event) {
vmx_disable_lbr_msrs_passthrough(vcpu);
- if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR)
+ if (vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR)
goto warn;
if (test_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use))
goto warn;
@@ -752,7 +752,7 @@ warn:
static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
{
- if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR))
+ if (!(vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR))
intel_pmu_release_guest_lbr_event(vcpu);
}
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 5c615e5845bf..4a6d9a17da23 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -2,6 +2,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
+#include <linux/kvm_irqfd.h>
#include <asm/irq_remapping.h>
#include <asm/cpu.h>
@@ -72,13 +73,10 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
/*
* If the vCPU wasn't on the wakeup list and wasn't migrated, then the
* full update can be skipped as neither the vector nor the destination
- * needs to be changed.
+ * needs to be changed. Clear SN even if there is no assigned device,
+ * again for simplicity.
*/
if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) {
- /*
- * Clear SN if it was set due to being preempted. Again, do
- * this even if there is no assigned device for simplicity.
- */
if (pi_test_and_clear_sn(pi_desc))
goto after_clear_sn;
return;
@@ -148,8 +146,13 @@ after_clear_sn:
static bool vmx_can_use_vtd_pi(struct kvm *kvm)
{
+ /*
+ * Note, reading the number of possible bypass IRQs can race with a
+ * bypass IRQ being attached to the VM. vmx_pi_start_bypass() ensures
+ * blockng vCPUs will see an elevated count or get KVM_REQ_UNBLOCK.
+ */
return irqchip_in_kernel(kvm) && kvm_arch_has_irq_bypass() &&
- kvm_arch_has_assigned_device(kvm);
+ READ_ONCE(kvm->arch.nr_possible_bypass_irqs);
}
/*
@@ -224,17 +227,23 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
if (!vmx_needs_pi_wakeup(vcpu))
return;
- if (kvm_vcpu_is_blocking(vcpu) &&
+ /*
+ * If the vCPU is blocking with IRQs enabled and ISN'T being preempted,
+ * enable the wakeup handler so that notification IRQ wakes the vCPU as
+ * expected. There is no need to enable the wakeup handler if the vCPU
+ * is preempted between setting its wait state and manually scheduling
+ * out, as the task is still runnable, i.e. doesn't need a wake event
+ * from KVM to be scheduled in.
+ *
+ * If the wakeup handler isn't being enabled, Suppress Notifications as
+ * the cost of propagating PIR.IRR to PID.ON is negligible compared to
+ * the cost of a spurious IRQ, and vCPU put/load is a slow path.
+ */
+ if (!vcpu->preempted && kvm_vcpu_is_blocking(vcpu) &&
((is_td_vcpu(vcpu) && tdx_interrupt_allowed(vcpu)) ||
(!is_td_vcpu(vcpu) && !vmx_interrupt_blocked(vcpu))))
pi_enable_wakeup_handler(vcpu);
-
- /*
- * Set SN when the vCPU is preempted. Note, the vCPU can both be seen
- * as blocking and preempted, e.g. if it's preempted between setting
- * its wait state and manually scheduling out.
- */
- if (vcpu->preempted)
+ else
pi_set_sn(pi_desc);
}
@@ -281,99 +290,30 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
/*
- * Bail out of the block loop if the VM has an assigned
- * device, but the blocking vCPU didn't reconfigure the
- * PI.NV to the wakeup vector, i.e. the assigned device
- * came along after the initial check in vmx_vcpu_pi_put().
+ * Kick all vCPUs when the first possible bypass IRQ is attached to a VM, as
+ * blocking vCPUs may scheduled out without reconfiguring PID.NV to the wakeup
+ * vector, i.e. if the bypass IRQ came along after vmx_vcpu_pi_put().
*/
-void vmx_pi_start_assignment(struct kvm *kvm)
+void vmx_pi_start_bypass(struct kvm *kvm)
{
- if (!kvm_arch_has_irq_bypass())
+ if (WARN_ON_ONCE(!vmx_can_use_vtd_pi(kvm)))
return;
kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK);
}
-/*
- * vmx_pi_update_irte - set IRTE for Posted-Interrupts
- *
- * @kvm: kvm
- * @host_irq: host irq of the interrupt
- * @guest_irq: gsi of the interrupt
- * @set: set or unset PI
- * returns 0 on success, < 0 on failure
- */
-int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set)
+int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+ unsigned int host_irq, uint32_t guest_irq,
+ struct kvm_vcpu *vcpu, u32 vector)
{
- struct kvm_kernel_irq_routing_entry *e;
- struct kvm_irq_routing_table *irq_rt;
- bool enable_remapped_mode = true;
- struct kvm_lapic_irq irq;
- struct kvm_vcpu *vcpu;
- struct vcpu_data vcpu_info;
- int idx, ret = 0;
-
- if (!vmx_can_use_vtd_pi(kvm))
- return 0;
-
- idx = srcu_read_lock(&kvm->irq_srcu);
- irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
- if (guest_irq >= irq_rt->nr_rt_entries ||
- hlist_empty(&irq_rt->map[guest_irq])) {
- pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
- guest_irq, irq_rt->nr_rt_entries);
- goto out;
+ if (vcpu) {
+ struct intel_iommu_pi_data pi_data = {
+ .pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)),
+ .vector = vector,
+ };
+
+ return irq_set_vcpu_affinity(host_irq, &pi_data);
+ } else {
+ return irq_set_vcpu_affinity(host_irq, NULL);
}
-
- hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
- if (e->type != KVM_IRQ_ROUTING_MSI)
- continue;
- /*
- * VT-d PI cannot support posting multicast/broadcast
- * interrupts to a vCPU, we still use interrupt remapping
- * for these kind of interrupts.
- *
- * For lowest-priority interrupts, we only support
- * those with single CPU as the destination, e.g. user
- * configures the interrupts via /proc/irq or uses
- * irqbalance to make the interrupts single-CPU.
- *
- * We will support full lowest-priority interrupt later.
- *
- * In addition, we can only inject generic interrupts using
- * the PI mechanism, refuse to route others through it.
- */
-
- kvm_set_msi_irq(kvm, e, &irq);
- if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
- !kvm_irq_is_postable(&irq))
- continue;
-
- vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
- vcpu_info.vector = irq.vector;
-
- trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
- vcpu_info.vector, vcpu_info.pi_desc_addr, set);
-
- if (!set)
- continue;
-
- enable_remapped_mode = false;
-
- ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
- if (ret < 0) {
- printk(KERN_INFO "%s: failed to update PI IRTE\n",
- __func__);
- goto out;
- }
- }
-
- if (enable_remapped_mode)
- ret = irq_set_vcpu_affinity(host_irq, NULL);
-
- ret = 0;
-out:
- srcu_read_unlock(&kvm->irq_srcu, idx);
- return ret;
}
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
index 80499ea0e674..a4af39948cf0 100644
--- a/arch/x86/kvm/vmx/posted_intr.h
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -3,6 +3,9 @@
#define __KVM_X86_VMX_POSTED_INTR_H
#include <linux/bitmap.h>
+#include <linux/find.h>
+#include <linux/kvm_host.h>
+
#include <asm/posted_intr.h>
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
@@ -11,9 +14,10 @@ void pi_wakeup_handler(void);
void __init pi_init_cpu(int cpu);
void pi_apicv_pre_state_restore(struct kvm_vcpu *vcpu);
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
-int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set);
-void vmx_pi_start_assignment(struct kvm *kvm);
+int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+ unsigned int host_irq, uint32_t guest_irq,
+ struct kvm_vcpu *vcpu, u32 vector);
+void vmx_pi_start_bypass(struct kvm *kvm);
static inline int pi_find_highest_vector(struct pi_desc *pi_desc)
{
diff --git a/arch/x86/kvm/vmx/run_flags.h b/arch/x86/kvm/vmx/run_flags.h
index 6a9bfdfbb6e5..2f20fb170def 100644
--- a/arch/x86/kvm/vmx/run_flags.h
+++ b/arch/x86/kvm/vmx/run_flags.h
@@ -2,10 +2,12 @@
#ifndef __KVM_X86_VMX_RUN_FLAGS_H
#define __KVM_X86_VMX_RUN_FLAGS_H
-#define VMX_RUN_VMRESUME_SHIFT 0
-#define VMX_RUN_SAVE_SPEC_CTRL_SHIFT 1
+#define VMX_RUN_VMRESUME_SHIFT 0
+#define VMX_RUN_SAVE_SPEC_CTRL_SHIFT 1
+#define VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO_SHIFT 2
-#define VMX_RUN_VMRESUME BIT(VMX_RUN_VMRESUME_SHIFT)
-#define VMX_RUN_SAVE_SPEC_CTRL BIT(VMX_RUN_SAVE_SPEC_CTRL_SHIFT)
+#define VMX_RUN_VMRESUME BIT(VMX_RUN_VMRESUME_SHIFT)
+#define VMX_RUN_SAVE_SPEC_CTRL BIT(VMX_RUN_SAVE_SPEC_CTRL_SHIFT)
+#define VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO BIT(VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO_SHIFT)
#endif /* __KVM_X86_VMX_RUN_FLAGS_H */
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index ec79aacc446f..66744f5768c8 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -743,7 +743,7 @@ bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu)
!to_tdx(vcpu)->vp_enter_args.r12;
}
-bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
+static bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
u64 vcpu_state_details;
@@ -783,8 +783,6 @@ void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
else
vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
- vt->host_debugctlmsr = get_debugctlmsr();
-
vt->guest_state_loaded = true;
}
@@ -1025,20 +1023,20 @@ static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu)
DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI | \
DEBUGCTLMSR_FREEZE_IN_SMM)
-fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
{
struct vcpu_tdx *tdx = to_tdx(vcpu);
struct vcpu_vt *vt = to_vt(vcpu);
/*
- * force_immediate_exit requires vCPU entering for events injection with
- * an immediately exit followed. But The TDX module doesn't guarantee
- * entry, it's already possible for KVM to _think_ it completely entry
- * to the guest without actually having done so.
- * Since KVM never needs to force an immediate exit for TDX, and can't
- * do direct injection, just warn on force_immediate_exit.
+ * WARN if KVM wants to force an immediate exit, as the TDX module does
+ * not guarantee entry into the guest, i.e. it's possible for KVM to
+ * _think_ it completed entry to the guest and forced an immediate exit
+ * without actually having done so. Luckily, KVM never needs to force
+ * an immediate exit for TDX (KVM can't do direct event injection, so
+ * just WARN and continue on.
*/
- WARN_ON_ONCE(force_immediate_exit);
+ WARN_ON_ONCE(run_flags);
/*
* Wait until retry of SEPT-zap-related SEAMCALL completes before
@@ -1048,7 +1046,7 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap)))
return EXIT_FASTPATH_EXIT_HANDLED;
- trace_kvm_entry(vcpu, force_immediate_exit);
+ trace_kvm_entry(vcpu, run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT);
if (pi_test_on(&vt->pi_desc)) {
apic->send_IPI_self(POSTED_INTR_VECTOR);
@@ -1060,8 +1058,8 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
tdx_vcpu_enter_exit(vcpu);
- if (vt->host_debugctlmsr & ~TDX_DEBUGCTL_PRESERVED)
- update_debugctlmsr(vt->host_debugctlmsr);
+ if (vcpu->arch.host_debugctl & ~TDX_DEBUGCTL_PRESERVED)
+ update_debugctlmsr(vcpu->arch.host_debugctl);
tdx_load_host_xsave_state(vcpu);
tdx->guest_entered = true;
@@ -1640,8 +1638,8 @@ static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn,
return 0;
}
-int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, kvm_pfn_t pfn)
+static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, kvm_pfn_t pfn)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
struct page *page = pfn_to_page(pfn);
@@ -1721,8 +1719,8 @@ static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
return 0;
}
-int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, void *private_spt)
+static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, void *private_spt)
{
int tdx_level = pg_level_to_tdx_sept_level(level);
gpa_t gpa = gfn_to_gpa(gfn);
@@ -1857,8 +1855,8 @@ static void tdx_track(struct kvm *kvm)
kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
}
-int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, void *private_spt)
+static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, void *private_spt)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
@@ -1880,8 +1878,8 @@ int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
return tdx_reclaim_page(virt_to_page(private_spt));
}
-int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, kvm_pfn_t pfn)
+static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, kvm_pfn_t pfn)
{
struct page *page = pfn_to_page(pfn);
int ret;
@@ -3605,10 +3603,14 @@ int __init tdx_bringup(void)
r = __tdx_bringup();
if (r) {
/*
- * Disable TDX only but don't fail to load module if
- * the TDX module could not be loaded. No need to print
- * message saying "module is not loaded" because it was
- * printed when the first SEAMCALL failed.
+ * Disable TDX only but don't fail to load module if the TDX
+ * module could not be loaded. No need to print message saying
+ * "module is not loaded" because it was printed when the first
+ * SEAMCALL failed. Don't bother unwinding the S-EPT hooks or
+ * vm_size, as kvm_x86_ops have already been finalized (and are
+ * intentionally not exported). The S-EPT code is unreachable,
+ * and allocating a few more bytes per VM in a should-be-rare
+ * failure scenario is a non-issue.
*/
if (r == -ENODEV)
goto success_disable_tdx;
@@ -3622,3 +3624,20 @@ success_disable_tdx:
enable_tdx = 0;
return 0;
}
+
+void __init tdx_hardware_setup(void)
+{
+ KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_tdx);
+
+ /*
+ * Note, if the TDX module can't be loaded, KVM TDX support will be
+ * disabled but KVM will continue loading (see tdx_bringup()).
+ */
+ vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, sizeof(struct kvm_tdx));
+
+ vt_x86_ops.link_external_spt = tdx_sept_link_private_spt;
+ vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
+ vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
+ vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
+ vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt;
+}
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index 51f98443e8a2..ca39a9391db1 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -8,6 +8,7 @@
#ifdef CONFIG_KVM_INTEL_TDX
#include "common.h"
+void tdx_hardware_setup(void);
int tdx_bringup(void);
void tdx_cleanup(void);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 191a9ed0da22..aa157fe5b7b3 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -75,6 +75,8 @@
#include "vmx_onhyperv.h"
#include "posted_intr.h"
+#include "mmu/spte.h"
+
MODULE_AUTHOR("Qumranet");
MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions");
MODULE_LICENSE("GPL");
@@ -113,8 +115,6 @@ static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, 0444);
module_param(enable_apicv, bool, 0444);
-
-bool __read_mostly enable_ipiv = true;
module_param(enable_ipiv, bool, 0444);
module_param(enable_device_posted_irqs, bool, 0444);
@@ -168,31 +168,6 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
RTIT_STATUS_BYTECNT))
/*
- * List of MSRs that can be directly passed to the guest.
- * In addition to these x2apic, PT and LBR MSRs are handled specially.
- */
-static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
- MSR_IA32_SPEC_CTRL,
- MSR_IA32_PRED_CMD,
- MSR_IA32_FLUSH_CMD,
- MSR_IA32_TSC,
-#ifdef CONFIG_X86_64
- MSR_FS_BASE,
- MSR_GS_BASE,
- MSR_KERNEL_GS_BASE,
- MSR_IA32_XFD,
- MSR_IA32_XFD_ERR,
-#endif
- MSR_IA32_SYSENTER_CS,
- MSR_IA32_SYSENTER_ESP,
- MSR_IA32_SYSENTER_EIP,
- MSR_CORE_C1_RES,
- MSR_CORE_C3_RESIDENCY,
- MSR_CORE_C6_RESIDENCY,
- MSR_CORE_C7_RESIDENCY,
-};
-
-/*
* These 2 parameters are used to config the controls for Pause-Loop Exiting:
* ple_gap: upper bound on the amount of time between two successive
* executions of PAUSE in a loop. Also indicate if ple enabled.
@@ -674,40 +649,6 @@ static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
return flexpriority_enabled && lapic_in_kernel(vcpu);
}
-static int vmx_get_passthrough_msr_slot(u32 msr)
-{
- int i;
-
- switch (msr) {
- case 0x800 ... 0x8ff:
- /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */
- return -ENOENT;
- case MSR_IA32_RTIT_STATUS:
- case MSR_IA32_RTIT_OUTPUT_BASE:
- case MSR_IA32_RTIT_OUTPUT_MASK:
- case MSR_IA32_RTIT_CR3_MATCH:
- case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
- /* PT MSRs. These are handled in pt_update_intercept_for_msr() */
- case MSR_LBR_SELECT:
- case MSR_LBR_TOS:
- case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31:
- case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31:
- case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31:
- case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8:
- case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8:
- /* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */
- return -ENOENT;
- }
-
- for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
- if (vmx_possible_passthrough_msrs[i] == msr)
- return i;
- }
-
- WARN(1, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr);
- return -ENOENT;
-}
-
struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
{
int i;
@@ -963,6 +904,10 @@ unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))
flags |= VMX_RUN_SAVE_SPEC_CTRL;
+ if (static_branch_unlikely(&cpu_buf_vm_clear) &&
+ kvm_vcpu_can_access_host_mmio(&vmx->vcpu))
+ flags |= VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO;
+
return flags;
}
@@ -2149,7 +2094,7 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
break;
case MSR_IA32_DEBUGCTLMSR:
- msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ msr_info->data = vmx_guest_debugctl_read();
break;
default:
find_uret_msr:
@@ -2174,7 +2119,7 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
return (unsigned long)data;
}
-static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
+u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
{
u64 debugctl = 0;
@@ -2186,9 +2131,25 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
(host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
+ if (boot_cpu_has(X86_FEATURE_RTM) &&
+ (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_RTM)))
+ debugctl |= DEBUGCTLMSR_RTM_DEBUG;
+
return debugctl;
}
+bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated)
+{
+ u64 invalid;
+
+ invalid = data & ~vmx_get_supported_debugctl(vcpu, host_initiated);
+ if (invalid & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) {
+ kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
+ invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR);
+ }
+ return !invalid;
+}
+
/*
* Writes msr value into the appropriate "register".
* Returns 0 on success, non-0 otherwise.
@@ -2257,29 +2218,22 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
vmcs_writel(GUEST_SYSENTER_ESP, data);
break;
- case MSR_IA32_DEBUGCTLMSR: {
- u64 invalid;
-
- invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
- if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
- kvm_pr_unimpl_wrmsr(vcpu, msr_index, data);
- data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
- invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
- }
-
- if (invalid)
+ case MSR_IA32_DEBUGCTLMSR:
+ if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated))
return 1;
+ data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
+
if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
VM_EXIT_SAVE_DEBUG_CONTROLS)
get_vmcs12(vcpu)->guest_ia32_debugctl = data;
- vmcs_write64(GUEST_IA32_DEBUGCTL, data);
+ vmx_guest_debugctl_write(vcpu, data);
+
if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
(data & DEBUGCTLMSR_LBR))
intel_pmu_create_guest_lbr_event(vcpu);
return 0;
- }
case MSR_IA32_BNDCFGS:
if (!kvm_mpx_supported() ||
(!msr_info->host_initiated &&
@@ -4013,76 +3967,29 @@ static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
vmx->nested.force_msr_bitmap_recalc = true;
}
-void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
+void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
- int idx;
if (!cpu_has_vmx_msr_bitmap())
return;
vmx_msr_bitmap_l01_changed(vmx);
- /*
- * Mark the desired intercept state in shadow bitmap, this is needed
- * for resync when the MSR filters change.
- */
- idx = vmx_get_passthrough_msr_slot(msr);
- if (idx >= 0) {
- if (type & MSR_TYPE_R)
- clear_bit(idx, vmx->shadow_msr_intercept.read);
- if (type & MSR_TYPE_W)
- clear_bit(idx, vmx->shadow_msr_intercept.write);
- }
-
- if ((type & MSR_TYPE_R) &&
- !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) {
- vmx_set_msr_bitmap_read(msr_bitmap, msr);
- type &= ~MSR_TYPE_R;
- }
-
- if ((type & MSR_TYPE_W) &&
- !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) {
- vmx_set_msr_bitmap_write(msr_bitmap, msr);
- type &= ~MSR_TYPE_W;
+ if (type & MSR_TYPE_R) {
+ if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
+ vmx_clear_msr_bitmap_read(msr_bitmap, msr);
+ else
+ vmx_set_msr_bitmap_read(msr_bitmap, msr);
}
- if (type & MSR_TYPE_R)
- vmx_clear_msr_bitmap_read(msr_bitmap, msr);
-
- if (type & MSR_TYPE_W)
- vmx_clear_msr_bitmap_write(msr_bitmap, msr);
-}
-
-void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
- int idx;
-
- if (!cpu_has_vmx_msr_bitmap())
- return;
-
- vmx_msr_bitmap_l01_changed(vmx);
-
- /*
- * Mark the desired intercept state in shadow bitmap, this is needed
- * for resync when the MSR filter changes.
- */
- idx = vmx_get_passthrough_msr_slot(msr);
- if (idx >= 0) {
- if (type & MSR_TYPE_R)
- set_bit(idx, vmx->shadow_msr_intercept.read);
- if (type & MSR_TYPE_W)
- set_bit(idx, vmx->shadow_msr_intercept.write);
+ if (type & MSR_TYPE_W) {
+ if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
+ vmx_clear_msr_bitmap_write(msr_bitmap, msr);
+ else
+ vmx_set_msr_bitmap_write(msr_bitmap, msr);
}
-
- if (type & MSR_TYPE_R)
- vmx_set_msr_bitmap_read(msr_bitmap, msr);
-
- if (type & MSR_TYPE_W)
- vmx_set_msr_bitmap_write(msr_bitmap, msr);
}
static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
@@ -4161,35 +4068,57 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
}
}
-void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
+void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 i;
-
if (!cpu_has_vmx_msr_bitmap())
return;
- /*
- * Redo intercept permissions for MSRs that KVM is passing through to
- * the guest. Disabling interception will check the new MSR filter and
- * ensure that KVM enables interception if usersepace wants to filter
- * the MSR. MSRs that KVM is already intercepting don't need to be
- * refreshed since KVM is going to intercept them regardless of what
- * userspace wants.
- */
- for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
- u32 msr = vmx_possible_passthrough_msrs[i];
-
- if (!test_bit(i, vmx->shadow_msr_intercept.read))
- vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R);
-
- if (!test_bit(i, vmx->shadow_msr_intercept.write))
- vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
+#ifdef CONFIG_X86_64
+ vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+#endif
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
+ if (kvm_cstate_in_guest(vcpu->kvm)) {
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
+ }
+ if (kvm_aperfmperf_in_guest(vcpu->kvm)) {
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R);
}
/* PT MSRs can be passed through iff PT is exposed to the guest. */
if (vmx_pt_mode_is_host_guest())
pt_update_intercept_for_msr(vcpu);
+
+ if (vcpu->arch.xfd_no_write_intercept)
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, MSR_TYPE_RW);
+
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
+ !to_vmx(vcpu)->spec_ctrl);
+
+ if (kvm_cpu_cap_has(X86_FEATURE_XFD))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD));
+
+ if (cpu_feature_enabled(X86_FEATURE_IBPB))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
+ !guest_has_pred_cmd_msr(vcpu));
+
+ if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
+
+ /*
+ * x2APIC and LBR MSR intercepts are modified on-demand and cannot be
+ * filtered by userspace.
+ */
}
static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
@@ -4790,7 +4719,8 @@ static void init_vmcs(struct vcpu_vmx *vmx)
vmcs_write32(GUEST_SYSENTER_CS, 0);
vmcs_writel(GUEST_SYSENTER_ESP, 0);
vmcs_writel(GUEST_SYSENTER_EIP, 0);
- vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+
+ vmx_guest_debugctl_write(&vmx->vcpu, 0);
if (cpu_has_vmx_tpr_shadow()) {
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
@@ -5606,12 +5536,6 @@ void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
set_debugreg(DR6_RESERVED, 6);
}
-void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
-{
- lockdep_assert_irqs_disabled();
- set_debugreg(vcpu->arch.dr6, 6);
-}
-
void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
{
vmcs_writel(GUEST_DR7, val);
@@ -7290,7 +7214,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
if (static_branch_unlikely(&vmx_l1d_should_flush))
vmx_l1d_flush(vcpu);
else if (static_branch_unlikely(&cpu_buf_vm_clear) &&
- kvm_arch_has_assigned_device(vcpu->kvm))
+ (flags & VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO))
x86_clear_cpu_buffers();
vmx_disable_fb_clear(vmx);
@@ -7323,8 +7247,9 @@ out:
guest_state_exit_irqoff();
}
-fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
{
+ bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long cr3, cr4;
@@ -7369,6 +7294,12 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
vcpu->arch.regs_dirty = 0;
+ if (run_flags & KVM_RUN_LOAD_GUEST_DR6)
+ set_debugreg(vcpu->arch.dr6, 6);
+
+ if (run_flags & KVM_RUN_LOAD_DEBUGCTL)
+ vmx_reload_guest_debugctl(vcpu);
+
/*
* Refresh vmcs.HOST_CR3 if necessary. This must be done immediately
* prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
@@ -7543,26 +7474,6 @@ int vmx_vcpu_create(struct kvm_vcpu *vcpu)
evmcs->hv_enlightenments_control.msr_bitmap = 1;
}
- /* The MSR bitmap starts with all ones */
- bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
- bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
-
- vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
-#ifdef CONFIG_X86_64
- vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
-#endif
- vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
- if (kvm_cstate_in_guest(vcpu->kvm)) {
- vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
- vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
- vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
- vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
- }
-
vmx->loaded_vmcs = &vmx->vmcs01;
if (cpu_need_virtualize_apic_accesses(vcpu)) {
@@ -7612,7 +7523,7 @@ free_vpid:
int vmx_vm_init(struct kvm *kvm)
{
if (!ple_gap)
- kvm->arch.pause_in_guest = true;
+ kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE);
if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
switch (l1tf_mitigation) {
@@ -7849,18 +7760,6 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
}
}
- if (kvm_cpu_cap_has(X86_FEATURE_XFD))
- vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
- !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD));
-
- if (boot_cpu_has(X86_FEATURE_IBPB))
- vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
- !guest_has_pred_cmd_msr(vcpu));
-
- if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
- vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
- !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
-
set_cr4_guest_host_mask(vmx);
vmx_write_encls_bitmap(vcpu, NULL);
@@ -7876,6 +7775,9 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vmx->msr_ia32_feature_control_valid_bits &=
~FEAT_CTL_SGX_LC_ENABLED;
+ /* Recalc MSR interception to account for feature changes. */
+ vmx_recalc_msr_intercepts(vcpu);
+
/* Refresh #PF interception to account for MAXPHYADDR changes. */
vmx_update_exception_bitmap(vcpu);
}
@@ -8650,6 +8552,8 @@ int __init vmx_init(void)
{
int r, cpu;
+ KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_vmx);
+
if (!kvm_is_vmx_supported())
return -EOPNOTSUPP;
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index b5758c33c60f..d3389baf3ab3 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -19,8 +19,6 @@
#include "../mmu.h"
#include "common.h"
-#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
-
#ifdef CONFIG_X86_64
#define MAX_NR_USER_RETURN_MSRS 7
#else
@@ -296,13 +294,6 @@ struct vcpu_vmx {
struct pt_desc pt_desc;
struct lbr_desc lbr_desc;
- /* Save desired MSR intercept (read: pass-through) state */
-#define MAX_POSSIBLE_PASSTHROUGH_MSRS 16
- struct {
- DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
- DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
- } shadow_msr_intercept;
-
/* ve_info must be page aligned. */
struct vmx_ve_information *ve_info;
};
@@ -395,24 +386,54 @@ bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs,
int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr);
void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu);
-void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type);
-void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type);
+void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set);
+
+static inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
+ u32 msr, int type)
+{
+ vmx_set_intercept_for_msr(vcpu, msr, type, false);
+}
+
+static inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
+ u32 msr, int type)
+{
+ vmx_set_intercept_for_msr(vcpu, msr, type, true);
+}
u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu);
u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu);
gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags);
-static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr,
- int type, bool value)
+void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
+
+u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated);
+bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated);
+
+#define VMX_HOST_OWNED_DEBUGCTL_BITS (DEBUGCTLMSR_FREEZE_IN_SMM)
+
+static inline void vmx_guest_debugctl_write(struct kvm_vcpu *vcpu, u64 val)
{
- if (value)
- vmx_enable_intercept_for_msr(vcpu, msr, type);
- else
- vmx_disable_intercept_for_msr(vcpu, msr, type);
+ WARN_ON_ONCE(val & VMX_HOST_OWNED_DEBUGCTL_BITS);
+
+ val |= vcpu->arch.host_debugctl & VMX_HOST_OWNED_DEBUGCTL_BITS;
+ vmcs_write64(GUEST_IA32_DEBUGCTL, val);
}
-void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
+static inline u64 vmx_guest_debugctl_read(void)
+{
+ return vmcs_read64(GUEST_IA32_DEBUGCTL) & ~VMX_HOST_OWNED_DEBUGCTL_BITS;
+}
+
+static inline void vmx_reload_guest_debugctl(struct kvm_vcpu *vcpu)
+{
+ u64 val = vmcs_read64(GUEST_IA32_DEBUGCTL);
+
+ if (!((val ^ vcpu->arch.host_debugctl) & VMX_HOST_OWNED_DEBUGCTL_BITS))
+ return;
+
+ vmx_guest_debugctl_write(vcpu, val & ~VMX_HOST_OWNED_DEBUGCTL_BITS);
+}
/*
* Note, early Intel manuals have the write-low and read-high bitmap offsets
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index b4596f651232..2b3424f638db 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -21,7 +21,7 @@ void vmx_vm_destroy(struct kvm *kvm);
int vmx_vcpu_precreate(struct kvm *kvm);
int vmx_vcpu_create(struct kvm_vcpu *vcpu);
int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu);
-fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit);
+fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags);
void vmx_vcpu_free(struct kvm_vcpu *vcpu);
void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
@@ -52,7 +52,7 @@ void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
int trig_mode, int vector);
void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu);
bool vmx_has_emulated_msr(struct kvm *kvm, u32 index);
-void vmx_msr_filter_changed(struct kvm_vcpu *vcpu);
+void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu);
void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
int vmx_get_feature_msr(u32 msr, u64 *data);
@@ -133,10 +133,9 @@ void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
void tdx_vcpu_free(struct kvm_vcpu *vcpu);
void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu);
-fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit);
+fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags);
void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
void tdx_vcpu_put(struct kvm_vcpu *vcpu);
-bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu);
int tdx_handle_exit(struct kvm_vcpu *vcpu,
enum exit_fastpath_completion fastpath);
@@ -151,15 +150,6 @@ int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
-int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, void *private_spt);
-int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, void *private_spt);
-int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, kvm_pfn_t pfn);
-int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, kvm_pfn_t pfn);
-
void tdx_flush_tlb_current(struct kvm_vcpu *vcpu);
void tdx_flush_tlb_all(struct kvm_vcpu *vcpu);
void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 93636f77c42d..a1c49bc681c4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -226,6 +226,9 @@ EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
bool __read_mostly enable_apicv = true;
EXPORT_SYMBOL_GPL(enable_apicv);
+bool __read_mostly enable_ipiv = true;
+EXPORT_SYMBOL_GPL(enable_ipiv);
+
bool __read_mostly enable_device_posted_irqs = true;
EXPORT_SYMBOL_GPL(enable_device_posted_irqs);
@@ -4579,6 +4582,9 @@ static u64 kvm_get_allowed_disable_exits(void)
{
u64 r = KVM_X86_DISABLE_EXITS_PAUSE;
+ if (boot_cpu_has(X86_FEATURE_APERFMPERF))
+ r |= KVM_X86_DISABLE_EXITS_APERFMPERF;
+
if (!mitigate_smt_rsb) {
r |= KVM_X86_DISABLE_EXITS_HLT |
KVM_X86_DISABLE_EXITS_CSTATE;
@@ -4634,17 +4640,20 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_EXT_CPUID:
case KVM_CAP_EXT_EMUL_CPUID:
case KVM_CAP_CLOCKSOURCE:
+#ifdef CONFIG_KVM_IOAPIC
case KVM_CAP_PIT:
+ case KVM_CAP_PIT2:
+ case KVM_CAP_PIT_STATE2:
+ case KVM_CAP_REINJECT_CONTROL:
+#endif
case KVM_CAP_NOP_IO_DELAY:
case KVM_CAP_MP_STATE:
case KVM_CAP_SYNC_MMU:
case KVM_CAP_USER_NMI:
- case KVM_CAP_REINJECT_CONTROL:
case KVM_CAP_IRQ_INJECT_STATUS:
case KVM_CAP_IOEVENTFD:
case KVM_CAP_IOEVENTFD_NO_LENGTH:
- case KVM_CAP_PIT2:
- case KVM_CAP_PIT_STATE2:
+
case KVM_CAP_SET_IDENTITY_MAP_ADDR:
case KVM_CAP_VCPU_EVENTS:
#ifdef CONFIG_KVM_HYPERV
@@ -4985,11 +4994,6 @@ out:
return r;
}
-static void wbinvd_ipi(void *garbage)
-{
- wbinvd();
-}
-
static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
{
return kvm_arch_has_noncoherent_dma(vcpu->kvm);
@@ -5013,8 +5017,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (kvm_x86_call(has_wbinvd_exit)())
cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
- smp_call_function_single(vcpu->cpu,
- wbinvd_ipi, NULL, 1);
+ wbinvd_on_cpu(vcpu->cpu);
}
kvm_x86_call(vcpu_load)(vcpu, cpu);
@@ -5489,12 +5492,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
(events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
return -EINVAL;
- /* INITs are latched while in SMM */
- if (events->flags & KVM_VCPUEVENT_VALID_SMM &&
- (events->smi.smm || events->smi.pending) &&
- vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
- return -EINVAL;
-
process_nmi(vcpu);
/*
@@ -6401,135 +6398,6 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
return 0;
}
-static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
-{
- struct kvm_pic *pic = kvm->arch.vpic;
- int r;
-
- r = 0;
- switch (chip->chip_id) {
- case KVM_IRQCHIP_PIC_MASTER:
- memcpy(&chip->chip.pic, &pic->pics[0],
- sizeof(struct kvm_pic_state));
- break;
- case KVM_IRQCHIP_PIC_SLAVE:
- memcpy(&chip->chip.pic, &pic->pics[1],
- sizeof(struct kvm_pic_state));
- break;
- case KVM_IRQCHIP_IOAPIC:
- kvm_get_ioapic(kvm, &chip->chip.ioapic);
- break;
- default:
- r = -EINVAL;
- break;
- }
- return r;
-}
-
-static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
-{
- struct kvm_pic *pic = kvm->arch.vpic;
- int r;
-
- r = 0;
- switch (chip->chip_id) {
- case KVM_IRQCHIP_PIC_MASTER:
- spin_lock(&pic->lock);
- memcpy(&pic->pics[0], &chip->chip.pic,
- sizeof(struct kvm_pic_state));
- spin_unlock(&pic->lock);
- break;
- case KVM_IRQCHIP_PIC_SLAVE:
- spin_lock(&pic->lock);
- memcpy(&pic->pics[1], &chip->chip.pic,
- sizeof(struct kvm_pic_state));
- spin_unlock(&pic->lock);
- break;
- case KVM_IRQCHIP_IOAPIC:
- kvm_set_ioapic(kvm, &chip->chip.ioapic);
- break;
- default:
- r = -EINVAL;
- break;
- }
- kvm_pic_update_irq(pic);
- return r;
-}
-
-static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
-{
- struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
-
- BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
-
- mutex_lock(&kps->lock);
- memcpy(ps, &kps->channels, sizeof(*ps));
- mutex_unlock(&kps->lock);
- return 0;
-}
-
-static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
-{
- int i;
- struct kvm_pit *pit = kvm->arch.vpit;
-
- mutex_lock(&pit->pit_state.lock);
- memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
- for (i = 0; i < 3; i++)
- kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
- mutex_unlock(&pit->pit_state.lock);
- return 0;
-}
-
-static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
-{
- mutex_lock(&kvm->arch.vpit->pit_state.lock);
- memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
- sizeof(ps->channels));
- ps->flags = kvm->arch.vpit->pit_state.flags;
- mutex_unlock(&kvm->arch.vpit->pit_state.lock);
- memset(&ps->reserved, 0, sizeof(ps->reserved));
- return 0;
-}
-
-static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
-{
- int start = 0;
- int i;
- u32 prev_legacy, cur_legacy;
- struct kvm_pit *pit = kvm->arch.vpit;
-
- mutex_lock(&pit->pit_state.lock);
- prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
- cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
- if (!prev_legacy && cur_legacy)
- start = 1;
- memcpy(&pit->pit_state.channels, &ps->channels,
- sizeof(pit->pit_state.channels));
- pit->pit_state.flags = ps->flags;
- for (i = 0; i < 3; i++)
- kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
- start && i == 0);
- mutex_unlock(&pit->pit_state.lock);
- return 0;
-}
-
-static int kvm_vm_ioctl_reinject(struct kvm *kvm,
- struct kvm_reinject_control *control)
-{
- struct kvm_pit *pit = kvm->arch.vpit;
-
- /* pit->pit_state.lock was overloaded to prevent userspace from getting
- * an inconsistent state after running multiple KVM_REINJECT_CONTROL
- * ioctls in parallel. Use a separate lock if that ioctl isn't rare.
- */
- mutex_lock(&pit->pit_state.lock);
- kvm_pit_set_reinject(pit, control->pit_reinject);
- mutex_unlock(&pit->pit_state.lock);
-
- return 0;
-}
-
void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
@@ -6549,18 +6417,6 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
kvm_vcpu_kick(vcpu);
}
-int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
- bool line_status)
-{
- if (!irqchip_in_kernel(kvm))
- return -ENXIO;
-
- irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
- irq_event->irq, irq_event->level,
- line_status);
- return 0;
-}
-
int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
struct kvm_enable_cap *cap)
{
@@ -6625,17 +6481,11 @@ split_irqchip_unlock:
if (!mitigate_smt_rsb && boot_cpu_has_bug(X86_BUG_SMT_RSB) &&
cpu_smt_possible() &&
- (cap->args[0] & ~KVM_X86_DISABLE_EXITS_PAUSE))
+ (cap->args[0] & ~(KVM_X86_DISABLE_EXITS_PAUSE |
+ KVM_X86_DISABLE_EXITS_APERFMPERF)))
pr_warn_once(SMT_RSB_MSG);
- if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
- kvm->arch.pause_in_guest = true;
- if (cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT)
- kvm->arch.mwait_in_guest = true;
- if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT)
- kvm->arch.hlt_in_guest = true;
- if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE)
- kvm->arch.cstate_in_guest = true;
+ kvm_disable_exits(kvm, cap->args[0]);
r = 0;
disable_exits_unlock:
mutex_unlock(&kvm->lock);
@@ -7072,9 +6922,11 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
struct kvm *kvm = filp->private_data;
void __user *argp = (void __user *)arg;
int r = -ENOTTY;
+
+#ifdef CONFIG_KVM_IOAPIC
/*
* This union makes it completely explicit to gcc-3.x
- * that these two variables' stack usage should be
+ * that these three variables' stack usage should be
* combined, not added together.
*/
union {
@@ -7082,6 +6934,7 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
struct kvm_pit_state2 ps2;
struct kvm_pit_config pit_config;
} u;
+#endif
switch (ioctl) {
case KVM_SET_TSS_ADDR:
@@ -7105,6 +6958,7 @@ set_identity_unlock:
case KVM_SET_NR_MMU_PAGES:
r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
break;
+#ifdef CONFIG_KVM_IOAPIC
case KVM_CREATE_IRQCHIP: {
mutex_lock(&kvm->lock);
@@ -7126,7 +6980,7 @@ set_identity_unlock:
goto create_irqchip_unlock;
}
- r = kvm_setup_default_irq_routing(kvm);
+ r = kvm_setup_default_ioapic_and_pic_routing(kvm);
if (r) {
kvm_ioapic_destroy(kvm);
kvm_pic_destroy(kvm);
@@ -7174,7 +7028,7 @@ set_identity_unlock:
}
r = -ENXIO;
- if (!irqchip_kernel(kvm))
+ if (!irqchip_full(kvm))
goto get_irqchip_out;
r = kvm_vm_ioctl_get_irqchip(kvm, chip);
if (r)
@@ -7198,7 +7052,7 @@ set_identity_unlock:
}
r = -ENXIO;
- if (!irqchip_kernel(kvm))
+ if (!irqchip_full(kvm))
goto set_irqchip_out;
r = kvm_vm_ioctl_set_irqchip(kvm, chip);
set_irqchip_out:
@@ -7271,6 +7125,7 @@ set_pit2_out:
r = kvm_vm_ioctl_reinject(kvm, &control);
break;
}
+#endif
case KVM_SET_BOOT_CPU_ID:
r = 0;
mutex_lock(&kvm->lock);
@@ -7341,9 +7196,12 @@ set_pit2_out:
if (user_tsc_khz == 0)
user_tsc_khz = tsc_khz;
- WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz);
- r = 0;
-
+ mutex_lock(&kvm->lock);
+ if (!kvm->created_vcpus) {
+ WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz);
+ r = 0;
+ }
+ mutex_unlock(&kvm->lock);
goto out;
}
case KVM_GET_TSC_KHZ: {
@@ -8295,8 +8153,7 @@ static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
int cpu = get_cpu();
cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
- on_each_cpu_mask(vcpu->arch.wbinvd_dirty_mask,
- wbinvd_ipi, NULL, 1);
+ wbinvd_on_cpus_mask(vcpu->arch.wbinvd_dirty_mask);
put_cpu();
cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
} else
@@ -10730,8 +10587,10 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
if (irqchip_split(vcpu->kvm))
kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
+#ifdef CONFIG_KVM_IOAPIC
else if (ioapic_in_kernel(vcpu->kvm))
kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
+#endif
if (is_guest_mode(vcpu))
vcpu->arch.load_eoi_exitmap_pending = true;
@@ -10785,6 +10644,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
dm_request_for_irq_injection(vcpu) &&
kvm_cpu_accept_dm_intr(vcpu);
fastpath_t exit_fastpath;
+ u64 run_flags, debug_ctl;
bool req_immediate_exit = false;
@@ -10932,8 +10792,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_vcpu_update_apicv(vcpu);
if (kvm_check_request(KVM_REQ_APF_READY, vcpu))
kvm_check_async_pf_completion(vcpu);
+
+ /*
+ * Recalc MSR intercepts as userspace may want to intercept
+ * accesses to MSRs that KVM would otherwise pass through to
+ * the guest.
+ */
if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu))
- kvm_x86_call(msr_filter_changed)(vcpu);
+ kvm_x86_call(recalc_msr_intercepts)(vcpu);
if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu))
kvm_x86_call(update_cpu_dirty_logging)(vcpu);
@@ -11029,8 +10895,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
goto cancel_injection;
}
- if (req_immediate_exit)
+ run_flags = 0;
+ if (req_immediate_exit) {
+ run_flags |= KVM_RUN_FORCE_IMMEDIATE_EXIT;
kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
fpregs_assert_state_consistent();
if (test_thread_flag(TIF_NEED_FPU_LOAD))
@@ -11048,12 +10917,22 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
set_debugreg(vcpu->arch.eff_db[3], 3);
/* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
- kvm_x86_call(set_dr6)(vcpu, vcpu->arch.dr6);
+ run_flags |= KVM_RUN_LOAD_GUEST_DR6;
} else if (unlikely(hw_breakpoint_active())) {
set_debugreg(DR7_FIXED_1, 7);
}
- vcpu->arch.host_debugctl = get_debugctlmsr();
+ /*
+ * Refresh the host DEBUGCTL snapshot after disabling IRQs, as DEBUGCTL
+ * can be modified in IRQ context, e.g. via SMP function calls. Inform
+ * vendor code if any host-owned bits were changed, e.g. so that the
+ * value loaded into hardware while running the guest can be updated.
+ */
+ debug_ctl = get_debugctlmsr();
+ if ((debug_ctl ^ vcpu->arch.host_debugctl) & kvm_x86_ops.HOST_OWNED_DEBUGCTL &&
+ !vcpu->arch.guest_state_protected)
+ run_flags |= KVM_RUN_LOAD_DEBUGCTL;
+ vcpu->arch.host_debugctl = debug_ctl;
guest_timing_enter_irqoff();
@@ -11067,8 +10946,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
(kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));
- exit_fastpath = kvm_x86_call(vcpu_run)(vcpu,
- req_immediate_exit);
+ exit_fastpath = kvm_x86_call(vcpu_run)(vcpu, run_flags);
if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
break;
@@ -11080,6 +10958,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
break;
}
+ run_flags = 0;
+
/* Note, VM-Exits that go down the "slow" path are accounted below. */
++vcpu->stat.exits;
}
@@ -11553,6 +11433,28 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
trace_kvm_fpu(0);
}
+static int kvm_x86_vcpu_pre_run(struct kvm_vcpu *vcpu)
+{
+ /*
+ * SIPI_RECEIVED is obsolete; KVM leaves the vCPU in Wait-For-SIPI and
+ * tracks the pending SIPI separately. SIPI_RECEIVED is still accepted
+ * by KVM_SET_VCPU_EVENTS for backwards compatibility, but should be
+ * converted to INIT_RECEIVED.
+ */
+ if (WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED))
+ return -EINVAL;
+
+ /*
+ * Disallow running the vCPU if userspace forced it into an impossible
+ * MP_STATE, e.g. if the vCPU is in WFS but SIPI is blocked.
+ */
+ if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED &&
+ !kvm_apic_init_sipi_allowed(vcpu))
+ return -EINVAL;
+
+ return kvm_x86_call(vcpu_pre_run)(vcpu);
+}
+
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
struct kvm_queued_exception *ex = &vcpu->arch.exception;
@@ -11655,7 +11557,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
goto out;
}
- r = kvm_x86_call(vcpu_pre_run)(vcpu);
+ r = kvm_x86_vcpu_pre_run(vcpu);
if (r <= 0)
goto out;
@@ -11899,21 +11801,16 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
}
/*
- * Pending INITs are reported using KVM_SET_VCPU_EVENTS, disallow
- * forcing the guest into INIT/SIPI if those events are supposed to be
- * blocked. KVM prioritizes SMI over INIT, so reject INIT/SIPI state
- * if an SMI is pending as well.
+ * SIPI_RECEIVED is obsolete and no longer used internally; KVM instead
+ * leaves the vCPU in INIT_RECIEVED (Wait-For-SIPI) and pends the SIPI.
+ * Translate SIPI_RECEIVED as appropriate for backwards compatibility.
*/
- if ((!kvm_apic_init_sipi_allowed(vcpu) || vcpu->arch.smi_pending) &&
- (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
- mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
- goto out;
-
if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
- kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED);
+ mp_state->mp_state = KVM_MP_STATE_INIT_RECEIVED;
set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
- } else
- kvm_set_mp_state(vcpu, mp_state->mp_state);
+ }
+
+ kvm_set_mp_state(vcpu, mp_state->mp_state);
kvm_make_request(KVM_REQ_EVENT, vcpu);
ret = 0;
@@ -12795,21 +12692,16 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
if (ret)
goto out;
- kvm_mmu_init_vm(kvm);
+ ret = kvm_mmu_init_vm(kvm);
+ if (ret)
+ goto out_cleanup_page_track;
ret = kvm_x86_call(vm_init)(kvm);
if (ret)
goto out_uninit_mmu;
- INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
atomic_set(&kvm->arch.noncoherent_dma_count, 0);
- /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
- set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
- /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
- set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
- &kvm->arch.irq_sources_bitmap);
-
raw_spin_lock_init(&kvm->arch.tsc_write_lock);
mutex_init(&kvm->arch.apic_map_lock);
seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock);
@@ -12848,6 +12740,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
out_uninit_mmu:
kvm_mmu_uninit_vm(kvm);
+out_cleanup_page_track:
kvm_page_track_cleanup(kvm);
out:
return ret;
@@ -12940,7 +12833,9 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm)
cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
+#ifdef CONFIG_KVM_IOAPIC
kvm_free_pit(kvm);
+#endif
kvm_mmu_pre_destroy_vm(kvm);
static_call_cond(kvm_x86_vm_pre_destroy)(kvm);
@@ -12964,8 +12859,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
}
kvm_destroy_vcpus(kvm);
kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
+#ifdef CONFIG_KVM_IOAPIC
kvm_pic_destroy(kvm);
kvm_ioapic_destroy(kvm);
+#endif
kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
kvm_mmu_uninit_vm(kvm);
@@ -13575,25 +13472,6 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
return kvm_lapic_enabled(vcpu) && apf_pageready_slot_free(vcpu);
}
-void kvm_arch_start_assignment(struct kvm *kvm)
-{
- if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1)
- kvm_x86_call(pi_start_assignment)(kvm);
-}
-EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
-
-void kvm_arch_end_assignment(struct kvm *kvm)
-{
- atomic_dec(&kvm->arch.assigned_device_count);
-}
-EXPORT_SYMBOL_GPL(kvm_arch_end_assignment);
-
-bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm)
-{
- return raw_atomic_read(&kvm->arch.assigned_device_count);
-}
-EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
-
static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm)
{
/*
@@ -13629,77 +13507,6 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
}
EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
-int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
- struct irq_bypass_producer *prod)
-{
- struct kvm_kernel_irqfd *irqfd =
- container_of(cons, struct kvm_kernel_irqfd, consumer);
- struct kvm *kvm = irqfd->kvm;
- int ret;
-
- kvm_arch_start_assignment(irqfd->kvm);
-
- spin_lock_irq(&kvm->irqfds.lock);
- irqfd->producer = prod;
-
- ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
- prod->irq, irqfd->gsi, 1);
- if (ret)
- kvm_arch_end_assignment(irqfd->kvm);
-
- spin_unlock_irq(&kvm->irqfds.lock);
-
-
- return ret;
-}
-
-void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
- struct irq_bypass_producer *prod)
-{
- int ret;
- struct kvm_kernel_irqfd *irqfd =
- container_of(cons, struct kvm_kernel_irqfd, consumer);
- struct kvm *kvm = irqfd->kvm;
-
- WARN_ON(irqfd->producer != prod);
-
- /*
- * When producer of consumer is unregistered, we change back to
- * remapped mode, so we can re-use the current implementation
- * when the irq is masked/disabled or the consumer side (KVM
- * int this case doesn't want to receive the interrupts.
- */
- spin_lock_irq(&kvm->irqfds.lock);
- irqfd->producer = NULL;
-
- ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
- prod->irq, irqfd->gsi, 0);
- if (ret)
- printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
- " fails: %d\n", irqfd->consumer.token, ret);
-
- spin_unlock_irq(&kvm->irqfds.lock);
-
-
- kvm_arch_end_assignment(irqfd->kvm);
-}
-
-int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set)
-{
- return kvm_x86_call(pi_update_irte)(kvm, host_irq, guest_irq, set);
-}
-
-bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
- struct kvm_kernel_irq_routing_entry *new)
-{
- if (old->type != KVM_IRQ_ROUTING_MSI ||
- new->type != KVM_IRQ_ROUTING_MSI)
- return true;
-
- return !!memcmp(&old->msi, &new->msi, sizeof(new->msi));
-}
-
bool kvm_vector_hashing_enabled(void)
{
return vector_hashing;
@@ -14099,7 +13906,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 832f0faf4779..bcfd9b719ada 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -55,6 +55,28 @@ struct kvm_host_values {
void kvm_spurious_fault(void);
+#define SIZE_OF_MEMSLOTS_HASHTABLE \
+ (sizeof(((struct kvm_memslots *)0)->id_hash) * 2 * KVM_MAX_NR_ADDRESS_SPACES)
+
+/* Sanity check the size of the memslot hash tables. */
+static_assert(SIZE_OF_MEMSLOTS_HASHTABLE ==
+ (1024 * (1 + IS_ENABLED(CONFIG_X86_64)) * (1 + IS_ENABLED(CONFIG_KVM_SMM))));
+
+/*
+ * Assert that "struct kvm_{svm,vmx,tdx}" is an order-0 or order-1 allocation.
+ * Spilling over to an order-2 allocation isn't fundamentally problematic, but
+ * isn't expected to happen in the foreseeable future (O(years)). Assert that
+ * the size is an order-0 allocation when ignoring the memslot hash tables, to
+ * help detect and debug unexpected size increases.
+ */
+#define KVM_SANITY_CHECK_VM_STRUCT_SIZE(x) \
+do { \
+ BUILD_BUG_ON(get_order(sizeof(struct x) - SIZE_OF_MEMSLOTS_HASHTABLE) && \
+ !IS_ENABLED(CONFIG_DEBUG_KERNEL) && !IS_ENABLED(CONFIG_KASAN)); \
+ BUILD_BUG_ON(get_order(sizeof(struct x)) > 1 && \
+ !IS_ENABLED(CONFIG_DEBUG_KERNEL) && !IS_ENABLED(CONFIG_KASAN)); \
+} while (0)
+
#define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \
({ \
bool failed = (consistency_check); \
@@ -499,24 +521,34 @@ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
__rem; \
})
+static inline void kvm_disable_exits(struct kvm *kvm, u64 mask)
+{
+ kvm->arch.disabled_exits |= mask;
+}
+
static inline bool kvm_mwait_in_guest(struct kvm *kvm)
{
- return kvm->arch.mwait_in_guest;
+ return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_MWAIT;
}
static inline bool kvm_hlt_in_guest(struct kvm *kvm)
{
- return kvm->arch.hlt_in_guest;
+ return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_HLT;
}
static inline bool kvm_pause_in_guest(struct kvm *kvm)
{
- return kvm->arch.pause_in_guest;
+ return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_PAUSE;
}
static inline bool kvm_cstate_in_guest(struct kvm *kvm)
{
- return kvm->arch.cstate_in_guest;
+ return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_CSTATE;
+}
+
+static inline bool kvm_aperfmperf_in_guest(struct kvm *kvm)
+{
+ return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_APERFMPERF;
}
static inline bool kvm_notify_vmexit_enabled(struct kvm *kvm)
diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c
index 7af743bd3b13..c5c60d07308c 100644
--- a/arch/x86/lib/cache-smp.c
+++ b/arch/x86/lib/cache-smp.c
@@ -14,9 +14,31 @@ void wbinvd_on_cpu(int cpu)
}
EXPORT_SYMBOL(wbinvd_on_cpu);
-int wbinvd_on_all_cpus(void)
+void wbinvd_on_all_cpus(void)
{
on_each_cpu(__wbinvd, NULL, 1);
- return 0;
}
EXPORT_SYMBOL(wbinvd_on_all_cpus);
+
+void wbinvd_on_cpus_mask(struct cpumask *cpus)
+{
+ on_each_cpu_mask(cpus, __wbinvd, NULL, 1);
+}
+EXPORT_SYMBOL_GPL(wbinvd_on_cpus_mask);
+
+static void __wbnoinvd(void *dummy)
+{
+ wbnoinvd();
+}
+
+void wbnoinvd_on_all_cpus(void)
+{
+ on_each_cpu(__wbnoinvd, NULL, 1);
+}
+EXPORT_SYMBOL_GPL(wbnoinvd_on_all_cpus);
+
+void wbnoinvd_on_cpus_mask(struct cpumask *cpus)
+{
+ on_each_cpu_mask(cpus, __wbnoinvd, NULL, 1);
+}
+EXPORT_SYMBOL_GPL(wbnoinvd_on_cpus_mask);
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index bf8dab18be97..2fdc1f1f5adb 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -122,13 +122,12 @@ static bool ex_handler_sgx(const struct exception_table_entry *fixup,
static bool ex_handler_fprestore(const struct exception_table_entry *fixup,
struct pt_regs *regs)
{
- regs->ip = ex_fixup_addr(fixup);
-
WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.",
(void *)instruction_pointer(regs));
fpu_reset_from_exception_fixup();
- return true;
+
+ return ex_handler_default(fixup, regs);
}
/*
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index c0c40b67524e..b10d4d131dce 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -38,6 +38,7 @@
#include <asm/desc.h>
#include <asm/sections.h>
#include <asm/set_memory.h>
+#include <asm/bugs.h>
#undef pr_fmt
#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
@@ -84,7 +85,8 @@ void __init pti_check_boottime_disable(void)
return;
}
- if (cpu_mitigations_off())
+ if (pti_mode == PTI_AUTO &&
+ !cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL))
pti_mode = PTI_FORCE_OFF;
if (pti_mode == PTI_FORCE_OFF) {
pti_print_if_insecure("disabled on command line.");
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 15672cb926fc..7e3fca164620 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -3501,13 +3501,6 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func
return emit_bpf_dispatcher(&prog, 0, num_funcs - 1, funcs, image, buf);
}
-static const char *bpf_get_prog_name(struct bpf_prog *prog)
-{
- if (prog->aux->ksym.prog)
- return prog->aux->ksym.name;
- return prog->aux->name;
-}
-
static void priv_stack_init_guard(void __percpu *priv_stack_ptr, int alloc_size)
{
int cpu, underflow_idx = (alloc_size - PRIV_STACK_GUARD_SZ) >> 3;
@@ -3531,7 +3524,7 @@ static void priv_stack_check_guard(void __percpu *priv_stack_ptr, int alloc_size
if (stack_ptr[0] != PRIV_STACK_GUARD_VAL ||
stack_ptr[underflow_idx] != PRIV_STACK_GUARD_VAL) {
pr_err("BPF private stack overflow/underflow detected for prog %sx\n",
- bpf_get_prog_name(prog));
+ bpf_jit_get_prog_name(prog));
break;
}
}
@@ -3845,7 +3838,6 @@ void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp
}
return;
#endif
- WARN(1, "verification of programs using bpf_throw should have failed\n");
}
void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index e7e8f77f77f8..b4409df2105a 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -216,8 +216,8 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
* When SEV-ES is active, the GHCB as set by the kernel will be used
* by firmware. Create a 1:1 unencrypted mapping for each GHCB.
*/
- if (sev_es_efi_map_ghcbs(pgd)) {
- pr_err("Failed to create 1:1 mapping for the GHCBs!\n");
+ if (sev_es_efi_map_ghcbs_cas(pgd)) {
+ pr_err("Failed to create 1:1 mapping for the GHCBs and CAs!\n");
return 1;
}
diff --git a/arch/x86/tools/insn_decoder_test.c b/arch/x86/tools/insn_decoder_test.c
index 08cd913cbd4e..8bf15c4aefa9 100644
--- a/arch/x86/tools/insn_decoder_test.c
+++ b/arch/x86/tools/insn_decoder_test.c
@@ -167,7 +167,7 @@ int main(int argc, char **argv)
pr_warn("Decoded and checked %d instructions with %d "
"failures\n", insns, warnings);
else
- fprintf(stdout, "%s: success: Decoded and checked %d"
+ fprintf(stdout, " %s: success: Decoded and checked %d"
" instructions\n", prog, insns);
return 0;
}
diff --git a/arch/x86/tools/insn_sanity.c b/arch/x86/tools/insn_sanity.c
index 213f35f94feb..e743f0ea01ee 100644
--- a/arch/x86/tools/insn_sanity.c
+++ b/arch/x86/tools/insn_sanity.c
@@ -253,9 +253,9 @@ int main(int argc, char **argv)
}
fprintf((errors) ? stderr : stdout,
- "%s: %s: decoded and checked %d %s instructions with %d errors (seed:0x%x)\n",
+ " %s: %s: Decoded and checked %d %s instructions with %d errors (seed:0x%x)\n",
prog,
- (errors) ? "Failure" : "Success",
+ (errors) ? "failure" : "success",
insns,
(input_file) ? "given" : "random",
errors,
diff --git a/arch/x86/um/asm/syscall.h b/arch/x86/um/asm/syscall.h
index 56a2f0913e3c..d6208d0fad51 100644
--- a/arch/x86/um/asm/syscall.h
+++ b/arch/x86/um/asm/syscall.h
@@ -9,6 +9,8 @@ typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long,
unsigned long, unsigned long,
unsigned long, unsigned long);
+extern const sys_call_ptr_t sys_call_table[];
+
static inline int syscall_get_arch(struct task_struct *task)
{
#ifdef CONFIG_X86_32
diff --git a/arch/x86/um/shared/sysdep/ptrace.h b/arch/x86/um/shared/sysdep/ptrace.h
index 8f7476ff6e95..572ea2d79131 100644
--- a/arch/x86/um/shared/sysdep/ptrace.h
+++ b/arch/x86/um/shared/sysdep/ptrace.h
@@ -44,18 +44,6 @@
#include "ptrace_64.h"
#endif
-struct syscall_args {
- unsigned long args[6];
-};
-
-#define SYSCALL_ARGS(r) ((struct syscall_args) \
- { .args = { UPT_SYSCALL_ARG1(r), \
- UPT_SYSCALL_ARG2(r), \
- UPT_SYSCALL_ARG3(r), \
- UPT_SYSCALL_ARG4(r), \
- UPT_SYSCALL_ARG5(r), \
- UPT_SYSCALL_ARG6(r) } } )
-
extern unsigned long host_fp_size;
struct uml_pt_regs {
diff --git a/arch/x86/um/shared/sysdep/syscalls.h b/arch/x86/um/shared/sysdep/syscalls.h
deleted file mode 100644
index b2060ac707f0..000000000000
--- a/arch/x86/um/shared/sysdep/syscalls.h
+++ /dev/null
@@ -1,6 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifdef __i386__
-#include "syscalls_32.h"
-#else
-#include "syscalls_64.h"
-#endif
diff --git a/arch/x86/um/shared/sysdep/syscalls_32.h b/arch/x86/um/shared/sysdep/syscalls_32.h
deleted file mode 100644
index f6e9f84397e7..000000000000
--- a/arch/x86/um/shared/sysdep/syscalls_32.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (C) 2000 - 2008 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
- */
-
-#include <asm/unistd.h>
-#include <sysdep/ptrace.h>
-
-typedef long syscall_handler_t(struct syscall_args);
-
-extern syscall_handler_t *sys_call_table[];
-
-#define EXECUTE_SYSCALL(syscall, regs) \
- ((*sys_call_table[syscall]))(SYSCALL_ARGS(&regs->regs))
diff --git a/arch/x86/um/shared/sysdep/syscalls_64.h b/arch/x86/um/shared/sysdep/syscalls_64.h
deleted file mode 100644
index b6b997225841..000000000000
--- a/arch/x86/um/shared/sysdep/syscalls_64.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright 2003 PathScale, Inc.
- *
- * Licensed under the GPL
- */
-
-#ifndef __SYSDEP_X86_64_SYSCALLS_H__
-#define __SYSDEP_X86_64_SYSCALLS_H__
-
-#include <linux/msg.h>
-#include <linux/shm.h>
-
-typedef long syscall_handler_t(long, long, long, long, long, long);
-
-extern syscall_handler_t *sys_call_table[];
-
-#define EXECUTE_SYSCALL(syscall, regs) \
- (((*sys_call_table[syscall]))(UPT_SYSCALL_ARG1(&regs->regs), \
- UPT_SYSCALL_ARG2(&regs->regs), \
- UPT_SYSCALL_ARG3(&regs->regs), \
- UPT_SYSCALL_ARG4(&regs->regs), \
- UPT_SYSCALL_ARG5(&regs->regs), \
- UPT_SYSCALL_ARG6(&regs->regs)))
-
-extern syscall_handler_t sys_modify_ldt;
-extern syscall_handler_t sys_arch_prctl;
-
-#endif
diff --git a/arch/x86/um/tls_32.c b/arch/x86/um/tls_32.c
index cb3f17627d16..1909c2e640b2 100644
--- a/arch/x86/um/tls_32.c
+++ b/arch/x86/um/tls_32.c
@@ -186,7 +186,7 @@ int arch_switch_tls(struct task_struct *to)
/*
* We have no need whatsoever to switch TLS for kernel threads; beyond
* that, that would also result in us calling os_set_thread_area with
- * userspace_pid[cpu] == 0, which gives an error.
+ * task->mm == NULL, which would cause a crash.
*/
if (likely(to->mm))
return load_TLS(O_FORCE, to);