summaryrefslogtreecommitdiff
path: root/arch
diff options
context:
space:
mode:
Diffstat (limited to 'arch')
-rw-r--r--arch/alpha/Kconfig1
-rw-r--r--arch/alpha/include/asm/percpu.h5
-rw-r--r--arch/alpha/include/uapi/asm/socket.h3
-rw-r--r--arch/arm/Kconfig2
-rw-r--r--arch/arm/boot/dts/intel/ixp/intel-ixp42x-linksys-wrv54g.dts92
-rw-r--r--arch/arm/configs/omap2plus_defconfig1
-rw-r--r--arch/arm/crypto/aes-neonbs-glue.c2
-rw-r--r--arch/arm/mm/fault.c2
-rw-r--r--arch/arm64/Kconfig4
-rw-r--r--arch/arm64/boot/dts/mediatek/mt8370.dtsi16
-rw-r--r--arch/arm64/include/asm/barrier.h3
-rw-r--r--arch/arm64/include/asm/el2_setup.h45
-rw-r--r--arch/arm64/include/asm/kvm_emulate.h51
-rw-r--r--arch/arm64/include/asm/kvm_host.h36
-rw-r--r--arch/arm64/include/asm/kvm_mmu.h18
-rw-r--r--arch/arm64/include/asm/kvm_nested.h2
-rw-r--r--arch/arm64/include/asm/mman.h10
-rw-r--r--arch/arm64/include/asm/pgtable-prot.h1
-rw-r--r--arch/arm64/include/asm/pgtable.h42
-rw-r--r--arch/arm64/include/asm/smp.h24
-rw-r--r--arch/arm64/include/asm/sysreg.h71
-rw-r--r--arch/arm64/include/asm/tlbflush.h11
-rw-r--r--arch/arm64/include/asm/vncr_mapping.h2
-rw-r--r--arch/arm64/kernel/cpufeature.c26
-rw-r--r--arch/arm64/kernel/smp.c142
-rw-r--r--arch/arm64/kvm/Makefile3
-rw-r--r--arch/arm64/kvm/arch_timer.c2
-rw-r--r--arch/arm64/kvm/arm.c36
-rw-r--r--arch/arm64/kvm/at.c80
-rw-r--r--arch/arm64/kvm/config.c255
-rw-r--r--arch/arm64/kvm/emulate-nested.c49
-rw-r--r--arch/arm64/kvm/guest.c62
-rw-r--r--arch/arm64/kvm/handle_exit.c24
-rw-r--r--arch/arm64/kvm/hyp/exception.c16
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/switch.h53
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h49
-rw-r--r--arch/arm64/kvm/hyp/vgic-v3-sr.c53
-rw-r--r--arch/arm64/kvm/hyp/vhe/switch.c14
-rw-r--r--arch/arm64/kvm/hyp/vhe/sysreg-sr.c6
-rw-r--r--arch/arm64/kvm/inject_fault.c235
-rw-r--r--arch/arm64/kvm/mmio.c12
-rw-r--r--arch/arm64/kvm/mmu.c105
-rw-r--r--arch/arm64/kvm/nested.c109
-rw-r--r--arch/arm64/kvm/sys_regs.c207
-rw-r--r--arch/arm64/kvm/sys_regs.h2
-rw-r--r--arch/arm64/kvm/trace_handle_exit.h2
-rw-r--r--arch/arm64/kvm/vgic-sys-reg-v3.c127
-rw-r--r--arch/arm64/kvm/vgic/vgic-init.c30
-rw-r--r--arch/arm64/kvm/vgic/vgic-its.c5
-rw-r--r--arch/arm64/kvm/vgic/vgic-kvm-device.c70
-rw-r--r--arch/arm64/kvm/vgic/vgic-mmio-v3.c33
-rw-r--r--arch/arm64/kvm/vgic/vgic-v3-nested.c2
-rw-r--r--arch/arm64/kvm/vgic/vgic-v4.c14
-rw-r--r--arch/arm64/kvm/vgic/vgic-v5.c52
-rw-r--r--arch/arm64/kvm/vgic/vgic.c4
-rw-r--r--arch/arm64/kvm/vgic/vgic.h48
-rw-r--r--arch/arm64/mm/fault.c2
-rw-r--r--arch/arm64/mm/mmap.c2
-rw-r--r--arch/arm64/mm/mmu.c30
-rw-r--r--arch/arm64/mm/ptdump_debugfs.c3
-rw-r--r--arch/arm64/net/bpf_jit.h5
-rw-r--r--arch/arm64/net/bpf_jit_comp.c167
-rw-r--r--arch/arm64/tools/cpucaps4
-rw-r--r--arch/arm64/tools/sysreg514
-rw-r--r--arch/csky/Kconfig1
-rw-r--r--arch/loongarch/Kconfig2
-rw-r--r--arch/loongarch/configs/loongson3_defconfig1
-rw-r--r--arch/loongarch/include/asm/hugetlb.h14
-rw-r--r--arch/loongarch/include/asm/kvm_host.h12
-rw-r--r--arch/loongarch/include/asm/pgtable-bits.h6
-rw-r--r--arch/loongarch/include/asm/pgtable.h19
-rw-r--r--arch/loongarch/kvm/exit.c33
-rw-r--r--arch/loongarch/kvm/intc/eiointc.c553
-rw-r--r--arch/loongarch/kvm/intc/ipi.c28
-rw-r--r--arch/loongarch/kvm/intc/pch_pic.c4
-rw-r--r--arch/loongarch/kvm/interrupt.c25
-rw-r--r--arch/loongarch/kvm/trace.h14
-rw-r--r--arch/loongarch/kvm/vcpu.c8
-rw-r--r--arch/loongarch/mm/pageattr.c2
-rw-r--r--arch/m68k/configs/amiga_defconfig1
-rw-r--r--arch/m68k/configs/apollo_defconfig1
-rw-r--r--arch/m68k/configs/atari_defconfig1
-rw-r--r--arch/m68k/configs/bvme6000_defconfig1
-rw-r--r--arch/m68k/configs/hp300_defconfig1
-rw-r--r--arch/m68k/configs/mac_defconfig1
-rw-r--r--arch/m68k/configs/multi_defconfig1
-rw-r--r--arch/m68k/configs/mvme147_defconfig1
-rw-r--r--arch/m68k/configs/mvme16x_defconfig1
-rw-r--r--arch/m68k/configs/q40_defconfig1
-rw-r--r--arch/m68k/configs/sun3_defconfig1
-rw-r--r--arch/m68k/configs/sun3x_defconfig1
-rw-r--r--arch/microblaze/Kconfig1
-rw-r--r--arch/mips/Kconfig3
-rw-r--r--arch/mips/boot/Makefile8
-rw-r--r--arch/mips/boot/dts/mobileye/eyeq5-epm5.dts8
-rw-r--r--arch/mips/boot/dts/mobileye/eyeq5.dtsi127
-rw-r--r--arch/mips/boot/dts/mobileye/eyeq6h.dtsi22
-rw-r--r--arch/mips/boot/dts/qca/ar9132.dtsi9
-rw-r--r--arch/mips/boot/dts/qca/ar9132_tl_wr1043nd_v1.dts4
-rw-r--r--arch/mips/boot/dts/qca/ar9331.dtsi9
-rw-r--r--arch/mips/boot/dts/qca/ar9331_dpt_module.dts4
-rw-r--r--arch/mips/boot/dts/qca/ar9331_dragino_ms14.dts4
-rw-r--r--arch/mips/boot/dts/qca/ar9331_omega.dts4
-rw-r--r--arch/mips/boot/dts/qca/ar9331_openembed_som9331_board.dts4
-rw-r--r--arch/mips/boot/dts/qca/ar9331_tl_mr3020.dts4
-rw-r--r--arch/mips/boot/dts/ralink/gardena_smart_gateway_mt7688.dts2
-rw-r--r--arch/mips/boot/dts/ralink/mt7620a.dtsi10
-rw-r--r--arch/mips/boot/dts/ralink/mt7628a.dtsi11
-rw-r--r--arch/mips/boot/dts/realtek/cameo-rtl9302c-2x-rtl8224-2xge.dts96
-rw-r--r--arch/mips/boot/dts/realtek/rtl930x.dtsi31
-rw-r--r--arch/mips/configs/eyeq5_defconfig12
-rw-r--r--arch/mips/configs/eyeq6_defconfig2
-rw-r--r--arch/mips/configs/fuloong2e_defconfig1
-rw-r--r--arch/mips/configs/ip22_defconfig1
-rw-r--r--arch/mips/configs/loongson2k_defconfig1
-rw-r--r--arch/mips/configs/loongson3_defconfig1
-rw-r--r--arch/mips/configs/malta_defconfig1
-rw-r--r--arch/mips/configs/malta_kvm_defconfig1
-rw-r--r--arch/mips/configs/maltaup_xpa_defconfig1
-rw-r--r--arch/mips/configs/rb532_defconfig1
-rw-r--r--arch/mips/configs/rm200_defconfig1
-rw-r--r--arch/mips/include/asm/cpu-info.h1
-rw-r--r--arch/mips/include/asm/hugetlb.h14
-rw-r--r--arch/mips/include/asm/mach-generic/mc146818rtc.h4
-rw-r--r--arch/mips/include/asm/mach-ip30/cpu-feature-overrides.h2
-rw-r--r--arch/mips/include/asm/mach-ip30/spaces.h2
-rw-r--r--arch/mips/include/asm/mach-jazz/mc146818rtc.h2
-rw-r--r--arch/mips/include/asm/mach-lantiq/xway/lantiq_soc.h3
-rw-r--r--arch/mips/include/asm/mach-malta/mc146818rtc.h2
-rw-r--r--arch/mips/include/asm/mach-rm/mc146818rtc.h21
-rw-r--r--arch/mips/include/asm/mc146818-time.h105
-rw-r--r--arch/mips/include/asm/mips-cps.h4
-rw-r--r--arch/mips/include/asm/sgi/heart.h2
-rw-r--r--arch/mips/include/asm/smp-cps.h1
-rw-r--r--arch/mips/include/asm/vpe.h8
-rw-r--r--arch/mips/include/uapi/asm/socket.h3
-rw-r--r--arch/mips/kernel/cpu-probe.c42
-rw-r--r--arch/mips/kernel/mips-cm.c52
-rw-r--r--arch/mips/kernel/process.c16
-rw-r--r--arch/mips/kernel/relocate.c10
-rw-r--r--arch/mips/kernel/smp-cps.c16
-rw-r--r--arch/mips/kvm/mips.c2
-rw-r--r--arch/mips/lantiq/falcon/prom.c4
-rw-r--r--arch/mips/lantiq/falcon/sysctrl.c29
-rw-r--r--arch/mips/lantiq/irq.c4
-rw-r--r--arch/mips/lantiq/xway/clk.c2
-rw-r--r--arch/mips/lantiq/xway/dcdc.c2
-rw-r--r--arch/mips/lantiq/xway/dma.c2
-rw-r--r--arch/mips/lantiq/xway/gptu.c2
-rw-r--r--arch/mips/loongson64/setup.c1
-rw-r--r--arch/mips/mm/physaddr.c2
-rw-r--r--arch/mips/mm/tlb-r4k.c56
-rw-r--r--arch/mips/pci/pci-lantiq.c2
-rw-r--r--arch/mips/pci/pci-rt2880.c2
-rw-r--r--arch/mips/ralink/irq.c1
-rw-r--r--arch/mips/sgi-ip27/ip27-irq.c2
-rw-r--r--arch/mips/sgi-ip30/ip30-power.c2
-rw-r--r--arch/mips/sgi-ip30/ip30-setup.c2
-rw-r--r--arch/mips/sgi-ip30/ip30-smp.c2
-rw-r--r--arch/mips/sgi-ip30/ip30-timer.c2
-rw-r--r--arch/mips/sgi-ip30/ip30-xtalk.c2
-rw-r--r--arch/mips/txx9/generic/setup.c4
-rw-r--r--arch/openrisc/kernel/dma.c4
-rw-r--r--arch/parisc/Kconfig1
-rw-r--r--arch/parisc/include/uapi/asm/socket.h3
-rw-r--r--arch/powerpc/Kconfig2
-rw-r--r--arch/powerpc/configs/cell_defconfig1
-rw-r--r--arch/powerpc/include/asm/book3s/64/hash-4k.h6
-rw-r--r--arch/powerpc/include/asm/book3s/64/hash-64k.h7
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgtable.h53
-rw-r--r--arch/powerpc/include/asm/book3s/64/pkeys.h2
-rw-r--r--arch/powerpc/include/asm/book3s/64/radix.h14
-rw-r--r--arch/powerpc/include/asm/hvcall.h1
-rw-r--r--arch/powerpc/include/asm/mman.h2
-rw-r--r--arch/powerpc/include/asm/pkeys.h4
-rw-r--r--arch/powerpc/kvm/book3s_hv_uvmem.c2
-rw-r--r--arch/powerpc/mm/book3s64/hash_hugepage.c2
-rw-r--r--arch/powerpc/mm/book3s64/hash_pgtable.c3
-rw-r--r--arch/powerpc/mm/book3s64/hugetlbpage.c2
-rw-r--r--arch/powerpc/mm/book3s64/pgtable.c12
-rw-r--r--arch/powerpc/mm/book3s64/radix_pgtable.c5
-rw-r--r--arch/powerpc/mm/pgtable.c2
-rw-r--r--arch/powerpc/net/bpf_jit_comp64.c79
-rw-r--r--arch/powerpc/platforms/pseries/cmm.c2
-rw-r--r--arch/powerpc/platforms/pseries/pci_dlpar.c2
-rw-r--r--arch/riscv/Kconfig2
-rw-r--r--arch/riscv/include/asm/kvm_aia.h2
-rw-r--r--arch/riscv/include/asm/kvm_gstage.h72
-rw-r--r--arch/riscv/include/asm/kvm_host.h105
-rw-r--r--arch/riscv/include/asm/kvm_mmu.h21
-rw-r--r--arch/riscv/include/asm/kvm_tlb.h84
-rw-r--r--arch/riscv/include/asm/kvm_vcpu_sbi.h12
-rw-r--r--arch/riscv/include/asm/kvm_vmid.h27
-rw-r--r--arch/riscv/include/asm/pgtable-64.h16
-rw-r--r--arch/riscv/include/asm/pgtable-bits.h1
-rw-r--r--arch/riscv/include/asm/pgtable.h22
-rw-r--r--arch/riscv/include/asm/tlbflush.h1
-rw-r--r--arch/riscv/include/uapi/asm/kvm.h1
-rw-r--r--arch/riscv/kvm/Kconfig1
-rw-r--r--arch/riscv/kvm/Makefile1
-rw-r--r--arch/riscv/kvm/aia_device.c6
-rw-r--r--arch/riscv/kvm/aia_imsic.c12
-rw-r--r--arch/riscv/kvm/gstage.c338
-rw-r--r--arch/riscv/kvm/main.c3
-rw-r--r--arch/riscv/kvm/mmu.c509
-rw-r--r--arch/riscv/kvm/tlb.c110
-rw-r--r--arch/riscv/kvm/vcpu.c48
-rw-r--r--arch/riscv/kvm/vcpu_exit.c20
-rw-r--r--arch/riscv/kvm/vcpu_onereg.c83
-rw-r--r--arch/riscv/kvm/vcpu_sbi.c49
-rw-r--r--arch/riscv/kvm/vcpu_sbi_replace.c17
-rw-r--r--arch/riscv/kvm/vcpu_sbi_sta.c3
-rw-r--r--arch/riscv/kvm/vcpu_sbi_v01.c25
-rw-r--r--arch/riscv/kvm/vm.c7
-rw-r--r--arch/riscv/kvm/vmid.c25
-rw-r--r--arch/riscv/mm/fault.c8
-rw-r--r--arch/riscv/mm/pageattr.c8
-rw-r--r--arch/riscv/mm/ptdump.c3
-rw-r--r--arch/riscv/mm/tlbflush.c5
-rw-r--r--arch/s390/Kconfig2
-rw-r--r--arch/s390/configs/debug_defconfig2
-rw-r--r--arch/s390/configs/defconfig2
-rw-r--r--arch/s390/crypto/Makefile1
-rw-r--r--arch/s390/crypto/hmac_s390.c12
-rw-r--r--arch/s390/crypto/paes_s390.c2
-rw-r--r--arch/s390/crypto/phmac_s390.c1048
-rw-r--r--arch/s390/crypto/sha.h3
-rw-r--r--arch/s390/crypto/sha3_256_s390.c24
-rw-r--r--arch/s390/crypto/sha3_512_s390.c25
-rw-r--r--arch/s390/include/asm/cpacf.h4
-rw-r--r--arch/s390/include/asm/entry-common.h10
-rw-r--r--arch/s390/include/asm/kvm_host.h3
-rw-r--r--arch/s390/include/asm/percpu.h5
-rw-r--r--arch/s390/kvm/kvm-s390.c51
-rw-r--r--arch/s390/kvm/vsie.c17
-rw-r--r--arch/s390/mm/dump_pagetables.c2
-rw-r--r--arch/s390/net/bpf_jit.h55
-rw-r--r--arch/s390/net/bpf_jit_comp.c113
-rw-r--r--arch/sh/Kconfig1
-rw-r--r--arch/sh/Makefile10
-rw-r--r--arch/sh/boot/compressed/Makefile4
-rw-r--r--arch/sh/boot/romimage/Makefile4
-rw-r--r--arch/sh/configs/titan_defconfig1
-rw-r--r--arch/sparc/Kconfig2
-rw-r--r--arch/sparc/include/asm/hugetlb.h5
-rw-r--r--arch/sparc/include/asm/mman.h4
-rw-r--r--arch/sparc/include/uapi/asm/socket.h3
-rw-r--r--arch/sparc/mm/hugetlbpage.c119
-rw-r--r--arch/sparc/mm/init_64.c2
-rw-r--r--arch/x86/Kconfig3
-rw-r--r--arch/x86/crypto/aegis128-aesni-glue.c40
-rw-r--r--arch/x86/crypto/aria_aesni_avx2_glue.c1
-rw-r--r--arch/x86/crypto/aria_aesni_avx_glue.c1
-rw-r--r--arch/x86/crypto/camellia_aesni_avx_glue.c1
-rw-r--r--arch/x86/crypto/camellia_glue.c1
-rw-r--r--arch/x86/crypto/curve25519-x86_64.c1
-rw-r--r--arch/x86/crypto/serpent_avx_glue.c1
-rw-r--r--arch/x86/crypto/sm4_aesni_avx_glue.c1
-rw-r--r--arch/x86/crypto/twofish_glue.c1
-rw-r--r--arch/x86/crypto/twofish_glue_3way.c1
-rw-r--r--arch/x86/include/asm/apic.h66
-rw-r--r--arch/x86/include/asm/irq_remapping.h17
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h5
-rw-r--r--arch/x86/include/asm/kvm_host.h76
-rw-r--r--arch/x86/include/asm/msr-index.h1
-rw-r--r--arch/x86/include/asm/pgtable.h51
-rw-r--r--arch/x86/include/asm/pgtable_types.h5
-rw-r--r--arch/x86/include/asm/svm.h13
-rw-r--r--arch/x86/include/asm/tlbflush.h5
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.c8
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.h2
-rw-r--r--arch/x86/kernel/setup.c4
-rw-r--r--arch/x86/kvm/Kconfig10
-rw-r--r--arch/x86/kvm/Makefile7
-rw-r--r--arch/x86/kvm/cpuid.c1
-rw-r--r--arch/x86/kvm/hyperv.c10
-rw-r--r--arch/x86/kvm/hyperv.h3
-rw-r--r--arch/x86/kvm/i8254.c90
-rw-r--r--arch/x86/kvm/i8254.h17
-rw-r--r--arch/x86/kvm/i8259.c17
-rw-r--r--arch/x86/kvm/ioapic.c55
-rw-r--r--arch/x86/kvm/ioapic.h24
-rw-r--r--arch/x86/kvm/irq.c560
-rw-r--r--arch/x86/kvm/irq.h35
-rw-r--r--arch/x86/kvm/irq_comm.c469
-rw-r--r--arch/x86/kvm/lapic.c104
-rw-r--r--arch/x86/kvm/lapic.h26
-rw-r--r--arch/x86/kvm/mmu/mmu.c75
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h3
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h8
-rw-r--r--arch/x86/kvm/mmu/spte.c43
-rw-r--r--arch/x86/kvm/mmu/spte.h10
-rw-r--r--arch/x86/kvm/svm/avic.c688
-rw-r--r--arch/x86/kvm/svm/nested.c128
-rw-r--r--arch/x86/kvm/svm/sev.c149
-rw-r--r--arch/x86/kvm/svm/svm.c506
-rw-r--r--arch/x86/kvm/svm/svm.h137
-rw-r--r--arch/x86/kvm/trace.h99
-rw-r--r--arch/x86/kvm/vmx/capabilities.h1
-rw-r--r--arch/x86/kvm/vmx/common.h2
-rw-r--r--arch/x86/kvm/vmx/main.c61
-rw-r--r--arch/x86/kvm/vmx/nested.c27
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c8
-rw-r--r--arch/x86/kvm/vmx/posted_intr.c140
-rw-r--r--arch/x86/kvm/vmx/posted_intr.h10
-rw-r--r--arch/x86/kvm/vmx/run_flags.h10
-rw-r--r--arch/x86/kvm/vmx/tdx.c71
-rw-r--r--arch/x86/kvm/vmx/tdx.h1
-rw-r--r--arch/x86/kvm/vmx/vmx.c296
-rw-r--r--arch/x86/kvm/vmx/vmx.h57
-rw-r--r--arch/x86/kvm/vmx/x86_ops.h16
-rw-r--r--arch/x86/kvm/x86.c389
-rw-r--r--arch/x86/kvm/x86.h40
-rw-r--r--arch/x86/mm/pat/memtype.c1
-rw-r--r--arch/x86/mm/pgprot.c2
-rw-r--r--arch/x86/net/bpf_jit_comp.c10
316 files changed, 7862 insertions, 4906 deletions
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 109a4cddcd13..80367f2cf821 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -7,6 +7,7 @@ config ALPHA
select ARCH_HAS_DMA_OPS if PCI
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
+ select ARCH_MODULE_NEEDS_WEAK_PER_CPU if SMP
select ARCH_NO_PREEMPT
select ARCH_NO_SG_CHAIN
select ARCH_USE_CMPXCHG_LOCKREF
diff --git a/arch/alpha/include/asm/percpu.h b/arch/alpha/include/asm/percpu.h
index 6923249f2d49..4383d66341dc 100644
--- a/arch/alpha/include/asm/percpu.h
+++ b/arch/alpha/include/asm/percpu.h
@@ -9,10 +9,9 @@
* way above 4G.
*
* Always use weak definitions for percpu variables in modules.
+ * Therefore, we have enabled CONFIG_ARCH_MODULE_NEEDS_WEAK_PER_CPU
+ * in the Kconfig.
*/
-#if defined(MODULE) && defined(CONFIG_SMP)
-#define ARCH_NEEDS_WEAK_PER_CPU
-#endif
#include <asm-generic/percpu.h>
diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index 8f1f18adcdb5..5ef57f88df6b 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -152,6 +152,9 @@
#define SO_PASSRIGHTS 83
+#define SO_INQ 84
+#define SCM_INQ SO_INQ
+
#if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index a6e80653abd1..b1f3df39ed40 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -100,12 +100,12 @@ config ARM
select HAVE_BUILDTIME_MCOUNT_SORT
select HAVE_DEBUG_KMEMLEAK if !XIP_KERNEL
select HAVE_DMA_CONTIGUOUS if MMU
+ select HAVE_EXTRA_IPI_TRACEPOINTS
select HAVE_DYNAMIC_FTRACE if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU
select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE
select HAVE_EFFICIENT_UNALIGNED_ACCESS if (CPU_V6 || CPU_V6K || CPU_V7) && MMU
select HAVE_EXIT_THREAD
select HAVE_GUP_FAST if ARM_LPAE
- select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL
select HAVE_FUNCTION_ERROR_INJECTION
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER if !XIP_KERNEL
diff --git a/arch/arm/boot/dts/intel/ixp/intel-ixp42x-linksys-wrv54g.dts b/arch/arm/boot/dts/intel/ixp/intel-ixp42x-linksys-wrv54g.dts
index 98275a363c57..cb1842c83ac8 100644
--- a/arch/arm/boot/dts/intel/ixp/intel-ixp42x-linksys-wrv54g.dts
+++ b/arch/arm/boot/dts/intel/ixp/intel-ixp42x-linksys-wrv54g.dts
@@ -72,10 +72,55 @@
cs-gpios = <&gpio0 5 GPIO_ACTIVE_LOW>;
num-chipselects = <1>;
- switch@0 {
+ ethernet-switch@0 {
compatible = "micrel,ks8995";
reg = <0>;
spi-max-frequency = <50000000>;
+
+ /*
+ * The PHYs are accessed over the external MDIO
+ * bus and not internally through the switch control
+ * registers.
+ */
+ ethernet-ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ ethernet-port@0 {
+ reg = <0>;
+ label = "1";
+ phy-mode = "mii";
+ phy-handle = <&phy1>;
+ };
+ ethernet-port@1 {
+ reg = <1>;
+ label = "2";
+ phy-mode = "mii";
+ phy-handle = <&phy2>;
+ };
+ ethernet-port@2 {
+ reg = <2>;
+ label = "3";
+ phy-mode = "mii";
+ phy-handle = <&phy3>;
+ };
+ ethernet-port@3 {
+ reg = <3>;
+ label = "4";
+ phy-mode = "mii";
+ phy-handle = <&phy4>;
+ };
+ ethernet-port@4 {
+ reg = <4>;
+ ethernet = <&ethb>;
+ phy-mode = "mii";
+ fixed-link {
+ speed = <100>;
+ full-duplex;
+ };
+ };
+
+ };
};
};
@@ -135,40 +180,59 @@
};
/*
- * EthB - connected to the KS8995 switch ports 1-4
- * FIXME: the boardfile defines .phy_mask = 0x1e for this port to enable output to
- * all four switch ports, also using an out of tree multiphy patch.
- * Do we need a new binding and property for this?
+ * EthB connects to the KS8995 CPU port and faces ports 1-4
+ * through the switch fabric.
+ *
+ * To complicate things, the MDIO channel is also only
+ * accessible through EthB, but used independently for PHY
+ * control.
*/
- ethernet@c8009000 {
+ ethb: ethernet@c8009000 {
status = "okay";
queue-rx = <&qmgr 3>;
queue-txready = <&qmgr 20>;
- phy-mode = "rgmii";
- phy-handle = <&phy4>;
+ phy-mode = "mii";
+ fixed-link {
+ speed = <100>;
+ full-duplex;
+ };
mdio {
#address-cells = <1>;
#size-cells = <0>;
- /* Should be ports 1-4 on the KS8995 switch */
+ /*
+ * LAN ports 1-4 on the KS8995 switch
+ * and PHY5 for WAN need to be accessed
+ * through this external MDIO channel.
+ */
+ phy1: ethernet-phy@1 {
+ reg = <1>;
+ };
+ phy2: ethernet-phy@2 {
+ reg = <2>;
+ };
+ phy3: ethernet-phy@3 {
+ reg = <3>;
+ };
phy4: ethernet-phy@4 {
reg = <4>;
};
-
- /* Should be port 5 on the KS8995 switch */
phy5: ethernet-phy@5 {
reg = <5>;
};
};
};
- /* EthC - connected to KS8995 switch port 5 */
- ethernet@c800a000 {
+ /*
+ * EthC connects to MII-P5 on the KS8995 bypassing
+ * all of the switch logic and facing PHY5
+ */
+ ethc: ethernet@c800a000 {
status = "okay";
queue-rx = <&qmgr 4>;
queue-txready = <&qmgr 21>;
- phy-mode = "rgmii";
+ phy-mode = "mii";
phy-handle = <&phy5>;
};
};
diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig
index 1491add30093..939913ed9a73 100644
--- a/arch/arm/configs/omap2plus_defconfig
+++ b/arch/arm/configs/omap2plus_defconfig
@@ -142,7 +142,6 @@ CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
CONFIG_NETFILTER_XT_MATCH_CPU=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m
CONFIG_NETFILTER_XT_MATCH_DSCP=m
CONFIG_NETFILTER_XT_MATCH_ESP=m
diff --git a/arch/arm/crypto/aes-neonbs-glue.c b/arch/arm/crypto/aes-neonbs-glue.c
index c60104dc1585..df5afe601e4a 100644
--- a/arch/arm/crypto/aes-neonbs-glue.c
+++ b/arch/arm/crypto/aes-neonbs-glue.c
@@ -206,7 +206,7 @@ static int ctr_encrypt(struct skcipher_request *req)
while (walk.nbytes > 0) {
const u8 *src = walk.src.virt.addr;
u8 *dst = walk.dst.virt.addr;
- int bytes = walk.nbytes;
+ unsigned int bytes = walk.nbytes;
if (unlikely(bytes < AES_BLOCK_SIZE))
src = dst = memcpy(buf + sizeof(buf) - bytes,
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index ab01b51de559..46169fe42c61 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -268,7 +268,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
int sig, code;
vm_fault_t fault;
unsigned int flags = FAULT_FLAG_DEFAULT;
- unsigned long vm_flags = VM_ACCESS_FLAGS;
+ vm_flags_t vm_flags = VM_ACCESS_FLAGS;
if (kprobe_page_fault(regs, fsr))
return 0;
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 8b6cbdd1441b..e9bbfacc35a6 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -42,7 +42,6 @@ config ARM64
select ARCH_HAS_NONLEAF_PMD_YOUNG if ARM64_HAFT
select ARCH_HAS_PREEMPT_LAZY
select ARCH_HAS_PTDUMP
- select ARCH_HAS_PTE_DEVMAP
select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_HW_PTE_YOUNG
select ARCH_HAS_SETUP_DMA_OPS
@@ -127,6 +126,7 @@ config ARM64
select ARM_GIC_V2M if PCI
select ARM_GIC_V3
select ARM_GIC_V3_ITS if PCI
+ select ARM_GIC_V5
select ARM_PSCI_FW
select BUILDTIME_TABLE_SORT
select CLONE_BACKWARDS
@@ -134,6 +134,7 @@ config ARM64
select CPU_PM if (SUSPEND || CPU_IDLE)
select CPUMASK_OFFSTACK if NR_CPUS > 256
select DCACHE_WORD_ACCESS
+ select HAVE_EXTRA_IPI_TRACEPOINTS
select DYNAMIC_FTRACE if FUNCTION_TRACER
select DMA_BOUNCE_UNALIGNED_KMALLOC
select DMA_DIRECT_REMAP
@@ -221,7 +222,6 @@ config ARM64
select HAVE_EFFICIENT_UNALIGNED_ACCESS
select HAVE_GUP_FAST
select HAVE_FTRACE_GRAPH_FUNC
- select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_TRACER
select HAVE_FUNCTION_ERROR_INJECTION
select HAVE_FUNCTION_GRAPH_FREGS
diff --git a/arch/arm64/boot/dts/mediatek/mt8370.dtsi b/arch/arm64/boot/dts/mediatek/mt8370.dtsi
index cf1a3759451f..7ac8b8d03494 100644
--- a/arch/arm64/boot/dts/mediatek/mt8370.dtsi
+++ b/arch/arm64/boot/dts/mediatek/mt8370.dtsi
@@ -59,6 +59,22 @@
<&cpu3 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>;
};
+/*
+ * Please note that overriding compatibles is a discouraged practice and is a
+ * clear indication of nodes not being, well, compatible!
+ *
+ * This is a special case, where the GPU is the same as MT8188, but with one
+ * of the cores fused out in this lower-binned SoC.
+ */
+&gpu {
+ compatible = "mediatek,mt8370-mali", "arm,mali-valhall-jm";
+
+ power-domains = <&spm MT8188_POWER_DOMAIN_MFG2>,
+ <&spm MT8188_POWER_DOMAIN_MFG3>;
+
+ power-domain-names = "core0", "core1";
+};
+
&ppi_cluster0 {
affinity = <&cpu0 &cpu1 &cpu2 &cpu3>;
};
diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
index 1ca947d5c939..f5801b0ba9e9 100644
--- a/arch/arm64/include/asm/barrier.h
+++ b/arch/arm64/include/asm/barrier.h
@@ -44,6 +44,9 @@
SB_BARRIER_INSN"nop\n", \
ARM64_HAS_SB))
+#define gsb_ack() asm volatile(GSB_ACK_BARRIER_INSN : : : "memory")
+#define gsb_sys() asm volatile(GSB_SYS_BARRIER_INSN : : : "memory")
+
#ifdef CONFIG_ARM64_PSEUDO_NMI
#define pmr_sync() \
do { \
diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h
index b959115e9962..46033027510c 100644
--- a/arch/arm64/include/asm/el2_setup.h
+++ b/arch/arm64/include/asm/el2_setup.h
@@ -165,6 +165,50 @@
.Lskip_gicv3_\@:
.endm
+/* GICv5 system register access */
+.macro __init_el2_gicv5
+ mrs_s x0, SYS_ID_AA64PFR2_EL1
+ ubfx x0, x0, #ID_AA64PFR2_EL1_GCIE_SHIFT, #4
+ cbz x0, .Lskip_gicv5_\@
+
+ mov x0, #(ICH_HFGITR_EL2_GICRCDNMIA | \
+ ICH_HFGITR_EL2_GICRCDIA | \
+ ICH_HFGITR_EL2_GICCDDI | \
+ ICH_HFGITR_EL2_GICCDEOI | \
+ ICH_HFGITR_EL2_GICCDHM | \
+ ICH_HFGITR_EL2_GICCDRCFG | \
+ ICH_HFGITR_EL2_GICCDPEND | \
+ ICH_HFGITR_EL2_GICCDAFF | \
+ ICH_HFGITR_EL2_GICCDPRI | \
+ ICH_HFGITR_EL2_GICCDDIS | \
+ ICH_HFGITR_EL2_GICCDEN)
+ msr_s SYS_ICH_HFGITR_EL2, x0 // Disable instruction traps
+ mov_q x0, (ICH_HFGRTR_EL2_ICC_PPI_ACTIVERn_EL1 | \
+ ICH_HFGRTR_EL2_ICC_PPI_PRIORITYRn_EL1 | \
+ ICH_HFGRTR_EL2_ICC_PPI_PENDRn_EL1 | \
+ ICH_HFGRTR_EL2_ICC_PPI_ENABLERn_EL1 | \
+ ICH_HFGRTR_EL2_ICC_PPI_HMRn_EL1 | \
+ ICH_HFGRTR_EL2_ICC_IAFFIDR_EL1 | \
+ ICH_HFGRTR_EL2_ICC_ICSR_EL1 | \
+ ICH_HFGRTR_EL2_ICC_PCR_EL1 | \
+ ICH_HFGRTR_EL2_ICC_HPPIR_EL1 | \
+ ICH_HFGRTR_EL2_ICC_HAPR_EL1 | \
+ ICH_HFGRTR_EL2_ICC_CR0_EL1 | \
+ ICH_HFGRTR_EL2_ICC_IDRn_EL1 | \
+ ICH_HFGRTR_EL2_ICC_APR_EL1)
+ msr_s SYS_ICH_HFGRTR_EL2, x0 // Disable reg read traps
+ mov_q x0, (ICH_HFGWTR_EL2_ICC_PPI_ACTIVERn_EL1 | \
+ ICH_HFGWTR_EL2_ICC_PPI_PRIORITYRn_EL1 | \
+ ICH_HFGWTR_EL2_ICC_PPI_PENDRn_EL1 | \
+ ICH_HFGWTR_EL2_ICC_PPI_ENABLERn_EL1 | \
+ ICH_HFGWTR_EL2_ICC_ICSR_EL1 | \
+ ICH_HFGWTR_EL2_ICC_PCR_EL1 | \
+ ICH_HFGWTR_EL2_ICC_CR0_EL1 | \
+ ICH_HFGWTR_EL2_ICC_APR_EL1)
+ msr_s SYS_ICH_HFGWTR_EL2, x0 // Disable reg write traps
+.Lskip_gicv5_\@:
+.endm
+
.macro __init_el2_hstr
msr hstr_el2, xzr // Disable CP15 traps to EL2
.endm
@@ -368,6 +412,7 @@
__init_el2_lor
__init_el2_stage2
__init_el2_gicv3
+ __init_el2_gicv5
__init_el2_hstr
__init_el2_nvhe_idregs
__init_el2_cptr
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 0720898f563e..fa8a08a1ccd5 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -45,16 +45,39 @@ bool kvm_condition_valid32(const struct kvm_vcpu *vcpu);
void kvm_skip_instr32(struct kvm_vcpu *vcpu);
void kvm_inject_undefined(struct kvm_vcpu *vcpu);
-void kvm_inject_vabt(struct kvm_vcpu *vcpu);
-void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr);
-void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr);
+int kvm_inject_serror_esr(struct kvm_vcpu *vcpu, u64 esr);
+int kvm_inject_sea(struct kvm_vcpu *vcpu, bool iabt, u64 addr);
void kvm_inject_size_fault(struct kvm_vcpu *vcpu);
+static inline int kvm_inject_sea_dabt(struct kvm_vcpu *vcpu, u64 addr)
+{
+ return kvm_inject_sea(vcpu, false, addr);
+}
+
+static inline int kvm_inject_sea_iabt(struct kvm_vcpu *vcpu, u64 addr)
+{
+ return kvm_inject_sea(vcpu, true, addr);
+}
+
+static inline int kvm_inject_serror(struct kvm_vcpu *vcpu)
+{
+ /*
+ * ESR_ELx.ISV (later renamed to IDS) indicates whether or not
+ * ESR_ELx.ISS contains IMPLEMENTATION DEFINED syndrome information.
+ *
+ * Set the bit when injecting an SError w/o an ESR to indicate ISS
+ * does not follow the architected format.
+ */
+ return kvm_inject_serror_esr(vcpu, ESR_ELx_ISV);
+}
+
void kvm_vcpu_wfi(struct kvm_vcpu *vcpu);
void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu);
int kvm_inject_nested_sync(struct kvm_vcpu *vcpu, u64 esr_el2);
int kvm_inject_nested_irq(struct kvm_vcpu *vcpu);
+int kvm_inject_nested_sea(struct kvm_vcpu *vcpu, bool iabt, u64 addr);
+int kvm_inject_nested_serror(struct kvm_vcpu *vcpu, u64 esr);
static inline void kvm_inject_nested_sve_trap(struct kvm_vcpu *vcpu)
{
@@ -195,6 +218,11 @@ static inline bool vcpu_el2_tge_is_set(const struct kvm_vcpu *vcpu)
return ctxt_sys_reg(&vcpu->arch.ctxt, HCR_EL2) & HCR_TGE;
}
+static inline bool vcpu_el2_amo_is_set(const struct kvm_vcpu *vcpu)
+{
+ return ctxt_sys_reg(&vcpu->arch.ctxt, HCR_EL2) & HCR_AMO;
+}
+
static inline bool is_hyp_ctxt(const struct kvm_vcpu *vcpu)
{
bool e2h, tge;
@@ -224,6 +252,20 @@ static inline bool vcpu_is_host_el0(const struct kvm_vcpu *vcpu)
return is_hyp_ctxt(vcpu) && !vcpu_is_el2(vcpu);
}
+static inline bool is_nested_ctxt(struct kvm_vcpu *vcpu)
+{
+ return vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu);
+}
+
+static inline bool vserror_state_is_nested(struct kvm_vcpu *vcpu)
+{
+ if (!is_nested_ctxt(vcpu))
+ return false;
+
+ return vcpu_el2_amo_is_set(vcpu) ||
+ (__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TMEA);
+}
+
/*
* The layout of SPSR for an AArch32 state is different when observed from an
* AArch64 SPSR_ELx or an AArch32 SPSR_*. This function generates the AArch32
@@ -627,6 +669,9 @@ static inline void vcpu_set_hcrx(struct kvm_vcpu *vcpu)
if (kvm_has_fpmr(kvm))
vcpu->arch.hcrx_el2 |= HCRX_EL2_EnFPM;
+
+ if (kvm_has_sctlr2(kvm))
+ vcpu->arch.hcrx_el2 |= HCRX_EL2_SCTLR2En;
}
}
#endif /* __ARM64_KVM_EMULATE_H__ */
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 673925f17cdd..2f2394cce24e 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -523,6 +523,7 @@ enum vcpu_sysreg {
/* Anything from this can be RES0/RES1 sanitised */
MARKER(__SANITISED_REG_START__),
TCR2_EL2, /* Extended Translation Control Register (EL2) */
+ SCTLR2_EL2, /* System Control Register 2 (EL2) */
MDCR_EL2, /* Monitor Debug Configuration Register (EL2) */
CNTHCTL_EL2, /* Counter-timer Hypervisor Control register */
@@ -537,6 +538,7 @@ enum vcpu_sysreg {
VNCR(TTBR1_EL1),/* Translation Table Base Register 1 */
VNCR(TCR_EL1), /* Translation Control Register */
VNCR(TCR2_EL1), /* Extended Translation Control Register */
+ VNCR(SCTLR2_EL1), /* System Control Register 2 */
VNCR(ESR_EL1), /* Exception Syndrome Register */
VNCR(AFSR0_EL1),/* Auxiliary Fault Status Register 0 */
VNCR(AFSR1_EL1),/* Auxiliary Fault Status Register 1 */
@@ -565,6 +567,10 @@ enum vcpu_sysreg {
VNCR(POR_EL1), /* Permission Overlay Register 1 (EL1) */
+ /* FEAT_RAS registers */
+ VNCR(VDISR_EL2),
+ VNCR(VSESR_EL2),
+
VNCR(HFGRTR_EL2),
VNCR(HFGWTR_EL2),
VNCR(HFGITR_EL2),
@@ -819,7 +825,7 @@ struct kvm_vcpu_arch {
u8 iflags;
/* State flags for kernel bookkeeping, unused by the hypervisor code */
- u8 sflags;
+ u16 sflags;
/*
* Don't run the guest (internal implementation need).
@@ -955,9 +961,21 @@ struct kvm_vcpu_arch {
__vcpu_flags_preempt_enable(); \
} while (0)
+#define __vcpu_test_and_clear_flag(v, flagset, f, m) \
+ ({ \
+ typeof(v->arch.flagset) set; \
+ \
+ set = __vcpu_get_flag(v, flagset, f, m); \
+ __vcpu_clear_flag(v, flagset, f, m); \
+ \
+ set; \
+ })
+
#define vcpu_get_flag(v, ...) __vcpu_get_flag((v), __VA_ARGS__)
#define vcpu_set_flag(v, ...) __vcpu_set_flag((v), __VA_ARGS__)
#define vcpu_clear_flag(v, ...) __vcpu_clear_flag((v), __VA_ARGS__)
+#define vcpu_test_and_clear_flag(v, ...) \
+ __vcpu_test_and_clear_flag((v), __VA_ARGS__)
/* KVM_ARM_VCPU_INIT completed */
#define VCPU_INITIALIZED __vcpu_single_flag(cflags, BIT(0))
@@ -1017,6 +1035,8 @@ struct kvm_vcpu_arch {
#define IN_WFI __vcpu_single_flag(sflags, BIT(6))
/* KVM is currently emulating a nested ERET */
#define IN_NESTED_ERET __vcpu_single_flag(sflags, BIT(7))
+/* SError pending for nested guest */
+#define NESTED_SERROR_PENDING __vcpu_single_flag(sflags, BIT(8))
/* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */
@@ -1151,6 +1171,8 @@ static inline bool __vcpu_read_sys_reg_from_cpu(int reg, u64 *val)
* System registers listed in the switch are not saved on every
* exit from the guest but are only saved on vcpu_put.
*
+ * SYSREGS_ON_CPU *MUST* be checked before using this helper.
+ *
* Note that MPIDR_EL1 for the guest is set by KVM via VMPIDR_EL2 but
* should never be listed below, because the guest cannot modify its
* own MPIDR_EL1 and MPIDR_EL1 is accessed for VCPU A from VCPU B's
@@ -1188,6 +1210,7 @@ static inline bool __vcpu_read_sys_reg_from_cpu(int reg, u64 *val)
case IFSR32_EL2: *val = read_sysreg_s(SYS_IFSR32_EL2); break;
case DBGVCR32_EL2: *val = read_sysreg_s(SYS_DBGVCR32_EL2); break;
case ZCR_EL1: *val = read_sysreg_s(SYS_ZCR_EL12); break;
+ case SCTLR2_EL1: *val = read_sysreg_s(SYS_SCTLR2_EL12); break;
default: return false;
}
@@ -1202,6 +1225,8 @@ static inline bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg)
* System registers listed in the switch are not restored on every
* entry to the guest but are only restored on vcpu_load.
*
+ * SYSREGS_ON_CPU *MUST* be checked before using this helper.
+ *
* Note that MPIDR_EL1 for the guest is set by KVM via VMPIDR_EL2 but
* should never be listed below, because the MPIDR should only be set
* once, before running the VCPU, and never changed later.
@@ -1238,6 +1263,7 @@ static inline bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg)
case IFSR32_EL2: write_sysreg_s(val, SYS_IFSR32_EL2); break;
case DBGVCR32_EL2: write_sysreg_s(val, SYS_DBGVCR32_EL2); break;
case ZCR_EL1: write_sysreg_s(val, SYS_ZCR_EL12); break;
+ case SCTLR2_EL1: write_sysreg_s(val, SYS_SCTLR2_EL12); break;
default: return false;
}
@@ -1389,8 +1415,6 @@ static inline bool kvm_arm_is_pvtime_enabled(struct kvm_vcpu_arch *vcpu_arch)
return (vcpu_arch->steal.base != INVALID_GPA);
}
-void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 syndrome);
-
struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
DECLARE_KVM_HYP_PER_CPU(struct kvm_host_data, kvm_host_data);
@@ -1667,6 +1691,12 @@ void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val);
#define kvm_has_s1poe(k) \
(kvm_has_feat((k), ID_AA64MMFR3_EL1, S1POE, IMP))
+#define kvm_has_ras(k) \
+ (kvm_has_feat((k), ID_AA64PFR0_EL1, RAS, IMP))
+
+#define kvm_has_sctlr2(k) \
+ (kvm_has_feat((k), ID_AA64MMFR3_EL1, SCTLRX, IMP))
+
static inline bool kvm_arch_has_irq_bypass(void)
{
return true;
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index b98ac6aa631f..ae563ebd6aee 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -371,6 +371,24 @@ static inline void kvm_fault_unlock(struct kvm *kvm)
read_unlock(&kvm->mmu_lock);
}
+/*
+ * ARM64 KVM relies on a simple conversion from physaddr to a kernel
+ * virtual address (KVA) when it does cache maintenance as the CMO
+ * instructions work on virtual addresses. This is incompatible with
+ * VM_PFNMAP VMAs which may not have a kernel direct mapping to a
+ * virtual address.
+ *
+ * With S2FWB and CACHE DIC features, KVM need not do cache flushing
+ * and CMOs are NOP'd. This has the effect of no longer requiring a
+ * KVA for addresses mapped into the S2. The presence of these features
+ * are thus necessary to support cacheable S2 mapping of VM_PFNMAP.
+ */
+static inline bool kvm_supports_cacheable_pfnmap(void)
+{
+ return cpus_have_final_cap(ARM64_HAS_STAGE2_FWB) &&
+ cpus_have_final_cap(ARM64_HAS_CACHE_DIC);
+}
+
#ifdef CONFIG_PTDUMP_STAGE2_DEBUGFS
void kvm_s2_ptdump_create_debugfs(struct kvm *kvm);
#else
diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h
index 0bd07ea068a1..7fd76f41c296 100644
--- a/arch/arm64/include/asm/kvm_nested.h
+++ b/arch/arm64/include/asm/kvm_nested.h
@@ -80,6 +80,8 @@ extern void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu);
extern void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu);
extern void check_nested_vcpu_requests(struct kvm_vcpu *vcpu);
+extern void kvm_nested_flush_hwstate(struct kvm_vcpu *vcpu);
+extern void kvm_nested_sync_hwstate(struct kvm_vcpu *vcpu);
struct kvm_s2_trans {
phys_addr_t output;
diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h
index 21df8bbd2668..8770c7ee759f 100644
--- a/arch/arm64/include/asm/mman.h
+++ b/arch/arm64/include/asm/mman.h
@@ -11,10 +11,10 @@
#include <linux/shmem_fs.h>
#include <linux/types.h>
-static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
+static inline vm_flags_t arch_calc_vm_prot_bits(unsigned long prot,
unsigned long pkey)
{
- unsigned long ret = 0;
+ vm_flags_t ret = 0;
if (system_supports_bti() && (prot & PROT_BTI))
ret |= VM_ARM64_BTI;
@@ -34,8 +34,8 @@ static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
}
#define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey)
-static inline unsigned long arch_calc_vm_flag_bits(struct file *file,
- unsigned long flags)
+static inline vm_flags_t arch_calc_vm_flag_bits(struct file *file,
+ unsigned long flags)
{
/*
* Only allow MTE on anonymous mappings as these are guaranteed to be
@@ -68,7 +68,7 @@ static inline bool arch_validate_prot(unsigned long prot,
}
#define arch_validate_prot(prot, addr) arch_validate_prot(prot, addr)
-static inline bool arch_validate_flags(unsigned long vm_flags)
+static inline bool arch_validate_flags(vm_flags_t vm_flags)
{
if (system_supports_mte()) {
/*
diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
index 7830d031742e..85dceb1c66f4 100644
--- a/arch/arm64/include/asm/pgtable-prot.h
+++ b/arch/arm64/include/asm/pgtable-prot.h
@@ -17,7 +17,6 @@
#define PTE_SWP_EXCLUSIVE (_AT(pteval_t, 1) << 2) /* only for swp ptes */
#define PTE_DIRTY (_AT(pteval_t, 1) << 55)
#define PTE_SPECIAL (_AT(pteval_t, 1) << 56)
-#define PTE_DEVMAP (_AT(pteval_t, 1) << 57)
/*
* PTE_PRESENT_INVALID=1 & PTE_VALID=0 indicates that the pte's fields should be
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 192d86e1cc76..abd2dee416b3 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -190,7 +190,6 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
#define pte_user(pte) (!!(pte_val(pte) & PTE_USER))
#define pte_user_exec(pte) (!(pte_val(pte) & PTE_UXN))
#define pte_cont(pte) (!!(pte_val(pte) & PTE_CONT))
-#define pte_devmap(pte) (!!(pte_val(pte) & PTE_DEVMAP))
#define pte_tagged(pte) ((pte_val(pte) & PTE_ATTRINDX_MASK) == \
PTE_ATTRINDX(MT_NORMAL_TAGGED))
@@ -372,11 +371,6 @@ static inline pmd_t pmd_mkcont(pmd_t pmd)
return __pmd(pmd_val(pmd) | PMD_SECT_CONT);
}
-static inline pte_t pte_mkdevmap(pte_t pte)
-{
- return set_pte_bit(pte, __pgprot(PTE_DEVMAP | PTE_SPECIAL));
-}
-
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
static inline int pte_uffd_wp(pte_t pte)
{
@@ -653,14 +647,6 @@ static inline pmd_t pmd_mkhuge(pmd_t pmd)
return __pmd((pmd_val(pmd) & ~mask) | val);
}
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define pmd_devmap(pmd) pte_devmap(pmd_pte(pmd))
-#endif
-static inline pmd_t pmd_mkdevmap(pmd_t pmd)
-{
- return pte_pmd(set_pte_bit(pmd_pte(pmd), __pgprot(PTE_DEVMAP)));
-}
-
#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
#define pmd_special(pte) (!!((pmd_val(pte) & PTE_SPECIAL)))
static inline pmd_t pmd_mkspecial(pmd_t pmd)
@@ -1302,16 +1288,6 @@ static inline int pmdp_set_access_flags(struct vm_area_struct *vma,
return __ptep_set_access_flags(vma, address, (pte_t *)pmdp,
pmd_pte(entry), dirty);
}
-
-static inline int pud_devmap(pud_t pud)
-{
- return 0;
-}
-
-static inline int pgd_devmap(pgd_t pgd)
-{
- return 0;
-}
#endif
#ifdef CONFIG_PAGE_TABLE_CHECK
@@ -1643,6 +1619,14 @@ static inline void update_mmu_cache_range(struct vm_fault *vmf,
*/
#define arch_wants_old_prefaulted_pte cpu_has_hw_af
+/*
+ * Request exec memory is read into pagecache in at least 64K folios. This size
+ * can be contpte-mapped when 4K base pages are in use (16 pages into 1 iTLB
+ * entry), and HPA can coalesce it (4 pages into 1 TLB entry) when 16K base
+ * pages are in use.
+ */
+#define exec_folio_order() ilog2(SZ_64K >> PAGE_SHIFT)
+
static inline bool pud_sect_supported(void)
{
return PAGE_SIZE == SZ_4K;
@@ -1659,6 +1643,16 @@ extern void ptep_modify_prot_commit(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep,
pte_t old_pte, pte_t new_pte);
+#define modify_prot_start_ptes modify_prot_start_ptes
+extern pte_t modify_prot_start_ptes(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ unsigned int nr);
+
+#define modify_prot_commit_ptes modify_prot_commit_ptes
+extern void modify_prot_commit_ptes(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, pte_t old_pte, pte_t pte,
+ unsigned int nr);
+
#ifdef CONFIG_ARM64_CONTPTE
/*
diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
index 2510eec026f7..d48ef6d5abcc 100644
--- a/arch/arm64/include/asm/smp.h
+++ b/arch/arm64/include/asm/smp.h
@@ -50,10 +50,32 @@ struct seq_file;
*/
extern void smp_init_cpus(void);
+enum ipi_msg_type {
+ IPI_RESCHEDULE,
+ IPI_CALL_FUNC,
+ IPI_CPU_STOP,
+ IPI_CPU_STOP_NMI,
+ IPI_TIMER,
+ IPI_IRQ_WORK,
+ NR_IPI,
+ /*
+ * Any enum >= NR_IPI and < MAX_IPI is special and not tracable
+ * with trace_ipi_*
+ */
+ IPI_CPU_BACKTRACE = NR_IPI,
+ IPI_KGDB_ROUNDUP,
+ MAX_IPI
+};
+
/*
* Register IPI interrupts with the arch SMP code
*/
-extern void set_smp_ipi_range(int ipi_base, int nr_ipi);
+extern void set_smp_ipi_range_percpu(int ipi_base, int nr_ipi, int ncpus);
+
+static inline void set_smp_ipi_range(int ipi_base, int n)
+{
+ set_smp_ipi_range_percpu(ipi_base, n, 0);
+}
/*
* Called from the secondary holding pen, this is the secondary CPU entry point.
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index ef76a4b9e240..d5b5f2ae1afa 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -113,10 +113,14 @@
/* Register-based PAN access, for save/restore purposes */
#define SYS_PSTATE_PAN sys_reg(3, 0, 4, 2, 3)
-#define __SYS_BARRIER_INSN(CRm, op2, Rt) \
- __emit_inst(0xd5000000 | sys_insn(0, 3, 3, (CRm), (op2)) | ((Rt) & 0x1f))
+#define __SYS_BARRIER_INSN(op0, op1, CRn, CRm, op2, Rt) \
+ __emit_inst(0xd5000000 | \
+ sys_insn((op0), (op1), (CRn), (CRm), (op2)) | \
+ ((Rt) & 0x1f))
-#define SB_BARRIER_INSN __SYS_BARRIER_INSN(0, 7, 31)
+#define SB_BARRIER_INSN __SYS_BARRIER_INSN(0, 3, 3, 0, 7, 31)
+#define GSB_SYS_BARRIER_INSN __SYS_BARRIER_INSN(1, 0, 12, 0, 0, 31)
+#define GSB_ACK_BARRIER_INSN __SYS_BARRIER_INSN(1, 0, 12, 0, 1, 31)
/* Data cache zero operations */
#define SYS_DC_ISW sys_insn(1, 0, 7, 6, 2)
@@ -1074,6 +1078,67 @@
#define GCS_CAP(x) ((((unsigned long)x) & GCS_CAP_ADDR_MASK) | \
GCS_CAP_VALID_TOKEN)
+/*
+ * Definitions for GICv5 instructions
+ */
+#define GICV5_OP_GIC_CDAFF sys_insn(1, 0, 12, 1, 3)
+#define GICV5_OP_GIC_CDDI sys_insn(1, 0, 12, 2, 0)
+#define GICV5_OP_GIC_CDDIS sys_insn(1, 0, 12, 1, 0)
+#define GICV5_OP_GIC_CDHM sys_insn(1, 0, 12, 2, 1)
+#define GICV5_OP_GIC_CDEN sys_insn(1, 0, 12, 1, 1)
+#define GICV5_OP_GIC_CDEOI sys_insn(1, 0, 12, 1, 7)
+#define GICV5_OP_GIC_CDPEND sys_insn(1, 0, 12, 1, 4)
+#define GICV5_OP_GIC_CDPRI sys_insn(1, 0, 12, 1, 2)
+#define GICV5_OP_GIC_CDRCFG sys_insn(1, 0, 12, 1, 5)
+#define GICV5_OP_GICR_CDIA sys_insn(1, 0, 12, 3, 0)
+
+/* Definitions for GIC CDAFF */
+#define GICV5_GIC_CDAFF_IAFFID_MASK GENMASK_ULL(47, 32)
+#define GICV5_GIC_CDAFF_TYPE_MASK GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDAFF_IRM_MASK BIT_ULL(28)
+#define GICV5_GIC_CDAFF_ID_MASK GENMASK_ULL(23, 0)
+
+/* Definitions for GIC CDDI */
+#define GICV5_GIC_CDDI_TYPE_MASK GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDDI_ID_MASK GENMASK_ULL(23, 0)
+
+/* Definitions for GIC CDDIS */
+#define GICV5_GIC_CDDIS_TYPE_MASK GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDDIS_TYPE(r) FIELD_GET(GICV5_GIC_CDDIS_TYPE_MASK, r)
+#define GICV5_GIC_CDDIS_ID_MASK GENMASK_ULL(23, 0)
+#define GICV5_GIC_CDDIS_ID(r) FIELD_GET(GICV5_GIC_CDDIS_ID_MASK, r)
+
+/* Definitions for GIC CDEN */
+#define GICV5_GIC_CDEN_TYPE_MASK GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDEN_ID_MASK GENMASK_ULL(23, 0)
+
+/* Definitions for GIC CDHM */
+#define GICV5_GIC_CDHM_HM_MASK BIT_ULL(32)
+#define GICV5_GIC_CDHM_TYPE_MASK GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDHM_ID_MASK GENMASK_ULL(23, 0)
+
+/* Definitions for GIC CDPEND */
+#define GICV5_GIC_CDPEND_PENDING_MASK BIT_ULL(32)
+#define GICV5_GIC_CDPEND_TYPE_MASK GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDPEND_ID_MASK GENMASK_ULL(23, 0)
+
+/* Definitions for GIC CDPRI */
+#define GICV5_GIC_CDPRI_PRIORITY_MASK GENMASK_ULL(39, 35)
+#define GICV5_GIC_CDPRI_TYPE_MASK GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDPRI_ID_MASK GENMASK_ULL(23, 0)
+
+/* Definitions for GIC CDRCFG */
+#define GICV5_GIC_CDRCFG_TYPE_MASK GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDRCFG_ID_MASK GENMASK_ULL(23, 0)
+
+/* Definitions for GICR CDIA */
+#define GICV5_GIC_CDIA_VALID_MASK BIT_ULL(32)
+#define GICV5_GICR_CDIA_VALID(r) FIELD_GET(GICV5_GIC_CDIA_VALID_MASK, r)
+#define GICV5_GIC_CDIA_TYPE_MASK GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDIA_ID_MASK GENMASK_ULL(23, 0)
+
+#define gicr_insn(insn) read_sysreg_s(GICV5_OP_GICR_##insn)
+#define gic_insn(v, insn) write_sysreg_s(v, GICV5_OP_GIC_##insn)
#define ARM64_FEATURE_FIELD_BITS 4
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index aa9efee17277..18a5dc0c9a54 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -323,17 +323,6 @@ static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm)
}
/*
- * If mprotect/munmap/etc occurs during TLB batched flushing, we need to ensure
- * all the previously issued TLBIs targeting mm have completed. But since we
- * can be executing on a remote CPU, a DSB cannot guarantee this like it can
- * for arch_tlbbatch_flush(). Our only option is to flush the entire mm.
- */
-static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
-{
- flush_tlb_mm(mm);
-}
-
-/*
* To support TLB batched flush for multiple pages unmapping, we only send
* the TLBI for each page in arch_tlbbatch_add_pending() and wait for the
* completion at the end in arch_tlbbatch_flush(). Since we've already issued
diff --git a/arch/arm64/include/asm/vncr_mapping.h b/arch/arm64/include/asm/vncr_mapping.h
index 6f556e993644..f6ec500ad3fa 100644
--- a/arch/arm64/include/asm/vncr_mapping.h
+++ b/arch/arm64/include/asm/vncr_mapping.h
@@ -51,6 +51,7 @@
#define VNCR_SP_EL1 0x240
#define VNCR_VBAR_EL1 0x250
#define VNCR_TCR2_EL1 0x270
+#define VNCR_SCTLR2_EL1 0x278
#define VNCR_PIRE0_EL1 0x290
#define VNCR_PIR_EL1 0x2A0
#define VNCR_POR_EL1 0x2A8
@@ -84,6 +85,7 @@
#define VNCR_ICH_HCR_EL2 0x4C0
#define VNCR_ICH_VMCR_EL2 0x4C8
#define VNCR_VDISR_EL2 0x500
+#define VNCR_VSESR_EL2 0x508
#define VNCR_PMBLIMITR_EL1 0x800
#define VNCR_PMBPTR_EL1 0x810
#define VNCR_PMBSR_EL1 0x820
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index a006e38b2684..9ad065f15f1d 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -303,6 +303,7 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = {
};
static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = {
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_DF2_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_GCS),
FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_GCS_SHIFT, 4, 0),
S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MTE_frac_SHIFT, 4, 0),
@@ -502,6 +503,7 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr3[] = {
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_POE),
FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_S1POE_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_S1PIE_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_SCTLRX_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_TCRX_SHIFT, 4, 0),
ARM64_FTR_END,
};
@@ -2330,11 +2332,11 @@ static bool can_use_gic_priorities(const struct arm64_cpu_capabilities *entry,
int scope)
{
/*
- * ARM64_HAS_GIC_CPUIF_SYSREGS has a lower index, and is a boot CPU
+ * ARM64_HAS_GICV3_CPUIF has a lower index, and is a boot CPU
* feature, so will be detected earlier.
*/
- BUILD_BUG_ON(ARM64_HAS_GIC_PRIO_MASKING <= ARM64_HAS_GIC_CPUIF_SYSREGS);
- if (!cpus_have_cap(ARM64_HAS_GIC_CPUIF_SYSREGS))
+ BUILD_BUG_ON(ARM64_HAS_GIC_PRIO_MASKING <= ARM64_HAS_GICV3_CPUIF);
+ if (!cpus_have_cap(ARM64_HAS_GICV3_CPUIF))
return false;
return enable_pseudo_nmi;
@@ -2530,8 +2532,8 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.matches = has_always,
},
{
- .desc = "GIC system register CPU interface",
- .capability = ARM64_HAS_GIC_CPUIF_SYSREGS,
+ .desc = "GICv3 CPU interface",
+ .capability = ARM64_HAS_GICV3_CPUIF,
.type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE,
.matches = has_useable_gicv3_cpuif,
ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, GIC, IMP)
@@ -3115,6 +3117,20 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.matches = has_pmuv3,
},
#endif
+ {
+ .desc = "SCTLR2",
+ .capability = ARM64_HAS_SCTLR2,
+ .type = ARM64_CPUCAP_SYSTEM_FEATURE,
+ .matches = has_cpuid_feature,
+ ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, SCTLRX, IMP)
+ },
+ {
+ .desc = "GICv5 CPU interface",
+ .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE,
+ .capability = ARM64_HAS_GICV5_CPUIF,
+ .matches = has_cpuid_feature,
+ ARM64_CPUID_FIELDS(ID_AA64PFR2_EL1, GCIE, IMP)
+ },
{},
};
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 21a795303568..68cea3a4a35c 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -64,26 +64,18 @@ struct secondary_data secondary_data;
/* Number of CPUs which aren't online, but looping in kernel text. */
static int cpus_stuck_in_kernel;
-enum ipi_msg_type {
- IPI_RESCHEDULE,
- IPI_CALL_FUNC,
- IPI_CPU_STOP,
- IPI_CPU_STOP_NMI,
- IPI_TIMER,
- IPI_IRQ_WORK,
- NR_IPI,
- /*
- * Any enum >= NR_IPI and < MAX_IPI is special and not tracable
- * with trace_ipi_*
- */
- IPI_CPU_BACKTRACE = NR_IPI,
- IPI_KGDB_ROUNDUP,
- MAX_IPI
-};
-
static int ipi_irq_base __ro_after_init;
static int nr_ipi __ro_after_init = NR_IPI;
-static struct irq_desc *ipi_desc[MAX_IPI] __ro_after_init;
+
+struct ipi_descs {
+ struct irq_desc *descs[MAX_IPI];
+};
+
+static DEFINE_PER_CPU_READ_MOSTLY(struct ipi_descs, pcpu_ipi_desc);
+
+#define get_ipi_desc(__cpu, __ipi) (per_cpu_ptr(&pcpu_ipi_desc, __cpu)->descs[__ipi])
+
+static bool percpu_ipi_descs __ro_after_init;
static bool crash_stop;
@@ -844,7 +836,7 @@ int arch_show_interrupts(struct seq_file *p, int prec)
seq_printf(p, "%*s%u:%s", prec - 1, "IPI", i,
prec >= 4 ? " " : "");
for_each_online_cpu(cpu)
- seq_printf(p, "%10u ", irq_desc_kstat_cpu(ipi_desc[i], cpu));
+ seq_printf(p, "%10u ", irq_desc_kstat_cpu(get_ipi_desc(cpu, i), cpu));
seq_printf(p, " %s\n", ipi_types[i]);
}
@@ -917,9 +909,20 @@ static void __noreturn ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs
#endif
}
+static void arm64_send_ipi(const cpumask_t *mask, unsigned int nr)
+{
+ unsigned int cpu;
+
+ if (!percpu_ipi_descs)
+ __ipi_send_mask(get_ipi_desc(0, nr), mask);
+ else
+ for_each_cpu(cpu, mask)
+ __ipi_send_single(get_ipi_desc(cpu, nr), cpu);
+}
+
static void arm64_backtrace_ipi(cpumask_t *mask)
{
- __ipi_send_mask(ipi_desc[IPI_CPU_BACKTRACE], mask);
+ arm64_send_ipi(mask, IPI_CPU_BACKTRACE);
}
void arch_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu)
@@ -944,7 +947,7 @@ void kgdb_roundup_cpus(void)
if (cpu == this_cpu)
continue;
- __ipi_send_single(ipi_desc[IPI_KGDB_ROUNDUP], cpu);
+ __ipi_send_single(get_ipi_desc(cpu, IPI_KGDB_ROUNDUP), cpu);
}
}
#endif
@@ -1013,14 +1016,16 @@ static void do_handle_IPI(int ipinr)
static irqreturn_t ipi_handler(int irq, void *data)
{
- do_handle_IPI(irq - ipi_irq_base);
+ unsigned int ipi = (irq - ipi_irq_base) % nr_ipi;
+
+ do_handle_IPI(ipi);
return IRQ_HANDLED;
}
static void smp_cross_call(const struct cpumask *target, unsigned int ipinr)
{
trace_ipi_raise(target, ipi_types[ipinr]);
- __ipi_send_mask(ipi_desc[ipinr], target);
+ arm64_send_ipi(target, ipinr);
}
static bool ipi_should_be_nmi(enum ipi_msg_type ipi)
@@ -1046,11 +1051,15 @@ static void ipi_setup(int cpu)
return;
for (i = 0; i < nr_ipi; i++) {
- if (ipi_should_be_nmi(i)) {
- prepare_percpu_nmi(ipi_irq_base + i);
- enable_percpu_nmi(ipi_irq_base + i, 0);
+ if (!percpu_ipi_descs) {
+ if (ipi_should_be_nmi(i)) {
+ prepare_percpu_nmi(ipi_irq_base + i);
+ enable_percpu_nmi(ipi_irq_base + i, 0);
+ } else {
+ enable_percpu_irq(ipi_irq_base + i, 0);
+ }
} else {
- enable_percpu_irq(ipi_irq_base + i, 0);
+ enable_irq(irq_desc_get_irq(get_ipi_desc(cpu, i)));
}
}
}
@@ -1064,44 +1073,77 @@ static void ipi_teardown(int cpu)
return;
for (i = 0; i < nr_ipi; i++) {
- if (ipi_should_be_nmi(i)) {
- disable_percpu_nmi(ipi_irq_base + i);
- teardown_percpu_nmi(ipi_irq_base + i);
+ if (!percpu_ipi_descs) {
+ if (ipi_should_be_nmi(i)) {
+ disable_percpu_nmi(ipi_irq_base + i);
+ teardown_percpu_nmi(ipi_irq_base + i);
+ } else {
+ disable_percpu_irq(ipi_irq_base + i);
+ }
} else {
- disable_percpu_irq(ipi_irq_base + i);
+ disable_irq(irq_desc_get_irq(get_ipi_desc(cpu, i)));
}
}
}
#endif
-void __init set_smp_ipi_range(int ipi_base, int n)
+static void ipi_setup_sgi(int ipi)
{
- int i;
+ int err, irq, cpu;
- WARN_ON(n < MAX_IPI);
- nr_ipi = min(n, MAX_IPI);
+ irq = ipi_irq_base + ipi;
- for (i = 0; i < nr_ipi; i++) {
- int err;
+ if (ipi_should_be_nmi(ipi)) {
+ err = request_percpu_nmi(irq, ipi_handler, "IPI", &irq_stat);
+ WARN(err, "Could not request IRQ %d as NMI, err=%d\n", irq, err);
+ } else {
+ err = request_percpu_irq(irq, ipi_handler, "IPI", &irq_stat);
+ WARN(err, "Could not request IRQ %d as IRQ, err=%d\n", irq, err);
+ }
- if (ipi_should_be_nmi(i)) {
- err = request_percpu_nmi(ipi_base + i, ipi_handler,
- "IPI", &irq_stat);
- WARN(err, "Could not request IPI %d as NMI, err=%d\n",
- i, err);
- } else {
- err = request_percpu_irq(ipi_base + i, ipi_handler,
- "IPI", &irq_stat);
- WARN(err, "Could not request IPI %d as IRQ, err=%d\n",
- i, err);
- }
+ for_each_possible_cpu(cpu)
+ get_ipi_desc(cpu, ipi) = irq_to_desc(irq);
+
+ irq_set_status_flags(irq, IRQ_HIDDEN);
+}
+
+static void ipi_setup_lpi(int ipi, int ncpus)
+{
+ for (int cpu = 0; cpu < ncpus; cpu++) {
+ int err, irq;
+
+ irq = ipi_irq_base + (cpu * nr_ipi) + ipi;
+
+ err = irq_force_affinity(irq, cpumask_of(cpu));
+ WARN(err, "Could not force affinity IRQ %d, err=%d\n", irq, err);
+
+ err = request_irq(irq, ipi_handler, IRQF_NO_AUTOEN, "IPI",
+ NULL);
+ WARN(err, "Could not request IRQ %d, err=%d\n", irq, err);
+
+ irq_set_status_flags(irq, (IRQ_HIDDEN | IRQ_NO_BALANCING_MASK));
- ipi_desc[i] = irq_to_desc(ipi_base + i);
- irq_set_status_flags(ipi_base + i, IRQ_HIDDEN);
+ get_ipi_desc(cpu, ipi) = irq_to_desc(irq);
}
+}
+
+void __init set_smp_ipi_range_percpu(int ipi_base, int n, int ncpus)
+{
+ int i;
+
+ WARN_ON(n < MAX_IPI);
+ nr_ipi = min(n, MAX_IPI);
+ percpu_ipi_descs = !!ncpus;
ipi_irq_base = ipi_base;
+ for (i = 0; i < nr_ipi; i++) {
+ if (!percpu_ipi_descs)
+ ipi_setup_sgi(i);
+ else
+ ipi_setup_lpi(i, ncpus);
+ }
+
/* Setup the boot CPU immediately */
ipi_setup(smp_processor_id());
}
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 7c329e01c557..3ebc0570345c 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -23,7 +23,8 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \
vgic/vgic-v3.o vgic/vgic-v4.o \
vgic/vgic-mmio.o vgic/vgic-mmio-v2.o \
vgic/vgic-mmio-v3.o vgic/vgic-kvm-device.o \
- vgic/vgic-its.o vgic/vgic-debug.o vgic/vgic-v3-nested.o
+ vgic/vgic-its.o vgic/vgic-debug.o vgic/vgic-v3-nested.o \
+ vgic/vgic-v5.o
kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o pmu.o
kvm-$(CONFIG_ARM64_PTR_AUTH) += pauth.o
diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index 701ea10a63f1..dbd74e4885e2 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -830,7 +830,7 @@ static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map)
* by the guest (either FEAT_VHE or FEAT_E2H0 is implemented, but
* not both). This simplifies the handling of the EL1NV* bits.
*/
- if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) {
+ if (is_nested_ctxt(vcpu)) {
u64 val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2);
/* Use the VHE format for mental sanity */
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 23dd3f3fc3eb..888f7c7abf54 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -408,6 +408,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES:
r = BIT(0);
break;
+ case KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED:
+ if (!kvm)
+ r = -EINVAL;
+ else
+ r = kvm_supports_cacheable_pfnmap();
+ break;
+
default:
r = 0;
}
@@ -521,7 +528,7 @@ static void vcpu_set_pauth_traps(struct kvm_vcpu *vcpu)
* Either we're running an L2 guest, and the API/APK bits come
* from L1's HCR_EL2, or API/APK are both set.
*/
- if (unlikely(vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu))) {
+ if (unlikely(is_nested_ctxt(vcpu))) {
u64 val;
val = __vcpu_sys_reg(vcpu, HCR_EL2);
@@ -740,7 +747,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
*/
int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
{
- bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF);
+ bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF | HCR_VSE);
+
return ((irq_lines || kvm_vgic_vcpu_pending_irq(v))
&& !kvm_arm_vcpu_stopped(v) && !v->arch.pause);
}
@@ -1183,6 +1191,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
*/
preempt_disable();
+ kvm_nested_flush_hwstate(vcpu);
+
if (kvm_vcpu_has_pmu(vcpu))
kvm_pmu_flush_hwstate(vcpu);
@@ -1282,6 +1292,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
/* Exit types that need handling before we can be preempted */
handle_exit_early(vcpu, ret);
+ kvm_nested_sync_hwstate(vcpu);
+
preempt_enable();
/*
@@ -2765,19 +2777,15 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq);
}
-bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
- struct kvm_kernel_irq_routing_entry *new)
+void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
+ struct kvm_kernel_irq_routing_entry *old,
+ struct kvm_kernel_irq_routing_entry *new)
{
- if (old->type != KVM_IRQ_ROUTING_MSI ||
- new->type != KVM_IRQ_ROUTING_MSI)
- return true;
-
- return memcmp(&old->msi, &new->msi, sizeof(new->msi));
-}
+ if (old->type == KVM_IRQ_ROUTING_MSI &&
+ new->type == KVM_IRQ_ROUTING_MSI &&
+ !memcmp(&old->msi, &new->msi, sizeof(new->msi)))
+ return;
-int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set)
-{
/*
* Remapping the vLPI requires taking the its_lock mutex to resolve
* the new translation. We're in spinlock land at this point, so no
@@ -2785,7 +2793,7 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
*
* Unmap the vLPI and fall back to software LPI injection.
*/
- return kvm_vgic_v4_unset_forwarding(kvm, host_irq);
+ return kvm_vgic_v4_unset_forwarding(irqfd->kvm, irqfd->producer->irq);
}
void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons)
diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
index a25be111cd8f..0e5610533949 100644
--- a/arch/arm64/kvm/at.c
+++ b/arch/arm64/kvm/at.c
@@ -1047,34 +1047,51 @@ static void compute_s1_overlay_permissions(struct kvm_vcpu *vcpu,
idx = FIELD_GET(PTE_PO_IDX_MASK, wr->desc);
- switch (wi->regime) {
- case TR_EL10:
- pov_perms = perm_idx(vcpu, POR_EL1, idx);
- uov_perms = perm_idx(vcpu, POR_EL0, idx);
- break;
- case TR_EL20:
- pov_perms = perm_idx(vcpu, POR_EL2, idx);
- uov_perms = perm_idx(vcpu, POR_EL0, idx);
- break;
- case TR_EL2:
- pov_perms = perm_idx(vcpu, POR_EL2, idx);
- uov_perms = 0;
- break;
- }
+ if (wr->pov) {
+ switch (wi->regime) {
+ case TR_EL10:
+ pov_perms = perm_idx(vcpu, POR_EL1, idx);
+ break;
+ case TR_EL20:
+ pov_perms = perm_idx(vcpu, POR_EL2, idx);
+ break;
+ case TR_EL2:
+ pov_perms = perm_idx(vcpu, POR_EL2, idx);
+ break;
+ }
+
+ if (pov_perms & ~POE_RWX)
+ pov_perms = POE_NONE;
- if (pov_perms & ~POE_RWX)
- pov_perms = POE_NONE;
+ /* R_QXXPC, S1PrivOverflow enabled */
+ if (wr->pwxn && (pov_perms & POE_X))
+ pov_perms &= ~POE_W;
- if (wi->poe && wr->pov) {
wr->pr &= pov_perms & POE_R;
wr->pw &= pov_perms & POE_W;
wr->px &= pov_perms & POE_X;
}
- if (uov_perms & ~POE_RWX)
- uov_perms = POE_NONE;
+ if (wr->uov) {
+ switch (wi->regime) {
+ case TR_EL10:
+ uov_perms = perm_idx(vcpu, POR_EL0, idx);
+ break;
+ case TR_EL20:
+ uov_perms = perm_idx(vcpu, POR_EL0, idx);
+ break;
+ case TR_EL2:
+ uov_perms = 0;
+ break;
+ }
+
+ if (uov_perms & ~POE_RWX)
+ uov_perms = POE_NONE;
+
+ /* R_NPBXC, S1UnprivOverlay enabled */
+ if (wr->uwxn && (uov_perms & POE_X))
+ uov_perms &= ~POE_W;
- if (wi->e0poe && wr->uov) {
wr->ur &= uov_perms & POE_R;
wr->uw &= uov_perms & POE_W;
wr->ux &= uov_perms & POE_X;
@@ -1095,24 +1112,15 @@ static void compute_s1_permissions(struct kvm_vcpu *vcpu,
if (!wi->hpd)
compute_s1_hierarchical_permissions(vcpu, wi, wr);
- if (wi->poe || wi->e0poe)
- compute_s1_overlay_permissions(vcpu, wi, wr);
+ compute_s1_overlay_permissions(vcpu, wi, wr);
- /* R_QXXPC */
- if (wr->pwxn) {
- if (!wr->pov && wr->pw)
- wr->px = false;
- if (wr->pov && wr->px)
- wr->pw = false;
- }
+ /* R_QXXPC, S1PrivOverlay disabled */
+ if (!wr->pov)
+ wr->px &= !(wr->pwxn && wr->pw);
- /* R_NPBXC */
- if (wr->uwxn) {
- if (!wr->uov && wr->uw)
- wr->ux = false;
- if (wr->uov && wr->ux)
- wr->uw = false;
- }
+ /* R_NPBXC, S1UnprivOverlay disabled */
+ if (!wr->uov)
+ wr->ux &= !(wr->uwxn && wr->uw);
pan = wi->pan && (wr->ur || wr->uw ||
(pan3_enabled(vcpu, wi->regime) && wr->ux));
diff --git a/arch/arm64/kvm/config.c b/arch/arm64/kvm/config.c
index 54911a93b001..da66c4a14775 100644
--- a/arch/arm64/kvm/config.c
+++ b/arch/arm64/kvm/config.c
@@ -66,7 +66,6 @@ struct reg_bits_to_feat_map {
#define FEAT_BRBE ID_AA64DFR0_EL1, BRBE, IMP
#define FEAT_TRC_SR ID_AA64DFR0_EL1, TraceVer, IMP
#define FEAT_PMUv3 ID_AA64DFR0_EL1, PMUVer, IMP
-#define FEAT_PMUv3p9 ID_AA64DFR0_EL1, PMUVer, V3P9
#define FEAT_TRBE ID_AA64DFR0_EL1, TraceBuffer, IMP
#define FEAT_TRBEv1p1 ID_AA64DFR0_EL1, TraceBuffer, TRBE_V1P1
#define FEAT_DoubleLock ID_AA64DFR0_EL1, DoubleLock, IMP
@@ -89,6 +88,7 @@ struct reg_bits_to_feat_map {
#define FEAT_RASv2 ID_AA64PFR0_EL1, RAS, V2
#define FEAT_GICv3 ID_AA64PFR0_EL1, GIC, IMP
#define FEAT_LOR ID_AA64MMFR1_EL1, LO, IMP
+#define FEAT_SPEv1p2 ID_AA64DFR0_EL1, PMSVer, V1P2
#define FEAT_SPEv1p4 ID_AA64DFR0_EL1, PMSVer, V1P4
#define FEAT_SPEv1p5 ID_AA64DFR0_EL1, PMSVer, V1P5
#define FEAT_ATS1A ID_AA64ISAR2_EL1, ATS1A, IMP
@@ -131,6 +131,27 @@ struct reg_bits_to_feat_map {
#define FEAT_SPMU ID_AA64DFR1_EL1, SPMU, IMP
#define FEAT_SPE_nVM ID_AA64DFR2_EL1, SPE_nVM, IMP
#define FEAT_STEP2 ID_AA64DFR2_EL1, STEP, IMP
+#define FEAT_SYSREG128 ID_AA64ISAR2_EL1, SYSREG_128, IMP
+#define FEAT_CPA2 ID_AA64ISAR3_EL1, CPA, CPA2
+#define FEAT_ASID2 ID_AA64MMFR4_EL1, ASID2, IMP
+#define FEAT_MEC ID_AA64MMFR3_EL1, MEC, IMP
+#define FEAT_HAFT ID_AA64MMFR1_EL1, HAFDBS, HAFT
+#define FEAT_BTI ID_AA64PFR1_EL1, BT, IMP
+#define FEAT_ExS ID_AA64MMFR0_EL1, EXS, IMP
+#define FEAT_IESB ID_AA64MMFR2_EL1, IESB, IMP
+#define FEAT_LSE2 ID_AA64MMFR2_EL1, AT, IMP
+#define FEAT_LSMAOC ID_AA64MMFR2_EL1, LSM, IMP
+#define FEAT_MixedEnd ID_AA64MMFR0_EL1, BIGEND, IMP
+#define FEAT_MixedEndEL0 ID_AA64MMFR0_EL1, BIGENDEL0, IMP
+#define FEAT_MTE2 ID_AA64PFR1_EL1, MTE, MTE2
+#define FEAT_MTE_ASYNC ID_AA64PFR1_EL1, MTE_frac, ASYNC
+#define FEAT_MTE_STORE_ONLY ID_AA64PFR2_EL1, MTESTOREONLY, IMP
+#define FEAT_PAN ID_AA64MMFR1_EL1, PAN, IMP
+#define FEAT_PAN3 ID_AA64MMFR1_EL1, PAN, PAN3
+#define FEAT_SSBS ID_AA64PFR1_EL1, SSBS, IMP
+#define FEAT_TIDCP1 ID_AA64MMFR1_EL1, TIDCP1, IMP
+#define FEAT_FGT ID_AA64MMFR0_EL1, FGT, IMP
+#define FEAT_MTPMU ID_AA64DFR0_EL1, MTPMU, IMP
static bool not_feat_aa64el3(struct kvm *kvm)
{
@@ -218,11 +239,62 @@ static bool feat_trbe_mpam(struct kvm *kvm)
(read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_EL1_MPAM));
}
+static bool feat_asid2_e2h1(struct kvm *kvm)
+{
+ return kvm_has_feat(kvm, FEAT_ASID2) && !kvm_has_feat(kvm, FEAT_E2H0);
+}
+
+static bool feat_d128_e2h1(struct kvm *kvm)
+{
+ return kvm_has_feat(kvm, FEAT_D128) && !kvm_has_feat(kvm, FEAT_E2H0);
+}
+
+static bool feat_mec_e2h1(struct kvm *kvm)
+{
+ return kvm_has_feat(kvm, FEAT_MEC) && !kvm_has_feat(kvm, FEAT_E2H0);
+}
+
static bool feat_ebep_pmuv3_ss(struct kvm *kvm)
{
return kvm_has_feat(kvm, FEAT_EBEP) || kvm_has_feat(kvm, FEAT_PMUv3_SS);
}
+static bool feat_mixedendel0(struct kvm *kvm)
+{
+ return kvm_has_feat(kvm, FEAT_MixedEnd) || kvm_has_feat(kvm, FEAT_MixedEndEL0);
+}
+
+static bool feat_mte_async(struct kvm *kvm)
+{
+ return kvm_has_feat(kvm, FEAT_MTE2) && kvm_has_feat_enum(kvm, FEAT_MTE_ASYNC);
+}
+
+#define check_pmu_revision(k, r) \
+ ({ \
+ (kvm_has_feat((k), ID_AA64DFR0_EL1, PMUVer, r) && \
+ !kvm_has_feat((k), ID_AA64DFR0_EL1, PMUVer, IMP_DEF)); \
+ })
+
+static bool feat_pmuv3p1(struct kvm *kvm)
+{
+ return check_pmu_revision(kvm, V3P1);
+}
+
+static bool feat_pmuv3p5(struct kvm *kvm)
+{
+ return check_pmu_revision(kvm, V3P5);
+}
+
+static bool feat_pmuv3p7(struct kvm *kvm)
+{
+ return check_pmu_revision(kvm, V3P7);
+}
+
+static bool feat_pmuv3p9(struct kvm *kvm)
+{
+ return check_pmu_revision(kvm, V3P9);
+}
+
static bool compute_hcr_rw(struct kvm *kvm, u64 *bits)
{
/* This is purely academic: AArch32 and NV are mutually exclusive */
@@ -681,7 +753,7 @@ static const struct reg_bits_to_feat_map hdfgrtr2_feat_map[] = {
NEEDS_FEAT(HDFGRTR2_EL2_nPMICFILTR_EL0 |
HDFGRTR2_EL2_nPMICNTR_EL0,
FEAT_PMUv3_ICNTR),
- NEEDS_FEAT(HDFGRTR2_EL2_nPMUACR_EL1, FEAT_PMUv3p9),
+ NEEDS_FEAT(HDFGRTR2_EL2_nPMUACR_EL1, feat_pmuv3p9),
NEEDS_FEAT(HDFGRTR2_EL2_nPMSSCR_EL1 |
HDFGRTR2_EL2_nPMSSDATA,
FEAT_PMUv3_SS),
@@ -713,7 +785,7 @@ static const struct reg_bits_to_feat_map hdfgwtr2_feat_map[] = {
FEAT_PMUv3_ICNTR),
NEEDS_FEAT(HDFGWTR2_EL2_nPMUACR_EL1 |
HDFGWTR2_EL2_nPMZR_EL0,
- FEAT_PMUv3p9),
+ feat_pmuv3p9),
NEEDS_FEAT(HDFGWTR2_EL2_nPMSSCR_EL1, FEAT_PMUv3_SS),
NEEDS_FEAT(HDFGWTR2_EL2_nPMIAR_EL1, FEAT_SEBEP),
NEEDS_FEAT(HDFGWTR2_EL2_nPMSDSFR_EL1, feat_spe_fds),
@@ -832,6 +904,150 @@ static const struct reg_bits_to_feat_map hcr_feat_map[] = {
NEEDS_FEAT_FIXED(HCR_EL2_E2H, compute_hcr_e2h),
};
+static const struct reg_bits_to_feat_map sctlr2_feat_map[] = {
+ NEEDS_FEAT(SCTLR2_EL1_NMEA |
+ SCTLR2_EL1_EASE,
+ FEAT_DoubleFault2),
+ NEEDS_FEAT(SCTLR2_EL1_EnADERR, feat_aderr),
+ NEEDS_FEAT(SCTLR2_EL1_EnANERR, feat_anerr),
+ NEEDS_FEAT(SCTLR2_EL1_EnIDCP128, FEAT_SYSREG128),
+ NEEDS_FEAT(SCTLR2_EL1_EnPACM |
+ SCTLR2_EL1_EnPACM0,
+ feat_pauth_lr),
+ NEEDS_FEAT(SCTLR2_EL1_CPTA |
+ SCTLR2_EL1_CPTA0 |
+ SCTLR2_EL1_CPTM |
+ SCTLR2_EL1_CPTM0,
+ FEAT_CPA2),
+};
+
+static const struct reg_bits_to_feat_map tcr2_el2_feat_map[] = {
+ NEEDS_FEAT(TCR2_EL2_FNG1 |
+ TCR2_EL2_FNG0 |
+ TCR2_EL2_A2,
+ feat_asid2_e2h1),
+ NEEDS_FEAT(TCR2_EL2_DisCH1 |
+ TCR2_EL2_DisCH0 |
+ TCR2_EL2_D128,
+ feat_d128_e2h1),
+ NEEDS_FEAT(TCR2_EL2_AMEC1, feat_mec_e2h1),
+ NEEDS_FEAT(TCR2_EL2_AMEC0, FEAT_MEC),
+ NEEDS_FEAT(TCR2_EL2_HAFT, FEAT_HAFT),
+ NEEDS_FEAT(TCR2_EL2_PTTWI |
+ TCR2_EL2_PnCH,
+ FEAT_THE),
+ NEEDS_FEAT(TCR2_EL2_AIE, FEAT_AIE),
+ NEEDS_FEAT(TCR2_EL2_POE |
+ TCR2_EL2_E0POE,
+ FEAT_S1POE),
+ NEEDS_FEAT(TCR2_EL2_PIE, FEAT_S1PIE),
+};
+
+static const struct reg_bits_to_feat_map sctlr_el1_feat_map[] = {
+ NEEDS_FEAT(SCTLR_EL1_CP15BEN |
+ SCTLR_EL1_ITD |
+ SCTLR_EL1_SED,
+ FEAT_AA32EL0),
+ NEEDS_FEAT(SCTLR_EL1_BT0 |
+ SCTLR_EL1_BT1,
+ FEAT_BTI),
+ NEEDS_FEAT(SCTLR_EL1_CMOW, FEAT_CMOW),
+ NEEDS_FEAT(SCTLR_EL1_TSCXT, feat_csv2_2_csv2_1p2),
+ NEEDS_FEAT(SCTLR_EL1_EIS |
+ SCTLR_EL1_EOS,
+ FEAT_ExS),
+ NEEDS_FEAT(SCTLR_EL1_EnFPM, FEAT_FPMR),
+ NEEDS_FEAT(SCTLR_EL1_IESB, FEAT_IESB),
+ NEEDS_FEAT(SCTLR_EL1_EnALS, FEAT_LS64),
+ NEEDS_FEAT(SCTLR_EL1_EnAS0, FEAT_LS64_ACCDATA),
+ NEEDS_FEAT(SCTLR_EL1_EnASR, FEAT_LS64_V),
+ NEEDS_FEAT(SCTLR_EL1_nAA, FEAT_LSE2),
+ NEEDS_FEAT(SCTLR_EL1_LSMAOE |
+ SCTLR_EL1_nTLSMD,
+ FEAT_LSMAOC),
+ NEEDS_FEAT(SCTLR_EL1_EE, FEAT_MixedEnd),
+ NEEDS_FEAT(SCTLR_EL1_E0E, feat_mixedendel0),
+ NEEDS_FEAT(SCTLR_EL1_MSCEn, FEAT_MOPS),
+ NEEDS_FEAT(SCTLR_EL1_ATA0 |
+ SCTLR_EL1_ATA |
+ SCTLR_EL1_TCF0 |
+ SCTLR_EL1_TCF,
+ FEAT_MTE2),
+ NEEDS_FEAT(SCTLR_EL1_ITFSB, feat_mte_async),
+ NEEDS_FEAT(SCTLR_EL1_TCSO0 |
+ SCTLR_EL1_TCSO,
+ FEAT_MTE_STORE_ONLY),
+ NEEDS_FEAT(SCTLR_EL1_NMI |
+ SCTLR_EL1_SPINTMASK,
+ FEAT_NMI),
+ NEEDS_FEAT(SCTLR_EL1_SPAN, FEAT_PAN),
+ NEEDS_FEAT(SCTLR_EL1_EPAN, FEAT_PAN3),
+ NEEDS_FEAT(SCTLR_EL1_EnDA |
+ SCTLR_EL1_EnDB |
+ SCTLR_EL1_EnIA |
+ SCTLR_EL1_EnIB,
+ feat_pauth),
+ NEEDS_FEAT(SCTLR_EL1_EnTP2, FEAT_SME),
+ NEEDS_FEAT(SCTLR_EL1_EnRCTX, FEAT_SPECRES),
+ NEEDS_FEAT(SCTLR_EL1_DSSBS, FEAT_SSBS),
+ NEEDS_FEAT(SCTLR_EL1_TIDCP, FEAT_TIDCP1),
+ NEEDS_FEAT(SCTLR_EL1_TME0 |
+ SCTLR_EL1_TME |
+ SCTLR_EL1_TMT0 |
+ SCTLR_EL1_TMT,
+ FEAT_TME),
+ NEEDS_FEAT(SCTLR_EL1_TWEDEL |
+ SCTLR_EL1_TWEDEn,
+ FEAT_TWED),
+ NEEDS_FEAT(SCTLR_EL1_UCI |
+ SCTLR_EL1_EE |
+ SCTLR_EL1_E0E |
+ SCTLR_EL1_WXN |
+ SCTLR_EL1_nTWE |
+ SCTLR_EL1_nTWI |
+ SCTLR_EL1_UCT |
+ SCTLR_EL1_DZE |
+ SCTLR_EL1_I |
+ SCTLR_EL1_UMA |
+ SCTLR_EL1_SA0 |
+ SCTLR_EL1_SA |
+ SCTLR_EL1_C |
+ SCTLR_EL1_A |
+ SCTLR_EL1_M,
+ FEAT_AA64EL1),
+};
+
+static const struct reg_bits_to_feat_map mdcr_el2_feat_map[] = {
+ NEEDS_FEAT(MDCR_EL2_EBWE, FEAT_Debugv8p9),
+ NEEDS_FEAT(MDCR_EL2_TDOSA, FEAT_DoubleLock),
+ NEEDS_FEAT(MDCR_EL2_PMEE, FEAT_EBEP),
+ NEEDS_FEAT(MDCR_EL2_TDCC, FEAT_FGT),
+ NEEDS_FEAT(MDCR_EL2_MTPME, FEAT_MTPMU),
+ NEEDS_FEAT(MDCR_EL2_HPME |
+ MDCR_EL2_HPMN |
+ MDCR_EL2_TPMCR |
+ MDCR_EL2_TPM,
+ FEAT_PMUv3),
+ NEEDS_FEAT(MDCR_EL2_HPMD, feat_pmuv3p1),
+ NEEDS_FEAT(MDCR_EL2_HCCD |
+ MDCR_EL2_HLP,
+ feat_pmuv3p5),
+ NEEDS_FEAT(MDCR_EL2_HPMFZO, feat_pmuv3p7),
+ NEEDS_FEAT(MDCR_EL2_PMSSE, FEAT_PMUv3_SS),
+ NEEDS_FEAT(MDCR_EL2_E2PB |
+ MDCR_EL2_TPMS,
+ FEAT_SPE),
+ NEEDS_FEAT(MDCR_EL2_HPMFZS, FEAT_SPEv1p2),
+ NEEDS_FEAT(MDCR_EL2_EnSPM, FEAT_SPMU),
+ NEEDS_FEAT(MDCR_EL2_EnSTEPOP, FEAT_STEP2),
+ NEEDS_FEAT(MDCR_EL2_E2TB, FEAT_TRBE),
+ NEEDS_FEAT(MDCR_EL2_TTRF, FEAT_TRF),
+ NEEDS_FEAT(MDCR_EL2_TDA |
+ MDCR_EL2_TDE |
+ MDCR_EL2_TDRA,
+ FEAT_AA64EL1),
+};
+
static void __init check_feat_map(const struct reg_bits_to_feat_map *map,
int map_size, u64 res0, const char *str)
{
@@ -863,6 +1079,14 @@ void __init check_feature_map(void)
__HCRX_EL2_RES0, "HCRX_EL2");
check_feat_map(hcr_feat_map, ARRAY_SIZE(hcr_feat_map),
HCR_EL2_RES0, "HCR_EL2");
+ check_feat_map(sctlr2_feat_map, ARRAY_SIZE(sctlr2_feat_map),
+ SCTLR2_EL1_RES0, "SCTLR2_EL1");
+ check_feat_map(tcr2_el2_feat_map, ARRAY_SIZE(tcr2_el2_feat_map),
+ TCR2_EL2_RES0, "TCR2_EL2");
+ check_feat_map(sctlr_el1_feat_map, ARRAY_SIZE(sctlr_el1_feat_map),
+ SCTLR_EL1_RES0, "SCTLR_EL1");
+ check_feat_map(mdcr_el2_feat_map, ARRAY_SIZE(mdcr_el2_feat_map),
+ MDCR_EL2_RES0, "MDCR_EL2");
}
static bool idreg_feat_match(struct kvm *kvm, const struct reg_bits_to_feat_map *map)
@@ -1077,6 +1301,31 @@ void get_reg_fixed_bits(struct kvm *kvm, enum vcpu_sysreg reg, u64 *res0, u64 *r
*res0 |= HCR_EL2_RES0 | (mask & ~fixed);
*res1 = HCR_EL2_RES1 | (mask & fixed);
break;
+ case SCTLR2_EL1:
+ case SCTLR2_EL2:
+ *res0 = compute_res0_bits(kvm, sctlr2_feat_map,
+ ARRAY_SIZE(sctlr2_feat_map), 0, 0);
+ *res0 |= SCTLR2_EL1_RES0;
+ *res1 = SCTLR2_EL1_RES1;
+ break;
+ case TCR2_EL2:
+ *res0 = compute_res0_bits(kvm, tcr2_el2_feat_map,
+ ARRAY_SIZE(tcr2_el2_feat_map), 0, 0);
+ *res0 |= TCR2_EL2_RES0;
+ *res1 = TCR2_EL2_RES1;
+ break;
+ case SCTLR_EL1:
+ *res0 = compute_res0_bits(kvm, sctlr_el1_feat_map,
+ ARRAY_SIZE(sctlr_el1_feat_map), 0, 0);
+ *res0 |= SCTLR_EL1_RES0;
+ *res1 = SCTLR_EL1_RES1;
+ break;
+ case MDCR_EL2:
+ *res0 = compute_res0_bits(kvm, mdcr_el2_feat_map,
+ ARRAY_SIZE(mdcr_el2_feat_map), 0, 0);
+ *res0 |= MDCR_EL2_RES0;
+ *res1 = MDCR_EL2_RES1;
+ break;
default:
WARN_ON_ONCE(1);
*res0 = *res1 = 0;
diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c
index 3a384e9660b8..90cb4b7ae0ff 100644
--- a/arch/arm64/kvm/emulate-nested.c
+++ b/arch/arm64/kvm/emulate-nested.c
@@ -88,6 +88,7 @@ enum cgt_group_id {
CGT_HCRX_EnFPM,
CGT_HCRX_TCR2En,
+ CGT_HCRX_SCTLR2En,
CGT_CNTHCTL_EL1TVT,
CGT_CNTHCTL_EL1TVCT,
@@ -108,6 +109,7 @@ enum cgt_group_id {
CGT_HCR_TTLB_TTLBOS,
CGT_HCR_TVM_TRVM,
CGT_HCR_TVM_TRVM_HCRX_TCR2En,
+ CGT_HCR_TVM_TRVM_HCRX_SCTLR2En,
CGT_HCR_TPU_TICAB,
CGT_HCR_TPU_TOCU,
CGT_HCR_NV1_nNV2_ENSCXT,
@@ -398,6 +400,12 @@ static const struct trap_bits coarse_trap_bits[] = {
.mask = HCRX_EL2_TCR2En,
.behaviour = BEHAVE_FORWARD_RW,
},
+ [CGT_HCRX_SCTLR2En] = {
+ .index = HCRX_EL2,
+ .value = 0,
+ .mask = HCRX_EL2_SCTLR2En,
+ .behaviour = BEHAVE_FORWARD_RW,
+ },
[CGT_CNTHCTL_EL1TVT] = {
.index = CNTHCTL_EL2,
.value = CNTHCTL_EL1TVT,
@@ -449,6 +457,8 @@ static const enum cgt_group_id *coarse_control_combo[] = {
MCB(CGT_HCR_TVM_TRVM, CGT_HCR_TVM, CGT_HCR_TRVM),
MCB(CGT_HCR_TVM_TRVM_HCRX_TCR2En,
CGT_HCR_TVM, CGT_HCR_TRVM, CGT_HCRX_TCR2En),
+ MCB(CGT_HCR_TVM_TRVM_HCRX_SCTLR2En,
+ CGT_HCR_TVM, CGT_HCR_TRVM, CGT_HCRX_SCTLR2En),
MCB(CGT_HCR_TPU_TICAB, CGT_HCR_TPU, CGT_HCR_TICAB),
MCB(CGT_HCR_TPU_TOCU, CGT_HCR_TPU, CGT_HCR_TOCU),
MCB(CGT_HCR_NV1_nNV2_ENSCXT, CGT_HCR_NV1_nNV2, CGT_HCR_ENSCXT),
@@ -782,6 +792,7 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = {
SR_TRAP(OP_TLBI_RVALE1OSNXS, CGT_HCR_TTLB_TTLBOS),
SR_TRAP(OP_TLBI_RVAALE1OSNXS, CGT_HCR_TTLB_TTLBOS),
SR_TRAP(SYS_SCTLR_EL1, CGT_HCR_TVM_TRVM),
+ SR_TRAP(SYS_SCTLR2_EL1, CGT_HCR_TVM_TRVM_HCRX_SCTLR2En),
SR_TRAP(SYS_TTBR0_EL1, CGT_HCR_TVM_TRVM),
SR_TRAP(SYS_TTBR1_EL1, CGT_HCR_TVM_TRVM),
SR_TRAP(SYS_TCR_EL1, CGT_HCR_TVM_TRVM),
@@ -1354,6 +1365,7 @@ static const struct encoding_to_trap_config encoding_to_fgt[] __initconst = {
SR_FGT(SYS_SCXTNUM_EL0, HFGRTR, SCXTNUM_EL0, 1),
SR_FGT(SYS_SCXTNUM_EL1, HFGRTR, SCXTNUM_EL1, 1),
SR_FGT(SYS_SCTLR_EL1, HFGRTR, SCTLR_EL1, 1),
+ SR_FGT(SYS_SCTLR2_EL1, HFGRTR, SCTLR_EL1, 1),
SR_FGT(SYS_REVIDR_EL1, HFGRTR, REVIDR_EL1, 1),
SR_FGT(SYS_PAR_EL1, HFGRTR, PAR_EL1, 1),
SR_FGT(SYS_MPIDR_EL1, HFGRTR, MPIDR_EL1, 1),
@@ -2592,13 +2604,8 @@ inject:
static bool __forward_traps(struct kvm_vcpu *vcpu, unsigned int reg, u64 control_bit)
{
- bool control_bit_set;
-
- if (!vcpu_has_nv(vcpu))
- return false;
-
- control_bit_set = __vcpu_sys_reg(vcpu, reg) & control_bit;
- if (!is_hyp_ctxt(vcpu) && control_bit_set) {
+ if (is_nested_ctxt(vcpu) &&
+ (__vcpu_sys_reg(vcpu, reg) & control_bit)) {
kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));
return true;
}
@@ -2719,6 +2726,9 @@ static void kvm_inject_el2_exception(struct kvm_vcpu *vcpu, u64 esr_el2,
case except_type_irq:
kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_IRQ);
break;
+ case except_type_serror:
+ kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SERR);
+ break;
default:
WARN_ONCE(1, "Unsupported EL2 exception injection %d\n", type);
}
@@ -2816,3 +2826,28 @@ int kvm_inject_nested_irq(struct kvm_vcpu *vcpu)
/* esr_el2 value doesn't matter for exits due to irqs. */
return kvm_inject_nested(vcpu, 0, except_type_irq);
}
+
+int kvm_inject_nested_sea(struct kvm_vcpu *vcpu, bool iabt, u64 addr)
+{
+ u64 esr = FIELD_PREP(ESR_ELx_EC_MASK,
+ iabt ? ESR_ELx_EC_IABT_LOW : ESR_ELx_EC_DABT_LOW);
+ esr |= ESR_ELx_FSC_EXTABT | ESR_ELx_IL;
+
+ vcpu_write_sys_reg(vcpu, FAR_EL2, addr);
+
+ if (__vcpu_sys_reg(vcpu, SCTLR2_EL2) & SCTLR2_EL1_EASE)
+ return kvm_inject_nested(vcpu, esr, except_type_serror);
+
+ return kvm_inject_nested_sync(vcpu, esr);
+}
+
+int kvm_inject_nested_serror(struct kvm_vcpu *vcpu, u64 esr)
+{
+ /*
+ * Hardware sets up the EC field when propagating ESR as a result of
+ * vSError injection. Manually populate EC for an emulated SError
+ * exception.
+ */
+ esr |= FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_SERROR);
+ return kvm_inject_nested(vcpu, esr, except_type_serror);
+}
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 2196979a24a3..16ba5e9ac86c 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -818,8 +818,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
{
- events->exception.serror_pending = !!(vcpu->arch.hcr_el2 & HCR_VSE);
events->exception.serror_has_esr = cpus_have_final_cap(ARM64_HAS_RAS_EXTN);
+ events->exception.serror_pending = (vcpu->arch.hcr_el2 & HCR_VSE) ||
+ vcpu_get_flag(vcpu, NESTED_SERROR_PENDING);
if (events->exception.serror_pending && events->exception.serror_has_esr)
events->exception.serror_esr = vcpu_get_vsesr(vcpu);
@@ -833,29 +834,62 @@ int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
return 0;
}
+static void commit_pending_events(struct kvm_vcpu *vcpu)
+{
+ if (!vcpu_get_flag(vcpu, PENDING_EXCEPTION))
+ return;
+
+ /*
+ * Reset the MMIO emulation state to avoid stepping PC after emulating
+ * the exception entry.
+ */
+ vcpu->mmio_needed = false;
+ kvm_call_hyp(__kvm_adjust_pc, vcpu);
+}
+
int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
{
bool serror_pending = events->exception.serror_pending;
bool has_esr = events->exception.serror_has_esr;
bool ext_dabt_pending = events->exception.ext_dabt_pending;
+ u64 esr = events->exception.serror_esr;
+ int ret = 0;
- if (serror_pending && has_esr) {
- if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
- return -EINVAL;
-
- if (!((events->exception.serror_esr) & ~ESR_ELx_ISS_MASK))
- kvm_set_sei_esr(vcpu, events->exception.serror_esr);
- else
- return -EINVAL;
- } else if (serror_pending) {
- kvm_inject_vabt(vcpu);
+ /*
+ * Immediately commit the pending SEA to the vCPU's architectural
+ * state which is necessary since we do not return a pending SEA
+ * to userspace via KVM_GET_VCPU_EVENTS.
+ */
+ if (ext_dabt_pending) {
+ ret = kvm_inject_sea_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
+ commit_pending_events(vcpu);
}
- if (ext_dabt_pending)
- kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
+ if (ret < 0)
+ return ret;
- return 0;
+ if (!serror_pending)
+ return 0;
+
+ if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN) && has_esr)
+ return -EINVAL;
+
+ if (has_esr && (esr & ~ESR_ELx_ISS_MASK))
+ return -EINVAL;
+
+ if (has_esr)
+ ret = kvm_inject_serror_esr(vcpu, esr);
+ else
+ ret = kvm_inject_serror(vcpu);
+
+ /*
+ * We could've decided that the SError is due for immediate software
+ * injection; commit the exception in case userspace decides it wants
+ * to inject more exceptions for some strange reason.
+ */
+ commit_pending_events(vcpu);
+ return (ret < 0) ? ret : 0;
}
u32 __attribute_const__ kvm_target_cpu(void)
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index 453266c96481..a598072f36d2 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -32,7 +32,7 @@ typedef int (*exit_handle_fn)(struct kvm_vcpu *);
static void kvm_handle_guest_serror(struct kvm_vcpu *vcpu, u64 esr)
{
if (!arm64_is_ras_serror(esr) || arm64_is_fatal_ras_serror(NULL, esr))
- kvm_inject_vabt(vcpu);
+ kvm_inject_serror(vcpu);
}
static int handle_hvc(struct kvm_vcpu *vcpu)
@@ -252,7 +252,7 @@ static int kvm_handle_ptrauth(struct kvm_vcpu *vcpu)
return 1;
}
- if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) {
+ if (is_nested_ctxt(vcpu)) {
kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));
return 1;
}
@@ -311,12 +311,11 @@ static int kvm_handle_gcs(struct kvm_vcpu *vcpu)
static int handle_other(struct kvm_vcpu *vcpu)
{
- bool is_l2 = vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu);
+ bool allowed, fwd = is_nested_ctxt(vcpu);
u64 hcrx = __vcpu_sys_reg(vcpu, HCRX_EL2);
u64 esr = kvm_vcpu_get_esr(vcpu);
u64 iss = ESR_ELx_ISS(esr);
struct kvm *kvm = vcpu->kvm;
- bool allowed, fwd = false;
/*
* We only trap for two reasons:
@@ -335,28 +334,23 @@ static int handle_other(struct kvm_vcpu *vcpu)
switch (iss) {
case ESR_ELx_ISS_OTHER_ST64BV:
allowed = kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64_V);
- if (is_l2)
- fwd = !(hcrx & HCRX_EL2_EnASR);
+ fwd &= !(hcrx & HCRX_EL2_EnASR);
break;
case ESR_ELx_ISS_OTHER_ST64BV0:
allowed = kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64_ACCDATA);
- if (is_l2)
- fwd = !(hcrx & HCRX_EL2_EnAS0);
+ fwd &= !(hcrx & HCRX_EL2_EnAS0);
break;
case ESR_ELx_ISS_OTHER_LDST64B:
allowed = kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64);
- if (is_l2)
- fwd = !(hcrx & HCRX_EL2_EnALS);
+ fwd &= !(hcrx & HCRX_EL2_EnALS);
break;
case ESR_ELx_ISS_OTHER_TSBCSYNC:
allowed = kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceBuffer, TRBE_V1P1);
- if (is_l2)
- fwd = (__vcpu_sys_reg(vcpu, HFGITR2_EL2) & HFGITR2_EL2_TSBCSYNC);
+ fwd &= (__vcpu_sys_reg(vcpu, HFGITR2_EL2) & HFGITR2_EL2_TSBCSYNC);
break;
case ESR_ELx_ISS_OTHER_PSBCSYNC:
allowed = kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, V1P5);
- if (is_l2)
- fwd = (__vcpu_sys_reg(vcpu, HFGITR_EL2) & HFGITR_EL2_PSBCSYNC);
+ fwd &= (__vcpu_sys_reg(vcpu, HFGITR_EL2) & HFGITR_EL2_PSBCSYNC);
break;
default:
/* Clearly, we're missing something. */
@@ -496,7 +490,7 @@ void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index)
kvm_handle_guest_serror(vcpu, disr_to_esr(disr));
} else {
- kvm_inject_vabt(vcpu);
+ kvm_inject_serror(vcpu);
}
return;
diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c
index 6a2a899a344e..95d186e0bf54 100644
--- a/arch/arm64/kvm/hyp/exception.c
+++ b/arch/arm64/kvm/hyp/exception.c
@@ -26,7 +26,8 @@ static inline u64 __vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
if (unlikely(vcpu_has_nv(vcpu)))
return vcpu_read_sys_reg(vcpu, reg);
- else if (__vcpu_read_sys_reg_from_cpu(reg, &val))
+ else if (vcpu_get_flag(vcpu, SYSREGS_ON_CPU) &&
+ __vcpu_read_sys_reg_from_cpu(reg, &val))
return val;
return __vcpu_sys_reg(vcpu, reg);
@@ -36,7 +37,8 @@ static inline void __vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
{
if (unlikely(vcpu_has_nv(vcpu)))
vcpu_write_sys_reg(vcpu, val, reg);
- else if (!__vcpu_write_sys_reg_to_cpu(val, reg))
+ else if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU) ||
+ !__vcpu_write_sys_reg_to_cpu(val, reg))
__vcpu_assign_sys_reg(vcpu, reg, val);
}
@@ -339,6 +341,10 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu)
enter_exception64(vcpu, PSR_MODE_EL1h, except_type_sync);
break;
+ case unpack_vcpu_flag(EXCEPT_AA64_EL1_SERR):
+ enter_exception64(vcpu, PSR_MODE_EL1h, except_type_serror);
+ break;
+
case unpack_vcpu_flag(EXCEPT_AA64_EL2_SYNC):
enter_exception64(vcpu, PSR_MODE_EL2h, except_type_sync);
break;
@@ -347,9 +353,13 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu)
enter_exception64(vcpu, PSR_MODE_EL2h, except_type_irq);
break;
+ case unpack_vcpu_flag(EXCEPT_AA64_EL2_SERR):
+ enter_exception64(vcpu, PSR_MODE_EL2h, except_type_serror);
+ break;
+
default:
/*
- * Only EL1_SYNC and EL2_{SYNC,IRQ} makes
+ * Only EL1_{SYNC,SERR} and EL2_{SYNC,IRQ,SERR} makes
* sense so far. Everything else gets silently
* ignored.
*/
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 2ad57b117385..84ec4e100fbb 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -298,7 +298,7 @@ static inline void __deactivate_cptr_traps(struct kvm_vcpu *vcpu)
u64 val; \
\
ctxt_sys_reg(hctxt, reg) = read_sysreg_s(SYS_ ## reg); \
- if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) \
+ if (is_nested_ctxt(vcpu)) \
compute_clr_set(vcpu, reg, c, s); \
\
compute_undef_clr_set(vcpu, kvm, reg, c, s); \
@@ -436,7 +436,7 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
if (cpus_have_final_cap(ARM64_HAS_HCX)) {
u64 hcrx = vcpu->arch.hcrx_el2;
- if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) {
+ if (is_nested_ctxt(vcpu)) {
u64 val = __vcpu_sys_reg(vcpu, HCRX_EL2);
hcrx |= val & __HCRX_EL2_MASK;
hcrx &= ~(~val & __HCRX_EL2_nMASK);
@@ -476,21 +476,56 @@ static inline void ___activate_traps(struct kvm_vcpu *vcpu, u64 hcr)
write_sysreg_hcr(hcr);
- if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN) && (hcr & HCR_VSE))
- write_sysreg_s(vcpu->arch.vsesr_el2, SYS_VSESR_EL2);
+ if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN) && (hcr & HCR_VSE)) {
+ u64 vsesr;
+
+ /*
+ * When HCR_EL2.AMO is set, physical SErrors are taken to EL2
+ * and vSError injection is enabled for EL1. Conveniently, for
+ * NV this means that it is never the case where a 'physical'
+ * SError (injected by KVM or userspace) and vSError are
+ * deliverable to the same context.
+ *
+ * As such, we can trivially select between the host or guest's
+ * VSESR_EL2. Except for the case that FEAT_RAS hasn't been
+ * exposed to the guest, where ESR propagation in hardware
+ * occurs unconditionally.
+ *
+ * Paper over the architectural wart and use an IMPLEMENTATION
+ * DEFINED ESR value in case FEAT_RAS is hidden from the guest.
+ */
+ if (!vserror_state_is_nested(vcpu))
+ vsesr = vcpu->arch.vsesr_el2;
+ else if (kvm_has_ras(kern_hyp_va(vcpu->kvm)))
+ vsesr = __vcpu_sys_reg(vcpu, VSESR_EL2);
+ else
+ vsesr = ESR_ELx_ISV;
+
+ write_sysreg_s(vsesr, SYS_VSESR_EL2);
+ }
}
static inline void ___deactivate_traps(struct kvm_vcpu *vcpu)
{
+ u64 *hcr;
+
+ if (vserror_state_is_nested(vcpu))
+ hcr = __ctxt_sys_reg(&vcpu->arch.ctxt, HCR_EL2);
+ else
+ hcr = &vcpu->arch.hcr_el2;
+
/*
* If we pended a virtual abort, preserve it until it gets
* cleared. See D1.14.3 (Virtual Interrupts) for details, but
* the crucial bit is "On taking a vSError interrupt,
* HCR_EL2.VSE is cleared to 0."
+ *
+ * Additionally, when in a nested context we need to propagate the
+ * updated state to the guest hypervisor's HCR_EL2.
*/
- if (vcpu->arch.hcr_el2 & HCR_VSE) {
- vcpu->arch.hcr_el2 &= ~HCR_VSE;
- vcpu->arch.hcr_el2 |= read_sysreg(hcr_el2) & HCR_VSE;
+ if (*hcr & HCR_VSE) {
+ *hcr &= ~HCR_VSE;
+ *hcr |= read_sysreg(hcr_el2) & HCR_VSE;
}
}
@@ -531,7 +566,7 @@ static inline void __hyp_sve_restore_guest(struct kvm_vcpu *vcpu)
* nested guest, as the guest hypervisor could select a smaller VL. Slap
* that into hardware before wrapping up.
*/
- if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu))
+ if (is_nested_ctxt(vcpu))
sve_cond_update_zcr_vq(__vcpu_sys_reg(vcpu, ZCR_EL2), SYS_ZCR_EL2);
write_sysreg_el1(__vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)), SYS_ZCR);
@@ -557,7 +592,7 @@ static inline void fpsimd_lazy_switch_to_guest(struct kvm_vcpu *vcpu)
if (vcpu_has_sve(vcpu)) {
/* A guest hypervisor may restrict the effective max VL. */
- if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu))
+ if (is_nested_ctxt(vcpu))
zcr_el2 = __vcpu_sys_reg(vcpu, ZCR_EL2);
else
zcr_el2 = vcpu_sve_max_vq(vcpu) - 1;
diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
index 4d0dbea4c56f..a17cbe7582de 100644
--- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
+++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
@@ -109,6 +109,28 @@ static inline bool ctxt_has_s1poe(struct kvm_cpu_context *ctxt)
return kvm_has_s1poe(kern_hyp_va(vcpu->kvm));
}
+static inline bool ctxt_has_ras(struct kvm_cpu_context *ctxt)
+{
+ struct kvm_vcpu *vcpu;
+
+ if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
+ return false;
+
+ vcpu = ctxt_to_vcpu(ctxt);
+ return kvm_has_ras(kern_hyp_va(vcpu->kvm));
+}
+
+static inline bool ctxt_has_sctlr2(struct kvm_cpu_context *ctxt)
+{
+ struct kvm_vcpu *vcpu;
+
+ if (!cpus_have_final_cap(ARM64_HAS_SCTLR2))
+ return false;
+
+ vcpu = ctxt_to_vcpu(ctxt);
+ return kvm_has_sctlr2(kern_hyp_va(vcpu->kvm));
+}
+
static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
{
ctxt_sys_reg(ctxt, SCTLR_EL1) = read_sysreg_el1(SYS_SCTLR);
@@ -147,6 +169,9 @@ static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
ctxt_sys_reg(ctxt, SP_EL1) = read_sysreg(sp_el1);
ctxt_sys_reg(ctxt, ELR_EL1) = read_sysreg_el1(SYS_ELR);
ctxt_sys_reg(ctxt, SPSR_EL1) = read_sysreg_el1(SYS_SPSR);
+
+ if (ctxt_has_sctlr2(ctxt))
+ ctxt_sys_reg(ctxt, SCTLR2_EL1) = read_sysreg_el1(SYS_SCTLR2);
}
static inline void __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt)
@@ -159,8 +184,13 @@ static inline void __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt)
if (!has_vhe() && ctxt->__hyp_running_vcpu)
ctxt->regs.pstate = read_sysreg_el2(SYS_SPSR);
- if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
+ if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
+ return;
+
+ if (!vserror_state_is_nested(ctxt_to_vcpu(ctxt)))
ctxt_sys_reg(ctxt, DISR_EL1) = read_sysreg_s(SYS_VDISR_EL2);
+ else if (ctxt_has_ras(ctxt))
+ ctxt_sys_reg(ctxt, VDISR_EL2) = read_sysreg_s(SYS_VDISR_EL2);
}
static inline void __sysreg_restore_common_state(struct kvm_cpu_context *ctxt)
@@ -252,6 +282,9 @@ static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt,
write_sysreg(ctxt_sys_reg(ctxt, SP_EL1), sp_el1);
write_sysreg_el1(ctxt_sys_reg(ctxt, ELR_EL1), SYS_ELR);
write_sysreg_el1(ctxt_sys_reg(ctxt, SPSR_EL1), SYS_SPSR);
+
+ if (ctxt_has_sctlr2(ctxt))
+ write_sysreg_el1(ctxt_sys_reg(ctxt, SCTLR2_EL1), SYS_SCTLR2);
}
/* Read the VCPU state's PSTATE, but translate (v)EL2 to EL1. */
@@ -275,6 +308,7 @@ static inline void __sysreg_restore_el2_return_state(struct kvm_cpu_context *ctx
{
u64 pstate = to_hw_pstate(ctxt);
u64 mode = pstate & PSR_AA32_MODE_MASK;
+ u64 vdisr;
/*
* Safety check to ensure we're setting the CPU up to enter the guest
@@ -293,8 +327,17 @@ static inline void __sysreg_restore_el2_return_state(struct kvm_cpu_context *ctx
write_sysreg_el2(ctxt->regs.pc, SYS_ELR);
write_sysreg_el2(pstate, SYS_SPSR);
- if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
- write_sysreg_s(ctxt_sys_reg(ctxt, DISR_EL1), SYS_VDISR_EL2);
+ if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
+ return;
+
+ if (!vserror_state_is_nested(ctxt_to_vcpu(ctxt)))
+ vdisr = ctxt_sys_reg(ctxt, DISR_EL1);
+ else if (ctxt_has_ras(ctxt))
+ vdisr = ctxt_sys_reg(ctxt, VDISR_EL2);
+ else
+ vdisr = 0;
+
+ write_sysreg_s(vdisr, SYS_VDISR_EL2);
}
static inline void __sysreg32_save_state(struct kvm_vcpu *vcpu)
diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c
index f162b0df5cae..d81275790e69 100644
--- a/arch/arm64/kvm/hyp/vgic-v3-sr.c
+++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c
@@ -296,12 +296,19 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if)
}
/*
- * Prevent the guest from touching the ICC_SRE_EL1 system
- * register. Note that this may not have any effect, as
- * ICC_SRE_EL2.Enable being RAO/WI is a valid implementation.
+ * GICv5 BET0 FEAT_GCIE_LEGACY doesn't include ICC_SRE_EL2. This is due
+ * to be relaxed in a future spec release, at which point this in
+ * condition can be dropped.
*/
- write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE,
- ICC_SRE_EL2);
+ if (!cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF)) {
+ /*
+ * Prevent the guest from touching the ICC_SRE_EL1 system
+ * register. Note that this may not have any effect, as
+ * ICC_SRE_EL2.Enable being RAO/WI is a valid implementation.
+ */
+ write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE,
+ ICC_SRE_EL2);
+ }
/*
* If we need to trap system registers, we must write
@@ -322,8 +329,14 @@ void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if)
cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2);
}
- val = read_gicreg(ICC_SRE_EL2);
- write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2);
+ /*
+ * Can be dropped in the future when GICv5 spec is relaxed. See comment
+ * above.
+ */
+ if (!cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF)) {
+ val = read_gicreg(ICC_SRE_EL2);
+ write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2);
+ }
if (!cpu_if->vgic_sre) {
/* Make sure ENABLE is set at EL2 before setting SRE at EL1 */
@@ -423,10 +436,20 @@ void __vgic_v3_init_lrs(void)
*/
u64 __vgic_v3_get_gic_config(void)
{
- u64 val, sre = read_gicreg(ICC_SRE_EL1);
+ u64 val, sre;
unsigned long flags = 0;
/*
+ * In compat mode, we cannot access ICC_SRE_EL1 at any EL
+ * other than EL1 itself; just return the
+ * ICH_VTR_EL2. ICC_IDR0_EL1 is only implemented on a GICv5
+ * system, so we first check if we have GICv5 support.
+ */
+ if (cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF))
+ return read_gicreg(ICH_VTR_EL2);
+
+ sre = read_gicreg(ICC_SRE_EL1);
+ /*
* To check whether we have a MMIO-based (GICv2 compatible)
* CPU interface, we need to disable the system register
* view.
@@ -471,6 +494,16 @@ u64 __vgic_v3_get_gic_config(void)
return val;
}
+static void __vgic_v3_compat_mode_enable(void)
+{
+ if (!cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF))
+ return;
+
+ sysreg_clear_set_s(SYS_ICH_VCTLR_EL2, 0, ICH_VCTLR_EL2_V3);
+ /* Wait for V3 to become enabled */
+ isb();
+}
+
static u64 __vgic_v3_read_vmcr(void)
{
return read_gicreg(ICH_VMCR_EL2);
@@ -490,6 +523,8 @@ void __vgic_v3_save_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if)
void __vgic_v3_restore_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if)
{
+ __vgic_v3_compat_mode_enable();
+
/*
* If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen
* is dependent on ICC_SRE_EL1.SRE, and we have to perform the
@@ -1050,7 +1085,7 @@ static bool __vgic_v3_check_trap_forwarding(struct kvm_vcpu *vcpu,
{
u64 ich_hcr;
- if (!vcpu_has_nv(vcpu) || is_hyp_ctxt(vcpu))
+ if (!is_nested_ctxt(vcpu))
return false;
ich_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index 477f1580ffea..e482181c6632 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -48,8 +48,7 @@ DEFINE_PER_CPU(unsigned long, kvm_hyp_vector);
static u64 __compute_hcr(struct kvm_vcpu *vcpu)
{
- u64 guest_hcr = __vcpu_sys_reg(vcpu, HCR_EL2);
- u64 hcr = vcpu->arch.hcr_el2;
+ u64 guest_hcr, hcr = vcpu->arch.hcr_el2;
if (!vcpu_has_nv(vcpu))
return hcr;
@@ -68,10 +67,21 @@ static u64 __compute_hcr(struct kvm_vcpu *vcpu)
if (!vcpu_el2_e2h_is_set(vcpu))
hcr |= HCR_NV1;
+ /*
+ * Nothing in HCR_EL2 should impact running in hypervisor
+ * context, apart from bits we have defined as RESx (E2H,
+ * HCD and co), or that cannot be set directly (the EXCLUDE
+ * bits). Given that we OR the guest's view with the host's,
+ * we can use the 0 value as the starting point, and only
+ * use the config-driven RES1 bits.
+ */
+ guest_hcr = kvm_vcpu_apply_reg_masks(vcpu, HCR_EL2, 0);
+
write_sysreg_s(vcpu->arch.ctxt.vncr_array, SYS_VNCR_EL2);
} else {
host_data_clear_flag(VCPU_IN_HYP_CONTEXT);
+ guest_hcr = __vcpu_sys_reg(vcpu, HCR_EL2);
if (guest_hcr & HCR_NV) {
u64 va = __fix_to_virt(vncr_fixmap(smp_processor_id()));
diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
index 73e4bc7fde9e..f28c6cf4fe1b 100644
--- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
@@ -77,6 +77,9 @@ static void __sysreg_save_vel2_state(struct kvm_vcpu *vcpu)
__vcpu_assign_sys_reg(vcpu, SP_EL2, read_sysreg(sp_el1));
__vcpu_assign_sys_reg(vcpu, ELR_EL2, read_sysreg_el1(SYS_ELR));
__vcpu_assign_sys_reg(vcpu, SPSR_EL2, read_sysreg_el1(SYS_SPSR));
+
+ if (ctxt_has_sctlr2(&vcpu->arch.ctxt))
+ __vcpu_assign_sys_reg(vcpu, SCTLR2_EL2, read_sysreg_el1(SYS_SCTLR2));
}
static void __sysreg_restore_vel2_state(struct kvm_vcpu *vcpu)
@@ -139,6 +142,9 @@ static void __sysreg_restore_vel2_state(struct kvm_vcpu *vcpu)
write_sysreg(__vcpu_sys_reg(vcpu, SP_EL2), sp_el1);
write_sysreg_el1(__vcpu_sys_reg(vcpu, ELR_EL2), SYS_ELR);
write_sysreg_el1(__vcpu_sys_reg(vcpu, SPSR_EL2), SYS_SPSR);
+
+ if (ctxt_has_sctlr2(&vcpu->arch.ctxt))
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, SCTLR2_EL2), SYS_SCTLR2);
}
/*
diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c
index a640e839848e..6745f38b64f9 100644
--- a/arch/arm64/kvm/inject_fault.c
+++ b/arch/arm64/kvm/inject_fault.c
@@ -15,13 +15,11 @@
#include <asm/kvm_nested.h>
#include <asm/esr.h>
-static void pend_sync_exception(struct kvm_vcpu *vcpu)
+static unsigned int exception_target_el(struct kvm_vcpu *vcpu)
{
/* If not nesting, EL1 is the only possible exception target */
- if (likely(!vcpu_has_nv(vcpu))) {
- kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC);
- return;
- }
+ if (likely(!vcpu_has_nv(vcpu)))
+ return PSR_MODE_EL1h;
/*
* With NV, we need to pick between EL1 and EL2. Note that we
@@ -32,26 +30,76 @@ static void pend_sync_exception(struct kvm_vcpu *vcpu)
switch(*vcpu_cpsr(vcpu) & PSR_MODE_MASK) {
case PSR_MODE_EL2h:
case PSR_MODE_EL2t:
- kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SYNC);
- break;
+ return PSR_MODE_EL2h;
case PSR_MODE_EL1h:
case PSR_MODE_EL1t:
- kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC);
- break;
+ return PSR_MODE_EL1h;
case PSR_MODE_EL0t:
- if (vcpu_el2_tge_is_set(vcpu))
- kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SYNC);
- else
- kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC);
- break;
+ return vcpu_el2_tge_is_set(vcpu) ? PSR_MODE_EL2h : PSR_MODE_EL1h;
default:
BUG();
}
}
-static bool match_target_el(struct kvm_vcpu *vcpu, unsigned long target)
+static enum vcpu_sysreg exception_esr_elx(struct kvm_vcpu *vcpu)
+{
+ if (exception_target_el(vcpu) == PSR_MODE_EL2h)
+ return ESR_EL2;
+
+ return ESR_EL1;
+}
+
+static enum vcpu_sysreg exception_far_elx(struct kvm_vcpu *vcpu)
+{
+ if (exception_target_el(vcpu) == PSR_MODE_EL2h)
+ return FAR_EL2;
+
+ return FAR_EL1;
+}
+
+static void pend_sync_exception(struct kvm_vcpu *vcpu)
+{
+ if (exception_target_el(vcpu) == PSR_MODE_EL1h)
+ kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC);
+ else
+ kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SYNC);
+}
+
+static void pend_serror_exception(struct kvm_vcpu *vcpu)
{
- return (vcpu_get_flag(vcpu, EXCEPT_MASK) == target);
+ if (exception_target_el(vcpu) == PSR_MODE_EL1h)
+ kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SERR);
+ else
+ kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SERR);
+}
+
+static bool __effective_sctlr2_bit(struct kvm_vcpu *vcpu, unsigned int idx)
+{
+ u64 sctlr2;
+
+ if (!kvm_has_sctlr2(vcpu->kvm))
+ return false;
+
+ if (is_nested_ctxt(vcpu) &&
+ !(__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_SCTLR2En))
+ return false;
+
+ if (exception_target_el(vcpu) == PSR_MODE_EL1h)
+ sctlr2 = vcpu_read_sys_reg(vcpu, SCTLR2_EL1);
+ else
+ sctlr2 = vcpu_read_sys_reg(vcpu, SCTLR2_EL2);
+
+ return sctlr2 & BIT(idx);
+}
+
+static bool effective_sctlr2_ease(struct kvm_vcpu *vcpu)
+{
+ return __effective_sctlr2_bit(vcpu, SCTLR2_EL1_EASE_SHIFT);
+}
+
+static bool effective_sctlr2_nmea(struct kvm_vcpu *vcpu)
+{
+ return __effective_sctlr2_bit(vcpu, SCTLR2_EL1_NMEA_SHIFT);
}
static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr)
@@ -60,7 +108,11 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr
bool is_aarch32 = vcpu_mode_is_32bit(vcpu);
u64 esr = 0;
- pend_sync_exception(vcpu);
+ /* This delight is brought to you by FEAT_DoubleFault2. */
+ if (effective_sctlr2_ease(vcpu))
+ pend_serror_exception(vcpu);
+ else
+ pend_sync_exception(vcpu);
/*
* Build an {i,d}abort, depending on the level and the
@@ -83,13 +135,8 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr
esr |= ESR_ELx_FSC_EXTABT;
- if (match_target_el(vcpu, unpack_vcpu_flag(EXCEPT_AA64_EL1_SYNC))) {
- vcpu_write_sys_reg(vcpu, addr, FAR_EL1);
- vcpu_write_sys_reg(vcpu, esr, ESR_EL1);
- } else {
- vcpu_write_sys_reg(vcpu, addr, FAR_EL2);
- vcpu_write_sys_reg(vcpu, esr, ESR_EL2);
- }
+ vcpu_write_sys_reg(vcpu, addr, exception_far_elx(vcpu));
+ vcpu_write_sys_reg(vcpu, esr, exception_esr_elx(vcpu));
}
static void inject_undef64(struct kvm_vcpu *vcpu)
@@ -105,10 +152,7 @@ static void inject_undef64(struct kvm_vcpu *vcpu)
if (kvm_vcpu_trap_il_is32bit(vcpu))
esr |= ESR_ELx_IL;
- if (match_target_el(vcpu, unpack_vcpu_flag(EXCEPT_AA64_EL1_SYNC)))
- vcpu_write_sys_reg(vcpu, esr, ESR_EL1);
- else
- vcpu_write_sys_reg(vcpu, esr, ESR_EL2);
+ vcpu_write_sys_reg(vcpu, esr, exception_esr_elx(vcpu));
}
#define DFSR_FSC_EXTABT_LPAE 0x10
@@ -155,36 +199,35 @@ static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt, u32 addr)
vcpu_write_sys_reg(vcpu, far, FAR_EL1);
}
-/**
- * kvm_inject_dabt - inject a data abort into the guest
- * @vcpu: The VCPU to receive the data abort
- * @addr: The address to report in the DFAR
- *
- * It is assumed that this code is called from the VCPU thread and that the
- * VCPU therefore is not currently executing guest code.
- */
-void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr)
+static void __kvm_inject_sea(struct kvm_vcpu *vcpu, bool iabt, u64 addr)
{
if (vcpu_el1_is_32bit(vcpu))
- inject_abt32(vcpu, false, addr);
+ inject_abt32(vcpu, iabt, addr);
else
- inject_abt64(vcpu, false, addr);
+ inject_abt64(vcpu, iabt, addr);
}
-/**
- * kvm_inject_pabt - inject a prefetch abort into the guest
- * @vcpu: The VCPU to receive the prefetch abort
- * @addr: The address to report in the DFAR
- *
- * It is assumed that this code is called from the VCPU thread and that the
- * VCPU therefore is not currently executing guest code.
- */
-void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr)
+static bool kvm_sea_target_is_el2(struct kvm_vcpu *vcpu)
{
- if (vcpu_el1_is_32bit(vcpu))
- inject_abt32(vcpu, true, addr);
- else
- inject_abt64(vcpu, true, addr);
+ if (__vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_TGE | HCR_TEA))
+ return true;
+
+ if (!vcpu_mode_priv(vcpu))
+ return false;
+
+ return (*vcpu_cpsr(vcpu) & PSR_A_BIT) &&
+ (__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TMEA);
+}
+
+int kvm_inject_sea(struct kvm_vcpu *vcpu, bool iabt, u64 addr)
+{
+ lockdep_assert_held(&vcpu->mutex);
+
+ if (is_nested_ctxt(vcpu) && kvm_sea_target_is_el2(vcpu))
+ return kvm_inject_nested_sea(vcpu, iabt, addr);
+
+ __kvm_inject_sea(vcpu, iabt, addr);
+ return 1;
}
void kvm_inject_size_fault(struct kvm_vcpu *vcpu)
@@ -194,10 +237,7 @@ void kvm_inject_size_fault(struct kvm_vcpu *vcpu)
addr = kvm_vcpu_get_fault_ipa(vcpu);
addr |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
- if (kvm_vcpu_trap_is_iabt(vcpu))
- kvm_inject_pabt(vcpu, addr);
- else
- kvm_inject_dabt(vcpu, addr);
+ __kvm_inject_sea(vcpu, kvm_vcpu_trap_is_iabt(vcpu), addr);
/*
* If AArch64 or LPAE, set FSC to 0 to indicate an Address
@@ -210,9 +250,9 @@ void kvm_inject_size_fault(struct kvm_vcpu *vcpu)
!(vcpu_read_sys_reg(vcpu, TCR_EL1) & TTBCR_EAE))
return;
- esr = vcpu_read_sys_reg(vcpu, ESR_EL1);
+ esr = vcpu_read_sys_reg(vcpu, exception_esr_elx(vcpu));
esr &= ~GENMASK_ULL(5, 0);
- vcpu_write_sys_reg(vcpu, esr, ESR_EL1);
+ vcpu_write_sys_reg(vcpu, esr, exception_esr_elx(vcpu));
}
/**
@@ -230,25 +270,70 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu)
inject_undef64(vcpu);
}
-void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 esr)
+static bool serror_is_masked(struct kvm_vcpu *vcpu)
{
- vcpu_set_vsesr(vcpu, esr & ESR_ELx_ISS_MASK);
- *vcpu_hcr(vcpu) |= HCR_VSE;
+ return (*vcpu_cpsr(vcpu) & PSR_A_BIT) && !effective_sctlr2_nmea(vcpu);
}
-/**
- * kvm_inject_vabt - inject an async abort / SError into the guest
- * @vcpu: The VCPU to receive the exception
- *
- * It is assumed that this code is called from the VCPU thread and that the
- * VCPU therefore is not currently executing guest code.
- *
- * Systems with the RAS Extensions specify an imp-def ESR (ISV/IDS = 1) with
- * the remaining ISS all-zeros so that this error is not interpreted as an
- * uncategorized RAS error. Without the RAS Extensions we can't specify an ESR
- * value, so the CPU generates an imp-def value.
- */
-void kvm_inject_vabt(struct kvm_vcpu *vcpu)
+static bool kvm_serror_target_is_el2(struct kvm_vcpu *vcpu)
+{
+ if (is_hyp_ctxt(vcpu) || vcpu_el2_amo_is_set(vcpu))
+ return true;
+
+ if (!(__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TMEA))
+ return false;
+
+ /*
+ * In another example where FEAT_DoubleFault2 is entirely backwards,
+ * "masked" as it relates to the routing effects of HCRX_EL2.TMEA
+ * doesn't consider SCTLR2_EL1.NMEA. That is to say, even if EL1 asked
+ * for non-maskable SErrors, the EL2 bit takes priority if A is set.
+ */
+ if (vcpu_mode_priv(vcpu))
+ return *vcpu_cpsr(vcpu) & PSR_A_BIT;
+
+ /*
+ * Otherwise SErrors are considered unmasked when taken from EL0 and
+ * NMEA is set.
+ */
+ return serror_is_masked(vcpu);
+}
+
+static bool kvm_serror_undeliverable_at_el2(struct kvm_vcpu *vcpu)
+{
+ return !(vcpu_el2_tge_is_set(vcpu) || vcpu_el2_amo_is_set(vcpu));
+}
+
+int kvm_inject_serror_esr(struct kvm_vcpu *vcpu, u64 esr)
{
- kvm_set_sei_esr(vcpu, ESR_ELx_ISV);
+ lockdep_assert_held(&vcpu->mutex);
+
+ if (is_nested_ctxt(vcpu) && kvm_serror_target_is_el2(vcpu))
+ return kvm_inject_nested_serror(vcpu, esr);
+
+ if (vcpu_is_el2(vcpu) && kvm_serror_undeliverable_at_el2(vcpu)) {
+ vcpu_set_vsesr(vcpu, esr);
+ vcpu_set_flag(vcpu, NESTED_SERROR_PENDING);
+ return 1;
+ }
+
+ /*
+ * Emulate the exception entry if SErrors are unmasked. This is useful if
+ * the vCPU is in a nested context w/ vSErrors enabled then we've already
+ * delegated he hardware vSError context (i.e. HCR_EL2.VSE, VSESR_EL2,
+ * VDISR_EL2) to the guest hypervisor.
+ *
+ * As we're emulating the SError injection we need to explicitly populate
+ * ESR_ELx.EC because hardware will not do it on our behalf.
+ */
+ if (!serror_is_masked(vcpu)) {
+ pend_serror_exception(vcpu);
+ esr |= FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_SERROR);
+ vcpu_write_sys_reg(vcpu, esr, exception_esr_elx(vcpu));
+ return 1;
+ }
+
+ vcpu_set_vsesr(vcpu, esr & ESR_ELx_ISS_MASK);
+ *vcpu_hcr(vcpu) |= HCR_VSE;
+ return 1;
}
diff --git a/arch/arm64/kvm/mmio.c b/arch/arm64/kvm/mmio.c
index ab365e839874..54f9358c9e0e 100644
--- a/arch/arm64/kvm/mmio.c
+++ b/arch/arm64/kvm/mmio.c
@@ -72,7 +72,7 @@ unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len)
return data;
}
-static bool kvm_pending_sync_exception(struct kvm_vcpu *vcpu)
+static bool kvm_pending_external_abort(struct kvm_vcpu *vcpu)
{
if (!vcpu_get_flag(vcpu, PENDING_EXCEPTION))
return false;
@@ -90,6 +90,8 @@ static bool kvm_pending_sync_exception(struct kvm_vcpu *vcpu)
switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) {
case unpack_vcpu_flag(EXCEPT_AA64_EL1_SYNC):
case unpack_vcpu_flag(EXCEPT_AA64_EL2_SYNC):
+ case unpack_vcpu_flag(EXCEPT_AA64_EL1_SERR):
+ case unpack_vcpu_flag(EXCEPT_AA64_EL2_SERR):
return true;
default:
return false;
@@ -113,7 +115,7 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu)
* Detect if the MMIO return was already handled or if userspace aborted
* the MMIO access.
*/
- if (unlikely(!vcpu->mmio_needed || kvm_pending_sync_exception(vcpu)))
+ if (unlikely(!vcpu->mmio_needed || kvm_pending_external_abort(vcpu)))
return 1;
vcpu->mmio_needed = 0;
@@ -169,10 +171,8 @@ int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
trace_kvm_mmio_nisv(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
kvm_vcpu_get_hfar(vcpu), fault_ipa);
- if (vcpu_is_protected(vcpu)) {
- kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
- return 1;
- }
+ if (vcpu_is_protected(vcpu))
+ return kvm_inject_sea_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
if (test_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER,
&vcpu->kvm->arch.flags)) {
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 2942ec92c5a4..1c78864767c5 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -193,11 +193,6 @@ int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm,
return 0;
}
-static bool kvm_is_device_pfn(unsigned long pfn)
-{
- return !pfn_is_map_memory(pfn);
-}
-
static void *stage2_memcache_zalloc_page(void *arg)
{
struct kvm_mmu_memory_cache *mc = arg;
@@ -1470,6 +1465,18 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
return vma->vm_flags & VM_MTE_ALLOWED;
}
+static bool kvm_vma_is_cacheable(struct vm_area_struct *vma)
+{
+ switch (FIELD_GET(PTE_ATTRINDX_MASK, pgprot_val(vma->vm_page_prot))) {
+ case MT_NORMAL_NC:
+ case MT_DEVICE_nGnRnE:
+ case MT_DEVICE_nGnRE:
+ return false;
+ default:
+ return true;
+ }
+}
+
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
struct kvm_s2_trans *nested,
struct kvm_memory_slot *memslot, unsigned long hva,
@@ -1477,8 +1484,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
{
int ret = 0;
bool write_fault, writable, force_pte = false;
- bool exec_fault, mte_allowed;
- bool device = false, vfio_allow_any_uc = false;
+ bool exec_fault, mte_allowed, is_vma_cacheable;
+ bool s2_force_noncacheable = false, vfio_allow_any_uc = false;
unsigned long mmu_seq;
phys_addr_t ipa = fault_ipa;
struct kvm *kvm = vcpu->kvm;
@@ -1492,6 +1499,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
struct kvm_pgtable *pgt;
struct page *page;
+ vm_flags_t vm_flags;
enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;
if (fault_is_perm)
@@ -1619,6 +1627,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
+ vm_flags = vma->vm_flags;
+
+ is_vma_cacheable = kvm_vma_is_cacheable(vma);
+
/* Don't use the VMA after the unlock -- it may have vanished */
vma = NULL;
@@ -1642,18 +1654,39 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (is_error_noslot_pfn(pfn))
return -EFAULT;
- if (kvm_is_device_pfn(pfn)) {
- /*
- * If the page was identified as device early by looking at
- * the VMA flags, vma_pagesize is already representing the
- * largest quantity we can map. If instead it was mapped
- * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE
- * and must not be upgraded.
- *
- * In both cases, we don't let transparent_hugepage_adjust()
- * change things at the last minute.
- */
- device = true;
+ /*
+ * Check if this is non-struct page memory PFN, and cannot support
+ * CMOs. It could potentially be unsafe to access as cachable.
+ */
+ if (vm_flags & (VM_PFNMAP | VM_MIXEDMAP) && !pfn_is_map_memory(pfn)) {
+ if (is_vma_cacheable) {
+ /*
+ * Whilst the VMA owner expects cacheable mapping to this
+ * PFN, hardware also has to support the FWB and CACHE DIC
+ * features.
+ *
+ * ARM64 KVM relies on kernel VA mapping to the PFN to
+ * perform cache maintenance as the CMO instructions work on
+ * virtual addresses. VM_PFNMAP region are not necessarily
+ * mapped to a KVA and hence the presence of hardware features
+ * S2FWB and CACHE DIC are mandatory to avoid the need for
+ * cache maintenance.
+ */
+ if (!kvm_supports_cacheable_pfnmap())
+ return -EFAULT;
+ } else {
+ /*
+ * If the page was identified as device early by looking at
+ * the VMA flags, vma_pagesize is already representing the
+ * largest quantity we can map. If instead it was mapped
+ * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE
+ * and must not be upgraded.
+ *
+ * In both cases, we don't let transparent_hugepage_adjust()
+ * change things at the last minute.
+ */
+ s2_force_noncacheable = true;
+ }
} else if (logging_active && !write_fault) {
/*
* Only actually map the page as writable if this was a write
@@ -1662,7 +1695,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
writable = false;
}
- if (exec_fault && device)
+ if (exec_fault && s2_force_noncacheable)
return -ENOEXEC;
/*
@@ -1695,7 +1728,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* If we are not forced to use page mapping, check if we are
* backed by a THP and thus use block mapping if possible.
*/
- if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) {
+ if (vma_pagesize == PAGE_SIZE && !(force_pte || s2_force_noncacheable)) {
if (fault_is_perm && fault_granule > PAGE_SIZE)
vma_pagesize = fault_granule;
else
@@ -1709,7 +1742,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
}
}
- if (!fault_is_perm && !device && kvm_has_mte(kvm)) {
+ if (!fault_is_perm && !s2_force_noncacheable && kvm_has_mte(kvm)) {
/* Check the VMM hasn't introduced a new disallowed VMA */
if (mte_allowed) {
sanitise_mte_tags(kvm, pfn, vma_pagesize);
@@ -1725,7 +1758,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (exec_fault)
prot |= KVM_PGTABLE_PROT_X;
- if (device) {
+ if (s2_force_noncacheable) {
if (vfio_allow_any_uc)
prot |= KVM_PGTABLE_PROT_NORMAL_NC;
else
@@ -1808,7 +1841,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
* There is no need to pass the error into the guest.
*/
if (kvm_handle_guest_sea())
- kvm_inject_vabt(vcpu);
+ return kvm_inject_serror(vcpu);
return 1;
}
@@ -1836,11 +1869,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
if (fault_ipa >= BIT_ULL(VTCR_EL2_IPA(vcpu->arch.hw_mmu->vtcr))) {
fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
- if (is_iabt)
- kvm_inject_pabt(vcpu, fault_ipa);
- else
- kvm_inject_dabt(vcpu, fault_ipa);
- return 1;
+ return kvm_inject_sea(vcpu, is_iabt, fault_ipa);
}
}
@@ -1912,8 +1941,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
}
if (kvm_vcpu_abt_iss1tw(vcpu)) {
- kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
- ret = 1;
+ ret = kvm_inject_sea_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
goto out_unlock;
}
@@ -1958,10 +1986,8 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
if (ret == 0)
ret = 1;
out:
- if (ret == -ENOEXEC) {
- kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
- ret = 1;
- }
+ if (ret == -ENOEXEC)
+ ret = kvm_inject_sea_iabt(vcpu, kvm_vcpu_get_hfar(vcpu));
out_unlock:
srcu_read_unlock(&vcpu->kvm->srcu, idx);
return ret;
@@ -2221,6 +2247,15 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
ret = -EINVAL;
break;
}
+
+ /*
+ * Cacheable PFNMAP is allowed only if the hardware
+ * supports it.
+ */
+ if (kvm_vma_is_cacheable(vma) && !kvm_supports_cacheable_pfnmap()) {
+ ret = -EINVAL;
+ break;
+ }
}
hva = min(reg_end, vma->vm_end);
} while (hva < reg_end);
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index dc1d26559bfa..153b3e11b115 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -1441,12 +1441,11 @@ u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val)
break;
case SYS_ID_AA64PFR0_EL1:
- /* No RME, AMU, MPAM, S-EL2, or RAS */
+ /* No RME, AMU, MPAM, or S-EL2 */
val &= ~(ID_AA64PFR0_EL1_RME |
ID_AA64PFR0_EL1_AMU |
ID_AA64PFR0_EL1_MPAM |
ID_AA64PFR0_EL1_SEL2 |
- ID_AA64PFR0_EL1_RAS |
ID_AA64PFR0_EL1_EL3 |
ID_AA64PFR0_EL1_EL2 |
ID_AA64PFR0_EL1_EL1 |
@@ -1683,69 +1682,21 @@ int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu)
set_sysreg_masks(kvm, HFGITR2_EL2, res0, res1);
/* TCR2_EL2 */
- res0 = TCR2_EL2_RES0;
- res1 = TCR2_EL2_RES1;
- if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, D128, IMP))
- res0 |= (TCR2_EL2_DisCH0 | TCR2_EL2_DisCH1 | TCR2_EL2_D128);
- if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, MEC, IMP))
- res0 |= TCR2_EL2_AMEC1 | TCR2_EL2_AMEC0;
- if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, HAFDBS, HAFT))
- res0 |= TCR2_EL2_HAFT;
- if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, THE, IMP))
- res0 |= TCR2_EL2_PTTWI | TCR2_EL2_PnCH;
- if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, AIE, IMP))
- res0 |= TCR2_EL2_AIE;
- if (!kvm_has_s1poe(kvm))
- res0 |= TCR2_EL2_POE | TCR2_EL2_E0POE;
- if (!kvm_has_s1pie(kvm))
- res0 |= TCR2_EL2_PIE;
- if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, VH, IMP))
- res0 |= (TCR2_EL2_E0POE | TCR2_EL2_D128 |
- TCR2_EL2_AMEC1 | TCR2_EL2_DisCH0 | TCR2_EL2_DisCH1);
+ get_reg_fixed_bits(kvm, TCR2_EL2, &res0, &res1);
set_sysreg_masks(kvm, TCR2_EL2, res0, res1);
/* SCTLR_EL1 */
- res0 = SCTLR_EL1_RES0;
- res1 = SCTLR_EL1_RES1;
- if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, PAN, PAN3))
- res0 |= SCTLR_EL1_EPAN;
+ get_reg_fixed_bits(kvm, SCTLR_EL1, &res0, &res1);
set_sysreg_masks(kvm, SCTLR_EL1, res0, res1);
+ /* SCTLR2_ELx */
+ get_reg_fixed_bits(kvm, SCTLR2_EL1, &res0, &res1);
+ set_sysreg_masks(kvm, SCTLR2_EL1, res0, res1);
+ get_reg_fixed_bits(kvm, SCTLR2_EL2, &res0, &res1);
+ set_sysreg_masks(kvm, SCTLR2_EL2, res0, res1);
+
/* MDCR_EL2 */
- res0 = MDCR_EL2_RES0;
- res1 = MDCR_EL2_RES1;
- if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, IMP))
- res0 |= (MDCR_EL2_HPMN | MDCR_EL2_TPMCR |
- MDCR_EL2_TPM | MDCR_EL2_HPME);
- if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, IMP))
- res0 |= MDCR_EL2_E2PB | MDCR_EL2_TPMS;
- if (!kvm_has_feat(kvm, ID_AA64DFR1_EL1, SPMU, IMP))
- res0 |= MDCR_EL2_EnSPM;
- if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, V3P1))
- res0 |= MDCR_EL2_HPMD;
- if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceFilt, IMP))
- res0 |= MDCR_EL2_TTRF;
- if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, V3P5))
- res0 |= MDCR_EL2_HCCD | MDCR_EL2_HLP;
- if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceBuffer, IMP))
- res0 |= MDCR_EL2_E2TB;
- if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, FGT, IMP))
- res0 |= MDCR_EL2_TDCC;
- if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, MTPMU, IMP) ||
- kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL3, IMP))
- res0 |= MDCR_EL2_MTPME;
- if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, V3P7))
- res0 |= MDCR_EL2_HPMFZO;
- if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSS, IMP))
- res0 |= MDCR_EL2_PMSSE;
- if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, V1P2))
- res0 |= MDCR_EL2_HPMFZS;
- if (!kvm_has_feat(kvm, ID_AA64DFR1_EL1, EBEP, IMP))
- res0 |= MDCR_EL2_PMEE;
- if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, DebugVer, V8P9))
- res0 |= MDCR_EL2_EBWE;
- if (!kvm_has_feat(kvm, ID_AA64DFR2_EL1, STEP, IMP))
- res0 |= MDCR_EL2_EnSTEPOP;
+ get_reg_fixed_bits(kvm, MDCR_EL2, &res0, &res1);
set_sysreg_masks(kvm, MDCR_EL2, res0, res1);
/* CNTHCTL_EL2 */
@@ -1802,3 +1753,43 @@ void check_nested_vcpu_requests(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_GUEST_HYP_IRQ_PENDING, vcpu))
kvm_inject_nested_irq(vcpu);
}
+
+/*
+ * One of the many architectural bugs in FEAT_NV2 is that the guest hypervisor
+ * can write to HCR_EL2 behind our back, potentially changing the exception
+ * routing / masking for even the host context.
+ *
+ * What follows is some slop to (1) react to exception routing / masking and (2)
+ * preserve the pending SError state across translation regimes.
+ */
+void kvm_nested_flush_hwstate(struct kvm_vcpu *vcpu)
+{
+ if (!vcpu_has_nv(vcpu))
+ return;
+
+ if (unlikely(vcpu_test_and_clear_flag(vcpu, NESTED_SERROR_PENDING)))
+ kvm_inject_serror_esr(vcpu, vcpu_get_vsesr(vcpu));
+}
+
+void kvm_nested_sync_hwstate(struct kvm_vcpu *vcpu)
+{
+ unsigned long *hcr = vcpu_hcr(vcpu);
+
+ if (!vcpu_has_nv(vcpu))
+ return;
+
+ /*
+ * We previously decided that an SError was deliverable to the guest.
+ * Reap the pending state from HCR_EL2 and...
+ */
+ if (unlikely(__test_and_clear_bit(__ffs(HCR_VSE), hcr)))
+ vcpu_set_flag(vcpu, NESTED_SERROR_PENDING);
+
+ /*
+ * Re-attempt SError injection in case the deliverability has changed,
+ * which is necessary to faithfully emulate WFI the case of a pending
+ * SError being a wakeup condition.
+ */
+ if (unlikely(vcpu_test_and_clear_flag(vcpu, NESTED_SERROR_PENDING)))
+ kvm_inject_serror_esr(vcpu, vcpu_get_vsesr(vcpu));
+}
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 7b8a0a6f2646..82ffb3b3b3cf 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -108,7 +108,6 @@ static bool get_el2_to_el1_mapping(unsigned int reg,
PURE_EL2_SYSREG( HACR_EL2 );
PURE_EL2_SYSREG( VTTBR_EL2 );
PURE_EL2_SYSREG( VTCR_EL2 );
- PURE_EL2_SYSREG( RVBAR_EL2 );
PURE_EL2_SYSREG( TPIDR_EL2 );
PURE_EL2_SYSREG( HPFAR_EL2 );
PURE_EL2_SYSREG( HCRX_EL2 );
@@ -144,6 +143,7 @@ static bool get_el2_to_el1_mapping(unsigned int reg,
MAPPED_EL2_SYSREG(SPSR_EL2, SPSR_EL1, NULL );
MAPPED_EL2_SYSREG(ZCR_EL2, ZCR_EL1, NULL );
MAPPED_EL2_SYSREG(CONTEXTIDR_EL2, CONTEXTIDR_EL1, NULL );
+ MAPPED_EL2_SYSREG(SCTLR2_EL2, SCTLR2_EL1, NULL );
default:
return false;
}
@@ -533,8 +533,7 @@ static bool access_gic_sre(struct kvm_vcpu *vcpu,
return ignore_write(vcpu, p);
if (p->Op1 == 4) { /* ICC_SRE_EL2 */
- p->regval = (ICC_SRE_EL2_ENABLE | ICC_SRE_EL2_SRE |
- ICC_SRE_EL1_DIB | ICC_SRE_EL1_DFB);
+ p->regval = KVM_ICC_SRE_EL2;
} else { /* ICC_SRE_EL1 */
p->regval = vcpu->arch.vgic_cpu.vgic_v3.vgic_sre;
}
@@ -773,6 +772,12 @@ static u64 reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
return mpidr;
}
+static unsigned int hidden_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *r)
+{
+ return REG_HIDDEN;
+}
+
static unsigned int pmu_visibility(const struct kvm_vcpu *vcpu,
const struct sys_reg_desc *r)
{
@@ -1612,7 +1617,6 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_GCS);
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_THE);
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTEX);
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_DF2);
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_PFAR);
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MPAM_frac);
break;
@@ -1645,8 +1649,10 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
val &= ~ID_AA64MMFR2_EL1_NV;
break;
case SYS_ID_AA64MMFR3_EL1:
- val &= ID_AA64MMFR3_EL1_TCRX | ID_AA64MMFR3_EL1_S1POE |
- ID_AA64MMFR3_EL1_S1PIE;
+ val &= ID_AA64MMFR3_EL1_TCRX |
+ ID_AA64MMFR3_EL1_SCTLRX |
+ ID_AA64MMFR3_EL1_S1POE |
+ ID_AA64MMFR3_EL1_S1PIE;
break;
case SYS_ID_MMFR4_EL1:
val &= ~ARM64_FEATURE_MASK(ID_MMFR4_EL1_CCIDX);
@@ -1813,7 +1819,7 @@ static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val)
val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, CSV3, IMP);
}
- if (kvm_vgic_global_state.type == VGIC_V3) {
+ if (vgic_is_v3(vcpu->kvm)) {
val &= ~ID_AA64PFR0_EL1_GIC_MASK;
val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, GIC, IMP);
}
@@ -1955,6 +1961,14 @@ static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu,
(vcpu_has_nv(vcpu) && !FIELD_GET(ID_AA64PFR0_EL1_EL2, user_val)))
return -EINVAL;
+ /*
+ * If we are running on a GICv5 host and support FEAT_GCIE_LEGACY, then
+ * we support GICv3. Fail attempts to do anything but set that to IMP.
+ */
+ if (vgic_is_v3_compat(vcpu->kvm) &&
+ FIELD_GET(ID_AA64PFR0_EL1_GIC_MASK, user_val) != ID_AA64PFR0_EL1_GIC_IMP)
+ return -EINVAL;
+
return set_id_reg(vcpu, rd, user_val);
}
@@ -2327,6 +2341,10 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu,
EL2_REG_FILTERED(name, acc, rst, v, el2_visibility)
#define EL2_REG_VNCR(name, rst, v) EL2_REG(name, bad_vncr_trap, rst, v)
+#define EL2_REG_VNCR_FILT(name, vis) \
+ EL2_REG_FILTERED(name, bad_vncr_trap, reset_val, 0, vis)
+#define EL2_REG_VNCR_GICv3(name) \
+ EL2_REG_VNCR_FILT(name, hidden_visibility)
#define EL2_REG_REDIR(name, rst, v) EL2_REG(name, bad_redir_trap, rst, v)
/*
@@ -2485,6 +2503,21 @@ static unsigned int vncr_el2_visibility(const struct kvm_vcpu *vcpu,
return REG_HIDDEN;
}
+static unsigned int sctlr2_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ if (kvm_has_sctlr2(vcpu->kvm))
+ return 0;
+
+ return REG_HIDDEN;
+}
+
+static unsigned int sctlr2_el2_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ return __el2_visibility(vcpu, rd, sctlr2_visibility);
+}
+
static bool access_zcr_el2(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *r)
@@ -2515,11 +2548,7 @@ static bool access_gic_vtr(struct kvm_vcpu *vcpu,
if (p->is_write)
return write_to_read_only(vcpu, p, r);
- p->regval = kvm_vgic_global_state.ich_vtr_el2;
- p->regval &= ~(ICH_VTR_EL2_DVIM |
- ICH_VTR_EL2_A3V |
- ICH_VTR_EL2_IDbits);
- p->regval |= ICH_VTR_EL2_nV4;
+ p->regval = kvm_get_guest_vtr_el2();
return true;
}
@@ -2590,6 +2619,26 @@ static unsigned int tcr2_el2_visibility(const struct kvm_vcpu *vcpu,
return __el2_visibility(vcpu, rd, tcr2_visibility);
}
+static unsigned int fgt2_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ if (el2_visibility(vcpu, rd) == 0 &&
+ kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, FGT, FGT2))
+ return 0;
+
+ return REG_HIDDEN;
+}
+
+static unsigned int fgt_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ if (el2_visibility(vcpu, rd) == 0 &&
+ kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, FGT, IMP))
+ return 0;
+
+ return REG_HIDDEN;
+}
+
static unsigned int s1pie_visibility(const struct kvm_vcpu *vcpu,
const struct sys_reg_desc *rd)
{
@@ -2641,6 +2690,23 @@ static bool access_mdcr(struct kvm_vcpu *vcpu,
return true;
}
+static bool access_ras(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ switch(reg_to_encoding(r)) {
+ default:
+ if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, IMP)) {
+ kvm_inject_undefined(vcpu);
+ return false;
+ }
+ }
+
+ return trap_raz_wi(vcpu, p, r);
+}
+
/*
* For historical (ahem ABI) reasons, KVM treated MIDR_EL1, REVIDR_EL1, and
* AIDR_EL1 as "invariant" registers, meaning userspace cannot change them.
@@ -2868,7 +2934,6 @@ static const struct sys_reg_desc sys_reg_descs[] = {
ID_AA64PFR0_EL1_FP)),
ID_FILTERED(ID_AA64PFR1_EL1, id_aa64pfr1_el1,
~(ID_AA64PFR1_EL1_PFAR |
- ID_AA64PFR1_EL1_DF2 |
ID_AA64PFR1_EL1_MTEX |
ID_AA64PFR1_EL1_THE |
ID_AA64PFR1_EL1_GCS |
@@ -2950,6 +3015,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
ID_AA64MMFR2_EL1_NV |
ID_AA64MMFR2_EL1_CCIDX)),
ID_WRITABLE(ID_AA64MMFR3_EL1, (ID_AA64MMFR3_EL1_TCRX |
+ ID_AA64MMFR3_EL1_SCTLRX |
ID_AA64MMFR3_EL1_S1PIE |
ID_AA64MMFR3_EL1_S1POE)),
ID_WRITABLE(ID_AA64MMFR4_EL1, ID_AA64MMFR4_EL1_NV_frac),
@@ -2960,6 +3026,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_SCTLR_EL1), access_vm_reg, reset_val, SCTLR_EL1, 0x00C50078 },
{ SYS_DESC(SYS_ACTLR_EL1), access_actlr, reset_actlr, ACTLR_EL1 },
{ SYS_DESC(SYS_CPACR_EL1), NULL, reset_val, CPACR_EL1, 0 },
+ { SYS_DESC(SYS_SCTLR2_EL1), access_vm_reg, reset_val, SCTLR2_EL1, 0,
+ .visibility = sctlr2_visibility },
MTE_REG(RGSR_EL1),
MTE_REG(GCR_EL1),
@@ -2989,14 +3057,14 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_AFSR1_EL1), access_vm_reg, reset_unknown, AFSR1_EL1 },
{ SYS_DESC(SYS_ESR_EL1), access_vm_reg, reset_unknown, ESR_EL1 },
- { SYS_DESC(SYS_ERRIDR_EL1), trap_raz_wi },
- { SYS_DESC(SYS_ERRSELR_EL1), trap_raz_wi },
- { SYS_DESC(SYS_ERXFR_EL1), trap_raz_wi },
- { SYS_DESC(SYS_ERXCTLR_EL1), trap_raz_wi },
- { SYS_DESC(SYS_ERXSTATUS_EL1), trap_raz_wi },
- { SYS_DESC(SYS_ERXADDR_EL1), trap_raz_wi },
- { SYS_DESC(SYS_ERXMISC0_EL1), trap_raz_wi },
- { SYS_DESC(SYS_ERXMISC1_EL1), trap_raz_wi },
+ { SYS_DESC(SYS_ERRIDR_EL1), access_ras },
+ { SYS_DESC(SYS_ERRSELR_EL1), access_ras },
+ { SYS_DESC(SYS_ERXFR_EL1), access_ras },
+ { SYS_DESC(SYS_ERXCTLR_EL1), access_ras },
+ { SYS_DESC(SYS_ERXSTATUS_EL1), access_ras },
+ { SYS_DESC(SYS_ERXADDR_EL1), access_ras },
+ { SYS_DESC(SYS_ERXMISC0_EL1), access_ras },
+ { SYS_DESC(SYS_ERXMISC1_EL1), access_ras },
MTE_REG(TFSR_EL1),
MTE_REG(TFSRE0_EL1),
@@ -3307,12 +3375,14 @@ static const struct sys_reg_desc sys_reg_descs[] = {
EL2_REG_VNCR(VMPIDR_EL2, reset_unknown, 0),
EL2_REG(SCTLR_EL2, access_rw, reset_val, SCTLR_EL2_RES1),
EL2_REG(ACTLR_EL2, access_rw, reset_val, 0),
+ EL2_REG_FILTERED(SCTLR2_EL2, access_vm_reg, reset_val, 0,
+ sctlr2_el2_visibility),
EL2_REG_VNCR(HCR_EL2, reset_hcr, 0),
EL2_REG(MDCR_EL2, access_mdcr, reset_mdcr, 0),
EL2_REG(CPTR_EL2, access_rw, reset_val, CPTR_NVHE_EL2_RES1),
EL2_REG_VNCR(HSTR_EL2, reset_val, 0),
- EL2_REG_VNCR(HFGRTR_EL2, reset_val, 0),
- EL2_REG_VNCR(HFGWTR_EL2, reset_val, 0),
+ EL2_REG_VNCR_FILT(HFGRTR_EL2, fgt_visibility),
+ EL2_REG_VNCR_FILT(HFGWTR_EL2, fgt_visibility),
EL2_REG_VNCR(HFGITR_EL2, reset_val, 0),
EL2_REG_VNCR(HACR_EL2, reset_val, 0),
@@ -3332,9 +3402,14 @@ static const struct sys_reg_desc sys_reg_descs[] = {
vncr_el2_visibility),
{ SYS_DESC(SYS_DACR32_EL2), undef_access, reset_unknown, DACR32_EL2 },
- EL2_REG_VNCR(HDFGRTR_EL2, reset_val, 0),
- EL2_REG_VNCR(HDFGWTR_EL2, reset_val, 0),
- EL2_REG_VNCR(HAFGRTR_EL2, reset_val, 0),
+ EL2_REG_VNCR_FILT(HDFGRTR2_EL2, fgt2_visibility),
+ EL2_REG_VNCR_FILT(HDFGWTR2_EL2, fgt2_visibility),
+ EL2_REG_VNCR_FILT(HFGRTR2_EL2, fgt2_visibility),
+ EL2_REG_VNCR_FILT(HFGWTR2_EL2, fgt2_visibility),
+ EL2_REG_VNCR_FILT(HDFGRTR_EL2, fgt_visibility),
+ EL2_REG_VNCR_FILT(HDFGWTR_EL2, fgt_visibility),
+ EL2_REG_VNCR_FILT(HAFGRTR_EL2, fgt_visibility),
+ EL2_REG_VNCR_FILT(HFGITR2_EL2, fgt2_visibility),
EL2_REG_REDIR(SPSR_EL2, reset_val, 0),
EL2_REG_REDIR(ELR_EL2, reset_val, 0),
{ SYS_DESC(SYS_SP_EL1), access_sp_el1},
@@ -3349,6 +3424,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
EL2_REG(AFSR0_EL2, access_rw, reset_val, 0),
EL2_REG(AFSR1_EL2, access_rw, reset_val, 0),
EL2_REG_REDIR(ESR_EL2, reset_val, 0),
+ EL2_REG_VNCR(VSESR_EL2, reset_unknown, 0),
{ SYS_DESC(SYS_FPEXC32_EL2), undef_access, reset_val, FPEXC32_EL2, 0x700 },
EL2_REG_REDIR(FAR_EL2, reset_val, 0),
@@ -3375,43 +3451,44 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_MPAMVPM7_EL2), undef_access },
EL2_REG(VBAR_EL2, access_rw, reset_val, 0),
- EL2_REG(RVBAR_EL2, access_rw, reset_val, 0),
+ { SYS_DESC(SYS_RVBAR_EL2), undef_access },
{ SYS_DESC(SYS_RMR_EL2), undef_access },
+ EL2_REG_VNCR(VDISR_EL2, reset_unknown, 0),
- EL2_REG_VNCR(ICH_AP0R0_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_AP0R1_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_AP0R2_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_AP0R3_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_AP1R0_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_AP1R1_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_AP1R2_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_AP1R3_EL2, reset_val, 0),
+ EL2_REG_VNCR_GICv3(ICH_AP0R0_EL2),
+ EL2_REG_VNCR_GICv3(ICH_AP0R1_EL2),
+ EL2_REG_VNCR_GICv3(ICH_AP0R2_EL2),
+ EL2_REG_VNCR_GICv3(ICH_AP0R3_EL2),
+ EL2_REG_VNCR_GICv3(ICH_AP1R0_EL2),
+ EL2_REG_VNCR_GICv3(ICH_AP1R1_EL2),
+ EL2_REG_VNCR_GICv3(ICH_AP1R2_EL2),
+ EL2_REG_VNCR_GICv3(ICH_AP1R3_EL2),
{ SYS_DESC(SYS_ICC_SRE_EL2), access_gic_sre },
- EL2_REG_VNCR(ICH_HCR_EL2, reset_val, 0),
+ EL2_REG_VNCR_GICv3(ICH_HCR_EL2),
{ SYS_DESC(SYS_ICH_VTR_EL2), access_gic_vtr },
{ SYS_DESC(SYS_ICH_MISR_EL2), access_gic_misr },
{ SYS_DESC(SYS_ICH_EISR_EL2), access_gic_eisr },
{ SYS_DESC(SYS_ICH_ELRSR_EL2), access_gic_elrsr },
- EL2_REG_VNCR(ICH_VMCR_EL2, reset_val, 0),
-
- EL2_REG_VNCR(ICH_LR0_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_LR1_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_LR2_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_LR3_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_LR4_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_LR5_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_LR6_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_LR7_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_LR8_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_LR9_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_LR10_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_LR11_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_LR12_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_LR13_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_LR14_EL2, reset_val, 0),
- EL2_REG_VNCR(ICH_LR15_EL2, reset_val, 0),
+ EL2_REG_VNCR_GICv3(ICH_VMCR_EL2),
+
+ EL2_REG_VNCR_GICv3(ICH_LR0_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR1_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR2_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR3_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR4_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR5_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR6_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR7_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR8_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR9_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR10_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR11_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR12_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR13_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR14_EL2),
+ EL2_REG_VNCR_GICv3(ICH_LR15_EL2),
EL2_REG(CONTEXTIDR_EL2, access_rw, reset_val, 0),
EL2_REG(TPIDR_EL2, access_rw, reset_val, 0),
@@ -4280,12 +4357,12 @@ static const struct sys_reg_desc cp15_64_regs[] = {
};
static bool check_sysreg_table(const struct sys_reg_desc *table, unsigned int n,
- bool is_32)
+ bool reset_check)
{
unsigned int i;
for (i = 0; i < n; i++) {
- if (!is_32 && table[i].reg && !table[i].reset) {
+ if (reset_check && table[i].reg && !table[i].reset) {
kvm_err("sys_reg table %pS entry %d (%s) lacks reset\n",
&table[i], i, table[i].name);
return false;
@@ -4480,7 +4557,7 @@ static bool kvm_esr_cp10_id_to_sys64(u64 esr, struct sys_reg_params *params)
return true;
kvm_pr_unimpl("Unhandled cp10 register %s: %u\n",
- params->is_write ? "write" : "read", reg_id);
+ str_write_read(params->is_write), reg_id);
return false;
}
@@ -5274,18 +5351,22 @@ int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu)
int __init kvm_sys_reg_table_init(void)
{
+ const struct sys_reg_desc *gicv3_regs;
bool valid = true;
- unsigned int i;
+ unsigned int i, sz;
int ret = 0;
/* Make sure tables are unique and in order. */
- valid &= check_sysreg_table(sys_reg_descs, ARRAY_SIZE(sys_reg_descs), false);
- valid &= check_sysreg_table(cp14_regs, ARRAY_SIZE(cp14_regs), true);
- valid &= check_sysreg_table(cp14_64_regs, ARRAY_SIZE(cp14_64_regs), true);
- valid &= check_sysreg_table(cp15_regs, ARRAY_SIZE(cp15_regs), true);
- valid &= check_sysreg_table(cp15_64_regs, ARRAY_SIZE(cp15_64_regs), true);
+ valid &= check_sysreg_table(sys_reg_descs, ARRAY_SIZE(sys_reg_descs), true);
+ valid &= check_sysreg_table(cp14_regs, ARRAY_SIZE(cp14_regs), false);
+ valid &= check_sysreg_table(cp14_64_regs, ARRAY_SIZE(cp14_64_regs), false);
+ valid &= check_sysreg_table(cp15_regs, ARRAY_SIZE(cp15_regs), false);
+ valid &= check_sysreg_table(cp15_64_regs, ARRAY_SIZE(cp15_64_regs), false);
valid &= check_sysreg_table(sys_insn_descs, ARRAY_SIZE(sys_insn_descs), false);
+ gicv3_regs = vgic_v3_get_sysreg_table(&sz);
+ valid &= check_sysreg_table(gicv3_regs, sz, false);
+
if (!valid)
return -EINVAL;
diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h
index ef97d9fc67cc..317abc490368 100644
--- a/arch/arm64/kvm/sys_regs.h
+++ b/arch/arm64/kvm/sys_regs.h
@@ -108,7 +108,7 @@ inline void print_sys_reg_msg(const struct sys_reg_params *p,
/* Look, we even formatted it for you to paste into the table! */
kvm_pr_unimpl("%pV { Op0(%2u), Op1(%2u), CRn(%2u), CRm(%2u), Op2(%2u), func_%s },\n",
&(struct va_format){ fmt, &va },
- p->Op0, p->Op1, p->CRn, p->CRm, p->Op2, p->is_write ? "write" : "read");
+ p->Op0, p->Op1, p->CRn, p->CRm, p->Op2, str_write_read(p->is_write));
va_end(va);
}
diff --git a/arch/arm64/kvm/trace_handle_exit.h b/arch/arm64/kvm/trace_handle_exit.h
index f85415db7713..a7ab9a3bbed0 100644
--- a/arch/arm64/kvm/trace_handle_exit.h
+++ b/arch/arm64/kvm/trace_handle_exit.h
@@ -113,7 +113,7 @@ TRACE_EVENT(kvm_sys_access,
__entry->vcpu_pc, __entry->name ?: "UNKN",
__entry->Op0, __entry->Op1, __entry->CRn,
__entry->CRm, __entry->Op2,
- __entry->is_write ? "write" : "read")
+ str_write_read(__entry->is_write))
);
TRACE_EVENT(kvm_set_guest_debug,
diff --git a/arch/arm64/kvm/vgic-sys-reg-v3.c b/arch/arm64/kvm/vgic-sys-reg-v3.c
index 5eacb4b3250a..bdc2d57370b2 100644
--- a/arch/arm64/kvm/vgic-sys-reg-v3.c
+++ b/arch/arm64/kvm/vgic-sys-reg-v3.c
@@ -297,6 +297,91 @@ static int get_gic_sre(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
return 0;
}
+static int set_gic_ich_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+ u64 val)
+{
+ __vcpu_assign_sys_reg(vcpu, r->reg, val);
+ return 0;
+}
+
+static int get_gic_ich_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+ u64 *val)
+{
+ *val = __vcpu_sys_reg(vcpu, r->reg);
+ return 0;
+}
+
+static int set_gic_ich_apr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+ u64 val)
+{
+ u8 idx = r->Op2 & 3;
+
+ if (idx > vgic_v3_max_apr_idx(vcpu))
+ return -EINVAL;
+
+ return set_gic_ich_reg(vcpu, r, val);
+}
+
+static int get_gic_ich_apr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+ u64 *val)
+{
+ u8 idx = r->Op2 & 3;
+
+ if (idx > vgic_v3_max_apr_idx(vcpu))
+ return -EINVAL;
+
+ return get_gic_ich_reg(vcpu, r, val);
+}
+
+static int set_gic_icc_sre(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+ u64 val)
+{
+ if (val != KVM_ICC_SRE_EL2)
+ return -EINVAL;
+ return 0;
+}
+
+static int get_gic_icc_sre(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+ u64 *val)
+{
+ *val = KVM_ICC_SRE_EL2;
+ return 0;
+}
+
+static int set_gic_ich_vtr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+ u64 val)
+{
+ if (val != kvm_get_guest_vtr_el2())
+ return -EINVAL;
+ return 0;
+}
+
+static int get_gic_ich_vtr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+ u64 *val)
+{
+ *val = kvm_get_guest_vtr_el2();
+ return 0;
+}
+
+static unsigned int el2_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ return vcpu_has_nv(vcpu) ? 0 : REG_HIDDEN;
+}
+
+#define __EL2_REG(r, acc, i) \
+ { \
+ SYS_DESC(SYS_ ## r), \
+ .get_user = get_gic_ ## acc, \
+ .set_user = set_gic_ ## acc, \
+ .reg = i, \
+ .visibility = el2_visibility, \
+ }
+
+#define EL2_REG(r, acc) __EL2_REG(r, acc, r)
+
+#define EL2_REG_RO(r, acc) __EL2_REG(r, acc, 0)
+
static const struct sys_reg_desc gic_v3_icc_reg_descs[] = {
{ SYS_DESC(SYS_ICC_PMR_EL1),
.set_user = set_gic_pmr, .get_user = get_gic_pmr, },
@@ -328,8 +413,42 @@ static const struct sys_reg_desc gic_v3_icc_reg_descs[] = {
.set_user = set_gic_grpen0, .get_user = get_gic_grpen0, },
{ SYS_DESC(SYS_ICC_IGRPEN1_EL1),
.set_user = set_gic_grpen1, .get_user = get_gic_grpen1, },
+ EL2_REG(ICH_AP0R0_EL2, ich_apr),
+ EL2_REG(ICH_AP0R1_EL2, ich_apr),
+ EL2_REG(ICH_AP0R2_EL2, ich_apr),
+ EL2_REG(ICH_AP0R3_EL2, ich_apr),
+ EL2_REG(ICH_AP1R0_EL2, ich_apr),
+ EL2_REG(ICH_AP1R1_EL2, ich_apr),
+ EL2_REG(ICH_AP1R2_EL2, ich_apr),
+ EL2_REG(ICH_AP1R3_EL2, ich_apr),
+ EL2_REG_RO(ICC_SRE_EL2, icc_sre),
+ EL2_REG(ICH_HCR_EL2, ich_reg),
+ EL2_REG_RO(ICH_VTR_EL2, ich_vtr),
+ EL2_REG(ICH_VMCR_EL2, ich_reg),
+ EL2_REG(ICH_LR0_EL2, ich_reg),
+ EL2_REG(ICH_LR1_EL2, ich_reg),
+ EL2_REG(ICH_LR2_EL2, ich_reg),
+ EL2_REG(ICH_LR3_EL2, ich_reg),
+ EL2_REG(ICH_LR4_EL2, ich_reg),
+ EL2_REG(ICH_LR5_EL2, ich_reg),
+ EL2_REG(ICH_LR6_EL2, ich_reg),
+ EL2_REG(ICH_LR7_EL2, ich_reg),
+ EL2_REG(ICH_LR8_EL2, ich_reg),
+ EL2_REG(ICH_LR9_EL2, ich_reg),
+ EL2_REG(ICH_LR10_EL2, ich_reg),
+ EL2_REG(ICH_LR11_EL2, ich_reg),
+ EL2_REG(ICH_LR12_EL2, ich_reg),
+ EL2_REG(ICH_LR13_EL2, ich_reg),
+ EL2_REG(ICH_LR14_EL2, ich_reg),
+ EL2_REG(ICH_LR15_EL2, ich_reg),
};
+const struct sys_reg_desc *vgic_v3_get_sysreg_table(unsigned int *sz)
+{
+ *sz = ARRAY_SIZE(gic_v3_icc_reg_descs);
+ return gic_v3_icc_reg_descs;
+}
+
static u64 attr_to_id(u64 attr)
{
return ARM64_SYS_REG(FIELD_GET(KVM_REG_ARM_VGIC_SYSREG_OP0_MASK, attr),
@@ -341,8 +460,12 @@ static u64 attr_to_id(u64 attr)
int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
{
- if (get_reg_by_id(attr_to_id(attr->attr), gic_v3_icc_reg_descs,
- ARRAY_SIZE(gic_v3_icc_reg_descs)))
+ const struct sys_reg_desc *r;
+
+ r = get_reg_by_id(attr_to_id(attr->attr), gic_v3_icc_reg_descs,
+ ARRAY_SIZE(gic_v3_icc_reg_descs));
+
+ if (r && !sysreg_hidden(vcpu, r))
return 0;
return -ENXIO;
diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index eb1205654ac8..1e680ad6e863 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -157,6 +157,7 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
kvm->arch.vgic.in_kernel = true;
kvm->arch.vgic.vgic_model = type;
+ kvm->arch.vgic.implementation_rev = KVM_VGIC_IMP_REV_LATEST;
kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
@@ -165,6 +166,9 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
else
INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions);
+ if (type == KVM_DEV_TYPE_ARM_VGIC_V3)
+ kvm->arch.vgic.nassgicap = system_supports_direct_sgis();
+
out_unlock:
mutex_unlock(&kvm->arch.config_lock);
kvm_unlock_all_vcpus(kvm);
@@ -391,11 +395,10 @@ int vgic_init(struct kvm *kvm)
goto out;
/*
- * If we have GICv4.1 enabled, unconditionally request enable the
- * v4 support so that we get HW-accelerated vSGIs. Otherwise, only
- * enable it if we present a virtual ITS to the guest.
+ * Ensure vPEs are allocated if direct IRQ injection (e.g. vSGIs,
+ * vLPIs) is supported.
*/
- if (vgic_supports_direct_msis(kvm)) {
+ if (vgic_supports_direct_irqs(kvm)) {
ret = vgic_v4_init(kvm);
if (ret)
goto out;
@@ -409,15 +412,7 @@ int vgic_init(struct kvm *kvm)
goto out;
vgic_debug_init(kvm);
-
- /*
- * If userspace didn't set the GIC implementation revision,
- * default to the latest and greatest. You know want it.
- */
- if (!dist->implementation_rev)
- dist->implementation_rev = KVM_VGIC_IMP_REV_LATEST;
dist->initialized = true;
-
out:
return ret;
}
@@ -443,7 +438,7 @@ static void kvm_vgic_dist_destroy(struct kvm *kvm)
dist->vgic_cpu_base = VGIC_ADDR_UNDEF;
}
- if (vgic_supports_direct_msis(kvm))
+ if (vgic_supports_direct_irqs(kvm))
vgic_v4_teardown(kvm);
xa_destroy(&dist->lpi_xa);
@@ -674,10 +669,12 @@ void kvm_vgic_init_cpu_hardware(void)
* We want to make sure the list registers start out clear so that we
* only have the program the used registers.
*/
- if (kvm_vgic_global_state.type == VGIC_V2)
+ if (kvm_vgic_global_state.type == VGIC_V2) {
vgic_v2_init_lrs();
- else
+ } else if (kvm_vgic_global_state.type == VGIC_V3 ||
+ kvm_vgic_global_state.has_gcie_v3_compat) {
kvm_call_hyp(__vgic_v3_init_lrs);
+ }
}
/**
@@ -722,6 +719,9 @@ int kvm_vgic_hyp_init(void)
kvm_info("GIC system register CPU interface enabled\n");
}
break;
+ case GIC_V5:
+ ret = vgic_v5_probe(gic_kvm_info);
+ break;
default:
ret = -ENODEV;
}
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index 534049c7c94b..7368c13f16b7 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -758,7 +758,7 @@ static void its_free_ite(struct kvm *kvm, struct its_ite *ite)
if (irq) {
scoped_guard(raw_spinlock_irqsave, &irq->irq_lock) {
if (irq->hw)
- WARN_ON(its_unmap_vlpi(ite->irq->host_irq));
+ its_unmap_vlpi(ite->irq->host_irq);
irq->hw = false;
}
@@ -2694,6 +2694,9 @@ static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr)
case KVM_DEV_ARM_ITS_RESTORE_TABLES:
ret = abi->restore_tables(its);
break;
+ default:
+ ret = -ENXIO;
+ break;
}
mutex_unlock(&its->its_lock);
diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c
index f9ae790163fb..3d1a776b716d 100644
--- a/arch/arm64/kvm/vgic/vgic-kvm-device.c
+++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c
@@ -5,6 +5,7 @@
* Copyright (C) 2015 ARM Ltd.
* Author: Marc Zyngier <marc.zyngier@arm.com>
*/
+#include <linux/irqchip/arm-gic-v3.h>
#include <linux/kvm_host.h>
#include <kvm/arm_vgic.h>
#include <linux/uaccess.h>
@@ -303,12 +304,6 @@ static int vgic_get_common_attr(struct kvm_device *dev,
VGIC_NR_PRIVATE_IRQS, uaddr);
break;
}
- case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: {
- u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-
- r = put_user(dev->kvm->arch.vgic.mi_intid, uaddr);
- break;
- }
}
return r;
@@ -510,6 +505,24 @@ int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
}
/*
+ * Allow access to certain ID-like registers prior to VGIC initialization,
+ * thereby allowing the VMM to provision the features / sizing of the VGIC.
+ */
+static bool reg_allowed_pre_init(struct kvm_device_attr *attr)
+{
+ if (attr->group != KVM_DEV_ARM_VGIC_GRP_DIST_REGS)
+ return false;
+
+ switch (attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK) {
+ case GICD_IIDR:
+ case GICD_TYPER2:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/*
* vgic_v3_attr_regs_access - allows user space to access VGIC v3 state
*
* @dev: kvm device handle
@@ -523,7 +536,7 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev,
struct vgic_reg_attr reg_attr;
gpa_t addr;
struct kvm_vcpu *vcpu;
- bool uaccess, post_init = true;
+ bool uaccess;
u32 val;
int ret;
@@ -539,9 +552,6 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev,
/* Sysregs uaccess is performed by the sysreg handling code */
uaccess = false;
break;
- case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ:
- post_init = false;
- fallthrough;
default:
uaccess = true;
}
@@ -561,7 +571,7 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev,
mutex_lock(&dev->kvm->arch.config_lock);
- if (post_init != vgic_initialized(dev->kvm)) {
+ if (!(vgic_initialized(dev->kvm) || reg_allowed_pre_init(attr))) {
ret = -EBUSY;
goto out;
}
@@ -591,19 +601,6 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev,
}
break;
}
- case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ:
- if (!is_write) {
- val = dev->kvm->arch.vgic.mi_intid;
- ret = 0;
- break;
- }
-
- ret = -EINVAL;
- if ((val < VGIC_NR_PRIVATE_IRQS) && (val >= VGIC_NR_SGIS)) {
- dev->kvm->arch.vgic.mi_intid = val;
- ret = 0;
- }
- break;
default:
ret = -EINVAL;
break;
@@ -630,8 +627,24 @@ static int vgic_v3_set_attr(struct kvm_device *dev,
case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:
case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS:
case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO:
- case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ:
return vgic_v3_attr_regs_access(dev, attr, true);
+ case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: {
+ u32 __user *uaddr = (u32 __user *)attr->addr;
+ u32 val;
+
+ if (get_user(val, uaddr))
+ return -EFAULT;
+
+ guard(mutex)(&dev->kvm->arch.config_lock);
+ if (vgic_initialized(dev->kvm))
+ return -EBUSY;
+
+ if (!irq_is_ppi(val))
+ return -EINVAL;
+
+ dev->kvm->arch.vgic.mi_intid = val;
+ return 0;
+ }
default:
return vgic_set_common_attr(dev, attr);
}
@@ -645,8 +658,13 @@ static int vgic_v3_get_attr(struct kvm_device *dev,
case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:
case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS:
case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO:
- case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ:
return vgic_v3_attr_regs_access(dev, attr, false);
+ case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: {
+ u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+
+ guard(mutex)(&dev->kvm->arch.config_lock);
+ return put_user(dev->kvm->arch.vgic.mi_intid, uaddr);
+ }
default:
return vgic_get_common_attr(dev, attr);
}
diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
index ae4c0593d114..a3ef185209e9 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
@@ -50,8 +50,17 @@ bool vgic_has_its(struct kvm *kvm)
bool vgic_supports_direct_msis(struct kvm *kvm)
{
- return (kvm_vgic_global_state.has_gicv4_1 ||
- (kvm_vgic_global_state.has_gicv4 && vgic_has_its(kvm)));
+ return kvm_vgic_global_state.has_gicv4 && vgic_has_its(kvm);
+}
+
+bool system_supports_direct_sgis(void)
+{
+ return kvm_vgic_global_state.has_gicv4_1 && gic_cpuif_has_vsgi();
+}
+
+bool vgic_supports_direct_sgis(struct kvm *kvm)
+{
+ return kvm->arch.vgic.nassgicap;
}
/*
@@ -86,7 +95,7 @@ static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu,
}
break;
case GICD_TYPER2:
- if (kvm_vgic_global_state.has_gicv4_1 && gic_cpuif_has_vsgi())
+ if (vgic_supports_direct_sgis(vcpu->kvm))
value = GICD_TYPER2_nASSGIcap;
break;
case GICD_IIDR:
@@ -119,7 +128,7 @@ static void vgic_mmio_write_v3_misc(struct kvm_vcpu *vcpu,
dist->enabled = val & GICD_CTLR_ENABLE_SS_G1;
/* Not a GICv4.1? No HW SGIs */
- if (!kvm_vgic_global_state.has_gicv4_1 || !gic_cpuif_has_vsgi())
+ if (!vgic_supports_direct_sgis(vcpu->kvm))
val &= ~GICD_CTLR_nASSGIreq;
/* Dist stays enabled? nASSGIreq is RO */
@@ -133,7 +142,7 @@ static void vgic_mmio_write_v3_misc(struct kvm_vcpu *vcpu,
if (is_hwsgi != dist->nassgireq)
vgic_v4_configure_vsgis(vcpu->kvm);
- if (kvm_vgic_global_state.has_gicv4_1 &&
+ if (vgic_supports_direct_sgis(vcpu->kvm) &&
was_enabled != dist->enabled)
kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_RELOAD_GICv4);
else if (!was_enabled && dist->enabled)
@@ -159,8 +168,18 @@ static int vgic_mmio_uaccess_write_v3_misc(struct kvm_vcpu *vcpu,
switch (addr & 0x0c) {
case GICD_TYPER2:
- if (val != vgic_mmio_read_v3_misc(vcpu, addr, len))
+ reg = vgic_mmio_read_v3_misc(vcpu, addr, len);
+
+ if (reg == val)
+ return 0;
+ if (vgic_initialized(vcpu->kvm))
+ return -EBUSY;
+ if ((reg ^ val) & ~GICD_TYPER2_nASSGIcap)
return -EINVAL;
+ if (!system_supports_direct_sgis() && val)
+ return -EINVAL;
+
+ dist->nassgicap = val & GICD_TYPER2_nASSGIcap;
return 0;
case GICD_IIDR:
reg = vgic_mmio_read_v3_misc(vcpu, addr, len);
@@ -178,7 +197,7 @@ static int vgic_mmio_uaccess_write_v3_misc(struct kvm_vcpu *vcpu,
}
case GICD_CTLR:
/* Not a GICv4.1? No HW SGIs */
- if (!kvm_vgic_global_state.has_gicv4_1)
+ if (!vgic_supports_direct_sgis(vcpu->kvm))
val &= ~GICD_CTLR_nASSGIreq;
dist->enabled = val & GICD_CTLR_ENABLE_SS_G1;
diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c
index 679aafe77de2..7f1259b49c50 100644
--- a/arch/arm64/kvm/vgic/vgic-v3-nested.c
+++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c
@@ -116,7 +116,7 @@ bool vgic_state_is_nested(struct kvm_vcpu *vcpu)
{
u64 xmo;
- if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) {
+ if (is_nested_ctxt(vcpu)) {
xmo = __vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_IMO | HCR_FMO);
WARN_ONCE(xmo && xmo != (HCR_IMO | HCR_FMO),
"Separate virtual IRQ/FIQ settings not supported\n");
diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
index 193946108192..4d9343d2b0b1 100644
--- a/arch/arm64/kvm/vgic/vgic-v4.c
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -356,7 +356,7 @@ int vgic_v4_put(struct kvm_vcpu *vcpu)
{
struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
- if (!vgic_supports_direct_msis(vcpu->kvm) || !vpe->resident)
+ if (!vgic_supports_direct_irqs(vcpu->kvm) || !vpe->resident)
return 0;
return its_make_vpe_non_resident(vpe, vgic_v4_want_doorbell(vcpu));
@@ -367,7 +367,7 @@ int vgic_v4_load(struct kvm_vcpu *vcpu)
struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
int err;
- if (!vgic_supports_direct_msis(vcpu->kvm) || vpe->resident)
+ if (!vgic_supports_direct_irqs(vcpu->kvm) || vpe->resident)
return 0;
if (vcpu_get_flag(vcpu, IN_WFI))
@@ -527,28 +527,26 @@ static struct vgic_irq *__vgic_host_irq_get_vlpi(struct kvm *kvm, int host_irq)
return NULL;
}
-int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq)
+void kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq)
{
struct vgic_irq *irq;
unsigned long flags;
- int ret = 0;
if (!vgic_supports_direct_msis(kvm))
- return 0;
+ return;
irq = __vgic_host_irq_get_vlpi(kvm, host_irq);
if (!irq)
- return 0;
+ return;
raw_spin_lock_irqsave(&irq->irq_lock, flags);
WARN_ON(irq->hw && irq->host_irq != host_irq);
if (irq->hw) {
atomic_dec(&irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count);
irq->hw = false;
- ret = its_unmap_vlpi(host_irq);
+ its_unmap_vlpi(host_irq);
}
raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
vgic_put_irq(kvm, irq);
- return ret;
}
diff --git a/arch/arm64/kvm/vgic/vgic-v5.c b/arch/arm64/kvm/vgic/vgic-v5.c
new file mode 100644
index 000000000000..6bdbb221bcde
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic-v5.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <kvm/arm_vgic.h>
+#include <linux/irqchip/arm-vgic-info.h>
+
+#include "vgic.h"
+
+/*
+ * Probe for a vGICv5 compatible interrupt controller, returning 0 on success.
+ * Currently only supports GICv3-based VMs on a GICv5 host, and hence only
+ * registers a VGIC_V3 device.
+ */
+int vgic_v5_probe(const struct gic_kvm_info *info)
+{
+ u64 ich_vtr_el2;
+ int ret;
+
+ if (!info->has_gcie_v3_compat)
+ return -ENODEV;
+
+ kvm_vgic_global_state.type = VGIC_V5;
+ kvm_vgic_global_state.has_gcie_v3_compat = true;
+
+ /* We only support v3 compat mode - use vGICv3 limits */
+ kvm_vgic_global_state.max_gic_vcpus = VGIC_V3_MAX_CPUS;
+
+ kvm_vgic_global_state.vcpu_base = 0;
+ kvm_vgic_global_state.vctrl_base = NULL;
+ kvm_vgic_global_state.can_emulate_gicv2 = false;
+ kvm_vgic_global_state.has_gicv4 = false;
+ kvm_vgic_global_state.has_gicv4_1 = false;
+
+ ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_gic_config);
+ kvm_vgic_global_state.ich_vtr_el2 = (u32)ich_vtr_el2;
+
+ /*
+ * The ListRegs field is 5 bits, but there is an architectural
+ * maximum of 16 list registers. Just ignore bit 4...
+ */
+ kvm_vgic_global_state.nr_lr = (ich_vtr_el2 & 0xf) + 1;
+
+ ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3);
+ if (ret) {
+ kvm_err("Cannot register GICv3-legacy KVM device.\n");
+ return ret;
+ }
+
+ static_branch_enable(&kvm_vgic_global_state.gicv3_cpuif);
+ kvm_info("GCIE legacy system register CPU interface\n");
+
+ return 0;
+}
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index 8f8096d48925..f5148b38120a 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -951,7 +951,7 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
* can be directly injected (GICv4).
*/
if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head) &&
- !vgic_supports_direct_msis(vcpu->kvm))
+ !vgic_supports_direct_irqs(vcpu->kvm))
return;
DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
@@ -965,7 +965,7 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
if (can_access_vgic_from_kernel())
vgic_restore_state(vcpu);
- if (vgic_supports_direct_msis(vcpu->kvm))
+ if (vgic_supports_direct_irqs(vcpu->kvm))
vgic_v4_commit(vcpu);
}
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index 4349084cb9a6..1384a04c0784 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -64,6 +64,24 @@
KVM_REG_ARM_VGIC_SYSREG_CRM_MASK | \
KVM_REG_ARM_VGIC_SYSREG_OP2_MASK)
+#define KVM_ICC_SRE_EL2 (ICC_SRE_EL2_ENABLE | ICC_SRE_EL2_SRE | \
+ ICC_SRE_EL1_DIB | ICC_SRE_EL1_DFB)
+#define KVM_ICH_VTR_EL2_RES0 (ICH_VTR_EL2_DVIM | \
+ ICH_VTR_EL2_A3V | \
+ ICH_VTR_EL2_IDbits)
+#define KVM_ICH_VTR_EL2_RES1 ICH_VTR_EL2_nV4
+
+static inline u64 kvm_get_guest_vtr_el2(void)
+{
+ u64 vtr;
+
+ vtr = kvm_vgic_global_state.ich_vtr_el2;
+ vtr &= ~KVM_ICH_VTR_EL2_RES0;
+ vtr |= KVM_ICH_VTR_EL2_RES1;
+
+ return vtr;
+}
+
/*
* As per Documentation/virt/kvm/devices/arm-vgic-its.rst,
* below macros are defined for ITS table entry encoding.
@@ -297,6 +315,7 @@ int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr, bool is_write);
int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr);
+const struct sys_reg_desc *vgic_v3_get_sysreg_table(unsigned int *sz);
int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write,
u32 intid, u32 *val);
int kvm_register_vgic_device(unsigned long type);
@@ -308,6 +327,8 @@ int vgic_init(struct kvm *kvm);
void vgic_debug_init(struct kvm *kvm);
void vgic_debug_destroy(struct kvm *kvm);
+int vgic_v5_probe(const struct gic_kvm_info *info);
+
static inline int vgic_v3_max_apr_idx(struct kvm_vcpu *vcpu)
{
struct vgic_cpu *cpu_if = &vcpu->arch.vgic_cpu;
@@ -369,7 +390,23 @@ void vgic_its_invalidate_all_caches(struct kvm *kvm);
int vgic_its_inv_lpi(struct kvm *kvm, struct vgic_irq *irq);
int vgic_its_invall(struct kvm_vcpu *vcpu);
+bool system_supports_direct_sgis(void);
bool vgic_supports_direct_msis(struct kvm *kvm);
+bool vgic_supports_direct_sgis(struct kvm *kvm);
+
+static inline bool vgic_supports_direct_irqs(struct kvm *kvm)
+{
+ /*
+ * Deliberately conflate vLPI and vSGI support on GICv4.1 hardware,
+ * indirectly allowing userspace to control whether or not vPEs are
+ * allocated for the VM.
+ */
+ if (system_supports_direct_sgis())
+ return vgic_supports_direct_sgis(kvm);
+
+ return vgic_supports_direct_msis(kvm);
+}
+
int vgic_v4_init(struct kvm *kvm);
void vgic_v4_teardown(struct kvm *kvm);
void vgic_v4_configure_vsgis(struct kvm *kvm);
@@ -389,6 +426,17 @@ void vgic_v3_put_nested(struct kvm_vcpu *vcpu);
void vgic_v3_handle_nested_maint_irq(struct kvm_vcpu *vcpu);
void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu);
+static inline bool vgic_is_v3_compat(struct kvm *kvm)
+{
+ return cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF) &&
+ kvm_vgic_global_state.has_gcie_v3_compat;
+}
+
+static inline bool vgic_is_v3(struct kvm *kvm)
+{
+ return kvm_vgic_global_state.type == VGIC_V3 || vgic_is_v3_compat(kvm);
+}
+
int vgic_its_debug_init(struct kvm_device *dev);
void vgic_its_debug_destroy(struct kvm_device *dev);
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index fcc783e8e9bb..d816ff44faff 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -555,7 +555,7 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
const struct fault_info *inf;
struct mm_struct *mm = current->mm;
vm_fault_t fault;
- unsigned long vm_flags;
+ vm_flags_t vm_flags;
unsigned int mm_flags = FAULT_FLAG_DEFAULT;
unsigned long addr = untagged_addr(far);
struct vm_area_struct *vma;
diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index c86c348857c4..08ee177432c2 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -81,7 +81,7 @@ static int __init adjust_protection_map(void)
}
arch_initcall(adjust_protection_map);
-pgprot_t vm_get_page_prot(unsigned long vm_flags)
+pgprot_t vm_get_page_prot(vm_flags_t vm_flags)
{
ptdesc_t prot;
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 00ab1d648db6..abd9725796e9 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -26,6 +26,7 @@
#include <linux/set_memory.h>
#include <linux/kfence.h>
#include <linux/pkeys.h>
+#include <linux/mm_inline.h>
#include <asm/barrier.h>
#include <asm/cputype.h>
@@ -720,7 +721,7 @@ void mark_rodata_ro(void)
static void __init declare_vma(struct vm_struct *vma,
void *va_start, void *va_end,
- unsigned long vm_flags)
+ vm_flags_t vm_flags)
{
phys_addr_t pa_start = __pa_symbol(va_start);
unsigned long size = va_end - va_start;
@@ -1524,24 +1525,41 @@ static int __init prevent_bootmem_remove_init(void)
early_initcall(prevent_bootmem_remove_init);
#endif
-pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
+pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, unsigned int nr)
{
+ pte_t pte = get_and_clear_full_ptes(vma->vm_mm, addr, ptep, nr, /* full = */ 0);
+
if (alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198)) {
/*
* Break-before-make (BBM) is required for all user space mappings
* when the permission changes from executable to non-executable
* in cases where cpu is affected with errata #2645198.
*/
- if (pte_user_exec(ptep_get(ptep)))
- return ptep_clear_flush(vma, addr, ptep);
+ if (pte_accessible(vma->vm_mm, pte) && pte_user_exec(pte))
+ __flush_tlb_range(vma, addr, nr * PAGE_SIZE,
+ PAGE_SIZE, true, 3);
}
- return ptep_get_and_clear(vma->vm_mm, addr, ptep);
+
+ return pte;
+}
+
+pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
+{
+ return modify_prot_start_ptes(vma, addr, ptep, 1);
+}
+
+void modify_prot_commit_ptes(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, pte_t old_pte, pte_t pte,
+ unsigned int nr)
+{
+ set_ptes(vma->vm_mm, addr, ptep, pte, nr);
}
void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep,
pte_t old_pte, pte_t pte)
{
- set_pte_at(vma->vm_mm, addr, ptep, pte);
+ modify_prot_commit_ptes(vma, addr, ptep, old_pte, pte, 1);
}
/*
diff --git a/arch/arm64/mm/ptdump_debugfs.c b/arch/arm64/mm/ptdump_debugfs.c
index 68bf1a125502..1e308328c079 100644
--- a/arch/arm64/mm/ptdump_debugfs.c
+++ b/arch/arm64/mm/ptdump_debugfs.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/debugfs.h>
-#include <linux/memory_hotplug.h>
#include <linux/seq_file.h>
#include <asm/ptdump.h>
@@ -9,9 +8,7 @@ static int ptdump_show(struct seq_file *m, void *v)
{
struct ptdump_info *info = m->private;
- get_online_mems();
ptdump_walk(m, info);
- put_online_mems();
return 0;
}
DEFINE_SHOW_ATTRIBUTE(ptdump);
diff --git a/arch/arm64/net/bpf_jit.h b/arch/arm64/net/bpf_jit.h
index a3b0e693a125..bbea4f36f9f2 100644
--- a/arch/arm64/net/bpf_jit.h
+++ b/arch/arm64/net/bpf_jit.h
@@ -325,4 +325,9 @@
#define A64_MRS_SP_EL0(Rt) \
aarch64_insn_gen_mrs(Rt, AARCH64_INSN_SYSREG_SP_EL0)
+/* Barriers */
+#define A64_SB aarch64_insn_get_sb_value()
+#define A64_DSB_NSH (aarch64_insn_get_dsb_base_value() | 0x7 << 8)
+#define A64_ISB aarch64_insn_get_isb_value()
+
#endif /* _BPF_JIT_H */
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index da8b89dd2910..97dfd5432809 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -30,6 +30,7 @@
#define TMP_REG_2 (MAX_BPF_JIT_REG + 1)
#define TCCNT_PTR (MAX_BPF_JIT_REG + 2)
#define TMP_REG_3 (MAX_BPF_JIT_REG + 3)
+#define PRIVATE_SP (MAX_BPF_JIT_REG + 4)
#define ARENA_VM_START (MAX_BPF_JIT_REG + 5)
#define check_imm(bits, imm) do { \
@@ -68,6 +69,8 @@ static const int bpf2a64[] = {
[TCCNT_PTR] = A64_R(26),
/* temporary register for blinding constants */
[BPF_REG_AX] = A64_R(9),
+ /* callee saved register for private stack pointer */
+ [PRIVATE_SP] = A64_R(27),
/* callee saved register for kern_vm_start address */
[ARENA_VM_START] = A64_R(28),
};
@@ -86,6 +89,7 @@ struct jit_ctx {
u64 user_vm_start;
u64 arena_vm_start;
bool fp_used;
+ bool priv_sp_used;
bool write;
};
@@ -98,6 +102,10 @@ struct bpf_plt {
#define PLT_TARGET_SIZE sizeof_field(struct bpf_plt, target)
#define PLT_TARGET_OFFSET offsetof(struct bpf_plt, target)
+/* Memory size/value to protect private stack overflow/underflow */
+#define PRIV_STACK_GUARD_SZ 16
+#define PRIV_STACK_GUARD_VAL 0xEB9F12345678eb9fULL
+
static inline void emit(const u32 insn, struct jit_ctx *ctx)
{
if (ctx->image != NULL && ctx->write)
@@ -387,8 +395,11 @@ static void find_used_callee_regs(struct jit_ctx *ctx)
if (reg_used & 8)
ctx->used_callee_reg[i++] = bpf2a64[BPF_REG_9];
- if (reg_used & 16)
+ if (reg_used & 16) {
ctx->used_callee_reg[i++] = bpf2a64[BPF_REG_FP];
+ if (ctx->priv_sp_used)
+ ctx->used_callee_reg[i++] = bpf2a64[PRIVATE_SP];
+ }
if (ctx->arena_vm_start)
ctx->used_callee_reg[i++] = bpf2a64[ARENA_VM_START];
@@ -412,6 +423,7 @@ static void push_callee_regs(struct jit_ctx *ctx)
emit(A64_PUSH(A64_R(23), A64_R(24), A64_SP), ctx);
emit(A64_PUSH(A64_R(25), A64_R(26), A64_SP), ctx);
emit(A64_PUSH(A64_R(27), A64_R(28), A64_SP), ctx);
+ ctx->fp_used = true;
} else {
find_used_callee_regs(ctx);
for (i = 0; i + 1 < ctx->nr_used_callee_reg; i += 2) {
@@ -461,6 +473,19 @@ static void pop_callee_regs(struct jit_ctx *ctx)
}
}
+static void emit_percpu_ptr(const u8 dst_reg, void __percpu *ptr,
+ struct jit_ctx *ctx)
+{
+ const u8 tmp = bpf2a64[TMP_REG_1];
+
+ emit_a64_mov_i64(dst_reg, (__force const u64)ptr, ctx);
+ if (cpus_have_cap(ARM64_HAS_VIRT_HOST_EXTN))
+ emit(A64_MRS_TPIDR_EL2(tmp), ctx);
+ else
+ emit(A64_MRS_TPIDR_EL1(tmp), ctx);
+ emit(A64_ADD(1, dst_reg, dst_reg, tmp), ctx);
+}
+
#define BTI_INSNS (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) ? 1 : 0)
#define PAC_INSNS (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL) ? 1 : 0)
@@ -476,6 +501,8 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf)
const bool is_main_prog = !bpf_is_subprog(prog);
const u8 fp = bpf2a64[BPF_REG_FP];
const u8 arena_vm_base = bpf2a64[ARENA_VM_START];
+ const u8 priv_sp = bpf2a64[PRIVATE_SP];
+ void __percpu *priv_stack_ptr;
const int idx0 = ctx->idx;
int cur_offset;
@@ -551,15 +578,23 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf)
emit(A64_SUB_I(1, A64_SP, A64_FP, 96), ctx);
}
- if (ctx->fp_used)
- /* Set up BPF prog stack base register */
- emit(A64_MOV(1, fp, A64_SP), ctx);
-
/* Stack must be multiples of 16B */
ctx->stack_size = round_up(prog->aux->stack_depth, 16);
+ if (ctx->fp_used) {
+ if (ctx->priv_sp_used) {
+ /* Set up private stack pointer */
+ priv_stack_ptr = prog->aux->priv_stack_ptr + PRIV_STACK_GUARD_SZ;
+ emit_percpu_ptr(priv_sp, priv_stack_ptr, ctx);
+ emit(A64_ADD_I(1, fp, priv_sp, ctx->stack_size), ctx);
+ } else {
+ /* Set up BPF prog stack base register */
+ emit(A64_MOV(1, fp, A64_SP), ctx);
+ }
+ }
+
/* Set up function call stack */
- if (ctx->stack_size)
+ if (ctx->stack_size && !ctx->priv_sp_used)
emit(A64_SUB_I(1, A64_SP, A64_SP, ctx->stack_size), ctx);
if (ctx->arena_vm_start)
@@ -623,7 +658,7 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx)
emit(A64_STR64I(tcc, ptr, 0), ctx);
/* restore SP */
- if (ctx->stack_size)
+ if (ctx->stack_size && !ctx->priv_sp_used)
emit(A64_ADD_I(1, A64_SP, A64_SP, ctx->stack_size), ctx);
pop_callee_regs(ctx);
@@ -991,7 +1026,7 @@ static void build_epilogue(struct jit_ctx *ctx, bool was_classic)
const u8 ptr = bpf2a64[TCCNT_PTR];
/* We're done with BPF stack */
- if (ctx->stack_size)
+ if (ctx->stack_size && !ctx->priv_sp_used)
emit(A64_ADD_I(1, A64_SP, A64_SP, ctx->stack_size), ctx);
pop_callee_regs(ctx);
@@ -1120,6 +1155,7 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
const u8 tmp2 = bpf2a64[TMP_REG_2];
const u8 fp = bpf2a64[BPF_REG_FP];
const u8 arena_vm_base = bpf2a64[ARENA_VM_START];
+ const u8 priv_sp = bpf2a64[PRIVATE_SP];
const s16 off = insn->off;
const s32 imm = insn->imm;
const int i = insn - ctx->prog->insnsi;
@@ -1564,7 +1600,7 @@ emit_cond_jmp:
src = tmp2;
}
if (src == fp) {
- src_adj = A64_SP;
+ src_adj = ctx->priv_sp_used ? priv_sp : A64_SP;
off_adj = off + ctx->stack_size;
} else {
src_adj = src;
@@ -1630,17 +1666,14 @@ emit_cond_jmp:
return ret;
break;
- /* speculation barrier */
+ /* speculation barrier against v1 and v4 */
case BPF_ST | BPF_NOSPEC:
- /*
- * Nothing required here.
- *
- * In case of arm64, we rely on the firmware mitigation of
- * Speculative Store Bypass as controlled via the ssbd kernel
- * parameter. Whenever the mitigation is enabled, it works
- * for all of the kernel code with no need to provide any
- * additional instructions.
- */
+ if (alternative_has_cap_likely(ARM64_HAS_SB)) {
+ emit(A64_SB, ctx);
+ } else {
+ emit(A64_DSB_NSH, ctx);
+ emit(A64_ISB, ctx);
+ }
break;
/* ST: *(size *)(dst + off) = imm */
@@ -1657,7 +1690,7 @@ emit_cond_jmp:
dst = tmp2;
}
if (dst == fp) {
- dst_adj = A64_SP;
+ dst_adj = ctx->priv_sp_used ? priv_sp : A64_SP;
off_adj = off + ctx->stack_size;
} else {
dst_adj = dst;
@@ -1719,7 +1752,7 @@ emit_cond_jmp:
dst = tmp2;
}
if (dst == fp) {
- dst_adj = A64_SP;
+ dst_adj = ctx->priv_sp_used ? priv_sp : A64_SP;
off_adj = off + ctx->stack_size;
} else {
dst_adj = dst;
@@ -1862,6 +1895,39 @@ static inline void bpf_flush_icache(void *start, void *end)
flush_icache_range((unsigned long)start, (unsigned long)end);
}
+static void priv_stack_init_guard(void __percpu *priv_stack_ptr, int alloc_size)
+{
+ int cpu, underflow_idx = (alloc_size - PRIV_STACK_GUARD_SZ) >> 3;
+ u64 *stack_ptr;
+
+ for_each_possible_cpu(cpu) {
+ stack_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
+ stack_ptr[0] = PRIV_STACK_GUARD_VAL;
+ stack_ptr[1] = PRIV_STACK_GUARD_VAL;
+ stack_ptr[underflow_idx] = PRIV_STACK_GUARD_VAL;
+ stack_ptr[underflow_idx + 1] = PRIV_STACK_GUARD_VAL;
+ }
+}
+
+static void priv_stack_check_guard(void __percpu *priv_stack_ptr, int alloc_size,
+ struct bpf_prog *prog)
+{
+ int cpu, underflow_idx = (alloc_size - PRIV_STACK_GUARD_SZ) >> 3;
+ u64 *stack_ptr;
+
+ for_each_possible_cpu(cpu) {
+ stack_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
+ if (stack_ptr[0] != PRIV_STACK_GUARD_VAL ||
+ stack_ptr[1] != PRIV_STACK_GUARD_VAL ||
+ stack_ptr[underflow_idx] != PRIV_STACK_GUARD_VAL ||
+ stack_ptr[underflow_idx + 1] != PRIV_STACK_GUARD_VAL) {
+ pr_err("BPF private stack overflow/underflow detected for prog %sx\n",
+ bpf_jit_get_prog_name(prog));
+ break;
+ }
+ }
+}
+
struct arm64_jit_data {
struct bpf_binary_header *header;
u8 *ro_image;
@@ -1874,9 +1940,11 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
int image_size, prog_size, extable_size, extable_align, extable_offset;
struct bpf_prog *tmp, *orig_prog = prog;
struct bpf_binary_header *header;
- struct bpf_binary_header *ro_header;
+ struct bpf_binary_header *ro_header = NULL;
struct arm64_jit_data *jit_data;
+ void __percpu *priv_stack_ptr = NULL;
bool was_classic = bpf_prog_was_classic(prog);
+ int priv_stack_alloc_sz;
bool tmp_blinded = false;
bool extra_pass = false;
struct jit_ctx ctx;
@@ -1908,6 +1976,23 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
}
prog->aux->jit_data = jit_data;
}
+ priv_stack_ptr = prog->aux->priv_stack_ptr;
+ if (!priv_stack_ptr && prog->aux->jits_use_priv_stack) {
+ /* Allocate actual private stack size with verifier-calculated
+ * stack size plus two memory guards to protect overflow and
+ * underflow.
+ */
+ priv_stack_alloc_sz = round_up(prog->aux->stack_depth, 16) +
+ 2 * PRIV_STACK_GUARD_SZ;
+ priv_stack_ptr = __alloc_percpu_gfp(priv_stack_alloc_sz, 16, GFP_KERNEL);
+ if (!priv_stack_ptr) {
+ prog = orig_prog;
+ goto out_priv_stack;
+ }
+
+ priv_stack_init_guard(priv_stack_ptr, priv_stack_alloc_sz);
+ prog->aux->priv_stack_ptr = priv_stack_ptr;
+ }
if (jit_data->ctx.offset) {
ctx = jit_data->ctx;
ro_image_ptr = jit_data->ro_image;
@@ -1931,6 +2016,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
ctx.user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena);
ctx.arena_vm_start = bpf_arena_get_kern_vm_start(prog->aux->arena);
+ if (priv_stack_ptr)
+ ctx.priv_sp_used = true;
+
/* Pass 1: Estimate the maximum image size.
*
* BPF line info needs ctx->offset[i] to be the offset of
@@ -2070,7 +2158,12 @@ skip_init_ctx:
ctx.offset[i] *= AARCH64_INSN_SIZE;
bpf_prog_fill_jited_linfo(prog, ctx.offset + 1);
out_off:
+ if (!ro_header && priv_stack_ptr) {
+ free_percpu(priv_stack_ptr);
+ prog->aux->priv_stack_ptr = NULL;
+ }
kvfree(ctx.offset);
+out_priv_stack:
kfree(jit_data);
prog->aux->jit_data = NULL;
}
@@ -2089,6 +2182,11 @@ out_free_hdr:
goto out_off;
}
+bool bpf_jit_supports_private_stack(void)
+{
+ return true;
+}
+
bool bpf_jit_supports_kfunc_call(void)
{
return true;
@@ -2243,11 +2341,6 @@ static int calc_arg_aux(const struct btf_func_model *m,
/* the rest arguments are passed through stack */
for (; i < m->nr_args; i++) {
- /* We can not know for sure about exact alignment needs for
- * struct passed on stack, so deny those
- */
- if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG)
- return -ENOTSUPP;
stack_slots = (m->arg_size[i] + 7) / 8;
a->bstack_for_args += stack_slots * 8;
a->ostack_for_args = a->ostack_for_args + stack_slots * 8;
@@ -2911,6 +3004,17 @@ bool bpf_jit_supports_percpu_insn(void)
return true;
}
+bool bpf_jit_bypass_spec_v4(void)
+{
+ /* In case of arm64, we rely on the firmware mitigation of Speculative
+ * Store Bypass as controlled via the ssbd kernel parameter. Whenever
+ * the mitigation is enabled, it works for all of the kernel code with
+ * no need to provide any additional instructions. Therefore, skip
+ * inserting nospec insns against Spectre v4.
+ */
+ return true;
+}
+
bool bpf_jit_inlines_helper_call(s32 imm)
{
switch (imm) {
@@ -2928,6 +3032,8 @@ void bpf_jit_free(struct bpf_prog *prog)
if (prog->jited) {
struct arm64_jit_data *jit_data = prog->aux->jit_data;
struct bpf_binary_header *hdr;
+ void __percpu *priv_stack_ptr;
+ int priv_stack_alloc_sz;
/*
* If we fail the final pass of JIT (from jit_subprogs),
@@ -2941,6 +3047,13 @@ void bpf_jit_free(struct bpf_prog *prog)
}
hdr = bpf_jit_binary_pack_hdr(prog);
bpf_jit_binary_pack_free(hdr, NULL);
+ priv_stack_ptr = prog->aux->priv_stack_ptr;
+ if (priv_stack_ptr) {
+ priv_stack_alloc_sz = round_up(prog->aux->stack_depth, 16) +
+ 2 * PRIV_STACK_GUARD_SZ;
+ priv_stack_check_guard(priv_stack_ptr, priv_stack_alloc_sz, prog);
+ free_percpu(prog->aux->priv_stack_ptr);
+ }
WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog));
}
diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
index 066b0ee8b538..ef0b7946f5a4 100644
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -35,7 +35,8 @@ HAS_GENERIC_AUTH
HAS_GENERIC_AUTH_ARCH_QARMA3
HAS_GENERIC_AUTH_ARCH_QARMA5
HAS_GENERIC_AUTH_IMP_DEF
-HAS_GIC_CPUIF_SYSREGS
+HAS_GICV3_CPUIF
+HAS_GICV5_CPUIF
HAS_GIC_PRIO_MASKING
HAS_GIC_PRIO_RELAXED_SYNC
HAS_HCR_NV1
@@ -50,6 +51,7 @@ HAS_PAN
HAS_PMUV3
HAS_S1PIE
HAS_S1POE
+HAS_SCTLR2
HAS_RAS_EXTN
HAS_RNG
HAS_SB
diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg
index 81c4e214ba33..696ab1f32a67 100644
--- a/arch/arm64/tools/sysreg
+++ b/arch/arm64/tools/sysreg
@@ -1314,7 +1314,10 @@ UnsignedEnum 19:16 UINJ
0b0000 NI
0b0001 IMP
EndEnum
-Res0 15:12
+UnsignedEnum 15:12 GCIE
+ 0b0000 NI
+ 0b0001 IMP
+EndEnum
UnsignedEnum 11:8 MTEFAR
0b0000 NI
0b0001 IMP
@@ -3153,6 +3156,435 @@ Sysreg PMIAR_EL1 3 0 9 14 7
Field 63:0 ADDRESS
EndSysreg
+SysregFields ICC_PPI_HMRx_EL1
+Field 63 HM63
+Field 62 HM62
+Field 61 HM61
+Field 60 HM60
+Field 59 HM59
+Field 58 HM58
+Field 57 HM57
+Field 56 HM56
+Field 55 HM55
+Field 54 HM54
+Field 53 HM53
+Field 52 HM52
+Field 51 HM51
+Field 50 HM50
+Field 49 HM49
+Field 48 HM48
+Field 47 HM47
+Field 46 HM46
+Field 45 HM45
+Field 44 HM44
+Field 43 HM43
+Field 42 HM42
+Field 41 HM41
+Field 40 HM40
+Field 39 HM39
+Field 38 HM38
+Field 37 HM37
+Field 36 HM36
+Field 35 HM35
+Field 34 HM34
+Field 33 HM33
+Field 32 HM32
+Field 31 HM31
+Field 30 HM30
+Field 29 HM29
+Field 28 HM28
+Field 27 HM27
+Field 26 HM26
+Field 25 HM25
+Field 24 HM24
+Field 23 HM23
+Field 22 HM22
+Field 21 HM21
+Field 20 HM20
+Field 19 HM19
+Field 18 HM18
+Field 17 HM17
+Field 16 HM16
+Field 15 HM15
+Field 14 HM14
+Field 13 HM13
+Field 12 HM12
+Field 11 HM11
+Field 10 HM10
+Field 9 HM9
+Field 8 HM8
+Field 7 HM7
+Field 6 HM6
+Field 5 HM5
+Field 4 HM4
+Field 3 HM3
+Field 2 HM2
+Field 1 HM1
+Field 0 HM0
+EndSysregFields
+
+Sysreg ICC_PPI_HMR0_EL1 3 0 12 10 0
+Fields ICC_PPI_HMRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_HMR1_EL1 3 0 12 10 1
+Fields ICC_PPI_HMRx_EL1
+EndSysreg
+
+Sysreg ICC_IDR0_EL1 3 0 12 10 2
+Res0 63:12
+UnsignedEnum 11:8 GCIE_LEGACY
+ 0b0000 NI
+ 0b0001 IMP
+EndEnum
+UnsignedEnum 7:4 PRI_BITS
+ 0b0011 4BITS
+ 0b0100 5BITS
+EndEnum
+UnsignedEnum 3:0 ID_BITS
+ 0b0000 16BITS
+ 0b0001 24BITS
+EndEnum
+EndSysreg
+
+Sysreg ICC_ICSR_EL1 3 0 12 10 4
+Res0 63:48
+Field 47:32 IAFFID
+Res0 31:16
+Field 15:11 Priority
+Res0 10:6
+Field 5 HM
+Field 4 Active
+Field 3 IRM
+Field 2 Pending
+Field 1 Enabled
+Field 0 F
+EndSysreg
+
+SysregFields ICC_PPI_ENABLERx_EL1
+Field 63 EN63
+Field 62 EN62
+Field 61 EN61
+Field 60 EN60
+Field 59 EN59
+Field 58 EN58
+Field 57 EN57
+Field 56 EN56
+Field 55 EN55
+Field 54 EN54
+Field 53 EN53
+Field 52 EN52
+Field 51 EN51
+Field 50 EN50
+Field 49 EN49
+Field 48 EN48
+Field 47 EN47
+Field 46 EN46
+Field 45 EN45
+Field 44 EN44
+Field 43 EN43
+Field 42 EN42
+Field 41 EN41
+Field 40 EN40
+Field 39 EN39
+Field 38 EN38
+Field 37 EN37
+Field 36 EN36
+Field 35 EN35
+Field 34 EN34
+Field 33 EN33
+Field 32 EN32
+Field 31 EN31
+Field 30 EN30
+Field 29 EN29
+Field 28 EN28
+Field 27 EN27
+Field 26 EN26
+Field 25 EN25
+Field 24 EN24
+Field 23 EN23
+Field 22 EN22
+Field 21 EN21
+Field 20 EN20
+Field 19 EN19
+Field 18 EN18
+Field 17 EN17
+Field 16 EN16
+Field 15 EN15
+Field 14 EN14
+Field 13 EN13
+Field 12 EN12
+Field 11 EN11
+Field 10 EN10
+Field 9 EN9
+Field 8 EN8
+Field 7 EN7
+Field 6 EN6
+Field 5 EN5
+Field 4 EN4
+Field 3 EN3
+Field 2 EN2
+Field 1 EN1
+Field 0 EN0
+EndSysregFields
+
+Sysreg ICC_PPI_ENABLER0_EL1 3 0 12 10 6
+Fields ICC_PPI_ENABLERx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_ENABLER1_EL1 3 0 12 10 7
+Fields ICC_PPI_ENABLERx_EL1
+EndSysreg
+
+SysregFields ICC_PPI_ACTIVERx_EL1
+Field 63 Active63
+Field 62 Active62
+Field 61 Active61
+Field 60 Active60
+Field 59 Active59
+Field 58 Active58
+Field 57 Active57
+Field 56 Active56
+Field 55 Active55
+Field 54 Active54
+Field 53 Active53
+Field 52 Active52
+Field 51 Active51
+Field 50 Active50
+Field 49 Active49
+Field 48 Active48
+Field 47 Active47
+Field 46 Active46
+Field 45 Active45
+Field 44 Active44
+Field 43 Active43
+Field 42 Active42
+Field 41 Active41
+Field 40 Active40
+Field 39 Active39
+Field 38 Active38
+Field 37 Active37
+Field 36 Active36
+Field 35 Active35
+Field 34 Active34
+Field 33 Active33
+Field 32 Active32
+Field 31 Active31
+Field 30 Active30
+Field 29 Active29
+Field 28 Active28
+Field 27 Active27
+Field 26 Active26
+Field 25 Active25
+Field 24 Active24
+Field 23 Active23
+Field 22 Active22
+Field 21 Active21
+Field 20 Active20
+Field 19 Active19
+Field 18 Active18
+Field 17 Active17
+Field 16 Active16
+Field 15 Active15
+Field 14 Active14
+Field 13 Active13
+Field 12 Active12
+Field 11 Active11
+Field 10 Active10
+Field 9 Active9
+Field 8 Active8
+Field 7 Active7
+Field 6 Active6
+Field 5 Active5
+Field 4 Active4
+Field 3 Active3
+Field 2 Active2
+Field 1 Active1
+Field 0 Active0
+EndSysregFields
+
+Sysreg ICC_PPI_CACTIVER0_EL1 3 0 12 13 0
+Fields ICC_PPI_ACTIVERx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_CACTIVER1_EL1 3 0 12 13 1
+Fields ICC_PPI_ACTIVERx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_SACTIVER0_EL1 3 0 12 13 2
+Fields ICC_PPI_ACTIVERx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_SACTIVER1_EL1 3 0 12 13 3
+Fields ICC_PPI_ACTIVERx_EL1
+EndSysreg
+
+SysregFields ICC_PPI_PENDRx_EL1
+Field 63 Pend63
+Field 62 Pend62
+Field 61 Pend61
+Field 60 Pend60
+Field 59 Pend59
+Field 58 Pend58
+Field 57 Pend57
+Field 56 Pend56
+Field 55 Pend55
+Field 54 Pend54
+Field 53 Pend53
+Field 52 Pend52
+Field 51 Pend51
+Field 50 Pend50
+Field 49 Pend49
+Field 48 Pend48
+Field 47 Pend47
+Field 46 Pend46
+Field 45 Pend45
+Field 44 Pend44
+Field 43 Pend43
+Field 42 Pend42
+Field 41 Pend41
+Field 40 Pend40
+Field 39 Pend39
+Field 38 Pend38
+Field 37 Pend37
+Field 36 Pend36
+Field 35 Pend35
+Field 34 Pend34
+Field 33 Pend33
+Field 32 Pend32
+Field 31 Pend31
+Field 30 Pend30
+Field 29 Pend29
+Field 28 Pend28
+Field 27 Pend27
+Field 26 Pend26
+Field 25 Pend25
+Field 24 Pend24
+Field 23 Pend23
+Field 22 Pend22
+Field 21 Pend21
+Field 20 Pend20
+Field 19 Pend19
+Field 18 Pend18
+Field 17 Pend17
+Field 16 Pend16
+Field 15 Pend15
+Field 14 Pend14
+Field 13 Pend13
+Field 12 Pend12
+Field 11 Pend11
+Field 10 Pend10
+Field 9 Pend9
+Field 8 Pend8
+Field 7 Pend7
+Field 6 Pend6
+Field 5 Pend5
+Field 4 Pend4
+Field 3 Pend3
+Field 2 Pend2
+Field 1 Pend1
+Field 0 Pend0
+EndSysregFields
+
+Sysreg ICC_PPI_CPENDR0_EL1 3 0 12 13 4
+Fields ICC_PPI_PENDRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_CPENDR1_EL1 3 0 12 13 5
+Fields ICC_PPI_PENDRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_SPENDR0_EL1 3 0 12 13 6
+Fields ICC_PPI_PENDRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_SPENDR1_EL1 3 0 12 13 7
+Fields ICC_PPI_PENDRx_EL1
+EndSysreg
+
+SysregFields ICC_PPI_PRIORITYRx_EL1
+Res0 63:61
+Field 60:56 Priority7
+Res0 55:53
+Field 52:48 Priority6
+Res0 47:45
+Field 44:40 Priority5
+Res0 39:37
+Field 36:32 Priority4
+Res0 31:29
+Field 28:24 Priority3
+Res0 23:21
+Field 20:16 Priority2
+Res0 15:13
+Field 12:8 Priority1
+Res0 7:5
+Field 4:0 Priority0
+EndSysregFields
+
+Sysreg ICC_PPI_PRIORITYR0_EL1 3 0 12 14 0
+Fields ICC_PPI_PRIORITYRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_PRIORITYR1_EL1 3 0 12 14 1
+Fields ICC_PPI_PRIORITYRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_PRIORITYR2_EL1 3 0 12 14 2
+Fields ICC_PPI_PRIORITYRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_PRIORITYR3_EL1 3 0 12 14 3
+Fields ICC_PPI_PRIORITYRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_PRIORITYR4_EL1 3 0 12 14 4
+Fields ICC_PPI_PRIORITYRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_PRIORITYR5_EL1 3 0 12 14 5
+Fields ICC_PPI_PRIORITYRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_PRIORITYR6_EL1 3 0 12 14 6
+Fields ICC_PPI_PRIORITYRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_PRIORITYR7_EL1 3 0 12 14 7
+Fields ICC_PPI_PRIORITYRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_PRIORITYR8_EL1 3 0 12 15 0
+Fields ICC_PPI_PRIORITYRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_PRIORITYR9_EL1 3 0 12 15 1
+Fields ICC_PPI_PRIORITYRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_PRIORITYR10_EL1 3 0 12 15 2
+Fields ICC_PPI_PRIORITYRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_PRIORITYR11_EL1 3 0 12 15 3
+Fields ICC_PPI_PRIORITYRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_PRIORITYR12_EL1 3 0 12 15 4
+Fields ICC_PPI_PRIORITYRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_PRIORITYR13_EL1 3 0 12 15 5
+Fields ICC_PPI_PRIORITYRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_PRIORITYR14_EL1 3 0 12 15 6
+Fields ICC_PPI_PRIORITYRx_EL1
+EndSysreg
+
+Sysreg ICC_PPI_PRIORITYR15_EL1 3 0 12 15 7
+Fields ICC_PPI_PRIORITYRx_EL1
+EndSysreg
+
Sysreg PMSELR_EL0 3 3 9 12 5
Res0 63:5
Field 4:0 SEL
@@ -3235,6 +3667,19 @@ Res0 14:12
Field 11:0 AFFINITY
EndSysreg
+Sysreg ICC_CR0_EL1 3 1 12 0 1
+Res0 63:39
+Field 38 PID
+Field 37:32 IPPT
+Res0 31:1
+Field 0 EN
+EndSysreg
+
+Sysreg ICC_PCR_EL1 3 1 12 0 2
+Res0 63:5
+Field 4:0 PRIORITY
+EndSysreg
+
Sysreg CSSELR_EL1 3 2 0 0 0
Res0 63:5
Field 4 TnD
@@ -4121,6 +4566,54 @@ Field 31:16 PhyPARTID29
Field 15:0 PhyPARTID28
EndSysreg
+Sysreg ICH_HFGRTR_EL2 3 4 12 9 4
+Res0 63:21
+Field 20 ICC_PPI_ACTIVERn_EL1
+Field 19 ICC_PPI_PRIORITYRn_EL1
+Field 18 ICC_PPI_PENDRn_EL1
+Field 17 ICC_PPI_ENABLERn_EL1
+Field 16 ICC_PPI_HMRn_EL1
+Res0 15:8
+Field 7 ICC_IAFFIDR_EL1
+Field 6 ICC_ICSR_EL1
+Field 5 ICC_PCR_EL1
+Field 4 ICC_HPPIR_EL1
+Field 3 ICC_HAPR_EL1
+Field 2 ICC_CR0_EL1
+Field 1 ICC_IDRn_EL1
+Field 0 ICC_APR_EL1
+EndSysreg
+
+Sysreg ICH_HFGWTR_EL2 3 4 12 9 6
+Res0 63:21
+Field 20 ICC_PPI_ACTIVERn_EL1
+Field 19 ICC_PPI_PRIORITYRn_EL1
+Field 18 ICC_PPI_PENDRn_EL1
+Field 17 ICC_PPI_ENABLERn_EL1
+Res0 16:7
+Field 6 ICC_ICSR_EL1
+Field 5 ICC_PCR_EL1
+Res0 4:3
+Field 2 ICC_CR0_EL1
+Res0 1
+Field 0 ICC_APR_EL1
+EndSysreg
+
+Sysreg ICH_HFGITR_EL2 3 4 12 9 7
+Res0 63:11
+Field 10 GICRCDNMIA
+Field 9 GICRCDIA
+Field 8 GICCDDI
+Field 7 GICCDEOI
+Field 6 GICCDHM
+Field 5 GICCDRCFG
+Field 4 GICCDPEND
+Field 3 GICCDAFF
+Field 2 GICCDPRI
+Field 1 GICCDDIS
+Field 0 GICCDEN
+EndSysreg
+
Sysreg ICH_HCR_EL2 3 4 12 11 0
Res0 63:32
Field 31:27 EOIcount
@@ -4169,6 +4662,12 @@ Field 1 U
Field 0 EOI
EndSysreg
+Sysreg ICH_VCTLR_EL2 3 4 12 11 4
+Res0 63:2
+Field 1 V3
+Field 0 En
+EndSysreg
+
Sysreg CONTEXTIDR_EL2 3 4 13 0 1
Fields CONTEXTIDR_ELx
EndSysreg
@@ -4282,7 +4781,13 @@ Mapping TCR_EL1
EndSysreg
Sysreg TCR2_EL1 3 0 2 0 3
-Res0 63:16
+Res0 63:22
+Field 21 FNGNA1
+Field 20 FNGNA0
+Res0 19
+Field 18 FNG1
+Field 17 FNG0
+Field 16 A2
Field 15 DisCH1
Field 14 DisCH0
Res0 13:12
@@ -4306,7 +4811,10 @@ Mapping TCR2_EL1
EndSysreg
Sysreg TCR2_EL2 3 4 2 0 3
-Res0 63:16
+Res0 63:19
+Field 18 FNG1
+Field 17 FNG0
+Field 16 A2
Field 15 DisCH1
Field 14 DisCH0
Field 13 AMEC1
diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index acc431c331b0..4331313a42ff 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -80,7 +80,6 @@ config CSKY
select HAVE_FUNCTION_TRACER
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_ERROR_INJECTION
- select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_LZO
select HAVE_KERNEL_LZMA
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index fa3eb87f0999..f0abc38c40ac 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -24,7 +24,6 @@ config LOONGARCH
select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
select ARCH_HAS_PREEMPT_LAZY
- select ARCH_HAS_PTE_DEVMAP
select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_SET_MEMORY
select ARCH_HAS_SET_DIRECT_MAP
@@ -143,7 +142,6 @@ config LOONGARCH
select HAVE_EXIT_THREAD
select HAVE_GUP_FAST
select HAVE_FTRACE_GRAPH_FUNC
- select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_ARG_ACCESS_API
select HAVE_FUNCTION_ERROR_INJECTION
select HAVE_FUNCTION_GRAPH_FREGS
diff --git a/arch/loongarch/configs/loongson3_defconfig b/arch/loongarch/configs/loongson3_defconfig
index 0d59af6007b7..68e337aed2bb 100644
--- a/arch/loongarch/configs/loongson3_defconfig
+++ b/arch/loongarch/configs/loongson3_defconfig
@@ -225,7 +225,6 @@ CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
CONFIG_NETFILTER_XT_MATCH_CPU=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m
CONFIG_NETFILTER_XT_MATCH_DSCP=m
CONFIG_NETFILTER_XT_MATCH_ESP=m
diff --git a/arch/loongarch/include/asm/hugetlb.h b/arch/loongarch/include/asm/hugetlb.h
index 4dc4b3e04225..ab68b594f889 100644
--- a/arch/loongarch/include/asm/hugetlb.h
+++ b/arch/loongarch/include/asm/hugetlb.h
@@ -10,20 +10,6 @@
uint64_t pmd_to_entrylo(unsigned long pmd_val);
-#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE
-static inline int prepare_hugepage_range(struct file *file,
- unsigned long addr,
- unsigned long len)
-{
- unsigned long task_size = STACK_TOP;
-
- if (len > task_size)
- return -ENOMEM;
- if (task_size - len < addr)
- return -EINVAL;
- return 0;
-}
-
#define __HAVE_ARCH_HUGE_PTE_CLEAR
static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned long sz)
diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h
index a3c4cc46c892..0cecbd038bb3 100644
--- a/arch/loongarch/include/asm/kvm_host.h
+++ b/arch/loongarch/include/asm/kvm_host.h
@@ -50,12 +50,6 @@ struct kvm_vm_stat {
struct kvm_vm_stat_generic generic;
u64 pages;
u64 hugepages;
- u64 ipi_read_exits;
- u64 ipi_write_exits;
- u64 eiointc_read_exits;
- u64 eiointc_write_exits;
- u64 pch_pic_read_exits;
- u64 pch_pic_write_exits;
};
struct kvm_vcpu_stat {
@@ -65,6 +59,12 @@ struct kvm_vcpu_stat {
u64 cpucfg_exits;
u64 signal_exits;
u64 hypercall_exits;
+ u64 ipi_read_exits;
+ u64 ipi_write_exits;
+ u64 eiointc_read_exits;
+ u64 eiointc_write_exits;
+ u64 pch_pic_read_exits;
+ u64 pch_pic_write_exits;
};
#define KVM_MEM_HUGEPAGE_CAPABLE (1UL << 0)
diff --git a/arch/loongarch/include/asm/pgtable-bits.h b/arch/loongarch/include/asm/pgtable-bits.h
index 7bbfb04a54cc..2fc3789220ac 100644
--- a/arch/loongarch/include/asm/pgtable-bits.h
+++ b/arch/loongarch/include/asm/pgtable-bits.h
@@ -22,7 +22,6 @@
#define _PAGE_PFN_SHIFT 12
#define _PAGE_SWP_EXCLUSIVE_SHIFT 23
#define _PAGE_PFN_END_SHIFT 48
-#define _PAGE_DEVMAP_SHIFT 59
#define _PAGE_PRESENT_INVALID_SHIFT 60
#define _PAGE_NO_READ_SHIFT 61
#define _PAGE_NO_EXEC_SHIFT 62
@@ -36,7 +35,6 @@
#define _PAGE_MODIFIED (_ULCAST_(1) << _PAGE_MODIFIED_SHIFT)
#define _PAGE_PROTNONE (_ULCAST_(1) << _PAGE_PROTNONE_SHIFT)
#define _PAGE_SPECIAL (_ULCAST_(1) << _PAGE_SPECIAL_SHIFT)
-#define _PAGE_DEVMAP (_ULCAST_(1) << _PAGE_DEVMAP_SHIFT)
/* We borrow bit 23 to store the exclusive marker in swap PTEs. */
#define _PAGE_SWP_EXCLUSIVE (_ULCAST_(1) << _PAGE_SWP_EXCLUSIVE_SHIFT)
@@ -76,8 +74,8 @@
#define __READABLE (_PAGE_VALID)
#define __WRITEABLE (_PAGE_DIRTY | _PAGE_WRITE)
-#define _PAGE_CHG_MASK (_PAGE_MODIFIED | _PAGE_SPECIAL | _PAGE_DEVMAP | _PFN_MASK | _CACHE_MASK | _PAGE_PLV)
-#define _HPAGE_CHG_MASK (_PAGE_MODIFIED | _PAGE_SPECIAL | _PAGE_DEVMAP | _PFN_MASK | _CACHE_MASK | _PAGE_PLV | _PAGE_HUGE)
+#define _PAGE_CHG_MASK (_PAGE_MODIFIED | _PAGE_SPECIAL | _PFN_MASK | _CACHE_MASK | _PAGE_PLV)
+#define _HPAGE_CHG_MASK (_PAGE_MODIFIED | _PAGE_SPECIAL | _PFN_MASK | _CACHE_MASK | _PAGE_PLV | _PAGE_HUGE)
#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_NO_READ | \
_PAGE_USER | _CACHE_CC)
diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h
index f2aeff544cee..bd128696e96d 100644
--- a/arch/loongarch/include/asm/pgtable.h
+++ b/arch/loongarch/include/asm/pgtable.h
@@ -409,9 +409,6 @@ static inline int pte_special(pte_t pte) { return pte_val(pte) & _PAGE_SPECIAL;
static inline pte_t pte_mkspecial(pte_t pte) { pte_val(pte) |= _PAGE_SPECIAL; return pte; }
#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
-static inline int pte_devmap(pte_t pte) { return !!(pte_val(pte) & _PAGE_DEVMAP); }
-static inline pte_t pte_mkdevmap(pte_t pte) { pte_val(pte) |= _PAGE_DEVMAP; return pte; }
-
#define pte_accessible pte_accessible
static inline unsigned long pte_accessible(struct mm_struct *mm, pte_t a)
{
@@ -540,17 +537,6 @@ static inline pmd_t pmd_mkyoung(pmd_t pmd)
return pmd;
}
-static inline int pmd_devmap(pmd_t pmd)
-{
- return !!(pmd_val(pmd) & _PAGE_DEVMAP);
-}
-
-static inline pmd_t pmd_mkdevmap(pmd_t pmd)
-{
- pmd_val(pmd) |= _PAGE_DEVMAP;
- return pmd;
-}
-
static inline struct page *pmd_page(pmd_t pmd)
{
if (pmd_trans_huge(pmd))
@@ -606,11 +592,6 @@ static inline long pmd_protnone(pmd_t pmd)
#define pmd_leaf(pmd) ((pmd_val(pmd) & _PAGE_HUGE) != 0)
#define pud_leaf(pud) ((pud_val(pud) & _PAGE_HUGE) != 0)
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define pud_devmap(pud) (0)
-#define pgd_devmap(pgd) (0)
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
/*
* We provide our own get_unmapped area to cope with the virtual aliasing
* constraints placed on us by the cache architecture.
diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c
index fa52251b3bf1..2ce41f93b2a4 100644
--- a/arch/loongarch/kvm/exit.c
+++ b/arch/loongarch/kvm/exit.c
@@ -289,9 +289,11 @@ static int kvm_trap_handle_gspr(struct kvm_vcpu *vcpu)
er = EMULATE_FAIL;
switch (((inst.word >> 24) & 0xff)) {
case 0x0: /* CPUCFG GSPR */
+ trace_kvm_exit_cpucfg(vcpu, KVM_TRACE_EXIT_CPUCFG);
er = kvm_emu_cpucfg(vcpu, inst);
break;
case 0x4: /* CSR{RD,WR,XCHG} GSPR */
+ trace_kvm_exit_csr(vcpu, KVM_TRACE_EXIT_CSR);
er = kvm_handle_csr(vcpu, inst);
break;
case 0x6: /* Cache, Idle and IOCSR GSPR */
@@ -821,32 +823,25 @@ static int kvm_handle_lbt_disabled(struct kvm_vcpu *vcpu, int ecode)
return RESUME_GUEST;
}
-static int kvm_send_pv_ipi(struct kvm_vcpu *vcpu)
+static void kvm_send_pv_ipi(struct kvm_vcpu *vcpu)
{
- unsigned int min, cpu, i;
- unsigned long ipi_bitmap;
+ unsigned int min, cpu;
struct kvm_vcpu *dest;
+ DECLARE_BITMAP(ipi_bitmap, BITS_PER_LONG * 2) = {
+ kvm_read_reg(vcpu, LOONGARCH_GPR_A1),
+ kvm_read_reg(vcpu, LOONGARCH_GPR_A2)
+ };
min = kvm_read_reg(vcpu, LOONGARCH_GPR_A3);
- for (i = 0; i < 2; i++, min += BITS_PER_LONG) {
- ipi_bitmap = kvm_read_reg(vcpu, LOONGARCH_GPR_A1 + i);
- if (!ipi_bitmap)
+ for_each_set_bit(cpu, ipi_bitmap, BITS_PER_LONG * 2) {
+ dest = kvm_get_vcpu_by_cpuid(vcpu->kvm, cpu + min);
+ if (!dest)
continue;
- cpu = find_first_bit((void *)&ipi_bitmap, BITS_PER_LONG);
- while (cpu < BITS_PER_LONG) {
- dest = kvm_get_vcpu_by_cpuid(vcpu->kvm, cpu + min);
- cpu = find_next_bit((void *)&ipi_bitmap, BITS_PER_LONG, cpu + 1);
- if (!dest)
- continue;
-
- /* Send SWI0 to dest vcpu to emulate IPI interrupt */
- kvm_queue_irq(dest, INT_SWI0);
- kvm_vcpu_kick(dest);
- }
+ /* Send SWI0 to dest vcpu to emulate IPI interrupt */
+ kvm_queue_irq(dest, INT_SWI0);
+ kvm_vcpu_kick(dest);
}
-
- return 0;
}
/*
diff --git a/arch/loongarch/kvm/intc/eiointc.c b/arch/loongarch/kvm/intc/eiointc.c
index a75f865d6fb9..a3a12af9ecbf 100644
--- a/arch/loongarch/kvm/intc/eiointc.c
+++ b/arch/loongarch/kvm/intc/eiointc.c
@@ -9,7 +9,7 @@
static void eiointc_set_sw_coreisr(struct loongarch_eiointc *s)
{
- int ipnum, cpu, cpuid, irq_index, irq_mask, irq;
+ int ipnum, cpu, cpuid, irq;
struct kvm_vcpu *vcpu;
for (irq = 0; irq < EIOINTC_IRQS; irq++) {
@@ -18,8 +18,6 @@ static void eiointc_set_sw_coreisr(struct loongarch_eiointc *s)
ipnum = count_trailing_zeros(ipnum);
ipnum = (ipnum >= 0 && ipnum < 4) ? ipnum : 0;
}
- irq_index = irq / 32;
- irq_mask = BIT(irq & 0x1f);
cpuid = s->coremap.reg_u8[irq];
vcpu = kvm_get_vcpu_by_cpuid(s->kvm, cpuid);
@@ -27,16 +25,16 @@ static void eiointc_set_sw_coreisr(struct loongarch_eiointc *s)
continue;
cpu = vcpu->vcpu_id;
- if (!!(s->coreisr.reg_u32[cpu][irq_index] & irq_mask))
- set_bit(irq, s->sw_coreisr[cpu][ipnum]);
+ if (test_bit(irq, (unsigned long *)s->coreisr.reg_u32[cpu]))
+ __set_bit(irq, s->sw_coreisr[cpu][ipnum]);
else
- clear_bit(irq, s->sw_coreisr[cpu][ipnum]);
+ __clear_bit(irq, s->sw_coreisr[cpu][ipnum]);
}
}
static void eiointc_update_irq(struct loongarch_eiointc *s, int irq, int level)
{
- int ipnum, cpu, found, irq_index, irq_mask;
+ int ipnum, cpu, found;
struct kvm_vcpu *vcpu;
struct kvm_interrupt vcpu_irq;
@@ -48,19 +46,16 @@ static void eiointc_update_irq(struct loongarch_eiointc *s, int irq, int level)
cpu = s->sw_coremap[irq];
vcpu = kvm_get_vcpu(s->kvm, cpu);
- irq_index = irq / 32;
- irq_mask = BIT(irq & 0x1f);
-
if (level) {
/* if not enable return false */
- if (((s->enable.reg_u32[irq_index]) & irq_mask) == 0)
+ if (!test_bit(irq, (unsigned long *)s->enable.reg_u32))
return;
- s->coreisr.reg_u32[cpu][irq_index] |= irq_mask;
+ __set_bit(irq, (unsigned long *)s->coreisr.reg_u32[cpu]);
found = find_first_bit(s->sw_coreisr[cpu][ipnum], EIOINTC_IRQS);
- set_bit(irq, s->sw_coreisr[cpu][ipnum]);
+ __set_bit(irq, s->sw_coreisr[cpu][ipnum]);
} else {
- s->coreisr.reg_u32[cpu][irq_index] &= ~irq_mask;
- clear_bit(irq, s->sw_coreisr[cpu][ipnum]);
+ __clear_bit(irq, (unsigned long *)s->coreisr.reg_u32[cpu]);
+ __clear_bit(irq, s->sw_coreisr[cpu][ipnum]);
found = find_first_bit(s->sw_coreisr[cpu][ipnum], EIOINTC_IRQS);
}
@@ -110,159 +105,14 @@ void eiointc_set_irq(struct loongarch_eiointc *s, int irq, int level)
unsigned long flags;
unsigned long *isr = (unsigned long *)s->isr.reg_u8;
- level ? set_bit(irq, isr) : clear_bit(irq, isr);
spin_lock_irqsave(&s->lock, flags);
+ level ? __set_bit(irq, isr) : __clear_bit(irq, isr);
eiointc_update_irq(s, irq, level);
spin_unlock_irqrestore(&s->lock, flags);
}
-static inline void eiointc_enable_irq(struct kvm_vcpu *vcpu,
- struct loongarch_eiointc *s, int index, u8 mask, int level)
-{
- u8 val;
- int irq;
-
- val = mask & s->isr.reg_u8[index];
- irq = ffs(val);
- while (irq != 0) {
- /*
- * enable bit change from 0 to 1,
- * need to update irq by pending bits
- */
- eiointc_update_irq(s, irq - 1 + index * 8, level);
- val &= ~BIT(irq - 1);
- irq = ffs(val);
- }
-}
-
-static int loongarch_eiointc_readb(struct kvm_vcpu *vcpu, struct loongarch_eiointc *s,
- gpa_t addr, int len, void *val)
-{
- int index, ret = 0;
- u8 data = 0;
- gpa_t offset;
-
- offset = addr - EIOINTC_BASE;
- switch (offset) {
- case EIOINTC_NODETYPE_START ... EIOINTC_NODETYPE_END:
- index = offset - EIOINTC_NODETYPE_START;
- data = s->nodetype.reg_u8[index];
- break;
- case EIOINTC_IPMAP_START ... EIOINTC_IPMAP_END:
- index = offset - EIOINTC_IPMAP_START;
- data = s->ipmap.reg_u8[index];
- break;
- case EIOINTC_ENABLE_START ... EIOINTC_ENABLE_END:
- index = offset - EIOINTC_ENABLE_START;
- data = s->enable.reg_u8[index];
- break;
- case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END:
- index = offset - EIOINTC_BOUNCE_START;
- data = s->bounce.reg_u8[index];
- break;
- case EIOINTC_COREISR_START ... EIOINTC_COREISR_END:
- index = offset - EIOINTC_COREISR_START;
- data = s->coreisr.reg_u8[vcpu->vcpu_id][index];
- break;
- case EIOINTC_COREMAP_START ... EIOINTC_COREMAP_END:
- index = offset - EIOINTC_COREMAP_START;
- data = s->coremap.reg_u8[index];
- break;
- default:
- ret = -EINVAL;
- break;
- }
- *(u8 *)val = data;
-
- return ret;
-}
-
-static int loongarch_eiointc_readw(struct kvm_vcpu *vcpu, struct loongarch_eiointc *s,
- gpa_t addr, int len, void *val)
-{
- int index, ret = 0;
- u16 data = 0;
- gpa_t offset;
-
- offset = addr - EIOINTC_BASE;
- switch (offset) {
- case EIOINTC_NODETYPE_START ... EIOINTC_NODETYPE_END:
- index = (offset - EIOINTC_NODETYPE_START) >> 1;
- data = s->nodetype.reg_u16[index];
- break;
- case EIOINTC_IPMAP_START ... EIOINTC_IPMAP_END:
- index = (offset - EIOINTC_IPMAP_START) >> 1;
- data = s->ipmap.reg_u16[index];
- break;
- case EIOINTC_ENABLE_START ... EIOINTC_ENABLE_END:
- index = (offset - EIOINTC_ENABLE_START) >> 1;
- data = s->enable.reg_u16[index];
- break;
- case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END:
- index = (offset - EIOINTC_BOUNCE_START) >> 1;
- data = s->bounce.reg_u16[index];
- break;
- case EIOINTC_COREISR_START ... EIOINTC_COREISR_END:
- index = (offset - EIOINTC_COREISR_START) >> 1;
- data = s->coreisr.reg_u16[vcpu->vcpu_id][index];
- break;
- case EIOINTC_COREMAP_START ... EIOINTC_COREMAP_END:
- index = (offset - EIOINTC_COREMAP_START) >> 1;
- data = s->coremap.reg_u16[index];
- break;
- default:
- ret = -EINVAL;
- break;
- }
- *(u16 *)val = data;
-
- return ret;
-}
-
-static int loongarch_eiointc_readl(struct kvm_vcpu *vcpu, struct loongarch_eiointc *s,
- gpa_t addr, int len, void *val)
-{
- int index, ret = 0;
- u32 data = 0;
- gpa_t offset;
-
- offset = addr - EIOINTC_BASE;
- switch (offset) {
- case EIOINTC_NODETYPE_START ... EIOINTC_NODETYPE_END:
- index = (offset - EIOINTC_NODETYPE_START) >> 2;
- data = s->nodetype.reg_u32[index];
- break;
- case EIOINTC_IPMAP_START ... EIOINTC_IPMAP_END:
- index = (offset - EIOINTC_IPMAP_START) >> 2;
- data = s->ipmap.reg_u32[index];
- break;
- case EIOINTC_ENABLE_START ... EIOINTC_ENABLE_END:
- index = (offset - EIOINTC_ENABLE_START) >> 2;
- data = s->enable.reg_u32[index];
- break;
- case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END:
- index = (offset - EIOINTC_BOUNCE_START) >> 2;
- data = s->bounce.reg_u32[index];
- break;
- case EIOINTC_COREISR_START ... EIOINTC_COREISR_END:
- index = (offset - EIOINTC_COREISR_START) >> 2;
- data = s->coreisr.reg_u32[vcpu->vcpu_id][index];
- break;
- case EIOINTC_COREMAP_START ... EIOINTC_COREMAP_END:
- index = (offset - EIOINTC_COREMAP_START) >> 2;
- data = s->coremap.reg_u32[index];
- break;
- default:
- ret = -EINVAL;
- break;
- }
- *(u32 *)val = data;
-
- return ret;
-}
-
-static int loongarch_eiointc_readq(struct kvm_vcpu *vcpu, struct loongarch_eiointc *s,
- gpa_t addr, int len, void *val)
+static int loongarch_eiointc_read(struct kvm_vcpu *vcpu, struct loongarch_eiointc *s,
+ gpa_t addr, unsigned long *val)
{
int index, ret = 0;
u64 data = 0;
@@ -298,7 +148,7 @@ static int loongarch_eiointc_readq(struct kvm_vcpu *vcpu, struct loongarch_eioin
ret = -EINVAL;
break;
}
- *(u64 *)val = data;
+ *val = data;
return ret;
}
@@ -308,7 +158,7 @@ static int kvm_eiointc_read(struct kvm_vcpu *vcpu,
gpa_t addr, int len, void *val)
{
int ret = -EINVAL;
- unsigned long flags;
+ unsigned long flags, data, offset;
struct loongarch_eiointc *eiointc = vcpu->kvm->arch.eiointc;
if (!eiointc) {
@@ -321,355 +171,115 @@ static int kvm_eiointc_read(struct kvm_vcpu *vcpu,
return -EINVAL;
}
- vcpu->kvm->stat.eiointc_read_exits++;
+ offset = addr & 0x7;
+ addr -= offset;
+ vcpu->stat.eiointc_read_exits++;
spin_lock_irqsave(&eiointc->lock, flags);
+ ret = loongarch_eiointc_read(vcpu, eiointc, addr, &data);
+ spin_unlock_irqrestore(&eiointc->lock, flags);
+ if (ret)
+ return ret;
+
+ data = data >> (offset * 8);
switch (len) {
case 1:
- ret = loongarch_eiointc_readb(vcpu, eiointc, addr, len, val);
+ *(long *)val = (s8)data;
break;
case 2:
- ret = loongarch_eiointc_readw(vcpu, eiointc, addr, len, val);
+ *(long *)val = (s16)data;
break;
case 4:
- ret = loongarch_eiointc_readl(vcpu, eiointc, addr, len, val);
- break;
- case 8:
- ret = loongarch_eiointc_readq(vcpu, eiointc, addr, len, val);
+ *(long *)val = (s32)data;
break;
default:
- WARN_ONCE(1, "%s: Abnormal address access: addr 0x%llx, size %d\n",
- __func__, addr, len);
- }
- spin_unlock_irqrestore(&eiointc->lock, flags);
-
- return ret;
-}
-
-static int loongarch_eiointc_writeb(struct kvm_vcpu *vcpu,
- struct loongarch_eiointc *s,
- gpa_t addr, int len, const void *val)
-{
- int index, irq, bits, ret = 0;
- u8 cpu;
- u8 data, old_data;
- u8 coreisr, old_coreisr;
- gpa_t offset;
-
- data = *(u8 *)val;
- offset = addr - EIOINTC_BASE;
-
- switch (offset) {
- case EIOINTC_NODETYPE_START ... EIOINTC_NODETYPE_END:
- index = (offset - EIOINTC_NODETYPE_START);
- s->nodetype.reg_u8[index] = data;
- break;
- case EIOINTC_IPMAP_START ... EIOINTC_IPMAP_END:
- /*
- * ipmap cannot be set at runtime, can be set only at the beginning
- * of irqchip driver, need not update upper irq level
- */
- index = (offset - EIOINTC_IPMAP_START);
- s->ipmap.reg_u8[index] = data;
- break;
- case EIOINTC_ENABLE_START ... EIOINTC_ENABLE_END:
- index = (offset - EIOINTC_ENABLE_START);
- old_data = s->enable.reg_u8[index];
- s->enable.reg_u8[index] = data;
- /*
- * 1: enable irq.
- * update irq when isr is set.
- */
- data = s->enable.reg_u8[index] & ~old_data & s->isr.reg_u8[index];
- eiointc_enable_irq(vcpu, s, index, data, 1);
- /*
- * 0: disable irq.
- * update irq when isr is set.
- */
- data = ~s->enable.reg_u8[index] & old_data & s->isr.reg_u8[index];
- eiointc_enable_irq(vcpu, s, index, data, 0);
- break;
- case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END:
- /* do not emulate hw bounced irq routing */
- index = offset - EIOINTC_BOUNCE_START;
- s->bounce.reg_u8[index] = data;
- break;
- case EIOINTC_COREISR_START ... EIOINTC_COREISR_END:
- index = (offset - EIOINTC_COREISR_START);
- /* use attrs to get current cpu index */
- cpu = vcpu->vcpu_id;
- coreisr = data;
- old_coreisr = s->coreisr.reg_u8[cpu][index];
- /* write 1 to clear interrupt */
- s->coreisr.reg_u8[cpu][index] = old_coreisr & ~coreisr;
- coreisr &= old_coreisr;
- bits = sizeof(data) * 8;
- irq = find_first_bit((void *)&coreisr, bits);
- while (irq < bits) {
- eiointc_update_irq(s, irq + index * bits, 0);
- bitmap_clear((void *)&coreisr, irq, 1);
- irq = find_first_bit((void *)&coreisr, bits);
- }
- break;
- case EIOINTC_COREMAP_START ... EIOINTC_COREMAP_END:
- irq = offset - EIOINTC_COREMAP_START;
- index = irq;
- s->coremap.reg_u8[index] = data;
- eiointc_update_sw_coremap(s, irq, data, sizeof(data), true);
- break;
- default:
- ret = -EINVAL;
- break;
- }
-
- return ret;
-}
-
-static int loongarch_eiointc_writew(struct kvm_vcpu *vcpu,
- struct loongarch_eiointc *s,
- gpa_t addr, int len, const void *val)
-{
- int i, index, irq, bits, ret = 0;
- u8 cpu;
- u16 data, old_data;
- u16 coreisr, old_coreisr;
- gpa_t offset;
-
- data = *(u16 *)val;
- offset = addr - EIOINTC_BASE;
-
- switch (offset) {
- case EIOINTC_NODETYPE_START ... EIOINTC_NODETYPE_END:
- index = (offset - EIOINTC_NODETYPE_START) >> 1;
- s->nodetype.reg_u16[index] = data;
- break;
- case EIOINTC_IPMAP_START ... EIOINTC_IPMAP_END:
- /*
- * ipmap cannot be set at runtime, can be set only at the beginning
- * of irqchip driver, need not update upper irq level
- */
- index = (offset - EIOINTC_IPMAP_START) >> 1;
- s->ipmap.reg_u16[index] = data;
- break;
- case EIOINTC_ENABLE_START ... EIOINTC_ENABLE_END:
- index = (offset - EIOINTC_ENABLE_START) >> 1;
- old_data = s->enable.reg_u16[index];
- s->enable.reg_u16[index] = data;
- /*
- * 1: enable irq.
- * update irq when isr is set.
- */
- data = s->enable.reg_u16[index] & ~old_data & s->isr.reg_u16[index];
- for (i = 0; i < sizeof(data); i++) {
- u8 mask = (data >> (i * 8)) & 0xff;
- eiointc_enable_irq(vcpu, s, index * 2 + i, mask, 1);
- }
- /*
- * 0: disable irq.
- * update irq when isr is set.
- */
- data = ~s->enable.reg_u16[index] & old_data & s->isr.reg_u16[index];
- for (i = 0; i < sizeof(data); i++) {
- u8 mask = (data >> (i * 8)) & 0xff;
- eiointc_enable_irq(vcpu, s, index * 2 + i, mask, 0);
- }
- break;
- case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END:
- /* do not emulate hw bounced irq routing */
- index = (offset - EIOINTC_BOUNCE_START) >> 1;
- s->bounce.reg_u16[index] = data;
- break;
- case EIOINTC_COREISR_START ... EIOINTC_COREISR_END:
- index = (offset - EIOINTC_COREISR_START) >> 1;
- /* use attrs to get current cpu index */
- cpu = vcpu->vcpu_id;
- coreisr = data;
- old_coreisr = s->coreisr.reg_u16[cpu][index];
- /* write 1 to clear interrupt */
- s->coreisr.reg_u16[cpu][index] = old_coreisr & ~coreisr;
- coreisr &= old_coreisr;
- bits = sizeof(data) * 8;
- irq = find_first_bit((void *)&coreisr, bits);
- while (irq < bits) {
- eiointc_update_irq(s, irq + index * bits, 0);
- bitmap_clear((void *)&coreisr, irq, 1);
- irq = find_first_bit((void *)&coreisr, bits);
- }
- break;
- case EIOINTC_COREMAP_START ... EIOINTC_COREMAP_END:
- irq = offset - EIOINTC_COREMAP_START;
- index = irq >> 1;
- s->coremap.reg_u16[index] = data;
- eiointc_update_sw_coremap(s, irq, data, sizeof(data), true);
- break;
- default:
- ret = -EINVAL;
+ *(long *)val = (long)data;
break;
}
- return ret;
+ return 0;
}
-static int loongarch_eiointc_writel(struct kvm_vcpu *vcpu,
+static int loongarch_eiointc_write(struct kvm_vcpu *vcpu,
struct loongarch_eiointc *s,
- gpa_t addr, int len, const void *val)
+ gpa_t addr, u64 value, u64 field_mask)
{
- int i, index, irq, bits, ret = 0;
+ int index, irq, ret = 0;
u8 cpu;
- u32 data, old_data;
- u32 coreisr, old_coreisr;
+ u64 data, old, mask;
gpa_t offset;
- data = *(u32 *)val;
- offset = addr - EIOINTC_BASE;
+ offset = addr & 7;
+ mask = field_mask << (offset * 8);
+ data = (value & field_mask) << (offset * 8);
- switch (offset) {
- case EIOINTC_NODETYPE_START ... EIOINTC_NODETYPE_END:
- index = (offset - EIOINTC_NODETYPE_START) >> 2;
- s->nodetype.reg_u32[index] = data;
- break;
- case EIOINTC_IPMAP_START ... EIOINTC_IPMAP_END:
- /*
- * ipmap cannot be set at runtime, can be set only at the beginning
- * of irqchip driver, need not update upper irq level
- */
- index = (offset - EIOINTC_IPMAP_START) >> 2;
- s->ipmap.reg_u32[index] = data;
- break;
- case EIOINTC_ENABLE_START ... EIOINTC_ENABLE_END:
- index = (offset - EIOINTC_ENABLE_START) >> 2;
- old_data = s->enable.reg_u32[index];
- s->enable.reg_u32[index] = data;
- /*
- * 1: enable irq.
- * update irq when isr is set.
- */
- data = s->enable.reg_u32[index] & ~old_data & s->isr.reg_u32[index];
- for (i = 0; i < sizeof(data); i++) {
- u8 mask = (data >> (i * 8)) & 0xff;
- eiointc_enable_irq(vcpu, s, index * 4 + i, mask, 1);
- }
- /*
- * 0: disable irq.
- * update irq when isr is set.
- */
- data = ~s->enable.reg_u32[index] & old_data & s->isr.reg_u32[index];
- for (i = 0; i < sizeof(data); i++) {
- u8 mask = (data >> (i * 8)) & 0xff;
- eiointc_enable_irq(vcpu, s, index * 4 + i, mask, 0);
- }
- break;
- case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END:
- /* do not emulate hw bounced irq routing */
- index = (offset - EIOINTC_BOUNCE_START) >> 2;
- s->bounce.reg_u32[index] = data;
- break;
- case EIOINTC_COREISR_START ... EIOINTC_COREISR_END:
- index = (offset - EIOINTC_COREISR_START) >> 2;
- /* use attrs to get current cpu index */
- cpu = vcpu->vcpu_id;
- coreisr = data;
- old_coreisr = s->coreisr.reg_u32[cpu][index];
- /* write 1 to clear interrupt */
- s->coreisr.reg_u32[cpu][index] = old_coreisr & ~coreisr;
- coreisr &= old_coreisr;
- bits = sizeof(data) * 8;
- irq = find_first_bit((void *)&coreisr, bits);
- while (irq < bits) {
- eiointc_update_irq(s, irq + index * bits, 0);
- bitmap_clear((void *)&coreisr, irq, 1);
- irq = find_first_bit((void *)&coreisr, bits);
- }
- break;
- case EIOINTC_COREMAP_START ... EIOINTC_COREMAP_END:
- irq = offset - EIOINTC_COREMAP_START;
- index = irq >> 2;
- s->coremap.reg_u32[index] = data;
- eiointc_update_sw_coremap(s, irq, data, sizeof(data), true);
- break;
- default:
- ret = -EINVAL;
- break;
- }
-
- return ret;
-}
-
-static int loongarch_eiointc_writeq(struct kvm_vcpu *vcpu,
- struct loongarch_eiointc *s,
- gpa_t addr, int len, const void *val)
-{
- int i, index, irq, bits, ret = 0;
- u8 cpu;
- u64 data, old_data;
- u64 coreisr, old_coreisr;
- gpa_t offset;
-
- data = *(u64 *)val;
+ addr -= offset;
offset = addr - EIOINTC_BASE;
switch (offset) {
case EIOINTC_NODETYPE_START ... EIOINTC_NODETYPE_END:
index = (offset - EIOINTC_NODETYPE_START) >> 3;
- s->nodetype.reg_u64[index] = data;
+ old = s->nodetype.reg_u64[index];
+ s->nodetype.reg_u64[index] = (old & ~mask) | data;
break;
case EIOINTC_IPMAP_START ... EIOINTC_IPMAP_END:
/*
* ipmap cannot be set at runtime, can be set only at the beginning
* of irqchip driver, need not update upper irq level
*/
- index = (offset - EIOINTC_IPMAP_START) >> 3;
- s->ipmap.reg_u64 = data;
+ old = s->ipmap.reg_u64;
+ s->ipmap.reg_u64 = (old & ~mask) | data;
break;
case EIOINTC_ENABLE_START ... EIOINTC_ENABLE_END:
index = (offset - EIOINTC_ENABLE_START) >> 3;
- old_data = s->enable.reg_u64[index];
- s->enable.reg_u64[index] = data;
+ old = s->enable.reg_u64[index];
+ s->enable.reg_u64[index] = (old & ~mask) | data;
/*
* 1: enable irq.
* update irq when isr is set.
*/
- data = s->enable.reg_u64[index] & ~old_data & s->isr.reg_u64[index];
- for (i = 0; i < sizeof(data); i++) {
- u8 mask = (data >> (i * 8)) & 0xff;
- eiointc_enable_irq(vcpu, s, index * 8 + i, mask, 1);
+ data = s->enable.reg_u64[index] & ~old & s->isr.reg_u64[index];
+ while (data) {
+ irq = __ffs(data);
+ eiointc_update_irq(s, irq + index * 64, 1);
+ data &= ~BIT_ULL(irq);
}
/*
* 0: disable irq.
* update irq when isr is set.
*/
- data = ~s->enable.reg_u64[index] & old_data & s->isr.reg_u64[index];
- for (i = 0; i < sizeof(data); i++) {
- u8 mask = (data >> (i * 8)) & 0xff;
- eiointc_enable_irq(vcpu, s, index * 8 + i, mask, 0);
+ data = ~s->enable.reg_u64[index] & old & s->isr.reg_u64[index];
+ while (data) {
+ irq = __ffs(data);
+ eiointc_update_irq(s, irq + index * 64, 0);
+ data &= ~BIT_ULL(irq);
}
break;
case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END:
/* do not emulate hw bounced irq routing */
index = (offset - EIOINTC_BOUNCE_START) >> 3;
- s->bounce.reg_u64[index] = data;
+ old = s->bounce.reg_u64[index];
+ s->bounce.reg_u64[index] = (old & ~mask) | data;
break;
case EIOINTC_COREISR_START ... EIOINTC_COREISR_END:
index = (offset - EIOINTC_COREISR_START) >> 3;
/* use attrs to get current cpu index */
cpu = vcpu->vcpu_id;
- coreisr = data;
- old_coreisr = s->coreisr.reg_u64[cpu][index];
+ old = s->coreisr.reg_u64[cpu][index];
/* write 1 to clear interrupt */
- s->coreisr.reg_u64[cpu][index] = old_coreisr & ~coreisr;
- coreisr &= old_coreisr;
- bits = sizeof(data) * 8;
- irq = find_first_bit((void *)&coreisr, bits);
- while (irq < bits) {
- eiointc_update_irq(s, irq + index * bits, 0);
- bitmap_clear((void *)&coreisr, irq, 1);
- irq = find_first_bit((void *)&coreisr, bits);
+ s->coreisr.reg_u64[cpu][index] = old & ~data;
+ data &= old;
+ while (data) {
+ irq = __ffs(data);
+ eiointc_update_irq(s, irq + index * 64, 0);
+ data &= ~BIT_ULL(irq);
}
break;
case EIOINTC_COREMAP_START ... EIOINTC_COREMAP_END:
- irq = offset - EIOINTC_COREMAP_START;
- index = irq >> 3;
- s->coremap.reg_u64[index] = data;
- eiointc_update_sw_coremap(s, irq, data, sizeof(data), true);
+ index = (offset - EIOINTC_COREMAP_START) >> 3;
+ old = s->coremap.reg_u64[index];
+ s->coremap.reg_u64[index] = (old & ~mask) | data;
+ data = s->coremap.reg_u64[index];
+ eiointc_update_sw_coremap(s, index * 8, data, sizeof(data), true);
break;
default:
ret = -EINVAL;
@@ -684,7 +294,7 @@ static int kvm_eiointc_write(struct kvm_vcpu *vcpu,
gpa_t addr, int len, const void *val)
{
int ret = -EINVAL;
- unsigned long flags;
+ unsigned long flags, value;
struct loongarch_eiointc *eiointc = vcpu->kvm->arch.eiointc;
if (!eiointc) {
@@ -697,24 +307,25 @@ static int kvm_eiointc_write(struct kvm_vcpu *vcpu,
return -EINVAL;
}
- vcpu->kvm->stat.eiointc_write_exits++;
+ vcpu->stat.eiointc_write_exits++;
spin_lock_irqsave(&eiointc->lock, flags);
switch (len) {
case 1:
- ret = loongarch_eiointc_writeb(vcpu, eiointc, addr, len, val);
+ value = *(unsigned char *)val;
+ ret = loongarch_eiointc_write(vcpu, eiointc, addr, value, 0xFF);
break;
case 2:
- ret = loongarch_eiointc_writew(vcpu, eiointc, addr, len, val);
+ value = *(unsigned short *)val;
+ ret = loongarch_eiointc_write(vcpu, eiointc, addr, value, USHRT_MAX);
break;
case 4:
- ret = loongarch_eiointc_writel(vcpu, eiointc, addr, len, val);
- break;
- case 8:
- ret = loongarch_eiointc_writeq(vcpu, eiointc, addr, len, val);
+ value = *(unsigned int *)val;
+ ret = loongarch_eiointc_write(vcpu, eiointc, addr, value, UINT_MAX);
break;
default:
- WARN_ONCE(1, "%s: Abnormal address access: addr 0x%llx, size %d\n",
- __func__, addr, len);
+ value = *(unsigned long *)val;
+ ret = loongarch_eiointc_write(vcpu, eiointc, addr, value, ULONG_MAX);
+ break;
}
spin_unlock_irqrestore(&eiointc->lock, flags);
@@ -989,7 +600,7 @@ static int kvm_eiointc_create(struct kvm_device *dev, u32 type)
{
int ret;
struct loongarch_eiointc *s;
- struct kvm_io_device *device, *device1;
+ struct kvm_io_device *device;
struct kvm *kvm = dev->kvm;
/* eiointc has been created */
@@ -1017,10 +628,10 @@ static int kvm_eiointc_create(struct kvm_device *dev, u32 type)
return ret;
}
- device1 = &s->device_vext;
- kvm_iodevice_init(device1, &kvm_eiointc_virt_ops);
+ device = &s->device_vext;
+ kvm_iodevice_init(device, &kvm_eiointc_virt_ops);
ret = kvm_io_bus_register_dev(kvm, KVM_IOCSR_BUS,
- EIOINTC_VIRT_BASE, EIOINTC_VIRT_SIZE, device1);
+ EIOINTC_VIRT_BASE, EIOINTC_VIRT_SIZE, device);
if (ret < 0) {
kvm_io_bus_unregister_dev(kvm, KVM_IOCSR_BUS, &s->device);
kfree(s);
diff --git a/arch/loongarch/kvm/intc/ipi.c b/arch/loongarch/kvm/intc/ipi.c
index fe734dc062ed..e658d5b37c04 100644
--- a/arch/loongarch/kvm/intc/ipi.c
+++ b/arch/loongarch/kvm/intc/ipi.c
@@ -268,36 +268,16 @@ static int kvm_ipi_read(struct kvm_vcpu *vcpu,
struct kvm_io_device *dev,
gpa_t addr, int len, void *val)
{
- int ret;
- struct loongarch_ipi *ipi;
-
- ipi = vcpu->kvm->arch.ipi;
- if (!ipi) {
- kvm_err("%s: ipi irqchip not valid!\n", __func__);
- return -EINVAL;
- }
- ipi->kvm->stat.ipi_read_exits++;
- ret = loongarch_ipi_readl(vcpu, addr, len, val);
-
- return ret;
+ vcpu->stat.ipi_read_exits++;
+ return loongarch_ipi_readl(vcpu, addr, len, val);
}
static int kvm_ipi_write(struct kvm_vcpu *vcpu,
struct kvm_io_device *dev,
gpa_t addr, int len, const void *val)
{
- int ret;
- struct loongarch_ipi *ipi;
-
- ipi = vcpu->kvm->arch.ipi;
- if (!ipi) {
- kvm_err("%s: ipi irqchip not valid!\n", __func__);
- return -EINVAL;
- }
- ipi->kvm->stat.ipi_write_exits++;
- ret = loongarch_ipi_writel(vcpu, addr, len, val);
-
- return ret;
+ vcpu->stat.ipi_write_exits++;
+ return loongarch_ipi_writel(vcpu, addr, len, val);
}
static const struct kvm_io_device_ops kvm_ipi_ops = {
diff --git a/arch/loongarch/kvm/intc/pch_pic.c b/arch/loongarch/kvm/intc/pch_pic.c
index 08fce845f668..6f00ffe05c54 100644
--- a/arch/loongarch/kvm/intc/pch_pic.c
+++ b/arch/loongarch/kvm/intc/pch_pic.c
@@ -196,7 +196,7 @@ static int kvm_pch_pic_read(struct kvm_vcpu *vcpu,
}
/* statistics of pch pic reading */
- vcpu->kvm->stat.pch_pic_read_exits++;
+ vcpu->stat.pch_pic_read_exits++;
ret = loongarch_pch_pic_read(s, addr, len, val);
return ret;
@@ -303,7 +303,7 @@ static int kvm_pch_pic_write(struct kvm_vcpu *vcpu,
}
/* statistics of pch pic writing */
- vcpu->kvm->stat.pch_pic_write_exits++;
+ vcpu->stat.pch_pic_write_exits++;
ret = loongarch_pch_pic_write(s, addr, len, val);
return ret;
diff --git a/arch/loongarch/kvm/interrupt.c b/arch/loongarch/kvm/interrupt.c
index 4c3f22de4b40..8462083f0301 100644
--- a/arch/loongarch/kvm/interrupt.c
+++ b/arch/loongarch/kvm/interrupt.c
@@ -83,28 +83,11 @@ void kvm_deliver_intr(struct kvm_vcpu *vcpu)
unsigned long *pending = &vcpu->arch.irq_pending;
unsigned long *pending_clr = &vcpu->arch.irq_clear;
- if (!(*pending) && !(*pending_clr))
- return;
-
- if (*pending_clr) {
- priority = __ffs(*pending_clr);
- while (priority <= INT_IPI) {
- kvm_irq_clear(vcpu, priority);
- priority = find_next_bit(pending_clr,
- BITS_PER_BYTE * sizeof(*pending_clr),
- priority + 1);
- }
- }
+ for_each_set_bit(priority, pending_clr, INT_IPI + 1)
+ kvm_irq_clear(vcpu, priority);
- if (*pending) {
- priority = __ffs(*pending);
- while (priority <= INT_IPI) {
- kvm_irq_deliver(vcpu, priority);
- priority = find_next_bit(pending,
- BITS_PER_BYTE * sizeof(*pending),
- priority + 1);
- }
- }
+ for_each_set_bit(priority, pending, INT_IPI + 1)
+ kvm_irq_deliver(vcpu, priority);
}
int kvm_pending_timer(struct kvm_vcpu *vcpu)
diff --git a/arch/loongarch/kvm/trace.h b/arch/loongarch/kvm/trace.h
index 1783397b1bc8..145514dab6d5 100644
--- a/arch/loongarch/kvm/trace.h
+++ b/arch/loongarch/kvm/trace.h
@@ -46,11 +46,15 @@ DEFINE_EVENT(kvm_transition, kvm_out,
/* Further exit reasons */
#define KVM_TRACE_EXIT_IDLE 64
#define KVM_TRACE_EXIT_CACHE 65
+#define KVM_TRACE_EXIT_CPUCFG 66
+#define KVM_TRACE_EXIT_CSR 67
/* Tracepoints for VM exits */
#define kvm_trace_symbol_exit_types \
{ KVM_TRACE_EXIT_IDLE, "IDLE" }, \
- { KVM_TRACE_EXIT_CACHE, "CACHE" }
+ { KVM_TRACE_EXIT_CACHE, "CACHE" }, \
+ { KVM_TRACE_EXIT_CPUCFG, "CPUCFG" }, \
+ { KVM_TRACE_EXIT_CSR, "CSR" }
DECLARE_EVENT_CLASS(kvm_exit,
TP_PROTO(struct kvm_vcpu *vcpu, unsigned int reason),
@@ -82,6 +86,14 @@ DEFINE_EVENT(kvm_exit, kvm_exit_cache,
TP_PROTO(struct kvm_vcpu *vcpu, unsigned int reason),
TP_ARGS(vcpu, reason));
+DEFINE_EVENT(kvm_exit, kvm_exit_cpucfg,
+ TP_PROTO(struct kvm_vcpu *vcpu, unsigned int reason),
+ TP_ARGS(vcpu, reason));
+
+DEFINE_EVENT(kvm_exit, kvm_exit_csr,
+ TP_PROTO(struct kvm_vcpu *vcpu, unsigned int reason),
+ TP_ARGS(vcpu, reason));
+
DEFINE_EVENT(kvm_exit, kvm_exit,
TP_PROTO(struct kvm_vcpu *vcpu, unsigned int reason),
TP_ARGS(vcpu, reason));
diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c
index 5af32ec62cb1..d1b8c50941ca 100644
--- a/arch/loongarch/kvm/vcpu.c
+++ b/arch/loongarch/kvm/vcpu.c
@@ -20,7 +20,13 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
STATS_DESC_COUNTER(VCPU, idle_exits),
STATS_DESC_COUNTER(VCPU, cpucfg_exits),
STATS_DESC_COUNTER(VCPU, signal_exits),
- STATS_DESC_COUNTER(VCPU, hypercall_exits)
+ STATS_DESC_COUNTER(VCPU, hypercall_exits),
+ STATS_DESC_COUNTER(VCPU, ipi_read_exits),
+ STATS_DESC_COUNTER(VCPU, ipi_write_exits),
+ STATS_DESC_COUNTER(VCPU, eiointc_read_exits),
+ STATS_DESC_COUNTER(VCPU, eiointc_write_exits),
+ STATS_DESC_COUNTER(VCPU, pch_pic_read_exits),
+ STATS_DESC_COUNTER(VCPU, pch_pic_write_exits)
};
const struct kvm_stats_header kvm_vcpu_stats_header = {
diff --git a/arch/loongarch/mm/pageattr.c b/arch/loongarch/mm/pageattr.c
index 99165903908a..f5e910b68229 100644
--- a/arch/loongarch/mm/pageattr.c
+++ b/arch/loongarch/mm/pageattr.c
@@ -118,7 +118,7 @@ static int __set_memory(unsigned long addr, int numpages, pgprot_t set_mask, pgp
return 0;
mmap_write_lock(&init_mm);
- ret = walk_page_range_novma(&init_mm, start, end, &pageattr_ops, NULL, &masks);
+ ret = walk_kernel_page_table_range(start, end, &pageattr_ops, NULL, &masks);
mmap_write_unlock(&init_mm);
flush_tlb_kernel_range(start, end);
diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig
index fa20abd06992..5171bb183967 100644
--- a/arch/m68k/configs/amiga_defconfig
+++ b/arch/m68k/configs/amiga_defconfig
@@ -85,7 +85,6 @@ CONFIG_NETFILTER=y
CONFIG_NETFILTER_NETLINK_HOOK=m
CONFIG_NF_CONNTRACK=m
CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
CONFIG_NF_CONNTRACK_AMANDA=m
CONFIG_NF_CONNTRACK_FTP=m
CONFIG_NF_CONNTRACK_H323=m
diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig
index 3151ce39661c..16f343ae48c6 100644
--- a/arch/m68k/configs/apollo_defconfig
+++ b/arch/m68k/configs/apollo_defconfig
@@ -81,7 +81,6 @@ CONFIG_NETFILTER=y
CONFIG_NETFILTER_NETLINK_HOOK=m
CONFIG_NF_CONNTRACK=m
CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
CONFIG_NF_CONNTRACK_AMANDA=m
CONFIG_NF_CONNTRACK_FTP=m
CONFIG_NF_CONNTRACK_H323=m
diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig
index 6a52463d778a..c08788728ea9 100644
--- a/arch/m68k/configs/atari_defconfig
+++ b/arch/m68k/configs/atari_defconfig
@@ -88,7 +88,6 @@ CONFIG_NETFILTER=y
CONFIG_NETFILTER_NETLINK_HOOK=m
CONFIG_NF_CONNTRACK=m
CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
CONFIG_NF_CONNTRACK_AMANDA=m
CONFIG_NF_CONNTRACK_FTP=m
CONFIG_NF_CONNTRACK_H323=m
diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig
index cdaeb4f64824..962497e7c53f 100644
--- a/arch/m68k/configs/bvme6000_defconfig
+++ b/arch/m68k/configs/bvme6000_defconfig
@@ -78,7 +78,6 @@ CONFIG_NETFILTER=y
CONFIG_NETFILTER_NETLINK_HOOK=m
CONFIG_NF_CONNTRACK=m
CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
CONFIG_NF_CONNTRACK_AMANDA=m
CONFIG_NF_CONNTRACK_FTP=m
CONFIG_NF_CONNTRACK_H323=m
diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig
index cb966cc46774..ec28650189e4 100644
--- a/arch/m68k/configs/hp300_defconfig
+++ b/arch/m68k/configs/hp300_defconfig
@@ -80,7 +80,6 @@ CONFIG_NETFILTER=y
CONFIG_NETFILTER_NETLINK_HOOK=m
CONFIG_NF_CONNTRACK=m
CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
CONFIG_NF_CONNTRACK_AMANDA=m
CONFIG_NF_CONNTRACK_FTP=m
CONFIG_NF_CONNTRACK_H323=m
diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig
index 97b6b07b003a..0afb3ad180de 100644
--- a/arch/m68k/configs/mac_defconfig
+++ b/arch/m68k/configs/mac_defconfig
@@ -79,7 +79,6 @@ CONFIG_NETFILTER=y
CONFIG_NETFILTER_NETLINK_HOOK=m
CONFIG_NF_CONNTRACK=m
CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
CONFIG_NF_CONNTRACK_AMANDA=m
CONFIG_NF_CONNTRACK_FTP=m
CONFIG_NF_CONNTRACK_H323=m
diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig
index ab76d6d0e6e6..b311e953995d 100644
--- a/arch/m68k/configs/multi_defconfig
+++ b/arch/m68k/configs/multi_defconfig
@@ -99,7 +99,6 @@ CONFIG_NETFILTER=y
CONFIG_NETFILTER_NETLINK_HOOK=m
CONFIG_NF_CONNTRACK=m
CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
CONFIG_NF_CONNTRACK_AMANDA=m
CONFIG_NF_CONNTRACK_FTP=m
CONFIG_NF_CONNTRACK_H323=m
diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig
index 09c1c30a2ee8..f4e6224f137f 100644
--- a/arch/m68k/configs/mvme147_defconfig
+++ b/arch/m68k/configs/mvme147_defconfig
@@ -77,7 +77,6 @@ CONFIG_NETFILTER=y
CONFIG_NETFILTER_NETLINK_HOOK=m
CONFIG_NF_CONNTRACK=m
CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
CONFIG_NF_CONNTRACK_AMANDA=m
CONFIG_NF_CONNTRACK_FTP=m
CONFIG_NF_CONNTRACK_H323=m
diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig
index 983a512f99c0..498e167222f1 100644
--- a/arch/m68k/configs/mvme16x_defconfig
+++ b/arch/m68k/configs/mvme16x_defconfig
@@ -78,7 +78,6 @@ CONFIG_NETFILTER=y
CONFIG_NETFILTER_NETLINK_HOOK=m
CONFIG_NF_CONNTRACK=m
CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
CONFIG_NF_CONNTRACK_AMANDA=m
CONFIG_NF_CONNTRACK_FTP=m
CONFIG_NF_CONNTRACK_H323=m
diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig
index edaf7f7b355e..8c6b1eef8534 100644
--- a/arch/m68k/configs/q40_defconfig
+++ b/arch/m68k/configs/q40_defconfig
@@ -79,7 +79,6 @@ CONFIG_NETFILTER=y
CONFIG_NETFILTER_NETLINK_HOOK=m
CONFIG_NF_CONNTRACK=m
CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
CONFIG_NF_CONNTRACK_AMANDA=m
CONFIG_NF_CONNTRACK_FTP=m
CONFIG_NF_CONNTRACK_H323=m
diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig
index 512686fb75d1..c34648f299ef 100644
--- a/arch/m68k/configs/sun3_defconfig
+++ b/arch/m68k/configs/sun3_defconfig
@@ -74,7 +74,6 @@ CONFIG_NETFILTER=y
CONFIG_NETFILTER_NETLINK_HOOK=m
CONFIG_NF_CONNTRACK=m
CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
CONFIG_NF_CONNTRACK_AMANDA=m
CONFIG_NF_CONNTRACK_FTP=m
CONFIG_NF_CONNTRACK_H323=m
diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig
index 28a8c35ece7e..73810d14660f 100644
--- a/arch/m68k/configs/sun3x_defconfig
+++ b/arch/m68k/configs/sun3x_defconfig
@@ -75,7 +75,6 @@ CONFIG_NETFILTER=y
CONFIG_NETFILTER_NETLINK_HOOK=m
CONFIG_NF_CONNTRACK=m
CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
CONFIG_NF_CONNTRACK_AMANDA=m
CONFIG_NF_CONNTRACK_FTP=m
CONFIG_NF_CONNTRACK_H323=m
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index f18ec02ddeb2..484ebb3baedf 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -28,7 +28,6 @@ config MICROBLAZE
select HAVE_DEBUG_KMEMLEAK
select HAVE_DMA_CONTIGUOUS
select HAVE_DYNAMIC_FTRACE
- select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER
select HAVE_PAGE_SIZE_4KB
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 934eb961bd0d..caf508f6e9ec 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -73,7 +73,6 @@ config MIPS
select HAVE_EBPF_JIT if !CPU_MICROMIPS
select HAVE_EXIT_THREAD
select HAVE_GUP_FAST
- select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER
select HAVE_GCC_PLUGINS
@@ -563,6 +562,7 @@ config MIPS_MALTA
select MIPS_L1_CACHE_SHIFT_6
select MIPS_MSC
select PCI_GT64XXX_PCI0
+ select RTC_MC146818_LIB
select SMP_UP if SMP
select SWAP_IO_SPACE
select SYS_HAS_CPU_MIPS32_R1
@@ -1837,6 +1837,7 @@ config CPU_LOONGSON2EF
select CPU_SUPPORTS_64BIT_KERNEL
select CPU_SUPPORTS_HIGHMEM
select CPU_SUPPORTS_HUGEPAGES
+ select RTC_MC146818_LIB
config CPU_LOONGSON32
bool
diff --git a/arch/mips/boot/Makefile b/arch/mips/boot/Makefile
index 196c44fa72d9..8473c4671702 100644
--- a/arch/mips/boot/Makefile
+++ b/arch/mips/boot/Makefile
@@ -54,10 +54,10 @@ UIMAGE_ENTRYADDR = $(VMLINUX_ENTRY_ADDRESS)
# Compressed vmlinux images
#
-extra-y += vmlinux.bin.bz2
-extra-y += vmlinux.bin.gz
-extra-y += vmlinux.bin.lzma
-extra-y += vmlinux.bin.lzo
+targets += vmlinux.bin.bz2
+targets += vmlinux.bin.gz
+targets += vmlinux.bin.lzma
+targets += vmlinux.bin.lzo
$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE
$(call if_changed,bzip2)
diff --git a/arch/mips/boot/dts/mobileye/eyeq5-epm5.dts b/arch/mips/boot/dts/mobileye/eyeq5-epm5.dts
index 6898b2d8267d..9fc1a1b0a81b 100644
--- a/arch/mips/boot/dts/mobileye/eyeq5-epm5.dts
+++ b/arch/mips/boot/dts/mobileye/eyeq5-epm5.dts
@@ -21,3 +21,11 @@
<0x8 0x02000000 0x0 0x7E000000>;
};
};
+
+&i2c2 {
+ temperature-sensor@48 {
+ compatible = "ti,tmp112";
+ reg = <0x48>;
+ label = "U60";
+ };
+};
diff --git a/arch/mips/boot/dts/mobileye/eyeq5.dtsi b/arch/mips/boot/dts/mobileye/eyeq5.dtsi
index a84e6e720619..36a73e8a63a1 100644
--- a/arch/mips/boot/dts/mobileye/eyeq5.dtsi
+++ b/arch/mips/boot/dts/mobileye/eyeq5.dtsi
@@ -110,6 +110,81 @@
ranges;
compatible = "simple-bus";
+ i2c0: i2c@300000 {
+ compatible = "mobileye,eyeq5-i2c", "arm,primecell";
+ reg = <0 0x300000 0x0 0x1000>;
+ interrupt-parent = <&gic>;
+ interrupts = <GIC_SHARED 1 IRQ_TYPE_LEVEL_HIGH>;
+ clock-frequency = <400000>; /* Fast mode */
+ #address-cells = <1>;
+ #size-cells = <0>;
+ clocks = <&olb 35>, <&olb EQ5C_PER_I2C>;
+ clock-names = "i2cclk", "apb_pclk";
+ resets = <&olb 0 13>;
+ i2c-transfer-timeout-us = <10000>;
+ mobileye,olb = <&olb 0>;
+ };
+
+ i2c1: i2c@400000 {
+ compatible = "mobileye,eyeq5-i2c", "arm,primecell";
+ reg = <0 0x400000 0x0 0x1000>;
+ interrupt-parent = <&gic>;
+ interrupts = <GIC_SHARED 2 IRQ_TYPE_LEVEL_HIGH>;
+ clock-frequency = <400000>; /* Fast mode */
+ #address-cells = <1>;
+ #size-cells = <0>;
+ clocks = <&olb 35>, <&olb EQ5C_PER_I2C>;
+ clock-names = "i2cclk", "apb_pclk";
+ resets = <&olb 0 14>;
+ i2c-transfer-timeout-us = <10000>;
+ mobileye,olb = <&olb 1>;
+ };
+
+ i2c2: i2c@500000 {
+ compatible = "mobileye,eyeq5-i2c", "arm,primecell";
+ reg = <0 0x500000 0x0 0x1000>;
+ interrupt-parent = <&gic>;
+ interrupts = <GIC_SHARED 3 IRQ_TYPE_LEVEL_HIGH>;
+ clock-frequency = <400000>; /* Fast mode */
+ #address-cells = <1>;
+ #size-cells = <0>;
+ clocks = <&olb 35>, <&olb EQ5C_PER_I2C>;
+ clock-names = "i2cclk", "apb_pclk";
+ resets = <&olb 0 15>;
+ i2c-transfer-timeout-us = <10000>;
+ mobileye,olb = <&olb 2>;
+ };
+
+ i2c3: i2c@600000 {
+ compatible = "mobileye,eyeq5-i2c", "arm,primecell";
+ reg = <0 0x600000 0x0 0x1000>;
+ interrupt-parent = <&gic>;
+ interrupts = <GIC_SHARED 4 IRQ_TYPE_LEVEL_HIGH>;
+ clock-frequency = <400000>; /* Fast mode */
+ #address-cells = <1>;
+ #size-cells = <0>;
+ clocks = <&olb 35>, <&olb EQ5C_PER_I2C>;
+ clock-names = "i2cclk", "apb_pclk";
+ resets = <&olb 0 16>;
+ i2c-transfer-timeout-us = <10000>;
+ mobileye,olb = <&olb 3>;
+ };
+
+ i2c4: i2c@700000 {
+ compatible = "mobileye,eyeq5-i2c", "arm,primecell";
+ reg = <0 0x700000 0x0 0x1000>;
+ interrupt-parent = <&gic>;
+ interrupts = <GIC_SHARED 5 IRQ_TYPE_LEVEL_HIGH>;
+ clock-frequency = <400000>; /* Fast mode */
+ #address-cells = <1>;
+ #size-cells = <0>;
+ clocks = <&olb 35>, <&olb EQ5C_PER_I2C>;
+ clock-names = "i2cclk", "apb_pclk";
+ resets = <&olb 0 17>;
+ i2c-transfer-timeout-us = <10000>;
+ mobileye,olb = <&olb 4>;
+ };
+
uart0: serial@800000 {
compatible = "arm,pl011", "arm,primecell";
reg = <0 0x800000 0x0 0x1000>;
@@ -178,6 +253,58 @@
clocks = <&olb EQ5C_CPU_CORE0>;
};
};
+
+ emmc: mmc@2200000 {
+ compatible = "mobileye,eyeq-sd4hc", "cdns,sd4hc";
+ reg = <0 0x2200000 0x0 0x1000>;
+ interrupt-parent = <&gic>;
+ interrupts = <GIC_SHARED 10 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&olb EQ5C_PER_EMMC>;
+ bus-width = <8>;
+ max-frequency = <200000000>;
+ mmc-ddr-1_8v;
+ sd-uhs-ddr50;
+ mmc-hs200-1_8v;
+ mmc-hs400-1_8v;
+ mmc-hs400-enhanced-strobe;
+
+ cdns,phy-input-delay-legacy = <4>;
+ cdns,phy-input-delay-mmc-highspeed = <2>;
+ cdns,phy-input-delay-mmc-ddr = <3>;
+ cdns,phy-dll-delay-sdclk = <32>;
+ cdns,phy-dll-delay-sdclk-hsmmc = <32>;
+ cdns,phy-dll-delay-strobe = <32>;
+ };
+
+ gpio0: gpio@1400000 {
+ compatible = "mobileye,eyeq5-gpio";
+ reg = <0x0 0x1400000 0x0 0x1000>;
+ gpio-bank = <0>;
+ ngpios = <29>;
+ interrupt-parent = <&gic>;
+ interrupts = <GIC_SHARED 14 IRQ_TYPE_LEVEL_HIGH>;
+ gpio-controller;
+ #gpio-cells = <2>;
+ gpio-ranges = <&olb 0 0 29>;
+ interrupt-controller;
+ #interrupt-cells = <2>;
+ resets = <&olb 0 26>;
+ };
+
+ gpio1: gpio@1500000 {
+ compatible = "mobileye,eyeq5-gpio";
+ reg = <0x0 0x1500000 0x0 0x1000>;
+ gpio-bank = <1>;
+ ngpios = <23>;
+ interrupt-parent = <&gic>;
+ interrupts = <GIC_SHARED 14 IRQ_TYPE_LEVEL_HIGH>;
+ gpio-controller;
+ #gpio-cells = <2>;
+ gpio-ranges = <&olb 0 29 23>;
+ interrupt-controller;
+ #interrupt-cells = <2>;
+ resets = <&olb 0 26>;
+ };
};
};
diff --git a/arch/mips/boot/dts/mobileye/eyeq6h.dtsi b/arch/mips/boot/dts/mobileye/eyeq6h.dtsi
index dabd5ed778b7..5ae939d25ea8 100644
--- a/arch/mips/boot/dts/mobileye/eyeq6h.dtsi
+++ b/arch/mips/boot/dts/mobileye/eyeq6h.dtsi
@@ -109,6 +109,28 @@
clock-names = "ref";
};
+ emmc: mmc@d8010000 {
+ compatible = "mobileye,eyeq-sd4hc", "cdns,sd4hc";
+ reg = <0 0xd8010000 0x0 0x1000>;
+ interrupt-parent = <&gic>;
+ interrupts = <GIC_SHARED 91 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&olb_south EQ6HC_SOUTH_DIV_EMMC>;
+ bus-width = <8>;
+ max-frequency = <200000000>;
+ mmc-ddr-1_8v;
+ sd-uhs-ddr50;
+ mmc-hs200-1_8v;
+ mmc-hs400-1_8v;
+ mmc-hs400-enhanced-strobe;
+
+ cdns,phy-input-delay-legacy = <4>;
+ cdns,phy-input-delay-mmc-highspeed = <2>;
+ cdns,phy-input-delay-mmc-ddr = <3>;
+ cdns,phy-dll-delay-sdclk = <32>;
+ cdns,phy-dll-delay-sdclk-hsmmc = <32>;
+ cdns,phy-dll-delay-strobe = <32>;
+ };
+
olb_south: system-controller@d8013000 {
compatible = "mobileye,eyeq6h-south-olb", "syscon";
reg = <0x0 0xd8013000 0x0 0x1000>;
diff --git a/arch/mips/boot/dts/qca/ar9132.dtsi b/arch/mips/boot/dts/qca/ar9132.dtsi
index 61dcfa5b6ca7..c1ca03a27b6c 100644
--- a/arch/mips/boot/dts/qca/ar9132.dtsi
+++ b/arch/mips/boot/dts/qca/ar9132.dtsi
@@ -156,6 +156,15 @@
#address-cells = <1>;
#size-cells = <0>;
};
+
+ wifi: wifi@180c0000 {
+ compatible = "qca,ar9130-wifi";
+ reg = <0x180c0000 0x230000>;
+
+ interrupts = <2>;
+
+ status = "disabled";
+ };
};
usb_phy: usb-phy {
diff --git a/arch/mips/boot/dts/qca/ar9132_tl_wr1043nd_v1.dts b/arch/mips/boot/dts/qca/ar9132_tl_wr1043nd_v1.dts
index f894fe17816b..a7901bb040ce 100644
--- a/arch/mips/boot/dts/qca/ar9132_tl_wr1043nd_v1.dts
+++ b/arch/mips/boot/dts/qca/ar9132_tl_wr1043nd_v1.dts
@@ -108,3 +108,7 @@
};
};
};
+
+&wifi {
+ status = "okay";
+};
diff --git a/arch/mips/boot/dts/qca/ar9331.dtsi b/arch/mips/boot/dts/qca/ar9331.dtsi
index 768ac0f869b1..6eb84a26a20f 100644
--- a/arch/mips/boot/dts/qca/ar9331.dtsi
+++ b/arch/mips/boot/dts/qca/ar9331.dtsi
@@ -285,6 +285,15 @@
status = "disabled";
};
+
+ wifi: wifi@18100000 {
+ compatible = "qca,ar9330-wifi";
+ reg = <0x18100000 0x20000>;
+
+ interrupts = <2>;
+
+ status = "disabled";
+ };
};
usb_phy: usb-phy {
diff --git a/arch/mips/boot/dts/qca/ar9331_dpt_module.dts b/arch/mips/boot/dts/qca/ar9331_dpt_module.dts
index c857cd22f7db..08e728b8ced8 100644
--- a/arch/mips/boot/dts/qca/ar9331_dpt_module.dts
+++ b/arch/mips/boot/dts/qca/ar9331_dpt_module.dts
@@ -97,3 +97,7 @@
&phy_port4 {
status = "okay";
};
+
+&wifi {
+ status = "okay";
+};
diff --git a/arch/mips/boot/dts/qca/ar9331_dragino_ms14.dts b/arch/mips/boot/dts/qca/ar9331_dragino_ms14.dts
index 7affa58d4fa6..37a74aabe4b4 100644
--- a/arch/mips/boot/dts/qca/ar9331_dragino_ms14.dts
+++ b/arch/mips/boot/dts/qca/ar9331_dragino_ms14.dts
@@ -98,3 +98,7 @@
reg = <0>;
};
};
+
+&wifi {
+ status = "okay";
+};
diff --git a/arch/mips/boot/dts/qca/ar9331_omega.dts b/arch/mips/boot/dts/qca/ar9331_omega.dts
index 8904aa917a6e..1450419024cb 100644
--- a/arch/mips/boot/dts/qca/ar9331_omega.dts
+++ b/arch/mips/boot/dts/qca/ar9331_omega.dts
@@ -74,3 +74,7 @@
reg = <0>;
};
};
+
+&wifi {
+ status = "okay";
+};
diff --git a/arch/mips/boot/dts/qca/ar9331_openembed_som9331_board.dts b/arch/mips/boot/dts/qca/ar9331_openembed_som9331_board.dts
index dc65ebd60bbc..5786a827c000 100644
--- a/arch/mips/boot/dts/qca/ar9331_openembed_som9331_board.dts
+++ b/arch/mips/boot/dts/qca/ar9331_openembed_som9331_board.dts
@@ -106,3 +106,7 @@
&phy_port4 {
status = "okay";
};
+
+&wifi {
+ status = "okay";
+};
diff --git a/arch/mips/boot/dts/qca/ar9331_tl_mr3020.dts b/arch/mips/boot/dts/qca/ar9331_tl_mr3020.dts
index 10b9759228b7..a7108c803eb3 100644
--- a/arch/mips/boot/dts/qca/ar9331_tl_mr3020.dts
+++ b/arch/mips/boot/dts/qca/ar9331_tl_mr3020.dts
@@ -114,3 +114,7 @@
reg = <0>;
};
};
+
+&wifi {
+ status = "okay";
+};
diff --git a/arch/mips/boot/dts/ralink/gardena_smart_gateway_mt7688.dts b/arch/mips/boot/dts/ralink/gardena_smart_gateway_mt7688.dts
index 7743d014631a..0bfb1dde9764 100644
--- a/arch/mips/boot/dts/ralink/gardena_smart_gateway_mt7688.dts
+++ b/arch/mips/boot/dts/ralink/gardena_smart_gateway_mt7688.dts
@@ -56,7 +56,7 @@
led-power-green {
label = "smartgw:power:green";
gpios = <&gpio 19 GPIO_ACTIVE_HIGH>;
- default-state = "off";
+ linux,default-trigger = "timer";
};
led-power-red {
diff --git a/arch/mips/boot/dts/ralink/mt7620a.dtsi b/arch/mips/boot/dts/ralink/mt7620a.dtsi
index d66045948a83..460164bdd430 100644
--- a/arch/mips/boot/dts/ralink/mt7620a.dtsi
+++ b/arch/mips/boot/dts/ralink/mt7620a.dtsi
@@ -62,4 +62,14 @@
reg-shift = <2>;
};
};
+
+ wmac: wifi@10180000 {
+ compatible = "ralink,rt2880-wifi";
+ reg = <0x10180000 0x40000>;
+
+ clocks = <&sysc 16>;
+
+ interrupt-parent = <&cpuintc>;
+ interrupts = <6>;
+ };
};
diff --git a/arch/mips/boot/dts/ralink/mt7628a.dtsi b/arch/mips/boot/dts/ralink/mt7628a.dtsi
index 0212700c4fb4..5d7a6cfa9e2b 100644
--- a/arch/mips/boot/dts/ralink/mt7628a.dtsi
+++ b/arch/mips/boot/dts/ralink/mt7628a.dtsi
@@ -33,7 +33,7 @@
#size-cells = <1>;
sysc: syscon@0 {
- compatible = "ralink,mt7628-sysc", "syscon";
+ compatible = "ralink,mt7628-sysc", "ralink,mt7688-sysc", "syscon";
reg = <0x0 0x60>;
#clock-cells = <1>;
#reset-cells = <1>;
@@ -134,13 +134,8 @@
watchdog: watchdog@100 {
compatible = "mediatek,mt7621-wdt";
- reg = <0x100 0x30>;
-
- resets = <&sysc 8>;
- reset-names = "wdt";
-
- interrupt-parent = <&intc>;
- interrupts = <24>;
+ reg = <0x100 0x100>;
+ mediatek,sysctl = <&sysc>;
status = "disabled";
};
diff --git a/arch/mips/boot/dts/realtek/cameo-rtl9302c-2x-rtl8224-2xge.dts b/arch/mips/boot/dts/realtek/cameo-rtl9302c-2x-rtl8224-2xge.dts
index 6789bf374044..6f6a05d4088e 100644
--- a/arch/mips/boot/dts/realtek/cameo-rtl9302c-2x-rtl8224-2xge.dts
+++ b/arch/mips/boot/dts/realtek/cameo-rtl9302c-2x-rtl8224-2xge.dts
@@ -71,3 +71,99 @@
};
};
};
+
+&mdio0 {
+ /* External RTL8224 */
+ phy0: ethernet-phy@0 {
+ reg = <0>;
+ compatible = "ethernet-phy-ieee802.3-c45";
+ };
+ phy1: ethernet-phy@1 {
+ reg = <1>;
+ compatible = "ethernet-phy-ieee802.3-c45";
+ };
+ phy2: ethernet-phy@2 {
+ reg = <2>;
+ compatible = "ethernet-phy-ieee802.3-c45";
+ };
+ phy3: ethernet-phy@3 {
+ reg = <3>;
+ compatible = "ethernet-phy-ieee802.3-c45";
+ };
+};
+
+&mdio1 {
+ /* External RTL8224 */
+ phy4: ethernet-phy@0 {
+ reg = <0>;
+ compatible = "ethernet-phy-ieee802.3-c45";
+ };
+ phy5: ethernet-phy@1 {
+ reg = <1>;
+ compatible = "ethernet-phy-ieee802.3-c45";
+ };
+ phy6: ethernet-phy@2 {
+ reg = <2>;
+ compatible = "ethernet-phy-ieee802.3-c45";
+ };
+ phy7: ethernet-phy@3 {
+ reg = <3>;
+ compatible = "ethernet-phy-ieee802.3-c45";
+ };
+};
+
+&switch0 {
+ ethernet-ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ phy-handle = <&phy0>;
+ phy-mode = "usxgmii";
+ };
+ port@1 {
+ reg = <1>;
+ phy-handle = <&phy1>;
+ phy-mode = "usxgmii";
+ };
+ port@2 {
+ reg = <2>;
+ phy-handle = <&phy2>;
+ phy-mode = "usxgmii";
+ };
+ port@3 {
+ reg = <3>;
+ phy-handle = <&phy3>;
+ phy-mode = "usxgmii";
+ };
+ port@16 {
+ reg = <16>;
+ phy-handle = <&phy4>;
+ phy-mode = "usxgmii";
+ };
+ port@17 {
+ reg = <17>;
+ phy-handle = <&phy5>;
+ phy-mode = "usxgmii";
+ };
+ port@18 {
+ reg = <18>;
+ phy-handle = <&phy6>;
+ phy-mode = "usxgmii";
+ };
+ port@19 {
+ reg = <19>;
+ phy-handle = <&phy7>;
+ phy-mode = "usxgmii";
+ };
+ port@24{
+ reg = <24>;
+ phy-mode = "10gbase-r";
+ };
+ port@25{
+ reg = <25>;
+ phy-mode = "10gbase-r";
+ };
+ };
+};
diff --git a/arch/mips/boot/dts/realtek/rtl930x.dtsi b/arch/mips/boot/dts/realtek/rtl930x.dtsi
index 101bab72a95f..24e262e2dc2a 100644
--- a/arch/mips/boot/dts/realtek/rtl930x.dtsi
+++ b/arch/mips/boot/dts/realtek/rtl930x.dtsi
@@ -48,6 +48,10 @@
#address-cells = <1>;
#size-cells = <1>;
+ interrupt-parent = <&intc>;
+ interrupts = <23>, <24>;
+ interrupt-names = "switch", "nic";
+
reboot@c {
compatible = "syscon-reboot";
reg = <0x0c 0x4>;
@@ -138,6 +142,33 @@
clocks = <&lx_clk>;
};
+ watchdog0: watchdog@3260 {
+ compatible = "realtek,rtl9300-wdt";
+ reg = <0x3260 0xc>;
+
+ realtek,reset-mode = "soc";
+
+ clocks = <&lx_clk>;
+ timeout-sec = <30>;
+
+ interrupt-parent = <&intc>;
+ interrupt-names = "phase1", "phase2";
+ interrupts = <5>, <6>;
+ };
+
+ gpio0: gpio@3300 {
+ compatible = "realtek,rtl9300-gpio", "realtek,otto-gpio";
+ reg = <0x3300 0x1c>, <0x3338 0x8>;
+ gpio-controller;
+ #gpio-cells = <2>;
+ ngpios = <24>;
+
+ interrupt-controller;
+ #interrupt-cells = <2>;
+ interrupt-parent = <&intc>;
+ interrupts = <13>;
+ };
+
snand: spi@1a400 {
compatible = "realtek,rtl9301-snand";
reg = <0x1a400 0x44>;
diff --git a/arch/mips/configs/eyeq5_defconfig b/arch/mips/configs/eyeq5_defconfig
index ff7af5dc6d9d..6688f56aba1c 100644
--- a/arch/mips/configs/eyeq5_defconfig
+++ b/arch/mips/configs/eyeq5_defconfig
@@ -19,20 +19,18 @@ CONFIG_SCHED_AUTOGROUP=y
CONFIG_BLK_DEV_INITRD=y
CONFIG_EXPERT=y
CONFIG_EYEQ=y
-CONFIG_MACH_EYEQ5=y
CONFIG_FIT_IMAGE_FDT_EPM5=y
-CONFIG_PAGE_SIZE_16KB=y
CONFIG_MIPS_CPS=y
CONFIG_CPU_HAS_MSA=y
CONFIG_NR_CPUS=16
CONFIG_MIPS_RAW_APPENDED_DTB=y
CONFIG_JUMP_LABEL=y
+CONFIG_PAGE_SIZE_16KB=y
CONFIG_COMPAT_32BIT_TIME=y
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
CONFIG_TRIM_UNUSED_KSYMS=y
# CONFIG_COMPAT_BRK is not set
-CONFIG_SPARSEMEM_MANUAL=y
CONFIG_USERFAULTFD=y
CONFIG_NET=y
CONFIG_PACKET=y
@@ -64,8 +62,14 @@ CONFIG_CAN_M_CAN=y
CONFIG_SERIAL_AMBA_PL011=y
CONFIG_SERIAL_AMBA_PL011_CONSOLE=y
CONFIG_HW_RANDOM=y
+CONFIG_I2C=y
+CONFIG_I2C_CHARDEV=y
+CONFIG_I2C_NOMADIK=y
# CONFIG_PTP_1588_CLOCK is not set
CONFIG_PINCTRL=y
+CONFIG_GPIOLIB=y
+CONFIG_GPIO_NOMADIK=y
+CONFIG_SENSORS_LM75=y
CONFIG_MFD_SYSCON=y
CONFIG_HID_A4TECH=y
CONFIG_HID_BELKIN=y
@@ -79,6 +83,8 @@ CONFIG_HID_MICROSOFT=y
CONFIG_HID_MONTEREY=y
CONFIG_MMC=y
CONFIG_MMC_SDHCI=y
+CONFIG_MMC_SDHCI_PLTFM=y
+CONFIG_MMC_SDHCI_CADENCE=y
# CONFIG_IOMMU_SUPPORT is not set
CONFIG_RESET_CONTROLLER=y
# CONFIG_NVMEM is not set
diff --git a/arch/mips/configs/eyeq6_defconfig b/arch/mips/configs/eyeq6_defconfig
index 0afbb45a78e8..0a00a201937b 100644
--- a/arch/mips/configs/eyeq6_defconfig
+++ b/arch/mips/configs/eyeq6_defconfig
@@ -82,6 +82,8 @@ CONFIG_HID_MICROSOFT=y
CONFIG_HID_MONTEREY=y
CONFIG_MMC=y
CONFIG_MMC_SDHCI=y
+CONFIG_MMC_SDHCI_PLTFM=y
+CONFIG_MMC_SDHCI_CADENCE=y
# CONFIG_IOMMU_SUPPORT is not set
CONFIG_RESET_CONTROLLER=y
# CONFIG_NVMEM is not set
diff --git a/arch/mips/configs/fuloong2e_defconfig b/arch/mips/configs/fuloong2e_defconfig
index 114fcd67898d..cdedbb8a8f53 100644
--- a/arch/mips/configs/fuloong2e_defconfig
+++ b/arch/mips/configs/fuloong2e_defconfig
@@ -44,7 +44,6 @@ CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
CONFIG_NETFILTER_XT_TARGET_TRACE=m
CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m
CONFIG_NETFILTER_XT_MATCH_COMMENT=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
CONFIG_NETFILTER_XT_MATCH_ESP=m
CONFIG_NETFILTER_XT_MATCH_IPRANGE=m
CONFIG_NETFILTER_XT_MATCH_LENGTH=m
diff --git a/arch/mips/configs/ip22_defconfig b/arch/mips/configs/ip22_defconfig
index f1a8ccf2c459..2decf8b98d31 100644
--- a/arch/mips/configs/ip22_defconfig
+++ b/arch/mips/configs/ip22_defconfig
@@ -79,7 +79,6 @@ CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m
CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
CONFIG_NETFILTER_XT_MATCH_DSCP=m
CONFIG_NETFILTER_XT_MATCH_ESP=m
CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m
diff --git a/arch/mips/configs/loongson2k_defconfig b/arch/mips/configs/loongson2k_defconfig
index 4b7f914d01d0..6aea6a5b1b66 100644
--- a/arch/mips/configs/loongson2k_defconfig
+++ b/arch/mips/configs/loongson2k_defconfig
@@ -52,7 +52,6 @@ CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
CONFIG_NETFILTER_XT_TARGET_MARK=m
CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
CONFIG_NETFILTER_XT_MATCH_COMMENT=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
CONFIG_NETFILTER_XT_MATCH_ESP=m
CONFIG_NETFILTER_XT_MATCH_LENGTH=m
CONFIG_NETFILTER_XT_MATCH_LIMIT=m
diff --git a/arch/mips/configs/loongson3_defconfig b/arch/mips/configs/loongson3_defconfig
index 5ff0c1554168..b00344e0d899 100644
--- a/arch/mips/configs/loongson3_defconfig
+++ b/arch/mips/configs/loongson3_defconfig
@@ -72,7 +72,6 @@ CONFIG_NETFILTER_XT_TARGET_MARK=m
CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
CONFIG_NETFILTER_XT_MATCH_COMMENT=m
CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
CONFIG_NETFILTER_XT_MATCH_ESP=m
CONFIG_NETFILTER_XT_MATCH_LENGTH=m
CONFIG_NETFILTER_XT_MATCH_LIMIT=m
diff --git a/arch/mips/configs/malta_defconfig b/arch/mips/configs/malta_defconfig
index 869a14b3184f..9fcbac829920 100644
--- a/arch/mips/configs/malta_defconfig
+++ b/arch/mips/configs/malta_defconfig
@@ -80,7 +80,6 @@ CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m
CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
CONFIG_NETFILTER_XT_MATCH_ESP=m
CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m
CONFIG_NETFILTER_XT_MATCH_HELPER=m
diff --git a/arch/mips/configs/malta_kvm_defconfig b/arch/mips/configs/malta_kvm_defconfig
index 41e1fea303ea..19102386a81c 100644
--- a/arch/mips/configs/malta_kvm_defconfig
+++ b/arch/mips/configs/malta_kvm_defconfig
@@ -84,7 +84,6 @@ CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m
CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
CONFIG_NETFILTER_XT_MATCH_ESP=m
CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m
CONFIG_NETFILTER_XT_MATCH_HELPER=m
diff --git a/arch/mips/configs/maltaup_xpa_defconfig b/arch/mips/configs/maltaup_xpa_defconfig
index 13ff1877e26e..1dd07c9d1812 100644
--- a/arch/mips/configs/maltaup_xpa_defconfig
+++ b/arch/mips/configs/maltaup_xpa_defconfig
@@ -82,7 +82,6 @@ CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m
CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
CONFIG_NETFILTER_XT_MATCH_ESP=m
CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m
CONFIG_NETFILTER_XT_MATCH_HELPER=m
diff --git a/arch/mips/configs/rb532_defconfig b/arch/mips/configs/rb532_defconfig
index 9fb114ef5e2d..30d18b084cda 100644
--- a/arch/mips/configs/rb532_defconfig
+++ b/arch/mips/configs/rb532_defconfig
@@ -56,7 +56,6 @@ CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
CONFIG_NETFILTER_XT_TARGET_TRACE=m
CONFIG_NETFILTER_XT_MATCH_COMMENT=m
CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m
CONFIG_NETFILTER_XT_MATCH_LIMIT=y
CONFIG_NETFILTER_XT_MATCH_MULTIPORT=y
diff --git a/arch/mips/configs/rm200_defconfig b/arch/mips/configs/rm200_defconfig
index 7b5a5591ccc9..39a2419e1f3e 100644
--- a/arch/mips/configs/rm200_defconfig
+++ b/arch/mips/configs/rm200_defconfig
@@ -64,7 +64,6 @@ CONFIG_NETFILTER_XT_MATCH_COMMENT=m
CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m
CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
CONFIG_NETFILTER_XT_MATCH_DSCP=m
CONFIG_NETFILTER_XT_MATCH_ESP=m
CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m
diff --git a/arch/mips/include/asm/cpu-info.h b/arch/mips/include/asm/cpu-info.h
index a600670d00e9..fd60837ce50b 100644
--- a/arch/mips/include/asm/cpu-info.h
+++ b/arch/mips/include/asm/cpu-info.h
@@ -123,6 +123,7 @@ extern struct cpuinfo_mips cpu_data[];
extern void cpu_probe(void);
extern void cpu_report(void);
+extern void cpu_disable_mmid(void);
extern const char *__cpu_name[];
#define cpu_name_string() __cpu_name[raw_smp_processor_id()]
diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h
index fbc71ddcf0f6..8c460ce01ffe 100644
--- a/arch/mips/include/asm/hugetlb.h
+++ b/arch/mips/include/asm/hugetlb.h
@@ -11,20 +11,6 @@
#include <asm/page.h>
-#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE
-static inline int prepare_hugepage_range(struct file *file,
- unsigned long addr,
- unsigned long len)
-{
- unsigned long task_size = STACK_TOP;
-
- if (len > task_size)
- return -ENOMEM;
- if (task_size - len < addr)
- return -EINVAL;
- return 0;
-}
-
#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep,
diff --git a/arch/mips/include/asm/mach-generic/mc146818rtc.h b/arch/mips/include/asm/mach-generic/mc146818rtc.h
index 9c72e540ff56..249279b0494d 100644
--- a/arch/mips/include/asm/mach-generic/mc146818rtc.h
+++ b/arch/mips/include/asm/mach-generic/mc146818rtc.h
@@ -29,8 +29,4 @@ static inline void CMOS_WRITE(unsigned char data, unsigned long addr)
#define RTC_ALWAYS_BCD 0
-#ifndef mc146818_decode_year
-#define mc146818_decode_year(year) ((year) < 70 ? (year) + 2000 : (year) + 1900)
-#endif
-
#endif /* __ASM_MACH_GENERIC_MC146818RTC_H */
diff --git a/arch/mips/include/asm/mach-ip30/cpu-feature-overrides.h b/arch/mips/include/asm/mach-ip30/cpu-feature-overrides.h
index ce4e4c6e09e2..50d487a4c95e 100644
--- a/arch/mips/include/asm/mach-ip30/cpu-feature-overrides.h
+++ b/arch/mips/include/asm/mach-ip30/cpu-feature-overrides.h
@@ -5,7 +5,7 @@
* Copyright (C) 2003 Ralf Baechle <ralf@linux-mips.org>
* 2004-2007 Stanislaw Skowronek <skylark@unaligned.org>
* 2009 Johannes Dickgreber <tanzy@gmx.de>
- * 2015 Joshua Kinard <kumba@gentoo.org>
+ * 2015 Joshua Kinard <linux@kumba.dev>
*
*/
#ifndef __ASM_MACH_IP30_CPU_FEATURE_OVERRIDES_H
diff --git a/arch/mips/include/asm/mach-ip30/spaces.h b/arch/mips/include/asm/mach-ip30/spaces.h
index c8a302dfbe05..d381b93d6ad3 100644
--- a/arch/mips/include/asm/mach-ip30/spaces.h
+++ b/arch/mips/include/asm/mach-ip30/spaces.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
- * Copyright (C) 2016 Joshua Kinard <kumba@gentoo.org>
+ * Copyright (C) 2016 Joshua Kinard <linux@kumba.dev>
*
*/
#ifndef _ASM_MACH_IP30_SPACES_H
diff --git a/arch/mips/include/asm/mach-jazz/mc146818rtc.h b/arch/mips/include/asm/mach-jazz/mc146818rtc.h
index 987f727afe25..639bff8ebca3 100644
--- a/arch/mips/include/asm/mach-jazz/mc146818rtc.h
+++ b/arch/mips/include/asm/mach-jazz/mc146818rtc.h
@@ -33,6 +33,4 @@ static inline void CMOS_WRITE(unsigned char data, unsigned long addr)
#define RTC_ALWAYS_BCD 0
-#define mc146818_decode_year(year) ((year) + 1980)
-
#endif /* __ASM_MACH_JAZZ_MC146818RTC_H */
diff --git a/arch/mips/include/asm/mach-lantiq/xway/lantiq_soc.h b/arch/mips/include/asm/mach-lantiq/xway/lantiq_soc.h
index c2e0acb755cd..dd9f621d0204 100644
--- a/arch/mips/include/asm/mach-lantiq/xway/lantiq_soc.h
+++ b/arch/mips/include/asm/mach-lantiq/xway/lantiq_soc.h
@@ -99,5 +99,8 @@ extern __iomem void *ltq_cgu_membase;
extern void ltq_pmu_enable(unsigned int module);
extern void ltq_pmu_disable(unsigned int module);
+/* VMMC */
+extern unsigned int *ltq_get_cp1_base(void);
+
#endif /* CONFIG_SOC_TYPE_XWAY */
#endif /* _LTQ_XWAY_H__ */
diff --git a/arch/mips/include/asm/mach-malta/mc146818rtc.h b/arch/mips/include/asm/mach-malta/mc146818rtc.h
index e8cc7fdf7415..7da2c0ea55da 100644
--- a/arch/mips/include/asm/mach-malta/mc146818rtc.h
+++ b/arch/mips/include/asm/mach-malta/mc146818rtc.h
@@ -31,6 +31,4 @@ static inline void CMOS_WRITE(unsigned char data, unsigned long addr)
#define RTC_ALWAYS_BCD 0
-#define mc146818_decode_year(year) ((year) < 70 ? (year) + 2000 : (year) + 1900)
-
#endif /* __ASM_MACH_MALTA_MC146818RTC_H */
diff --git a/arch/mips/include/asm/mach-rm/mc146818rtc.h b/arch/mips/include/asm/mach-rm/mc146818rtc.h
deleted file mode 100644
index a074f4f84f75..000000000000
--- a/arch/mips/include/asm/mach-rm/mc146818rtc.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License. See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Copyright (C) 2004 by Ralf Baechle
- *
- * RTC routines for PC style attached Dallas chip with ARC epoch.
- */
-#ifndef __ASM_MACH_RM_MC146818RTC_H
-#define __ASM_MACH_RM_MC146818RTC_H
-
-#ifdef CONFIG_CPU_BIG_ENDIAN
-#define mc146818_decode_year(year) ((year) < 70 ? (year) + 2000 : (year) + 1900)
-#else
-#define mc146818_decode_year(year) ((year) + 1980)
-#endif
-
-#include <asm/mach-generic/mc146818rtc.h>
-
-#endif /* __ASM_MACH_RM_MC146818RTC_H */
diff --git a/arch/mips/include/asm/mc146818-time.h b/arch/mips/include/asm/mc146818-time.h
index cbf5cec345f1..ac52a30b4161 100644
--- a/arch/mips/include/asm/mc146818-time.h
+++ b/arch/mips/include/asm/mc146818-time.h
@@ -8,112 +8,21 @@
#ifndef __ASM_MC146818_TIME_H
#define __ASM_MC146818_TIME_H
-#include <linux/bcd.h>
#include <linux/mc146818rtc.h>
#include <linux/time.h>
-/*
- * For check timing call set_rtc_mmss() 500ms; used in timer interrupt.
- */
-#define USEC_AFTER 500000
-#define USEC_BEFORE 500000
-
-/*
- * In order to set the CMOS clock precisely, set_rtc_mmss has to be
- * called 500 ms after the second nowtime has started, because when
- * nowtime is written into the registers of the CMOS clock, it will
- * jump to the next second precisely 500 ms later. Check the Motorola
- * MC146818A or Dallas DS12887 data sheet for details.
- *
- * BUG: This routine does not handle hour overflow properly; it just
- * sets the minutes. Usually you'll only notice that after reboot!
- */
-static inline int mc146818_set_rtc_mmss(unsigned long nowtime)
-{
- int real_seconds, real_minutes, cmos_minutes;
- unsigned char save_control, save_freq_select;
- int retval = 0;
- unsigned long flags;
-
- spin_lock_irqsave(&rtc_lock, flags);
- save_control = CMOS_READ(RTC_CONTROL); /* tell the clock it's being set */
- CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL);
-
- save_freq_select = CMOS_READ(RTC_FREQ_SELECT); /* stop and reset prescaler */
- CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT);
-
- cmos_minutes = CMOS_READ(RTC_MINUTES);
- if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
- cmos_minutes = bcd2bin(cmos_minutes);
-
- /*
- * since we're only adjusting minutes and seconds,
- * don't interfere with hour overflow. This avoids
- * messing with unknown time zones but requires your
- * RTC not to be off by more than 15 minutes
- */
- real_seconds = nowtime % 60;
- real_minutes = nowtime / 60;
- if (((abs(real_minutes - cmos_minutes) + 15)/30) & 1)
- real_minutes += 30; /* correct for half hour time zone */
- real_minutes %= 60;
-
- if (abs(real_minutes - cmos_minutes) < 30) {
- if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
- real_seconds = bin2bcd(real_seconds);
- real_minutes = bin2bcd(real_minutes);
- }
- CMOS_WRITE(real_seconds, RTC_SECONDS);
- CMOS_WRITE(real_minutes, RTC_MINUTES);
- } else {
- printk_once(KERN_NOTICE
- "set_rtc_mmss: can't update from %d to %d\n",
- cmos_minutes, real_minutes);
- retval = -1;
- }
-
- /* The following flags have to be released exactly in this order,
- * otherwise the DS12887 (popular MC146818A clone with integrated
- * battery and quartz) will not reset the oscillator and will not
- * update precisely 500 ms later. You won't find this mentioned in
- * the Dallas Semiconductor data sheets, but who believes data
- * sheets anyway ... -- Markus Kuhn
- */
- CMOS_WRITE(save_control, RTC_CONTROL);
- CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
- spin_unlock_irqrestore(&rtc_lock, flags);
-
- return retval;
-}
-
+#ifdef CONFIG_RTC_MC146818_LIB
static inline time64_t mc146818_get_cmos_time(void)
{
- unsigned int year, mon, day, hour, min, sec;
- unsigned long flags;
-
- spin_lock_irqsave(&rtc_lock, flags);
-
- do {
- sec = CMOS_READ(RTC_SECONDS);
- min = CMOS_READ(RTC_MINUTES);
- hour = CMOS_READ(RTC_HOURS);
- day = CMOS_READ(RTC_DAY_OF_MONTH);
- mon = CMOS_READ(RTC_MONTH);
- year = CMOS_READ(RTC_YEAR);
- } while (sec != CMOS_READ(RTC_SECONDS));
+ struct rtc_time tm;
- if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
- sec = bcd2bin(sec);
- min = bcd2bin(min);
- hour = bcd2bin(hour);
- day = bcd2bin(day);
- mon = bcd2bin(mon);
- year = bcd2bin(year);
+ if (mc146818_get_time(&tm, 1000)) {
+ pr_err("Unable to read current time from RTC\n");
+ return 0;
}
- spin_unlock_irqrestore(&rtc_lock, flags);
- year = mc146818_decode_year(year);
- return mktime64(year, mon, day, hour, min, sec);
+ return rtc_tm_to_time64(&tm);
}
+#endif /* CONFIG_RTC_MC146818_LIB */
#endif /* __ASM_MC146818_TIME_H */
diff --git a/arch/mips/include/asm/mips-cps.h b/arch/mips/include/asm/mips-cps.h
index 917009b80e69..1fffd47a4564 100644
--- a/arch/mips/include/asm/mips-cps.h
+++ b/arch/mips/include/asm/mips-cps.h
@@ -258,6 +258,8 @@ static inline bool mips_cps_multicluster_cpus(void)
/**
* mips_cps_first_online_in_cluster() - Detect if CPU is first online in cluster
+ * @first_cpu: The first other online CPU in cluster, or nr_cpu_ids if
+ * the function returns true.
*
* Determine whether the local CPU is the first to be brought online in its
* cluster - that is, whether there are any other online CPUs in the local
@@ -265,6 +267,6 @@ static inline bool mips_cps_multicluster_cpus(void)
*
* Returns true if this CPU is first online, else false.
*/
-extern unsigned int mips_cps_first_online_in_cluster(void);
+extern unsigned int mips_cps_first_online_in_cluster(int *first_cpu);
#endif /* __MIPS_ASM_MIPS_CPS_H__ */
diff --git a/arch/mips/include/asm/sgi/heart.h b/arch/mips/include/asm/sgi/heart.h
index 0d03751955c4..c224c2e3575a 100644
--- a/arch/mips/include/asm/sgi/heart.h
+++ b/arch/mips/include/asm/sgi/heart.h
@@ -4,7 +4,7 @@
*
* Copyright (C) 2004-2007 Stanislaw Skowronek <skylark@unaligned.org>
* 2009 Johannes Dickgreber <tanzy@gmx.de>
- * 2007-2015 Joshua Kinard <kumba@gentoo.org>
+ * 2007-2015 Joshua Kinard <linux@kumba.dev>
*/
#ifndef __ASM_SGI_HEART_H
#define __ASM_SGI_HEART_H
diff --git a/arch/mips/include/asm/smp-cps.h b/arch/mips/include/asm/smp-cps.h
index 10d3ebd890cb..88cfae5d22c8 100644
--- a/arch/mips/include/asm/smp-cps.h
+++ b/arch/mips/include/asm/smp-cps.h
@@ -24,6 +24,7 @@ struct core_boot_config {
struct cluster_boot_config {
unsigned long *core_power;
+ struct cpumask cpumask;
struct core_boot_config *core_config;
};
diff --git a/arch/mips/include/asm/vpe.h b/arch/mips/include/asm/vpe.h
index 61fd4d0aeda4..c0769dc4b853 100644
--- a/arch/mips/include/asm/vpe.h
+++ b/arch/mips/include/asm/vpe.h
@@ -119,4 +119,12 @@ void cleanup_tc(struct tc *tc);
int __init vpe_module_init(void);
void __exit vpe_module_exit(void);
+
+#ifdef CONFIG_MIPS_VPE_LOADER_MT
+void *vpe_alloc(void);
+int vpe_start(void *vpe, unsigned long start);
+int vpe_stop(void *vpe);
+int vpe_free(void *vpe);
+#endif /* CONFIG_MIPS_VPE_LOADER_MT */
+
#endif /* _ASM_VPE_H */
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index 31ac655b7837..72fb1b006da9 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -163,6 +163,9 @@
#define SO_PASSRIGHTS 83
+#define SO_INQ 84
+#define SCM_INQ SO_INQ
+
#if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64
diff --git a/arch/mips/kernel/cpu-probe.c b/arch/mips/kernel/cpu-probe.c
index af7412549e6e..04dc9ab55524 100644
--- a/arch/mips/kernel/cpu-probe.c
+++ b/arch/mips/kernel/cpu-probe.c
@@ -9,6 +9,7 @@
*/
#include <linux/init.h>
#include <linux/kernel.h>
+#include <linux/mmu_context.h>
#include <linux/ptrace.h>
#include <linux/smp.h>
#include <linux/stddef.h>
@@ -37,6 +38,8 @@
unsigned int elf_hwcap __read_mostly;
EXPORT_SYMBOL_GPL(elf_hwcap);
+static bool mmid_disabled_quirk;
+
static inline unsigned long cpu_get_msa_id(void)
{
unsigned long status, msa_id;
@@ -645,7 +648,7 @@ static inline unsigned int decode_config5(struct cpuinfo_mips *c)
config5 &= ~(MIPS_CONF5_UFR | MIPS_CONF5_UFE);
if (cpu_has_mips_r6) {
- if (!__builtin_constant_p(cpu_has_mmid) || cpu_has_mmid)
+ if (!mmid_disabled_quirk && (!__builtin_constant_p(cpu_has_mmid) || cpu_has_mmid))
config5 |= MIPS_CONF5_MI;
else
config5 &= ~MIPS_CONF5_MI;
@@ -708,7 +711,6 @@ static inline unsigned int decode_config5(struct cpuinfo_mips *c)
max_mmid_width);
asid_mask = GENMASK(max_mmid_width - 1, 0);
}
-
set_cpu_asid_mask(c, asid_mask);
}
}
@@ -2046,3 +2048,39 @@ void cpu_set_vpe_id(struct cpuinfo_mips *cpuinfo, unsigned int vpe)
cpuinfo->globalnumber &= ~MIPS_GLOBALNUMBER_VP;
cpuinfo->globalnumber |= vpe << MIPS_GLOBALNUMBER_VP_SHF;
}
+
+void cpu_disable_mmid(void)
+{
+ int i;
+ unsigned long asid_mask;
+ unsigned int cpu = smp_processor_id();
+ struct cpuinfo_mips *c = &current_cpu_data;
+ unsigned int config4 = read_c0_config4();
+ unsigned int config5 = read_c0_config5();
+
+ /* Setup the initial ASID mask based on config4 */
+ asid_mask = MIPS_ENTRYHI_ASID;
+ if (config4 & MIPS_CONF4_AE)
+ asid_mask |= MIPS_ENTRYHI_ASIDX;
+ set_cpu_asid_mask(c, asid_mask);
+
+ /* Disable MMID in the C0 and update cpuinfo_mips accordingly */
+ config5 &= ~(MIPS_CONF5_UFR | MIPS_CONF5_UFE);
+ config5 &= ~MIPS_CONF5_MI;
+ write_c0_config5(config5);
+ /* Ensure the write to config5 above takes effect */
+ back_to_back_c0_hazard();
+ c->options &= ~MIPS_CPU_MMID;
+
+ /* Setup asid cache value cleared in per_cpu_trap_init() */
+ cpu_data[cpu].asid_cache = asid_first_version(cpu);
+
+ /* Reinit context for each CPU */
+ for_each_possible_cpu(i)
+ set_cpu_context(i, &init_mm, 0);
+
+ /* Ensure that now MMID will be seen as disable */
+ mmid_disabled_quirk = true;
+
+ pr_info("MMID support disabled due to hardware support issue\n");
+}
diff --git a/arch/mips/kernel/mips-cm.c b/arch/mips/kernel/mips-cm.c
index 43cb1e20baed..7c9c5dc38823 100644
--- a/arch/mips/kernel/mips-cm.c
+++ b/arch/mips/kernel/mips-cm.c
@@ -10,6 +10,7 @@
#include <linux/spinlock.h>
#include <asm/mips-cps.h>
+#include <asm/smp-cps.h>
#include <asm/mipsregs.h>
void __iomem *mips_gcr_base;
@@ -248,6 +249,11 @@ void mips_cm_update_property(void)
return;
pr_info("HCI (Hardware Cache Init for the L2 cache) in GCR_L2_RAM_CONFIG from the CM3 is broken");
mips_cm_is_l2_hci_broken = true;
+
+ /* Disable MMID only if it was configured */
+ if (cpu_has_mmid)
+ cpu_disable_mmid();
+
of_node_put(cm_node);
}
@@ -529,39 +535,23 @@ void mips_cm_error_report(void)
write_gcr_error_cause(cm_error);
}
-unsigned int mips_cps_first_online_in_cluster(void)
+unsigned int mips_cps_first_online_in_cluster(int *first_cpu)
{
- unsigned int local_cl;
- int i;
-
- local_cl = cpu_cluster(&current_cpu_data);
+ unsigned int local_cl = cpu_cluster(&current_cpu_data);
+ struct cpumask *local_cl_mask;
/*
- * We rely upon knowledge that CPUs are numbered sequentially by
- * cluster - ie. CPUs 0..X will be in cluster 0, CPUs X+1..Y in cluster
- * 1, CPUs Y+1..Z in cluster 2 etc. This means that CPUs in the same
- * cluster will immediately precede or follow one another.
- *
- * First we scan backwards, until we find an online CPU in the cluster
- * or we move on to another cluster.
+ * mips_cps_cluster_bootcfg is allocated in cps_prepare_cpus. If it is
+ * not yet done, then we are so early that only one CPU is running, so
+ * it is the first online CPU in the cluster.
*/
- for (i = smp_processor_id() - 1; i >= 0; i--) {
- if (cpu_cluster(&cpu_data[i]) != local_cl)
- break;
- if (!cpu_online(i))
- continue;
- return false;
- }
-
- /* Then do the same for higher numbered CPUs */
- for (i = smp_processor_id() + 1; i < nr_cpu_ids; i++) {
- if (cpu_cluster(&cpu_data[i]) != local_cl)
- break;
- if (!cpu_online(i))
- continue;
- return false;
- }
-
- /* We found no online CPUs in the local cluster */
- return true;
+ if (IS_ENABLED(CONFIG_MIPS_CPS) && mips_cps_cluster_bootcfg)
+ local_cl_mask = &mips_cps_cluster_bootcfg[local_cl].cpumask;
+ else
+ return true;
+
+ *first_cpu = cpumask_any_and_but(local_cl_mask,
+ cpu_online_mask,
+ smp_processor_id());
+ return (*first_cpu >= nr_cpu_ids);
}
diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c
index b630604c577f..02aa6a04a21d 100644
--- a/arch/mips/kernel/process.c
+++ b/arch/mips/kernel/process.c
@@ -690,18 +690,20 @@ unsigned long mips_stack_top(void)
}
/* Space for the VDSO, data page & GIC user page */
- top -= PAGE_ALIGN(current->thread.abi->vdso->size);
- top -= PAGE_SIZE;
- top -= mips_gic_present() ? PAGE_SIZE : 0;
+ if (current->thread.abi) {
+ top -= PAGE_ALIGN(current->thread.abi->vdso->size);
+ top -= PAGE_SIZE;
+ top -= mips_gic_present() ? PAGE_SIZE : 0;
+
+ /* Space to randomize the VDSO base */
+ if (current->flags & PF_RANDOMIZE)
+ top -= VDSO_RANDOMIZE_SIZE;
+ }
/* Space for cache colour alignment */
if (cpu_has_dc_aliases)
top -= shm_align_mask + 1;
- /* Space to randomize the VDSO base */
- if (current->flags & PF_RANDOMIZE)
- top -= VDSO_RANDOMIZE_SIZE;
-
return top;
}
diff --git a/arch/mips/kernel/relocate.c b/arch/mips/kernel/relocate.c
index cda7983e7c18..7f1c136ad850 100644
--- a/arch/mips/kernel/relocate.c
+++ b/arch/mips/kernel/relocate.c
@@ -138,7 +138,7 @@ static int __init reloc_handler(u32 type, u32 *loc_orig, u32 *loc_new,
apply_r_mips_hi16_rel(loc_orig, loc_new, offset);
break;
default:
- pr_err("Unhandled relocation type %d at 0x%pK\n", type,
+ pr_err("Unhandled relocation type %d at 0x%p\n", type,
loc_orig);
return -ENOEXEC;
}
@@ -439,10 +439,10 @@ static void show_kernel_relocation(const char *level)
{
if (__kaslr_offset > 0) {
printk(level);
- pr_cont("Kernel relocated by 0x%pK\n", (void *)__kaslr_offset);
- pr_cont(" .text @ 0x%pK\n", _text);
- pr_cont(" .data @ 0x%pK\n", _sdata);
- pr_cont(" .bss @ 0x%pK\n", __bss_start);
+ pr_cont("Kernel relocated by 0x%p\n", (void *)__kaslr_offset);
+ pr_cont(" .text @ 0x%p\n", _text);
+ pr_cont(" .data @ 0x%p\n", _sdata);
+ pr_cont(" .bss @ 0x%p\n", __bss_start);
}
}
diff --git a/arch/mips/kernel/smp-cps.c b/arch/mips/kernel/smp-cps.c
index 7b0e69af4097..22d4f9ff3ae2 100644
--- a/arch/mips/kernel/smp-cps.c
+++ b/arch/mips/kernel/smp-cps.c
@@ -281,9 +281,20 @@ static void __init cps_smp_setup(void)
#endif /* CONFIG_MIPS_MT_FPAFF */
}
+unsigned long calibrate_delay_is_known(void)
+{
+ int first_cpu_cluster = 0;
+
+ /* The calibration has to be done on the primary CPU of the cluster */
+ if (mips_cps_first_online_in_cluster(&first_cpu_cluster))
+ return 0;
+
+ return cpu_data[first_cpu_cluster].udelay_val;
+}
+
static void __init cps_prepare_cpus(unsigned int max_cpus)
{
- unsigned int nclusters, ncores, core_vpes, c, cl, cca;
+ unsigned int nclusters, ncores, core_vpes, nvpe = 0, c, cl, cca;
bool cca_unsuitable, cores_limited;
struct cluster_boot_config *cluster_bootcfg;
struct core_boot_config *core_bootcfg;
@@ -356,10 +367,13 @@ static void __init cps_prepare_cpus(unsigned int max_cpus)
/* Allocate VPE boot configuration structs */
for (c = 0; c < ncores; c++) {
+ int v;
core_vpes = core_vpe_count(cl, c);
core_bootcfg[c].vpe_config = kcalloc(core_vpes,
sizeof(*core_bootcfg[c].vpe_config),
GFP_KERNEL);
+ for (v = 0; v < core_vpes; v++)
+ cpumask_set_cpu(nvpe++, &mips_cps_cluster_bootcfg[cl].cpumask);
if (!core_bootcfg[c].vpe_config)
goto err_out;
}
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index cef3c423a41a..a75587018f44 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -315,7 +315,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
* we allocate is out of range, just give up now.
*/
if (!cpu_has_ebase_wg && virt_to_phys(gebase) >= 0x20000000) {
- kvm_err("CP0_EBase.WG required for guest exception base %pK\n",
+ kvm_err("CP0_EBase.WG required for guest exception base %p\n",
gebase);
err = -ENOMEM;
goto out_free_gebase;
diff --git a/arch/mips/lantiq/falcon/prom.c b/arch/mips/lantiq/falcon/prom.c
index 7b98def106e4..2a38c4267685 100644
--- a/arch/mips/lantiq/falcon/prom.c
+++ b/arch/mips/lantiq/falcon/prom.c
@@ -36,14 +36,14 @@
#define BOOT_NVEC (BOOT_REG_BASE | 0x04)
#define BOOT_EVEC (BOOT_REG_BASE | 0x08)
-void __init ltq_soc_nmi_setup(void)
+static void __init ltq_soc_nmi_setup(void)
{
extern void (*nmi_handler)(void);
ltq_w32((unsigned long)&nmi_handler, (void *)BOOT_NVEC);
}
-void __init ltq_soc_ejtag_setup(void)
+static void __init ltq_soc_ejtag_setup(void)
{
extern void (*ejtag_debug_handler)(void);
diff --git a/arch/mips/lantiq/falcon/sysctrl.c b/arch/mips/lantiq/falcon/sysctrl.c
index 1187729d8cbb..577e6e6309a6 100644
--- a/arch/mips/lantiq/falcon/sysctrl.c
+++ b/arch/mips/lantiq/falcon/sysctrl.c
@@ -14,6 +14,7 @@
#include <lantiq_soc.h>
#include "../clk.h"
+#include "../prom.h"
/* infrastructure control register */
#define SYS1_INFRAC 0x00bc
@@ -72,11 +73,6 @@
static void __iomem *sysctl_membase[3], *status_membase;
void __iomem *ltq_sys1_membase, *ltq_ebu_membase;
-void falcon_trigger_hrst(int level)
-{
- sysctl_w32(SYSCTL_SYS1, level & 1, SYS1_HRSTOUTC);
-}
-
static inline void sysctl_wait(struct clk *clk,
unsigned int test, unsigned int reg)
{
@@ -214,19 +210,16 @@ void __init ltq_soc_init(void)
of_node_put(np_syseth);
of_node_put(np_sysgpe);
- if ((request_mem_region(res_status.start, resource_size(&res_status),
- res_status.name) < 0) ||
- (request_mem_region(res_ebu.start, resource_size(&res_ebu),
- res_ebu.name) < 0) ||
- (request_mem_region(res_sys[0].start,
- resource_size(&res_sys[0]),
- res_sys[0].name) < 0) ||
- (request_mem_region(res_sys[1].start,
- resource_size(&res_sys[1]),
- res_sys[1].name) < 0) ||
- (request_mem_region(res_sys[2].start,
- resource_size(&res_sys[2]),
- res_sys[2].name) < 0))
+ if ((!request_mem_region(res_status.start, resource_size(&res_status),
+ res_status.name)) ||
+ (!request_mem_region(res_ebu.start, resource_size(&res_ebu),
+ res_ebu.name)) ||
+ (!request_mem_region(res_sys[0].start, resource_size(&res_sys[0]),
+ res_sys[0].name)) ||
+ (!request_mem_region(res_sys[1].start, resource_size(&res_sys[1]),
+ res_sys[1].name)) ||
+ (!request_mem_region(res_sys[2].start, resource_size(&res_sys[2]),
+ res_sys[2].name)))
pr_err("Failed to request core resources");
status_membase = ioremap(res_status.start,
diff --git a/arch/mips/lantiq/irq.c b/arch/mips/lantiq/irq.c
index a112573b6e37..961c55933a6d 100644
--- a/arch/mips/lantiq/irq.c
+++ b/arch/mips/lantiq/irq.c
@@ -16,6 +16,7 @@
#include <asm/bootinfo.h>
#include <asm/irq_cpu.h>
+#include <asm/time.h>
#include <lantiq_soc.h>
#include <irq.h>
@@ -335,7 +336,8 @@ static const struct irq_domain_ops irq_domain_ops = {
.map = icu_map,
};
-int __init icu_of_init(struct device_node *node, struct device_node *parent)
+static int __init
+icu_of_init(struct device_node *node, struct device_node *parent)
{
struct device_node *eiu_node;
struct resource res;
diff --git a/arch/mips/lantiq/xway/clk.c b/arch/mips/lantiq/xway/clk.c
index 47ad21430fe2..39fb3ecdd6b7 100644
--- a/arch/mips/lantiq/xway/clk.c
+++ b/arch/mips/lantiq/xway/clk.c
@@ -74,7 +74,7 @@ unsigned long ltq_danube_pp32_hz(void)
return clk;
}
-unsigned long ltq_ar9_sys_hz(void)
+static unsigned long ltq_ar9_sys_hz(void)
{
if (((ltq_cgu_r32(CGU_SYS) >> 3) & 0x3) == 0x2)
return CLOCK_393M;
diff --git a/arch/mips/lantiq/xway/dcdc.c b/arch/mips/lantiq/xway/dcdc.c
index 4a808f8c5beb..b79c462fd48a 100644
--- a/arch/mips/lantiq/xway/dcdc.c
+++ b/arch/mips/lantiq/xway/dcdc.c
@@ -46,7 +46,7 @@ static struct platform_driver dcdc_driver = {
},
};
-int __init dcdc_init(void)
+static int __init dcdc_init(void)
{
int ret = platform_driver_register(&dcdc_driver);
diff --git a/arch/mips/lantiq/xway/dma.c b/arch/mips/lantiq/xway/dma.c
index 934ac72937e5..4693eba6c296 100644
--- a/arch/mips/lantiq/xway/dma.c
+++ b/arch/mips/lantiq/xway/dma.c
@@ -289,7 +289,7 @@ static struct platform_driver dma_driver = {
},
};
-int __init
+static int __init
dma_init(void)
{
return platform_driver_register(&dma_driver);
diff --git a/arch/mips/lantiq/xway/gptu.c b/arch/mips/lantiq/xway/gptu.c
index 8d52001301de..484c9e3000c1 100644
--- a/arch/mips/lantiq/xway/gptu.c
+++ b/arch/mips/lantiq/xway/gptu.c
@@ -194,7 +194,7 @@ static struct platform_driver dma_driver = {
},
};
-int __init gptu_init(void)
+static int __init gptu_init(void)
{
int ret = platform_driver_register(&dma_driver);
diff --git a/arch/mips/loongson64/setup.c b/arch/mips/loongson64/setup.c
index 257038e18779..b3e590eae952 100644
--- a/arch/mips/loongson64/setup.c
+++ b/arch/mips/loongson64/setup.c
@@ -3,7 +3,6 @@
* Copyright (C) 2007 Lemote Inc. & Institute of Computing Technology
* Author: Fuxin Zhang, zhangfx@lemote.com
*/
-#include <linux/export.h>
#include <linux/init.h>
#include <asm/bootinfo.h>
diff --git a/arch/mips/mm/physaddr.c b/arch/mips/mm/physaddr.c
index f9b8c85e9843..a6b1bf82057a 100644
--- a/arch/mips/mm/physaddr.c
+++ b/arch/mips/mm/physaddr.c
@@ -30,7 +30,7 @@ static inline bool __debug_virt_addr_valid(unsigned long x)
phys_addr_t __virt_to_phys(volatile const void *x)
{
WARN(!__debug_virt_addr_valid((unsigned long)x),
- "virt_to_phys used for non-linear address: %pK (%pS)\n",
+ "virt_to_phys used for non-linear address: %p (%pS)\n",
x, x);
return __virt_to_phys_nodebug(x);
diff --git a/arch/mips/mm/tlb-r4k.c b/arch/mips/mm/tlb-r4k.c
index 76f3b9c0a9f0..347126dc010d 100644
--- a/arch/mips/mm/tlb-r4k.c
+++ b/arch/mips/mm/tlb-r4k.c
@@ -508,6 +508,60 @@ static int __init set_ntlb(char *str)
__setup("ntlb=", set_ntlb);
+/* Initialise all TLB entries with unique values */
+static void r4k_tlb_uniquify(void)
+{
+ int entry = num_wired_entries();
+
+ htw_stop();
+ write_c0_entrylo0(0);
+ write_c0_entrylo1(0);
+
+ while (entry < current_cpu_data.tlbsize) {
+ unsigned long asid_mask = cpu_asid_mask(&current_cpu_data);
+ unsigned long asid = 0;
+ int idx;
+
+ /* Skip wired MMID to make ginvt_mmid work */
+ if (cpu_has_mmid)
+ asid = MMID_KERNEL_WIRED + 1;
+
+ /* Check for match before using UNIQUE_ENTRYHI */
+ do {
+ if (cpu_has_mmid) {
+ write_c0_memorymapid(asid);
+ write_c0_entryhi(UNIQUE_ENTRYHI(entry));
+ } else {
+ write_c0_entryhi(UNIQUE_ENTRYHI(entry) | asid);
+ }
+ mtc0_tlbw_hazard();
+ tlb_probe();
+ tlb_probe_hazard();
+ idx = read_c0_index();
+ /* No match or match is on current entry */
+ if (idx < 0 || idx == entry)
+ break;
+ /*
+ * If we hit a match, we need to try again with
+ * a different ASID.
+ */
+ asid++;
+ } while (asid < asid_mask);
+
+ if (idx >= 0 && idx != entry)
+ panic("Unable to uniquify TLB entry %d", idx);
+
+ write_c0_index(entry);
+ mtc0_tlbw_hazard();
+ tlb_write_indexed();
+ entry++;
+ }
+
+ tlbw_use_hazard();
+ htw_start();
+ flush_micro_tlb();
+}
+
/*
* Configure TLB (for init or after a CPU has been powered off).
*/
@@ -547,7 +601,7 @@ static void r4k_tlb_configure(void)
temp_tlb_entry = current_cpu_data.tlbsize - 1;
/* From this point on the ARC firmware is dead. */
- local_flush_tlb_all();
+ r4k_tlb_uniquify();
/* Did I tell you that ARC SUCKS? */
}
diff --git a/arch/mips/pci/pci-lantiq.c b/arch/mips/pci/pci-lantiq.c
index 68a8cefed420..0e85839b8225 100644
--- a/arch/mips/pci/pci-lantiq.c
+++ b/arch/mips/pci/pci-lantiq.c
@@ -234,7 +234,7 @@ static struct platform_driver ltq_pci_driver = {
},
};
-int __init pcibios_init(void)
+static int __init pcibios_init(void)
{
int ret = platform_driver_register(&ltq_pci_driver);
if (ret)
diff --git a/arch/mips/pci/pci-rt2880.c b/arch/mips/pci/pci-rt2880.c
index 1cada09fa5db..006e2bbab87e 100644
--- a/arch/mips/pci/pci-rt2880.c
+++ b/arch/mips/pci/pci-rt2880.c
@@ -264,7 +264,7 @@ static struct platform_driver rt288x_pci_driver = {
},
};
-int __init pcibios_init(void)
+static int __init pcibios_init(void)
{
int ret = platform_driver_register(&rt288x_pci_driver);
diff --git a/arch/mips/ralink/irq.c b/arch/mips/ralink/irq.c
index af5bbbea949b..955b36e89358 100644
--- a/arch/mips/ralink/irq.c
+++ b/arch/mips/ralink/irq.c
@@ -15,6 +15,7 @@
#include <asm/irq_cpu.h>
#include <asm/mipsregs.h>
+#include <asm/time.h>
#include "common.h"
diff --git a/arch/mips/sgi-ip27/ip27-irq.c b/arch/mips/sgi-ip27/ip27-irq.c
index 288d4d17eddd..20ef663af16e 100644
--- a/arch/mips/sgi-ip27/ip27-irq.c
+++ b/arch/mips/sgi-ip27/ip27-irq.c
@@ -165,7 +165,7 @@ static void hub_domain_free(struct irq_domain *domain,
return;
irqd = irq_domain_get_irq_data(domain, virq);
- if (irqd && irqd->chip_data)
+ if (irqd)
kfree(irqd->chip_data);
}
diff --git a/arch/mips/sgi-ip30/ip30-power.c b/arch/mips/sgi-ip30/ip30-power.c
index 120b3f3d5108..66851e17c5a7 100644
--- a/arch/mips/sgi-ip30/ip30-power.c
+++ b/arch/mips/sgi-ip30/ip30-power.c
@@ -3,7 +3,7 @@
* ip30-power.c: Software powerdown and reset handling for IP30 architecture.
*
* Copyright (C) 2004-2007 Stanislaw Skowronek <skylark@unaligned.org>
- * 2014 Joshua Kinard <kumba@gentoo.org>
+ * 2014 Joshua Kinard <linux@kumba.dev>
* 2009 Johannes Dickgreber <tanzy@gmx.de>
*/
diff --git a/arch/mips/sgi-ip30/ip30-setup.c b/arch/mips/sgi-ip30/ip30-setup.c
index e8547636a748..3fcb3ec9f802 100644
--- a/arch/mips/sgi-ip30/ip30-setup.c
+++ b/arch/mips/sgi-ip30/ip30-setup.c
@@ -3,7 +3,7 @@
* SGI IP30 miscellaneous setup bits.
*
* Copyright (C) 2004-2007 Stanislaw Skowronek <skylark@unaligned.org>
- * 2007 Joshua Kinard <kumba@gentoo.org>
+ * 2007 Joshua Kinard <linux@kumba.dev>
* 2009 Johannes Dickgreber <tanzy@gmx.de>
*/
diff --git a/arch/mips/sgi-ip30/ip30-smp.c b/arch/mips/sgi-ip30/ip30-smp.c
index 4bfe654602b1..1e8210f2a9f8 100644
--- a/arch/mips/sgi-ip30/ip30-smp.c
+++ b/arch/mips/sgi-ip30/ip30-smp.c
@@ -5,7 +5,7 @@
* and smp-bmips.c.
*
* Copyright (C) 2005-2007 Stanislaw Skowronek <skylark@unaligned.org>
- * 2006-2007, 2014-2015 Joshua Kinard <kumba@gentoo.org>
+ * 2006-2007, 2014-2015 Joshua Kinard <linux@kumba.dev>
* 2009 Johannes Dickgreber <tanzy@gmx.de>
*/
diff --git a/arch/mips/sgi-ip30/ip30-timer.c b/arch/mips/sgi-ip30/ip30-timer.c
index d13e105478ae..7652f72f0daf 100644
--- a/arch/mips/sgi-ip30/ip30-timer.c
+++ b/arch/mips/sgi-ip30/ip30-timer.c
@@ -5,7 +5,7 @@
*
* Copyright (C) 2004-2007 Stanislaw Skowronek <skylark@unaligned.org>
* Copyright (C) 2009 Johannes Dickgreber <tanzy@gmx.de>
- * Copyright (C) 2011 Joshua Kinard <kumba@gentoo.org>
+ * Copyright (C) 2011 Joshua Kinard <linux@kumba.dev>
*/
#include <linux/clocksource.h>
diff --git a/arch/mips/sgi-ip30/ip30-xtalk.c b/arch/mips/sgi-ip30/ip30-xtalk.c
index 7ceb2b23ea1c..d798ee8c998c 100644
--- a/arch/mips/sgi-ip30/ip30-xtalk.c
+++ b/arch/mips/sgi-ip30/ip30-xtalk.c
@@ -3,7 +3,7 @@
* ip30-xtalk.c - Very basic Crosstalk (XIO) detection support.
* Copyright (C) 2004-2007 Stanislaw Skowronek <skylark@unaligned.org>
* Copyright (C) 2009 Johannes Dickgreber <tanzy@gmx.de>
- * Copyright (C) 2007, 2014-2016 Joshua Kinard <kumba@gentoo.org>
+ * Copyright (C) 2007, 2014-2016 Joshua Kinard <linux@kumba.dev>
*/
#include <linux/init.h>
diff --git a/arch/mips/txx9/generic/setup.c b/arch/mips/txx9/generic/setup.c
index 0586ca7668b4..5a37e8b234a3 100644
--- a/arch/mips/txx9/generic/setup.c
+++ b/arch/mips/txx9/generic/setup.c
@@ -776,7 +776,7 @@ struct txx9_sramc_dev {
};
static ssize_t txx9_sram_read(struct file *filp, struct kobject *kobj,
- struct bin_attribute *bin_attr,
+ const struct bin_attribute *bin_attr,
char *buf, loff_t pos, size_t size)
{
struct txx9_sramc_dev *dev = bin_attr->private;
@@ -791,7 +791,7 @@ static ssize_t txx9_sram_read(struct file *filp, struct kobject *kobj,
}
static ssize_t txx9_sram_write(struct file *filp, struct kobject *kobj,
- struct bin_attribute *bin_attr,
+ const struct bin_attribute *bin_attr,
char *buf, loff_t pos, size_t size)
{
struct txx9_sramc_dev *dev = bin_attr->private;
diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c
index 3a7b5baaa450..af932a4ad306 100644
--- a/arch/openrisc/kernel/dma.c
+++ b/arch/openrisc/kernel/dma.c
@@ -72,7 +72,7 @@ void *arch_dma_set_uncached(void *cpu_addr, size_t size)
* them and setting the cache-inhibit bit.
*/
mmap_write_lock(&init_mm);
- error = walk_page_range_novma(&init_mm, va, va + size,
+ error = walk_kernel_page_table_range(va, va + size,
&set_nocache_walk_ops, NULL, NULL);
mmap_write_unlock(&init_mm);
@@ -87,7 +87,7 @@ void arch_dma_clear_uncached(void *cpu_addr, size_t size)
mmap_write_lock(&init_mm);
/* walk_page_range shouldn't be able to fail here */
- WARN_ON(walk_page_range_novma(&init_mm, va, va + size,
+ WARN_ON(walk_kernel_page_table_range(va, va + size,
&clear_nocache_walk_ops, NULL, NULL));
mmap_write_unlock(&init_mm);
}
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index fcc5973f7519..2efa4b08b7b8 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -81,7 +81,6 @@ config PARISC
select HAVE_KPROBES
select HAVE_KRETPROBES
select HAVE_DYNAMIC_FTRACE if $(cc-option,-fpatchable-function-entry=1,1)
- select HAVE_FTRACE_MCOUNT_RECORD if HAVE_DYNAMIC_FTRACE
select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY if DYNAMIC_FTRACE
select HAVE_KPROBES_ON_FTRACE
select HAVE_DYNAMIC_FTRACE_WITH_REGS
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index 1f2d5b7a7f5d..c16ec36dfee6 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -144,6 +144,9 @@
#define SO_PASSRIGHTS 0x4051
+#define SO_INQ 0x4052
+#define SCM_INQ SO_INQ
+
#if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 45b4fa7b9b02..93402a1d9c9f 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -147,7 +147,6 @@ config PPC
select ARCH_HAS_PMEM_API
select ARCH_HAS_PREEMPT_LAZY
select ARCH_HAS_PTDUMP
- select ARCH_HAS_PTE_DEVMAP if PPC_BOOK3S_64
select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE && PPC_BOOK3S_64
select ARCH_HAS_SET_MEMORY
@@ -244,7 +243,6 @@ config PPC
select HAVE_EFFICIENT_UNALIGNED_ACCESS
select HAVE_GUP_FAST
select HAVE_FTRACE_GRAPH_FUNC
- select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_ARG_ACCESS_API
select HAVE_FUNCTION_DESCRIPTORS if PPC64_ELF_ABI_V1
select HAVE_FUNCTION_ERROR_INJECTION
diff --git a/arch/powerpc/configs/cell_defconfig b/arch/powerpc/configs/cell_defconfig
index 3347192b77b8..7a31b52e92e1 100644
--- a/arch/powerpc/configs/cell_defconfig
+++ b/arch/powerpc/configs/cell_defconfig
@@ -62,7 +62,6 @@ CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
CONFIG_NETFILTER_XT_TARGET_TCPMSS=m
CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m
CONFIG_NETFILTER_XT_MATCH_COMMENT=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
CONFIG_NETFILTER_XT_MATCH_DSCP=m
CONFIG_NETFILTER_XT_MATCH_ESP=m
CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m
diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index aa90a048f319..7132392fa7cd 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -168,12 +168,6 @@ extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
extern int hash__has_transparent_hugepage(void);
#endif
-static inline pmd_t hash__pmd_mkdevmap(pmd_t pmd)
-{
- BUG();
- return pmd;
-}
-
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_POWERPC_BOOK3S_64_HASH_4K_H */
diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index 0bf6fd0bf42a..0fb5b7da9478 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -259,7 +259,7 @@ static inline void mark_hpte_slot_valid(unsigned char *hpte_slot_array,
*/
static inline int hash__pmd_trans_huge(pmd_t pmd)
{
- return !!((pmd_val(pmd) & (_PAGE_PTE | H_PAGE_THP_HUGE | _PAGE_DEVMAP)) ==
+ return !!((pmd_val(pmd) & (_PAGE_PTE | H_PAGE_THP_HUGE)) ==
(_PAGE_PTE | H_PAGE_THP_HUGE));
}
@@ -281,11 +281,6 @@ extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
extern int hash__has_transparent_hugepage(void);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-static inline pmd_t hash__pmd_mkdevmap(pmd_t pmd)
-{
- return __pmd(pmd_val(pmd) | (_PAGE_PTE | H_PAGE_THP_HUGE | _PAGE_DEVMAP));
-}
-
#endif /* __ASSEMBLY__ */
#endif /* _ASM_POWERPC_BOOK3S_64_HASH_64K_H */
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index a2ddcbb3fcb9..c19800365315 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -88,7 +88,6 @@
#define _PAGE_SOFT_DIRTY _RPAGE_SW3 /* software: software dirty tracking */
#define _PAGE_SPECIAL _RPAGE_SW2 /* software: special page */
-#define _PAGE_DEVMAP _RPAGE_SW1 /* software: ZONE_DEVICE page */
/*
* Drivers request for cache inhibited pte mapping using _PAGE_NO_CACHE
@@ -109,7 +108,7 @@
*/
#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \
_PAGE_ACCESSED | H_PAGE_THP_HUGE | _PAGE_PTE | \
- _PAGE_SOFT_DIRTY | _PAGE_DEVMAP)
+ _PAGE_SOFT_DIRTY)
/*
* user access blocked by key
*/
@@ -123,7 +122,7 @@
*/
#define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \
_PAGE_ACCESSED | _PAGE_SPECIAL | _PAGE_PTE | \
- _PAGE_SOFT_DIRTY | _PAGE_DEVMAP)
+ _PAGE_SOFT_DIRTY)
/*
* We define 2 sets of base prot bits, one for basic pages (ie,
@@ -609,24 +608,6 @@ static inline pte_t pte_mkhuge(pte_t pte)
return pte;
}
-static inline pte_t pte_mkdevmap(pte_t pte)
-{
- return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_SPECIAL | _PAGE_DEVMAP));
-}
-
-/*
- * This is potentially called with a pmd as the argument, in which case it's not
- * safe to check _PAGE_DEVMAP unless we also confirm that _PAGE_PTE is set.
- * That's because the bit we use for _PAGE_DEVMAP is not reserved for software
- * use in page directory entries (ie. non-ptes).
- */
-static inline int pte_devmap(pte_t pte)
-{
- __be64 mask = cpu_to_be64(_PAGE_DEVMAP | _PAGE_PTE);
-
- return (pte_raw(pte) & mask) == mask;
-}
-
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
/* FIXME!! check whether this need to be a conditional */
@@ -1379,36 +1360,6 @@ static inline bool arch_needs_pgtable_deposit(void)
}
extern void serialize_against_pte_lookup(struct mm_struct *mm);
-
-static inline pmd_t pmd_mkdevmap(pmd_t pmd)
-{
- if (radix_enabled())
- return radix__pmd_mkdevmap(pmd);
- return hash__pmd_mkdevmap(pmd);
-}
-
-static inline pud_t pud_mkdevmap(pud_t pud)
-{
- if (radix_enabled())
- return radix__pud_mkdevmap(pud);
- BUG();
- return pud;
-}
-
-static inline int pmd_devmap(pmd_t pmd)
-{
- return pte_devmap(pmd_pte(pmd));
-}
-
-static inline int pud_devmap(pud_t pud)
-{
- return pte_devmap(pud_pte(pud));
-}
-
-static inline int pgd_devmap(pgd_t pgd)
-{
- return 0;
-}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
diff --git a/arch/powerpc/include/asm/book3s/64/pkeys.h b/arch/powerpc/include/asm/book3s/64/pkeys.h
index 5b178139f3c0..ff911b4251d9 100644
--- a/arch/powerpc/include/asm/book3s/64/pkeys.h
+++ b/arch/powerpc/include/asm/book3s/64/pkeys.h
@@ -5,7 +5,7 @@
#include <asm/book3s/64/hash-pkey.h>
-static inline u64 vmflag_to_pte_pkey_bits(u64 vm_flags)
+static inline u64 vmflag_to_pte_pkey_bits(vm_flags_t vm_flags)
{
if (!mmu_has_feature(MMU_FTR_PKEY))
return 0x0UL;
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index 8f55ff74bb68..df23a8267e4d 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -264,7 +264,7 @@ static inline int radix__p4d_bad(p4d_t p4d)
static inline int radix__pmd_trans_huge(pmd_t pmd)
{
- return (pmd_val(pmd) & (_PAGE_PTE | _PAGE_DEVMAP)) == _PAGE_PTE;
+ return (pmd_val(pmd) & _PAGE_PTE) == _PAGE_PTE;
}
static inline pmd_t radix__pmd_mkhuge(pmd_t pmd)
@@ -274,7 +274,7 @@ static inline pmd_t radix__pmd_mkhuge(pmd_t pmd)
static inline int radix__pud_trans_huge(pud_t pud)
{
- return (pud_val(pud) & (_PAGE_PTE | _PAGE_DEVMAP)) == _PAGE_PTE;
+ return (pud_val(pud) & _PAGE_PTE) == _PAGE_PTE;
}
static inline pud_t radix__pud_mkhuge(pud_t pud)
@@ -315,16 +315,6 @@ static inline int radix__has_transparent_pud_hugepage(void)
}
#endif
-static inline pmd_t radix__pmd_mkdevmap(pmd_t pmd)
-{
- return __pmd(pmd_val(pmd) | (_PAGE_PTE | _PAGE_DEVMAP));
-}
-
-static inline pud_t radix__pud_mkdevmap(pud_t pud)
-{
- return __pud(pud_val(pud) | (_PAGE_PTE | _PAGE_DEVMAP));
-}
-
struct vmem_altmap;
struct dev_pagemap;
extern int __meminit radix__vmemmap_create_mapping(unsigned long start,
diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index 6df6dbbe1e7c..ea6c8dc400d2 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -270,6 +270,7 @@
#define H_QUERY_INT_STATE 0x1E4
#define H_POLL_PENDING 0x1D8
#define H_ILLAN_ATTRIBUTES 0x244
+#define H_ADD_LOGICAL_LAN_BUFFERS 0x248
#define H_MODIFY_HEA_QP 0x250
#define H_QUERY_HEA_QP 0x254
#define H_QUERY_HEA 0x258
diff --git a/arch/powerpc/include/asm/mman.h b/arch/powerpc/include/asm/mman.h
index 42a51a993d94..912f78a956a1 100644
--- a/arch/powerpc/include/asm/mman.h
+++ b/arch/powerpc/include/asm/mman.h
@@ -14,7 +14,7 @@
#include <asm/cpu_has_feature.h>
#include <asm/firmware.h>
-static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
+static inline vm_flags_t arch_calc_vm_prot_bits(unsigned long prot,
unsigned long pkey)
{
#ifdef CONFIG_PPC_MEM_KEYS
diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h
index 59a2c7dbc78f..28e752138996 100644
--- a/arch/powerpc/include/asm/pkeys.h
+++ b/arch/powerpc/include/asm/pkeys.h
@@ -30,9 +30,9 @@ extern u32 reserved_allocation_mask; /* bits set for reserved keys */
#endif
-static inline u64 pkey_to_vmflag_bits(u16 pkey)
+static inline vm_flags_t pkey_to_vmflag_bits(u16 pkey)
{
- return (((u64)pkey << VM_PKEY_SHIFT) & ARCH_VM_PKEY_FLAGS);
+ return (((vm_flags_t)pkey << VM_PKEY_SHIFT) & ARCH_VM_PKEY_FLAGS);
}
static inline int vma_pkey(struct vm_area_struct *vma)
diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 3a6592a31a10..03f8c34fa0a2 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -393,7 +393,7 @@ static int kvmppc_memslot_page_merge(struct kvm *kvm,
{
unsigned long gfn = memslot->base_gfn;
unsigned long end, start = gfn_to_hva(kvm, gfn);
- unsigned long vm_flags;
+ vm_flags_t vm_flags;
int ret = 0;
struct vm_area_struct *vma;
int merge_flag = (merge) ? MADV_MERGEABLE : MADV_UNMERGEABLE;
diff --git a/arch/powerpc/mm/book3s64/hash_hugepage.c b/arch/powerpc/mm/book3s64/hash_hugepage.c
index 15d6f3ea7178..cdfd4fe75edb 100644
--- a/arch/powerpc/mm/book3s64/hash_hugepage.c
+++ b/arch/powerpc/mm/book3s64/hash_hugepage.c
@@ -54,7 +54,7 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
/*
* Make sure this is thp or devmap entry
*/
- if (!(old_pmd & (H_PAGE_THP_HUGE | _PAGE_DEVMAP)))
+ if (!(old_pmd & H_PAGE_THP_HUGE))
return 0;
rflags = htab_convert_pte_flags(new_pmd, flags);
diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c
index 988948d69bc1..82d31177630b 100644
--- a/arch/powerpc/mm/book3s64/hash_pgtable.c
+++ b/arch/powerpc/mm/book3s64/hash_pgtable.c
@@ -195,7 +195,7 @@ unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr
unsigned long old;
#ifdef CONFIG_DEBUG_VM
- WARN_ON(!hash__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
+ WARN_ON(!hash__pmd_trans_huge(*pmdp));
assert_spin_locked(pmd_lockptr(mm, pmdp));
#endif
@@ -227,7 +227,6 @@ pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addres
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
VM_BUG_ON(pmd_trans_huge(*pmdp));
- VM_BUG_ON(pmd_devmap(*pmdp));
pmd = *pmdp;
pmd_clear(pmdp);
diff --git a/arch/powerpc/mm/book3s64/hugetlbpage.c b/arch/powerpc/mm/book3s64/hugetlbpage.c
index 83c3361b358b..2bcbbf9d85ac 100644
--- a/arch/powerpc/mm/book3s64/hugetlbpage.c
+++ b/arch/powerpc/mm/book3s64/hugetlbpage.c
@@ -74,7 +74,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
} while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
/* Make sure this is a hugetlb entry */
- if (old_pte & (H_PAGE_THP_HUGE | _PAGE_DEVMAP))
+ if (old_pte & H_PAGE_THP_HUGE)
return 0;
rflags = htab_convert_pte_flags(new_pte, flags);
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index 0db01e10a3f8..c9431ae7f78a 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -62,7 +62,7 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
{
int changed;
#ifdef CONFIG_DEBUG_VM
- WARN_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
+ WARN_ON(!pmd_trans_huge(*pmdp));
assert_spin_locked(pmd_lockptr(vma->vm_mm, pmdp));
#endif
changed = !pmd_same(*(pmdp), entry);
@@ -82,7 +82,6 @@ int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
{
int changed;
#ifdef CONFIG_DEBUG_VM
- WARN_ON(!pud_devmap(*pudp));
assert_spin_locked(pud_lockptr(vma->vm_mm, pudp));
#endif
changed = !pud_same(*(pudp), entry);
@@ -204,8 +203,8 @@ pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
{
pmd_t pmd;
VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
- VM_BUG_ON((pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
- !pmd_devmap(*pmdp)) || !pmd_present(*pmdp));
+ VM_BUG_ON((pmd_present(*pmdp) && !pmd_trans_huge(*pmdp)) ||
+ !pmd_present(*pmdp));
pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp);
/*
* if it not a fullmm flush, then we can possibly end up converting
@@ -223,8 +222,7 @@ pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma,
pud_t pud;
VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
- VM_BUG_ON((pud_present(*pudp) && !pud_devmap(*pudp)) ||
- !pud_present(*pudp));
+ VM_BUG_ON(!pud_present(*pudp));
pud = pudp_huge_get_and_clear(vma->vm_mm, addr, pudp);
/*
* if it not a fullmm flush, then we can possibly end up converting
@@ -644,7 +642,7 @@ unsigned long memremap_compat_align(void)
EXPORT_SYMBOL_GPL(memremap_compat_align);
#endif
-pgprot_t vm_get_page_prot(unsigned long vm_flags)
+pgprot_t vm_get_page_prot(vm_flags_t vm_flags)
{
unsigned long prot;
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index d865ec915f9d..be523e5fe9c5 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -1433,7 +1433,7 @@ unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long add
unsigned long old;
#ifdef CONFIG_DEBUG_VM
- WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
+ WARN_ON(!radix__pmd_trans_huge(*pmdp));
assert_spin_locked(pmd_lockptr(mm, pmdp));
#endif
@@ -1450,7 +1450,7 @@ unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long add
unsigned long old;
#ifdef CONFIG_DEBUG_VM
- WARN_ON(!pud_devmap(*pudp));
+ WARN_ON(!pud_trans_huge(*pudp));
assert_spin_locked(pud_lockptr(mm, pudp));
#endif
@@ -1468,7 +1468,6 @@ pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addre
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
- VM_BUG_ON(pmd_devmap(*pmdp));
/*
* khugepaged calls this for normal pmd
*/
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 61df5aed7989..dfaa9fd86f7e 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -509,7 +509,7 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
return NULL;
#endif
- if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
+ if (pmd_trans_huge(pmd)) {
if (is_thp)
*is_thp = true;
ret_pte = (pte_t *)pmdp;
diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index 5daa77aee7f7..a25a6ffe7d7c 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -370,6 +370,23 @@ static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 o
return 0;
}
+bool bpf_jit_bypass_spec_v1(void)
+{
+#if defined(CONFIG_PPC_E500) || defined(CONFIG_PPC_BOOK3S_64)
+ return !(security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) &&
+ security_ftr_enabled(SEC_FTR_BNDS_CHK_SPEC_BAR));
+#else
+ return true;
+#endif
+}
+
+bool bpf_jit_bypass_spec_v4(void)
+{
+ return !(security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) &&
+ security_ftr_enabled(SEC_FTR_STF_BARRIER) &&
+ stf_barrier_type_get() != STF_BARRIER_NONE);
+}
+
/*
* We spill into the redzone always, even if the bpf program has its own stackframe.
* Offsets hardcoded based on BPF_PPC_STACK_SAVE -- see bpf_jit_stack_local()
@@ -397,6 +414,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code
u32 *addrs, int pass, bool extra_pass)
{
enum stf_barrier_type stf_barrier = stf_barrier_type_get();
+ bool sync_emitted, ori31_emitted;
const struct bpf_insn *insn = fp->insnsi;
int flen = fp->len;
int i, ret;
@@ -789,30 +807,51 @@ emit_clear:
/*
* BPF_ST NOSPEC (speculation barrier)
+ *
+ * The following must act as a barrier against both Spectre v1
+ * and v4 if we requested both mitigations. Therefore, also emit
+ * 'isync; sync' on E500 or 'ori31' on BOOK3S_64 in addition to
+ * the insns needed for a Spectre v4 barrier.
+ *
+ * If we requested only !bypass_spec_v1 OR only !bypass_spec_v4,
+ * we can skip the respective other barrier type as an
+ * optimization.
*/
case BPF_ST | BPF_NOSPEC:
- if (!security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) ||
- !security_ftr_enabled(SEC_FTR_STF_BARRIER))
- break;
-
- switch (stf_barrier) {
- case STF_BARRIER_EIEIO:
- EMIT(PPC_RAW_EIEIO() | 0x02000000);
- break;
- case STF_BARRIER_SYNC_ORI:
+ sync_emitted = false;
+ ori31_emitted = false;
+ if (IS_ENABLED(CONFIG_PPC_E500) &&
+ !bpf_jit_bypass_spec_v1()) {
+ EMIT(PPC_RAW_ISYNC());
EMIT(PPC_RAW_SYNC());
- EMIT(PPC_RAW_LD(tmp1_reg, _R13, 0));
- EMIT(PPC_RAW_ORI(_R31, _R31, 0));
- break;
- case STF_BARRIER_FALLBACK:
- ctx->seen |= SEEN_FUNC;
- PPC_LI64(_R12, dereference_kernel_function_descriptor(bpf_stf_barrier));
- EMIT(PPC_RAW_MTCTR(_R12));
- EMIT(PPC_RAW_BCTRL());
- break;
- case STF_BARRIER_NONE:
- break;
+ sync_emitted = true;
}
+ if (!bpf_jit_bypass_spec_v4()) {
+ switch (stf_barrier) {
+ case STF_BARRIER_EIEIO:
+ EMIT(PPC_RAW_EIEIO() | 0x02000000);
+ break;
+ case STF_BARRIER_SYNC_ORI:
+ if (!sync_emitted)
+ EMIT(PPC_RAW_SYNC());
+ EMIT(PPC_RAW_LD(tmp1_reg, _R13, 0));
+ EMIT(PPC_RAW_ORI(_R31, _R31, 0));
+ ori31_emitted = true;
+ break;
+ case STF_BARRIER_FALLBACK:
+ ctx->seen |= SEEN_FUNC;
+ PPC_LI64(_R12, dereference_kernel_function_descriptor(bpf_stf_barrier));
+ EMIT(PPC_RAW_MTCTR(_R12));
+ EMIT(PPC_RAW_BCTRL());
+ break;
+ case STF_BARRIER_NONE:
+ break;
+ }
+ }
+ if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) &&
+ !bpf_jit_bypass_spec_v1() &&
+ !ori31_emitted)
+ EMIT(PPC_RAW_ORI(_R31, _R31, 0));
break;
/*
diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c
index 5f4037c1d7fe..5e0a718d1be7 100644
--- a/arch/powerpc/platforms/pseries/cmm.c
+++ b/arch/powerpc/platforms/pseries/cmm.c
@@ -532,7 +532,6 @@ static int cmm_migratepage(struct balloon_dev_info *b_dev_info,
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
balloon_page_insert(b_dev_info, newpage);
- balloon_page_delete(page);
b_dev_info->isolated_pages--;
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
@@ -542,6 +541,7 @@ static int cmm_migratepage(struct balloon_dev_info *b_dev_info,
*/
plpar_page_set_active(page);
+ balloon_page_finalize(page);
/* balloon page list reference */
put_page(page);
diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c
index 52e2623a741d..aeb8633a3d00 100644
--- a/arch/powerpc/platforms/pseries/pci_dlpar.c
+++ b/arch/powerpc/platforms/pseries/pci_dlpar.c
@@ -29,7 +29,7 @@ struct pci_controller *init_phb_dynamic(struct device_node *dn)
nid = of_node_to_nid(dn);
if (likely((nid) >= 0)) {
if (!node_online(nid)) {
- if (__register_one_node(nid)) {
+ if (register_one_node(nid)) {
pr_err("PCI: Failed to register node %d\n", nid);
} else {
update_numa_distance(dn);
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 5352932badd8..e5668d9de58b 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -43,7 +43,6 @@ config RISCV
select ARCH_HAS_PREEMPT_LAZY
select ARCH_HAS_PREPARE_SYNC_CORE_CMD
select ARCH_HAS_PTDUMP if MMU
- select ARCH_HAS_PTE_DEVMAP if 64BIT && MMU
select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_SET_DIRECT_MAP if MMU
select ARCH_HAS_SET_MEMORY if MMU
@@ -157,7 +156,6 @@ config RISCV
select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS if (DYNAMIC_FTRACE_WITH_ARGS && !CFI_CLANG)
select HAVE_DYNAMIC_FTRACE_WITH_ARGS if HAVE_DYNAMIC_FTRACE
select HAVE_FTRACE_GRAPH_FUNC
- select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL
select HAVE_FUNCTION_GRAPH_TRACER if HAVE_DYNAMIC_FTRACE_WITH_ARGS
select HAVE_FUNCTION_GRAPH_FREGS
select HAVE_FUNCTION_TRACER if !XIP_KERNEL && HAVE_DYNAMIC_FTRACE
diff --git a/arch/riscv/include/asm/kvm_aia.h b/arch/riscv/include/asm/kvm_aia.h
index 5acce285e56e..b04ecdd1a860 100644
--- a/arch/riscv/include/asm/kvm_aia.h
+++ b/arch/riscv/include/asm/kvm_aia.h
@@ -150,7 +150,7 @@ int kvm_riscv_vcpu_aia_rmw_ireg(struct kvm_vcpu *vcpu, unsigned int csr_num,
int kvm_riscv_vcpu_aia_update(struct kvm_vcpu *vcpu);
void kvm_riscv_vcpu_aia_reset(struct kvm_vcpu *vcpu);
-int kvm_riscv_vcpu_aia_init(struct kvm_vcpu *vcpu);
+void kvm_riscv_vcpu_aia_init(struct kvm_vcpu *vcpu);
void kvm_riscv_vcpu_aia_deinit(struct kvm_vcpu *vcpu);
int kvm_riscv_aia_inject_msi_by_id(struct kvm *kvm, u32 hart_index,
diff --git a/arch/riscv/include/asm/kvm_gstage.h b/arch/riscv/include/asm/kvm_gstage.h
new file mode 100644
index 000000000000..595e2183173e
--- /dev/null
+++ b/arch/riscv/include/asm/kvm_gstage.h
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2019 Western Digital Corporation or its affiliates.
+ * Copyright (c) 2025 Ventana Micro Systems Inc.
+ */
+
+#ifndef __RISCV_KVM_GSTAGE_H_
+#define __RISCV_KVM_GSTAGE_H_
+
+#include <linux/kvm_types.h>
+
+struct kvm_gstage {
+ struct kvm *kvm;
+ unsigned long flags;
+#define KVM_GSTAGE_FLAGS_LOCAL BIT(0)
+ unsigned long vmid;
+ pgd_t *pgd;
+};
+
+struct kvm_gstage_mapping {
+ gpa_t addr;
+ pte_t pte;
+ u32 level;
+};
+
+#ifdef CONFIG_64BIT
+#define kvm_riscv_gstage_index_bits 9
+#else
+#define kvm_riscv_gstage_index_bits 10
+#endif
+
+extern unsigned long kvm_riscv_gstage_mode;
+extern unsigned long kvm_riscv_gstage_pgd_levels;
+
+#define kvm_riscv_gstage_pgd_xbits 2
+#define kvm_riscv_gstage_pgd_size (1UL << (HGATP_PAGE_SHIFT + kvm_riscv_gstage_pgd_xbits))
+#define kvm_riscv_gstage_gpa_bits (HGATP_PAGE_SHIFT + \
+ (kvm_riscv_gstage_pgd_levels * \
+ kvm_riscv_gstage_index_bits) + \
+ kvm_riscv_gstage_pgd_xbits)
+#define kvm_riscv_gstage_gpa_size ((gpa_t)(1ULL << kvm_riscv_gstage_gpa_bits))
+
+bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr,
+ pte_t **ptepp, u32 *ptep_level);
+
+int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage,
+ struct kvm_mmu_memory_cache *pcache,
+ const struct kvm_gstage_mapping *map);
+
+int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage,
+ struct kvm_mmu_memory_cache *pcache,
+ gpa_t gpa, phys_addr_t hpa, unsigned long page_size,
+ bool page_rdonly, bool page_exec,
+ struct kvm_gstage_mapping *out_map);
+
+enum kvm_riscv_gstage_op {
+ GSTAGE_OP_NOP = 0, /* Nothing */
+ GSTAGE_OP_CLEAR, /* Clear/Unmap */
+ GSTAGE_OP_WP, /* Write-protect */
+};
+
+void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
+ pte_t *ptep, u32 ptep_level, enum kvm_riscv_gstage_op op);
+
+void kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage,
+ gpa_t start, gpa_t size, bool may_block);
+
+void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end);
+
+void kvm_riscv_gstage_mode_detect(void);
+
+#endif
diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
index bcbf8b1ec115..d71d3299a335 100644
--- a/arch/riscv/include/asm/kvm_host.h
+++ b/arch/riscv/include/asm/kvm_host.h
@@ -16,6 +16,8 @@
#include <asm/hwcap.h>
#include <asm/kvm_aia.h>
#include <asm/ptrace.h>
+#include <asm/kvm_tlb.h>
+#include <asm/kvm_vmid.h>
#include <asm/kvm_vcpu_fp.h>
#include <asm/kvm_vcpu_insn.h>
#include <asm/kvm_vcpu_sbi.h>
@@ -36,14 +38,16 @@
#define KVM_REQ_UPDATE_HGATP KVM_ARCH_REQ(2)
#define KVM_REQ_FENCE_I \
KVM_ARCH_REQ_FLAGS(3, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
-#define KVM_REQ_HFENCE_GVMA_VMID_ALL KVM_REQ_TLB_FLUSH
#define KVM_REQ_HFENCE_VVMA_ALL \
KVM_ARCH_REQ_FLAGS(4, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_HFENCE \
KVM_ARCH_REQ_FLAGS(5, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_STEAL_UPDATE KVM_ARCH_REQ(6)
+#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE
+
#define KVM_HEDELEG_DEFAULT (BIT(EXC_INST_MISALIGNED) | \
+ BIT(EXC_INST_ILLEGAL) | \
BIT(EXC_BREAKPOINT) | \
BIT(EXC_SYSCALL) | \
BIT(EXC_INST_PAGE_FAULT) | \
@@ -54,24 +58,6 @@
BIT(IRQ_VS_TIMER) | \
BIT(IRQ_VS_EXT))
-enum kvm_riscv_hfence_type {
- KVM_RISCV_HFENCE_UNKNOWN = 0,
- KVM_RISCV_HFENCE_GVMA_VMID_GPA,
- KVM_RISCV_HFENCE_VVMA_ASID_GVA,
- KVM_RISCV_HFENCE_VVMA_ASID_ALL,
- KVM_RISCV_HFENCE_VVMA_GVA,
-};
-
-struct kvm_riscv_hfence {
- enum kvm_riscv_hfence_type type;
- unsigned long asid;
- unsigned long order;
- gpa_t addr;
- gpa_t size;
-};
-
-#define KVM_RISCV_VCPU_MAX_HFENCE 64
-
struct kvm_vm_stat {
struct kvm_vm_stat_generic generic;
};
@@ -97,15 +83,6 @@ struct kvm_vcpu_stat {
struct kvm_arch_memory_slot {
};
-struct kvm_vmid {
- /*
- * Writes to vmid_version and vmid happen with vmid_lock held
- * whereas reads happen without any lock held.
- */
- unsigned long vmid_version;
- unsigned long vmid;
-};
-
struct kvm_arch {
/* G-stage vmid */
struct kvm_vmid vmid;
@@ -309,77 +286,6 @@ static inline bool kvm_arch_pmi_in_guest(struct kvm_vcpu *vcpu)
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
-#define KVM_RISCV_GSTAGE_TLB_MIN_ORDER 12
-
-void kvm_riscv_local_hfence_gvma_vmid_gpa(unsigned long vmid,
- gpa_t gpa, gpa_t gpsz,
- unsigned long order);
-void kvm_riscv_local_hfence_gvma_vmid_all(unsigned long vmid);
-void kvm_riscv_local_hfence_gvma_gpa(gpa_t gpa, gpa_t gpsz,
- unsigned long order);
-void kvm_riscv_local_hfence_gvma_all(void);
-void kvm_riscv_local_hfence_vvma_asid_gva(unsigned long vmid,
- unsigned long asid,
- unsigned long gva,
- unsigned long gvsz,
- unsigned long order);
-void kvm_riscv_local_hfence_vvma_asid_all(unsigned long vmid,
- unsigned long asid);
-void kvm_riscv_local_hfence_vvma_gva(unsigned long vmid,
- unsigned long gva, unsigned long gvsz,
- unsigned long order);
-void kvm_riscv_local_hfence_vvma_all(unsigned long vmid);
-
-void kvm_riscv_local_tlb_sanitize(struct kvm_vcpu *vcpu);
-
-void kvm_riscv_fence_i_process(struct kvm_vcpu *vcpu);
-void kvm_riscv_hfence_gvma_vmid_all_process(struct kvm_vcpu *vcpu);
-void kvm_riscv_hfence_vvma_all_process(struct kvm_vcpu *vcpu);
-void kvm_riscv_hfence_process(struct kvm_vcpu *vcpu);
-
-void kvm_riscv_fence_i(struct kvm *kvm,
- unsigned long hbase, unsigned long hmask);
-void kvm_riscv_hfence_gvma_vmid_gpa(struct kvm *kvm,
- unsigned long hbase, unsigned long hmask,
- gpa_t gpa, gpa_t gpsz,
- unsigned long order);
-void kvm_riscv_hfence_gvma_vmid_all(struct kvm *kvm,
- unsigned long hbase, unsigned long hmask);
-void kvm_riscv_hfence_vvma_asid_gva(struct kvm *kvm,
- unsigned long hbase, unsigned long hmask,
- unsigned long gva, unsigned long gvsz,
- unsigned long order, unsigned long asid);
-void kvm_riscv_hfence_vvma_asid_all(struct kvm *kvm,
- unsigned long hbase, unsigned long hmask,
- unsigned long asid);
-void kvm_riscv_hfence_vvma_gva(struct kvm *kvm,
- unsigned long hbase, unsigned long hmask,
- unsigned long gva, unsigned long gvsz,
- unsigned long order);
-void kvm_riscv_hfence_vvma_all(struct kvm *kvm,
- unsigned long hbase, unsigned long hmask);
-
-int kvm_riscv_gstage_ioremap(struct kvm *kvm, gpa_t gpa,
- phys_addr_t hpa, unsigned long size,
- bool writable, bool in_atomic);
-void kvm_riscv_gstage_iounmap(struct kvm *kvm, gpa_t gpa,
- unsigned long size);
-int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu,
- struct kvm_memory_slot *memslot,
- gpa_t gpa, unsigned long hva, bool is_write);
-int kvm_riscv_gstage_alloc_pgd(struct kvm *kvm);
-void kvm_riscv_gstage_free_pgd(struct kvm *kvm);
-void kvm_riscv_gstage_update_hgatp(struct kvm_vcpu *vcpu);
-void __init kvm_riscv_gstage_mode_detect(void);
-unsigned long __init kvm_riscv_gstage_mode(void);
-int kvm_riscv_gstage_gpa_bits(void);
-
-void __init kvm_riscv_gstage_vmid_detect(void);
-unsigned long kvm_riscv_gstage_vmid_bits(void);
-int kvm_riscv_gstage_vmid_init(struct kvm *kvm);
-bool kvm_riscv_gstage_vmid_ver_changed(struct kvm_vmid *vmid);
-void kvm_riscv_gstage_vmid_update(struct kvm_vcpu *vcpu);
-
int kvm_riscv_setup_default_irq_routing(struct kvm *kvm, u32 lines);
void __kvm_riscv_unpriv_trap(void);
@@ -415,7 +321,6 @@ void __kvm_riscv_vcpu_power_on(struct kvm_vcpu *vcpu);
void kvm_riscv_vcpu_power_on(struct kvm_vcpu *vcpu);
bool kvm_riscv_vcpu_stopped(struct kvm_vcpu *vcpu);
-void kvm_riscv_vcpu_sbi_sta_reset(struct kvm_vcpu *vcpu);
void kvm_riscv_vcpu_record_steal_time(struct kvm_vcpu *vcpu);
#endif /* __RISCV_KVM_HOST_H__ */
diff --git a/arch/riscv/include/asm/kvm_mmu.h b/arch/riscv/include/asm/kvm_mmu.h
new file mode 100644
index 000000000000..5439e76f0a96
--- /dev/null
+++ b/arch/riscv/include/asm/kvm_mmu.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2025 Ventana Micro Systems Inc.
+ */
+
+#ifndef __RISCV_KVM_MMU_H_
+#define __RISCV_KVM_MMU_H_
+
+#include <asm/kvm_gstage.h>
+
+int kvm_riscv_mmu_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
+ unsigned long size, bool writable, bool in_atomic);
+void kvm_riscv_mmu_iounmap(struct kvm *kvm, gpa_t gpa, unsigned long size);
+int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
+ gpa_t gpa, unsigned long hva, bool is_write,
+ struct kvm_gstage_mapping *out_map);
+int kvm_riscv_mmu_alloc_pgd(struct kvm *kvm);
+void kvm_riscv_mmu_free_pgd(struct kvm *kvm);
+void kvm_riscv_mmu_update_hgatp(struct kvm_vcpu *vcpu);
+
+#endif
diff --git a/arch/riscv/include/asm/kvm_tlb.h b/arch/riscv/include/asm/kvm_tlb.h
new file mode 100644
index 000000000000..38a2f933ad3a
--- /dev/null
+++ b/arch/riscv/include/asm/kvm_tlb.h
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2025 Ventana Micro Systems Inc.
+ */
+
+#ifndef __RISCV_KVM_TLB_H_
+#define __RISCV_KVM_TLB_H_
+
+#include <linux/kvm_types.h>
+
+enum kvm_riscv_hfence_type {
+ KVM_RISCV_HFENCE_UNKNOWN = 0,
+ KVM_RISCV_HFENCE_GVMA_VMID_GPA,
+ KVM_RISCV_HFENCE_GVMA_VMID_ALL,
+ KVM_RISCV_HFENCE_VVMA_ASID_GVA,
+ KVM_RISCV_HFENCE_VVMA_ASID_ALL,
+ KVM_RISCV_HFENCE_VVMA_GVA,
+ KVM_RISCV_HFENCE_VVMA_ALL
+};
+
+struct kvm_riscv_hfence {
+ enum kvm_riscv_hfence_type type;
+ unsigned long asid;
+ unsigned long vmid;
+ unsigned long order;
+ gpa_t addr;
+ gpa_t size;
+};
+
+#define KVM_RISCV_VCPU_MAX_HFENCE 64
+
+#define KVM_RISCV_GSTAGE_TLB_MIN_ORDER 12
+
+void kvm_riscv_local_hfence_gvma_vmid_gpa(unsigned long vmid,
+ gpa_t gpa, gpa_t gpsz,
+ unsigned long order);
+void kvm_riscv_local_hfence_gvma_vmid_all(unsigned long vmid);
+void kvm_riscv_local_hfence_gvma_gpa(gpa_t gpa, gpa_t gpsz,
+ unsigned long order);
+void kvm_riscv_local_hfence_gvma_all(void);
+void kvm_riscv_local_hfence_vvma_asid_gva(unsigned long vmid,
+ unsigned long asid,
+ unsigned long gva,
+ unsigned long gvsz,
+ unsigned long order);
+void kvm_riscv_local_hfence_vvma_asid_all(unsigned long vmid,
+ unsigned long asid);
+void kvm_riscv_local_hfence_vvma_gva(unsigned long vmid,
+ unsigned long gva, unsigned long gvsz,
+ unsigned long order);
+void kvm_riscv_local_hfence_vvma_all(unsigned long vmid);
+
+void kvm_riscv_tlb_flush_process(struct kvm_vcpu *vcpu);
+
+void kvm_riscv_fence_i_process(struct kvm_vcpu *vcpu);
+void kvm_riscv_hfence_vvma_all_process(struct kvm_vcpu *vcpu);
+void kvm_riscv_hfence_process(struct kvm_vcpu *vcpu);
+
+void kvm_riscv_fence_i(struct kvm *kvm,
+ unsigned long hbase, unsigned long hmask);
+void kvm_riscv_hfence_gvma_vmid_gpa(struct kvm *kvm,
+ unsigned long hbase, unsigned long hmask,
+ gpa_t gpa, gpa_t gpsz,
+ unsigned long order, unsigned long vmid);
+void kvm_riscv_hfence_gvma_vmid_all(struct kvm *kvm,
+ unsigned long hbase, unsigned long hmask,
+ unsigned long vmid);
+void kvm_riscv_hfence_vvma_asid_gva(struct kvm *kvm,
+ unsigned long hbase, unsigned long hmask,
+ unsigned long gva, unsigned long gvsz,
+ unsigned long order, unsigned long asid,
+ unsigned long vmid);
+void kvm_riscv_hfence_vvma_asid_all(struct kvm *kvm,
+ unsigned long hbase, unsigned long hmask,
+ unsigned long asid, unsigned long vmid);
+void kvm_riscv_hfence_vvma_gva(struct kvm *kvm,
+ unsigned long hbase, unsigned long hmask,
+ unsigned long gva, unsigned long gvsz,
+ unsigned long order, unsigned long vmid);
+void kvm_riscv_hfence_vvma_all(struct kvm *kvm,
+ unsigned long hbase, unsigned long hmask,
+ unsigned long vmid);
+
+#endif
diff --git a/arch/riscv/include/asm/kvm_vcpu_sbi.h b/arch/riscv/include/asm/kvm_vcpu_sbi.h
index 439ab2b3534f..d678fd7e5973 100644
--- a/arch/riscv/include/asm/kvm_vcpu_sbi.h
+++ b/arch/riscv/include/asm/kvm_vcpu_sbi.h
@@ -49,6 +49,16 @@ struct kvm_vcpu_sbi_extension {
/* Extension specific probe function */
unsigned long (*probe)(struct kvm_vcpu *vcpu);
+
+ /*
+ * Init/deinit function called once during VCPU init/destroy. These
+ * might be use if the SBI extensions need to allocate or do specific
+ * init time only configuration.
+ */
+ int (*init)(struct kvm_vcpu *vcpu);
+ void (*deinit)(struct kvm_vcpu *vcpu);
+
+ void (*reset)(struct kvm_vcpu *vcpu);
};
void kvm_riscv_vcpu_sbi_forward(struct kvm_vcpu *vcpu, struct kvm_run *run);
@@ -72,6 +82,8 @@ const struct kvm_vcpu_sbi_extension *kvm_vcpu_sbi_find_ext(
bool riscv_vcpu_supports_sbi_ext(struct kvm_vcpu *vcpu, int idx);
int kvm_riscv_vcpu_sbi_ecall(struct kvm_vcpu *vcpu, struct kvm_run *run);
void kvm_riscv_vcpu_sbi_init(struct kvm_vcpu *vcpu);
+void kvm_riscv_vcpu_sbi_deinit(struct kvm_vcpu *vcpu);
+void kvm_riscv_vcpu_sbi_reset(struct kvm_vcpu *vcpu);
int kvm_riscv_vcpu_get_reg_sbi_sta(struct kvm_vcpu *vcpu, unsigned long reg_num,
unsigned long *reg_val);
diff --git a/arch/riscv/include/asm/kvm_vmid.h b/arch/riscv/include/asm/kvm_vmid.h
new file mode 100644
index 000000000000..ab98e1434fb7
--- /dev/null
+++ b/arch/riscv/include/asm/kvm_vmid.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2025 Ventana Micro Systems Inc.
+ */
+
+#ifndef __RISCV_KVM_VMID_H_
+#define __RISCV_KVM_VMID_H_
+
+#include <linux/kvm_types.h>
+
+struct kvm_vmid {
+ /*
+ * Writes to vmid_version and vmid happen with vmid_lock held
+ * whereas reads happen without any lock held.
+ */
+ unsigned long vmid_version;
+ unsigned long vmid;
+};
+
+void __init kvm_riscv_gstage_vmid_detect(void);
+unsigned long kvm_riscv_gstage_vmid_bits(void);
+int kvm_riscv_gstage_vmid_init(struct kvm *kvm);
+bool kvm_riscv_gstage_vmid_ver_changed(struct kvm_vmid *vmid);
+void kvm_riscv_gstage_vmid_update(struct kvm_vcpu *vcpu);
+void kvm_riscv_gstage_vmid_sanitize(struct kvm_vcpu *vcpu);
+
+#endif
diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h
index 7de05db7d3bd..1018d2216901 100644
--- a/arch/riscv/include/asm/pgtable-64.h
+++ b/arch/riscv/include/asm/pgtable-64.h
@@ -397,24 +397,8 @@ static inline struct page *pgd_page(pgd_t pgd)
p4d_t *p4d_offset(pgd_t *pgd, unsigned long address);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline int pte_devmap(pte_t pte);
static inline pte_t pmd_pte(pmd_t pmd);
static inline pte_t pud_pte(pud_t pud);
-
-static inline int pmd_devmap(pmd_t pmd)
-{
- return pte_devmap(pmd_pte(pmd));
-}
-
-static inline int pud_devmap(pud_t pud)
-{
- return pte_devmap(pud_pte(pud));
-}
-
-static inline int pgd_devmap(pgd_t pgd)
-{
- return 0;
-}
#endif
#endif /* _ASM_RISCV_PGTABLE_64_H */
diff --git a/arch/riscv/include/asm/pgtable-bits.h b/arch/riscv/include/asm/pgtable-bits.h
index a8f5205cea54..179bd4afece4 100644
--- a/arch/riscv/include/asm/pgtable-bits.h
+++ b/arch/riscv/include/asm/pgtable-bits.h
@@ -19,7 +19,6 @@
#define _PAGE_SOFT (3 << 8) /* Reserved for software */
#define _PAGE_SPECIAL (1 << 8) /* RSW: 0x1 */
-#define _PAGE_DEVMAP (1 << 9) /* RSW, devmap */
#define _PAGE_TABLE _PAGE_PRESENT
/*
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 5bd5aae60d53..91697fbf1f90 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -409,13 +409,6 @@ static inline int pte_special(pte_t pte)
return pte_val(pte) & _PAGE_SPECIAL;
}
-#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
-static inline int pte_devmap(pte_t pte)
-{
- return pte_val(pte) & _PAGE_DEVMAP;
-}
-#endif
-
/* static inline pte_t pte_rdprotect(pte_t pte) */
static inline pte_t pte_wrprotect(pte_t pte)
@@ -457,11 +450,6 @@ static inline pte_t pte_mkspecial(pte_t pte)
return __pte(pte_val(pte) | _PAGE_SPECIAL);
}
-static inline pte_t pte_mkdevmap(pte_t pte)
-{
- return __pte(pte_val(pte) | _PAGE_DEVMAP);
-}
-
static inline pte_t pte_mkhuge(pte_t pte)
{
return pte;
@@ -790,11 +778,6 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd)
return pte_pmd(pte_mkdirty(pmd_pte(pmd)));
}
-static inline pmd_t pmd_mkdevmap(pmd_t pmd)
-{
- return pte_pmd(pte_mkdevmap(pmd_pte(pmd)));
-}
-
#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
static inline bool pmd_special(pmd_t pmd)
{
@@ -946,11 +929,6 @@ static inline pud_t pud_mkhuge(pud_t pud)
return pud;
}
-static inline pud_t pud_mkdevmap(pud_t pud)
-{
- return pte_pud(pte_mkdevmap(pud_pte(pud)));
-}
-
static inline int pudp_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pud_t *pudp,
pud_t entry, int dirty)
diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h
index 1a20dd746a49..eed0abc40514 100644
--- a/arch/riscv/include/asm/tlbflush.h
+++ b/arch/riscv/include/asm/tlbflush.h
@@ -63,7 +63,6 @@ void flush_pud_tlb_range(struct vm_area_struct *vma, unsigned long start,
bool arch_tlbbatch_should_defer(struct mm_struct *mm);
void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
struct mm_struct *mm, unsigned long start, unsigned long end);
-void arch_flush_tlb_batched_pending(struct mm_struct *mm);
void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
extern unsigned long tlb_flush_all_threshold;
diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h
index 5f59fd226cc5..ef27d4289da1 100644
--- a/arch/riscv/include/uapi/asm/kvm.h
+++ b/arch/riscv/include/uapi/asm/kvm.h
@@ -18,6 +18,7 @@
#define __KVM_HAVE_IRQ_LINE
#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
+#define KVM_DIRTY_LOG_PAGE_OFFSET 64
#define KVM_INTERRUPT_SET -1U
#define KVM_INTERRUPT_UNSET -2U
diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig
index 704c2899197e..5a62091b0809 100644
--- a/arch/riscv/kvm/Kconfig
+++ b/arch/riscv/kvm/Kconfig
@@ -25,6 +25,7 @@ config KVM
select HAVE_KVM_MSI
select HAVE_KVM_VCPU_ASYNC_IOCTL
select HAVE_KVM_READONLY_MEM
+ select HAVE_KVM_DIRTY_RING_ACQ_REL
select KVM_COMMON
select KVM_GENERIC_DIRTYLOG_READ_PROTECT
select KVM_GENERIC_HARDWARE_ENABLING
diff --git a/arch/riscv/kvm/Makefile b/arch/riscv/kvm/Makefile
index 4e0bba91d284..4b199dc3e58b 100644
--- a/arch/riscv/kvm/Makefile
+++ b/arch/riscv/kvm/Makefile
@@ -14,6 +14,7 @@ kvm-y += aia.o
kvm-y += aia_aplic.o
kvm-y += aia_device.o
kvm-y += aia_imsic.o
+kvm-y += gstage.o
kvm-y += main.o
kvm-y += mmu.o
kvm-y += nacl.o
diff --git a/arch/riscv/kvm/aia_device.c b/arch/riscv/kvm/aia_device.c
index 806c41931cde..b195a93add1c 100644
--- a/arch/riscv/kvm/aia_device.c
+++ b/arch/riscv/kvm/aia_device.c
@@ -509,12 +509,12 @@ void kvm_riscv_vcpu_aia_reset(struct kvm_vcpu *vcpu)
kvm_riscv_vcpu_aia_imsic_reset(vcpu);
}
-int kvm_riscv_vcpu_aia_init(struct kvm_vcpu *vcpu)
+void kvm_riscv_vcpu_aia_init(struct kvm_vcpu *vcpu)
{
struct kvm_vcpu_aia *vaia = &vcpu->arch.aia_context;
if (!kvm_riscv_aia_available())
- return 0;
+ return;
/*
* We don't do any memory allocations over here because these
@@ -526,8 +526,6 @@ int kvm_riscv_vcpu_aia_init(struct kvm_vcpu *vcpu)
/* Initialize default values in AIA vcpu context */
vaia->imsic_addr = KVM_RISCV_AIA_UNDEF_ADDR;
vaia->hart_index = vcpu->vcpu_idx;
-
- return 0;
}
void kvm_riscv_vcpu_aia_deinit(struct kvm_vcpu *vcpu)
diff --git a/arch/riscv/kvm/aia_imsic.c b/arch/riscv/kvm/aia_imsic.c
index 2ff865943ebb..fda0346f0ea1 100644
--- a/arch/riscv/kvm/aia_imsic.c
+++ b/arch/riscv/kvm/aia_imsic.c
@@ -16,6 +16,7 @@
#include <linux/swab.h>
#include <kvm/iodev.h>
#include <asm/csr.h>
+#include <asm/kvm_mmu.h>
#define IMSIC_MAX_EIX (IMSIC_MAX_ID / BITS_PER_TYPE(u64))
@@ -745,9 +746,8 @@ void kvm_riscv_vcpu_aia_imsic_release(struct kvm_vcpu *vcpu)
*/
/* Purge the G-stage mapping */
- kvm_riscv_gstage_iounmap(vcpu->kvm,
- vcpu->arch.aia_context.imsic_addr,
- IMSIC_MMIO_PAGE_SZ);
+ kvm_riscv_mmu_iounmap(vcpu->kvm, vcpu->arch.aia_context.imsic_addr,
+ IMSIC_MMIO_PAGE_SZ);
/* TODO: Purge the IOMMU mapping ??? */
@@ -830,9 +830,9 @@ int kvm_riscv_vcpu_aia_imsic_update(struct kvm_vcpu *vcpu)
imsic_vsfile_local_clear(new_vsfile_hgei, imsic->nr_hw_eix);
/* Update G-stage mapping for the new IMSIC VS-file */
- ret = kvm_riscv_gstage_ioremap(kvm, vcpu->arch.aia_context.imsic_addr,
- new_vsfile_pa, IMSIC_MMIO_PAGE_SZ,
- true, true);
+ ret = kvm_riscv_mmu_ioremap(kvm, vcpu->arch.aia_context.imsic_addr,
+ new_vsfile_pa, IMSIC_MMIO_PAGE_SZ,
+ true, true);
if (ret)
goto fail_free_vsfile_hgei;
diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c
new file mode 100644
index 000000000000..24c270d6d0e2
--- /dev/null
+++ b/arch/riscv/kvm/gstage.c
@@ -0,0 +1,338 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Western Digital Corporation or its affiliates.
+ * Copyright (c) 2025 Ventana Micro Systems Inc.
+ */
+
+#include <linux/bitops.h>
+#include <linux/errno.h>
+#include <linux/kvm_host.h>
+#include <linux/module.h>
+#include <linux/pgtable.h>
+#include <asm/kvm_gstage.h>
+
+#ifdef CONFIG_64BIT
+unsigned long kvm_riscv_gstage_mode __ro_after_init = HGATP_MODE_SV39X4;
+unsigned long kvm_riscv_gstage_pgd_levels __ro_after_init = 3;
+#else
+unsigned long kvm_riscv_gstage_mode __ro_after_init = HGATP_MODE_SV32X4;
+unsigned long kvm_riscv_gstage_pgd_levels __ro_after_init = 2;
+#endif
+
+#define gstage_pte_leaf(__ptep) \
+ (pte_val(*(__ptep)) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC))
+
+static inline unsigned long gstage_pte_index(gpa_t addr, u32 level)
+{
+ unsigned long mask;
+ unsigned long shift = HGATP_PAGE_SHIFT + (kvm_riscv_gstage_index_bits * level);
+
+ if (level == (kvm_riscv_gstage_pgd_levels - 1))
+ mask = (PTRS_PER_PTE * (1UL << kvm_riscv_gstage_pgd_xbits)) - 1;
+ else
+ mask = PTRS_PER_PTE - 1;
+
+ return (addr >> shift) & mask;
+}
+
+static inline unsigned long gstage_pte_page_vaddr(pte_t pte)
+{
+ return (unsigned long)pfn_to_virt(__page_val_to_pfn(pte_val(pte)));
+}
+
+static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level)
+{
+ u32 i;
+ unsigned long psz = 1UL << 12;
+
+ for (i = 0; i < kvm_riscv_gstage_pgd_levels; i++) {
+ if (page_size == (psz << (i * kvm_riscv_gstage_index_bits))) {
+ *out_level = i;
+ return 0;
+ }
+ }
+
+ return -EINVAL;
+}
+
+static int gstage_level_to_page_order(u32 level, unsigned long *out_pgorder)
+{
+ if (kvm_riscv_gstage_pgd_levels < level)
+ return -EINVAL;
+
+ *out_pgorder = 12 + (level * kvm_riscv_gstage_index_bits);
+ return 0;
+}
+
+static int gstage_level_to_page_size(u32 level, unsigned long *out_pgsize)
+{
+ int rc;
+ unsigned long page_order = PAGE_SHIFT;
+
+ rc = gstage_level_to_page_order(level, &page_order);
+ if (rc)
+ return rc;
+
+ *out_pgsize = BIT(page_order);
+ return 0;
+}
+
+bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr,
+ pte_t **ptepp, u32 *ptep_level)
+{
+ pte_t *ptep;
+ u32 current_level = kvm_riscv_gstage_pgd_levels - 1;
+
+ *ptep_level = current_level;
+ ptep = (pte_t *)gstage->pgd;
+ ptep = &ptep[gstage_pte_index(addr, current_level)];
+ while (ptep && pte_val(ptep_get(ptep))) {
+ if (gstage_pte_leaf(ptep)) {
+ *ptep_level = current_level;
+ *ptepp = ptep;
+ return true;
+ }
+
+ if (current_level) {
+ current_level--;
+ *ptep_level = current_level;
+ ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
+ ptep = &ptep[gstage_pte_index(addr, current_level)];
+ } else {
+ ptep = NULL;
+ }
+ }
+
+ return false;
+}
+
+static void gstage_tlb_flush(struct kvm_gstage *gstage, u32 level, gpa_t addr)
+{
+ unsigned long order = PAGE_SHIFT;
+
+ if (gstage_level_to_page_order(level, &order))
+ return;
+ addr &= ~(BIT(order) - 1);
+
+ if (gstage->flags & KVM_GSTAGE_FLAGS_LOCAL)
+ kvm_riscv_local_hfence_gvma_vmid_gpa(gstage->vmid, addr, BIT(order), order);
+ else
+ kvm_riscv_hfence_gvma_vmid_gpa(gstage->kvm, -1UL, 0, addr, BIT(order), order,
+ gstage->vmid);
+}
+
+int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage,
+ struct kvm_mmu_memory_cache *pcache,
+ const struct kvm_gstage_mapping *map)
+{
+ u32 current_level = kvm_riscv_gstage_pgd_levels - 1;
+ pte_t *next_ptep = (pte_t *)gstage->pgd;
+ pte_t *ptep = &next_ptep[gstage_pte_index(map->addr, current_level)];
+
+ if (current_level < map->level)
+ return -EINVAL;
+
+ while (current_level != map->level) {
+ if (gstage_pte_leaf(ptep))
+ return -EEXIST;
+
+ if (!pte_val(ptep_get(ptep))) {
+ if (!pcache)
+ return -ENOMEM;
+ next_ptep = kvm_mmu_memory_cache_alloc(pcache);
+ if (!next_ptep)
+ return -ENOMEM;
+ set_pte(ptep, pfn_pte(PFN_DOWN(__pa(next_ptep)),
+ __pgprot(_PAGE_TABLE)));
+ } else {
+ if (gstage_pte_leaf(ptep))
+ return -EEXIST;
+ next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
+ }
+
+ current_level--;
+ ptep = &next_ptep[gstage_pte_index(map->addr, current_level)];
+ }
+
+ if (pte_val(*ptep) != pte_val(map->pte)) {
+ set_pte(ptep, map->pte);
+ if (gstage_pte_leaf(ptep))
+ gstage_tlb_flush(gstage, current_level, map->addr);
+ }
+
+ return 0;
+}
+
+int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage,
+ struct kvm_mmu_memory_cache *pcache,
+ gpa_t gpa, phys_addr_t hpa, unsigned long page_size,
+ bool page_rdonly, bool page_exec,
+ struct kvm_gstage_mapping *out_map)
+{
+ pgprot_t prot;
+ int ret;
+
+ out_map->addr = gpa;
+ out_map->level = 0;
+
+ ret = gstage_page_size_to_level(page_size, &out_map->level);
+ if (ret)
+ return ret;
+
+ /*
+ * A RISC-V implementation can choose to either:
+ * 1) Update 'A' and 'D' PTE bits in hardware
+ * 2) Generate page fault when 'A' and/or 'D' bits are not set
+ * PTE so that software can update these bits.
+ *
+ * We support both options mentioned above. To achieve this, we
+ * always set 'A' and 'D' PTE bits at time of creating G-stage
+ * mapping. To support KVM dirty page logging with both options
+ * mentioned above, we will write-protect G-stage PTEs to track
+ * dirty pages.
+ */
+
+ if (page_exec) {
+ if (page_rdonly)
+ prot = PAGE_READ_EXEC;
+ else
+ prot = PAGE_WRITE_EXEC;
+ } else {
+ if (page_rdonly)
+ prot = PAGE_READ;
+ else
+ prot = PAGE_WRITE;
+ }
+ out_map->pte = pfn_pte(PFN_DOWN(hpa), prot);
+ out_map->pte = pte_mkdirty(out_map->pte);
+
+ return kvm_riscv_gstage_set_pte(gstage, pcache, out_map);
+}
+
+void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
+ pte_t *ptep, u32 ptep_level, enum kvm_riscv_gstage_op op)
+{
+ int i, ret;
+ pte_t old_pte, *next_ptep;
+ u32 next_ptep_level;
+ unsigned long next_page_size, page_size;
+
+ ret = gstage_level_to_page_size(ptep_level, &page_size);
+ if (ret)
+ return;
+
+ WARN_ON(addr & (page_size - 1));
+
+ if (!pte_val(ptep_get(ptep)))
+ return;
+
+ if (ptep_level && !gstage_pte_leaf(ptep)) {
+ next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
+ next_ptep_level = ptep_level - 1;
+ ret = gstage_level_to_page_size(next_ptep_level, &next_page_size);
+ if (ret)
+ return;
+
+ if (op == GSTAGE_OP_CLEAR)
+ set_pte(ptep, __pte(0));
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ kvm_riscv_gstage_op_pte(gstage, addr + i * next_page_size,
+ &next_ptep[i], next_ptep_level, op);
+ if (op == GSTAGE_OP_CLEAR)
+ put_page(virt_to_page(next_ptep));
+ } else {
+ old_pte = *ptep;
+ if (op == GSTAGE_OP_CLEAR)
+ set_pte(ptep, __pte(0));
+ else if (op == GSTAGE_OP_WP)
+ set_pte(ptep, __pte(pte_val(ptep_get(ptep)) & ~_PAGE_WRITE));
+ if (pte_val(*ptep) != pte_val(old_pte))
+ gstage_tlb_flush(gstage, ptep_level, addr);
+ }
+}
+
+void kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage,
+ gpa_t start, gpa_t size, bool may_block)
+{
+ int ret;
+ pte_t *ptep;
+ u32 ptep_level;
+ bool found_leaf;
+ unsigned long page_size;
+ gpa_t addr = start, end = start + size;
+
+ while (addr < end) {
+ found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level);
+ ret = gstage_level_to_page_size(ptep_level, &page_size);
+ if (ret)
+ break;
+
+ if (!found_leaf)
+ goto next;
+
+ if (!(addr & (page_size - 1)) && ((end - addr) >= page_size))
+ kvm_riscv_gstage_op_pte(gstage, addr, ptep,
+ ptep_level, GSTAGE_OP_CLEAR);
+
+next:
+ addr += page_size;
+
+ /*
+ * If the range is too large, release the kvm->mmu_lock
+ * to prevent starvation and lockup detector warnings.
+ */
+ if (!(gstage->flags & KVM_GSTAGE_FLAGS_LOCAL) && may_block && addr < end)
+ cond_resched_lock(&gstage->kvm->mmu_lock);
+ }
+}
+
+void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end)
+{
+ int ret;
+ pte_t *ptep;
+ u32 ptep_level;
+ bool found_leaf;
+ gpa_t addr = start;
+ unsigned long page_size;
+
+ while (addr < end) {
+ found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level);
+ ret = gstage_level_to_page_size(ptep_level, &page_size);
+ if (ret)
+ break;
+
+ if (!found_leaf)
+ goto next;
+
+ if (!(addr & (page_size - 1)) && ((end - addr) >= page_size))
+ kvm_riscv_gstage_op_pte(gstage, addr, ptep,
+ ptep_level, GSTAGE_OP_WP);
+
+next:
+ addr += page_size;
+ }
+}
+
+void __init kvm_riscv_gstage_mode_detect(void)
+{
+#ifdef CONFIG_64BIT
+ /* Try Sv57x4 G-stage mode */
+ csr_write(CSR_HGATP, HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT);
+ if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV57X4) {
+ kvm_riscv_gstage_mode = HGATP_MODE_SV57X4;
+ kvm_riscv_gstage_pgd_levels = 5;
+ goto skip_sv48x4_test;
+ }
+
+ /* Try Sv48x4 G-stage mode */
+ csr_write(CSR_HGATP, HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT);
+ if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV48X4) {
+ kvm_riscv_gstage_mode = HGATP_MODE_SV48X4;
+ kvm_riscv_gstage_pgd_levels = 4;
+ }
+skip_sv48x4_test:
+
+ csr_write(CSR_HGATP, 0);
+ kvm_riscv_local_hfence_gvma_all();
+#endif
+}
diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c
index 4b24705dc63a..67c876de74ef 100644
--- a/arch/riscv/kvm/main.c
+++ b/arch/riscv/kvm/main.c
@@ -11,6 +11,7 @@
#include <linux/module.h>
#include <linux/kvm_host.h>
#include <asm/cpufeature.h>
+#include <asm/kvm_mmu.h>
#include <asm/kvm_nacl.h>
#include <asm/sbi.h>
@@ -134,7 +135,7 @@ static int __init riscv_kvm_init(void)
(rc) ? slist : "no features");
}
- switch (kvm_riscv_gstage_mode()) {
+ switch (kvm_riscv_gstage_mode) {
case HGATP_MODE_SV32X4:
str = "Sv32x4";
break;
diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
index 1087ea74567b..a1c3b2ec1dde 100644
--- a/arch/riscv/kvm/mmu.c
+++ b/arch/riscv/kvm/mmu.c
@@ -6,348 +6,38 @@
* Anup Patel <anup.patel@wdc.com>
*/
-#include <linux/bitops.h>
#include <linux/errno.h>
-#include <linux/err.h>
#include <linux/hugetlb.h>
#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/kvm_host.h>
#include <linux/sched/signal.h>
+#include <asm/kvm_mmu.h>
#include <asm/kvm_nacl.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-
-#ifdef CONFIG_64BIT
-static unsigned long gstage_mode __ro_after_init = (HGATP_MODE_SV39X4 << HGATP_MODE_SHIFT);
-static unsigned long gstage_pgd_levels __ro_after_init = 3;
-#define gstage_index_bits 9
-#else
-static unsigned long gstage_mode __ro_after_init = (HGATP_MODE_SV32X4 << HGATP_MODE_SHIFT);
-static unsigned long gstage_pgd_levels __ro_after_init = 2;
-#define gstage_index_bits 10
-#endif
-
-#define gstage_pgd_xbits 2
-#define gstage_pgd_size (1UL << (HGATP_PAGE_SHIFT + gstage_pgd_xbits))
-#define gstage_gpa_bits (HGATP_PAGE_SHIFT + \
- (gstage_pgd_levels * gstage_index_bits) + \
- gstage_pgd_xbits)
-#define gstage_gpa_size ((gpa_t)(1ULL << gstage_gpa_bits))
-
-#define gstage_pte_leaf(__ptep) \
- (pte_val(*(__ptep)) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC))
-
-static inline unsigned long gstage_pte_index(gpa_t addr, u32 level)
-{
- unsigned long mask;
- unsigned long shift = HGATP_PAGE_SHIFT + (gstage_index_bits * level);
-
- if (level == (gstage_pgd_levels - 1))
- mask = (PTRS_PER_PTE * (1UL << gstage_pgd_xbits)) - 1;
- else
- mask = PTRS_PER_PTE - 1;
-
- return (addr >> shift) & mask;
-}
-
-static inline unsigned long gstage_pte_page_vaddr(pte_t pte)
-{
- return (unsigned long)pfn_to_virt(__page_val_to_pfn(pte_val(pte)));
-}
-
-static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level)
-{
- u32 i;
- unsigned long psz = 1UL << 12;
-
- for (i = 0; i < gstage_pgd_levels; i++) {
- if (page_size == (psz << (i * gstage_index_bits))) {
- *out_level = i;
- return 0;
- }
- }
-
- return -EINVAL;
-}
-
-static int gstage_level_to_page_order(u32 level, unsigned long *out_pgorder)
-{
- if (gstage_pgd_levels < level)
- return -EINVAL;
-
- *out_pgorder = 12 + (level * gstage_index_bits);
- return 0;
-}
-
-static int gstage_level_to_page_size(u32 level, unsigned long *out_pgsize)
-{
- int rc;
- unsigned long page_order = PAGE_SHIFT;
-
- rc = gstage_level_to_page_order(level, &page_order);
- if (rc)
- return rc;
-
- *out_pgsize = BIT(page_order);
- return 0;
-}
-
-static bool gstage_get_leaf_entry(struct kvm *kvm, gpa_t addr,
- pte_t **ptepp, u32 *ptep_level)
-{
- pte_t *ptep;
- u32 current_level = gstage_pgd_levels - 1;
-
- *ptep_level = current_level;
- ptep = (pte_t *)kvm->arch.pgd;
- ptep = &ptep[gstage_pte_index(addr, current_level)];
- while (ptep && pte_val(ptep_get(ptep))) {
- if (gstage_pte_leaf(ptep)) {
- *ptep_level = current_level;
- *ptepp = ptep;
- return true;
- }
-
- if (current_level) {
- current_level--;
- *ptep_level = current_level;
- ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
- ptep = &ptep[gstage_pte_index(addr, current_level)];
- } else {
- ptep = NULL;
- }
- }
-
- return false;
-}
-
-static void gstage_remote_tlb_flush(struct kvm *kvm, u32 level, gpa_t addr)
-{
- unsigned long order = PAGE_SHIFT;
-
- if (gstage_level_to_page_order(level, &order))
- return;
- addr &= ~(BIT(order) - 1);
-
- kvm_riscv_hfence_gvma_vmid_gpa(kvm, -1UL, 0, addr, BIT(order), order);
-}
-
-static int gstage_set_pte(struct kvm *kvm, u32 level,
- struct kvm_mmu_memory_cache *pcache,
- gpa_t addr, const pte_t *new_pte)
-{
- u32 current_level = gstage_pgd_levels - 1;
- pte_t *next_ptep = (pte_t *)kvm->arch.pgd;
- pte_t *ptep = &next_ptep[gstage_pte_index(addr, current_level)];
-
- if (current_level < level)
- return -EINVAL;
-
- while (current_level != level) {
- if (gstage_pte_leaf(ptep))
- return -EEXIST;
-
- if (!pte_val(ptep_get(ptep))) {
- if (!pcache)
- return -ENOMEM;
- next_ptep = kvm_mmu_memory_cache_alloc(pcache);
- if (!next_ptep)
- return -ENOMEM;
- set_pte(ptep, pfn_pte(PFN_DOWN(__pa(next_ptep)),
- __pgprot(_PAGE_TABLE)));
- } else {
- if (gstage_pte_leaf(ptep))
- return -EEXIST;
- next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
- }
-
- current_level--;
- ptep = &next_ptep[gstage_pte_index(addr, current_level)];
- }
-
- set_pte(ptep, *new_pte);
- if (gstage_pte_leaf(ptep))
- gstage_remote_tlb_flush(kvm, current_level, addr);
-
- return 0;
-}
-
-static int gstage_map_page(struct kvm *kvm,
- struct kvm_mmu_memory_cache *pcache,
- gpa_t gpa, phys_addr_t hpa,
- unsigned long page_size,
- bool page_rdonly, bool page_exec)
-{
- int ret;
- u32 level = 0;
- pte_t new_pte;
- pgprot_t prot;
-
- ret = gstage_page_size_to_level(page_size, &level);
- if (ret)
- return ret;
-
- /*
- * A RISC-V implementation can choose to either:
- * 1) Update 'A' and 'D' PTE bits in hardware
- * 2) Generate page fault when 'A' and/or 'D' bits are not set
- * PTE so that software can update these bits.
- *
- * We support both options mentioned above. To achieve this, we
- * always set 'A' and 'D' PTE bits at time of creating G-stage
- * mapping. To support KVM dirty page logging with both options
- * mentioned above, we will write-protect G-stage PTEs to track
- * dirty pages.
- */
- if (page_exec) {
- if (page_rdonly)
- prot = PAGE_READ_EXEC;
- else
- prot = PAGE_WRITE_EXEC;
- } else {
- if (page_rdonly)
- prot = PAGE_READ;
- else
- prot = PAGE_WRITE;
- }
- new_pte = pfn_pte(PFN_DOWN(hpa), prot);
- new_pte = pte_mkdirty(new_pte);
-
- return gstage_set_pte(kvm, level, pcache, gpa, &new_pte);
-}
-
-enum gstage_op {
- GSTAGE_OP_NOP = 0, /* Nothing */
- GSTAGE_OP_CLEAR, /* Clear/Unmap */
- GSTAGE_OP_WP, /* Write-protect */
-};
-
-static void gstage_op_pte(struct kvm *kvm, gpa_t addr,
- pte_t *ptep, u32 ptep_level, enum gstage_op op)
-{
- int i, ret;
- pte_t *next_ptep;
- u32 next_ptep_level;
- unsigned long next_page_size, page_size;
-
- ret = gstage_level_to_page_size(ptep_level, &page_size);
- if (ret)
- return;
-
- BUG_ON(addr & (page_size - 1));
-
- if (!pte_val(ptep_get(ptep)))
- return;
-
- if (ptep_level && !gstage_pte_leaf(ptep)) {
- next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
- next_ptep_level = ptep_level - 1;
- ret = gstage_level_to_page_size(next_ptep_level,
- &next_page_size);
- if (ret)
- return;
-
- if (op == GSTAGE_OP_CLEAR)
- set_pte(ptep, __pte(0));
- for (i = 0; i < PTRS_PER_PTE; i++)
- gstage_op_pte(kvm, addr + i * next_page_size,
- &next_ptep[i], next_ptep_level, op);
- if (op == GSTAGE_OP_CLEAR)
- put_page(virt_to_page(next_ptep));
- } else {
- if (op == GSTAGE_OP_CLEAR)
- set_pte(ptep, __pte(0));
- else if (op == GSTAGE_OP_WP)
- set_pte(ptep, __pte(pte_val(ptep_get(ptep)) & ~_PAGE_WRITE));
- gstage_remote_tlb_flush(kvm, ptep_level, addr);
- }
-}
-
-static void gstage_unmap_range(struct kvm *kvm, gpa_t start,
- gpa_t size, bool may_block)
-{
- int ret;
- pte_t *ptep;
- u32 ptep_level;
- bool found_leaf;
- unsigned long page_size;
- gpa_t addr = start, end = start + size;
-
- while (addr < end) {
- found_leaf = gstage_get_leaf_entry(kvm, addr,
- &ptep, &ptep_level);
- ret = gstage_level_to_page_size(ptep_level, &page_size);
- if (ret)
- break;
-
- if (!found_leaf)
- goto next;
-
- if (!(addr & (page_size - 1)) && ((end - addr) >= page_size))
- gstage_op_pte(kvm, addr, ptep,
- ptep_level, GSTAGE_OP_CLEAR);
-
-next:
- addr += page_size;
-
- /*
- * If the range is too large, release the kvm->mmu_lock
- * to prevent starvation and lockup detector warnings.
- */
- if (may_block && addr < end)
- cond_resched_lock(&kvm->mmu_lock);
- }
-}
-
-static void gstage_wp_range(struct kvm *kvm, gpa_t start, gpa_t end)
-{
- int ret;
- pte_t *ptep;
- u32 ptep_level;
- bool found_leaf;
- gpa_t addr = start;
- unsigned long page_size;
-
- while (addr < end) {
- found_leaf = gstage_get_leaf_entry(kvm, addr,
- &ptep, &ptep_level);
- ret = gstage_level_to_page_size(ptep_level, &page_size);
- if (ret)
- break;
-
- if (!found_leaf)
- goto next;
-
- if (!(addr & (page_size - 1)) && ((end - addr) >= page_size))
- gstage_op_pte(kvm, addr, ptep,
- ptep_level, GSTAGE_OP_WP);
-
-next:
- addr += page_size;
- }
-}
-
-static void gstage_wp_memory_region(struct kvm *kvm, int slot)
+static void mmu_wp_memory_region(struct kvm *kvm, int slot)
{
struct kvm_memslots *slots = kvm_memslots(kvm);
struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
+ struct kvm_gstage gstage;
+
+ gstage.kvm = kvm;
+ gstage.flags = 0;
+ gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
+ gstage.pgd = kvm->arch.pgd;
spin_lock(&kvm->mmu_lock);
- gstage_wp_range(kvm, start, end);
+ kvm_riscv_gstage_wp_range(&gstage, start, end);
spin_unlock(&kvm->mmu_lock);
- kvm_flush_remote_tlbs(kvm);
+ kvm_flush_remote_tlbs_memslot(kvm, memslot);
}
-int kvm_riscv_gstage_ioremap(struct kvm *kvm, gpa_t gpa,
- phys_addr_t hpa, unsigned long size,
- bool writable, bool in_atomic)
+int kvm_riscv_mmu_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
+ unsigned long size, bool writable, bool in_atomic)
{
- pte_t pte;
int ret = 0;
unsigned long pfn;
phys_addr_t addr, end;
@@ -355,22 +45,31 @@ int kvm_riscv_gstage_ioremap(struct kvm *kvm, gpa_t gpa,
.gfp_custom = (in_atomic) ? GFP_ATOMIC | __GFP_ACCOUNT : 0,
.gfp_zero = __GFP_ZERO,
};
+ struct kvm_gstage_mapping map;
+ struct kvm_gstage gstage;
+
+ gstage.kvm = kvm;
+ gstage.flags = 0;
+ gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
+ gstage.pgd = kvm->arch.pgd;
end = (gpa + size + PAGE_SIZE - 1) & PAGE_MASK;
pfn = __phys_to_pfn(hpa);
for (addr = gpa; addr < end; addr += PAGE_SIZE) {
- pte = pfn_pte(pfn, PAGE_KERNEL_IO);
+ map.addr = addr;
+ map.pte = pfn_pte(pfn, PAGE_KERNEL_IO);
+ map.level = 0;
if (!writable)
- pte = pte_wrprotect(pte);
+ map.pte = pte_wrprotect(map.pte);
- ret = kvm_mmu_topup_memory_cache(&pcache, gstage_pgd_levels);
+ ret = kvm_mmu_topup_memory_cache(&pcache, kvm_riscv_gstage_pgd_levels);
if (ret)
goto out;
spin_lock(&kvm->mmu_lock);
- ret = gstage_set_pte(kvm, 0, &pcache, addr, &pte);
+ ret = kvm_riscv_gstage_set_pte(&gstage, &pcache, &map);
spin_unlock(&kvm->mmu_lock);
if (ret)
goto out;
@@ -383,10 +82,17 @@ out:
return ret;
}
-void kvm_riscv_gstage_iounmap(struct kvm *kvm, gpa_t gpa, unsigned long size)
+void kvm_riscv_mmu_iounmap(struct kvm *kvm, gpa_t gpa, unsigned long size)
{
+ struct kvm_gstage gstage;
+
+ gstage.kvm = kvm;
+ gstage.flags = 0;
+ gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
+ gstage.pgd = kvm->arch.pgd;
+
spin_lock(&kvm->mmu_lock);
- gstage_unmap_range(kvm, gpa, size, false);
+ kvm_riscv_gstage_unmap_range(&gstage, gpa, size, false);
spin_unlock(&kvm->mmu_lock);
}
@@ -398,8 +104,14 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
+ struct kvm_gstage gstage;
- gstage_wp_range(kvm, start, end);
+ gstage.kvm = kvm;
+ gstage.flags = 0;
+ gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
+ gstage.pgd = kvm->arch.pgd;
+
+ kvm_riscv_gstage_wp_range(&gstage, start, end);
}
void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
@@ -416,7 +128,7 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
- kvm_riscv_gstage_free_pgd(kvm);
+ kvm_riscv_mmu_free_pgd(kvm);
}
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
@@ -424,9 +136,15 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
{
gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
phys_addr_t size = slot->npages << PAGE_SHIFT;
+ struct kvm_gstage gstage;
+
+ gstage.kvm = kvm;
+ gstage.flags = 0;
+ gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
+ gstage.pgd = kvm->arch.pgd;
spin_lock(&kvm->mmu_lock);
- gstage_unmap_range(kvm, gpa, size, false);
+ kvm_riscv_gstage_unmap_range(&gstage, gpa, size, false);
spin_unlock(&kvm->mmu_lock);
}
@@ -441,7 +159,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
* the memory slot is write protected.
*/
if (change != KVM_MR_DELETE && new->flags & KVM_MEM_LOG_DIRTY_PAGES)
- gstage_wp_memory_region(kvm, new->id);
+ mmu_wp_memory_region(kvm, new->id);
}
int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -463,7 +181,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
* space addressable by the KVM guest GPA space.
*/
if ((new->base_gfn + new->npages) >=
- (gstage_gpa_size >> PAGE_SHIFT))
+ (kvm_riscv_gstage_gpa_size >> PAGE_SHIFT))
return -EFAULT;
hva = new->userspace_addr;
@@ -487,10 +205,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
* +--------------------------------------------+
*/
do {
- struct vm_area_struct *vma = find_vma(current->mm, hva);
+ struct vm_area_struct *vma;
hva_t vm_start, vm_end;
- if (!vma || vma->vm_start >= reg_end)
+ vma = find_vma_intersection(current->mm, hva, reg_end);
+ if (!vma)
break;
/*
@@ -519,9 +238,8 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
goto out;
}
- ret = kvm_riscv_gstage_ioremap(kvm, gpa, pa,
- vm_end - vm_start,
- writable, false);
+ ret = kvm_riscv_mmu_ioremap(kvm, gpa, pa, vm_end - vm_start,
+ writable, false);
if (ret)
break;
}
@@ -532,7 +250,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
goto out;
if (ret)
- kvm_riscv_gstage_iounmap(kvm, base_gpa, size);
+ kvm_riscv_mmu_iounmap(kvm, base_gpa, size);
out:
mmap_read_unlock(current->mm);
@@ -541,12 +259,18 @@ out:
bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
+ struct kvm_gstage gstage;
+
if (!kvm->arch.pgd)
return false;
- gstage_unmap_range(kvm, range->start << PAGE_SHIFT,
- (range->end - range->start) << PAGE_SHIFT,
- range->may_block);
+ gstage.kvm = kvm;
+ gstage.flags = 0;
+ gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
+ gstage.pgd = kvm->arch.pgd;
+ kvm_riscv_gstage_unmap_range(&gstage, range->start << PAGE_SHIFT,
+ (range->end - range->start) << PAGE_SHIFT,
+ range->may_block);
return false;
}
@@ -555,14 +279,19 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
pte_t *ptep;
u32 ptep_level = 0;
u64 size = (range->end - range->start) << PAGE_SHIFT;
+ struct kvm_gstage gstage;
if (!kvm->arch.pgd)
return false;
WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
- if (!gstage_get_leaf_entry(kvm, range->start << PAGE_SHIFT,
- &ptep, &ptep_level))
+ gstage.kvm = kvm;
+ gstage.flags = 0;
+ gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
+ gstage.pgd = kvm->arch.pgd;
+ if (!kvm_riscv_gstage_get_leaf(&gstage, range->start << PAGE_SHIFT,
+ &ptep, &ptep_level))
return false;
return ptep_test_and_clear_young(NULL, 0, ptep);
@@ -573,22 +302,27 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
pte_t *ptep;
u32 ptep_level = 0;
u64 size = (range->end - range->start) << PAGE_SHIFT;
+ struct kvm_gstage gstage;
if (!kvm->arch.pgd)
return false;
WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
- if (!gstage_get_leaf_entry(kvm, range->start << PAGE_SHIFT,
- &ptep, &ptep_level))
+ gstage.kvm = kvm;
+ gstage.flags = 0;
+ gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
+ gstage.pgd = kvm->arch.pgd;
+ if (!kvm_riscv_gstage_get_leaf(&gstage, range->start << PAGE_SHIFT,
+ &ptep, &ptep_level))
return false;
return pte_young(ptep_get(ptep));
}
-int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu,
- struct kvm_memory_slot *memslot,
- gpa_t gpa, unsigned long hva, bool is_write)
+int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
+ gpa_t gpa, unsigned long hva, bool is_write,
+ struct kvm_gstage_mapping *out_map)
{
int ret;
kvm_pfn_t hfn;
@@ -601,10 +335,19 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu,
bool logging = (memslot->dirty_bitmap &&
!(memslot->flags & KVM_MEM_READONLY)) ? true : false;
unsigned long vma_pagesize, mmu_seq;
+ struct kvm_gstage gstage;
struct page *page;
+ gstage.kvm = kvm;
+ gstage.flags = 0;
+ gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
+ gstage.pgd = kvm->arch.pgd;
+
+ /* Setup initial state of output mapping */
+ memset(out_map, 0, sizeof(*out_map));
+
/* We need minimum second+third level pages */
- ret = kvm_mmu_topup_memory_cache(pcache, gstage_pgd_levels);
+ ret = kvm_mmu_topup_memory_cache(pcache, kvm_riscv_gstage_pgd_levels);
if (ret) {
kvm_err("Failed to topup G-stage cache\n");
return ret;
@@ -648,7 +391,8 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu,
return -EFAULT;
}
- hfn = kvm_faultin_pfn(vcpu, gfn, is_write, &writable, &page);
+ hfn = __kvm_faultin_pfn(memslot, gfn, is_write ? FOLL_WRITE : 0,
+ &writable, &page);
if (hfn == KVM_PFN_ERR_HWPOISON) {
send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva,
vma_pageshift, current);
@@ -670,12 +414,12 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu,
goto out_unlock;
if (writable) {
- mark_page_dirty(kvm, gfn);
- ret = gstage_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT,
- vma_pagesize, false, true);
+ mark_page_dirty_in_slot(kvm, memslot, gfn);
+ ret = kvm_riscv_gstage_map_page(&gstage, pcache, gpa, hfn << PAGE_SHIFT,
+ vma_pagesize, false, true, out_map);
} else {
- ret = gstage_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT,
- vma_pagesize, true, true);
+ ret = kvm_riscv_gstage_map_page(&gstage, pcache, gpa, hfn << PAGE_SHIFT,
+ vma_pagesize, true, true, out_map);
}
if (ret)
@@ -687,7 +431,7 @@ out_unlock:
return ret;
}
-int kvm_riscv_gstage_alloc_pgd(struct kvm *kvm)
+int kvm_riscv_mmu_alloc_pgd(struct kvm *kvm)
{
struct page *pgd_page;
@@ -697,7 +441,7 @@ int kvm_riscv_gstage_alloc_pgd(struct kvm *kvm)
}
pgd_page = alloc_pages(GFP_KERNEL | __GFP_ZERO,
- get_order(gstage_pgd_size));
+ get_order(kvm_riscv_gstage_pgd_size));
if (!pgd_page)
return -ENOMEM;
kvm->arch.pgd = page_to_virt(pgd_page);
@@ -706,13 +450,18 @@ int kvm_riscv_gstage_alloc_pgd(struct kvm *kvm)
return 0;
}
-void kvm_riscv_gstage_free_pgd(struct kvm *kvm)
+void kvm_riscv_mmu_free_pgd(struct kvm *kvm)
{
+ struct kvm_gstage gstage;
void *pgd = NULL;
spin_lock(&kvm->mmu_lock);
if (kvm->arch.pgd) {
- gstage_unmap_range(kvm, 0UL, gstage_gpa_size, false);
+ gstage.kvm = kvm;
+ gstage.flags = 0;
+ gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
+ gstage.pgd = kvm->arch.pgd;
+ kvm_riscv_gstage_unmap_range(&gstage, 0UL, kvm_riscv_gstage_gpa_size, false);
pgd = READ_ONCE(kvm->arch.pgd);
kvm->arch.pgd = NULL;
kvm->arch.pgd_phys = 0;
@@ -720,12 +469,12 @@ void kvm_riscv_gstage_free_pgd(struct kvm *kvm)
spin_unlock(&kvm->mmu_lock);
if (pgd)
- free_pages((unsigned long)pgd, get_order(gstage_pgd_size));
+ free_pages((unsigned long)pgd, get_order(kvm_riscv_gstage_pgd_size));
}
-void kvm_riscv_gstage_update_hgatp(struct kvm_vcpu *vcpu)
+void kvm_riscv_mmu_update_hgatp(struct kvm_vcpu *vcpu)
{
- unsigned long hgatp = gstage_mode;
+ unsigned long hgatp = kvm_riscv_gstage_mode << HGATP_MODE_SHIFT;
struct kvm_arch *k = &vcpu->kvm->arch;
hgatp |= (READ_ONCE(k->vmid.vmid) << HGATP_VMID_SHIFT) & HGATP_VMID;
@@ -736,37 +485,3 @@ void kvm_riscv_gstage_update_hgatp(struct kvm_vcpu *vcpu)
if (!kvm_riscv_gstage_vmid_bits())
kvm_riscv_local_hfence_gvma_all();
}
-
-void __init kvm_riscv_gstage_mode_detect(void)
-{
-#ifdef CONFIG_64BIT
- /* Try Sv57x4 G-stage mode */
- csr_write(CSR_HGATP, HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT);
- if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV57X4) {
- gstage_mode = (HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT);
- gstage_pgd_levels = 5;
- goto skip_sv48x4_test;
- }
-
- /* Try Sv48x4 G-stage mode */
- csr_write(CSR_HGATP, HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT);
- if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV48X4) {
- gstage_mode = (HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT);
- gstage_pgd_levels = 4;
- }
-skip_sv48x4_test:
-
- csr_write(CSR_HGATP, 0);
- kvm_riscv_local_hfence_gvma_all();
-#endif
-}
-
-unsigned long __init kvm_riscv_gstage_mode(void)
-{
- return gstage_mode >> HGATP_MODE_SHIFT;
-}
-
-int kvm_riscv_gstage_gpa_bits(void)
-{
- return gstage_gpa_bits;
-}
diff --git a/arch/riscv/kvm/tlb.c b/arch/riscv/kvm/tlb.c
index 2f91ea5f8493..3c5a70a2b927 100644
--- a/arch/riscv/kvm/tlb.c
+++ b/arch/riscv/kvm/tlb.c
@@ -15,6 +15,8 @@
#include <asm/cpufeature.h>
#include <asm/insn-def.h>
#include <asm/kvm_nacl.h>
+#include <asm/kvm_tlb.h>
+#include <asm/kvm_vmid.h>
#define has_svinval() riscv_has_extension_unlikely(RISCV_ISA_EXT_SVINVAL)
@@ -156,36 +158,13 @@ void kvm_riscv_local_hfence_vvma_all(unsigned long vmid)
csr_write(CSR_HGATP, hgatp);
}
-void kvm_riscv_local_tlb_sanitize(struct kvm_vcpu *vcpu)
-{
- unsigned long vmid;
-
- if (!kvm_riscv_gstage_vmid_bits() ||
- vcpu->arch.last_exit_cpu == vcpu->cpu)
- return;
-
- /*
- * On RISC-V platforms with hardware VMID support, we share same
- * VMID for all VCPUs of a particular Guest/VM. This means we might
- * have stale G-stage TLB entries on the current Host CPU due to
- * some other VCPU of the same Guest which ran previously on the
- * current Host CPU.
- *
- * To cleanup stale TLB entries, we simply flush all G-stage TLB
- * entries by VMID whenever underlying Host CPU changes for a VCPU.
- */
-
- vmid = READ_ONCE(vcpu->kvm->arch.vmid.vmid);
- kvm_riscv_local_hfence_gvma_vmid_all(vmid);
-}
-
void kvm_riscv_fence_i_process(struct kvm_vcpu *vcpu)
{
kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_FENCE_I_RCVD);
local_flush_icache_all();
}
-void kvm_riscv_hfence_gvma_vmid_all_process(struct kvm_vcpu *vcpu)
+void kvm_riscv_tlb_flush_process(struct kvm_vcpu *vcpu)
{
struct kvm_vmid *v = &vcpu->kvm->arch.vmid;
unsigned long vmid = READ_ONCE(v->vmid);
@@ -258,51 +237,58 @@ static bool vcpu_hfence_enqueue(struct kvm_vcpu *vcpu,
void kvm_riscv_hfence_process(struct kvm_vcpu *vcpu)
{
- unsigned long vmid;
struct kvm_riscv_hfence d = { 0 };
- struct kvm_vmid *v = &vcpu->kvm->arch.vmid;
while (vcpu_hfence_dequeue(vcpu, &d)) {
switch (d.type) {
case KVM_RISCV_HFENCE_UNKNOWN:
break;
case KVM_RISCV_HFENCE_GVMA_VMID_GPA:
- vmid = READ_ONCE(v->vmid);
if (kvm_riscv_nacl_available())
- nacl_hfence_gvma_vmid(nacl_shmem(), vmid,
+ nacl_hfence_gvma_vmid(nacl_shmem(), d.vmid,
d.addr, d.size, d.order);
else
- kvm_riscv_local_hfence_gvma_vmid_gpa(vmid, d.addr,
+ kvm_riscv_local_hfence_gvma_vmid_gpa(d.vmid, d.addr,
d.size, d.order);
break;
+ case KVM_RISCV_HFENCE_GVMA_VMID_ALL:
+ if (kvm_riscv_nacl_available())
+ nacl_hfence_gvma_vmid_all(nacl_shmem(), d.vmid);
+ else
+ kvm_riscv_local_hfence_gvma_vmid_all(d.vmid);
+ break;
case KVM_RISCV_HFENCE_VVMA_ASID_GVA:
kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_ASID_RCVD);
- vmid = READ_ONCE(v->vmid);
if (kvm_riscv_nacl_available())
- nacl_hfence_vvma_asid(nacl_shmem(), vmid, d.asid,
+ nacl_hfence_vvma_asid(nacl_shmem(), d.vmid, d.asid,
d.addr, d.size, d.order);
else
- kvm_riscv_local_hfence_vvma_asid_gva(vmid, d.asid, d.addr,
+ kvm_riscv_local_hfence_vvma_asid_gva(d.vmid, d.asid, d.addr,
d.size, d.order);
break;
case KVM_RISCV_HFENCE_VVMA_ASID_ALL:
kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_ASID_RCVD);
- vmid = READ_ONCE(v->vmid);
if (kvm_riscv_nacl_available())
- nacl_hfence_vvma_asid_all(nacl_shmem(), vmid, d.asid);
+ nacl_hfence_vvma_asid_all(nacl_shmem(), d.vmid, d.asid);
else
- kvm_riscv_local_hfence_vvma_asid_all(vmid, d.asid);
+ kvm_riscv_local_hfence_vvma_asid_all(d.vmid, d.asid);
break;
case KVM_RISCV_HFENCE_VVMA_GVA:
kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_RCVD);
- vmid = READ_ONCE(v->vmid);
if (kvm_riscv_nacl_available())
- nacl_hfence_vvma(nacl_shmem(), vmid,
+ nacl_hfence_vvma(nacl_shmem(), d.vmid,
d.addr, d.size, d.order);
else
- kvm_riscv_local_hfence_vvma_gva(vmid, d.addr,
+ kvm_riscv_local_hfence_vvma_gva(d.vmid, d.addr,
d.size, d.order);
break;
+ case KVM_RISCV_HFENCE_VVMA_ALL:
+ kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_RCVD);
+ if (kvm_riscv_nacl_available())
+ nacl_hfence_vvma_all(nacl_shmem(), d.vmid);
+ else
+ kvm_riscv_local_hfence_vvma_all(d.vmid);
+ break;
default:
break;
}
@@ -355,35 +341,43 @@ void kvm_riscv_fence_i(struct kvm *kvm,
void kvm_riscv_hfence_gvma_vmid_gpa(struct kvm *kvm,
unsigned long hbase, unsigned long hmask,
gpa_t gpa, gpa_t gpsz,
- unsigned long order)
+ unsigned long order, unsigned long vmid)
{
struct kvm_riscv_hfence data;
data.type = KVM_RISCV_HFENCE_GVMA_VMID_GPA;
data.asid = 0;
+ data.vmid = vmid;
data.addr = gpa;
data.size = gpsz;
data.order = order;
make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE,
- KVM_REQ_HFENCE_GVMA_VMID_ALL, &data);
+ KVM_REQ_TLB_FLUSH, &data);
}
void kvm_riscv_hfence_gvma_vmid_all(struct kvm *kvm,
- unsigned long hbase, unsigned long hmask)
+ unsigned long hbase, unsigned long hmask,
+ unsigned long vmid)
{
- make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE_GVMA_VMID_ALL,
- KVM_REQ_HFENCE_GVMA_VMID_ALL, NULL);
+ struct kvm_riscv_hfence data = {0};
+
+ data.type = KVM_RISCV_HFENCE_GVMA_VMID_ALL;
+ data.vmid = vmid;
+ make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE,
+ KVM_REQ_TLB_FLUSH, &data);
}
void kvm_riscv_hfence_vvma_asid_gva(struct kvm *kvm,
unsigned long hbase, unsigned long hmask,
unsigned long gva, unsigned long gvsz,
- unsigned long order, unsigned long asid)
+ unsigned long order, unsigned long asid,
+ unsigned long vmid)
{
struct kvm_riscv_hfence data;
data.type = KVM_RISCV_HFENCE_VVMA_ASID_GVA;
data.asid = asid;
+ data.vmid = vmid;
data.addr = gva;
data.size = gvsz;
data.order = order;
@@ -393,13 +387,13 @@ void kvm_riscv_hfence_vvma_asid_gva(struct kvm *kvm,
void kvm_riscv_hfence_vvma_asid_all(struct kvm *kvm,
unsigned long hbase, unsigned long hmask,
- unsigned long asid)
+ unsigned long asid, unsigned long vmid)
{
- struct kvm_riscv_hfence data;
+ struct kvm_riscv_hfence data = {0};
data.type = KVM_RISCV_HFENCE_VVMA_ASID_ALL;
data.asid = asid;
- data.addr = data.size = data.order = 0;
+ data.vmid = vmid;
make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE,
KVM_REQ_HFENCE_VVMA_ALL, &data);
}
@@ -407,12 +401,13 @@ void kvm_riscv_hfence_vvma_asid_all(struct kvm *kvm,
void kvm_riscv_hfence_vvma_gva(struct kvm *kvm,
unsigned long hbase, unsigned long hmask,
unsigned long gva, unsigned long gvsz,
- unsigned long order)
+ unsigned long order, unsigned long vmid)
{
struct kvm_riscv_hfence data;
data.type = KVM_RISCV_HFENCE_VVMA_GVA;
data.asid = 0;
+ data.vmid = vmid;
data.addr = gva;
data.size = gvsz;
data.order = order;
@@ -421,8 +416,21 @@ void kvm_riscv_hfence_vvma_gva(struct kvm *kvm,
}
void kvm_riscv_hfence_vvma_all(struct kvm *kvm,
- unsigned long hbase, unsigned long hmask)
+ unsigned long hbase, unsigned long hmask,
+ unsigned long vmid)
+{
+ struct kvm_riscv_hfence data = {0};
+
+ data.type = KVM_RISCV_HFENCE_VVMA_ALL;
+ data.vmid = vmid;
+ make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE,
+ KVM_REQ_HFENCE_VVMA_ALL, &data);
+}
+
+int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
{
- make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE_VVMA_ALL,
- KVM_REQ_HFENCE_VVMA_ALL, NULL);
+ kvm_riscv_hfence_gvma_vmid_gpa(kvm, -1UL, 0,
+ gfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT,
+ PAGE_SHIFT, READ_ONCE(kvm->arch.vmid.vmid));
+ return 0;
}
diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c
index 0462863206ca..f001e56403f9 100644
--- a/arch/riscv/kvm/vcpu.c
+++ b/arch/riscv/kvm/vcpu.c
@@ -18,6 +18,7 @@
#include <linux/fs.h>
#include <linux/kvm_host.h>
#include <asm/cacheflush.h>
+#include <asm/kvm_mmu.h>
#include <asm/kvm_nacl.h>
#include <asm/kvm_vcpu_vector.h>
@@ -111,7 +112,7 @@ static void kvm_riscv_reset_vcpu(struct kvm_vcpu *vcpu, bool kvm_sbi_reset)
vcpu->arch.hfence_tail = 0;
memset(vcpu->arch.hfence_queue, 0, sizeof(vcpu->arch.hfence_queue));
- kvm_riscv_vcpu_sbi_sta_reset(vcpu);
+ kvm_riscv_vcpu_sbi_reset(vcpu);
/* Reset the guest CSRs for hotplug usecase */
if (loaded)
@@ -148,8 +149,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
spin_lock_init(&vcpu->arch.reset_state.lock);
- if (kvm_riscv_vcpu_alloc_vector_context(vcpu))
- return -ENOMEM;
+ rc = kvm_riscv_vcpu_alloc_vector_context(vcpu);
+ if (rc)
+ return rc;
/* Setup VCPU timer */
kvm_riscv_vcpu_timer_init(vcpu);
@@ -158,9 +160,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
kvm_riscv_vcpu_pmu_init(vcpu);
/* Setup VCPU AIA */
- rc = kvm_riscv_vcpu_aia_init(vcpu);
- if (rc)
- return rc;
+ kvm_riscv_vcpu_aia_init(vcpu);
/*
* Setup SBI extensions
@@ -187,6 +187,8 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
+ kvm_riscv_vcpu_sbi_deinit(vcpu);
+
/* Cleanup VCPU AIA context */
kvm_riscv_vcpu_aia_deinit(vcpu);
@@ -620,7 +622,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
}
}
- kvm_riscv_gstage_update_hgatp(vcpu);
+ kvm_riscv_mmu_update_hgatp(vcpu);
kvm_riscv_vcpu_timer_restore(vcpu);
@@ -680,7 +682,14 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
}
}
-static void kvm_riscv_check_vcpu_requests(struct kvm_vcpu *vcpu)
+/**
+ * check_vcpu_requests - check and handle pending vCPU requests
+ * @vcpu: the VCPU pointer
+ *
+ * Return: 1 if we should enter the guest
+ * 0 if we should exit to userspace
+ */
+static int kvm_riscv_check_vcpu_requests(struct kvm_vcpu *vcpu)
{
struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
@@ -705,17 +714,13 @@ static void kvm_riscv_check_vcpu_requests(struct kvm_vcpu *vcpu)
kvm_riscv_reset_vcpu(vcpu, true);
if (kvm_check_request(KVM_REQ_UPDATE_HGATP, vcpu))
- kvm_riscv_gstage_update_hgatp(vcpu);
+ kvm_riscv_mmu_update_hgatp(vcpu);
if (kvm_check_request(KVM_REQ_FENCE_I, vcpu))
kvm_riscv_fence_i_process(vcpu);
- /*
- * The generic KVM_REQ_TLB_FLUSH is same as
- * KVM_REQ_HFENCE_GVMA_VMID_ALL
- */
- if (kvm_check_request(KVM_REQ_HFENCE_GVMA_VMID_ALL, vcpu))
- kvm_riscv_hfence_gvma_vmid_all_process(vcpu);
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
+ kvm_riscv_tlb_flush_process(vcpu);
if (kvm_check_request(KVM_REQ_HFENCE_VVMA_ALL, vcpu))
kvm_riscv_hfence_vvma_all_process(vcpu);
@@ -725,7 +730,12 @@ static void kvm_riscv_check_vcpu_requests(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
kvm_riscv_vcpu_record_steal_time(vcpu);
+
+ if (kvm_dirty_ring_check_request(vcpu))
+ return 0;
}
+
+ return 1;
}
static void kvm_riscv_update_hvip(struct kvm_vcpu *vcpu)
@@ -907,7 +917,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
kvm_riscv_gstage_vmid_update(vcpu);
- kvm_riscv_check_vcpu_requests(vcpu);
+ ret = kvm_riscv_check_vcpu_requests(vcpu);
+ if (ret <= 0)
+ continue;
preempt_disable();
@@ -951,12 +963,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
}
/*
- * Cleanup stale TLB enteries
+ * Sanitize VMID mappings cached (TLB) on current CPU
*
* Note: This should be done after G-stage VMID has been
* updated using kvm_riscv_gstage_vmid_ver_changed()
*/
- kvm_riscv_local_tlb_sanitize(vcpu);
+ kvm_riscv_gstage_vmid_sanitize(vcpu);
trace_kvm_entry(vcpu);
diff --git a/arch/riscv/kvm/vcpu_exit.c b/arch/riscv/kvm/vcpu_exit.c
index 6e0c18412795..0bb0c51e3c89 100644
--- a/arch/riscv/kvm/vcpu_exit.c
+++ b/arch/riscv/kvm/vcpu_exit.c
@@ -9,10 +9,13 @@
#include <linux/kvm_host.h>
#include <asm/csr.h>
#include <asm/insn-def.h>
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_nacl.h>
static int gstage_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run,
struct kvm_cpu_trap *trap)
{
+ struct kvm_gstage_mapping host_map;
struct kvm_memory_slot *memslot;
unsigned long hva, fault_addr;
bool writable;
@@ -40,8 +43,9 @@ static int gstage_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run,
};
}
- ret = kvm_riscv_gstage_map(vcpu, memslot, fault_addr, hva,
- (trap->scause == EXC_STORE_GUEST_PAGE_FAULT) ? true : false);
+ ret = kvm_riscv_mmu_map(vcpu, memslot, fault_addr, hva,
+ (trap->scause == EXC_STORE_GUEST_PAGE_FAULT) ? true : false,
+ &host_map);
if (ret < 0)
return ret;
@@ -135,7 +139,7 @@ unsigned long kvm_riscv_vcpu_unpriv_read(struct kvm_vcpu *vcpu,
void kvm_riscv_vcpu_trap_redirect(struct kvm_vcpu *vcpu,
struct kvm_cpu_trap *trap)
{
- unsigned long vsstatus = csr_read(CSR_VSSTATUS);
+ unsigned long vsstatus = ncsr_read(CSR_VSSTATUS);
/* Change Guest SSTATUS.SPP bit */
vsstatus &= ~SR_SPP;
@@ -151,15 +155,15 @@ void kvm_riscv_vcpu_trap_redirect(struct kvm_vcpu *vcpu,
vsstatus &= ~SR_SIE;
/* Update Guest SSTATUS */
- csr_write(CSR_VSSTATUS, vsstatus);
+ ncsr_write(CSR_VSSTATUS, vsstatus);
/* Update Guest SCAUSE, STVAL, and SEPC */
- csr_write(CSR_VSCAUSE, trap->scause);
- csr_write(CSR_VSTVAL, trap->stval);
- csr_write(CSR_VSEPC, trap->sepc);
+ ncsr_write(CSR_VSCAUSE, trap->scause);
+ ncsr_write(CSR_VSTVAL, trap->stval);
+ ncsr_write(CSR_VSEPC, trap->sepc);
/* Set Guest PC to Guest exception vector */
- vcpu->arch.guest_context.sepc = csr_read(CSR_VSTVEC);
+ vcpu->arch.guest_context.sepc = ncsr_read(CSR_VSTVEC);
/* Set Guest privilege mode to supervisor */
vcpu->arch.guest_context.sstatus |= SR_SPP;
diff --git a/arch/riscv/kvm/vcpu_onereg.c b/arch/riscv/kvm/vcpu_onereg.c
index 2e1b646f0d61..cce6a38ea54f 100644
--- a/arch/riscv/kvm/vcpu_onereg.c
+++ b/arch/riscv/kvm/vcpu_onereg.c
@@ -23,7 +23,7 @@
#define KVM_ISA_EXT_ARR(ext) \
[KVM_RISCV_ISA_EXT_##ext] = RISCV_ISA_EXT_##ext
-/* Mapping between KVM ISA Extension ID & Host ISA extension ID */
+/* Mapping between KVM ISA Extension ID & guest ISA extension ID */
static const unsigned long kvm_isa_ext_arr[] = {
/* Single letter extensions (alphabetically sorted) */
[KVM_RISCV_ISA_EXT_A] = RISCV_ISA_EXT_a,
@@ -35,7 +35,7 @@ static const unsigned long kvm_isa_ext_arr[] = {
[KVM_RISCV_ISA_EXT_M] = RISCV_ISA_EXT_m,
[KVM_RISCV_ISA_EXT_V] = RISCV_ISA_EXT_v,
/* Multi letter extensions (alphabetically sorted) */
- [KVM_RISCV_ISA_EXT_SMNPM] = RISCV_ISA_EXT_SSNPM,
+ KVM_ISA_EXT_ARR(SMNPM),
KVM_ISA_EXT_ARR(SMSTATEEN),
KVM_ISA_EXT_ARR(SSAIA),
KVM_ISA_EXT_ARR(SSCOFPMF),
@@ -112,6 +112,36 @@ static unsigned long kvm_riscv_vcpu_base2isa_ext(unsigned long base_ext)
return KVM_RISCV_ISA_EXT_MAX;
}
+static int kvm_riscv_vcpu_isa_check_host(unsigned long kvm_ext, unsigned long *guest_ext)
+{
+ unsigned long host_ext;
+
+ if (kvm_ext >= KVM_RISCV_ISA_EXT_MAX ||
+ kvm_ext >= ARRAY_SIZE(kvm_isa_ext_arr))
+ return -ENOENT;
+
+ *guest_ext = kvm_isa_ext_arr[kvm_ext];
+ switch (*guest_ext) {
+ case RISCV_ISA_EXT_SMNPM:
+ /*
+ * Pointer masking effective in (H)S-mode is provided by the
+ * Smnpm extension, so that extension is reported to the guest,
+ * even though the CSR bits for configuring VS-mode pointer
+ * masking on the host side are part of the Ssnpm extension.
+ */
+ host_ext = RISCV_ISA_EXT_SSNPM;
+ break;
+ default:
+ host_ext = *guest_ext;
+ break;
+ }
+
+ if (!__riscv_isa_extension_available(NULL, host_ext))
+ return -ENOENT;
+
+ return 0;
+}
+
static bool kvm_riscv_vcpu_isa_enable_allowed(unsigned long ext)
{
switch (ext) {
@@ -219,13 +249,13 @@ static bool kvm_riscv_vcpu_isa_disable_allowed(unsigned long ext)
void kvm_riscv_vcpu_setup_isa(struct kvm_vcpu *vcpu)
{
- unsigned long host_isa, i;
+ unsigned long guest_ext, i;
for (i = 0; i < ARRAY_SIZE(kvm_isa_ext_arr); i++) {
- host_isa = kvm_isa_ext_arr[i];
- if (__riscv_isa_extension_available(NULL, host_isa) &&
- kvm_riscv_vcpu_isa_enable_allowed(i))
- set_bit(host_isa, vcpu->arch.isa);
+ if (kvm_riscv_vcpu_isa_check_host(i, &guest_ext))
+ continue;
+ if (kvm_riscv_vcpu_isa_enable_allowed(i))
+ set_bit(guest_ext, vcpu->arch.isa);
}
}
@@ -607,18 +637,15 @@ static int riscv_vcpu_get_isa_ext_single(struct kvm_vcpu *vcpu,
unsigned long reg_num,
unsigned long *reg_val)
{
- unsigned long host_isa_ext;
-
- if (reg_num >= KVM_RISCV_ISA_EXT_MAX ||
- reg_num >= ARRAY_SIZE(kvm_isa_ext_arr))
- return -ENOENT;
+ unsigned long guest_ext;
+ int ret;
- host_isa_ext = kvm_isa_ext_arr[reg_num];
- if (!__riscv_isa_extension_available(NULL, host_isa_ext))
- return -ENOENT;
+ ret = kvm_riscv_vcpu_isa_check_host(reg_num, &guest_ext);
+ if (ret)
+ return ret;
*reg_val = 0;
- if (__riscv_isa_extension_available(vcpu->arch.isa, host_isa_ext))
+ if (__riscv_isa_extension_available(vcpu->arch.isa, guest_ext))
*reg_val = 1; /* Mark the given extension as available */
return 0;
@@ -628,17 +655,14 @@ static int riscv_vcpu_set_isa_ext_single(struct kvm_vcpu *vcpu,
unsigned long reg_num,
unsigned long reg_val)
{
- unsigned long host_isa_ext;
-
- if (reg_num >= KVM_RISCV_ISA_EXT_MAX ||
- reg_num >= ARRAY_SIZE(kvm_isa_ext_arr))
- return -ENOENT;
+ unsigned long guest_ext;
+ int ret;
- host_isa_ext = kvm_isa_ext_arr[reg_num];
- if (!__riscv_isa_extension_available(NULL, host_isa_ext))
- return -ENOENT;
+ ret = kvm_riscv_vcpu_isa_check_host(reg_num, &guest_ext);
+ if (ret)
+ return ret;
- if (reg_val == test_bit(host_isa_ext, vcpu->arch.isa))
+ if (reg_val == test_bit(guest_ext, vcpu->arch.isa))
return 0;
if (!vcpu->arch.ran_atleast_once) {
@@ -648,10 +672,10 @@ static int riscv_vcpu_set_isa_ext_single(struct kvm_vcpu *vcpu,
*/
if (reg_val == 1 &&
kvm_riscv_vcpu_isa_enable_allowed(reg_num))
- set_bit(host_isa_ext, vcpu->arch.isa);
+ set_bit(guest_ext, vcpu->arch.isa);
else if (!reg_val &&
kvm_riscv_vcpu_isa_disable_allowed(reg_num))
- clear_bit(host_isa_ext, vcpu->arch.isa);
+ clear_bit(guest_ext, vcpu->arch.isa);
else
return -EINVAL;
kvm_riscv_vcpu_fp_reset(vcpu);
@@ -1009,16 +1033,15 @@ static int copy_fp_d_reg_indices(const struct kvm_vcpu *vcpu,
static int copy_isa_ext_reg_indices(const struct kvm_vcpu *vcpu,
u64 __user *uindices)
{
+ unsigned long guest_ext;
unsigned int n = 0;
- unsigned long isa_ext;
for (int i = 0; i < KVM_RISCV_ISA_EXT_MAX; i++) {
u64 size = IS_ENABLED(CONFIG_32BIT) ?
KVM_REG_SIZE_U32 : KVM_REG_SIZE_U64;
u64 reg = KVM_REG_RISCV | size | KVM_REG_RISCV_ISA_EXT | i;
- isa_ext = kvm_isa_ext_arr[i];
- if (!__riscv_isa_extension_available(NULL, isa_ext))
+ if (kvm_riscv_vcpu_isa_check_host(i, &guest_ext))
continue;
if (uindices) {
diff --git a/arch/riscv/kvm/vcpu_sbi.c b/arch/riscv/kvm/vcpu_sbi.c
index 6e09b518a5d1..a56c4959f9ad 100644
--- a/arch/riscv/kvm/vcpu_sbi.c
+++ b/arch/riscv/kvm/vcpu_sbi.c
@@ -536,5 +536,54 @@ void kvm_riscv_vcpu_sbi_init(struct kvm_vcpu *vcpu)
scontext->ext_status[idx] = ext->default_disabled ?
KVM_RISCV_SBI_EXT_STATUS_DISABLED :
KVM_RISCV_SBI_EXT_STATUS_ENABLED;
+
+ if (ext->init && ext->init(vcpu) != 0)
+ scontext->ext_status[idx] = KVM_RISCV_SBI_EXT_STATUS_UNAVAILABLE;
+ }
+}
+
+void kvm_riscv_vcpu_sbi_deinit(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_sbi_context *scontext = &vcpu->arch.sbi_context;
+ const struct kvm_riscv_sbi_extension_entry *entry;
+ const struct kvm_vcpu_sbi_extension *ext;
+ int idx, i;
+
+ for (i = 0; i < ARRAY_SIZE(sbi_ext); i++) {
+ entry = &sbi_ext[i];
+ ext = entry->ext_ptr;
+ idx = entry->ext_idx;
+
+ if (idx < 0 || idx >= ARRAY_SIZE(scontext->ext_status))
+ continue;
+
+ if (scontext->ext_status[idx] == KVM_RISCV_SBI_EXT_STATUS_UNAVAILABLE ||
+ !ext->deinit)
+ continue;
+
+ ext->deinit(vcpu);
+ }
+}
+
+void kvm_riscv_vcpu_sbi_reset(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_sbi_context *scontext = &vcpu->arch.sbi_context;
+ const struct kvm_riscv_sbi_extension_entry *entry;
+ const struct kvm_vcpu_sbi_extension *ext;
+ int idx, i;
+
+ for (i = 0; i < ARRAY_SIZE(sbi_ext); i++) {
+ entry = &sbi_ext[i];
+ ext = entry->ext_ptr;
+ idx = entry->ext_idx;
+
+ if (idx < 0 || idx >= ARRAY_SIZE(scontext->ext_status))
+ continue;
+
+ if (scontext->ext_status[idx] != KVM_RISCV_SBI_EXT_STATUS_ENABLED ||
+ !ext->reset)
+ continue;
+
+ ext->reset(vcpu);
}
}
diff --git a/arch/riscv/kvm/vcpu_sbi_replace.c b/arch/riscv/kvm/vcpu_sbi_replace.c
index b17fad091bab..b490ed1428a6 100644
--- a/arch/riscv/kvm/vcpu_sbi_replace.c
+++ b/arch/riscv/kvm/vcpu_sbi_replace.c
@@ -96,6 +96,7 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run
unsigned long hmask = cp->a0;
unsigned long hbase = cp->a1;
unsigned long funcid = cp->a6;
+ unsigned long vmid;
switch (funcid) {
case SBI_EXT_RFENCE_REMOTE_FENCE_I:
@@ -103,22 +104,22 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run
kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_FENCE_I_SENT);
break;
case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA:
+ vmid = READ_ONCE(vcpu->kvm->arch.vmid.vmid);
if ((cp->a2 == 0 && cp->a3 == 0) || cp->a3 == -1UL)
- kvm_riscv_hfence_vvma_all(vcpu->kvm, hbase, hmask);
+ kvm_riscv_hfence_vvma_all(vcpu->kvm, hbase, hmask, vmid);
else
kvm_riscv_hfence_vvma_gva(vcpu->kvm, hbase, hmask,
- cp->a2, cp->a3, PAGE_SHIFT);
+ cp->a2, cp->a3, PAGE_SHIFT, vmid);
kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_SENT);
break;
case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID:
+ vmid = READ_ONCE(vcpu->kvm->arch.vmid.vmid);
if ((cp->a2 == 0 && cp->a3 == 0) || cp->a3 == -1UL)
- kvm_riscv_hfence_vvma_asid_all(vcpu->kvm,
- hbase, hmask, cp->a4);
+ kvm_riscv_hfence_vvma_asid_all(vcpu->kvm, hbase, hmask,
+ cp->a4, vmid);
else
- kvm_riscv_hfence_vvma_asid_gva(vcpu->kvm,
- hbase, hmask,
- cp->a2, cp->a3,
- PAGE_SHIFT, cp->a4);
+ kvm_riscv_hfence_vvma_asid_gva(vcpu->kvm, hbase, hmask, cp->a2,
+ cp->a3, PAGE_SHIFT, cp->a4, vmid);
kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_ASID_SENT);
break;
case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA:
diff --git a/arch/riscv/kvm/vcpu_sbi_sta.c b/arch/riscv/kvm/vcpu_sbi_sta.c
index 5f35427114c1..cc6cb7c8f0e4 100644
--- a/arch/riscv/kvm/vcpu_sbi_sta.c
+++ b/arch/riscv/kvm/vcpu_sbi_sta.c
@@ -16,7 +16,7 @@
#include <asm/sbi.h>
#include <asm/uaccess.h>
-void kvm_riscv_vcpu_sbi_sta_reset(struct kvm_vcpu *vcpu)
+static void kvm_riscv_vcpu_sbi_sta_reset(struct kvm_vcpu *vcpu)
{
vcpu->arch.sta.shmem = INVALID_GPA;
vcpu->arch.sta.last_steal = 0;
@@ -156,6 +156,7 @@ const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_sta = {
.extid_end = SBI_EXT_STA,
.handler = kvm_sbi_ext_sta_handler,
.probe = kvm_sbi_ext_sta_probe,
+ .reset = kvm_riscv_vcpu_sbi_sta_reset,
};
int kvm_riscv_vcpu_get_reg_sbi_sta(struct kvm_vcpu *vcpu,
diff --git a/arch/riscv/kvm/vcpu_sbi_v01.c b/arch/riscv/kvm/vcpu_sbi_v01.c
index 8f4c4fa16227..368dfddd23d9 100644
--- a/arch/riscv/kvm/vcpu_sbi_v01.c
+++ b/arch/riscv/kvm/vcpu_sbi_v01.c
@@ -23,6 +23,7 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
struct kvm *kvm = vcpu->kvm;
struct kvm_cpu_context *cp = &vcpu->arch.guest_context;
struct kvm_cpu_trap *utrap = retdata->utrap;
+ unsigned long vmid;
switch (cp->a7) {
case SBI_EXT_0_1_CONSOLE_GETCHAR:
@@ -78,25 +79,21 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
if (cp->a7 == SBI_EXT_0_1_REMOTE_FENCE_I)
kvm_riscv_fence_i(vcpu->kvm, 0, hmask);
else if (cp->a7 == SBI_EXT_0_1_REMOTE_SFENCE_VMA) {
+ vmid = READ_ONCE(vcpu->kvm->arch.vmid.vmid);
if (cp->a1 == 0 && cp->a2 == 0)
- kvm_riscv_hfence_vvma_all(vcpu->kvm,
- 0, hmask);
+ kvm_riscv_hfence_vvma_all(vcpu->kvm, 0, hmask, vmid);
else
- kvm_riscv_hfence_vvma_gva(vcpu->kvm,
- 0, hmask,
- cp->a1, cp->a2,
- PAGE_SHIFT);
+ kvm_riscv_hfence_vvma_gva(vcpu->kvm, 0, hmask, cp->a1,
+ cp->a2, PAGE_SHIFT, vmid);
} else {
+ vmid = READ_ONCE(vcpu->kvm->arch.vmid.vmid);
if (cp->a1 == 0 && cp->a2 == 0)
- kvm_riscv_hfence_vvma_asid_all(vcpu->kvm,
- 0, hmask,
- cp->a3);
+ kvm_riscv_hfence_vvma_asid_all(vcpu->kvm, 0, hmask,
+ cp->a3, vmid);
else
- kvm_riscv_hfence_vvma_asid_gva(vcpu->kvm,
- 0, hmask,
- cp->a1, cp->a2,
- PAGE_SHIFT,
- cp->a3);
+ kvm_riscv_hfence_vvma_asid_gva(vcpu->kvm, 0, hmask,
+ cp->a1, cp->a2, PAGE_SHIFT,
+ cp->a3, vmid);
}
break;
default:
diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c
index b27ec8f96697..66d91ae6e9b2 100644
--- a/arch/riscv/kvm/vm.c
+++ b/arch/riscv/kvm/vm.c
@@ -11,6 +11,7 @@
#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/kvm_host.h>
+#include <asm/kvm_mmu.h>
const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
KVM_GENERIC_VM_STATS()
@@ -31,13 +32,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
int r;
- r = kvm_riscv_gstage_alloc_pgd(kvm);
+ r = kvm_riscv_mmu_alloc_pgd(kvm);
if (r)
return r;
r = kvm_riscv_gstage_vmid_init(kvm);
if (r) {
- kvm_riscv_gstage_free_pgd(kvm);
+ kvm_riscv_mmu_free_pgd(kvm);
return r;
}
@@ -199,7 +200,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = KVM_USER_MEM_SLOTS;
break;
case KVM_CAP_VM_GPA_BITS:
- r = kvm_riscv_gstage_gpa_bits();
+ r = kvm_riscv_gstage_gpa_bits;
break;
default:
r = 0;
diff --git a/arch/riscv/kvm/vmid.c b/arch/riscv/kvm/vmid.c
index ddc98714ce8e..3b426c800480 100644
--- a/arch/riscv/kvm/vmid.c
+++ b/arch/riscv/kvm/vmid.c
@@ -14,6 +14,8 @@
#include <linux/smp.h>
#include <linux/kvm_host.h>
#include <asm/csr.h>
+#include <asm/kvm_tlb.h>
+#include <asm/kvm_vmid.h>
static unsigned long vmid_version = 1;
static unsigned long vmid_next;
@@ -122,3 +124,26 @@ void kvm_riscv_gstage_vmid_update(struct kvm_vcpu *vcpu)
kvm_for_each_vcpu(i, v, vcpu->kvm)
kvm_make_request(KVM_REQ_UPDATE_HGATP, v);
}
+
+void kvm_riscv_gstage_vmid_sanitize(struct kvm_vcpu *vcpu)
+{
+ unsigned long vmid;
+
+ if (!kvm_riscv_gstage_vmid_bits() ||
+ vcpu->arch.last_exit_cpu == vcpu->cpu)
+ return;
+
+ /*
+ * On RISC-V platforms with hardware VMID support, we share same
+ * VMID for all VCPUs of a particular Guest/VM. This means we might
+ * have stale G-stage TLB entries on the current Host CPU due to
+ * some other VCPU of the same Guest which ran previously on the
+ * current Host CPU.
+ *
+ * To cleanup stale TLB entries, we simply flush all G-stage TLB
+ * entries by VMID whenever underlying Host CPU changes for a VCPU.
+ */
+
+ vmid = READ_ONCE(vcpu->kvm->arch.vmid.vmid);
+ kvm_riscv_local_hfence_gvma_vmid_all(vmid);
+}
diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c
index 0194324a0c50..04ed6f8acae4 100644
--- a/arch/riscv/mm/fault.c
+++ b/arch/riscv/mm/fault.c
@@ -20,6 +20,9 @@
#include <asm/ptrace.h>
#include <asm/tlbflush.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/exceptions.h>
+
#include "../kernel/head.h"
static void show_pte(unsigned long addr)
@@ -291,6 +294,11 @@ void handle_page_fault(struct pt_regs *regs)
if (kprobe_page_fault(regs, cause))
return;
+ if (user_mode(regs))
+ trace_page_fault_user(addr, regs, cause);
+ else
+ trace_page_fault_kernel(addr, regs, cause);
+
/*
* Fault-in kernel-space virtual memory on-demand.
* The 'reference' page table is init_mm.pgd.
diff --git a/arch/riscv/mm/pageattr.c b/arch/riscv/mm/pageattr.c
index d815448758a1..3f76db3d2769 100644
--- a/arch/riscv/mm/pageattr.c
+++ b/arch/riscv/mm/pageattr.c
@@ -299,7 +299,7 @@ static int __set_memory(unsigned long addr, int numpages, pgprot_t set_mask,
if (ret)
goto unlock;
- ret = walk_page_range_novma(&init_mm, lm_start, lm_end,
+ ret = walk_kernel_page_table_range(lm_start, lm_end,
&pageattr_ops, NULL, &masks);
if (ret)
goto unlock;
@@ -317,13 +317,13 @@ static int __set_memory(unsigned long addr, int numpages, pgprot_t set_mask,
if (ret)
goto unlock;
- ret = walk_page_range_novma(&init_mm, lm_start, lm_end,
+ ret = walk_kernel_page_table_range(lm_start, lm_end,
&pageattr_ops, NULL, &masks);
if (ret)
goto unlock;
}
- ret = walk_page_range_novma(&init_mm, start, end, &pageattr_ops, NULL,
+ ret = walk_kernel_page_table_range(start, end, &pageattr_ops, NULL,
&masks);
unlock:
@@ -335,7 +335,7 @@ unlock:
*/
flush_tlb_all();
#else
- ret = walk_page_range_novma(&init_mm, start, end, &pageattr_ops, NULL,
+ ret = walk_kernel_page_table_range(start, end, &pageattr_ops, NULL,
&masks);
mmap_write_unlock(&init_mm);
diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c
index 32922550a50a..3b51690cc876 100644
--- a/arch/riscv/mm/ptdump.c
+++ b/arch/riscv/mm/ptdump.c
@@ -6,7 +6,6 @@
#include <linux/efi.h>
#include <linux/init.h>
#include <linux/debugfs.h>
-#include <linux/memory_hotplug.h>
#include <linux/seq_file.h>
#include <linux/ptdump.h>
@@ -413,9 +412,7 @@ bool ptdump_check_wx(void)
static int ptdump_show(struct seq_file *m, void *v)
{
- get_online_mems();
ptdump_walk(m, m->private);
- put_online_mems();
return 0;
}
diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c
index e737ba7949b1..8404530ec00f 100644
--- a/arch/riscv/mm/tlbflush.c
+++ b/arch/riscv/mm/tlbflush.c
@@ -234,11 +234,6 @@ void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
}
-void arch_flush_tlb_batched_pending(struct mm_struct *mm)
-{
- flush_tlb_mm(mm);
-}
-
void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
{
__flush_tlb_range(NULL, &batch->cpumask,
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 25a773e6596e..f0c0469e553d 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -131,6 +131,7 @@ config S390
select ARCH_INLINE_WRITE_UNLOCK_IRQ
select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
+ select ARCH_MODULE_NEEDS_WEAK_PER_CPU
select ARCH_STACKWALK
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_DEBUG_PAGEALLOC
@@ -198,7 +199,6 @@ config S390
select HAVE_GUP_FAST
select HAVE_FENTRY
select HAVE_FTRACE_GRAPH_FUNC
- select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_ARG_ACCESS_API
select HAVE_FUNCTION_ERROR_INJECTION
select HAVE_FUNCTION_GRAPH_FREGS
diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig
index a7db7ed28720..6b33429f1c4d 100644
--- a/arch/s390/configs/debug_defconfig
+++ b/arch/s390/configs/debug_defconfig
@@ -248,7 +248,6 @@ CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
CONFIG_NETFILTER_XT_MATCH_CPU=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m
CONFIG_NETFILTER_XT_MATCH_DSCP=m
CONFIG_NETFILTER_XT_MATCH_ESP=m
@@ -817,6 +816,7 @@ CONFIG_PKEY_EP11=m
CONFIG_PKEY_PCKMO=m
CONFIG_PKEY_UV=m
CONFIG_CRYPTO_PAES_S390=m
+CONFIG_CRYPTO_PHMAC_S390=m
CONFIG_CRYPTO_DEV_VIRTIO=m
CONFIG_SYSTEM_BLACKLIST_KEYRING=y
CONFIG_CRYPTO_KRB5=m
diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig
index 0217ed5616bf..b75eb2775850 100644
--- a/arch/s390/configs/defconfig
+++ b/arch/s390/configs/defconfig
@@ -239,7 +239,6 @@ CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
CONFIG_NETFILTER_XT_MATCH_CPU=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m
CONFIG_NETFILTER_XT_MATCH_DSCP=m
CONFIG_NETFILTER_XT_MATCH_ESP=m
@@ -804,6 +803,7 @@ CONFIG_PKEY_EP11=m
CONFIG_PKEY_PCKMO=m
CONFIG_PKEY_UV=m
CONFIG_CRYPTO_PAES_S390=m
+CONFIG_CRYPTO_PHMAC_S390=m
CONFIG_CRYPTO_DEV_VIRTIO=m
CONFIG_SYSTEM_BLACKLIST_KEYRING=y
CONFIG_CRYPTO_KRB5=m
diff --git a/arch/s390/crypto/Makefile b/arch/s390/crypto/Makefile
index 1e5a1038d491..998f4b656b18 100644
--- a/arch/s390/crypto/Makefile
+++ b/arch/s390/crypto/Makefile
@@ -11,4 +11,5 @@ obj-$(CONFIG_CRYPTO_PAES_S390) += paes_s390.o
obj-$(CONFIG_S390_PRNG) += prng.o
obj-$(CONFIG_CRYPTO_GHASH_S390) += ghash_s390.o
obj-$(CONFIG_CRYPTO_HMAC_S390) += hmac_s390.o
+obj-$(CONFIG_CRYPTO_PHMAC_S390) += phmac_s390.o
obj-y += arch_random.o
diff --git a/arch/s390/crypto/hmac_s390.c b/arch/s390/crypto/hmac_s390.c
index 93a1098d9f8d..58444da9b004 100644
--- a/arch/s390/crypto/hmac_s390.c
+++ b/arch/s390/crypto/hmac_s390.c
@@ -290,6 +290,7 @@ static int s390_hmac_export(struct shash_desc *desc, void *out)
struct s390_kmac_sha2_ctx *ctx = shash_desc_ctx(desc);
unsigned int bs = crypto_shash_blocksize(desc->tfm);
unsigned int ds = bs / 2;
+ u64 lo = ctx->buflen[0];
union {
u8 *u8;
u64 *u64;
@@ -301,9 +302,10 @@ static int s390_hmac_export(struct shash_desc *desc, void *out)
else
memcpy(p.u8, ctx->param, ds);
p.u8 += ds;
- put_unaligned(ctx->buflen[0], p.u64++);
+ lo += bs;
+ put_unaligned(lo, p.u64++);
if (ds == SHA512_DIGEST_SIZE)
- put_unaligned(ctx->buflen[1], p.u64);
+ put_unaligned(ctx->buflen[1] + (lo < bs), p.u64);
return err;
}
@@ -316,14 +318,16 @@ static int s390_hmac_import(struct shash_desc *desc, const void *in)
const u8 *u8;
const u64 *u64;
} p = { .u8 = in };
+ u64 lo;
int err;
err = s390_hmac_sha2_init(desc);
memcpy(ctx->param, p.u8, ds);
p.u8 += ds;
- ctx->buflen[0] = get_unaligned(p.u64++);
+ lo = get_unaligned(p.u64++);
+ ctx->buflen[0] = lo - bs;
if (ds == SHA512_DIGEST_SIZE)
- ctx->buflen[1] = get_unaligned(p.u64);
+ ctx->buflen[1] = get_unaligned(p.u64) - (lo < bs);
if (ctx->buflen[0] | ctx->buflen[1])
ctx->gr0.ikp = 1;
return err;
diff --git a/arch/s390/crypto/paes_s390.c b/arch/s390/crypto/paes_s390.c
index 8a340c16acb4..a624a43a2b54 100644
--- a/arch/s390/crypto/paes_s390.c
+++ b/arch/s390/crypto/paes_s390.c
@@ -1633,7 +1633,7 @@ static int __init paes_s390_init(void)
/* with this pseudo devie alloc and start a crypto engine */
paes_crypto_engine =
crypto_engine_alloc_init_and_set(paes_dev.this_device,
- true, NULL, false, MAX_QLEN);
+ true, false, MAX_QLEN);
if (!paes_crypto_engine) {
rc = -ENOMEM;
goto out_err;
diff --git a/arch/s390/crypto/phmac_s390.c b/arch/s390/crypto/phmac_s390.c
new file mode 100644
index 000000000000..7ecfdc4fba2d
--- /dev/null
+++ b/arch/s390/crypto/phmac_s390.c
@@ -0,0 +1,1048 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright IBM Corp. 2025
+ *
+ * s390 specific HMAC support for protected keys.
+ */
+
+#define KMSG_COMPONENT "phmac_s390"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <asm/cpacf.h>
+#include <asm/pkey.h>
+#include <crypto/engine.h>
+#include <crypto/hash.h>
+#include <crypto/internal/hash.h>
+#include <crypto/sha2.h>
+#include <linux/atomic.h>
+#include <linux/cpufeature.h>
+#include <linux/delay.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+
+static struct crypto_engine *phmac_crypto_engine;
+#define MAX_QLEN 10
+
+/*
+ * A simple hash walk helper
+ */
+
+struct hash_walk_helper {
+ struct crypto_hash_walk walk;
+ const u8 *walkaddr;
+ int walkbytes;
+};
+
+/*
+ * Prepare hash walk helper.
+ * Set up the base hash walk, fill walkaddr and walkbytes.
+ * Returns 0 on success or negative value on error.
+ */
+static inline int hwh_prepare(struct ahash_request *req,
+ struct hash_walk_helper *hwh)
+{
+ hwh->walkbytes = crypto_hash_walk_first(req, &hwh->walk);
+ if (hwh->walkbytes < 0)
+ return hwh->walkbytes;
+ hwh->walkaddr = hwh->walk.data;
+ return 0;
+}
+
+/*
+ * Advance hash walk helper by n bytes.
+ * Progress the walkbytes and walkaddr fields by n bytes.
+ * If walkbytes is then 0, pull next hunk from hash walk
+ * and update walkbytes and walkaddr.
+ * If n is negative, unmap hash walk and return error.
+ * Returns 0 on success or negative value on error.
+ */
+static inline int hwh_advance(struct hash_walk_helper *hwh, int n)
+{
+ if (n < 0)
+ return crypto_hash_walk_done(&hwh->walk, n);
+
+ hwh->walkbytes -= n;
+ hwh->walkaddr += n;
+ if (hwh->walkbytes > 0)
+ return 0;
+
+ hwh->walkbytes = crypto_hash_walk_done(&hwh->walk, 0);
+ if (hwh->walkbytes < 0)
+ return hwh->walkbytes;
+
+ hwh->walkaddr = hwh->walk.data;
+ return 0;
+}
+
+/*
+ * KMAC param block layout for sha2 function codes:
+ * The layout of the param block for the KMAC instruction depends on the
+ * blocksize of the used hashing sha2-algorithm function codes. The param block
+ * contains the hash chaining value (cv), the input message bit-length (imbl)
+ * and the hmac-secret (key). To prevent code duplication, the sizes of all
+ * these are calculated based on the blocksize.
+ *
+ * param-block:
+ * +-------+
+ * | cv |
+ * +-------+
+ * | imbl |
+ * +-------+
+ * | key |
+ * +-------+
+ *
+ * sizes:
+ * part | sh2-alg | calculation | size | type
+ * -----+---------+-------------+------+--------
+ * cv | 224/256 | blocksize/2 | 32 | u64[8]
+ * | 384/512 | | 64 | u128[8]
+ * imbl | 224/256 | blocksize/8 | 8 | u64
+ * | 384/512 | | 16 | u128
+ * key | 224/256 | blocksize | 96 | u8[96]
+ * | 384/512 | | 160 | u8[160]
+ */
+
+#define MAX_DIGEST_SIZE SHA512_DIGEST_SIZE
+#define MAX_IMBL_SIZE sizeof(u128)
+#define MAX_BLOCK_SIZE SHA512_BLOCK_SIZE
+
+#define SHA2_CV_SIZE(bs) ((bs) >> 1)
+#define SHA2_IMBL_SIZE(bs) ((bs) >> 3)
+
+#define SHA2_IMBL_OFFSET(bs) (SHA2_CV_SIZE(bs))
+#define SHA2_KEY_OFFSET(bs) (SHA2_CV_SIZE(bs) + SHA2_IMBL_SIZE(bs))
+
+#define PHMAC_MAX_KEYSIZE 256
+#define PHMAC_SHA256_PK_SIZE (SHA256_BLOCK_SIZE + 32)
+#define PHMAC_SHA512_PK_SIZE (SHA512_BLOCK_SIZE + 32)
+#define PHMAC_MAX_PK_SIZE PHMAC_SHA512_PK_SIZE
+
+/* phmac protected key struct */
+struct phmac_protkey {
+ u32 type;
+ u32 len;
+ u8 protkey[PHMAC_MAX_PK_SIZE];
+};
+
+#define PK_STATE_NO_KEY 0
+#define PK_STATE_CONVERT_IN_PROGRESS 1
+#define PK_STATE_VALID 2
+
+/* phmac tfm context */
+struct phmac_tfm_ctx {
+ /* source key material used to derive a protected key from */
+ u8 keybuf[PHMAC_MAX_KEYSIZE];
+ unsigned int keylen;
+
+ /* cpacf function code to use with this protected key type */
+ long fc;
+
+ /* nr of requests enqueued via crypto engine which use this tfm ctx */
+ atomic_t via_engine_ctr;
+
+ /* spinlock to atomic read/update all the following fields */
+ spinlock_t pk_lock;
+
+ /* see PK_STATE* defines above, < 0 holds convert failure rc */
+ int pk_state;
+ /* if state is valid, pk holds the protected key */
+ struct phmac_protkey pk;
+};
+
+union kmac_gr0 {
+ unsigned long reg;
+ struct {
+ unsigned long : 48;
+ unsigned long ikp : 1;
+ unsigned long iimp : 1;
+ unsigned long ccup : 1;
+ unsigned long : 6;
+ unsigned long fc : 7;
+ };
+};
+
+struct kmac_sha2_ctx {
+ u8 param[MAX_DIGEST_SIZE + MAX_IMBL_SIZE + PHMAC_MAX_PK_SIZE];
+ union kmac_gr0 gr0;
+ u8 buf[MAX_BLOCK_SIZE];
+ u64 buflen[2];
+};
+
+/* phmac request context */
+struct phmac_req_ctx {
+ struct hash_walk_helper hwh;
+ struct kmac_sha2_ctx kmac_ctx;
+ bool final;
+};
+
+/*
+ * Pkey 'token' struct used to derive a protected key value from a clear key.
+ */
+struct hmac_clrkey_token {
+ u8 type;
+ u8 res0[3];
+ u8 version;
+ u8 res1[3];
+ u32 keytype;
+ u32 len;
+ u8 key[];
+} __packed;
+
+static int hash_key(const u8 *in, unsigned int inlen,
+ u8 *digest, unsigned int digestsize)
+{
+ unsigned long func;
+ union {
+ struct sha256_paramblock {
+ u32 h[8];
+ u64 mbl;
+ } sha256;
+ struct sha512_paramblock {
+ u64 h[8];
+ u128 mbl;
+ } sha512;
+ } __packed param;
+
+#define PARAM_INIT(x, y, z) \
+ param.sha##x.h[0] = SHA##y ## _H0; \
+ param.sha##x.h[1] = SHA##y ## _H1; \
+ param.sha##x.h[2] = SHA##y ## _H2; \
+ param.sha##x.h[3] = SHA##y ## _H3; \
+ param.sha##x.h[4] = SHA##y ## _H4; \
+ param.sha##x.h[5] = SHA##y ## _H5; \
+ param.sha##x.h[6] = SHA##y ## _H6; \
+ param.sha##x.h[7] = SHA##y ## _H7; \
+ param.sha##x.mbl = (z)
+
+ switch (digestsize) {
+ case SHA224_DIGEST_SIZE:
+ func = CPACF_KLMD_SHA_256;
+ PARAM_INIT(256, 224, inlen * 8);
+ break;
+ case SHA256_DIGEST_SIZE:
+ func = CPACF_KLMD_SHA_256;
+ PARAM_INIT(256, 256, inlen * 8);
+ break;
+ case SHA384_DIGEST_SIZE:
+ func = CPACF_KLMD_SHA_512;
+ PARAM_INIT(512, 384, inlen * 8);
+ break;
+ case SHA512_DIGEST_SIZE:
+ func = CPACF_KLMD_SHA_512;
+ PARAM_INIT(512, 512, inlen * 8);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+#undef PARAM_INIT
+
+ cpacf_klmd(func, &param, in, inlen);
+
+ memcpy(digest, &param, digestsize);
+
+ return 0;
+}
+
+/*
+ * make_clrkey_token() - wrap the clear key into a pkey clearkey token.
+ */
+static inline int make_clrkey_token(const u8 *clrkey, size_t clrkeylen,
+ unsigned int digestsize, u8 *dest)
+{
+ struct hmac_clrkey_token *token = (struct hmac_clrkey_token *)dest;
+ unsigned int blocksize;
+ int rc;
+
+ token->type = 0x00;
+ token->version = 0x02;
+ switch (digestsize) {
+ case SHA224_DIGEST_SIZE:
+ case SHA256_DIGEST_SIZE:
+ token->keytype = PKEY_KEYTYPE_HMAC_512;
+ blocksize = 64;
+ break;
+ case SHA384_DIGEST_SIZE:
+ case SHA512_DIGEST_SIZE:
+ token->keytype = PKEY_KEYTYPE_HMAC_1024;
+ blocksize = 128;
+ break;
+ default:
+ return -EINVAL;
+ }
+ token->len = blocksize;
+
+ if (clrkeylen > blocksize) {
+ rc = hash_key(clrkey, clrkeylen, token->key, digestsize);
+ if (rc)
+ return rc;
+ } else {
+ memcpy(token->key, clrkey, clrkeylen);
+ }
+
+ return 0;
+}
+
+/*
+ * phmac_tfm_ctx_setkey() - Set key value into tfm context, maybe construct
+ * a clear key token digestible by pkey from a clear key value.
+ */
+static inline int phmac_tfm_ctx_setkey(struct phmac_tfm_ctx *tfm_ctx,
+ const u8 *key, unsigned int keylen)
+{
+ if (keylen > sizeof(tfm_ctx->keybuf))
+ return -EINVAL;
+
+ memcpy(tfm_ctx->keybuf, key, keylen);
+ tfm_ctx->keylen = keylen;
+
+ return 0;
+}
+
+/*
+ * Convert the raw key material into a protected key via PKEY api.
+ * This function may sleep - don't call in non-sleeping context.
+ */
+static inline int convert_key(const u8 *key, unsigned int keylen,
+ struct phmac_protkey *pk)
+{
+ int rc, i;
+
+ pk->len = sizeof(pk->protkey);
+
+ /*
+ * In case of a busy card retry with increasing delay
+ * of 200, 400, 800 and 1600 ms - in total 3 s.
+ */
+ for (rc = -EIO, i = 0; rc && i < 5; i++) {
+ if (rc == -EBUSY && msleep_interruptible((1 << i) * 100)) {
+ rc = -EINTR;
+ goto out;
+ }
+ rc = pkey_key2protkey(key, keylen,
+ pk->protkey, &pk->len, &pk->type,
+ PKEY_XFLAG_NOMEMALLOC);
+ }
+
+out:
+ pr_debug("rc=%d\n", rc);
+ return rc;
+}
+
+/*
+ * (Re-)Convert the raw key material from the tfm ctx into a protected
+ * key via convert_key() function. Update the pk_state, pk_type, pk_len
+ * and the protected key in the tfm context.
+ * Please note this function may be invoked concurrently with the very
+ * same tfm context. The pk_lock spinlock in the context ensures an
+ * atomic update of the pk and the pk state but does not guarantee any
+ * order of update. So a fresh converted valid protected key may get
+ * updated with an 'old' expired key value. As the cpacf instructions
+ * detect this, refuse to operate with an invalid key and the calling
+ * code triggers a (re-)conversion this does no harm. This may lead to
+ * unnecessary additional conversion but never to invalid data on the
+ * hash operation.
+ */
+static int phmac_convert_key(struct phmac_tfm_ctx *tfm_ctx)
+{
+ struct phmac_protkey pk;
+ int rc;
+
+ spin_lock_bh(&tfm_ctx->pk_lock);
+ tfm_ctx->pk_state = PK_STATE_CONVERT_IN_PROGRESS;
+ spin_unlock_bh(&tfm_ctx->pk_lock);
+
+ rc = convert_key(tfm_ctx->keybuf, tfm_ctx->keylen, &pk);
+
+ /* update context */
+ spin_lock_bh(&tfm_ctx->pk_lock);
+ if (rc) {
+ tfm_ctx->pk_state = rc;
+ } else {
+ tfm_ctx->pk_state = PK_STATE_VALID;
+ tfm_ctx->pk = pk;
+ }
+ spin_unlock_bh(&tfm_ctx->pk_lock);
+
+ memzero_explicit(&pk, sizeof(pk));
+ pr_debug("rc=%d\n", rc);
+ return rc;
+}
+
+/*
+ * kmac_sha2_set_imbl - sets the input message bit-length based on the blocksize
+ */
+static inline void kmac_sha2_set_imbl(u8 *param, u64 buflen_lo,
+ u64 buflen_hi, unsigned int blocksize)
+{
+ u8 *imbl = param + SHA2_IMBL_OFFSET(blocksize);
+
+ switch (blocksize) {
+ case SHA256_BLOCK_SIZE:
+ *(u64 *)imbl = buflen_lo * BITS_PER_BYTE;
+ break;
+ case SHA512_BLOCK_SIZE:
+ *(u128 *)imbl = (((u128)buflen_hi << 64) + buflen_lo) << 3;
+ break;
+ default:
+ break;
+ }
+}
+
+static int phmac_kmac_update(struct ahash_request *req, bool maysleep)
+{
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct phmac_tfm_ctx *tfm_ctx = crypto_ahash_ctx(tfm);
+ struct phmac_req_ctx *req_ctx = ahash_request_ctx(req);
+ struct kmac_sha2_ctx *ctx = &req_ctx->kmac_ctx;
+ struct hash_walk_helper *hwh = &req_ctx->hwh;
+ unsigned int bs = crypto_ahash_blocksize(tfm);
+ unsigned int offset, k, n;
+ int rc = 0;
+
+ /*
+ * The walk is always mapped when this function is called.
+ * Note that in case of partial processing or failure the walk
+ * is NOT unmapped here. So a follow up task may reuse the walk
+ * or in case of unrecoverable failure needs to unmap it.
+ */
+
+ while (hwh->walkbytes > 0) {
+ /* check sha2 context buffer */
+ offset = ctx->buflen[0] % bs;
+ if (offset + hwh->walkbytes < bs)
+ goto store;
+
+ if (offset) {
+ /* fill ctx buffer up to blocksize and process this block */
+ n = bs - offset;
+ memcpy(ctx->buf + offset, hwh->walkaddr, n);
+ ctx->gr0.iimp = 1;
+ for (;;) {
+ k = _cpacf_kmac(&ctx->gr0.reg, ctx->param, ctx->buf, bs);
+ if (likely(k == bs))
+ break;
+ if (unlikely(k > 0)) {
+ /*
+ * Can't deal with hunks smaller than blocksize.
+ * And kmac should always return the nr of
+ * processed bytes as 0 or a multiple of the
+ * blocksize.
+ */
+ rc = -EIO;
+ goto out;
+ }
+ /* protected key is invalid and needs re-conversion */
+ if (!maysleep) {
+ rc = -EKEYEXPIRED;
+ goto out;
+ }
+ rc = phmac_convert_key(tfm_ctx);
+ if (rc)
+ goto out;
+ spin_lock_bh(&tfm_ctx->pk_lock);
+ memcpy(ctx->param + SHA2_KEY_OFFSET(bs),
+ tfm_ctx->pk.protkey, tfm_ctx->pk.len);
+ spin_unlock_bh(&tfm_ctx->pk_lock);
+ }
+ ctx->buflen[0] += n;
+ if (ctx->buflen[0] < n)
+ ctx->buflen[1]++;
+ rc = hwh_advance(hwh, n);
+ if (unlikely(rc))
+ goto out;
+ offset = 0;
+ }
+
+ /* process as many blocks as possible from the walk */
+ while (hwh->walkbytes >= bs) {
+ n = (hwh->walkbytes / bs) * bs;
+ ctx->gr0.iimp = 1;
+ k = _cpacf_kmac(&ctx->gr0.reg, ctx->param, hwh->walkaddr, n);
+ if (likely(k > 0)) {
+ ctx->buflen[0] += k;
+ if (ctx->buflen[0] < k)
+ ctx->buflen[1]++;
+ rc = hwh_advance(hwh, k);
+ if (unlikely(rc))
+ goto out;
+ }
+ if (unlikely(k < n)) {
+ /* protected key is invalid and needs re-conversion */
+ if (!maysleep) {
+ rc = -EKEYEXPIRED;
+ goto out;
+ }
+ rc = phmac_convert_key(tfm_ctx);
+ if (rc)
+ goto out;
+ spin_lock_bh(&tfm_ctx->pk_lock);
+ memcpy(ctx->param + SHA2_KEY_OFFSET(bs),
+ tfm_ctx->pk.protkey, tfm_ctx->pk.len);
+ spin_unlock_bh(&tfm_ctx->pk_lock);
+ }
+ }
+
+store:
+ /* store incomplete block in context buffer */
+ if (hwh->walkbytes) {
+ memcpy(ctx->buf + offset, hwh->walkaddr, hwh->walkbytes);
+ ctx->buflen[0] += hwh->walkbytes;
+ if (ctx->buflen[0] < hwh->walkbytes)
+ ctx->buflen[1]++;
+ rc = hwh_advance(hwh, hwh->walkbytes);
+ if (unlikely(rc))
+ goto out;
+ }
+
+ } /* end of while (hwh->walkbytes > 0) */
+
+out:
+ pr_debug("rc=%d\n", rc);
+ return rc;
+}
+
+static int phmac_kmac_final(struct ahash_request *req, bool maysleep)
+{
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct phmac_tfm_ctx *tfm_ctx = crypto_ahash_ctx(tfm);
+ struct phmac_req_ctx *req_ctx = ahash_request_ctx(req);
+ struct kmac_sha2_ctx *ctx = &req_ctx->kmac_ctx;
+ unsigned int ds = crypto_ahash_digestsize(tfm);
+ unsigned int bs = crypto_ahash_blocksize(tfm);
+ unsigned int k, n;
+ int rc = 0;
+
+ n = ctx->buflen[0] % bs;
+ ctx->gr0.iimp = 0;
+ kmac_sha2_set_imbl(ctx->param, ctx->buflen[0], ctx->buflen[1], bs);
+ for (;;) {
+ k = _cpacf_kmac(&ctx->gr0.reg, ctx->param, ctx->buf, n);
+ if (likely(k == n))
+ break;
+ if (unlikely(k > 0)) {
+ /* Can't deal with hunks smaller than blocksize. */
+ rc = -EIO;
+ goto out;
+ }
+ /* protected key is invalid and needs re-conversion */
+ if (!maysleep) {
+ rc = -EKEYEXPIRED;
+ goto out;
+ }
+ rc = phmac_convert_key(tfm_ctx);
+ if (rc)
+ goto out;
+ spin_lock_bh(&tfm_ctx->pk_lock);
+ memcpy(ctx->param + SHA2_KEY_OFFSET(bs),
+ tfm_ctx->pk.protkey, tfm_ctx->pk.len);
+ spin_unlock_bh(&tfm_ctx->pk_lock);
+ }
+
+ memcpy(req->result, ctx->param, ds);
+
+out:
+ pr_debug("rc=%d\n", rc);
+ return rc;
+}
+
+static int phmac_init(struct ahash_request *req)
+{
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct phmac_tfm_ctx *tfm_ctx = crypto_ahash_ctx(tfm);
+ struct phmac_req_ctx *req_ctx = ahash_request_ctx(req);
+ struct kmac_sha2_ctx *kmac_ctx = &req_ctx->kmac_ctx;
+ unsigned int bs = crypto_ahash_blocksize(tfm);
+ int rc = 0;
+
+ /* zero request context (includes the kmac sha2 context) */
+ memset(req_ctx, 0, sizeof(*req_ctx));
+
+ /*
+ * setkey() should have set a valid fc into the tfm context.
+ * Copy this function code into the gr0 field of the kmac context.
+ */
+ if (!tfm_ctx->fc) {
+ rc = -ENOKEY;
+ goto out;
+ }
+ kmac_ctx->gr0.fc = tfm_ctx->fc;
+
+ /*
+ * Copy the pk from tfm ctx into kmac ctx. The protected key
+ * may be outdated but update() and final() will handle this.
+ */
+ spin_lock_bh(&tfm_ctx->pk_lock);
+ memcpy(kmac_ctx->param + SHA2_KEY_OFFSET(bs),
+ tfm_ctx->pk.protkey, tfm_ctx->pk.len);
+ spin_unlock_bh(&tfm_ctx->pk_lock);
+
+out:
+ pr_debug("rc=%d\n", rc);
+ return rc;
+}
+
+static int phmac_update(struct ahash_request *req)
+{
+ struct phmac_req_ctx *req_ctx = ahash_request_ctx(req);
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct phmac_tfm_ctx *tfm_ctx = crypto_ahash_ctx(tfm);
+ struct kmac_sha2_ctx *kmac_ctx = &req_ctx->kmac_ctx;
+ struct hash_walk_helper *hwh = &req_ctx->hwh;
+ int rc;
+
+ /* prep the walk in the request context */
+ rc = hwh_prepare(req, hwh);
+ if (rc)
+ goto out;
+
+ /* Try synchronous operation if no active engine usage */
+ if (!atomic_read(&tfm_ctx->via_engine_ctr)) {
+ rc = phmac_kmac_update(req, false);
+ if (rc == 0)
+ goto out;
+ }
+
+ /*
+ * If sync operation failed or key expired or there are already
+ * requests enqueued via engine, fallback to async. Mark tfm as
+ * using engine to serialize requests.
+ */
+ if (rc == 0 || rc == -EKEYEXPIRED) {
+ atomic_inc(&tfm_ctx->via_engine_ctr);
+ rc = crypto_transfer_hash_request_to_engine(phmac_crypto_engine, req);
+ if (rc != -EINPROGRESS)
+ atomic_dec(&tfm_ctx->via_engine_ctr);
+ }
+
+ if (rc != -EINPROGRESS) {
+ hwh_advance(hwh, rc);
+ memzero_explicit(kmac_ctx, sizeof(*kmac_ctx));
+ }
+
+out:
+ pr_debug("rc=%d\n", rc);
+ return rc;
+}
+
+static int phmac_final(struct ahash_request *req)
+{
+ struct phmac_req_ctx *req_ctx = ahash_request_ctx(req);
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct phmac_tfm_ctx *tfm_ctx = crypto_ahash_ctx(tfm);
+ struct kmac_sha2_ctx *kmac_ctx = &req_ctx->kmac_ctx;
+ int rc = 0;
+
+ /* Try synchronous operation if no active engine usage */
+ if (!atomic_read(&tfm_ctx->via_engine_ctr)) {
+ rc = phmac_kmac_final(req, false);
+ if (rc == 0)
+ goto out;
+ }
+
+ /*
+ * If sync operation failed or key expired or there are already
+ * requests enqueued via engine, fallback to async. Mark tfm as
+ * using engine to serialize requests.
+ */
+ if (rc == 0 || rc == -EKEYEXPIRED) {
+ req->nbytes = 0;
+ req_ctx->final = true;
+ atomic_inc(&tfm_ctx->via_engine_ctr);
+ rc = crypto_transfer_hash_request_to_engine(phmac_crypto_engine, req);
+ if (rc != -EINPROGRESS)
+ atomic_dec(&tfm_ctx->via_engine_ctr);
+ }
+
+out:
+ if (rc != -EINPROGRESS)
+ memzero_explicit(kmac_ctx, sizeof(*kmac_ctx));
+ pr_debug("rc=%d\n", rc);
+ return rc;
+}
+
+static int phmac_finup(struct ahash_request *req)
+{
+ struct phmac_req_ctx *req_ctx = ahash_request_ctx(req);
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct phmac_tfm_ctx *tfm_ctx = crypto_ahash_ctx(tfm);
+ struct kmac_sha2_ctx *kmac_ctx = &req_ctx->kmac_ctx;
+ struct hash_walk_helper *hwh = &req_ctx->hwh;
+ int rc;
+
+ /* prep the walk in the request context */
+ rc = hwh_prepare(req, hwh);
+ if (rc)
+ goto out;
+
+ /* Try synchronous operations if no active engine usage */
+ if (!atomic_read(&tfm_ctx->via_engine_ctr)) {
+ rc = phmac_kmac_update(req, false);
+ if (rc == 0)
+ req->nbytes = 0;
+ }
+ if (!rc && !req->nbytes && !atomic_read(&tfm_ctx->via_engine_ctr)) {
+ rc = phmac_kmac_final(req, false);
+ if (rc == 0)
+ goto out;
+ }
+
+ /*
+ * If sync operation failed or key expired or there are already
+ * requests enqueued via engine, fallback to async. Mark tfm as
+ * using engine to serialize requests.
+ */
+ if (rc == 0 || rc == -EKEYEXPIRED) {
+ req_ctx->final = true;
+ atomic_inc(&tfm_ctx->via_engine_ctr);
+ rc = crypto_transfer_hash_request_to_engine(phmac_crypto_engine, req);
+ if (rc != -EINPROGRESS)
+ atomic_dec(&tfm_ctx->via_engine_ctr);
+ }
+
+ if (rc != -EINPROGRESS)
+ hwh_advance(hwh, rc);
+
+out:
+ if (rc != -EINPROGRESS)
+ memzero_explicit(kmac_ctx, sizeof(*kmac_ctx));
+ pr_debug("rc=%d\n", rc);
+ return rc;
+}
+
+static int phmac_digest(struct ahash_request *req)
+{
+ int rc;
+
+ rc = phmac_init(req);
+ if (rc)
+ goto out;
+
+ rc = phmac_finup(req);
+
+out:
+ pr_debug("rc=%d\n", rc);
+ return rc;
+}
+
+static int phmac_setkey(struct crypto_ahash *tfm,
+ const u8 *key, unsigned int keylen)
+{
+ struct phmac_tfm_ctx *tfm_ctx = crypto_ahash_ctx(tfm);
+ unsigned int ds = crypto_ahash_digestsize(tfm);
+ unsigned int bs = crypto_ahash_blocksize(tfm);
+ unsigned int tmpkeylen;
+ u8 *tmpkey = NULL;
+ int rc = 0;
+
+ if (!crypto_ahash_tested(tfm)) {
+ /*
+ * selftest running: key is a raw hmac clear key and needs
+ * to get embedded into a 'clear key token' in order to have
+ * it correctly processed by the pkey module.
+ */
+ tmpkeylen = sizeof(struct hmac_clrkey_token) + bs;
+ tmpkey = kzalloc(tmpkeylen, GFP_KERNEL);
+ if (!tmpkey) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ rc = make_clrkey_token(key, keylen, ds, tmpkey);
+ if (rc)
+ goto out;
+ keylen = tmpkeylen;
+ key = tmpkey;
+ }
+
+ /* copy raw key into tfm context */
+ rc = phmac_tfm_ctx_setkey(tfm_ctx, key, keylen);
+ if (rc)
+ goto out;
+
+ /* convert raw key into protected key */
+ rc = phmac_convert_key(tfm_ctx);
+ if (rc)
+ goto out;
+
+ /* set function code in tfm context, check for valid pk type */
+ switch (ds) {
+ case SHA224_DIGEST_SIZE:
+ if (tfm_ctx->pk.type != PKEY_KEYTYPE_HMAC_512)
+ rc = -EINVAL;
+ else
+ tfm_ctx->fc = CPACF_KMAC_PHMAC_SHA_224;
+ break;
+ case SHA256_DIGEST_SIZE:
+ if (tfm_ctx->pk.type != PKEY_KEYTYPE_HMAC_512)
+ rc = -EINVAL;
+ else
+ tfm_ctx->fc = CPACF_KMAC_PHMAC_SHA_256;
+ break;
+ case SHA384_DIGEST_SIZE:
+ if (tfm_ctx->pk.type != PKEY_KEYTYPE_HMAC_1024)
+ rc = -EINVAL;
+ else
+ tfm_ctx->fc = CPACF_KMAC_PHMAC_SHA_384;
+ break;
+ case SHA512_DIGEST_SIZE:
+ if (tfm_ctx->pk.type != PKEY_KEYTYPE_HMAC_1024)
+ rc = -EINVAL;
+ else
+ tfm_ctx->fc = CPACF_KMAC_PHMAC_SHA_512;
+ break;
+ default:
+ tfm_ctx->fc = 0;
+ rc = -EINVAL;
+ }
+
+out:
+ kfree(tmpkey);
+ pr_debug("rc=%d\n", rc);
+ return rc;
+}
+
+static int phmac_export(struct ahash_request *req, void *out)
+{
+ struct phmac_req_ctx *req_ctx = ahash_request_ctx(req);
+ struct kmac_sha2_ctx *ctx = &req_ctx->kmac_ctx;
+
+ memcpy(out, ctx, sizeof(*ctx));
+
+ return 0;
+}
+
+static int phmac_import(struct ahash_request *req, const void *in)
+{
+ struct phmac_req_ctx *req_ctx = ahash_request_ctx(req);
+ struct kmac_sha2_ctx *ctx = &req_ctx->kmac_ctx;
+
+ memset(req_ctx, 0, sizeof(*req_ctx));
+ memcpy(ctx, in, sizeof(*ctx));
+
+ return 0;
+}
+
+static int phmac_init_tfm(struct crypto_ahash *tfm)
+{
+ struct phmac_tfm_ctx *tfm_ctx = crypto_ahash_ctx(tfm);
+
+ memset(tfm_ctx, 0, sizeof(*tfm_ctx));
+ spin_lock_init(&tfm_ctx->pk_lock);
+
+ crypto_ahash_set_reqsize(tfm, sizeof(struct phmac_req_ctx));
+
+ return 0;
+}
+
+static void phmac_exit_tfm(struct crypto_ahash *tfm)
+{
+ struct phmac_tfm_ctx *tfm_ctx = crypto_ahash_ctx(tfm);
+
+ memzero_explicit(tfm_ctx->keybuf, sizeof(tfm_ctx->keybuf));
+ memzero_explicit(&tfm_ctx->pk, sizeof(tfm_ctx->pk));
+}
+
+static int phmac_do_one_request(struct crypto_engine *engine, void *areq)
+{
+ struct ahash_request *req = ahash_request_cast(areq);
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct phmac_tfm_ctx *tfm_ctx = crypto_ahash_ctx(tfm);
+ struct phmac_req_ctx *req_ctx = ahash_request_ctx(req);
+ struct kmac_sha2_ctx *kmac_ctx = &req_ctx->kmac_ctx;
+ struct hash_walk_helper *hwh = &req_ctx->hwh;
+ int rc = -EINVAL;
+
+ /*
+ * Three kinds of requests come in here:
+ * update when req->nbytes > 0 and req_ctx->final is false
+ * final when req->nbytes = 0 and req_ctx->final is true
+ * finup when req->nbytes > 0 and req_ctx->final is true
+ * For update and finup the hwh walk needs to be prepared and
+ * up to date but the actual nr of bytes in req->nbytes may be
+ * any non zero number. For final there is no hwh walk needed.
+ */
+
+ if (req->nbytes) {
+ rc = phmac_kmac_update(req, true);
+ if (rc == -EKEYEXPIRED) {
+ /*
+ * Protected key expired, conversion is in process.
+ * Trigger a re-schedule of this request by returning
+ * -ENOSPC ("hardware queue full") to the crypto engine.
+ * To avoid immediately re-invocation of this callback,
+ * tell scheduler to voluntarily give up the CPU here.
+ */
+ pr_debug("rescheduling request\n");
+ cond_resched();
+ return -ENOSPC;
+ } else if (rc) {
+ hwh_advance(hwh, rc);
+ goto out;
+ }
+ req->nbytes = 0;
+ }
+
+ if (req_ctx->final) {
+ rc = phmac_kmac_final(req, true);
+ if (rc == -EKEYEXPIRED) {
+ /*
+ * Protected key expired, conversion is in process.
+ * Trigger a re-schedule of this request by returning
+ * -ENOSPC ("hardware queue full") to the crypto engine.
+ * To avoid immediately re-invocation of this callback,
+ * tell scheduler to voluntarily give up the CPU here.
+ */
+ pr_debug("rescheduling request\n");
+ cond_resched();
+ return -ENOSPC;
+ }
+ }
+
+out:
+ if (rc || req_ctx->final)
+ memzero_explicit(kmac_ctx, sizeof(*kmac_ctx));
+ pr_debug("request complete with rc=%d\n", rc);
+ local_bh_disable();
+ atomic_dec(&tfm_ctx->via_engine_ctr);
+ crypto_finalize_hash_request(engine, req, rc);
+ local_bh_enable();
+ return rc;
+}
+
+#define S390_ASYNC_PHMAC_ALG(x) \
+{ \
+ .base = { \
+ .init = phmac_init, \
+ .update = phmac_update, \
+ .final = phmac_final, \
+ .finup = phmac_finup, \
+ .digest = phmac_digest, \
+ .setkey = phmac_setkey, \
+ .import = phmac_import, \
+ .export = phmac_export, \
+ .init_tfm = phmac_init_tfm, \
+ .exit_tfm = phmac_exit_tfm, \
+ .halg = { \
+ .digestsize = SHA##x##_DIGEST_SIZE, \
+ .statesize = sizeof(struct kmac_sha2_ctx), \
+ .base = { \
+ .cra_name = "phmac(sha" #x ")", \
+ .cra_driver_name = "phmac_s390_sha" #x, \
+ .cra_blocksize = SHA##x##_BLOCK_SIZE, \
+ .cra_priority = 400, \
+ .cra_flags = CRYPTO_ALG_ASYNC | \
+ CRYPTO_ALG_NO_FALLBACK, \
+ .cra_ctxsize = sizeof(struct phmac_tfm_ctx), \
+ .cra_module = THIS_MODULE, \
+ }, \
+ }, \
+ }, \
+ .op = { \
+ .do_one_request = phmac_do_one_request, \
+ }, \
+}
+
+static struct phmac_alg {
+ unsigned int fc;
+ struct ahash_engine_alg alg;
+ bool registered;
+} phmac_algs[] = {
+ {
+ .fc = CPACF_KMAC_PHMAC_SHA_224,
+ .alg = S390_ASYNC_PHMAC_ALG(224),
+ }, {
+ .fc = CPACF_KMAC_PHMAC_SHA_256,
+ .alg = S390_ASYNC_PHMAC_ALG(256),
+ }, {
+ .fc = CPACF_KMAC_PHMAC_SHA_384,
+ .alg = S390_ASYNC_PHMAC_ALG(384),
+ }, {
+ .fc = CPACF_KMAC_PHMAC_SHA_512,
+ .alg = S390_ASYNC_PHMAC_ALG(512),
+ }
+};
+
+static struct miscdevice phmac_dev = {
+ .name = "phmac",
+ .minor = MISC_DYNAMIC_MINOR,
+};
+
+static void s390_phmac_exit(void)
+{
+ struct phmac_alg *phmac;
+ int i;
+
+ if (phmac_crypto_engine) {
+ crypto_engine_stop(phmac_crypto_engine);
+ crypto_engine_exit(phmac_crypto_engine);
+ }
+
+ for (i = ARRAY_SIZE(phmac_algs) - 1; i >= 0; i--) {
+ phmac = &phmac_algs[i];
+ if (phmac->registered)
+ crypto_engine_unregister_ahash(&phmac->alg);
+ }
+
+ misc_deregister(&phmac_dev);
+}
+
+static int __init s390_phmac_init(void)
+{
+ struct phmac_alg *phmac;
+ int i, rc;
+
+ /* for selftest cpacf klmd subfunction is needed */
+ if (!cpacf_query_func(CPACF_KLMD, CPACF_KLMD_SHA_256))
+ return -ENODEV;
+ if (!cpacf_query_func(CPACF_KLMD, CPACF_KLMD_SHA_512))
+ return -ENODEV;
+
+ /* register a simple phmac pseudo misc device */
+ rc = misc_register(&phmac_dev);
+ if (rc)
+ return rc;
+
+ /* with this pseudo device alloc and start a crypto engine */
+ phmac_crypto_engine =
+ crypto_engine_alloc_init_and_set(phmac_dev.this_device,
+ true, false, MAX_QLEN);
+ if (!phmac_crypto_engine) {
+ rc = -ENOMEM;
+ goto out_err;
+ }
+ rc = crypto_engine_start(phmac_crypto_engine);
+ if (rc) {
+ crypto_engine_exit(phmac_crypto_engine);
+ phmac_crypto_engine = NULL;
+ goto out_err;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(phmac_algs); i++) {
+ phmac = &phmac_algs[i];
+ if (!cpacf_query_func(CPACF_KMAC, phmac->fc))
+ continue;
+ rc = crypto_engine_register_ahash(&phmac->alg);
+ if (rc)
+ goto out_err;
+ phmac->registered = true;
+ pr_debug("%s registered\n", phmac->alg.base.halg.base.cra_name);
+ }
+
+ return 0;
+
+out_err:
+ s390_phmac_exit();
+ return rc;
+}
+
+module_init(s390_phmac_init);
+module_exit(s390_phmac_exit);
+
+MODULE_ALIAS_CRYPTO("phmac(sha224)");
+MODULE_ALIAS_CRYPTO("phmac(sha256)");
+MODULE_ALIAS_CRYPTO("phmac(sha384)");
+MODULE_ALIAS_CRYPTO("phmac(sha512)");
+
+MODULE_DESCRIPTION("S390 HMAC driver for protected keys");
+MODULE_LICENSE("GPL");
diff --git a/arch/s390/crypto/sha.h b/arch/s390/crypto/sha.h
index d757ccbce2b4..cadb4b13622a 100644
--- a/arch/s390/crypto/sha.h
+++ b/arch/s390/crypto/sha.h
@@ -27,6 +27,9 @@ struct s390_sha_ctx {
u64 state[SHA512_DIGEST_SIZE / sizeof(u64)];
u64 count_hi;
} sha512;
+ struct {
+ __le64 state[SHA3_STATE_SIZE / sizeof(u64)];
+ } sha3;
};
int func; /* KIMD function to use */
bool first_message_part;
diff --git a/arch/s390/crypto/sha3_256_s390.c b/arch/s390/crypto/sha3_256_s390.c
index 4a7731ac6bcd..03bb4f4bab70 100644
--- a/arch/s390/crypto/sha3_256_s390.c
+++ b/arch/s390/crypto/sha3_256_s390.c
@@ -35,23 +35,33 @@ static int sha3_256_init(struct shash_desc *desc)
static int sha3_256_export(struct shash_desc *desc, void *out)
{
struct s390_sha_ctx *sctx = shash_desc_ctx(desc);
- struct sha3_state *octx = out;
+ union {
+ u8 *u8;
+ u64 *u64;
+ } p = { .u8 = out };
+ int i;
if (sctx->first_message_part) {
- memset(sctx->state, 0, sizeof(sctx->state));
- sctx->first_message_part = 0;
+ memset(out, 0, SHA3_STATE_SIZE);
+ return 0;
}
- memcpy(octx->st, sctx->state, sizeof(octx->st));
+ for (i = 0; i < SHA3_STATE_SIZE / 8; i++)
+ put_unaligned(le64_to_cpu(sctx->sha3.state[i]), p.u64++);
return 0;
}
static int sha3_256_import(struct shash_desc *desc, const void *in)
{
struct s390_sha_ctx *sctx = shash_desc_ctx(desc);
- const struct sha3_state *ictx = in;
-
+ union {
+ const u8 *u8;
+ const u64 *u64;
+ } p = { .u8 = in };
+ int i;
+
+ for (i = 0; i < SHA3_STATE_SIZE / 8; i++)
+ sctx->sha3.state[i] = cpu_to_le64(get_unaligned(p.u64++));
sctx->count = 0;
- memcpy(sctx->state, ictx->st, sizeof(ictx->st));
sctx->first_message_part = 0;
sctx->func = CPACF_KIMD_SHA3_256;
diff --git a/arch/s390/crypto/sha3_512_s390.c b/arch/s390/crypto/sha3_512_s390.c
index 018f02fff444..a5c9690eecb1 100644
--- a/arch/s390/crypto/sha3_512_s390.c
+++ b/arch/s390/crypto/sha3_512_s390.c
@@ -34,24 +34,33 @@ static int sha3_512_init(struct shash_desc *desc)
static int sha3_512_export(struct shash_desc *desc, void *out)
{
struct s390_sha_ctx *sctx = shash_desc_ctx(desc);
- struct sha3_state *octx = out;
-
+ union {
+ u8 *u8;
+ u64 *u64;
+ } p = { .u8 = out };
+ int i;
if (sctx->first_message_part) {
- memset(sctx->state, 0, sizeof(sctx->state));
- sctx->first_message_part = 0;
+ memset(out, 0, SHA3_STATE_SIZE);
+ return 0;
}
- memcpy(octx->st, sctx->state, sizeof(octx->st));
+ for (i = 0; i < SHA3_STATE_SIZE / 8; i++)
+ put_unaligned(le64_to_cpu(sctx->sha3.state[i]), p.u64++);
return 0;
}
static int sha3_512_import(struct shash_desc *desc, const void *in)
{
struct s390_sha_ctx *sctx = shash_desc_ctx(desc);
- const struct sha3_state *ictx = in;
-
+ union {
+ const u8 *u8;
+ const u64 *u64;
+ } p = { .u8 = in };
+ int i;
+
+ for (i = 0; i < SHA3_STATE_SIZE / 8; i++)
+ sctx->sha3.state[i] = cpu_to_le64(get_unaligned(p.u64++));
sctx->count = 0;
- memcpy(sctx->state, ictx->st, sizeof(ictx->st));
sctx->first_message_part = 0;
sctx->func = CPACF_KIMD_SHA3_512;
diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h
index 54cb97603ec0..4bc5317fbb12 100644
--- a/arch/s390/include/asm/cpacf.h
+++ b/arch/s390/include/asm/cpacf.h
@@ -129,6 +129,10 @@
#define CPACF_KMAC_HMAC_SHA_256 0x71
#define CPACF_KMAC_HMAC_SHA_384 0x72
#define CPACF_KMAC_HMAC_SHA_512 0x73
+#define CPACF_KMAC_PHMAC_SHA_224 0x78
+#define CPACF_KMAC_PHMAC_SHA_256 0x79
+#define CPACF_KMAC_PHMAC_SHA_384 0x7a
+#define CPACF_KMAC_PHMAC_SHA_512 0x7b
/*
* Function codes for the PCKMO (PERFORM CRYPTOGRAPHIC KEY MANAGEMENT)
diff --git a/arch/s390/include/asm/entry-common.h b/arch/s390/include/asm/entry-common.h
index 35555c944630..979af986a8fe 100644
--- a/arch/s390/include/asm/entry-common.h
+++ b/arch/s390/include/asm/entry-common.h
@@ -59,4 +59,14 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
#define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare
+static __always_inline bool arch_in_rcu_eqs(void)
+{
+ if (IS_ENABLED(CONFIG_KVM))
+ return current->flags & PF_VCPU;
+
+ return false;
+}
+
+#define arch_in_rcu_eqs arch_in_rcu_eqs
+
#endif
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index cb89e54ada25..f870d09515cc 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -716,6 +716,9 @@ extern char sie_exit;
bool kvm_s390_pv_is_protected(struct kvm *kvm);
bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu);
+extern int kvm_s390_enter_exit_sie(struct kvm_s390_sie_block *scb,
+ u64 *gprs, unsigned long gasce);
+
extern int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc);
extern int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc);
diff --git a/arch/s390/include/asm/percpu.h b/arch/s390/include/asm/percpu.h
index 84f6b8357b45..96af7d964014 100644
--- a/arch/s390/include/asm/percpu.h
+++ b/arch/s390/include/asm/percpu.h
@@ -16,10 +16,9 @@
* For 64 bit module code, the module may be more than 4G above the
* per cpu area, use weak definitions to force the compiler to
* generate external references.
+ * Therefore, we have enabled CONFIG_ARCH_MODULE_NEEDS_WEAK_PER_CPU
+ * in the Kconfig.
*/
-#if defined(MODULE)
-#define ARCH_NEEDS_WEAK_PER_CPU
-#endif
/*
* We use a compare-and-swap loop since that uses less cpu cycles than
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 78c9a310efa5..bf6fa8b9ca73 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -5063,6 +5063,30 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
return vcpu_post_run_handle_fault(vcpu);
}
+int noinstr kvm_s390_enter_exit_sie(struct kvm_s390_sie_block *scb,
+ u64 *gprs, unsigned long gasce)
+{
+ int ret;
+
+ guest_state_enter_irqoff();
+
+ /*
+ * The guest_state_{enter,exit}_irqoff() functions inform lockdep and
+ * tracing that entry to the guest will enable host IRQs, and exit from
+ * the guest will disable host IRQs.
+ *
+ * We must not use lockdep/tracing/RCU in this critical section, so we
+ * use the low-level arch_local_irq_*() helpers to enable/disable IRQs.
+ */
+ arch_local_irq_enable();
+ ret = sie64a(scb, gprs, gasce);
+ arch_local_irq_disable();
+
+ guest_state_exit_irqoff();
+
+ return ret;
+}
+
#define PSW_INT_MASK (PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_MCHECK)
static int __vcpu_run(struct kvm_vcpu *vcpu)
{
@@ -5083,20 +5107,27 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
kvm_vcpu_srcu_read_unlock(vcpu);
/*
* As PF_VCPU will be used in fault handler, between
- * guest_enter and guest_exit should be no uaccess.
+ * guest_timing_enter_irqoff and guest_timing_exit_irqoff
+ * should be no uaccess.
*/
- local_irq_disable();
- guest_enter_irqoff();
- __disable_cpu_timer_accounting(vcpu);
- local_irq_enable();
if (kvm_s390_pv_cpu_is_protected(vcpu)) {
memcpy(sie_page->pv_grregs,
vcpu->run->s.regs.gprs,
sizeof(sie_page->pv_grregs));
}
- exit_reason = sie64a(vcpu->arch.sie_block,
- vcpu->run->s.regs.gprs,
- vcpu->arch.gmap->asce);
+
+ local_irq_disable();
+ guest_timing_enter_irqoff();
+ __disable_cpu_timer_accounting(vcpu);
+
+ exit_reason = kvm_s390_enter_exit_sie(vcpu->arch.sie_block,
+ vcpu->run->s.regs.gprs,
+ vcpu->arch.gmap->asce);
+
+ __enable_cpu_timer_accounting(vcpu);
+ guest_timing_exit_irqoff();
+ local_irq_enable();
+
if (kvm_s390_pv_cpu_is_protected(vcpu)) {
memcpy(vcpu->run->s.regs.gprs,
sie_page->pv_grregs,
@@ -5112,10 +5143,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
vcpu->arch.sie_block->gpsw.mask &= ~PSW_INT_MASK;
}
}
- local_irq_disable();
- __enable_cpu_timer_accounting(vcpu);
- guest_exit_irqoff();
- local_irq_enable();
kvm_vcpu_srcu_read_lock(vcpu);
rc = vcpu_post_run(vcpu, exit_reason);
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 13a9661d2b28..347268f89f2f 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -1170,10 +1170,6 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
vcpu->arch.sie_block->fpf & FPF_BPBC)
set_thread_flag(TIF_ISOLATE_BP_GUEST);
- local_irq_disable();
- guest_enter_irqoff();
- local_irq_enable();
-
/*
* Simulate a SIE entry of the VCPU (see sie64a), so VCPU blocking
* and VCPU requests also hinder the vSIE from running and lead
@@ -1183,15 +1179,16 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
vcpu->arch.sie_block->prog0c |= PROG_IN_SIE;
current->thread.gmap_int_code = 0;
barrier();
- if (!kvm_s390_vcpu_sie_inhibited(vcpu))
- rc = sie64a(scb_s, vcpu->run->s.regs.gprs, vsie_page->gmap->asce);
+ if (!kvm_s390_vcpu_sie_inhibited(vcpu)) {
+ local_irq_disable();
+ guest_timing_enter_irqoff();
+ rc = kvm_s390_enter_exit_sie(scb_s, vcpu->run->s.regs.gprs, vsie_page->gmap->asce);
+ guest_timing_exit_irqoff();
+ local_irq_enable();
+ }
barrier();
vcpu->arch.sie_block->prog0c &= ~PROG_IN_SIE;
- local_irq_disable();
- guest_exit_irqoff();
- local_irq_enable();
-
/* restore guest state for bp isolation override */
if (!guest_bp_isolation)
clear_thread_flag(TIF_ISOLATE_BP_GUEST);
diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c
index ac604b176660..9af2aae0a515 100644
--- a/arch/s390/mm/dump_pagetables.c
+++ b/arch/s390/mm/dump_pagetables.c
@@ -247,11 +247,9 @@ static int ptdump_show(struct seq_file *m, void *v)
.marker = markers,
};
- get_online_mems();
mutex_lock(&cpa_mutex);
ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
mutex_unlock(&cpa_mutex);
- put_online_mems();
return 0;
}
DEFINE_SHOW_ATTRIBUTE(ptdump);
diff --git a/arch/s390/net/bpf_jit.h b/arch/s390/net/bpf_jit.h
deleted file mode 100644
index 615e6da71374..000000000000
--- a/arch/s390/net/bpf_jit.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * BPF Jit compiler defines
- *
- * Copyright IBM Corp. 2012,2015
- *
- * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
- * Michael Holzheu <holzheu@linux.vnet.ibm.com>
- */
-
-#ifndef __ARCH_S390_NET_BPF_JIT_H
-#define __ARCH_S390_NET_BPF_JIT_H
-
-#ifndef __ASSEMBLER__
-
-#include <linux/filter.h>
-#include <linux/types.h>
-
-#endif /* __ASSEMBLER__ */
-
-/*
- * Stackframe layout (packed stack):
- *
- * ^ high
- * +---------------+ |
- * | old backchain | |
- * +---------------+ |
- * | r15 - r6 | |
- * +---------------+ |
- * | 4 byte align | |
- * | tail_call_cnt | |
- * BFP -> +===============+ |
- * | | |
- * | BPF stack | |
- * | | |
- * R15+160 -> +---------------+ |
- * | new backchain | |
- * R15+152 -> +---------------+ |
- * | + 152 byte SA | |
- * R15 -> +---------------+ + low
- *
- * We get 160 bytes stack space from calling function, but only use
- * 12 * 8 byte for old backchain, r15..r6, and tail_call_cnt.
- *
- * The stack size used by the BPF program ("BPF stack" above) is passed
- * via "aux->stack_depth".
- */
-#define STK_SPACE_ADD (160)
-#define STK_160_UNUSED (160 - 12 * 8)
-#define STK_OFF (STK_SPACE_ADD - STK_160_UNUSED)
-
-#define STK_OFF_R6 (160 - 11 * 8) /* Offset of r6 on stack */
-#define STK_OFF_TCCNT (160 - 12 * 8) /* Offset of tail_call_cnt on stack */
-
-#endif /* __ARCH_S390_NET_BPF_JIT_H */
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 0c9a35782c83..bb17efe29d65 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -32,7 +32,6 @@
#include <asm/set_memory.h>
#include <asm/text-patching.h>
#include <asm/unwind.h>
-#include "bpf_jit.h"
struct bpf_jit {
u32 seen; /* Flags to remember seen eBPF instructions */
@@ -54,6 +53,7 @@ struct bpf_jit {
int prologue_plt; /* Start of prologue hotpatch PLT */
int kern_arena; /* Pool offset of kernel arena address */
u64 user_arena; /* User arena address */
+ u32 frame_off; /* Offset of struct bpf_prog from %r15 */
};
#define SEEN_MEM BIT(0) /* use mem[] for temporary storage */
@@ -426,11 +426,25 @@ static void jit_fill_hole(void *area, unsigned int size)
}
/*
+ * Caller-allocated part of the frame.
+ * Thanks to packed stack, its otherwise unused initial part can be used for
+ * the BPF stack and for the next frame.
+ */
+struct prog_frame {
+ u64 unused[8];
+ /* BPF stack starts here and grows towards 0 */
+ u32 tail_call_cnt;
+ u32 pad;
+ u64 r6[10]; /* r6 - r15 */
+ u64 backchain;
+} __packed;
+
+/*
* Save registers from "rs" (register start) to "re" (register end) on stack
*/
static void save_regs(struct bpf_jit *jit, u32 rs, u32 re)
{
- u32 off = STK_OFF_R6 + (rs - 6) * 8;
+ u32 off = offsetof(struct prog_frame, r6) + (rs - 6) * 8;
if (rs == re)
/* stg %rs,off(%r15) */
@@ -443,12 +457,9 @@ static void save_regs(struct bpf_jit *jit, u32 rs, u32 re)
/*
* Restore registers from "rs" (register start) to "re" (register end) on stack
*/
-static void restore_regs(struct bpf_jit *jit, u32 rs, u32 re, u32 stack_depth)
+static void restore_regs(struct bpf_jit *jit, u32 rs, u32 re)
{
- u32 off = STK_OFF_R6 + (rs - 6) * 8;
-
- if (jit->seen & SEEN_STACK)
- off += STK_OFF + stack_depth;
+ u32 off = jit->frame_off + offsetof(struct prog_frame, r6) + (rs - 6) * 8;
if (rs == re)
/* lg %rs,off(%r15) */
@@ -492,8 +503,7 @@ static int get_end(u16 seen_regs, int start)
* Save and restore clobbered registers (6-15) on stack.
* We save/restore registers in chunks with gap >= 2 registers.
*/
-static void save_restore_regs(struct bpf_jit *jit, int op, u32 stack_depth,
- u16 extra_regs)
+static void save_restore_regs(struct bpf_jit *jit, int op, u16 extra_regs)
{
u16 seen_regs = jit->seen_regs | extra_regs;
const int last = 15, save_restore_size = 6;
@@ -516,7 +526,7 @@ static void save_restore_regs(struct bpf_jit *jit, int op, u32 stack_depth,
if (op == REGS_SAVE)
save_regs(jit, rs, re);
else
- restore_regs(jit, rs, re, stack_depth);
+ restore_regs(jit, rs, re);
re++;
} while (re <= last);
}
@@ -581,11 +591,12 @@ static void bpf_jit_plt(struct bpf_plt *plt, void *ret, void *target)
* Emit function prologue
*
* Save registers and create stack frame if necessary.
- * See stack frame layout description in "bpf_jit.h"!
+ * Stack frame layout is described by struct prog_frame.
*/
-static void bpf_jit_prologue(struct bpf_jit *jit, struct bpf_prog *fp,
- u32 stack_depth)
+static void bpf_jit_prologue(struct bpf_jit *jit, struct bpf_prog *fp)
{
+ BUILD_BUG_ON(sizeof(struct prog_frame) != STACK_FRAME_OVERHEAD);
+
/* No-op for hotpatching */
/* brcl 0,prologue_plt */
EMIT6_PCREL_RILC(0xc0040000, 0, jit->prologue_plt);
@@ -593,8 +604,9 @@ static void bpf_jit_prologue(struct bpf_jit *jit, struct bpf_prog *fp,
if (!bpf_is_subprog(fp)) {
/* Initialize the tail call counter in the main program. */
- /* xc STK_OFF_TCCNT(4,%r15),STK_OFF_TCCNT(%r15) */
- _EMIT6(0xd703f000 | STK_OFF_TCCNT, 0xf000 | STK_OFF_TCCNT);
+ /* xc tail_call_cnt(4,%r15),tail_call_cnt(%r15) */
+ _EMIT6(0xd703f000 | offsetof(struct prog_frame, tail_call_cnt),
+ 0xf000 | offsetof(struct prog_frame, tail_call_cnt));
} else {
/*
* Skip the tail call counter initialization in subprograms.
@@ -617,7 +629,7 @@ static void bpf_jit_prologue(struct bpf_jit *jit, struct bpf_prog *fp,
jit->seen_regs |= NVREGS;
} else {
/* Save registers */
- save_restore_regs(jit, REGS_SAVE, stack_depth,
+ save_restore_regs(jit, REGS_SAVE,
fp->aux->exception_boundary ? NVREGS : 0);
}
/* Setup literal pool */
@@ -637,13 +649,15 @@ static void bpf_jit_prologue(struct bpf_jit *jit, struct bpf_prog *fp,
if (is_first_pass(jit) || (jit->seen & SEEN_STACK)) {
/* lgr %w1,%r15 (backchain) */
EMIT4(0xb9040000, REG_W1, REG_15);
- /* la %bfp,STK_160_UNUSED(%r15) (BPF frame pointer) */
- EMIT4_DISP(0x41000000, BPF_REG_FP, REG_15, STK_160_UNUSED);
- /* aghi %r15,-STK_OFF */
- EMIT4_IMM(0xa70b0000, REG_15, -(STK_OFF + stack_depth));
- /* stg %w1,152(%r15) (backchain) */
+ /* la %bfp,unused_end(%r15) (BPF frame pointer) */
+ EMIT4_DISP(0x41000000, BPF_REG_FP, REG_15,
+ offsetofend(struct prog_frame, unused));
+ /* aghi %r15,-frame_off */
+ EMIT4_IMM(0xa70b0000, REG_15, -jit->frame_off);
+ /* stg %w1,backchain(%r15) */
EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0,
- REG_15, 152);
+ REG_15,
+ offsetof(struct prog_frame, backchain));
}
}
@@ -677,13 +691,13 @@ static void call_r1(struct bpf_jit *jit)
/*
* Function epilogue
*/
-static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth)
+static void bpf_jit_epilogue(struct bpf_jit *jit)
{
jit->exit_ip = jit->prg;
/* Load exit code: lgr %r2,%b0 */
EMIT4(0xb9040000, REG_2, BPF_REG_0);
/* Restore registers */
- save_restore_regs(jit, REGS_RESTORE, stack_depth, 0);
+ save_restore_regs(jit, REGS_RESTORE, 0);
EMIT_JUMP_REG(14);
jit->prg = ALIGN(jit->prg, 8);
@@ -865,7 +879,7 @@ static int sign_extend(struct bpf_jit *jit, int r, u8 size, u8 flags)
* stack space for the large switch statement.
*/
static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
- int i, bool extra_pass, u32 stack_depth)
+ int i, bool extra_pass)
{
struct bpf_insn *insn = &fp->insnsi[i];
s32 branch_oc_off = insn->off;
@@ -1786,9 +1800,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
* Note 2: We assume that the verifier does not let us call the
* main program, which clears the tail call counter on entry.
*/
- /* mvc STK_OFF_TCCNT(4,%r15),N(%r15) */
- _EMIT6(0xd203f000 | STK_OFF_TCCNT,
- 0xf000 | (STK_OFF_TCCNT + STK_OFF + stack_depth));
+ /* mvc tail_call_cnt(4,%r15),frame_off+tail_call_cnt(%r15) */
+ _EMIT6(0xd203f000 | offsetof(struct prog_frame, tail_call_cnt),
+ 0xf000 | (jit->frame_off +
+ offsetof(struct prog_frame, tail_call_cnt)));
/* Sign-extend the kfunc arguments. */
if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
@@ -1839,10 +1854,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
* goto out;
*/
- if (jit->seen & SEEN_STACK)
- off = STK_OFF_TCCNT + STK_OFF + stack_depth;
- else
- off = STK_OFF_TCCNT;
+ off = jit->frame_off +
+ offsetof(struct prog_frame, tail_call_cnt);
/* lhi %w0,1 */
EMIT4_IMM(0xa7080000, REG_W0, 1);
/* laal %w1,%w0,off(%r15) */
@@ -1872,7 +1885,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
/*
* Restore registers before calling function
*/
- save_restore_regs(jit, REGS_RESTORE, stack_depth, 0);
+ save_restore_regs(jit, REGS_RESTORE, 0);
/*
* goto *(prog->bpf_func + tail_call_start);
@@ -2165,7 +2178,7 @@ static int bpf_set_addr(struct bpf_jit *jit, int i)
* Compile eBPF program into s390x code
*/
static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp,
- bool extra_pass, u32 stack_depth)
+ bool extra_pass)
{
int i, insn_count, lit32_size, lit64_size;
u64 kern_arena;
@@ -2174,24 +2187,30 @@ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp,
jit->lit64 = jit->lit64_start;
jit->prg = 0;
jit->excnt = 0;
+ if (is_first_pass(jit) || (jit->seen & SEEN_STACK))
+ jit->frame_off = sizeof(struct prog_frame) -
+ offsetofend(struct prog_frame, unused) +
+ round_up(fp->aux->stack_depth, 8);
+ else
+ jit->frame_off = 0;
kern_arena = bpf_arena_get_kern_vm_start(fp->aux->arena);
if (kern_arena)
jit->kern_arena = _EMIT_CONST_U64(kern_arena);
jit->user_arena = bpf_arena_get_user_vm_start(fp->aux->arena);
- bpf_jit_prologue(jit, fp, stack_depth);
+ bpf_jit_prologue(jit, fp);
if (bpf_set_addr(jit, 0) < 0)
return -1;
for (i = 0; i < fp->len; i += insn_count) {
- insn_count = bpf_jit_insn(jit, fp, i, extra_pass, stack_depth);
+ insn_count = bpf_jit_insn(jit, fp, i, extra_pass);
if (insn_count < 0)
return -1;
/* Next instruction address */
if (bpf_set_addr(jit, i + insn_count) < 0)
return -1;
}
- bpf_jit_epilogue(jit, stack_depth);
+ bpf_jit_epilogue(jit);
lit32_size = jit->lit32 - jit->lit32_start;
lit64_size = jit->lit64 - jit->lit64_start;
@@ -2267,7 +2286,6 @@ static struct bpf_binary_header *bpf_jit_alloc(struct bpf_jit *jit,
*/
struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
{
- u32 stack_depth = round_up(fp->aux->stack_depth, 8);
struct bpf_prog *tmp, *orig_fp = fp;
struct bpf_binary_header *header;
struct s390_jit_data *jit_data;
@@ -2320,7 +2338,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
* - 3: Calculate program size and addrs array
*/
for (pass = 1; pass <= 3; pass++) {
- if (bpf_jit_prog(&jit, fp, extra_pass, stack_depth)) {
+ if (bpf_jit_prog(&jit, fp, extra_pass)) {
fp = orig_fp;
goto free_addrs;
}
@@ -2334,7 +2352,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
goto free_addrs;
}
skip_init_ctx:
- if (bpf_jit_prog(&jit, fp, extra_pass, stack_depth)) {
+ if (bpf_jit_prog(&jit, fp, extra_pass)) {
bpf_jit_binary_free(header);
fp = orig_fp;
goto free_addrs;
@@ -2654,9 +2672,10 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im,
/* stg %r1,backchain_off(%r15) */
EMIT6_DISP_LH(0xe3000000, 0x0024, REG_1, REG_0, REG_15,
tjit->backchain_off);
- /* mvc tccnt_off(4,%r15),stack_size+STK_OFF_TCCNT(%r15) */
+ /* mvc tccnt_off(4,%r15),stack_size+tail_call_cnt(%r15) */
_EMIT6(0xd203f000 | tjit->tccnt_off,
- 0xf000 | (tjit->stack_size + STK_OFF_TCCNT));
+ 0xf000 | (tjit->stack_size +
+ offsetof(struct prog_frame, tail_call_cnt)));
/* stmg %r2,%rN,fwd_reg_args_off(%r15) */
if (nr_reg_args)
EMIT6_DISP_LH(0xeb000000, 0x0024, REG_2,
@@ -2793,8 +2812,9 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im,
(nr_stack_args * sizeof(u64) - 1) << 16 |
tjit->stack_args_off,
0xf000 | tjit->orig_stack_args_off);
- /* mvc STK_OFF_TCCNT(4,%r15),tccnt_off(%r15) */
- _EMIT6(0xd203f000 | STK_OFF_TCCNT, 0xf000 | tjit->tccnt_off);
+ /* mvc tail_call_cnt(4,%r15),tccnt_off(%r15) */
+ _EMIT6(0xd203f000 | offsetof(struct prog_frame, tail_call_cnt),
+ 0xf000 | tjit->tccnt_off);
/* lgr %r1,%r8 */
EMIT4(0xb9040000, REG_1, REG_8);
/* %r1() */
@@ -2851,8 +2871,9 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im,
if (flags & (BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_RET_FENTRY_RET))
EMIT6_DISP_LH(0xe3000000, 0x0004, REG_2, REG_0, REG_15,
tjit->retval_off);
- /* mvc stack_size+STK_OFF_TCCNT(4,%r15),tccnt_off(%r15) */
- _EMIT6(0xd203f000 | (tjit->stack_size + STK_OFF_TCCNT),
+ /* mvc stack_size+tail_call_cnt(4,%r15),tccnt_off(%r15) */
+ _EMIT6(0xd203f000 | (tjit->stack_size +
+ offsetof(struct prog_frame, tail_call_cnt)),
0xf000 | tjit->tccnt_off);
/* aghi %r15,stack_size */
EMIT4_IMM(0xa70b0000, REG_15, tjit->stack_size);
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 89185af7bcc9..d5795067befa 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -40,7 +40,6 @@ config SUPERH
select HAVE_GUP_FAST if MMU
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER
- select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_HW_BREAKPOINT
select HAVE_IOREMAP_PROT if MMU && !X2TLB
select HAVE_KERNEL_BZIP2
diff --git a/arch/sh/Makefile b/arch/sh/Makefile
index cab2f9c011a8..7b420424b6d7 100644
--- a/arch/sh/Makefile
+++ b/arch/sh/Makefile
@@ -103,16 +103,16 @@ UTS_MACHINE := sh
LDFLAGS_vmlinux += -e _stext
ifdef CONFIG_CPU_LITTLE_ENDIAN
-ld-bfd := elf32-sh-linux
-LDFLAGS_vmlinux += --defsym jiffies=jiffies_64 --oformat $(ld-bfd)
+ld_bfd := elf32-sh-linux
+LDFLAGS_vmlinux += --defsym jiffies=jiffies_64 --oformat $(ld_bfd)
KBUILD_LDFLAGS += -EL
else
-ld-bfd := elf32-shbig-linux
-LDFLAGS_vmlinux += --defsym jiffies=jiffies_64+4 --oformat $(ld-bfd)
+ld_bfd := elf32-shbig-linux
+LDFLAGS_vmlinux += --defsym jiffies=jiffies_64+4 --oformat $(ld_bfd)
KBUILD_LDFLAGS += -EB
endif
-export ld-bfd
+export ld_bfd
# Mach groups
machdir-$(CONFIG_SOLUTION_ENGINE) += mach-se
diff --git a/arch/sh/boot/compressed/Makefile b/arch/sh/boot/compressed/Makefile
index 8bc319ff54bf..58df491778b2 100644
--- a/arch/sh/boot/compressed/Makefile
+++ b/arch/sh/boot/compressed/Makefile
@@ -27,7 +27,7 @@ endif
ccflags-remove-$(CONFIG_MCOUNT) += -pg
-LDFLAGS_vmlinux := --oformat $(ld-bfd) -Ttext $(IMAGE_OFFSET) -e startup \
+LDFLAGS_vmlinux := --oformat $(ld_bfd) -Ttext $(IMAGE_OFFSET) -e startup \
-T $(obj)/../../kernel/vmlinux.lds
KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
@@ -51,7 +51,7 @@ $(obj)/vmlinux.bin.lzo: $(obj)/vmlinux.bin FORCE
OBJCOPYFLAGS += -R .empty_zero_page
-LDFLAGS_piggy.o := -r --format binary --oformat $(ld-bfd) -T
+LDFLAGS_piggy.o := -r --format binary --oformat $(ld_bfd) -T
$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix_y) FORCE
$(call if_changed,ld)
diff --git a/arch/sh/boot/romimage/Makefile b/arch/sh/boot/romimage/Makefile
index c7c8be58400c..17b03df0a8de 100644
--- a/arch/sh/boot/romimage/Makefile
+++ b/arch/sh/boot/romimage/Makefile
@@ -13,7 +13,7 @@ mmcif-obj-$(CONFIG_CPU_SUBTYPE_SH7724) := $(obj)/mmcif-sh7724.o
load-$(CONFIG_ROMIMAGE_MMCIF) := $(mmcif-load-y)
obj-$(CONFIG_ROMIMAGE_MMCIF) := $(mmcif-obj-y)
-LDFLAGS_vmlinux := --oformat $(ld-bfd) -Ttext $(load-y) -e romstart \
+LDFLAGS_vmlinux := --oformat $(ld_bfd) -Ttext $(load-y) -e romstart \
-T $(obj)/../../kernel/vmlinux.lds
$(obj)/vmlinux: $(obj)/head.o $(obj-y) $(obj)/piggy.o FORCE
@@ -24,7 +24,7 @@ OBJCOPYFLAGS += -j .empty_zero_page
$(obj)/zeropage.bin: vmlinux FORCE
$(call if_changed,objcopy)
-LDFLAGS_piggy.o := -r --format binary --oformat $(ld-bfd) -T
+LDFLAGS_piggy.o := -r --format binary --oformat $(ld_bfd) -T
$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/zeropage.bin arch/sh/boot/zImage FORCE
$(call if_changed,ld)
diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig
index f022ada363b5..8ef72b8dbcd3 100644
--- a/arch/sh/configs/titan_defconfig
+++ b/arch/sh/configs/titan_defconfig
@@ -61,7 +61,6 @@ CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
CONFIG_NETFILTER_XT_TARGET_MARK=m
CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
CONFIG_NETFILTER_XT_MATCH_COMMENT=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
CONFIG_NETFILTER_XT_MATCH_ESP=m
CONFIG_NETFILTER_XT_MATCH_LENGTH=m
CONFIG_NETFILTER_XT_MATCH_LIMIT=m
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index dcfdb7f1dae9..7b595092cbfb 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -78,7 +78,6 @@ config SPARC64
select MMU_GATHER_NO_FLUSH_CACHE
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select HAVE_DYNAMIC_FTRACE
- select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_PAGE_SIZE_8KB
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_CONTEXT_TRACKING_USER
@@ -97,6 +96,7 @@ config SPARC64
select HAVE_ARCH_AUDITSYSCALL
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_DEBUG_PAGEALLOC
+ select ARCH_SUPPORTS_HUGETLBFS
select HAVE_NMI
select HAVE_REGS_AND_STACK_ACCESS_API
select ARCH_USE_QUEUED_RWLOCKS
diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h
index e7a9cdd498dc..d3bc16fbcbbd 100644
--- a/arch/sparc/include/asm/hugetlb.h
+++ b/arch/sparc/include/asm/hugetlb.h
@@ -50,11 +50,6 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
return changed;
}
-#define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE
-void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
- unsigned long end, unsigned long floor,
- unsigned long ceiling);
-
#include <asm-generic/hugetlb.h>
#endif /* _ASM_SPARC64_HUGETLB_H */
diff --git a/arch/sparc/include/asm/mman.h b/arch/sparc/include/asm/mman.h
index af9c10c83dc5..3e4bac33be81 100644
--- a/arch/sparc/include/asm/mman.h
+++ b/arch/sparc/include/asm/mman.h
@@ -28,7 +28,7 @@ static inline void ipi_set_tstate_mcde(void *arg)
}
#define arch_calc_vm_prot_bits(prot, pkey) sparc_calc_vm_prot_bits(prot)
-static inline unsigned long sparc_calc_vm_prot_bits(unsigned long prot)
+static inline vm_flags_t sparc_calc_vm_prot_bits(unsigned long prot)
{
if (adi_capable() && (prot & PROT_ADI)) {
struct pt_regs *regs;
@@ -58,7 +58,7 @@ static inline int sparc_validate_prot(unsigned long prot, unsigned long addr)
/* arch_validate_flags() - Ensure combination of flags is valid for a
* VMA.
*/
-static inline bool arch_validate_flags(unsigned long vm_flags)
+static inline bool arch_validate_flags(vm_flags_t vm_flags)
{
/* If ADI is being enabled on this VMA, check for ADI
* capability on the platform and ensure VMA is suitable
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index adcba7329386..71befa109e1c 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -145,6 +145,9 @@
#define SO_PASSRIGHTS 0x005c
+#define SO_INQ 0x005d
+#define SCM_INQ SO_INQ
+
#if !defined(__KERNEL__)
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index 80504148d8a5..4b9431311e05 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -295,122 +295,3 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
return entry;
}
-
-static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
- unsigned long addr)
-{
- pgtable_t token = pmd_pgtable(*pmd);
-
- pmd_clear(pmd);
- pte_free_tlb(tlb, token, addr);
- mm_dec_nr_ptes(tlb->mm);
-}
-
-static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
- unsigned long addr, unsigned long end,
- unsigned long floor, unsigned long ceiling)
-{
- pmd_t *pmd;
- unsigned long next;
- unsigned long start;
-
- start = addr;
- pmd = pmd_offset(pud, addr);
- do {
- next = pmd_addr_end(addr, end);
- if (pmd_none(*pmd))
- continue;
- if (is_hugetlb_pmd(*pmd))
- pmd_clear(pmd);
- else
- hugetlb_free_pte_range(tlb, pmd, addr);
- } while (pmd++, addr = next, addr != end);
-
- start &= PUD_MASK;
- if (start < floor)
- return;
- if (ceiling) {
- ceiling &= PUD_MASK;
- if (!ceiling)
- return;
- }
- if (end - 1 > ceiling - 1)
- return;
-
- pmd = pmd_offset(pud, start);
- pud_clear(pud);
- pmd_free_tlb(tlb, pmd, start);
- mm_dec_nr_pmds(tlb->mm);
-}
-
-static void hugetlb_free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
- unsigned long addr, unsigned long end,
- unsigned long floor, unsigned long ceiling)
-{
- pud_t *pud;
- unsigned long next;
- unsigned long start;
-
- start = addr;
- pud = pud_offset(p4d, addr);
- do {
- next = pud_addr_end(addr, end);
- if (pud_none_or_clear_bad(pud))
- continue;
- if (is_hugetlb_pud(*pud))
- pud_clear(pud);
- else
- hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
- ceiling);
- } while (pud++, addr = next, addr != end);
-
- start &= PGDIR_MASK;
- if (start < floor)
- return;
- if (ceiling) {
- ceiling &= PGDIR_MASK;
- if (!ceiling)
- return;
- }
- if (end - 1 > ceiling - 1)
- return;
-
- pud = pud_offset(p4d, start);
- p4d_clear(p4d);
- pud_free_tlb(tlb, pud, start);
- mm_dec_nr_puds(tlb->mm);
-}
-
-void hugetlb_free_pgd_range(struct mmu_gather *tlb,
- unsigned long addr, unsigned long end,
- unsigned long floor, unsigned long ceiling)
-{
- pgd_t *pgd;
- p4d_t *p4d;
- unsigned long next;
-
- addr &= PMD_MASK;
- if (addr < floor) {
- addr += PMD_SIZE;
- if (!addr)
- return;
- }
- if (ceiling) {
- ceiling &= PMD_MASK;
- if (!ceiling)
- return;
- }
- if (end - 1 > ceiling - 1)
- end -= PMD_SIZE;
- if (addr > end - 1)
- return;
-
- pgd = pgd_offset(tlb->mm, addr);
- p4d = p4d_offset(pgd, addr);
- do {
- next = p4d_addr_end(addr, end);
- if (p4d_none_or_clear_bad(p4d))
- continue;
- hugetlb_free_pud_range(tlb, p4d, addr, next, floor, ceiling);
- } while (p4d++, addr = next, addr != end);
-}
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 25ae4c897aae..7ed58bf3aaca 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -3201,7 +3201,7 @@ void copy_highpage(struct page *to, struct page *from)
}
EXPORT_SYMBOL(copy_highpage);
-pgprot_t vm_get_page_prot(unsigned long vm_flags)
+pgprot_t vm_get_page_prot(vm_flags_t vm_flags)
{
unsigned long prot = pgprot_val(protection_map[vm_flags &
(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]);
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f23919a7db40..58d890fe2100 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -99,7 +99,6 @@ config X86
select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
select ARCH_HAS_PMEM_API if X86_64
select ARCH_HAS_PREEMPT_LAZY
- select ARCH_HAS_PTE_DEVMAP if X86_64
select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_HW_PTE_YOUNG
select ARCH_HAS_NONLEAF_PMD_YOUNG if PGTABLE_LEVELS > 2
@@ -124,6 +123,7 @@ config X86
select ARCH_SUPPORTS_ACPI
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_DEBUG_PAGEALLOC
+ select ARCH_SUPPORTS_HUGETLBFS
select ARCH_SUPPORTS_PAGE_TABLE_CHECK if X86_64
select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096
@@ -242,7 +242,6 @@ config X86
select HAVE_GUP_FAST
select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE
select HAVE_FTRACE_GRAPH_FUNC if HAVE_FUNCTION_GRAPH_TRACER
- select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_GRAPH_FREGS if HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_GRAPH_TRACER if X86_32 || (X86_64 && DYNAMIC_FTRACE)
select HAVE_FUNCTION_TRACER
diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c
index f1b6d40154e3..f1adfba1a76e 100644
--- a/arch/x86/crypto/aegis128-aesni-glue.c
+++ b/arch/x86/crypto/aegis128-aesni-glue.c
@@ -104,10 +104,12 @@ static void crypto_aegis128_aesni_process_ad(
}
}
-static __always_inline void
+static __always_inline int
crypto_aegis128_aesni_process_crypt(struct aegis_state *state,
struct skcipher_walk *walk, bool enc)
{
+ int err = 0;
+
while (walk->nbytes >= AEGIS128_BLOCK_SIZE) {
if (enc)
aegis128_aesni_enc(state, walk->src.virt.addr,
@@ -119,7 +121,10 @@ crypto_aegis128_aesni_process_crypt(struct aegis_state *state,
walk->dst.virt.addr,
round_down(walk->nbytes,
AEGIS128_BLOCK_SIZE));
- skcipher_walk_done(walk, walk->nbytes % AEGIS128_BLOCK_SIZE);
+ kernel_fpu_end();
+ err = skcipher_walk_done(walk,
+ walk->nbytes % AEGIS128_BLOCK_SIZE);
+ kernel_fpu_begin();
}
if (walk->nbytes) {
@@ -131,8 +136,11 @@ crypto_aegis128_aesni_process_crypt(struct aegis_state *state,
aegis128_aesni_dec_tail(state, walk->src.virt.addr,
walk->dst.virt.addr,
walk->nbytes);
- skcipher_walk_done(walk, 0);
+ kernel_fpu_end();
+ err = skcipher_walk_done(walk, 0);
+ kernel_fpu_begin();
}
+ return err;
}
static struct aegis_ctx *crypto_aegis128_aesni_ctx(struct crypto_aead *aead)
@@ -165,7 +173,7 @@ static int crypto_aegis128_aesni_setauthsize(struct crypto_aead *tfm,
return 0;
}
-static __always_inline void
+static __always_inline int
crypto_aegis128_aesni_crypt(struct aead_request *req,
struct aegis_block *tag_xor,
unsigned int cryptlen, bool enc)
@@ -174,20 +182,24 @@ crypto_aegis128_aesni_crypt(struct aead_request *req,
struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(tfm);
struct skcipher_walk walk;
struct aegis_state state;
+ int err;
if (enc)
- skcipher_walk_aead_encrypt(&walk, req, true);
+ err = skcipher_walk_aead_encrypt(&walk, req, false);
else
- skcipher_walk_aead_decrypt(&walk, req, true);
+ err = skcipher_walk_aead_decrypt(&walk, req, false);
+ if (err)
+ return err;
kernel_fpu_begin();
aegis128_aesni_init(&state, &ctx->key, req->iv);
crypto_aegis128_aesni_process_ad(&state, req->src, req->assoclen);
- crypto_aegis128_aesni_process_crypt(&state, &walk, enc);
- aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
-
+ err = crypto_aegis128_aesni_process_crypt(&state, &walk, enc);
+ if (err == 0)
+ aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
kernel_fpu_end();
+ return err;
}
static int crypto_aegis128_aesni_encrypt(struct aead_request *req)
@@ -196,8 +208,11 @@ static int crypto_aegis128_aesni_encrypt(struct aead_request *req)
struct aegis_block tag = {};
unsigned int authsize = crypto_aead_authsize(tfm);
unsigned int cryptlen = req->cryptlen;
+ int err;
- crypto_aegis128_aesni_crypt(req, &tag, cryptlen, true);
+ err = crypto_aegis128_aesni_crypt(req, &tag, cryptlen, true);
+ if (err)
+ return err;
scatterwalk_map_and_copy(tag.bytes, req->dst,
req->assoclen + cryptlen, authsize, 1);
@@ -212,11 +227,14 @@ static int crypto_aegis128_aesni_decrypt(struct aead_request *req)
struct aegis_block tag;
unsigned int authsize = crypto_aead_authsize(tfm);
unsigned int cryptlen = req->cryptlen - authsize;
+ int err;
scatterwalk_map_and_copy(tag.bytes, req->src,
req->assoclen + cryptlen, authsize, 0);
- crypto_aegis128_aesni_crypt(req, &tag, cryptlen, false);
+ err = crypto_aegis128_aesni_crypt(req, &tag, cryptlen, false);
+ if (err)
+ return err;
return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? -EBADMSG : 0;
}
diff --git a/arch/x86/crypto/aria_aesni_avx2_glue.c b/arch/x86/crypto/aria_aesni_avx2_glue.c
index b4bddcd58457..007b250f774c 100644
--- a/arch/x86/crypto/aria_aesni_avx2_glue.c
+++ b/arch/x86/crypto/aria_aesni_avx2_glue.c
@@ -9,6 +9,7 @@
#include <crypto/aria.h>
#include <linux/crypto.h>
#include <linux/err.h>
+#include <linux/export.h>
#include <linux/module.h>
#include <linux/types.h>
diff --git a/arch/x86/crypto/aria_aesni_avx_glue.c b/arch/x86/crypto/aria_aesni_avx_glue.c
index ab9b38d05332..4c88ef4eba82 100644
--- a/arch/x86/crypto/aria_aesni_avx_glue.c
+++ b/arch/x86/crypto/aria_aesni_avx_glue.c
@@ -9,6 +9,7 @@
#include <crypto/aria.h>
#include <linux/crypto.h>
#include <linux/err.h>
+#include <linux/export.h>
#include <linux/module.h>
#include <linux/types.h>
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
index a7d162388142..5c321f255eb7 100644
--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -8,6 +8,7 @@
#include <crypto/algapi.h>
#include <linux/crypto.h>
#include <linux/err.h>
+#include <linux/export.h>
#include <linux/module.h>
#include <linux/types.h>
diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
index 3bd37d664121..cbede120e5f2 100644
--- a/arch/x86/crypto/camellia_glue.c
+++ b/arch/x86/crypto/camellia_glue.c
@@ -10,6 +10,7 @@
#include <linux/unaligned.h>
#include <linux/crypto.h>
+#include <linux/export.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/types.h>
diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c
index dcfc0de333de..d587f05c3c8c 100644
--- a/arch/x86/crypto/curve25519-x86_64.c
+++ b/arch/x86/crypto/curve25519-x86_64.c
@@ -7,6 +7,7 @@
#include <crypto/curve25519.h>
#include <crypto/internal/kpp.h>
+#include <linux/export.h>
#include <linux/types.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index e640abc1cb8a..9c8b3a335d5c 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -12,6 +12,7 @@
#include <linux/types.h>
#include <linux/crypto.h>
#include <linux/err.h>
+#include <linux/export.h>
#include <crypto/algapi.h>
#include <crypto/serpent.h>
diff --git a/arch/x86/crypto/sm4_aesni_avx_glue.c b/arch/x86/crypto/sm4_aesni_avx_glue.c
index 72867fc49ce8..88caf418a06f 100644
--- a/arch/x86/crypto/sm4_aesni_avx_glue.c
+++ b/arch/x86/crypto/sm4_aesni_avx_glue.c
@@ -11,6 +11,7 @@
#include <asm/fpu/api.h>
#include <linux/module.h>
#include <linux/crypto.h>
+#include <linux/export.h>
#include <linux/kernel.h>
#include <crypto/internal/skcipher.h>
#include <crypto/sm4.h>
diff --git a/arch/x86/crypto/twofish_glue.c b/arch/x86/crypto/twofish_glue.c
index 4c67184dc573..8e9906d36902 100644
--- a/arch/x86/crypto/twofish_glue.c
+++ b/arch/x86/crypto/twofish_glue.c
@@ -40,6 +40,7 @@
#include <crypto/algapi.h>
#include <crypto/twofish.h>
+#include <linux/export.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/types.h>
diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index 1a1ecfa7f72a..8ad77725bf60 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -9,6 +9,7 @@
#include <crypto/algapi.h>
#include <crypto/twofish.h>
#include <linux/crypto.h>
+#include <linux/export.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/types.h>
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 23d86c9750b9..07ba4935e873 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -488,11 +488,14 @@ static inline void apic_setup_apic_calls(void) { }
extern void apic_ack_irq(struct irq_data *data);
+#define APIC_VECTOR_TO_BIT_NUMBER(v) ((unsigned int)(v) % 32)
+#define APIC_VECTOR_TO_REG_OFFSET(v) ((unsigned int)(v) / 32 * 0x10)
+
static inline bool lapic_vector_set_in_irr(unsigned int vector)
{
- u32 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
+ u32 irr = apic_read(APIC_IRR + APIC_VECTOR_TO_REG_OFFSET(vector));
- return !!(irr & (1U << (vector % 32)));
+ return !!(irr & (1U << APIC_VECTOR_TO_BIT_NUMBER(vector)));
}
static inline bool is_vector_pending(unsigned int vector)
@@ -500,6 +503,65 @@ static inline bool is_vector_pending(unsigned int vector)
return lapic_vector_set_in_irr(vector) || pi_pending_this_cpu(vector);
}
+#define MAX_APIC_VECTOR 256
+#define APIC_VECTORS_PER_REG 32
+
+/*
+ * Vector states are maintained by APIC in 32-bit registers that are
+ * 16 bytes aligned. The status of each vector is kept in a single
+ * bit.
+ */
+static inline int apic_find_highest_vector(void *bitmap)
+{
+ int vec;
+ u32 *reg;
+
+ for (vec = MAX_APIC_VECTOR - APIC_VECTORS_PER_REG; vec >= 0; vec -= APIC_VECTORS_PER_REG) {
+ reg = bitmap + APIC_VECTOR_TO_REG_OFFSET(vec);
+ if (*reg)
+ return __fls(*reg) + vec;
+ }
+
+ return -1;
+}
+
+static inline u32 apic_get_reg(void *regs, int reg)
+{
+ return *((u32 *) (regs + reg));
+}
+
+static inline void apic_set_reg(void *regs, int reg, u32 val)
+{
+ *((u32 *) (regs + reg)) = val;
+}
+
+static __always_inline u64 apic_get_reg64(void *regs, int reg)
+{
+ BUILD_BUG_ON(reg != APIC_ICR);
+ return *((u64 *) (regs + reg));
+}
+
+static __always_inline void apic_set_reg64(void *regs, int reg, u64 val)
+{
+ BUILD_BUG_ON(reg != APIC_ICR);
+ *((u64 *) (regs + reg)) = val;
+}
+
+static inline void apic_clear_vector(int vec, void *bitmap)
+{
+ clear_bit(APIC_VECTOR_TO_BIT_NUMBER(vec), bitmap + APIC_VECTOR_TO_REG_OFFSET(vec));
+}
+
+static inline void apic_set_vector(int vec, void *bitmap)
+{
+ set_bit(APIC_VECTOR_TO_BIT_NUMBER(vec), bitmap + APIC_VECTOR_TO_REG_OFFSET(vec));
+}
+
+static inline int apic_test_vector(int vec, void *bitmap)
+{
+ return test_bit(APIC_VECTOR_TO_BIT_NUMBER(vec), bitmap + APIC_VECTOR_TO_REG_OFFSET(vec));
+}
+
/*
* Warm reset vector position:
*/
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 5036f13ab69f..5a0d42464d44 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -26,7 +26,22 @@ enum {
IRQ_REMAP_X2APIC_MODE,
};
-struct vcpu_data {
+/*
+ * This is mainly used to communicate information back-and-forth
+ * between SVM and IOMMU for setting up and tearing down posted
+ * interrupt
+ */
+struct amd_iommu_pi_data {
+ u64 vapic_addr; /* Physical address of the vCPU's vAPIC. */
+ u32 ga_tag;
+ u32 vector; /* Guest vector of the interrupt */
+ int cpu;
+ bool ga_log_intr;
+ bool is_guest_mode;
+ void *ir_data;
+};
+
+struct intel_iommu_pi_data {
u64 pi_desc_addr; /* Physical address of PI Descriptor */
u32 vector; /* Guest vector of the interrupt */
};
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 8d50e3e0a19b..18a5c3119e1a 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -49,7 +49,6 @@ KVM_X86_OP(set_idt)
KVM_X86_OP(get_gdt)
KVM_X86_OP(set_gdt)
KVM_X86_OP(sync_dirty_debug_regs)
-KVM_X86_OP(set_dr6)
KVM_X86_OP(set_dr7)
KVM_X86_OP(cache_reg)
KVM_X86_OP(get_rflags)
@@ -112,7 +111,7 @@ KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging)
KVM_X86_OP_OPTIONAL(vcpu_blocking)
KVM_X86_OP_OPTIONAL(vcpu_unblocking)
KVM_X86_OP_OPTIONAL(pi_update_irte)
-KVM_X86_OP_OPTIONAL(pi_start_assignment)
+KVM_X86_OP_OPTIONAL(pi_start_bypass)
KVM_X86_OP_OPTIONAL(apicv_pre_state_restore)
KVM_X86_OP_OPTIONAL(apicv_post_state_restore)
KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt)
@@ -139,7 +138,7 @@ KVM_X86_OP(check_emulate_instruction)
KVM_X86_OP(apic_init_signal_blocked)
KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush)
KVM_X86_OP_OPTIONAL(migrate_timers)
-KVM_X86_OP(msr_filter_changed)
+KVM_X86_OP(recalc_msr_intercepts)
KVM_X86_OP(complete_emulated_msr)
KVM_X86_OP(vcpu_deliver_sipi_vector)
KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f7af967aa16f..f19a76d3ca0e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -297,6 +297,7 @@ enum x86_intercept_stage;
*/
#define KVM_APIC_PV_EOI_PENDING 1
+struct kvm_kernel_irqfd;
struct kvm_kernel_irq_routing_entry;
/*
@@ -1320,6 +1321,12 @@ enum kvm_apicv_inhibit {
*/
APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED,
+ /*
+ * AVIC is disabled because the vCPU's APIC ID is beyond the max
+ * supported by AVIC/x2AVIC, i.e. the vCPU is unaddressable.
+ */
+ APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG,
+
NR_APICV_INHIBIT_REASONS,
};
@@ -1338,7 +1345,8 @@ enum kvm_apicv_inhibit {
__APICV_INHIBIT_REASON(IRQWIN), \
__APICV_INHIBIT_REASON(PIT_REINJ), \
__APICV_INHIBIT_REASON(SEV), \
- __APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED)
+ __APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED), \
+ __APICV_INHIBIT_REASON(PHYSICAL_ID_TOO_BIG)
struct kvm_arch {
unsigned long n_used_mmu_pages;
@@ -1350,7 +1358,7 @@ struct kvm_arch {
bool has_private_mem;
bool has_protected_state;
bool pre_fault_allowed;
- struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
+ struct hlist_head *mmu_page_hash;
struct list_head active_mmu_pages;
/*
* A list of kvm_mmu_page structs that, if zapped, could possibly be
@@ -1379,11 +1387,13 @@ struct kvm_arch {
#define __KVM_HAVE_ARCH_NONCOHERENT_DMA
atomic_t noncoherent_dma_count;
-#define __KVM_HAVE_ARCH_ASSIGNED_DEVICE
- atomic_t assigned_device_count;
+ unsigned long nr_possible_bypass_irqs;
+
+#ifdef CONFIG_KVM_IOAPIC
struct kvm_pic *vpic;
struct kvm_ioapic *vioapic;
struct kvm_pit *vpit;
+#endif
atomic_t vapics_in_nmi_mode;
struct mutex apic_map_lock;
struct kvm_apic_map __rcu *apic_map;
@@ -1398,12 +1408,8 @@ struct kvm_arch {
gpa_t wall_clock;
- bool mwait_in_guest;
- bool hlt_in_guest;
- bool pause_in_guest;
- bool cstate_in_guest;
+ u64 disabled_exits;
- unsigned long irq_sources_bitmap;
s64 kvmclock_offset;
/*
@@ -1432,9 +1438,6 @@ struct kvm_arch {
struct delayed_work kvmclock_update_work;
struct delayed_work kvmclock_sync_work;
- /* reads protected by irq_srcu, writes by irq_lock */
- struct hlist_head mask_notifier_list;
-
#ifdef CONFIG_KVM_HYPERV
struct kvm_hv hyperv;
#endif
@@ -1457,6 +1460,7 @@ struct kvm_arch {
bool x2apic_format;
bool x2apic_broadcast_quirk_disabled;
+ bool has_mapped_host_mmio;
bool guest_can_read_msr_platform_info;
bool exception_payload_enabled;
@@ -1680,6 +1684,12 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
return dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL;
}
+enum kvm_x86_run_flags {
+ KVM_RUN_FORCE_IMMEDIATE_EXIT = BIT(0),
+ KVM_RUN_LOAD_GUEST_DR6 = BIT(1),
+ KVM_RUN_LOAD_DEBUGCTL = BIT(2),
+};
+
struct kvm_x86_ops {
const char *name;
@@ -1708,6 +1718,12 @@ struct kvm_x86_ops {
void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
void (*vcpu_put)(struct kvm_vcpu *vcpu);
+ /*
+ * Mask of DEBUGCTL bits that are owned by the host, i.e. that need to
+ * match the host's value even while the guest is active.
+ */
+ const u64 HOST_OWNED_DEBUGCTL;
+
void (*update_exception_bitmap)(struct kvm_vcpu *vcpu);
int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
@@ -1730,7 +1746,6 @@ struct kvm_x86_ops {
void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu);
- void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value);
void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
@@ -1761,7 +1776,7 @@ struct kvm_x86_ops {
int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu,
- bool force_immediate_exit);
+ u64 run_flags);
int (*handle_exit)(struct kvm_vcpu *vcpu,
enum exit_fastpath_completion exit_fastpath);
int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
@@ -1853,9 +1868,10 @@ struct kvm_x86_ops {
void (*vcpu_blocking)(struct kvm_vcpu *vcpu);
void (*vcpu_unblocking)(struct kvm_vcpu *vcpu);
- int (*pi_update_irte)(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set);
- void (*pi_start_assignment)(struct kvm *kvm);
+ int (*pi_update_irte)(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+ unsigned int host_irq, uint32_t guest_irq,
+ struct kvm_vcpu *vcpu, u32 vector);
+ void (*pi_start_bypass)(struct kvm *kvm);
void (*apicv_pre_state_restore)(struct kvm_vcpu *vcpu);
void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu);
@@ -1892,7 +1908,7 @@ struct kvm_x86_ops {
int (*enable_l2_tlb_flush)(struct kvm_vcpu *vcpu);
void (*migrate_timers)(struct kvm_vcpu *vcpu);
- void (*msr_filter_changed)(struct kvm_vcpu *vcpu);
+ void (*recalc_msr_intercepts)(struct kvm_vcpu *vcpu);
int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err);
void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector);
@@ -1950,6 +1966,7 @@ struct kvm_arch_async_pf {
extern u32 __read_mostly kvm_nr_uret_msrs;
extern bool __read_mostly allow_smaller_maxphyaddr;
extern bool __read_mostly enable_apicv;
+extern bool __read_mostly enable_ipiv;
extern bool __read_mostly enable_device_posted_irqs;
extern struct kvm_x86_ops kvm_x86_ops;
@@ -1968,7 +1985,7 @@ void kvm_x86_vendor_exit(void);
#define __KVM_HAVE_ARCH_VM_ALLOC
static inline struct kvm *kvm_arch_alloc_vm(void)
{
- return __vmalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ return kvzalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT);
}
#define __KVM_HAVE_ARCH_VM_FREE
@@ -2013,7 +2030,7 @@ void kvm_mmu_vendor_module_exit(void);
void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
int kvm_mmu_create(struct kvm_vcpu *vcpu);
-void kvm_mmu_init_vm(struct kvm *kvm);
+int kvm_mmu_init_vm(struct kvm *kvm);
void kvm_mmu_uninit_vm(struct kvm *kvm);
void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm,
@@ -2044,19 +2061,6 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
const void *val, int bytes);
-struct kvm_irq_mask_notifier {
- void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked);
- int irq;
- struct hlist_node link;
-};
-
-void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
- struct kvm_irq_mask_notifier *kimn);
-void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
- struct kvm_irq_mask_notifier *kimn);
-void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
- bool mask);
-
extern bool tdp_enabled;
u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
@@ -2215,9 +2219,6 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state,
return !!(*irq_state);
}
-int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level);
-void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
-
void kvm_inject_nmi(struct kvm_vcpu *vcpu);
int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu);
@@ -2394,9 +2395,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_vcpu **dest_vcpu);
-void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
- struct kvm_lapic_irq *irq);
-
static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq)
{
/* We can only post Fixed and LowPrio IRQs */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 7490bb5c0776..b65c3ba5fa14 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -419,6 +419,7 @@
#define DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI (1UL << 12)
#define DEBUGCTLMSR_FREEZE_IN_SMM_BIT 14
#define DEBUGCTLMSR_FREEZE_IN_SMM (1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT)
+#define DEBUGCTLMSR_RTM_DEBUG BIT(15)
#define MSR_PEBS_FRONTEND 0x000003f7
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 97954c936c54..e33df3da6980 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -301,16 +301,15 @@ static inline bool pmd_leaf(pmd_t pte)
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-/* NOTE: when predicate huge page, consider also pmd_devmap, or use pmd_leaf */
static inline int pmd_trans_huge(pmd_t pmd)
{
- return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE;
+ return (pmd_val(pmd) & _PAGE_PSE) == _PAGE_PSE;
}
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static inline int pud_trans_huge(pud_t pud)
{
- return (pud_val(pud) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE;
+ return (pud_val(pud) & _PAGE_PSE) == _PAGE_PSE;
}
#endif
@@ -320,24 +319,6 @@ static inline int has_transparent_hugepage(void)
return boot_cpu_has(X86_FEATURE_PSE);
}
-#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
-static inline int pmd_devmap(pmd_t pmd)
-{
- return !!(pmd_val(pmd) & _PAGE_DEVMAP);
-}
-
-#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-static inline int pud_devmap(pud_t pud)
-{
- return !!(pud_val(pud) & _PAGE_DEVMAP);
-}
-#else
-static inline int pud_devmap(pud_t pud)
-{
- return 0;
-}
-#endif
-
#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
static inline bool pmd_special(pmd_t pmd)
{
@@ -361,12 +342,6 @@ static inline pud_t pud_mkspecial(pud_t pud)
return pud_set_flags(pud, _PAGE_SPECIAL);
}
#endif /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */
-
-static inline int pgd_devmap(pgd_t pgd)
-{
- return 0;
-}
-#endif
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
@@ -527,11 +502,6 @@ static inline pte_t pte_mkspecial(pte_t pte)
return pte_set_flags(pte, _PAGE_SPECIAL);
}
-static inline pte_t pte_mkdevmap(pte_t pte)
-{
- return pte_set_flags(pte, _PAGE_SPECIAL|_PAGE_DEVMAP);
-}
-
/* See comments above mksaveddirty_shift() */
static inline pmd_t pmd_mksaveddirty(pmd_t pmd)
{
@@ -603,11 +573,6 @@ static inline pmd_t pmd_mkwrite_shstk(pmd_t pmd)
return pmd_set_flags(pmd, _PAGE_DIRTY);
}
-static inline pmd_t pmd_mkdevmap(pmd_t pmd)
-{
- return pmd_set_flags(pmd, _PAGE_DEVMAP);
-}
-
static inline pmd_t pmd_mkhuge(pmd_t pmd)
{
return pmd_set_flags(pmd, _PAGE_PSE);
@@ -673,11 +638,6 @@ static inline pud_t pud_mkdirty(pud_t pud)
return pud_mksaveddirty(pud);
}
-static inline pud_t pud_mkdevmap(pud_t pud)
-{
- return pud_set_flags(pud, _PAGE_DEVMAP);
-}
-
static inline pud_t pud_mkhuge(pud_t pud)
{
return pud_set_flags(pud, _PAGE_PSE);
@@ -1008,13 +968,6 @@ static inline int pte_present(pte_t a)
return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
}
-#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
-static inline int pte_devmap(pte_t a)
-{
- return (pte_flags(a) & _PAGE_DEVMAP) == _PAGE_DEVMAP;
-}
-#endif
-
#define pte_accessible pte_accessible
static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
{
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index a5731fb1e9dd..2ec250ba467e 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -34,7 +34,6 @@
#define _PAGE_BIT_UFFD_WP _PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */
#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
#define _PAGE_BIT_KERNEL_4K _PAGE_BIT_SOFTW3 /* page must not be converted to large */
-#define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4
#ifdef CONFIG_X86_64
#define _PAGE_BIT_SAVED_DIRTY _PAGE_BIT_SOFTW5 /* Saved Dirty bit (leaf) */
@@ -121,11 +120,9 @@
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
-#define _PAGE_DEVMAP (_AT(u64, 1) << _PAGE_BIT_DEVMAP)
#define _PAGE_SOFTW4 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW4)
#else
#define _PAGE_NX (_AT(pteval_t, 0))
-#define _PAGE_DEVMAP (_AT(pteval_t, 0))
#define _PAGE_SOFTW4 (_AT(pteval_t, 0))
#endif
@@ -154,7 +151,7 @@
#define _COMMON_PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
_PAGE_SPECIAL | _PAGE_ACCESSED | \
_PAGE_DIRTY_BITS | _PAGE_SOFT_DIRTY | \
- _PAGE_DEVMAP | _PAGE_CC | _PAGE_UFFD_WP)
+ _PAGE_CC | _PAGE_UFFD_WP)
#define _PAGE_CHG_MASK (_COMMON_PAGE_CHG_MASK | _PAGE_PAT)
#define _HPAGE_CHG_MASK (_COMMON_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_PAT_LARGE)
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index ad954a1a6656..ffc27f676243 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -252,16 +252,21 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31
#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
+/*
+ * GA_LOG_INTR is a synthetic flag that's never propagated to hardware-visible
+ * tables. GA_LOG_INTR is set if the vCPU needs device posted IRQs to generate
+ * GA log interrupts to wake the vCPU (because it's blocking or about to block).
+ */
+#define AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR BIT_ULL(61)
+
#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK GENMASK_ULL(11, 0)
-#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12)
+#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK GENMASK_ULL(51, 12)
#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)
#define AVIC_PHYSICAL_ID_TABLE_SIZE_MASK (0xFFULL)
#define AVIC_DOORBELL_PHYSICAL_ID_MASK GENMASK_ULL(11, 0)
-#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
-
#define AVIC_UNACCEL_ACCESS_WRITE_MASK 1
#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0
#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF
@@ -290,8 +295,6 @@ enum avic_ipi_failure_cause {
static_assert((AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == AVIC_MAX_PHYSICAL_ID);
static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_MAX_PHYSICAL_ID);
-#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF)
-
#define SVM_SEV_FEAT_SNP_ACTIVE BIT(0)
#define SVM_SEV_FEAT_RESTRICTED_INJECTION BIT(3)
#define SVM_SEV_FEAT_ALTERNATE_INJECTION BIT(4)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index e9b81876ebe4..00daedfefc1b 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -356,11 +356,6 @@ static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *b
mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
}
-static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
-{
- flush_tlb_mm(mm);
-}
-
extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
static inline bool pte_flags_need_flush(unsigned long oldflags,
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 279148e72459..308dbbae6c6e 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -279,7 +279,7 @@ static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl,
static struct sgx_encl_page *sgx_encl_load_page_in_vma(struct sgx_encl *encl,
unsigned long addr,
- unsigned long vm_flags)
+ vm_flags_t vm_flags)
{
unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS;
struct sgx_encl_page *entry;
@@ -520,9 +520,9 @@ static void sgx_vma_open(struct vm_area_struct *vma)
* Return: 0 on success, -EACCES otherwise
*/
int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
- unsigned long end, unsigned long vm_flags)
+ unsigned long end, vm_flags_t vm_flags)
{
- unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS;
+ vm_flags_t vm_prot_bits = vm_flags & VM_ACCESS_FLAGS;
struct sgx_encl_page *page;
unsigned long count = 0;
int ret = 0;
@@ -605,7 +605,7 @@ static int sgx_encl_debug_write(struct sgx_encl *encl, struct sgx_encl_page *pag
*/
static struct sgx_encl_page *sgx_encl_reserve_page(struct sgx_encl *encl,
unsigned long addr,
- unsigned long vm_flags)
+ vm_flags_t vm_flags)
{
struct sgx_encl_page *entry;
diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h
index f94ff14c9486..8ff47f6652b9 100644
--- a/arch/x86/kernel/cpu/sgx/encl.h
+++ b/arch/x86/kernel/cpu/sgx/encl.h
@@ -101,7 +101,7 @@ static inline int sgx_encl_find(struct mm_struct *mm, unsigned long addr,
}
int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
- unsigned long end, unsigned long vm_flags);
+ unsigned long end, vm_flags_t vm_flags);
bool current_is_ksgxd(void);
void sgx_encl_release(struct kref *ref);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index fb27be697128..0792f31961ac 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -213,8 +213,10 @@ arch_initcall(init_x86_sysctl);
*/
struct screen_info screen_info;
EXPORT_SYMBOL(screen_info);
+#if defined(CONFIG_FIRMWARE_EDID)
struct edid_info edid_info;
EXPORT_SYMBOL_GPL(edid_info);
+#endif
extern int root_mountflags;
@@ -525,7 +527,9 @@ static void __init parse_boot_params(void)
{
ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
screen_info = boot_params.screen_info;
+#if defined(CONFIG_FIRMWARE_EDID)
edid_info = boot_params.edid_info;
+#endif
#ifdef CONFIG_X86_32
apm_info.bios = boot_params.apm_bios_info;
ist_info = boot_params.ist_info;
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 2eeffcec5382..2c86673155c9 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -166,6 +166,16 @@ config KVM_AMD_SEV
Encrypted State (SEV-ES), and Secure Encrypted Virtualization with
Secure Nested Paging (SEV-SNP) technologies on AMD processors.
+config KVM_IOAPIC
+ bool "I/O APIC, PIC, and PIT emulation"
+ default y
+ depends on KVM
+ help
+ Provides support for KVM to emulate an I/O APIC, PIC, and PIT, i.e.
+ for full in-kernel APIC emulation.
+
+ If unsure, say Y.
+
config KVM_SMM
bool "System Management Mode emulation"
default y
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index a5d362c7b504..c4b8950c7abe 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -5,12 +5,11 @@ ccflags-$(CONFIG_KVM_WERROR) += -Werror
include $(srctree)/virt/kvm/Makefile.kvm
-kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \
- i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
- debugfs.o mmu/mmu.o mmu/page_track.o \
- mmu/spte.o
+kvm-y += x86.o emulate.o irq.o lapic.o cpuid.o pmu.o mtrr.o \
+ debugfs.o mmu/mmu.o mmu/page_track.o mmu/spte.o
kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o
+kvm-$(CONFIG_KVM_IOAPIC) += i8259.o i8254.o ioapic.o
kvm-$(CONFIG_KVM_HYPERV) += hyperv.o
kvm-$(CONFIG_KVM_XEN) += xen.o
kvm-$(CONFIG_KVM_SMM) += smm.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index f84bc0569c9c..e2836a255b16 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -979,6 +979,7 @@ void kvm_set_cpu_caps(void)
F(FSRS),
F(FSRC),
F(WRMSRNS),
+ X86_64_F(LKGS),
F(AMX_FP16),
F(AVX_IFMA),
F(LAM),
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index ee27064dd72f..72b19a88a776 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -497,15 +497,19 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint)
return ret;
}
-int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vpidx, u32 sint)
+int kvm_hv_synic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status)
{
struct kvm_vcpu_hv_synic *synic;
- synic = synic_get(kvm, vpidx);
+ if (!level)
+ return -1;
+
+ synic = synic_get(kvm, e->hv_sint.vcpu);
if (!synic)
return -EINVAL;
- return synic_set_irq(synic, sint);
+ return synic_set_irq(synic, e->hv_sint.sint);
}
void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector)
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 913bfc96959c..6ce160ffa678 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -103,7 +103,8 @@ static inline bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu)
int kvm_hv_hypercall(struct kvm_vcpu *vcpu);
void kvm_hv_irq_routing_update(struct kvm *kvm);
-int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint);
+int kvm_hv_synic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status);
void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector);
int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages);
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 739aa6c0d0c3..d1b79b418c05 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -248,8 +248,8 @@ static void pit_do_work(struct kthread_work *work)
if (atomic_read(&ps->reinject) && !atomic_xchg(&ps->irq_ack, 0))
return;
- kvm_set_irq(kvm, pit->irq_source_id, 0, 1, false);
- kvm_set_irq(kvm, pit->irq_source_id, 0, 0, false);
+ kvm_set_irq(kvm, KVM_PIT_IRQ_SOURCE_ID, 0, 1, false);
+ kvm_set_irq(kvm, KVM_PIT_IRQ_SOURCE_ID, 0, 0, false);
/*
* Provides NMI watchdog support via Virtual Wire mode.
@@ -288,7 +288,7 @@ static inline void kvm_pit_reset_reinject(struct kvm_pit *pit)
atomic_set(&pit->pit_state.irq_ack, 1);
}
-void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject)
+static void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject)
{
struct kvm_kpit_state *ps = &pit->pit_state;
struct kvm *kvm = pit->kvm;
@@ -400,8 +400,8 @@ static void pit_load_count(struct kvm_pit *pit, int channel, u32 val)
}
}
-void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
- int hpet_legacy_start)
+static void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
+ int hpet_legacy_start)
{
u8 saved_mode;
@@ -649,6 +649,79 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
kvm_pit_reset_reinject(pit);
}
+int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
+{
+ struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
+
+ BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
+
+ mutex_lock(&kps->lock);
+ memcpy(ps, &kps->channels, sizeof(*ps));
+ mutex_unlock(&kps->lock);
+ return 0;
+}
+
+int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
+{
+ int i;
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ mutex_lock(&pit->pit_state.lock);
+ memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
+ for (i = 0; i < 3; i++)
+ kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
+ mutex_unlock(&pit->pit_state.lock);
+ return 0;
+}
+
+int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
+{
+ mutex_lock(&kvm->arch.vpit->pit_state.lock);
+ memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
+ sizeof(ps->channels));
+ ps->flags = kvm->arch.vpit->pit_state.flags;
+ mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ memset(&ps->reserved, 0, sizeof(ps->reserved));
+ return 0;
+}
+
+int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
+{
+ int start = 0;
+ int i;
+ u32 prev_legacy, cur_legacy;
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ mutex_lock(&pit->pit_state.lock);
+ prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
+ cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
+ if (!prev_legacy && cur_legacy)
+ start = 1;
+ memcpy(&pit->pit_state.channels, &ps->channels,
+ sizeof(pit->pit_state.channels));
+ pit->pit_state.flags = ps->flags;
+ for (i = 0; i < 3; i++)
+ kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
+ start && i == 0);
+ mutex_unlock(&pit->pit_state.lock);
+ return 0;
+}
+
+int kvm_vm_ioctl_reinject(struct kvm *kvm, struct kvm_reinject_control *control)
+{
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ /* pit->pit_state.lock was overloaded to prevent userspace from getting
+ * an inconsistent state after running multiple KVM_REINJECT_CONTROL
+ * ioctls in parallel. Use a separate lock if that ioctl isn't rare.
+ */
+ mutex_lock(&pit->pit_state.lock);
+ kvm_pit_set_reinject(pit, control->pit_reinject);
+ mutex_unlock(&pit->pit_state.lock);
+
+ return 0;
+}
+
static const struct kvm_io_device_ops pit_dev_ops = {
.read = pit_ioport_read,
.write = pit_ioport_write,
@@ -671,10 +744,6 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
if (!pit)
return NULL;
- pit->irq_source_id = kvm_request_irq_source_id(kvm);
- if (pit->irq_source_id < 0)
- goto fail_request;
-
mutex_init(&pit->pit_state.lock);
pid = get_pid(task_tgid(current));
@@ -726,8 +795,6 @@ fail_register_pit:
kvm_pit_set_reinject(pit, false);
kthread_destroy_worker(pit->worker);
fail_kthread:
- kvm_free_irq_source_id(kvm, pit->irq_source_id);
-fail_request:
kfree(pit);
return NULL;
}
@@ -744,7 +811,6 @@ void kvm_free_pit(struct kvm *kvm)
kvm_pit_set_reinject(pit, false);
hrtimer_cancel(&pit->pit_state.timer);
kthread_destroy_worker(pit->worker);
- kvm_free_irq_source_id(kvm, pit->irq_source_id);
kfree(pit);
}
}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index a768212ba821..60fa499d2f8a 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -6,6 +6,11 @@
#include <kvm/iodev.h>
+#include <uapi/asm/kvm.h>
+
+#include "ioapic.h"
+
+#ifdef CONFIG_KVM_IOAPIC
struct kvm_kpit_channel_state {
u32 count; /* can be 65536 */
u16 latched_count;
@@ -42,7 +47,6 @@ struct kvm_pit {
struct kvm_io_device speaker_dev;
struct kvm *kvm;
struct kvm_kpit_state pit_state;
- int irq_source_id;
struct kvm_irq_mask_notifier mask_notifier;
struct kthread_worker *worker;
struct kthread_work expired;
@@ -55,11 +59,14 @@ struct kvm_pit {
#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100
#define KVM_PIT_CHANNEL_MASK 0x3
+int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps);
+int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps);
+int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps);
+int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps);
+int kvm_vm_ioctl_reinject(struct kvm *kvm, struct kvm_reinject_control *control);
+
struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
void kvm_free_pit(struct kvm *kvm);
-
-void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
- int hpet_legacy_start);
-void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject);
+#endif /* CONFIG_KVM_IOAPIC */
#endif
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index a8fb19940975..2ac7f1678c46 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -31,6 +31,8 @@
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/bitops.h>
+
+#include "ioapic.h"
#include "irq.h"
#include <linux/kvm_host.h>
@@ -185,8 +187,11 @@ void kvm_pic_update_irq(struct kvm_pic *s)
pic_unlock(s);
}
-int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level)
+int kvm_pic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status)
{
+ struct kvm_pic *s = kvm->arch.vpic;
+ int irq = e->irqchip.pin;
int ret, irq_level;
BUG_ON(irq < 0 || irq >= PIC_NUM_PINS);
@@ -203,16 +208,6 @@ int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level)
return ret;
}
-void kvm_pic_clear_all(struct kvm_pic *s, int irq_source_id)
-{
- int i;
-
- pic_lock(s);
- for (i = 0; i < PIC_NUM_PINS; i++)
- __clear_bit(irq_source_id, &s->irq_states[i]);
- pic_unlock(s);
-}
-
/*
* acknowledge interrupt 'irq'
*/
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 45dae2d5d2f1..2b5d389bca5f 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -41,11 +41,11 @@
#include <asm/processor.h>
#include <asm/page.h>
#include <asm/current.h>
-#include <trace/events/kvm.h>
#include "ioapic.h"
#include "lapic.h"
#include "irq.h"
+#include "trace.h"
static int ioapic_service(struct kvm_ioapic *vioapic, int irq,
bool line_status);
@@ -310,6 +310,42 @@ void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm)
kvm_make_scan_ioapic_request(kvm);
}
+void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+ mutex_lock(&kvm->irq_lock);
+ kimn->irq = irq;
+ hlist_add_head_rcu(&kimn->link, &ioapic->mask_notifier_list);
+ mutex_unlock(&kvm->irq_lock);
+}
+
+void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn)
+{
+ mutex_lock(&kvm->irq_lock);
+ hlist_del_rcu(&kimn->link);
+ mutex_unlock(&kvm->irq_lock);
+ synchronize_srcu(&kvm->irq_srcu);
+}
+
+void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
+ bool mask)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+ struct kvm_irq_mask_notifier *kimn;
+ int idx, gsi;
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
+ if (gsi != -1)
+ hlist_for_each_entry_rcu(kimn, &ioapic->mask_notifier_list, link)
+ if (kimn->irq == gsi)
+ kimn->func(kimn, mask);
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+}
+
static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
{
unsigned index;
@@ -479,9 +515,11 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
return ret;
}
-int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
- int level, bool line_status)
+int kvm_ioapic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status)
{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+ int irq = e->irqchip.pin;
int ret, irq_level;
BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS);
@@ -496,16 +534,6 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
return ret;
}
-void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id)
-{
- int i;
-
- spin_lock(&ioapic->lock);
- for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++)
- __clear_bit(irq_source_id, &ioapic->irq_states[i]);
- spin_unlock(&ioapic->lock);
-}
-
static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
{
int i;
@@ -718,6 +746,7 @@ int kvm_ioapic_init(struct kvm *kvm)
return -ENOMEM;
spin_lock_init(&ioapic->lock);
INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work);
+ INIT_HLIST_HEAD(&ioapic->mask_notifier_list);
kvm->arch.vioapic = ioapic;
kvm_ioapic_reset(ioapic);
kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index aa8cb4ac0479..bf28dbc11ff6 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -86,8 +86,24 @@ struct kvm_ioapic {
struct delayed_work eoi_inject;
u32 irq_eoi[IOAPIC_NUM_PINS];
u32 irr_delivered;
+
+ /* reads protected by irq_srcu, writes by irq_lock */
+ struct hlist_head mask_notifier_list;
+};
+
+struct kvm_irq_mask_notifier {
+ void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked);
+ int irq;
+ struct hlist_node link;
};
+void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn);
+void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn);
+void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
+ bool mask);
+
#ifdef DEBUG
#define ASSERT(x) \
do { \
@@ -103,7 +119,7 @@ do { \
static inline int ioapic_in_kernel(struct kvm *kvm)
{
- return irqchip_kernel(kvm);
+ return irqchip_full(kvm);
}
void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
@@ -111,9 +127,9 @@ void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector,
int trigger_mode);
int kvm_ioapic_init(struct kvm *kvm);
void kvm_ioapic_destroy(struct kvm *kvm);
-int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
- int level, bool line_status);
-void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id);
+int kvm_ioapic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status);
+
void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 97d68d837929..16da89259011 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -11,9 +11,12 @@
#include <linux/export.h>
#include <linux/kvm_host.h>
+#include <linux/kvm_irqfd.h>
+#include "hyperv.h"
+#include "ioapic.h"
#include "irq.h"
-#include "i8254.h"
+#include "trace.h"
#include "x86.h"
#include "xen.h"
@@ -41,6 +44,14 @@ static int pending_userspace_extint(struct kvm_vcpu *v)
return v->arch.pending_external_vector != -1;
}
+static int get_userspace_extint(struct kvm_vcpu *vcpu)
+{
+ int vector = vcpu->arch.pending_external_vector;
+
+ vcpu->arch.pending_external_vector = -1;
+ return vector;
+}
+
/*
* check if there is pending interrupt from
* non-APIC source without intack.
@@ -67,10 +78,13 @@ int kvm_cpu_has_extint(struct kvm_vcpu *v)
if (!kvm_apic_accept_pic_intr(v))
return 0;
- if (irqchip_split(v->kvm))
- return pending_userspace_extint(v);
- else
+#ifdef CONFIG_KVM_IOAPIC
+ if (pic_in_kernel(v->kvm))
return v->kvm->arch.vpic->output;
+#endif
+
+ WARN_ON_ONCE(!irqchip_split(v->kvm));
+ return pending_userspace_extint(v);
}
/*
@@ -126,13 +140,13 @@ int kvm_cpu_get_extint(struct kvm_vcpu *v)
return v->kvm->arch.xen.upcall_vector;
#endif
- if (irqchip_split(v->kvm)) {
- int vector = v->arch.pending_external_vector;
-
- v->arch.pending_external_vector = -1;
- return vector;
- } else
+#ifdef CONFIG_KVM_IOAPIC
+ if (pic_in_kernel(v->kvm))
return kvm_pic_read_irq(v->kvm); /* PIC */
+#endif
+
+ WARN_ON_ONCE(!irqchip_split(v->kvm));
+ return get_userspace_extint(v);
}
EXPORT_SYMBOL_GPL(kvm_cpu_get_extint);
@@ -163,7 +177,9 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
{
__kvm_migrate_apic_timer(vcpu);
+#ifdef CONFIG_KVM_IOAPIC
__kvm_migrate_pit_timer(vcpu);
+#endif
kvm_x86_call(migrate_timers)(vcpu);
}
@@ -171,10 +187,532 @@ bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
{
bool resample = args->flags & KVM_IRQFD_FLAG_RESAMPLE;
- return resample ? irqchip_kernel(kvm) : irqchip_in_kernel(kvm);
+ return resample ? irqchip_full(kvm) : irqchip_in_kernel(kvm);
}
bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
{
return irqchip_in_kernel(kvm);
}
+
+int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq, struct dest_map *dest_map)
+{
+ int r = -1;
+ struct kvm_vcpu *vcpu, *lowest = NULL;
+ unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
+ unsigned int dest_vcpus = 0;
+
+ if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
+ return r;
+
+ if (irq->dest_mode == APIC_DEST_PHYSICAL &&
+ irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) {
+ pr_info("apic: phys broadcast and lowest prio\n");
+ irq->delivery_mode = APIC_DM_FIXED;
+ }
+
+ memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap));
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+
+ if (!kvm_apic_match_dest(vcpu, src, irq->shorthand,
+ irq->dest_id, irq->dest_mode))
+ continue;
+
+ if (!kvm_lowest_prio_delivery(irq)) {
+ if (r < 0)
+ r = 0;
+ r += kvm_apic_set_irq(vcpu, irq, dest_map);
+ } else if (kvm_apic_sw_enabled(vcpu->arch.apic)) {
+ if (!kvm_vector_hashing_enabled()) {
+ if (!lowest)
+ lowest = vcpu;
+ else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
+ lowest = vcpu;
+ } else {
+ __set_bit(i, dest_vcpu_bitmap);
+ dest_vcpus++;
+ }
+ }
+ }
+
+ if (dest_vcpus != 0) {
+ int idx = kvm_vector_to_index(irq->vector, dest_vcpus,
+ dest_vcpu_bitmap, KVM_MAX_VCPUS);
+
+ lowest = kvm_get_vcpu(kvm, idx);
+ }
+
+ if (lowest)
+ r = kvm_apic_set_irq(lowest, irq, dest_map);
+
+ return r;
+}
+
+static void kvm_msi_to_lapic_irq(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
+ struct kvm_lapic_irq *irq)
+{
+ struct msi_msg msg = { .address_lo = e->msi.address_lo,
+ .address_hi = e->msi.address_hi,
+ .data = e->msi.data };
+
+ trace_kvm_msi_set_irq(msg.address_lo | (kvm->arch.x2apic_format ?
+ (u64)msg.address_hi << 32 : 0), msg.data);
+
+ irq->dest_id = x86_msi_msg_get_destid(&msg, kvm->arch.x2apic_format);
+ irq->vector = msg.arch_data.vector;
+ irq->dest_mode = kvm_lapic_irq_dest_mode(msg.arch_addr_lo.dest_mode_logical);
+ irq->trig_mode = msg.arch_data.is_level;
+ irq->delivery_mode = msg.arch_data.delivery_mode << 8;
+ irq->msi_redir_hint = msg.arch_addr_lo.redirect_hint;
+ irq->level = 1;
+ irq->shorthand = APIC_DEST_NOSHORT;
+}
+
+static inline bool kvm_msi_route_invalid(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e)
+{
+ return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff);
+}
+
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level, bool line_status)
+{
+ struct kvm_lapic_irq irq;
+
+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+
+ if (!level)
+ return -1;
+
+ kvm_msi_to_lapic_irq(kvm, e, &irq);
+
+ return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);
+}
+
+int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level,
+ bool line_status)
+{
+ struct kvm_lapic_irq irq;
+ int r;
+
+ switch (e->type) {
+#ifdef CONFIG_KVM_HYPERV
+ case KVM_IRQ_ROUTING_HV_SINT:
+ return kvm_hv_synic_set_irq(e, kvm, irq_source_id, level,
+ line_status);
+#endif
+
+ case KVM_IRQ_ROUTING_MSI:
+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+
+ kvm_msi_to_lapic_irq(kvm, e, &irq);
+
+ if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
+ return r;
+ break;
+
+#ifdef CONFIG_KVM_XEN
+ case KVM_IRQ_ROUTING_XEN_EVTCHN:
+ if (!level)
+ return -1;
+
+ return kvm_xen_set_evtchn_fast(&e->xen_evtchn, kvm);
+#endif
+ default:
+ break;
+ }
+
+ return -EWOULDBLOCK;
+}
+
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
+ bool line_status)
+{
+ if (!irqchip_in_kernel(kvm))
+ return -ENXIO;
+
+ irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+ irq_event->irq, irq_event->level,
+ line_status);
+ return 0;
+}
+
+bool kvm_arch_can_set_irq_routing(struct kvm *kvm)
+{
+ return irqchip_in_kernel(kvm);
+}
+
+int kvm_set_routing_entry(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
+ const struct kvm_irq_routing_entry *ue)
+{
+ /* We can't check irqchip_in_kernel() here as some callers are
+ * currently initializing the irqchip. Other callers should therefore
+ * check kvm_arch_can_set_irq_routing() before calling this function.
+ */
+ switch (ue->type) {
+#ifdef CONFIG_KVM_IOAPIC
+ case KVM_IRQ_ROUTING_IRQCHIP:
+ if (irqchip_split(kvm))
+ return -EINVAL;
+ e->irqchip.pin = ue->u.irqchip.pin;
+ switch (ue->u.irqchip.irqchip) {
+ case KVM_IRQCHIP_PIC_SLAVE:
+ e->irqchip.pin += PIC_NUM_PINS / 2;
+ fallthrough;
+ case KVM_IRQCHIP_PIC_MASTER:
+ if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2)
+ return -EINVAL;
+ e->set = kvm_pic_set_irq;
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS)
+ return -EINVAL;
+ e->set = kvm_ioapic_set_irq;
+ break;
+ default:
+ return -EINVAL;
+ }
+ e->irqchip.irqchip = ue->u.irqchip.irqchip;
+ break;
+#endif
+ case KVM_IRQ_ROUTING_MSI:
+ e->set = kvm_set_msi;
+ e->msi.address_lo = ue->u.msi.address_lo;
+ e->msi.address_hi = ue->u.msi.address_hi;
+ e->msi.data = ue->u.msi.data;
+
+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+ break;
+#ifdef CONFIG_KVM_HYPERV
+ case KVM_IRQ_ROUTING_HV_SINT:
+ e->set = kvm_hv_synic_set_irq;
+ e->hv_sint.vcpu = ue->u.hv_sint.vcpu;
+ e->hv_sint.sint = ue->u.hv_sint.sint;
+ break;
+#endif
+#ifdef CONFIG_KVM_XEN
+ case KVM_IRQ_ROUTING_XEN_EVTCHN:
+ return kvm_xen_setup_evtchn(kvm, e, ue);
+#endif
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu)
+{
+ int r = 0;
+ unsigned long i;
+ struct kvm_vcpu *vcpu;
+
+ if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu))
+ return true;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+
+ if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand,
+ irq->dest_id, irq->dest_mode))
+ continue;
+
+ if (++r == 2)
+ return false;
+
+ *dest_vcpu = vcpu;
+ }
+
+ return r == 1;
+}
+EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu);
+
+void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode,
+ u8 vector, unsigned long *ioapic_handled_vectors)
+{
+ /*
+ * Intercept EOI if the vCPU is the target of the new IRQ routing, or
+ * the vCPU has a pending IRQ from the old routing, i.e. if the vCPU
+ * may receive a level-triggered IRQ in the future, or already received
+ * level-triggered IRQ. The EOI needs to be intercepted and forwarded
+ * to I/O APIC emulation so that the IRQ can be de-asserted.
+ */
+ if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, dest_id, dest_mode)) {
+ __set_bit(vector, ioapic_handled_vectors);
+ } else if (kvm_apic_pending_eoi(vcpu, vector)) {
+ __set_bit(vector, ioapic_handled_vectors);
+
+ /*
+ * Track the highest pending EOI for which the vCPU is NOT the
+ * target in the new routing. Only the EOI for the IRQ that is
+ * in-flight (for the old routing) needs to be intercepted, any
+ * future IRQs that arrive on this vCPU will be coincidental to
+ * the level-triggered routing and don't need to be intercepted.
+ */
+ if ((int)vector > vcpu->arch.highest_stale_pending_ioapic_eoi)
+ vcpu->arch.highest_stale_pending_ioapic_eoi = vector;
+ }
+}
+
+void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
+ ulong *ioapic_handled_vectors)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_kernel_irq_routing_entry *entry;
+ struct kvm_irq_routing_table *table;
+ u32 i, nr_ioapic_pins;
+ int idx;
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+ nr_ioapic_pins = min_t(u32, table->nr_rt_entries,
+ kvm->arch.nr_reserved_ioapic_pins);
+ for (i = 0; i < nr_ioapic_pins; ++i) {
+ hlist_for_each_entry(entry, &table->map[i], link) {
+ struct kvm_lapic_irq irq;
+
+ if (entry->type != KVM_IRQ_ROUTING_MSI)
+ continue;
+
+ kvm_msi_to_lapic_irq(vcpu->kvm, entry, &irq);
+
+ if (!irq.trig_mode)
+ continue;
+
+ kvm_scan_ioapic_irq(vcpu, irq.dest_id, irq.dest_mode,
+ irq.vector, ioapic_handled_vectors);
+ }
+ }
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+}
+
+void kvm_arch_irq_routing_update(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_HYPERV
+ kvm_hv_irq_routing_update(kvm);
+#endif
+
+ if (irqchip_split(kvm))
+ kvm_make_scan_ioapic_request(kvm);
+}
+
+static int kvm_pi_update_irte(struct kvm_kernel_irqfd *irqfd,
+ struct kvm_kernel_irq_routing_entry *entry)
+{
+ unsigned int host_irq = irqfd->producer->irq;
+ struct kvm *kvm = irqfd->kvm;
+ struct kvm_vcpu *vcpu = NULL;
+ struct kvm_lapic_irq irq;
+ int r;
+
+ if (WARN_ON_ONCE(!irqchip_in_kernel(kvm) || !kvm_arch_has_irq_bypass()))
+ return -EINVAL;
+
+ if (entry && entry->type == KVM_IRQ_ROUTING_MSI) {
+ kvm_msi_to_lapic_irq(kvm, entry, &irq);
+
+ /*
+ * Force remapped mode if hardware doesn't support posting the
+ * virtual interrupt to a vCPU. Only IRQs are postable (NMIs,
+ * SMIs, etc. are not), and neither AMD nor Intel IOMMUs support
+ * posting multicast/broadcast IRQs. If the interrupt can't be
+ * posted, the device MSI needs to be routed to the host so that
+ * the guest's desired interrupt can be synthesized by KVM.
+ *
+ * This means that KVM can only post lowest-priority interrupts
+ * if they have a single CPU as the destination, e.g. only if
+ * the guest has affined the interrupt to a single vCPU.
+ */
+ if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
+ !kvm_irq_is_postable(&irq))
+ vcpu = NULL;
+ }
+
+ if (!irqfd->irq_bypass_vcpu && !vcpu)
+ return 0;
+
+ r = kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, host_irq, irqfd->gsi,
+ vcpu, irq.vector);
+ if (r) {
+ WARN_ON_ONCE(irqfd->irq_bypass_vcpu && !vcpu);
+ irqfd->irq_bypass_vcpu = NULL;
+ return r;
+ }
+
+ irqfd->irq_bypass_vcpu = vcpu;
+
+ trace_kvm_pi_irte_update(host_irq, vcpu, irqfd->gsi, irq.vector, !!vcpu);
+ return 0;
+}
+
+int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+ struct kvm *kvm = irqfd->kvm;
+ int ret = 0;
+
+ spin_lock_irq(&kvm->irqfds.lock);
+ irqfd->producer = prod;
+
+ if (!kvm->arch.nr_possible_bypass_irqs++)
+ kvm_x86_call(pi_start_bypass)(kvm);
+
+ if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) {
+ ret = kvm_pi_update_irte(irqfd, &irqfd->irq_entry);
+ if (ret)
+ kvm->arch.nr_possible_bypass_irqs--;
+ }
+ spin_unlock_irq(&kvm->irqfds.lock);
+
+ return ret;
+}
+
+void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+ struct kvm *kvm = irqfd->kvm;
+ int ret;
+
+ WARN_ON(irqfd->producer != prod);
+
+ /*
+ * If the producer of an IRQ that is currently being posted to a vCPU
+ * is unregistered, change the associated IRTE back to remapped mode as
+ * the IRQ has been released (or repurposed) by the device driver, i.e.
+ * KVM must relinquish control of the IRTE.
+ */
+ spin_lock_irq(&kvm->irqfds.lock);
+
+ if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) {
+ ret = kvm_pi_update_irte(irqfd, NULL);
+ if (ret)
+ pr_info("irq bypass consumer (eventfd %p) unregistration fails: %d\n",
+ irqfd->consumer.eventfd, ret);
+ }
+ irqfd->producer = NULL;
+
+ kvm->arch.nr_possible_bypass_irqs--;
+
+ spin_unlock_irq(&kvm->irqfds.lock);
+}
+
+void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
+ struct kvm_kernel_irq_routing_entry *old,
+ struct kvm_kernel_irq_routing_entry *new)
+{
+ if (new->type != KVM_IRQ_ROUTING_MSI &&
+ old->type != KVM_IRQ_ROUTING_MSI)
+ return;
+
+ if (old->type == KVM_IRQ_ROUTING_MSI &&
+ new->type == KVM_IRQ_ROUTING_MSI &&
+ !memcmp(&old->msi, &new->msi, sizeof(new->msi)))
+ return;
+
+ kvm_pi_update_irte(irqfd, new);
+}
+
+#ifdef CONFIG_KVM_IOAPIC
+#define IOAPIC_ROUTING_ENTRY(irq) \
+ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
+ .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
+#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq)
+
+#define PIC_ROUTING_ENTRY(irq) \
+ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
+ .u.irqchip = { .irqchip = SELECT_PIC(irq), .pin = (irq) % 8 } }
+#define ROUTING_ENTRY2(irq) \
+ IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq)
+
+static const struct kvm_irq_routing_entry default_routing[] = {
+ ROUTING_ENTRY2(0), ROUTING_ENTRY2(1),
+ ROUTING_ENTRY2(2), ROUTING_ENTRY2(3),
+ ROUTING_ENTRY2(4), ROUTING_ENTRY2(5),
+ ROUTING_ENTRY2(6), ROUTING_ENTRY2(7),
+ ROUTING_ENTRY2(8), ROUTING_ENTRY2(9),
+ ROUTING_ENTRY2(10), ROUTING_ENTRY2(11),
+ ROUTING_ENTRY2(12), ROUTING_ENTRY2(13),
+ ROUTING_ENTRY2(14), ROUTING_ENTRY2(15),
+ ROUTING_ENTRY1(16), ROUTING_ENTRY1(17),
+ ROUTING_ENTRY1(18), ROUTING_ENTRY1(19),
+ ROUTING_ENTRY1(20), ROUTING_ENTRY1(21),
+ ROUTING_ENTRY1(22), ROUTING_ENTRY1(23),
+};
+
+int kvm_setup_default_ioapic_and_pic_routing(struct kvm *kvm)
+{
+ return kvm_set_irq_routing(kvm, default_routing,
+ ARRAY_SIZE(default_routing), 0);
+}
+
+int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+{
+ struct kvm_pic *pic = kvm->arch.vpic;
+ int r;
+
+ r = 0;
+ switch (chip->chip_id) {
+ case KVM_IRQCHIP_PIC_MASTER:
+ memcpy(&chip->chip.pic, &pic->pics[0],
+ sizeof(struct kvm_pic_state));
+ break;
+ case KVM_IRQCHIP_PIC_SLAVE:
+ memcpy(&chip->chip.pic, &pic->pics[1],
+ sizeof(struct kvm_pic_state));
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ kvm_get_ioapic(kvm, &chip->chip.ioapic);
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ return r;
+}
+
+int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+{
+ struct kvm_pic *pic = kvm->arch.vpic;
+ int r;
+
+ r = 0;
+ switch (chip->chip_id) {
+ case KVM_IRQCHIP_PIC_MASTER:
+ spin_lock(&pic->lock);
+ memcpy(&pic->pics[0], &chip->chip.pic,
+ sizeof(struct kvm_pic_state));
+ spin_unlock(&pic->lock);
+ break;
+ case KVM_IRQCHIP_PIC_SLAVE:
+ spin_lock(&pic->lock);
+ memcpy(&pic->pics[1], &chip->chip.pic,
+ sizeof(struct kvm_pic_state));
+ spin_unlock(&pic->lock);
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ kvm_set_ioapic(kvm, &chip->chip.ioapic);
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ kvm_pic_update_irq(pic);
+ return r;
+}
+#endif
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 76d46b2f41dd..5e62c1f79ce6 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -18,6 +18,8 @@
#include <kvm/iodev.h>
#include "lapic.h"
+#ifdef CONFIG_KVM_IOAPIC
+
#define PIC_NUM_PINS 16
#define SELECT_PIC(irq) \
((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE)
@@ -63,17 +65,15 @@ int kvm_pic_init(struct kvm *kvm);
void kvm_pic_destroy(struct kvm *kvm);
int kvm_pic_read_irq(struct kvm *kvm);
void kvm_pic_update_irq(struct kvm_pic *s);
+int kvm_pic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status);
-static inline int irqchip_split(struct kvm *kvm)
-{
- int mode = kvm->arch.irqchip_mode;
+int kvm_setup_default_ioapic_and_pic_routing(struct kvm *kvm);
- /* Matches smp_wmb() when setting irqchip_mode */
- smp_rmb();
- return mode == KVM_IRQCHIP_SPLIT;
-}
+int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip);
+int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip);
-static inline int irqchip_kernel(struct kvm *kvm)
+static inline int irqchip_full(struct kvm *kvm)
{
int mode = kvm->arch.irqchip_mode;
@@ -81,10 +81,26 @@ static inline int irqchip_kernel(struct kvm *kvm)
smp_rmb();
return mode == KVM_IRQCHIP_KERNEL;
}
+#else /* CONFIG_KVM_IOAPIC */
+static __always_inline int irqchip_full(struct kvm *kvm)
+{
+ return false;
+}
+#endif
static inline int pic_in_kernel(struct kvm *kvm)
{
- return irqchip_kernel(kvm);
+ return irqchip_full(kvm);
+}
+
+
+static inline int irqchip_split(struct kvm *kvm)
+{
+ int mode = kvm->arch.irqchip_mode;
+
+ /* Matches smp_wmb() when setting irqchip_mode */
+ smp_rmb();
+ return mode == KVM_IRQCHIP_SPLIT;
}
static inline int irqchip_in_kernel(struct kvm *kvm)
@@ -105,7 +121,6 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
int apic_has_pending_timer(struct kvm_vcpu *vcpu);
-int kvm_setup_default_irq_routing(struct kvm *kvm);
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq,
struct dest_map *dest_map);
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
deleted file mode 100644
index d6d792b5d1bd..000000000000
--- a/arch/x86/kvm/irq_comm.c
+++ /dev/null
@@ -1,469 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * irq_comm.c: Common API for in kernel interrupt controller
- * Copyright (c) 2007, Intel Corporation.
- *
- * Authors:
- * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
- *
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
- */
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/kvm_host.h>
-#include <linux/slab.h>
-#include <linux/export.h>
-#include <linux/rculist.h>
-
-#include <trace/events/kvm.h>
-
-#include "irq.h"
-
-#include "ioapic.h"
-
-#include "lapic.h"
-
-#include "hyperv.h"
-#include "x86.h"
-#include "xen.h"
-
-static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm, int irq_source_id, int level,
- bool line_status)
-{
- struct kvm_pic *pic = kvm->arch.vpic;
- return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level);
-}
-
-static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm, int irq_source_id, int level,
- bool line_status)
-{
- struct kvm_ioapic *ioapic = kvm->arch.vioapic;
- return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level,
- line_status);
-}
-
-int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
- struct kvm_lapic_irq *irq, struct dest_map *dest_map)
-{
- int r = -1;
- struct kvm_vcpu *vcpu, *lowest = NULL;
- unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
- unsigned int dest_vcpus = 0;
-
- if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
- return r;
-
- if (irq->dest_mode == APIC_DEST_PHYSICAL &&
- irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) {
- pr_info("apic: phys broadcast and lowest prio\n");
- irq->delivery_mode = APIC_DM_FIXED;
- }
-
- memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap));
-
- kvm_for_each_vcpu(i, vcpu, kvm) {
- if (!kvm_apic_present(vcpu))
- continue;
-
- if (!kvm_apic_match_dest(vcpu, src, irq->shorthand,
- irq->dest_id, irq->dest_mode))
- continue;
-
- if (!kvm_lowest_prio_delivery(irq)) {
- if (r < 0)
- r = 0;
- r += kvm_apic_set_irq(vcpu, irq, dest_map);
- } else if (kvm_apic_sw_enabled(vcpu->arch.apic)) {
- if (!kvm_vector_hashing_enabled()) {
- if (!lowest)
- lowest = vcpu;
- else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
- lowest = vcpu;
- } else {
- __set_bit(i, dest_vcpu_bitmap);
- dest_vcpus++;
- }
- }
- }
-
- if (dest_vcpus != 0) {
- int idx = kvm_vector_to_index(irq->vector, dest_vcpus,
- dest_vcpu_bitmap, KVM_MAX_VCPUS);
-
- lowest = kvm_get_vcpu(kvm, idx);
- }
-
- if (lowest)
- r = kvm_apic_set_irq(lowest, irq, dest_map);
-
- return r;
-}
-
-void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
- struct kvm_lapic_irq *irq)
-{
- struct msi_msg msg = { .address_lo = e->msi.address_lo,
- .address_hi = e->msi.address_hi,
- .data = e->msi.data };
-
- trace_kvm_msi_set_irq(msg.address_lo | (kvm->arch.x2apic_format ?
- (u64)msg.address_hi << 32 : 0), msg.data);
-
- irq->dest_id = x86_msi_msg_get_destid(&msg, kvm->arch.x2apic_format);
- irq->vector = msg.arch_data.vector;
- irq->dest_mode = kvm_lapic_irq_dest_mode(msg.arch_addr_lo.dest_mode_logical);
- irq->trig_mode = msg.arch_data.is_level;
- irq->delivery_mode = msg.arch_data.delivery_mode << 8;
- irq->msi_redir_hint = msg.arch_addr_lo.redirect_hint;
- irq->level = 1;
- irq->shorthand = APIC_DEST_NOSHORT;
-}
-EXPORT_SYMBOL_GPL(kvm_set_msi_irq);
-
-static inline bool kvm_msi_route_invalid(struct kvm *kvm,
- struct kvm_kernel_irq_routing_entry *e)
-{
- return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff);
-}
-
-int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm, int irq_source_id, int level, bool line_status)
-{
- struct kvm_lapic_irq irq;
-
- if (kvm_msi_route_invalid(kvm, e))
- return -EINVAL;
-
- if (!level)
- return -1;
-
- kvm_set_msi_irq(kvm, e, &irq);
-
- return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);
-}
-
-#ifdef CONFIG_KVM_HYPERV
-static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm, int irq_source_id, int level,
- bool line_status)
-{
- if (!level)
- return -1;
-
- return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint);
-}
-#endif
-
-int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm, int irq_source_id, int level,
- bool line_status)
-{
- struct kvm_lapic_irq irq;
- int r;
-
- switch (e->type) {
-#ifdef CONFIG_KVM_HYPERV
- case KVM_IRQ_ROUTING_HV_SINT:
- return kvm_hv_set_sint(e, kvm, irq_source_id, level,
- line_status);
-#endif
-
- case KVM_IRQ_ROUTING_MSI:
- if (kvm_msi_route_invalid(kvm, e))
- return -EINVAL;
-
- kvm_set_msi_irq(kvm, e, &irq);
-
- if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
- return r;
- break;
-
-#ifdef CONFIG_KVM_XEN
- case KVM_IRQ_ROUTING_XEN_EVTCHN:
- if (!level)
- return -1;
-
- return kvm_xen_set_evtchn_fast(&e->xen_evtchn, kvm);
-#endif
- default:
- break;
- }
-
- return -EWOULDBLOCK;
-}
-
-int kvm_request_irq_source_id(struct kvm *kvm)
-{
- unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
- int irq_source_id;
-
- mutex_lock(&kvm->irq_lock);
- irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG);
-
- if (irq_source_id >= BITS_PER_LONG) {
- pr_warn("exhausted allocatable IRQ sources!\n");
- irq_source_id = -EFAULT;
- goto unlock;
- }
-
- ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
- ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
- set_bit(irq_source_id, bitmap);
-unlock:
- mutex_unlock(&kvm->irq_lock);
-
- return irq_source_id;
-}
-
-void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
-{
- ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
- ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
-
- mutex_lock(&kvm->irq_lock);
- if (irq_source_id < 0 ||
- irq_source_id >= BITS_PER_LONG) {
- pr_err("IRQ source ID out of range!\n");
- goto unlock;
- }
- clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
- if (!irqchip_kernel(kvm))
- goto unlock;
-
- kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id);
- kvm_pic_clear_all(kvm->arch.vpic, irq_source_id);
-unlock:
- mutex_unlock(&kvm->irq_lock);
-}
-
-void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
- struct kvm_irq_mask_notifier *kimn)
-{
- mutex_lock(&kvm->irq_lock);
- kimn->irq = irq;
- hlist_add_head_rcu(&kimn->link, &kvm->arch.mask_notifier_list);
- mutex_unlock(&kvm->irq_lock);
-}
-
-void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
- struct kvm_irq_mask_notifier *kimn)
-{
- mutex_lock(&kvm->irq_lock);
- hlist_del_rcu(&kimn->link);
- mutex_unlock(&kvm->irq_lock);
- synchronize_srcu(&kvm->irq_srcu);
-}
-
-void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
- bool mask)
-{
- struct kvm_irq_mask_notifier *kimn;
- int idx, gsi;
-
- idx = srcu_read_lock(&kvm->irq_srcu);
- gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
- if (gsi != -1)
- hlist_for_each_entry_rcu(kimn, &kvm->arch.mask_notifier_list, link)
- if (kimn->irq == gsi)
- kimn->func(kimn, mask);
- srcu_read_unlock(&kvm->irq_srcu, idx);
-}
-
-bool kvm_arch_can_set_irq_routing(struct kvm *kvm)
-{
- return irqchip_in_kernel(kvm);
-}
-
-int kvm_set_routing_entry(struct kvm *kvm,
- struct kvm_kernel_irq_routing_entry *e,
- const struct kvm_irq_routing_entry *ue)
-{
- /* We can't check irqchip_in_kernel() here as some callers are
- * currently initializing the irqchip. Other callers should therefore
- * check kvm_arch_can_set_irq_routing() before calling this function.
- */
- switch (ue->type) {
- case KVM_IRQ_ROUTING_IRQCHIP:
- if (irqchip_split(kvm))
- return -EINVAL;
- e->irqchip.pin = ue->u.irqchip.pin;
- switch (ue->u.irqchip.irqchip) {
- case KVM_IRQCHIP_PIC_SLAVE:
- e->irqchip.pin += PIC_NUM_PINS / 2;
- fallthrough;
- case KVM_IRQCHIP_PIC_MASTER:
- if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2)
- return -EINVAL;
- e->set = kvm_set_pic_irq;
- break;
- case KVM_IRQCHIP_IOAPIC:
- if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS)
- return -EINVAL;
- e->set = kvm_set_ioapic_irq;
- break;
- default:
- return -EINVAL;
- }
- e->irqchip.irqchip = ue->u.irqchip.irqchip;
- break;
- case KVM_IRQ_ROUTING_MSI:
- e->set = kvm_set_msi;
- e->msi.address_lo = ue->u.msi.address_lo;
- e->msi.address_hi = ue->u.msi.address_hi;
- e->msi.data = ue->u.msi.data;
-
- if (kvm_msi_route_invalid(kvm, e))
- return -EINVAL;
- break;
-#ifdef CONFIG_KVM_HYPERV
- case KVM_IRQ_ROUTING_HV_SINT:
- e->set = kvm_hv_set_sint;
- e->hv_sint.vcpu = ue->u.hv_sint.vcpu;
- e->hv_sint.sint = ue->u.hv_sint.sint;
- break;
-#endif
-#ifdef CONFIG_KVM_XEN
- case KVM_IRQ_ROUTING_XEN_EVTCHN:
- return kvm_xen_setup_evtchn(kvm, e, ue);
-#endif
- default:
- return -EINVAL;
- }
-
- return 0;
-}
-
-bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
- struct kvm_vcpu **dest_vcpu)
-{
- int r = 0;
- unsigned long i;
- struct kvm_vcpu *vcpu;
-
- if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu))
- return true;
-
- kvm_for_each_vcpu(i, vcpu, kvm) {
- if (!kvm_apic_present(vcpu))
- continue;
-
- if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand,
- irq->dest_id, irq->dest_mode))
- continue;
-
- if (++r == 2)
- return false;
-
- *dest_vcpu = vcpu;
- }
-
- return r == 1;
-}
-EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu);
-
-#define IOAPIC_ROUTING_ENTRY(irq) \
- { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
- .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
-#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq)
-
-#define PIC_ROUTING_ENTRY(irq) \
- { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
- .u.irqchip = { .irqchip = SELECT_PIC(irq), .pin = (irq) % 8 } }
-#define ROUTING_ENTRY2(irq) \
- IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq)
-
-static const struct kvm_irq_routing_entry default_routing[] = {
- ROUTING_ENTRY2(0), ROUTING_ENTRY2(1),
- ROUTING_ENTRY2(2), ROUTING_ENTRY2(3),
- ROUTING_ENTRY2(4), ROUTING_ENTRY2(5),
- ROUTING_ENTRY2(6), ROUTING_ENTRY2(7),
- ROUTING_ENTRY2(8), ROUTING_ENTRY2(9),
- ROUTING_ENTRY2(10), ROUTING_ENTRY2(11),
- ROUTING_ENTRY2(12), ROUTING_ENTRY2(13),
- ROUTING_ENTRY2(14), ROUTING_ENTRY2(15),
- ROUTING_ENTRY1(16), ROUTING_ENTRY1(17),
- ROUTING_ENTRY1(18), ROUTING_ENTRY1(19),
- ROUTING_ENTRY1(20), ROUTING_ENTRY1(21),
- ROUTING_ENTRY1(22), ROUTING_ENTRY1(23),
-};
-
-int kvm_setup_default_irq_routing(struct kvm *kvm)
-{
- return kvm_set_irq_routing(kvm, default_routing,
- ARRAY_SIZE(default_routing), 0);
-}
-
-void kvm_arch_post_irq_routing_update(struct kvm *kvm)
-{
- if (!irqchip_split(kvm))
- return;
- kvm_make_scan_ioapic_request(kvm);
-}
-
-void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode,
- u8 vector, unsigned long *ioapic_handled_vectors)
-{
- /*
- * Intercept EOI if the vCPU is the target of the new IRQ routing, or
- * the vCPU has a pending IRQ from the old routing, i.e. if the vCPU
- * may receive a level-triggered IRQ in the future, or already received
- * level-triggered IRQ. The EOI needs to be intercepted and forwarded
- * to I/O APIC emulation so that the IRQ can be de-asserted.
- */
- if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, dest_id, dest_mode)) {
- __set_bit(vector, ioapic_handled_vectors);
- } else if (kvm_apic_pending_eoi(vcpu, vector)) {
- __set_bit(vector, ioapic_handled_vectors);
-
- /*
- * Track the highest pending EOI for which the vCPU is NOT the
- * target in the new routing. Only the EOI for the IRQ that is
- * in-flight (for the old routing) needs to be intercepted, any
- * future IRQs that arrive on this vCPU will be coincidental to
- * the level-triggered routing and don't need to be intercepted.
- */
- if ((int)vector > vcpu->arch.highest_stale_pending_ioapic_eoi)
- vcpu->arch.highest_stale_pending_ioapic_eoi = vector;
- }
-}
-
-void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
- ulong *ioapic_handled_vectors)
-{
- struct kvm *kvm = vcpu->kvm;
- struct kvm_kernel_irq_routing_entry *entry;
- struct kvm_irq_routing_table *table;
- u32 i, nr_ioapic_pins;
- int idx;
-
- idx = srcu_read_lock(&kvm->irq_srcu);
- table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
- nr_ioapic_pins = min_t(u32, table->nr_rt_entries,
- kvm->arch.nr_reserved_ioapic_pins);
- for (i = 0; i < nr_ioapic_pins; ++i) {
- hlist_for_each_entry(entry, &table->map[i], link) {
- struct kvm_lapic_irq irq;
-
- if (entry->type != KVM_IRQ_ROUTING_MSI)
- continue;
-
- kvm_set_msi_irq(vcpu->kvm, entry, &irq);
-
- if (!irq.trig_mode)
- continue;
-
- kvm_scan_ioapic_irq(vcpu, irq.dest_id, irq.dest_mode,
- irq.vector, ioapic_handled_vectors);
- }
- }
- srcu_read_unlock(&kvm->irq_srcu, idx);
-}
-
-void kvm_arch_irq_routing_update(struct kvm *kvm)
-{
-#ifdef CONFIG_KVM_HYPERV
- kvm_hv_irq_routing_update(kvm);
-#endif
-}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 73418dc0ebb2..8172c2042dd6 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -27,6 +27,7 @@
#include <linux/export.h>
#include <linux/math64.h>
#include <linux/slab.h>
+#include <asm/apic.h>
#include <asm/processor.h>
#include <asm/mce.h>
#include <asm/msr.h>
@@ -55,9 +56,6 @@
/* 14 is the version for Xeon and Pentium 8.4.8*/
#define APIC_VERSION 0x14UL
#define LAPIC_MMIO_LENGTH (1 << 12)
-/* followed define is not in apicdef.h */
-#define MAX_APIC_VECTOR 256
-#define APIC_VECTORS_PER_REG 32
/*
* Enable local APIC timer advancement (tscdeadline mode only) with adaptive
@@ -79,42 +77,20 @@ module_param(lapic_timer_advance, bool, 0444);
static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data);
static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data);
-static inline void __kvm_lapic_set_reg(char *regs, int reg_off, u32 val)
-{
- *((u32 *) (regs + reg_off)) = val;
-}
-
static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
{
- __kvm_lapic_set_reg(apic->regs, reg_off, val);
-}
-
-static __always_inline u64 __kvm_lapic_get_reg64(char *regs, int reg)
-{
- BUILD_BUG_ON(reg != APIC_ICR);
- return *((u64 *) (regs + reg));
+ apic_set_reg(apic->regs, reg_off, val);
}
static __always_inline u64 kvm_lapic_get_reg64(struct kvm_lapic *apic, int reg)
{
- return __kvm_lapic_get_reg64(apic->regs, reg);
-}
-
-static __always_inline void __kvm_lapic_set_reg64(char *regs, int reg, u64 val)
-{
- BUILD_BUG_ON(reg != APIC_ICR);
- *((u64 *) (regs + reg)) = val;
+ return apic_get_reg64(apic->regs, reg);
}
static __always_inline void kvm_lapic_set_reg64(struct kvm_lapic *apic,
int reg, u64 val)
{
- __kvm_lapic_set_reg64(apic->regs, reg, val);
-}
-
-static inline int apic_test_vector(int vec, void *bitmap)
-{
- return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+ apic_set_reg64(apic->regs, reg, val);
}
bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector)
@@ -125,16 +101,6 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector)
apic_test_vector(vector, apic->regs + APIC_IRR);
}
-static inline int __apic_test_and_set_vector(int vec, void *bitmap)
-{
- return __test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
-static inline int __apic_test_and_clear_vector(int vec, void *bitmap)
-{
- return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu);
@@ -626,21 +592,6 @@ static const unsigned int apic_lvt_mask[KVM_APIC_MAX_NR_LVT_ENTRIES] = {
[LVT_CMCI] = LVT_MASK | APIC_MODE_MASK
};
-static int find_highest_vector(void *bitmap)
-{
- int vec;
- u32 *reg;
-
- for (vec = MAX_APIC_VECTOR - APIC_VECTORS_PER_REG;
- vec >= 0; vec -= APIC_VECTORS_PER_REG) {
- reg = bitmap + REG_POS(vec);
- if (*reg)
- return __fls(*reg) + vec;
- }
-
- return -1;
-}
-
static u8 count_vectors(void *bitmap)
{
int vec;
@@ -648,7 +599,7 @@ static u8 count_vectors(void *bitmap)
u8 count = 0;
for (vec = 0; vec < MAX_APIC_VECTOR; vec += APIC_VECTORS_PER_REG) {
- reg = bitmap + REG_POS(vec);
+ reg = bitmap + APIC_VECTOR_TO_REG_OFFSET(vec);
count += hweight32(*reg);
}
@@ -706,7 +657,7 @@ EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
static inline int apic_search_irr(struct kvm_lapic *apic)
{
- return find_highest_vector(apic->regs + APIC_IRR);
+ return apic_find_highest_vector(apic->regs + APIC_IRR);
}
static inline int apic_find_highest_irr(struct kvm_lapic *apic)
@@ -729,10 +680,10 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
{
if (unlikely(apic->apicv_active)) {
- kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
+ apic_clear_vector(vec, apic->regs + APIC_IRR);
} else {
apic->irr_pending = false;
- kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
+ apic_clear_vector(vec, apic->regs + APIC_IRR);
if (apic_search_irr(apic) != -1)
apic->irr_pending = true;
}
@@ -744,9 +695,15 @@ void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec)
}
EXPORT_SYMBOL_GPL(kvm_apic_clear_irr);
+static void *apic_vector_to_isr(int vec, struct kvm_lapic *apic)
+{
+ return apic->regs + APIC_ISR + APIC_VECTOR_TO_REG_OFFSET(vec);
+}
+
static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
{
- if (__apic_test_and_set_vector(vec, apic->regs + APIC_ISR))
+ if (__test_and_set_bit(APIC_VECTOR_TO_BIT_NUMBER(vec),
+ apic_vector_to_isr(vec, apic)))
return;
/*
@@ -781,7 +738,7 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic)
if (likely(apic->highest_isr_cache != -1))
return apic->highest_isr_cache;
- result = find_highest_vector(apic->regs + APIC_ISR);
+ result = apic_find_highest_vector(apic->regs + APIC_ISR);
ASSERT(result == -1 || result >= 16);
return result;
@@ -789,7 +746,8 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic)
static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
{
- if (!__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR))
+ if (!__test_and_clear_bit(APIC_VECTOR_TO_BIT_NUMBER(vec),
+ apic_vector_to_isr(vec, apic)))
return;
/*
@@ -1332,11 +1290,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) {
if (trig_mode)
- kvm_lapic_set_vector(vector,
- apic->regs + APIC_TMR);
+ apic_set_vector(vector, apic->regs + APIC_TMR);
else
- kvm_lapic_clear_vector(vector,
- apic->regs + APIC_TMR);
+ apic_clear_vector(vector, apic->regs + APIC_TMR);
}
kvm_x86_call(deliver_interrupt)(apic, delivery_mode,
@@ -1455,7 +1411,7 @@ static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector)
static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
{
- int trigger_mode;
+ int __maybe_unused trigger_mode;
/* Eoi the ioapic only if the ioapic doesn't own the vector. */
if (!kvm_ioapic_handles_vector(apic, vector))
@@ -1476,12 +1432,14 @@ static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
return;
}
+#ifdef CONFIG_KVM_IOAPIC
if (apic_test_vector(vector, apic->regs + APIC_TMR))
trigger_mode = IOAPIC_LEVEL_TRIG;
else
trigger_mode = IOAPIC_EDGE_TRIG;
kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode);
+#endif
}
static int apic_set_eoi(struct kvm_lapic *apic)
@@ -3084,12 +3042,12 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
if (!kvm_x86_ops.x2apic_icr_is_split) {
if (set) {
- icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) |
- (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32;
- __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr);
+ icr = apic_get_reg(s->regs, APIC_ICR) |
+ (u64)apic_get_reg(s->regs, APIC_ICR2) << 32;
+ apic_set_reg64(s->regs, APIC_ICR, icr);
} else {
- icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
- __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
+ icr = apic_get_reg64(s->regs, APIC_ICR);
+ apic_set_reg(s->regs, APIC_ICR2, icr >> 32);
}
}
}
@@ -3105,8 +3063,7 @@ int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
* Get calculated timer current count for remaining timer period (if
* any) and store it in the returned register set.
*/
- __kvm_lapic_set_reg(s->regs, APIC_TMCCT,
- __apic_read(vcpu->arch.apic, APIC_TMCCT));
+ apic_set_reg(s->regs, APIC_TMCCT, __apic_read(vcpu->arch.apic, APIC_TMCCT));
return kvm_apic_state_fixup(vcpu, s, false);
}
@@ -3146,8 +3103,11 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
}
kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+#ifdef CONFIG_KVM_IOAPIC
if (ioapic_in_kernel(vcpu->kvm))
kvm_rtc_eoi_tracking_restore_one(vcpu);
+#endif
vcpu->arch.apic_arb_prio = 0;
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 4ce30db65828..72de14527698 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -4,6 +4,8 @@
#include <kvm/iodev.h>
+#include <asm/apic.h>
+
#include <linux/kvm_host.h>
#include "hyperv.h"
@@ -21,6 +23,8 @@
#define APIC_BROADCAST 0xFF
#define X2APIC_BROADCAST 0xFFFFFFFFul
+#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
+
enum lapic_mode {
LAPIC_MODE_DISABLED = 0,
LAPIC_MODE_INVALID = X2APIC_ENABLE,
@@ -145,22 +149,9 @@ void kvm_lapic_exit(void);
u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic);
-#define VEC_POS(v) ((v) & (32 - 1))
-#define REG_POS(v) (((v) >> 5) << 4)
-
-static inline void kvm_lapic_clear_vector(int vec, void *bitmap)
-{
- clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
-static inline void kvm_lapic_set_vector(int vec, void *bitmap)
-{
- set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
static inline void kvm_lapic_set_irr(int vec, struct kvm_lapic *apic)
{
- kvm_lapic_set_vector(vec, apic->regs + APIC_IRR);
+ apic_set_vector(vec, apic->regs + APIC_IRR);
/*
* irr_pending must be true if any interrupt is pending; set it after
* APIC_IRR to avoid race with apic_clear_irr
@@ -168,14 +159,9 @@ static inline void kvm_lapic_set_irr(int vec, struct kvm_lapic *apic)
apic->irr_pending = true;
}
-static inline u32 __kvm_lapic_get_reg(char *regs, int reg_off)
-{
- return *((u32 *) (regs + reg_off));
-}
-
static inline u32 kvm_lapic_get_reg(struct kvm_lapic *apic, int reg_off)
{
- return __kvm_lapic_get_reg(apic->regs, reg_off);
+ return apic_get_reg(apic->regs, reg_off);
}
DECLARE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 4e06e2e89a8f..6e838cb6c9e1 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1983,14 +1983,35 @@ static bool sp_has_gptes(struct kvm_mmu_page *sp)
return true;
}
+static __ro_after_init HLIST_HEAD(empty_page_hash);
+
+static struct hlist_head *kvm_get_mmu_page_hash(struct kvm *kvm, gfn_t gfn)
+{
+ /*
+ * Ensure the load of the hash table pointer itself is ordered before
+ * loads to walk the table. The pointer is set at runtime outside of
+ * mmu_lock when the TDP MMU is enabled, i.e. when the hash table of
+ * shadow pages becomes necessary only when KVM needs to shadow L1's
+ * TDP for an L2 guest. Pairs with the smp_store_release() in
+ * kvm_mmu_alloc_page_hash().
+ */
+ struct hlist_head *page_hash = smp_load_acquire(&kvm->arch.mmu_page_hash);
+
+ lockdep_assert_held(&kvm->mmu_lock);
+
+ if (!page_hash)
+ return &empty_page_hash;
+
+ return &page_hash[kvm_page_table_hashfn(gfn)];
+}
+
#define for_each_valid_sp(_kvm, _sp, _list) \
hlist_for_each_entry(_sp, _list, hash_link) \
if (is_obsolete_sp((_kvm), (_sp))) { \
} else
#define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn) \
- for_each_valid_sp(_kvm, _sp, \
- &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \
+ for_each_valid_sp(_kvm, _sp, kvm_get_mmu_page_hash(_kvm, _gfn)) \
if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else
static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
@@ -2358,6 +2379,12 @@ static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm,
struct kvm_mmu_page *sp;
bool created = false;
+ /*
+ * No need for memory barriers, unlike in kvm_get_mmu_page_hash(), as
+ * mmu_page_hash must be set prior to creating the first shadow root,
+ * i.e. reaching this point is fully serialized by slots_arch_lock.
+ */
+ BUG_ON(!kvm->arch.mmu_page_hash);
sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
sp = kvm_mmu_find_shadow_page(kvm, vcpu, gfn, sp_list, role);
@@ -3882,6 +3909,28 @@ out_unlock:
return r;
}
+static int kvm_mmu_alloc_page_hash(struct kvm *kvm)
+{
+ struct hlist_head *h;
+
+ if (kvm->arch.mmu_page_hash)
+ return 0;
+
+ h = kvcalloc(KVM_NUM_MMU_PAGES, sizeof(*h), GFP_KERNEL_ACCOUNT);
+ if (!h)
+ return -ENOMEM;
+
+ /*
+ * Ensure the hash table pointer is set only after all stores to zero
+ * the memory are retired. Pairs with the smp_load_acquire() in
+ * kvm_get_mmu_page_hash(). Note, mmu_lock must be held for write to
+ * add (or remove) shadow pages, and so readers are guaranteed to see
+ * an empty list for their current mmu_lock critical section.
+ */
+ smp_store_release(&kvm->arch.mmu_page_hash, h);
+ return 0;
+}
+
static int mmu_first_shadow_root_alloc(struct kvm *kvm)
{
struct kvm_memslots *slots;
@@ -3901,9 +3950,13 @@ static int mmu_first_shadow_root_alloc(struct kvm *kvm)
if (kvm_shadow_root_allocated(kvm))
goto out_unlock;
+ r = kvm_mmu_alloc_page_hash(kvm);
+ if (r)
+ goto out_unlock;
+
/*
- * Check if anything actually needs to be allocated, e.g. all metadata
- * will be allocated upfront if TDP is disabled.
+ * Check if memslot metadata actually needs to be allocated, e.g. all
+ * metadata will be allocated upfront if TDP is disabled.
*/
if (kvm_memslots_have_rmaps(kvm) &&
kvm_page_track_write_tracking_enabled(kvm))
@@ -6682,15 +6735,22 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
kvm_tdp_mmu_zap_invalidated_roots(kvm, true);
}
-void kvm_mmu_init_vm(struct kvm *kvm)
+int kvm_mmu_init_vm(struct kvm *kvm)
{
+ int r;
+
kvm->arch.shadow_mmio_value = shadow_mmio_value;
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
- if (tdp_mmu_enabled)
+ if (tdp_mmu_enabled) {
kvm_mmu_init_tdp_mmu(kvm);
+ } else {
+ r = kvm_mmu_alloc_page_hash(kvm);
+ if (r)
+ return r;
+ }
kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache;
kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO;
@@ -6699,6 +6759,7 @@ void kvm_mmu_init_vm(struct kvm *kvm)
kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache;
kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO;
+ return 0;
}
static void mmu_free_vm_memory_caches(struct kvm *kvm)
@@ -6710,6 +6771,8 @@ static void mmu_free_vm_memory_caches(struct kvm *kvm)
void kvm_mmu_uninit_vm(struct kvm *kvm)
{
+ kvfree(kvm->arch.mmu_page_hash);
+
if (tdp_mmu_enabled)
kvm_mmu_uninit_tdp_mmu(kvm);
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index db8f33e4de62..65f3c89d7c5d 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -103,6 +103,9 @@ struct kvm_mmu_page {
int root_count;
refcount_t tdp_mmu_root_count;
};
+
+ bool has_mapped_host_mmio;
+
union {
/* These two members aren't used for TDP MMU */
struct {
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 68e323568e95..ed762bb4b007 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -804,9 +804,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
if (r != RET_PF_CONTINUE)
return r;
+#if PTTYPE != PTTYPE_EPT
/*
- * Do not change pte_access if the pfn is a mmio page, otherwise
- * we will cache the incorrect access into mmio spte.
+ * Treat the guest PTE protections as writable, supervisor-only if this
+ * is a supervisor write fault and CR0.WP=0 (supervisor accesses ignore
+ * PTE.W if CR0.WP=0). Don't change the access type for emulated MMIO,
+ * otherwise KVM will cache incorrect access information in the SPTE.
*/
if (fault->write && !(walker.pte_access & ACC_WRITE_MASK) &&
!is_cr0_wp(vcpu->arch.mmu) && !fault->user && fault->slot) {
@@ -822,6 +825,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
if (is_cr4_smep(vcpu->arch.mmu))
walker.pte_access &= ~ACC_EXEC_MASK;
}
+#endif
r = RET_PF_RETRY;
write_lock(&vcpu->kvm->mmu_lock);
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index cfce03d8f123..df31039b5d63 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -104,7 +104,7 @@ u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
return spte;
}
-static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
+static bool __kvm_is_mmio_pfn(kvm_pfn_t pfn)
{
if (pfn_valid(pfn))
return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
@@ -125,6 +125,35 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
E820_TYPE_RAM);
}
+static bool kvm_is_mmio_pfn(kvm_pfn_t pfn, int *is_host_mmio)
+{
+ /*
+ * Determining if a PFN is host MMIO is relative expensive. Cache the
+ * result locally (in the sole caller) to avoid doing the full query
+ * multiple times when creating a single SPTE.
+ */
+ if (*is_host_mmio < 0)
+ *is_host_mmio = __kvm_is_mmio_pfn(pfn);
+
+ return *is_host_mmio;
+}
+
+static void kvm_track_host_mmio_mapping(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
+
+ if (root)
+ WRITE_ONCE(root->has_mapped_host_mmio, true);
+ else
+ WRITE_ONCE(vcpu->kvm->arch.has_mapped_host_mmio, true);
+
+ /*
+ * Force vCPUs to exit and flush CPU buffers if the vCPU is using the
+ * affected root(s).
+ */
+ kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
+}
+
/*
* Returns true if the SPTE needs to be updated atomically due to having bits
* that may be changed without holding mmu_lock, and for which KVM must not
@@ -162,6 +191,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
{
int level = sp->role.level;
u64 spte = SPTE_MMU_PRESENT_MASK;
+ int is_host_mmio = -1;
bool wrprot = false;
/*
@@ -209,13 +239,15 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
if (level > PG_LEVEL_4K)
spte |= PT_PAGE_SIZE_MASK;
- spte |= kvm_x86_call(get_mt_mask)(vcpu, gfn, kvm_is_mmio_pfn(pfn));
+ if (kvm_x86_ops.get_mt_mask)
+ spte |= kvm_x86_call(get_mt_mask)(vcpu, gfn,
+ kvm_is_mmio_pfn(pfn, &is_host_mmio));
if (host_writable)
spte |= shadow_host_writable_mask;
else
pte_access &= ~ACC_WRITE_MASK;
- if (shadow_me_value && !kvm_is_mmio_pfn(pfn))
+ if (shadow_me_value && !kvm_is_mmio_pfn(pfn, &is_host_mmio))
spte |= shadow_me_value;
spte |= (u64)pfn << PAGE_SHIFT;
@@ -260,6 +292,11 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
mark_page_dirty_in_slot(vcpu->kvm, slot, gfn);
}
+ if (static_branch_unlikely(&cpu_buf_vm_clear) &&
+ !kvm_vcpu_can_access_host_mmio(vcpu) &&
+ kvm_is_mmio_pfn(pfn, &is_host_mmio))
+ kvm_track_host_mmio_mapping(vcpu);
+
*new_spte = spte;
return wrprot;
}
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 1e94f081bdaf..3133f066927e 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -280,6 +280,16 @@ static inline bool is_mirror_sptep(tdp_ptep_t sptep)
return is_mirror_sp(sptep_to_sp(rcu_dereference(sptep)));
}
+static inline bool kvm_vcpu_can_access_host_mmio(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
+
+ if (root)
+ return READ_ONCE(root->has_mapped_host_mmio);
+
+ return READ_ONCE(vcpu->kvm->arch.has_mapped_host_mmio);
+}
+
static inline bool is_mmio_spte(struct kvm *kvm, u64 spte)
{
return (spte & shadow_mmio_mask) == kvm->arch.shadow_mmio_value &&
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 067f8e3f5a0d..a34c5c3b164e 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -18,6 +18,7 @@
#include <linux/hashtable.h>
#include <linux/amd-iommu.h>
#include <linux/kvm_host.h>
+#include <linux/kvm_irqfd.h>
#include <asm/irq_remapping.h>
#include <asm/msr.h>
@@ -29,36 +30,39 @@
#include "svm.h"
/*
- * Encode the arbitrary VM ID and the vCPU's default APIC ID, i.e the vCPU ID,
- * into the GATag so that KVM can retrieve the correct vCPU from a GALog entry
- * if an interrupt can't be delivered, e.g. because the vCPU isn't running.
+ * Encode the arbitrary VM ID and the vCPU's _index_ into the GATag so that
+ * KVM can retrieve the correct vCPU from a GALog entry if an interrupt can't
+ * be delivered, e.g. because the vCPU isn't running. Use the vCPU's index
+ * instead of its ID (a.k.a. its default APIC ID), as KVM is guaranteed a fast
+ * lookup on the index, where as vCPUs whose index doesn't match their ID need
+ * to walk the entire xarray of vCPUs in the worst case scenario.
*
- * For the vCPU ID, use however many bits are currently allowed for the max
+ * For the vCPU index, use however many bits are currently allowed for the max
* guest physical APIC ID (limited by the size of the physical ID table), and
* use whatever bits remain to assign arbitrary AVIC IDs to VMs. Note, the
* size of the GATag is defined by hardware (32 bits), but is an opaque value
* as far as hardware is concerned.
*/
-#define AVIC_VCPU_ID_MASK AVIC_PHYSICAL_MAX_INDEX_MASK
+#define AVIC_VCPU_IDX_MASK AVIC_PHYSICAL_MAX_INDEX_MASK
#define AVIC_VM_ID_SHIFT HWEIGHT32(AVIC_PHYSICAL_MAX_INDEX_MASK)
#define AVIC_VM_ID_MASK (GENMASK(31, AVIC_VM_ID_SHIFT) >> AVIC_VM_ID_SHIFT)
#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VM_ID_SHIFT) & AVIC_VM_ID_MASK)
-#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK)
+#define AVIC_GATAG_TO_VCPUIDX(x) (x & AVIC_VCPU_IDX_MASK)
-#define __AVIC_GATAG(vm_id, vcpu_id) ((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \
- ((vcpu_id) & AVIC_VCPU_ID_MASK))
-#define AVIC_GATAG(vm_id, vcpu_id) \
+#define __AVIC_GATAG(vm_id, vcpu_idx) ((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \
+ ((vcpu_idx) & AVIC_VCPU_IDX_MASK))
+#define AVIC_GATAG(vm_id, vcpu_idx) \
({ \
- u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_id); \
+ u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_idx); \
\
- WARN_ON_ONCE(AVIC_GATAG_TO_VCPUID(ga_tag) != (vcpu_id)); \
+ WARN_ON_ONCE(AVIC_GATAG_TO_VCPUIDX(ga_tag) != (vcpu_idx)); \
WARN_ON_ONCE(AVIC_GATAG_TO_VMID(ga_tag) != (vm_id)); \
ga_tag; \
})
-static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_ID_MASK) == -1u);
+static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_IDX_MASK) == -1u);
static bool force_avic;
module_param_unsafe(force_avic, bool, 0444);
@@ -75,14 +79,6 @@ static bool next_vm_id_wrapped = 0;
static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
bool x2avic_enabled;
-/*
- * This is a wrapper of struct amd_iommu_ir_data.
- */
-struct amd_svm_iommu_ir {
- struct list_head node; /* Used by SVM for per-vcpu ir_list */
- void *data; /* Storing pointer to struct amd_ir_data */
-};
-
static void avic_activate_vmcb(struct vcpu_svm *svm)
{
struct vmcb *vmcb = svm->vmcb01.ptr;
@@ -147,16 +143,16 @@ int avic_ga_log_notifier(u32 ga_tag)
struct kvm_svm *kvm_svm;
struct kvm_vcpu *vcpu = NULL;
u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
- u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
+ u32 vcpu_idx = AVIC_GATAG_TO_VCPUIDX(ga_tag);
- pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
- trace_kvm_avic_ga_log(vm_id, vcpu_id);
+ pr_debug("SVM: %s: vm_id=%#x, vcpu_idx=%#x\n", __func__, vm_id, vcpu_idx);
+ trace_kvm_avic_ga_log(vm_id, vcpu_idx);
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
if (kvm_svm->avic_vm_id != vm_id)
continue;
- vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id);
+ vcpu = kvm_get_vcpu(&kvm_svm->kvm, vcpu_idx);
break;
}
spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
@@ -180,10 +176,8 @@ void avic_vm_destroy(struct kvm *kvm)
if (!enable_apicv)
return;
- if (kvm_svm->avic_logical_id_table_page)
- __free_page(kvm_svm->avic_logical_id_table_page);
- if (kvm_svm->avic_physical_id_table_page)
- __free_page(kvm_svm->avic_physical_id_table_page);
+ free_page((unsigned long)kvm_svm->avic_logical_id_table);
+ free_page((unsigned long)kvm_svm->avic_physical_id_table);
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
hash_del(&kvm_svm->hnode);
@@ -196,27 +190,19 @@ int avic_vm_init(struct kvm *kvm)
int err = -ENOMEM;
struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
struct kvm_svm *k2;
- struct page *p_page;
- struct page *l_page;
u32 vm_id;
if (!enable_apicv)
return 0;
- /* Allocating physical APIC ID table (4KB) */
- p_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
- if (!p_page)
+ kvm_svm->avic_physical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!kvm_svm->avic_physical_id_table)
goto free_avic;
- kvm_svm->avic_physical_id_table_page = p_page;
-
- /* Allocating logical APIC ID table (4KB) */
- l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
- if (!l_page)
+ kvm_svm->avic_logical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!kvm_svm->avic_logical_id_table)
goto free_avic;
- kvm_svm->avic_logical_id_table_page = l_page;
-
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
again:
vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
@@ -242,17 +228,19 @@ free_avic:
return err;
}
+static phys_addr_t avic_get_backing_page_address(struct vcpu_svm *svm)
+{
+ return __sme_set(__pa(svm->vcpu.arch.apic->regs));
+}
+
void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
{
struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
- phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
- phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
- phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page));
- vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
- vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
- vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
- vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK;
+ vmcb->control.avic_backing_page = avic_get_backing_page_address(svm);
+ vmcb->control.avic_logical_id = __sme_set(__pa(kvm_svm->avic_logical_id_table));
+ vmcb->control.avic_physical_id = __sme_set(__pa(kvm_svm->avic_physical_id_table));
+ vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE;
if (kvm_apicv_activated(svm->vcpu.kvm))
avic_activate_vmcb(svm);
@@ -260,32 +248,31 @@ void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
avic_deactivate_vmcb(svm);
}
-static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
- unsigned int index)
-{
- u64 *avic_physical_id_table;
- struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
-
- if ((!x2avic_enabled && index > AVIC_MAX_PHYSICAL_ID) ||
- (index > X2AVIC_MAX_PHYSICAL_ID))
- return NULL;
-
- avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);
-
- return &avic_physical_id_table[index];
-}
-
static int avic_init_backing_page(struct kvm_vcpu *vcpu)
{
- u64 *entry, new_entry;
- int id = vcpu->vcpu_id;
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
struct vcpu_svm *svm = to_svm(vcpu);
+ u32 id = vcpu->vcpu_id;
+ u64 new_entry;
+ /*
+ * Inhibit AVIC if the vCPU ID is bigger than what is supported by AVIC
+ * hardware. Immediately clear apicv_active, i.e. don't wait until the
+ * KVM_REQ_APICV_UPDATE request is processed on the first KVM_RUN, as
+ * avic_vcpu_load() expects to be called if and only if the vCPU has
+ * fully initialized AVIC.
+ */
if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) ||
- (id > X2AVIC_MAX_PHYSICAL_ID))
- return -EINVAL;
+ (id > X2AVIC_MAX_PHYSICAL_ID)) {
+ kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG);
+ vcpu->arch.apic->apicv_active = false;
+ return 0;
+ }
+
+ BUILD_BUG_ON((AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE ||
+ (X2AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE);
- if (!vcpu->arch.apic->regs)
+ if (WARN_ON_ONCE(!vcpu->arch.apic->regs))
return -EINVAL;
if (kvm_apicv_activated(vcpu->kvm)) {
@@ -302,19 +289,21 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
return ret;
}
- svm->avic_backing_page = virt_to_page(vcpu->arch.apic->regs);
+ /* Note, fls64() returns the bit position, +1. */
+ BUILD_BUG_ON(__PHYSICAL_MASK_SHIFT >
+ fls64(AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK));
/* Setting AVIC backing page address in the phy APIC ID table */
- entry = avic_get_physical_id_entry(vcpu, id);
- if (!entry)
- return -EINVAL;
+ new_entry = avic_get_backing_page_address(svm) |
+ AVIC_PHYSICAL_ID_ENTRY_VALID_MASK;
+ svm->avic_physical_id_entry = new_entry;
- new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
- AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
- AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
- WRITE_ONCE(*entry, new_entry);
-
- svm->avic_physical_id_cache = entry;
+ /*
+ * Initialize the real table, as vCPUs must have a valid entry in order
+ * for broadcast IPIs to function correctly (broadcast IPIs ignore
+ * invalid entries, i.e. aren't guaranteed to generate a VM-Exit).
+ */
+ WRITE_ONCE(kvm_svm->avic_physical_id_table[id], new_entry);
return 0;
}
@@ -448,7 +437,7 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source
if (apic_x2apic_mode(source))
avic_logical_id_table = NULL;
else
- avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page);
+ avic_logical_id_table = kvm_svm->avic_logical_id_table;
/*
* AVIC is inhibited if vCPUs aren't mapped 1:1 with logical
@@ -550,7 +539,6 @@ unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
{
struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
- u32 *logical_apic_id_table;
u32 cluster, index;
ldr = GET_APIC_LOGICAL_ID(ldr);
@@ -571,9 +559,7 @@ static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
return NULL;
index += (cluster << 2);
- logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page);
-
- return &logical_apic_id_table[index];
+ return &kvm_svm->avic_logical_id_table[index];
}
static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
@@ -722,6 +708,9 @@ int avic_init_vcpu(struct vcpu_svm *svm)
int ret;
struct kvm_vcpu *vcpu = &svm->vcpu;
+ INIT_LIST_HEAD(&svm->ir_list);
+ spin_lock_init(&svm->ir_list_lock);
+
if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm))
return 0;
@@ -729,8 +718,6 @@ int avic_init_vcpu(struct vcpu_svm *svm)
if (ret)
return ret;
- INIT_LIST_HEAD(&svm->ir_list);
- spin_lock_init(&svm->ir_list_lock);
svm->dfr_reg = APIC_DFR_FLAT;
return ret;
@@ -742,316 +729,161 @@ void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
avic_handle_ldr_update(vcpu);
}
-static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
+static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd)
{
- int ret = 0;
+ struct kvm_vcpu *vcpu = irqfd->irq_bypass_vcpu;
unsigned long flags;
- struct amd_svm_iommu_ir *ir;
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (!kvm_arch_has_assigned_device(vcpu->kvm))
- return 0;
- /*
- * Here, we go through the per-vcpu ir_list to update all existing
- * interrupt remapping table entry targeting this vcpu.
- */
- spin_lock_irqsave(&svm->ir_list_lock, flags);
-
- if (list_empty(&svm->ir_list))
- goto out;
+ if (!vcpu)
+ return;
- list_for_each_entry(ir, &svm->ir_list, node) {
- if (activate)
- ret = amd_iommu_activate_guest_mode(ir->data);
- else
- ret = amd_iommu_deactivate_guest_mode(ir->data);
- if (ret)
- break;
- }
-out:
- spin_unlock_irqrestore(&svm->ir_list_lock, flags);
- return ret;
+ spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags);
+ list_del(&irqfd->vcpu_list);
+ spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags);
}
-static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+ unsigned int host_irq, uint32_t guest_irq,
+ struct kvm_vcpu *vcpu, u32 vector)
{
- unsigned long flags;
- struct amd_svm_iommu_ir *cur;
-
- spin_lock_irqsave(&svm->ir_list_lock, flags);
- list_for_each_entry(cur, &svm->ir_list, node) {
- if (cur->data != pi->ir_data)
- continue;
- list_del(&cur->node);
- kfree(cur);
- break;
- }
- spin_unlock_irqrestore(&svm->ir_list_lock, flags);
-}
-
-static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
-{
- int ret = 0;
- unsigned long flags;
- struct amd_svm_iommu_ir *ir;
- u64 entry;
-
- if (WARN_ON_ONCE(!pi->ir_data))
- return -EINVAL;
-
- /**
- * In some cases, the existing irte is updated and re-set,
- * so we need to check here if it's already been * added
- * to the ir_list.
- */
- if (pi->prev_ga_tag) {
- struct kvm *kvm = svm->vcpu.kvm;
- u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
- struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
- struct vcpu_svm *prev_svm;
-
- if (!prev_vcpu) {
- ret = -EINVAL;
- goto out;
- }
-
- prev_svm = to_svm(prev_vcpu);
- svm_ir_list_del(prev_svm, pi);
- }
-
- /**
- * Allocating new amd_iommu_pi_data, which will get
- * add to the per-vcpu ir_list.
- */
- ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_ATOMIC | __GFP_ACCOUNT);
- if (!ir) {
- ret = -ENOMEM;
- goto out;
- }
- ir->data = pi->ir_data;
-
- spin_lock_irqsave(&svm->ir_list_lock, flags);
-
/*
- * Update the target pCPU for IOMMU doorbells if the vCPU is running.
- * If the vCPU is NOT running, i.e. is blocking or scheduled out, KVM
- * will update the pCPU info when the vCPU awkened and/or scheduled in.
- * See also avic_vcpu_load().
+ * If the IRQ was affined to a different vCPU, remove the IRTE metadata
+ * from the *previous* vCPU's list.
*/
- entry = READ_ONCE(*(svm->avic_physical_id_cache));
- if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
- amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK,
- true, pi->ir_data);
-
- list_add(&ir->node, &svm->ir_list);
- spin_unlock_irqrestore(&svm->ir_list_lock, flags);
-out:
- return ret;
-}
+ svm_ir_list_del(irqfd);
-/*
- * Note:
- * The HW cannot support posting multicast/broadcast
- * interrupts to a vCPU. So, we still use legacy interrupt
- * remapping for these kind of interrupts.
- *
- * For lowest-priority interrupts, we only support
- * those with single CPU as the destination, e.g. user
- * configures the interrupts via /proc/irq or uses
- * irqbalance to make the interrupts single-CPU.
- */
-static int
-get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
- struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
-{
- struct kvm_lapic_irq irq;
- struct kvm_vcpu *vcpu = NULL;
-
- kvm_set_msi_irq(kvm, e, &irq);
-
- if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
- !kvm_irq_is_postable(&irq)) {
- pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
- __func__, irq.vector);
- return -1;
- }
-
- pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
- irq.vector);
- *svm = to_svm(vcpu);
- vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page));
- vcpu_info->vector = irq.vector;
-
- return 0;
-}
-
-/*
- * avic_pi_update_irte - set IRTE for Posted-Interrupts
- *
- * @kvm: kvm
- * @host_irq: host irq of the interrupt
- * @guest_irq: gsi of the interrupt
- * @set: set or unset PI
- * returns 0 on success, < 0 on failure
- */
-int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set)
-{
- struct kvm_kernel_irq_routing_entry *e;
- struct kvm_irq_routing_table *irq_rt;
- bool enable_remapped_mode = true;
- int idx, ret = 0;
-
- if (!kvm_arch_has_assigned_device(kvm) || !kvm_arch_has_irq_bypass())
- return 0;
-
- pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
- __func__, host_irq, guest_irq, set);
-
- idx = srcu_read_lock(&kvm->irq_srcu);
- irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
-
- if (guest_irq >= irq_rt->nr_rt_entries ||
- hlist_empty(&irq_rt->map[guest_irq])) {
- pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
- guest_irq, irq_rt->nr_rt_entries);
- goto out;
- }
-
- hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
- struct vcpu_data vcpu_info;
- struct vcpu_svm *svm = NULL;
+ if (vcpu) {
+ /*
+ * Try to enable guest_mode in IRTE, unless AVIC is inhibited,
+ * in which case configure the IRTE for legacy mode, but track
+ * the IRTE metadata so that it can be converted to guest mode
+ * if AVIC is enabled/uninhibited in the future.
+ */
+ struct amd_iommu_pi_data pi_data = {
+ .ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
+ vcpu->vcpu_idx),
+ .is_guest_mode = kvm_vcpu_apicv_active(vcpu),
+ .vapic_addr = avic_get_backing_page_address(to_svm(vcpu)),
+ .vector = vector,
+ };
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 entry;
+ int ret;
- if (e->type != KVM_IRQ_ROUTING_MSI)
- continue;
+ /*
+ * Prevent the vCPU from being scheduled out or migrated until
+ * the IRTE is updated and its metadata has been added to the
+ * list of IRQs being posted to the vCPU, to ensure the IRTE
+ * isn't programmed with stale pCPU/IsRunning information.
+ */
+ guard(spinlock_irqsave)(&svm->ir_list_lock);
- /**
- * Here, we setup with legacy mode in the following cases:
- * 1. When cannot target interrupt to a specific vcpu.
- * 2. Unsetting posted interrupt.
- * 3. APIC virtualization is disabled for the vcpu.
- * 4. IRQ has incompatible delivery mode (SMI, INIT, etc)
+ /*
+ * Update the target pCPU for IOMMU doorbells if the vCPU is
+ * running. If the vCPU is NOT running, i.e. is blocking or
+ * scheduled out, KVM will update the pCPU info when the vCPU
+ * is awakened and/or scheduled in. See also avic_vcpu_load().
*/
- if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
- kvm_vcpu_apicv_active(&svm->vcpu)) {
- struct amd_iommu_pi_data pi;
-
- enable_remapped_mode = false;
-
- /* Try to enable guest_mode in IRTE */
- pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
- AVIC_HPA_MASK);
- pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
- svm->vcpu.vcpu_id);
- pi.is_guest_mode = true;
- pi.vcpu_data = &vcpu_info;
- ret = irq_set_vcpu_affinity(host_irq, &pi);
-
- /**
- * Here, we successfully setting up vcpu affinity in
- * IOMMU guest mode. Now, we need to store the posted
- * interrupt information in a per-vcpu ir_list so that
- * we can reference to them directly when we update vcpu
- * scheduling information in IOMMU irte.
- */
- if (!ret && pi.is_guest_mode)
- svm_ir_list_add(svm, &pi);
+ entry = svm->avic_physical_id_entry;
+ if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) {
+ pi_data.cpu = entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
+ } else {
+ pi_data.cpu = -1;
+ pi_data.ga_log_intr = entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR;
}
- if (!ret && svm) {
- trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id,
- e->gsi, vcpu_info.vector,
- vcpu_info.pi_desc_addr, set);
- }
+ ret = irq_set_vcpu_affinity(host_irq, &pi_data);
+ if (ret)
+ return ret;
- if (ret < 0) {
- pr_err("%s: failed to update PI IRTE\n", __func__);
- goto out;
+ /*
+ * Revert to legacy mode if the IOMMU didn't provide metadata
+ * for the IRTE, which KVM needs to keep the IRTE up-to-date,
+ * e.g. if the vCPU is migrated or AVIC is disabled.
+ */
+ if (WARN_ON_ONCE(!pi_data.ir_data)) {
+ irq_set_vcpu_affinity(host_irq, NULL);
+ return -EIO;
}
- }
- ret = 0;
- if (enable_remapped_mode) {
- /* Use legacy mode in IRTE */
- struct amd_iommu_pi_data pi;
+ irqfd->irq_bypass_data = pi_data.ir_data;
+ list_add(&irqfd->vcpu_list, &svm->ir_list);
+ return 0;
+ }
+ return irq_set_vcpu_affinity(host_irq, NULL);
+}
- /**
- * Here, pi is used to:
- * - Tell IOMMU to use legacy mode for this interrupt.
- * - Retrieve ga_tag of prior interrupt remapping data.
- */
- pi.prev_ga_tag = 0;
- pi.is_guest_mode = false;
- ret = irq_set_vcpu_affinity(host_irq, &pi);
+enum avic_vcpu_action {
+ /*
+ * There is no need to differentiate between activate and deactivate,
+ * as KVM only refreshes AVIC state when the vCPU is scheduled in and
+ * isn't blocking, i.e. the pCPU must always be (in)valid when AVIC is
+ * being (de)activated.
+ */
+ AVIC_TOGGLE_ON_OFF = BIT(0),
+ AVIC_ACTIVATE = AVIC_TOGGLE_ON_OFF,
+ AVIC_DEACTIVATE = AVIC_TOGGLE_ON_OFF,
- /**
- * Check if the posted interrupt was previously
- * setup with the guest_mode by checking if the ga_tag
- * was cached. If so, we need to clean up the per-vcpu
- * ir_list.
- */
- if (!ret && pi.prev_ga_tag) {
- int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
- struct kvm_vcpu *vcpu;
+ /*
+ * No unique action is required to deal with a vCPU that stops/starts
+ * running. A vCPU that starts running by definition stops blocking as
+ * well, and a vCPU that stops running can't have been blocking, i.e.
+ * doesn't need to toggle GALogIntr.
+ */
+ AVIC_START_RUNNING = 0,
+ AVIC_STOP_RUNNING = 0,
- vcpu = kvm_get_vcpu_by_id(kvm, id);
- if (vcpu)
- svm_ir_list_del(to_svm(vcpu), &pi);
- }
- }
-out:
- srcu_read_unlock(&kvm->irq_srcu, idx);
- return ret;
-}
+ /*
+ * When a vCPU starts blocking, KVM needs to set the GALogIntr flag
+ * int all associated IRTEs so that KVM can wake the vCPU if an IRQ is
+ * sent to the vCPU.
+ */
+ AVIC_START_BLOCKING = BIT(1),
+};
-static inline int
-avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
+static void avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu,
+ enum avic_vcpu_action action)
{
- int ret = 0;
- struct amd_svm_iommu_ir *ir;
+ bool ga_log_intr = (action & AVIC_START_BLOCKING);
struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_kernel_irqfd *irqfd;
lockdep_assert_held(&svm->ir_list_lock);
- if (!kvm_arch_has_assigned_device(vcpu->kvm))
- return 0;
-
/*
* Here, we go through the per-vcpu ir_list to update all existing
* interrupt remapping table entry targeting this vcpu.
*/
if (list_empty(&svm->ir_list))
- return 0;
+ return;
- list_for_each_entry(ir, &svm->ir_list, node) {
- ret = amd_iommu_update_ga(cpu, r, ir->data);
- if (ret)
- return ret;
+ list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) {
+ void *data = irqfd->irq_bypass_data;
+
+ if (!(action & AVIC_TOGGLE_ON_OFF))
+ WARN_ON_ONCE(amd_iommu_update_ga(data, cpu, ga_log_intr));
+ else if (cpu >= 0)
+ WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, cpu, ga_log_intr));
+ else
+ WARN_ON_ONCE(amd_iommu_deactivate_guest_mode(data));
}
- return 0;
}
-void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu,
+ enum avic_vcpu_action action)
{
- u64 entry;
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
int h_physical_id = kvm_cpu_get_apicid(cpu);
struct vcpu_svm *svm = to_svm(vcpu);
unsigned long flags;
+ u64 entry;
lockdep_assert_preemption_disabled();
if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
return;
- /*
- * No need to update anything if the vCPU is blocking, i.e. if the vCPU
- * is being scheduled in after being preempted. The CPU entries in the
- * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
- * If the vCPU was migrated, its new CPU value will be stuffed when the
- * vCPU unblocks.
- */
- if (kvm_vcpu_is_blocking(vcpu))
+ if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE))
return;
/*
@@ -1063,38 +895,57 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
*/
spin_lock_irqsave(&svm->ir_list_lock, flags);
- entry = READ_ONCE(*(svm->avic_physical_id_cache));
+ entry = svm->avic_physical_id_entry;
WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
- entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
+ entry &= ~(AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK |
+ AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR);
entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
- WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
- avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
+ svm->avic_physical_id_entry = entry;
+
+ /*
+ * If IPI virtualization is disabled, clear IsRunning when updating the
+ * actual Physical ID table, so that the CPU never sees IsRunning=1.
+ * Keep the APIC ID up-to-date in the entry to minimize the chances of
+ * things going sideways if hardware peeks at the ID.
+ */
+ if (!enable_ipiv)
+ entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+
+ WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry);
+
+ avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, action);
spin_unlock_irqrestore(&svm->ir_list_lock, flags);
}
-void avic_vcpu_put(struct kvm_vcpu *vcpu)
+void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
- u64 entry;
+ /*
+ * No need to update anything if the vCPU is blocking, i.e. if the vCPU
+ * is being scheduled in after being preempted. The CPU entries in the
+ * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
+ * If the vCPU was migrated, its new CPU value will be stuffed when the
+ * vCPU unblocks.
+ */
+ if (kvm_vcpu_is_blocking(vcpu))
+ return;
+
+ __avic_vcpu_load(vcpu, cpu, AVIC_START_RUNNING);
+}
+
+static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action)
+{
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
struct vcpu_svm *svm = to_svm(vcpu);
unsigned long flags;
+ u64 entry = svm->avic_physical_id_entry;
lockdep_assert_preemption_disabled();
- /*
- * Note, reading the Physical ID entry outside of ir_list_lock is safe
- * as only the pCPU that has loaded (or is loading) the vCPU is allowed
- * to modify the entry, and preemption is disabled. I.e. the vCPU
- * can't be scheduled out and thus avic_vcpu_{put,load}() can't run
- * recursively.
- */
- entry = READ_ONCE(*(svm->avic_physical_id_cache));
-
- /* Nothing to do if IsRunning == '0' due to vCPU blocking. */
- if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK))
+ if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE))
return;
/*
@@ -1107,13 +958,62 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu)
*/
spin_lock_irqsave(&svm->ir_list_lock, flags);
- avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
+ avic_update_iommu_vcpu_affinity(vcpu, -1, action);
+
+ WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR);
+ /*
+ * Keep the previous APIC ID in the entry so that a rogue doorbell from
+ * hardware is at least restricted to a CPU associated with the vCPU.
+ */
entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
- WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+
+ if (enable_ipiv)
+ WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry);
+
+ /*
+ * Note! Don't set AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR in the table as
+ * it's a synthetic flag that usurps an unused should-be-zero bit.
+ */
+ if (action & AVIC_START_BLOCKING)
+ entry |= AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR;
+
+ svm->avic_physical_id_entry = entry;
spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+}
+
+void avic_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Note, reading the Physical ID entry outside of ir_list_lock is safe
+ * as only the pCPU that has loaded (or is loading) the vCPU is allowed
+ * to modify the entry, and preemption is disabled. I.e. the vCPU
+ * can't be scheduled out and thus avic_vcpu_{put,load}() can't run
+ * recursively.
+ */
+ u64 entry = to_svm(vcpu)->avic_physical_id_entry;
+
+ /*
+ * Nothing to do if IsRunning == '0' due to vCPU blocking, i.e. if the
+ * vCPU is preempted while its in the process of blocking. WARN if the
+ * vCPU wasn't running and isn't blocking, KVM shouldn't attempt to put
+ * the AVIC if it wasn't previously loaded.
+ */
+ if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) {
+ if (WARN_ON_ONCE(!kvm_vcpu_is_blocking(vcpu)))
+ return;
+ /*
+ * The vCPU was preempted while blocking, ensure its IRTEs are
+ * configured to generate GA Log Interrupts.
+ */
+ if (!(WARN_ON_ONCE(!(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR))))
+ return;
+ }
+
+ __avic_vcpu_put(vcpu, kvm_vcpu_is_blocking(vcpu) ? AVIC_START_BLOCKING :
+ AVIC_STOP_RUNNING);
}
void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
@@ -1142,19 +1042,18 @@ void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
- bool activated = kvm_vcpu_apicv_active(vcpu);
-
if (!enable_apicv)
return;
+ /* APICv should only be toggled on/off while the vCPU is running. */
+ WARN_ON_ONCE(kvm_vcpu_is_blocking(vcpu));
+
avic_refresh_virtual_apic_mode(vcpu);
- if (activated)
- avic_vcpu_load(vcpu, vcpu->cpu);
+ if (kvm_vcpu_apicv_active(vcpu))
+ __avic_vcpu_load(vcpu, vcpu->cpu, AVIC_ACTIVATE);
else
- avic_vcpu_put(vcpu);
-
- avic_set_pi_irte_mode(vcpu, activated);
+ __avic_vcpu_put(vcpu, AVIC_DEACTIVATE);
}
void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
@@ -1162,20 +1061,25 @@ void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
if (!kvm_vcpu_apicv_active(vcpu))
return;
- /*
- * Unload the AVIC when the vCPU is about to block, _before_
- * the vCPU actually blocks.
- *
- * Any IRQs that arrive before IsRunning=0 will not cause an
- * incomplete IPI vmexit on the source, therefore vIRR will also
- * be checked by kvm_vcpu_check_block() before blocking. The
- * memory barrier implicit in set_current_state orders writing
- * IsRunning=0 before reading the vIRR. The processor needs a
- * matching memory barrier on interrupt delivery between writing
- * IRR and reading IsRunning; the lack of this barrier might be
- * the cause of errata #1235).
- */
- avic_vcpu_put(vcpu);
+ /*
+ * Unload the AVIC when the vCPU is about to block, _before_ the vCPU
+ * actually blocks.
+ *
+ * Note, any IRQs that arrive before IsRunning=0 will not cause an
+ * incomplete IPI vmexit on the source; kvm_vcpu_check_block() handles
+ * this by checking vIRR one last time before blocking. The memory
+ * barrier implicit in set_current_state orders writing IsRunning=0
+ * before reading the vIRR. The processor needs a matching memory
+ * barrier on interrupt delivery between writing IRR and reading
+ * IsRunning; the lack of this barrier might be the cause of errata #1235).
+ *
+ * Clear IsRunning=0 even if guest IRQs are disabled, i.e. even if KVM
+ * doesn't need to detect events for scheduling purposes. The doorbell
+ * used to signal running vCPUs cannot be blocked, i.e. will perturb the
+ * CPU and cause noisy neighbor problems if the VM is sending interrupts
+ * to the vCPU while it's scheduled out.
+ */
+ __avic_vcpu_put(vcpu, AVIC_START_BLOCKING);
}
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
@@ -1228,6 +1132,14 @@ bool avic_hardware_setup(void)
if (x2avic_enabled)
pr_info("x2AVIC enabled\n");
+ /*
+ * Disable IPI virtualization for AMD Family 17h CPUs (Zen1 and Zen2)
+ * due to erratum 1235, which results in missed VM-Exits on the sender
+ * and thus missed wake events for blocking vCPUs due to the CPU
+ * failing to see a software update to clear IsRunning.
+ */
+ enable_ipiv = enable_ipiv && boot_cpu_data.x86 != 0x17;
+
amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
return true;
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 8427a48b8b7a..b7fd2e869998 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -185,12 +185,87 @@ void recalc_intercepts(struct vcpu_svm *svm)
}
/*
+ * This array (and its actual size) holds the set of offsets (indexing by chunk
+ * size) to process when merging vmcb12's MSRPM with vmcb01's MSRPM. Note, the
+ * set of MSRs for which interception is disabled in vmcb01 is per-vCPU, e.g.
+ * based on CPUID features. This array only tracks MSRs that *might* be passed
+ * through to the guest.
+ *
+ * Hardcode the capacity of the array based on the maximum number of _offsets_.
+ * MSRs are batched together, so there are fewer offsets than MSRs.
+ */
+static int nested_svm_msrpm_merge_offsets[7] __ro_after_init;
+static int nested_svm_nr_msrpm_merge_offsets __ro_after_init;
+typedef unsigned long nsvm_msrpm_merge_t;
+
+int __init nested_svm_init_msrpm_merge_offsets(void)
+{
+ static const u32 merge_msrs[] __initconst = {
+ MSR_STAR,
+ MSR_IA32_SYSENTER_CS,
+ MSR_IA32_SYSENTER_EIP,
+ MSR_IA32_SYSENTER_ESP,
+ #ifdef CONFIG_X86_64
+ MSR_GS_BASE,
+ MSR_FS_BASE,
+ MSR_KERNEL_GS_BASE,
+ MSR_LSTAR,
+ MSR_CSTAR,
+ MSR_SYSCALL_MASK,
+ #endif
+ MSR_IA32_SPEC_CTRL,
+ MSR_IA32_PRED_CMD,
+ MSR_IA32_FLUSH_CMD,
+ MSR_IA32_APERF,
+ MSR_IA32_MPERF,
+ MSR_IA32_LASTBRANCHFROMIP,
+ MSR_IA32_LASTBRANCHTOIP,
+ MSR_IA32_LASTINTFROMIP,
+ MSR_IA32_LASTINTTOIP,
+ };
+ int i, j;
+
+ for (i = 0; i < ARRAY_SIZE(merge_msrs); i++) {
+ int bit_nr = svm_msrpm_bit_nr(merge_msrs[i]);
+ u32 offset;
+
+ if (WARN_ON(bit_nr < 0))
+ return -EIO;
+
+ /*
+ * Merging is done in chunks to reduce the number of accesses
+ * to L1's bitmap.
+ */
+ offset = bit_nr / BITS_PER_BYTE / sizeof(nsvm_msrpm_merge_t);
+
+ for (j = 0; j < nested_svm_nr_msrpm_merge_offsets; j++) {
+ if (nested_svm_msrpm_merge_offsets[j] == offset)
+ break;
+ }
+
+ if (j < nested_svm_nr_msrpm_merge_offsets)
+ continue;
+
+ if (WARN_ON(j >= ARRAY_SIZE(nested_svm_msrpm_merge_offsets)))
+ return -EIO;
+
+ nested_svm_msrpm_merge_offsets[j] = offset;
+ nested_svm_nr_msrpm_merge_offsets++;
+ }
+
+ return 0;
+}
+
+/*
* Merge L0's (KVM) and L1's (Nested VMCB) MSR permission bitmaps. The function
* is optimized in that it only merges the parts where KVM MSR permission bitmap
* may contain zero bits.
*/
-static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
+static bool nested_svm_merge_msrpm(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ nsvm_msrpm_merge_t *msrpm02 = svm->nested.msrpm;
+ nsvm_msrpm_merge_t *msrpm01 = svm->msrpm;
int i;
/*
@@ -205,7 +280,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
if (!svm->nested.force_msr_bitmap_recalc) {
struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments;
- if (kvm_hv_hypercall_enabled(&svm->vcpu) &&
+ if (kvm_hv_hypercall_enabled(vcpu) &&
hve->hv_enlightenments_control.msr_bitmap &&
(svm->nested.ctl.clean & BIT(HV_VMCB_NESTED_ENLIGHTENMENTS)))
goto set_msrpm_base_pa;
@@ -215,25 +290,17 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
return true;
- for (i = 0; i < MSRPM_OFFSETS; i++) {
- u32 value, p;
- u64 offset;
+ for (i = 0; i < nested_svm_nr_msrpm_merge_offsets; i++) {
+ const int p = nested_svm_msrpm_merge_offsets[i];
+ nsvm_msrpm_merge_t l1_val;
+ gpa_t gpa;
- if (msrpm_offsets[i] == 0xffffffff)
- break;
+ gpa = svm->nested.ctl.msrpm_base_pa + (p * sizeof(l1_val));
- p = msrpm_offsets[i];
-
- /* x2apic msrs are intercepted always for the nested guest */
- if (is_x2apic_msrpm_offset(p))
- continue;
-
- offset = svm->nested.ctl.msrpm_base_pa + (p * 4);
-
- if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))
+ if (kvm_vcpu_read_guest(vcpu, gpa, &l1_val, sizeof(l1_val)))
return false;
- svm->nested.msrpm[p] = svm->msrpm[p] | value;
+ msrpm02[p] = msrpm01[p] | l1_val;
}
svm->nested.force_msr_bitmap_recalc = false;
@@ -937,7 +1004,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, true))
goto out_exit_err;
- if (nested_svm_vmrun_msrpm(svm))
+ if (nested_svm_merge_msrpm(vcpu))
goto out;
out_exit_err:
@@ -1230,7 +1297,6 @@ int svm_allocate_nested(struct vcpu_svm *svm)
svm->nested.msrpm = svm_vcpu_alloc_msrpm();
if (!svm->nested.msrpm)
goto err_free_vmcb02;
- svm_vcpu_init_msrpm(&svm->vcpu, svm->nested.msrpm);
svm->nested.initialized = true;
return 0;
@@ -1290,26 +1356,26 @@ void svm_leave_nested(struct kvm_vcpu *vcpu)
static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
{
- u32 offset, msr, value;
- int write, mask;
+ gpa_t base = svm->nested.ctl.msrpm_base_pa;
+ int write, bit_nr;
+ u8 value, mask;
+ u32 msr;
if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
return NESTED_EXIT_HOST;
msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
- offset = svm_msrpm_offset(msr);
+ bit_nr = svm_msrpm_bit_nr(msr);
write = svm->vmcb->control.exit_info_1 & 1;
- mask = 1 << ((2 * (msr & 0xf)) + write);
- if (offset == MSR_INVALID)
+ if (bit_nr < 0)
return NESTED_EXIT_DONE;
- /* Offset is in 32 bit units but need in 8 bit units */
- offset *= 4;
-
- if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.ctl.msrpm_base_pa + offset, &value, 4))
+ if (kvm_vcpu_read_guest(&svm->vcpu, base + bit_nr / BITS_PER_BYTE,
+ &value, sizeof(value)))
return NESTED_EXIT_DONE;
+ mask = BIT(write) << (bit_nr & (BITS_PER_BYTE - 1));
return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
}
@@ -1819,13 +1885,11 @@ out_free:
static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
{
- struct vcpu_svm *svm = to_svm(vcpu);
-
if (WARN_ON(!is_guest_mode(vcpu)))
return true;
if (!vcpu->arch.pdptrs_from_userspace &&
- !nested_npt_enabled(svm) && is_pae_paging(vcpu))
+ !nested_npt_enabled(to_svm(vcpu)) && is_pae_paging(vcpu))
/*
* Reload the guest's PDPTRs since after a migration
* the guest CR3 might be restored prior to setting the nested
@@ -1834,7 +1898,7 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
return false;
- if (!nested_svm_vmrun_msrpm(svm)) {
+ if (!nested_svm_merge_msrpm(vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror =
KVM_INTERNAL_ERROR_EMULATION;
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index b201f77fcd49..2fbdebf79fbb 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -117,6 +117,7 @@ static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid)
*/
down_write(&sev_deactivate_lock);
+ /* SNP firmware requires use of WBINVD for ASID recycling. */
wbinvd_on_all_cpus();
if (sev_snp_enabled)
@@ -446,7 +447,12 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
init_args.probe = false;
ret = sev_platform_init(&init_args);
if (ret)
- goto e_free;
+ goto e_free_asid;
+
+ if (!zalloc_cpumask_var(&sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
+ ret = -ENOMEM;
+ goto e_free_asid;
+ }
/* This needs to happen after SEV/SNP firmware initialization. */
if (vm_type == KVM_X86_SNP_VM) {
@@ -464,6 +470,8 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
return 0;
e_free:
+ free_cpumask_var(sev->have_run_cpus);
+e_free_asid:
argp->error = init_args.error;
sev_asid_free(sev);
sev->asid = 0;
@@ -708,6 +716,33 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages)
}
}
+static void sev_writeback_caches(struct kvm *kvm)
+{
+ /*
+ * Note, the caller is responsible for ensuring correctness if the mask
+ * can be modified, e.g. if a CPU could be doing VMRUN.
+ */
+ if (cpumask_empty(to_kvm_sev_info(kvm)->have_run_cpus))
+ return;
+
+ /*
+ * Ensure that all dirty guest tagged cache entries are written back
+ * before releasing the pages back to the system for use. CLFLUSH will
+ * not do this without SME_COHERENT, and flushing many cache lines
+ * individually is slower than blasting WBINVD for large VMs, so issue
+ * WBNOINVD (or WBINVD if the "no invalidate" variant is unsupported)
+ * on CPUs that have done VMRUN, i.e. may have dirtied data using the
+ * VM's ASID.
+ *
+ * For simplicity, never remove CPUs from the bitmap. Ideally, KVM
+ * would clear the mask when flushing caches, but doing so requires
+ * serializing multiple calls and having responding CPUs (to the IPI)
+ * mark themselves as still running if they are running (or about to
+ * run) a vCPU for the VM.
+ */
+ wbnoinvd_on_cpus_mask(to_kvm_sev_info(kvm)->have_run_cpus);
+}
+
static unsigned long get_num_contig_pages(unsigned long idx,
struct page **inpages, unsigned long npages)
{
@@ -2037,6 +2072,17 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
if (ret)
goto out_source_vcpu;
+ /*
+ * Allocate a new have_run_cpus for the destination, i.e. don't copy
+ * the set of CPUs from the source. If a CPU was used to run a vCPU in
+ * the source VM but is never used for the destination VM, then the CPU
+ * can only have cached memory that was accessible to the source VM.
+ */
+ if (!zalloc_cpumask_var(&dst_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
+ ret = -ENOMEM;
+ goto out_source_vcpu;
+ }
+
sev_migrate_from(kvm, source_kvm);
kvm_vm_dead(source_kvm);
cg_cleanup_sev = src_sev;
@@ -2135,11 +2181,7 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
return -EINVAL;
/* Check for policy bits that must be set */
- if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO) ||
- !(params.policy & SNP_POLICY_MASK_SMT))
- return -EINVAL;
-
- if (params.policy & SNP_POLICY_MASK_SINGLE_SOCKET)
+ if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO))
return -EINVAL;
sev->policy = params.policy;
@@ -2698,12 +2740,7 @@ int sev_mem_enc_unregister_region(struct kvm *kvm,
goto failed;
}
- /*
- * Ensure that all guest tagged cache entries are flushed before
- * releasing the pages back to the system for use. CLFLUSH will
- * not do this, so issue a WBINVD.
- */
- wbinvd_on_all_cpus();
+ sev_writeback_caches(kvm);
__unregister_enc_region_locked(kvm, region);
@@ -2745,13 +2782,18 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
goto e_unlock;
}
+ mirror_sev = to_kvm_sev_info(kvm);
+ if (!zalloc_cpumask_var(&mirror_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
+ ret = -ENOMEM;
+ goto e_unlock;
+ }
+
/*
* The mirror kvm holds an enc_context_owner ref so its asid can't
* disappear until we're done with it
*/
source_sev = to_kvm_sev_info(source_kvm);
kvm_get_kvm(source_kvm);
- mirror_sev = to_kvm_sev_info(kvm);
list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms);
/* Set enc_context_owner and copy its encryption context over */
@@ -2813,7 +2855,13 @@ void sev_vm_destroy(struct kvm *kvm)
WARN_ON(!list_empty(&sev->mirror_vms));
- /* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */
+ free_cpumask_var(sev->have_run_cpus);
+
+ /*
+ * If this is a mirror VM, remove it from the owner's list of a mirrors
+ * and skip ASID cleanup (the ASID is tied to the lifetime of the owner).
+ * Note, mirror VMs don't support registering encrypted regions.
+ */
if (is_mirroring_enc_context(kvm)) {
struct kvm *owner_kvm = sev->enc_context_owner;
@@ -2824,12 +2872,6 @@ void sev_vm_destroy(struct kvm *kvm)
return;
}
- /*
- * Ensure that all guest tagged cache entries are flushed before
- * releasing the pages back to the system for use. CLFLUSH will
- * not do this, so issue a WBINVD.
- */
- wbinvd_on_all_cpus();
/*
* if userspace was terminated before unregistering the memory regions
@@ -3099,30 +3141,29 @@ static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va)
/*
* VM Page Flush takes a host virtual address and a guest ASID. Fall
- * back to WBINVD if this faults so as not to make any problems worse
- * by leaving stale encrypted data in the cache.
+ * back to full writeback of caches if this faults so as not to make
+ * any problems worse by leaving stale encrypted data in the cache.
*/
if (WARN_ON_ONCE(wrmsrq_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid)))
- goto do_wbinvd;
+ goto do_sev_writeback_caches;
return;
-do_wbinvd:
- wbinvd_on_all_cpus();
+do_sev_writeback_caches:
+ sev_writeback_caches(vcpu->kvm);
}
void sev_guest_memory_reclaimed(struct kvm *kvm)
{
/*
* With SNP+gmem, private/encrypted memory is unreachable via the
- * hva-based mmu notifiers, so these events are only actually
- * pertaining to shared pages where there is no need to perform
- * the WBINVD to flush associated caches.
+ * hva-based mmu notifiers, i.e. these events are explicitly scoped to
+ * shared pages, where there's no need to flush caches.
*/
if (!sev_guest(kvm) || sev_snp_guest(kvm))
return;
- wbinvd_on_all_cpus();
+ sev_writeback_caches(kvm);
}
void sev_free_vcpu(struct kvm_vcpu *vcpu)
@@ -3454,6 +3495,15 @@ int pre_sev_run(struct vcpu_svm *svm, int cpu)
if (sev_es_guest(kvm) && !VALID_PAGE(svm->vmcb->control.vmsa_pa))
return -EINVAL;
+ /*
+ * To optimize cache flushes when memory is reclaimed from an SEV VM,
+ * track physical CPUs that enter the guest for SEV VMs and thus can
+ * have encrypted, dirty data in the cache, and flush caches only for
+ * CPUs that have entered the guest.
+ */
+ if (!cpumask_test_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus))
+ cpumask_set_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus);
+
/* Assign the asid allocated with this SEV guest */
svm->asid = asid;
@@ -3886,9 +3936,9 @@ void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu)
* From this point forward, the VMSA will always be a guest-mapped page
* rather than the initial one allocated by KVM in svm->sev_es.vmsa. In
* theory, svm->sev_es.vmsa could be free'd and cleaned up here, but
- * that involves cleanups like wbinvd_on_all_cpus() which would ideally
- * be handled during teardown rather than guest boot. Deferring that
- * also allows the existing logic for SEV-ES VMSAs to be re-used with
+ * that involves cleanups like flushing caches, which would ideally be
+ * handled during teardown rather than guest boot. Deferring that also
+ * allows the existing logic for SEV-ES VMSAs to be re-used with
* minimal SNP-specific changes.
*/
svm->sev_es.snp_has_guest_vmsa = true;
@@ -4390,16 +4440,17 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in)
count, in);
}
-static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm)
+void sev_es_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *vcpu = &svm->vcpu;
-
- if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) {
- bool v_tsc_aux = guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) ||
- guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID);
+ /* Clear intercepts on MSRs that are context switched by hardware. */
+ svm_disable_intercept_for_msr(vcpu, MSR_AMD64_SEV_ES_GHCB, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_EFER, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_IA32_CR_PAT, MSR_TYPE_RW);
- set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, v_tsc_aux, v_tsc_aux);
- }
+ if (boot_cpu_has(X86_FEATURE_V_TSC_AUX))
+ svm_set_intercept_for_msr(vcpu, MSR_TSC_AUX, MSR_TYPE_RW,
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID));
/*
* For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if
@@ -4413,11 +4464,9 @@ static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm)
* XSAVES being exposed to the guest so that KVM can at least honor
* guest CPUID for RDMSR and WRMSR.
*/
- if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) &&
- guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 1, 1);
- else
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 0, 0);
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_XSS, MSR_TYPE_RW,
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) ||
+ !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES));
}
void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm)
@@ -4429,16 +4478,12 @@ void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm)
best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
if (best)
vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
-
- if (sev_es_guest(svm->vcpu.kvm))
- sev_es_vcpu_after_set_cpuid(svm);
}
static void sev_es_init_vmcb(struct vcpu_svm *svm)
{
struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm);
struct vmcb *vmcb = svm->vmcb01.ptr;
- struct kvm_vcpu *vcpu = &svm->vcpu;
svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE;
@@ -4496,10 +4541,6 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm)
/* Can't intercept XSETBV, HV can't modify XCR0 directly */
svm_clr_intercept(svm, INTERCEPT_XSETBV);
-
- /* Clear intercepts on selected MSRs */
- set_msr_interception(vcpu, svm->msrpm, MSR_EFER, 1, 1);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_CR_PAT, 1, 1);
}
void sev_init_vmcb(struct vcpu_svm *svm)
@@ -4888,7 +4929,7 @@ void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
/*
* SEV-ES avoids host/guest cache coherency issues through
- * WBINVD hooks issued via MMU notifiers during run-time, and
+ * WBNOINVD hooks issued via MMU notifiers during run-time, and
* KVM's VM destroy path at shutdown. Those MMU notifier events
* don't cover gmem since there is no requirement to map pages
* to a HVA in order to use them for a running guest. While the
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index ab9b947dbf4f..d9931c6c4bc6 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -72,8 +72,6 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
static bool erratum_383_found __read_mostly;
-u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
-
/*
* Set osvw_len to higher value when updated Revision Guides
* are published and we know what the new status bits are
@@ -82,72 +80,6 @@ static uint64_t osvw_len = 4, osvw_status;
static DEFINE_PER_CPU(u64, current_tsc_ratio);
-#define X2APIC_MSR(x) (APIC_BASE_MSR + (x >> 4))
-
-static const struct svm_direct_access_msrs {
- u32 index; /* Index of the MSR */
- bool always; /* True if intercept is initially cleared */
-} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
- { .index = MSR_STAR, .always = true },
- { .index = MSR_IA32_SYSENTER_CS, .always = true },
- { .index = MSR_IA32_SYSENTER_EIP, .always = false },
- { .index = MSR_IA32_SYSENTER_ESP, .always = false },
-#ifdef CONFIG_X86_64
- { .index = MSR_GS_BASE, .always = true },
- { .index = MSR_FS_BASE, .always = true },
- { .index = MSR_KERNEL_GS_BASE, .always = true },
- { .index = MSR_LSTAR, .always = true },
- { .index = MSR_CSTAR, .always = true },
- { .index = MSR_SYSCALL_MASK, .always = true },
-#endif
- { .index = MSR_IA32_SPEC_CTRL, .always = false },
- { .index = MSR_IA32_PRED_CMD, .always = false },
- { .index = MSR_IA32_FLUSH_CMD, .always = false },
- { .index = MSR_IA32_DEBUGCTLMSR, .always = false },
- { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
- { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
- { .index = MSR_IA32_LASTINTFROMIP, .always = false },
- { .index = MSR_IA32_LASTINTTOIP, .always = false },
- { .index = MSR_IA32_XSS, .always = false },
- { .index = MSR_EFER, .always = false },
- { .index = MSR_IA32_CR_PAT, .always = false },
- { .index = MSR_AMD64_SEV_ES_GHCB, .always = true },
- { .index = MSR_TSC_AUX, .always = false },
- { .index = X2APIC_MSR(APIC_ID), .always = false },
- { .index = X2APIC_MSR(APIC_LVR), .always = false },
- { .index = X2APIC_MSR(APIC_TASKPRI), .always = false },
- { .index = X2APIC_MSR(APIC_ARBPRI), .always = false },
- { .index = X2APIC_MSR(APIC_PROCPRI), .always = false },
- { .index = X2APIC_MSR(APIC_EOI), .always = false },
- { .index = X2APIC_MSR(APIC_RRR), .always = false },
- { .index = X2APIC_MSR(APIC_LDR), .always = false },
- { .index = X2APIC_MSR(APIC_DFR), .always = false },
- { .index = X2APIC_MSR(APIC_SPIV), .always = false },
- { .index = X2APIC_MSR(APIC_ISR), .always = false },
- { .index = X2APIC_MSR(APIC_TMR), .always = false },
- { .index = X2APIC_MSR(APIC_IRR), .always = false },
- { .index = X2APIC_MSR(APIC_ESR), .always = false },
- { .index = X2APIC_MSR(APIC_ICR), .always = false },
- { .index = X2APIC_MSR(APIC_ICR2), .always = false },
-
- /*
- * Note:
- * AMD does not virtualize APIC TSC-deadline timer mode, but it is
- * emulated by KVM. When setting APIC LVTT (0x832) register bit 18,
- * the AVIC hardware would generate GP fault. Therefore, always
- * intercept the MSR 0x832, and do not setup direct_access_msr.
- */
- { .index = X2APIC_MSR(APIC_LVTTHMR), .always = false },
- { .index = X2APIC_MSR(APIC_LVTPC), .always = false },
- { .index = X2APIC_MSR(APIC_LVT0), .always = false },
- { .index = X2APIC_MSR(APIC_LVT1), .always = false },
- { .index = X2APIC_MSR(APIC_LVTERR), .always = false },
- { .index = X2APIC_MSR(APIC_TMICT), .always = false },
- { .index = X2APIC_MSR(APIC_TMCCT), .always = false },
- { .index = X2APIC_MSR(APIC_TDCR), .always = false },
- { .index = MSR_INVALID, .always = false },
-};
-
/*
* These 2 parameters are used to config the controls for Pause-Loop Exiting:
* pause_filter_count: On processors that support Pause filtering(indicated
@@ -232,6 +164,7 @@ module_param(tsc_scaling, int, 0444);
*/
static bool avic;
module_param(avic, bool, 0444);
+module_param(enable_ipiv, bool, 0444);
module_param(enable_device_posted_irqs, bool, 0444);
@@ -264,33 +197,6 @@ static DEFINE_MUTEX(vmcb_dump_mutex);
*/
static int tsc_aux_uret_slot __read_mostly = -1;
-static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
-
-#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
-#define MSRS_RANGE_SIZE 2048
-#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
-
-u32 svm_msrpm_offset(u32 msr)
-{
- u32 offset;
- int i;
-
- for (i = 0; i < NUM_MSR_MAPS; i++) {
- if (msr < msrpm_ranges[i] ||
- msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
- continue;
-
- offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
- offset += (i * MSRS_RANGE_SIZE); /* add range offset */
-
- /* Now we have the u8 offset - but need the u32 offset */
- return offset / 4;
- }
-
- /* MSR not in any range */
- return MSR_INVALID;
-}
-
static int get_npt_level(void)
{
#ifdef CONFIG_X86_64
@@ -757,50 +663,8 @@ static void clr_dr_intercepts(struct vcpu_svm *svm)
recalc_intercepts(svm);
}
-static int direct_access_msr_slot(u32 msr)
-{
- u32 i;
-
- for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
- if (direct_access_msrs[i].index == msr)
- return i;
-
- return -ENOENT;
-}
-
-static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read,
- int write)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- int slot = direct_access_msr_slot(msr);
-
- if (slot == -ENOENT)
- return;
-
- /* Set the shadow bitmaps to the desired intercept states */
- if (read)
- set_bit(slot, svm->shadow_msr_intercept.read);
- else
- clear_bit(slot, svm->shadow_msr_intercept.read);
-
- if (write)
- set_bit(slot, svm->shadow_msr_intercept.write);
- else
- clear_bit(slot, svm->shadow_msr_intercept.write);
-}
-
-static bool valid_msr_intercept(u32 index)
-{
- return direct_access_msr_slot(index) != -ENOENT;
-}
-
static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
{
- u8 bit_write;
- unsigned long tmp;
- u32 offset;
- u32 *msrpm;
-
/*
* For non-nested case:
* If the L01 MSR bitmap does not intercept the MSR, then we need to
@@ -810,90 +674,102 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
* If the L02 MSR bitmap does not intercept the MSR, then we need to
* save it.
*/
- msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
- to_svm(vcpu)->msrpm;
+ void *msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm :
+ to_svm(vcpu)->msrpm;
- offset = svm_msrpm_offset(msr);
- bit_write = 2 * (msr & 0x0f) + 1;
- tmp = msrpm[offset];
-
- BUG_ON(offset == MSR_INVALID);
-
- return test_bit(bit_write, &tmp);
+ return svm_test_msr_bitmap_write(msrpm, msr);
}
-static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
- u32 msr, int read, int write)
+void svm_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set)
{
struct vcpu_svm *svm = to_svm(vcpu);
- u8 bit_read, bit_write;
- unsigned long tmp;
- u32 offset;
+ void *msrpm = svm->msrpm;
- /*
- * If this warning triggers extend the direct_access_msrs list at the
- * beginning of the file
- */
- WARN_ON(!valid_msr_intercept(msr));
-
- /* Enforce non allowed MSRs to trap */
- if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
- read = 0;
-
- if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
- write = 0;
-
- offset = svm_msrpm_offset(msr);
- bit_read = 2 * (msr & 0x0f);
- bit_write = 2 * (msr & 0x0f) + 1;
- tmp = msrpm[offset];
-
- BUG_ON(offset == MSR_INVALID);
-
- read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp);
- write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
+ /* Don't disable interception for MSRs userspace wants to handle. */
+ if (type & MSR_TYPE_R) {
+ if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
+ svm_clear_msr_bitmap_read(msrpm, msr);
+ else
+ svm_set_msr_bitmap_read(msrpm, msr);
+ }
- msrpm[offset] = tmp;
+ if (type & MSR_TYPE_W) {
+ if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
+ svm_clear_msr_bitmap_write(msrpm, msr);
+ else
+ svm_set_msr_bitmap_write(msrpm, msr);
+ }
svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
svm->nested.force_msr_bitmap_recalc = true;
}
-void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
- int read, int write)
-{
- set_shadow_msr_intercept(vcpu, msr, read, write);
- set_msr_interception_bitmap(vcpu, msrpm, msr, read, write);
-}
-
-u32 *svm_vcpu_alloc_msrpm(void)
+void *svm_alloc_permissions_map(unsigned long size, gfp_t gfp_mask)
{
- unsigned int order = get_order(MSRPM_SIZE);
- struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order);
- u32 *msrpm;
+ unsigned int order = get_order(size);
+ struct page *pages = alloc_pages(gfp_mask, order);
+ void *pm;
if (!pages)
return NULL;
- msrpm = page_address(pages);
- memset(msrpm, 0xff, PAGE_SIZE * (1 << order));
+ /*
+ * Set all bits in the permissions map so that all MSR and I/O accesses
+ * are intercepted by default.
+ */
+ pm = page_address(pages);
+ memset(pm, 0xff, PAGE_SIZE * (1 << order));
- return msrpm;
+ return pm;
}
-void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
+static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu)
{
- int i;
+ bool intercept = !(to_svm(vcpu)->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK);
- for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
- if (!direct_access_msrs[i].always)
- continue;
- set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1);
- }
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHFROMIP, MSR_TYPE_RW, intercept);
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHTOIP, MSR_TYPE_RW, intercept);
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTFROMIP, MSR_TYPE_RW, intercept);
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTTOIP, MSR_TYPE_RW, intercept);
+
+ if (sev_es_guest(vcpu->kvm))
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_DEBUGCTLMSR, MSR_TYPE_RW, intercept);
}
void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept)
{
+ static const u32 x2avic_passthrough_msrs[] = {
+ X2APIC_MSR(APIC_ID),
+ X2APIC_MSR(APIC_LVR),
+ X2APIC_MSR(APIC_TASKPRI),
+ X2APIC_MSR(APIC_ARBPRI),
+ X2APIC_MSR(APIC_PROCPRI),
+ X2APIC_MSR(APIC_EOI),
+ X2APIC_MSR(APIC_RRR),
+ X2APIC_MSR(APIC_LDR),
+ X2APIC_MSR(APIC_DFR),
+ X2APIC_MSR(APIC_SPIV),
+ X2APIC_MSR(APIC_ISR),
+ X2APIC_MSR(APIC_TMR),
+ X2APIC_MSR(APIC_IRR),
+ X2APIC_MSR(APIC_ESR),
+ X2APIC_MSR(APIC_ICR),
+ X2APIC_MSR(APIC_ICR2),
+
+ /*
+ * Note! Always intercept LVTT, as TSC-deadline timer mode
+ * isn't virtualized by hardware, and the CPU will generate a
+ * #GP instead of a #VMEXIT.
+ */
+ X2APIC_MSR(APIC_LVTTHMR),
+ X2APIC_MSR(APIC_LVTPC),
+ X2APIC_MSR(APIC_LVT0),
+ X2APIC_MSR(APIC_LVT1),
+ X2APIC_MSR(APIC_LVTERR),
+ X2APIC_MSR(APIC_TMICT),
+ X2APIC_MSR(APIC_TMCCT),
+ X2APIC_MSR(APIC_TDCR),
+ };
int i;
if (intercept == svm->x2avic_msrs_intercepted)
@@ -902,84 +778,79 @@ void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept)
if (!x2avic_enabled)
return;
- for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) {
- int index = direct_access_msrs[i].index;
-
- if ((index < APIC_BASE_MSR) ||
- (index > APIC_BASE_MSR + 0xff))
- continue;
- set_msr_interception(&svm->vcpu, svm->msrpm, index,
- !intercept, !intercept);
- }
+ for (i = 0; i < ARRAY_SIZE(x2avic_passthrough_msrs); i++)
+ svm_set_intercept_for_msr(&svm->vcpu, x2avic_passthrough_msrs[i],
+ MSR_TYPE_RW, intercept);
svm->x2avic_msrs_intercepted = intercept;
}
-void svm_vcpu_free_msrpm(u32 *msrpm)
+void svm_vcpu_free_msrpm(void *msrpm)
{
__free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
}
-static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
+static void svm_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- u32 i;
- /*
- * Set intercept permissions for all direct access MSRs again. They
- * will automatically get filtered through the MSR filter, so we are
- * back in sync after this.
- */
- for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
- u32 msr = direct_access_msrs[i].index;
- u32 read = test_bit(i, svm->shadow_msr_intercept.read);
- u32 write = test_bit(i, svm->shadow_msr_intercept.write);
-
- set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write);
- }
-}
-
-static void add_msr_offset(u32 offset)
-{
- int i;
-
- for (i = 0; i < MSRPM_OFFSETS; ++i) {
+ svm_disable_intercept_for_msr(vcpu, MSR_STAR, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
- /* Offset already in list? */
- if (msrpm_offsets[i] == offset)
- return;
+#ifdef CONFIG_X86_64
+ svm_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_LSTAR, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_CSTAR, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_SYSCALL_MASK, MSR_TYPE_RW);
+#endif
- /* Slot used by another offset? */
- if (msrpm_offsets[i] != MSR_INVALID)
- continue;
+ if (lbrv)
+ svm_recalc_lbr_msr_intercepts(vcpu);
- /* Add offset to list */
- msrpm_offsets[i] = offset;
+ if (cpu_feature_enabled(X86_FEATURE_IBPB))
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
+ !guest_has_pred_cmd_msr(vcpu));
- return;
- }
+ if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D))
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
/*
- * If this BUG triggers the msrpm_offsets table has an overflow. Just
- * increase MSRPM_OFFSETS in this case.
+ * Disable interception of SPEC_CTRL if KVM doesn't need to manually
+ * context switch the MSR (SPEC_CTRL is virtualized by the CPU), or if
+ * the guest has a non-zero SPEC_CTRL value, i.e. is likely actively
+ * using SPEC_CTRL.
*/
- BUG();
-}
-
-static void init_msrpm_offsets(void)
-{
- int i;
-
- memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
+ if (cpu_feature_enabled(X86_FEATURE_V_SPEC_CTRL))
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
+ !guest_has_spec_ctrl_msr(vcpu));
+ else
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
+ !svm->spec_ctrl);
- for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
- u32 offset;
+ /*
+ * Intercept SYSENTER_EIP and SYSENTER_ESP when emulating an Intel CPU,
+ * as AMD hardware only store 32 bits, whereas Intel CPUs track 64 bits.
+ */
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW,
+ guest_cpuid_is_intel_compatible(vcpu));
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW,
+ guest_cpuid_is_intel_compatible(vcpu));
+
+ if (kvm_aperfmperf_in_guest(vcpu->kvm)) {
+ svm_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R);
+ svm_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R);
+ }
- offset = svm_msrpm_offset(direct_access_msrs[i].index);
- BUG_ON(offset == MSR_INVALID);
+ if (sev_es_guest(vcpu->kvm))
+ sev_es_recalc_msr_intercepts(vcpu);
- add_msr_offset(offset);
- }
+ /*
+ * x2APIC intercepts are modified on-demand and cannot be filtered by
+ * userspace.
+ */
}
void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
@@ -998,13 +869,7 @@ void svm_enable_lbrv(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
-
- if (sev_es_guest(vcpu->kvm))
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_DEBUGCTLMSR, 1, 1);
+ svm_recalc_lbr_msr_intercepts(vcpu);
/* Move the LBR msrs to the vmcb02 so that the guest can see them. */
if (is_guest_mode(vcpu))
@@ -1016,12 +881,8 @@ static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm);
-
svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
+ svm_recalc_lbr_msr_intercepts(vcpu);
/*
* Move the LBR msrs back to the vmcb01 to avoid copying them
@@ -1176,9 +1037,10 @@ void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu)
}
/* Evaluate instruction intercepts that depend on guest CPUID features. */
-static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
- struct vcpu_svm *svm)
+static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
/*
* Intercept INVPCID if shadow paging is enabled to sync/free shadow
* roots, or if INVPCID is disabled in the guest to inject #UD.
@@ -1197,24 +1059,11 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
else
svm_set_intercept(svm, INTERCEPT_RDTSCP);
}
-}
-
-static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
if (guest_cpuid_is_intel_compatible(vcpu)) {
- /*
- * We must intercept SYSENTER_EIP and SYSENTER_ESP
- * accesses because the processor only stores 32 bits.
- * For the same reason we cannot use virtual VMLOAD/VMSAVE.
- */
svm_set_intercept(svm, INTERCEPT_VMLOAD);
svm_set_intercept(svm, INTERCEPT_VMSAVE);
svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
-
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
} else {
/*
* If hardware supports Virtual VMLOAD VMSAVE then enable it
@@ -1225,12 +1074,15 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
svm_clr_intercept(svm, INTERCEPT_VMSAVE);
svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
}
- /* No need to intercept these MSRs */
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
}
}
+static void svm_recalc_intercepts_after_set_cpuid(struct kvm_vcpu *vcpu)
+{
+ svm_recalc_instruction_intercepts(vcpu);
+ svm_recalc_msr_intercepts(vcpu);
+}
+
static void init_vmcb(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1353,15 +1205,6 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
svm_clr_intercept(svm, INTERCEPT_PAUSE);
}
- svm_recalc_instruction_intercepts(vcpu, svm);
-
- /*
- * If the host supports V_SPEC_CTRL then disable the interception
- * of MSR_IA32_SPEC_CTRL.
- */
- if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
-
if (kvm_vcpu_apicv_active(vcpu))
avic_init_vmcb(svm, vmcb);
@@ -1381,7 +1224,8 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
sev_init_vmcb(svm);
svm_hv_init_vmcb(vmcb);
- init_vmcb_after_set_cpuid(vcpu);
+
+ svm_recalc_intercepts_after_set_cpuid(vcpu);
vmcb_mark_all_dirty(vmcb);
@@ -1392,8 +1236,6 @@ static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- svm_vcpu_init_msrpm(vcpu, svm->msrpm);
-
svm_init_osvw(vcpu);
if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
@@ -1490,13 +1332,15 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ WARN_ON_ONCE(!list_empty(&svm->ir_list));
+
svm_leave_nested(vcpu);
svm_free_nested(svm);
sev_free_vcpu(vcpu);
__free_page(__sme_pa_to_page(svm->vmcb01.pa));
- __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
+ svm_vcpu_free_msrpm(svm->msrpm);
}
#ifdef CONFIG_CPU_MITIGATIONS
@@ -2880,12 +2724,11 @@ static int svm_get_feature_msr(u32 msr, u64 *data)
return 0;
}
-static bool
-sev_es_prevent_msr_access(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+static bool sev_es_prevent_msr_access(struct kvm_vcpu *vcpu,
+ struct msr_data *msr_info)
{
return sev_es_guest(vcpu->kvm) &&
vcpu->arch.guest_state_protected &&
- svm_msrpm_offset(msr_info->index) != MSR_INVALID &&
!msr_write_intercepted(vcpu, msr_info->index);
}
@@ -3116,11 +2959,11 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
*
* For nested:
* The handling of the MSR bitmap for L2 guests is done in
- * nested_svm_vmrun_msrpm.
+ * nested_svm_merge_msrpm().
* We update the L1 MSR bit as well since it will end up
* touching the MSR anyway now.
*/
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
+ svm_disable_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);
break;
case MSR_AMD64_VIRT_SPEC_CTRL:
if (!msr->host_initiated &&
@@ -3186,8 +3029,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
/*
* TSC_AUX is usually changed only during boot and never read
- * directly. Intercept TSC_AUX instead of exposing it to the
- * guest via direct_access_msrs, and switch it via user return.
+ * directly. Intercept TSC_AUX and switch it via user return.
*/
preempt_disable();
ret = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
@@ -4389,9 +4231,9 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
guest_state_exit_irqoff();
}
-static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
- bool force_immediate_exit)
+static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
{
+ bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
struct vcpu_svm *svm = to_svm(vcpu);
bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
@@ -4438,10 +4280,13 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
svm_hv_update_vp_id(svm->vmcb, vcpu);
/*
- * Run with all-zero DR6 unless needed, so that we can get the exact cause
- * of a #DB.
+ * Run with all-zero DR6 unless the guest can write DR6 freely, so that
+ * KVM can get the exact cause of a #DB. Note, loading guest DR6 from
+ * KVM's snapshot is only necessary when DR accesses won't exit.
*/
- if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
+ if (unlikely(run_flags & KVM_RUN_LOAD_GUEST_DR6))
+ svm_set_dr6(vcpu, vcpu->arch.dr6);
+ else if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
svm_set_dr6(vcpu, DR6_ACTIVE_LOW);
clgi();
@@ -4621,20 +4466,10 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
if (guest_cpuid_is_intel_compatible(vcpu))
guest_cpu_cap_clear(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
- svm_recalc_instruction_intercepts(vcpu, svm);
-
- if (boot_cpu_has(X86_FEATURE_IBPB))
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0,
- !!guest_has_pred_cmd_msr(vcpu));
-
- if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0,
- !!guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
-
if (sev_guest(vcpu->kvm))
sev_vcpu_after_set_cpuid(svm);
- init_vmcb_after_set_cpuid(vcpu);
+ svm_recalc_intercepts_after_set_cpuid(vcpu);
}
static bool svm_has_wbinvd_exit(void)
@@ -5185,7 +5020,7 @@ static int svm_vm_init(struct kvm *kvm)
}
if (!pause_filter_count || !pause_filter_thresh)
- kvm->arch.pause_in_guest = true;
+ kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE);
if (enable_apicv) {
int ret = avic_vm_init(kvm);
@@ -5252,7 +5087,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.set_idt = svm_set_idt,
.get_gdt = svm_get_gdt,
.set_gdt = svm_set_gdt,
- .set_dr6 = svm_set_dr6,
.set_dr7 = svm_set_dr7,
.sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
.cache_reg = svm_cache_reg,
@@ -5337,7 +5171,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.apic_init_signal_blocked = svm_apic_init_signal_blocked,
- .msr_filter_changed = svm_msr_filter_changed,
+ .recalc_msr_intercepts = svm_recalc_msr_intercepts,
.complete_emulated_msr = svm_complete_emulated_msr,
.vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
@@ -5473,11 +5307,8 @@ static __init void svm_set_cpu_caps(void)
static __init int svm_hardware_setup(void)
{
- int cpu;
- struct page *iopm_pages;
void *iopm_va;
- int r;
- unsigned int order = get_order(IOPM_SIZE);
+ int cpu, r;
/*
* NX is required for shadow paging and for NPT if the NX huge pages
@@ -5489,17 +5320,6 @@ static __init int svm_hardware_setup(void)
}
kvm_enable_efer_bits(EFER_NX);
- iopm_pages = alloc_pages(GFP_KERNEL, order);
-
- if (!iopm_pages)
- return -ENOMEM;
-
- iopm_va = page_address(iopm_pages);
- memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
- iopm_base = __sme_page_pa(iopm_pages);
-
- init_msrpm_offsets();
-
kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
XFEATURE_MASK_BNDCSR);
@@ -5533,6 +5353,10 @@ static __init int svm_hardware_setup(void)
if (nested) {
pr_info("Nested Virtualization enabled\n");
kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
+
+ r = nested_svm_init_msrpm_merge_offsets();
+ if (r)
+ return r;
}
/*
@@ -5564,6 +5388,13 @@ static __init int svm_hardware_setup(void)
else
pr_info("LBR virtualization supported\n");
}
+
+ iopm_va = svm_alloc_permissions_map(IOPM_SIZE, GFP_KERNEL);
+ if (!iopm_va)
+ return -ENOMEM;
+
+ iopm_base = __sme_set(__pa(iopm_va));
+
/*
* Note, SEV setup consumes npt_enabled and enable_mmio_caching (which
* may be modified by svm_adjust_mmio_mask()), as well as nrips.
@@ -5581,6 +5412,7 @@ static __init int svm_hardware_setup(void)
enable_apicv = avic = avic && avic_hardware_setup();
if (!enable_apicv) {
+ enable_ipiv = false;
svm_x86_ops.vcpu_blocking = NULL;
svm_x86_ops.vcpu_unblocking = NULL;
svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
@@ -5662,6 +5494,8 @@ static int __init svm_init(void)
{
int r;
+ KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_svm);
+
__unused_size_checks();
if (!kvm_is_svm_supported())
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index e6f3c6a153a0..58b9d168e0c8 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -44,9 +44,6 @@ static inline struct page *__sme_pa_to_page(unsigned long pa)
#define IOPM_SIZE PAGE_SIZE * 3
#define MSRPM_SIZE PAGE_SIZE * 2
-#define MAX_DIRECT_ACCESS_MSRS 48
-#define MSRPM_OFFSETS 32
-extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
extern bool npt_enabled;
extern int nrips;
extern int vgif;
@@ -113,6 +110,7 @@ struct kvm_sev_info {
void *guest_req_buf; /* Bounce buffer for SNP Guest Request input */
void *guest_resp_buf; /* Bounce buffer for SNP Guest Request output */
struct mutex guest_req_mutex; /* Must acquire before using bounce buffers */
+ cpumask_var_t have_run_cpus; /* CPUs that have done VMRUN for this VM. */
};
#define SEV_POLICY_NODBG BIT_ULL(0)
@@ -123,8 +121,8 @@ struct kvm_svm {
/* Struct members for AVIC */
u32 avic_vm_id;
- struct page *avic_logical_id_table_page;
- struct page *avic_physical_id_table_page;
+ u32 *avic_logical_id_table;
+ u64 *avic_physical_id_table;
struct hlist_node hnode;
struct kvm_sev_info sev_info;
@@ -189,8 +187,11 @@ struct svm_nested_state {
u64 vmcb12_gpa;
u64 last_vmcb12_gpa;
- /* These are the merged vectors */
- u32 *msrpm;
+ /*
+ * The MSR permissions map used for vmcb02, which is the merge result
+ * of vmcb01 and vmcb12
+ */
+ void *msrpm;
/* A VMRUN has started but has not yet been performed, so
* we cannot inject a nested vmexit yet. */
@@ -271,7 +272,7 @@ struct vcpu_svm {
*/
u64 virt_spec_ctrl;
- u32 *msrpm;
+ void *msrpm;
ulong nmi_iret_rip;
@@ -306,24 +307,26 @@ struct vcpu_svm {
u32 ldr_reg;
u32 dfr_reg;
- struct page *avic_backing_page;
- u64 *avic_physical_id_cache;
+
+ /* This is essentially a shadow of the vCPU's actual entry in the
+ * Physical ID table that is programmed into the VMCB, i.e. that is
+ * seen by the CPU. If IPI virtualization is disabled, IsRunning is
+ * only ever set in the shadow, i.e. is never propagated to the "real"
+ * table, so that hardware never sees IsRunning=1.
+ */
+ u64 avic_physical_id_entry;
/*
- * Per-vcpu list of struct amd_svm_iommu_ir:
- * This is used mainly to store interrupt remapping information used
- * when update the vcpu affinity. This avoids the need to scan for
- * IRTE and try to match ga_tag in the IOMMU driver.
+ * Per-vCPU list of irqfds that are eligible to post IRQs directly to
+ * the vCPU (a.k.a. device posted IRQs, a.k.a. IRQ bypass). The list
+ * is used to reconfigure IRTEs when the vCPU is loaded/put (to set the
+ * target pCPU), when AVIC is toggled on/off (to (de)activate bypass),
+ * and if the irqfd becomes ineligible for posting (to put the IRTE
+ * back into remapped mode).
*/
struct list_head ir_list;
spinlock_t ir_list_lock;
- /* Save desired MSR intercept (read: pass-through) state */
- struct {
- DECLARE_BITMAP(read, MAX_DIRECT_ACCESS_MSRS);
- DECLARE_BITMAP(write, MAX_DIRECT_ACCESS_MSRS);
- } shadow_msr_intercept;
-
struct vcpu_sev_es_state sev_es;
bool guest_state_loaded;
@@ -613,17 +616,74 @@ static inline void svm_vmgexit_no_action(struct vcpu_svm *svm, u64 data)
svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_NO_ACTION, data);
}
-/* svm.c */
-#define MSR_INVALID 0xffffffffU
+/*
+ * The MSRPM is 8KiB in size, divided into four 2KiB ranges (the fourth range
+ * is reserved). Each MSR within a range is covered by two bits, one each for
+ * read (bit 0) and write (bit 1), where a bit value of '1' means intercepted.
+ */
+#define SVM_MSRPM_BYTES_PER_RANGE 2048
+#define SVM_BITS_PER_MSR 2
+#define SVM_MSRS_PER_BYTE (BITS_PER_BYTE / SVM_BITS_PER_MSR)
+#define SVM_MSRS_PER_RANGE (SVM_MSRPM_BYTES_PER_RANGE * SVM_MSRS_PER_BYTE)
+static_assert(SVM_MSRS_PER_RANGE == 8192);
+#define SVM_MSRPM_OFFSET_MASK (SVM_MSRS_PER_RANGE - 1)
+
+static __always_inline int svm_msrpm_bit_nr(u32 msr)
+{
+ int range_nr;
+
+ switch (msr & ~SVM_MSRPM_OFFSET_MASK) {
+ case 0:
+ range_nr = 0;
+ break;
+ case 0xc0000000:
+ range_nr = 1;
+ break;
+ case 0xc0010000:
+ range_nr = 2;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return range_nr * SVM_MSRPM_BYTES_PER_RANGE * BITS_PER_BYTE +
+ (msr & SVM_MSRPM_OFFSET_MASK) * SVM_BITS_PER_MSR;
+}
+
+#define __BUILD_SVM_MSR_BITMAP_HELPER(rtype, action, bitop, access, bit_rw) \
+static inline rtype svm_##action##_msr_bitmap_##access(unsigned long *bitmap, \
+ u32 msr) \
+{ \
+ int bit_nr; \
+ \
+ bit_nr = svm_msrpm_bit_nr(msr); \
+ if (bit_nr < 0) \
+ return (rtype)true; \
+ \
+ return bitop##_bit(bit_nr + bit_rw, bitmap); \
+}
+
+#define BUILD_SVM_MSR_BITMAP_HELPERS(ret_type, action, bitop) \
+ __BUILD_SVM_MSR_BITMAP_HELPER(ret_type, action, bitop, read, 0) \
+ __BUILD_SVM_MSR_BITMAP_HELPER(ret_type, action, bitop, write, 1)
+
+BUILD_SVM_MSR_BITMAP_HELPERS(bool, test, test)
+BUILD_SVM_MSR_BITMAP_HELPERS(void, clear, __clear)
+BUILD_SVM_MSR_BITMAP_HELPERS(void, set, __set)
#define DEBUGCTL_RESERVED_BITS (~DEBUGCTLMSR_LBR)
+/* svm.c */
extern bool dump_invalid_vmcb;
-u32 svm_msrpm_offset(u32 msr);
-u32 *svm_vcpu_alloc_msrpm(void);
-void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm);
-void svm_vcpu_free_msrpm(u32 *msrpm);
+void *svm_alloc_permissions_map(unsigned long size, gfp_t gfp_mask);
+
+static inline void *svm_vcpu_alloc_msrpm(void)
+{
+ return svm_alloc_permissions_map(MSRPM_SIZE, GFP_KERNEL_ACCOUNT);
+}
+
+void svm_vcpu_free_msrpm(void *msrpm);
void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb);
void svm_enable_lbrv(struct kvm_vcpu *vcpu);
void svm_update_lbrv(struct kvm_vcpu *vcpu);
@@ -643,6 +703,20 @@ void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool disable);
void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
int trig_mode, int vec);
+void svm_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set);
+
+static inline void svm_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
+ u32 msr, int type)
+{
+ svm_set_intercept_for_msr(vcpu, msr, type, false);
+}
+
+static inline void svm_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
+ u32 msr, int type)
+{
+ svm_set_intercept_for_msr(vcpu, msr, type, true);
+}
+
/* nested.c */
#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
@@ -671,6 +745,8 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_NMI);
}
+int __init nested_svm_init_msrpm_merge_offsets(void);
+
int enter_svm_guest_mode(struct kvm_vcpu *vcpu,
u64 vmcb_gpa, struct vmcb *vmcb12, bool from_vmrun);
void svm_leave_nested(struct kvm_vcpu *vcpu);
@@ -721,7 +797,8 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \
BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) | \
- BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED) \
+ BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED) | \
+ BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG) \
)
bool avic_hardware_setup(void);
@@ -736,8 +813,9 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
void avic_vcpu_put(struct kvm_vcpu *vcpu);
void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu);
void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
-int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set);
+int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+ unsigned int host_irq, uint32_t guest_irq,
+ struct kvm_vcpu *vcpu, u32 vector);
void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
void avic_ring_doorbell(struct kvm_vcpu *vcpu);
@@ -752,6 +830,7 @@ void sev_init_vmcb(struct vcpu_svm *svm);
void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm);
int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
void sev_es_vcpu_reset(struct vcpu_svm *svm);
+void sev_es_recalc_msr_intercepts(struct kvm_vcpu *vcpu);
void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa);
void sev_es_unmap_ghcb(struct vcpu_svm *svm);
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index ba736cbb0587..57d79fd31df0 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -260,6 +260,86 @@ TRACE_EVENT(kvm_cpuid,
__entry->used_max_basic ? ", used max basic" : "")
);
+#define kvm_deliver_mode \
+ {0x0, "Fixed"}, \
+ {0x1, "LowPrio"}, \
+ {0x2, "SMI"}, \
+ {0x3, "Res3"}, \
+ {0x4, "NMI"}, \
+ {0x5, "INIT"}, \
+ {0x6, "SIPI"}, \
+ {0x7, "ExtINT"}
+
+#ifdef CONFIG_KVM_IOAPIC
+TRACE_EVENT(kvm_ioapic_set_irq,
+ TP_PROTO(__u64 e, int pin, bool coalesced),
+ TP_ARGS(e, pin, coalesced),
+
+ TP_STRUCT__entry(
+ __field( __u64, e )
+ __field( int, pin )
+ __field( bool, coalesced )
+ ),
+
+ TP_fast_assign(
+ __entry->e = e;
+ __entry->pin = pin;
+ __entry->coalesced = coalesced;
+ ),
+
+ TP_printk("pin %u dst %x vec %u (%s|%s|%s%s)%s",
+ __entry->pin, (u8)(__entry->e >> 56), (u8)__entry->e,
+ __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
+ (__entry->e & (1<<11)) ? "logical" : "physical",
+ (__entry->e & (1<<15)) ? "level" : "edge",
+ (__entry->e & (1<<16)) ? "|masked" : "",
+ __entry->coalesced ? " (coalesced)" : "")
+);
+
+TRACE_EVENT(kvm_ioapic_delayed_eoi_inj,
+ TP_PROTO(__u64 e),
+ TP_ARGS(e),
+
+ TP_STRUCT__entry(
+ __field( __u64, e )
+ ),
+
+ TP_fast_assign(
+ __entry->e = e;
+ ),
+
+ TP_printk("dst %x vec %u (%s|%s|%s%s)",
+ (u8)(__entry->e >> 56), (u8)__entry->e,
+ __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
+ (__entry->e & (1<<11)) ? "logical" : "physical",
+ (__entry->e & (1<<15)) ? "level" : "edge",
+ (__entry->e & (1<<16)) ? "|masked" : "")
+);
+#endif
+
+TRACE_EVENT(kvm_msi_set_irq,
+ TP_PROTO(__u64 address, __u64 data),
+ TP_ARGS(address, data),
+
+ TP_STRUCT__entry(
+ __field( __u64, address )
+ __field( __u64, data )
+ ),
+
+ TP_fast_assign(
+ __entry->address = address;
+ __entry->data = data;
+ ),
+
+ TP_printk("dst %llx vec %u (%s|%s|%s%s)",
+ (u8)(__entry->address >> 12) | ((__entry->address >> 32) & 0xffffff00),
+ (u8)__entry->data,
+ __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode),
+ (__entry->address & (1<<2)) ? "logical" : "physical",
+ (__entry->data & (1<<15)) ? "level" : "edge",
+ (__entry->address & (1<<3)) ? "|rh" : "")
+);
+
#define AREG(x) { APIC_##x, "APIC_" #x }
#define kvm_trace_symbol_apic \
@@ -1096,37 +1176,32 @@ TRACE_EVENT(kvm_smm_transition,
* Tracepoint for VT-d posted-interrupts and AMD-Vi Guest Virtual APIC.
*/
TRACE_EVENT(kvm_pi_irte_update,
- TP_PROTO(unsigned int host_irq, unsigned int vcpu_id,
- unsigned int gsi, unsigned int gvec,
- u64 pi_desc_addr, bool set),
- TP_ARGS(host_irq, vcpu_id, gsi, gvec, pi_desc_addr, set),
+ TP_PROTO(unsigned int host_irq, struct kvm_vcpu *vcpu,
+ unsigned int gsi, unsigned int gvec, bool set),
+ TP_ARGS(host_irq, vcpu, gsi, gvec, set),
TP_STRUCT__entry(
__field( unsigned int, host_irq )
- __field( unsigned int, vcpu_id )
+ __field( int, vcpu_id )
__field( unsigned int, gsi )
__field( unsigned int, gvec )
- __field( u64, pi_desc_addr )
__field( bool, set )
),
TP_fast_assign(
__entry->host_irq = host_irq;
- __entry->vcpu_id = vcpu_id;
+ __entry->vcpu_id = vcpu ? vcpu->vcpu_id : -1;
__entry->gsi = gsi;
__entry->gvec = gvec;
- __entry->pi_desc_addr = pi_desc_addr;
__entry->set = set;
),
- TP_printk("PI is %s for irq %u, vcpu %u, gsi: 0x%x, "
- "gvec: 0x%x, pi_desc_addr: 0x%llx",
+ TP_printk("PI is %s for irq %u, vcpu %d, gsi: 0x%x, gvec: 0x%x",
__entry->set ? "enabled and being updated" : "disabled",
__entry->host_irq,
__entry->vcpu_id,
__entry->gsi,
- __entry->gvec,
- __entry->pi_desc_addr)
+ __entry->gvec)
);
/*
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index cb6588238f46..5316c27f6099 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -15,7 +15,6 @@ extern bool __read_mostly enable_ept;
extern bool __read_mostly enable_unrestricted_guest;
extern bool __read_mostly enable_ept_ad_bits;
extern bool __read_mostly enable_pml;
-extern bool __read_mostly enable_ipiv;
extern int __read_mostly pt_mode;
#define PT_MODE_SYSTEM 0
diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h
index a0c5e8781c33..bc5ece76533a 100644
--- a/arch/x86/kvm/vmx/common.h
+++ b/arch/x86/kvm/vmx/common.h
@@ -53,8 +53,6 @@ struct vcpu_vt {
#ifdef CONFIG_X86_64
u64 msr_host_kernel_gs_base;
#endif
-
- unsigned long host_debugctlmsr;
};
#ifdef CONFIG_KVM_INTEL_TDX
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index d1e02e567b57..dbab1c15b0cd 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -29,40 +29,8 @@ static __init int vt_hardware_setup(void)
if (ret)
return ret;
- /*
- * Update vt_x86_ops::vm_size here so it is ready before
- * kvm_ops_update() is called in kvm_x86_vendor_init().
- *
- * Note, the actual bringing up of TDX must be done after
- * kvm_ops_update() because enabling TDX requires enabling
- * hardware virtualization first, i.e., all online CPUs must
- * be in post-VMXON state. This means the @vm_size here
- * may be updated to TDX's size but TDX may fail to enable
- * at later time.
- *
- * The VMX/VT code could update kvm_x86_ops::vm_size again
- * after bringing up TDX, but this would require exporting
- * either kvm_x86_ops or kvm_ops_update() from the base KVM
- * module, which looks overkill. Anyway, the worst case here
- * is KVM may allocate couple of more bytes than needed for
- * each VM.
- */
- if (enable_tdx) {
- vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size,
- sizeof(struct kvm_tdx));
- /*
- * Note, TDX may fail to initialize in a later time in
- * vt_init(), in which case it is not necessary to setup
- * those callbacks. But making them valid here even
- * when TDX fails to init later is fine because those
- * callbacks won't be called if the VM isn't TDX guest.
- */
- vt_x86_ops.link_external_spt = tdx_sept_link_private_spt;
- vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
- vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
- vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
- vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt;
- }
+ if (enable_tdx)
+ tdx_hardware_setup();
return 0;
}
@@ -175,12 +143,12 @@ static int vt_vcpu_pre_run(struct kvm_vcpu *vcpu)
return vmx_vcpu_pre_run(vcpu);
}
-static fastpath_t vt_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+static fastpath_t vt_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
{
if (is_td_vcpu(vcpu))
- return tdx_vcpu_run(vcpu, force_immediate_exit);
+ return tdx_vcpu_run(vcpu, run_flags);
- return vmx_vcpu_run(vcpu, force_immediate_exit);
+ return vmx_vcpu_run(vcpu, run_flags);
}
static int vt_handle_exit(struct kvm_vcpu *vcpu,
@@ -220,7 +188,7 @@ static int vt_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return vmx_get_msr(vcpu, msr_info);
}
-static void vt_msr_filter_changed(struct kvm_vcpu *vcpu)
+static void vt_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
{
/*
* TDX doesn't allow VMM to configure interception of MSR accesses.
@@ -231,7 +199,7 @@ static void vt_msr_filter_changed(struct kvm_vcpu *vcpu)
if (is_td_vcpu(vcpu))
return;
- vmx_msr_filter_changed(vcpu);
+ vmx_recalc_msr_intercepts(vcpu);
}
static int vt_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
@@ -489,14 +457,6 @@ static void vt_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
vmx_set_gdt(vcpu, dt);
}
-static void vt_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
-{
- if (is_td_vcpu(vcpu))
- return;
-
- vmx_set_dr6(vcpu, val);
-}
-
static void vt_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
{
if (is_td_vcpu(vcpu))
@@ -923,6 +883,8 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.vcpu_load = vt_op(vcpu_load),
.vcpu_put = vt_op(vcpu_put),
+ .HOST_OWNED_DEBUGCTL = VMX_HOST_OWNED_DEBUGCTL_BITS,
+
.update_exception_bitmap = vt_op(update_exception_bitmap),
.get_feature_msr = vmx_get_feature_msr,
.get_msr = vt_op(get_msr),
@@ -943,7 +905,6 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.set_idt = vt_op(set_idt),
.get_gdt = vt_op(get_gdt),
.set_gdt = vt_op(set_gdt),
- .set_dr6 = vt_op(set_dr6),
.set_dr7 = vt_op(set_dr7),
.sync_dirty_debug_regs = vt_op(sync_dirty_debug_regs),
.cache_reg = vt_op(cache_reg),
@@ -1014,7 +975,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.nested_ops = &vmx_nested_ops,
.pi_update_irte = vmx_pi_update_irte,
- .pi_start_assignment = vmx_pi_start_assignment,
+ .pi_start_bypass = vmx_pi_start_bypass,
#ifdef CONFIG_X86_64
.set_hv_timer = vt_op(set_hv_timer),
@@ -1034,7 +995,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.apic_init_signal_blocked = vt_op(apic_init_signal_blocked),
.migrate_timers = vmx_migrate_timers,
- .msr_filter_changed = vt_op(msr_filter_changed),
+ .recalc_msr_intercepts = vt_op(recalc_msr_intercepts),
.complete_emulated_msr = vt_op(complete_emulated_msr),
.vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 7211c71d4241..b8ea1969113d 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -715,6 +715,12 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
MSR_IA32_FLUSH_CMD, MSR_TYPE_W);
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_APERF, MSR_TYPE_R);
+
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_MPERF, MSR_TYPE_R);
+
kvm_vcpu_unmap(vcpu, &map);
vmx->nested.force_msr_bitmap_recalc = false;
@@ -2663,10 +2669,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
if (vmx->nested.nested_run_pending &&
(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
- vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
+ vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl &
+ vmx_get_supported_debugctl(vcpu, false));
} else {
kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
- vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl);
+ vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl);
}
if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
@@ -3156,7 +3163,8 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
return -EINVAL;
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
- CC(!kvm_dr7_valid(vmcs12->guest_dr7)))
+ (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) ||
+ CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false))))
return -EINVAL;
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
@@ -3530,7 +3538,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
if (!vmx->nested.nested_run_pending ||
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
- vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read();
if (kvm_mpx_supported() &&
(!vmx->nested.nested_run_pending ||
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
@@ -4608,6 +4616,12 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
+ /*
+ * Note! Save DR7, but intentionally don't grab DEBUGCTL from vmcs02.
+ * Writes to DEBUGCTL that aren't intercepted by L1 are immediately
+ * propagated to vmcs12 (see vmx_set_msr()), as the value loaded into
+ * vmcs02 doesn't strictly track vmcs12.
+ */
if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
vmcs12->guest_dr7 = vcpu->arch.dr7;
@@ -4798,7 +4812,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
__vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR);
kvm_set_dr(vcpu, 7, 0x400);
- vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+ vmx_guest_debugctl_write(vcpu, 0);
if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
vmcs12->vm_exit_msr_load_count))
@@ -4853,6 +4867,9 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
}
+ /* Reload DEBUGCTL to ensure vmcs01 has a fresh FREEZE_IN_SMM value. */
+ vmx_reload_guest_debugctl(vcpu);
+
/*
* Note that calling vmx_set_{efer,cr0,cr4} is important as they
* handle a variety of side effects to KVM's software model.
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index bbf4509f32d0..0b173602821b 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -653,11 +653,11 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
*/
static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu)
{
- u64 data = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ u64 data = vmx_guest_debugctl_read();
if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) {
data &= ~DEBUGCTLMSR_LBR;
- vmcs_write64(GUEST_IA32_DEBUGCTL, data);
+ vmx_guest_debugctl_write(vcpu, data);
}
}
@@ -730,7 +730,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
if (!lbr_desc->event) {
vmx_disable_lbr_msrs_passthrough(vcpu);
- if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR)
+ if (vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR)
goto warn;
if (test_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use))
goto warn;
@@ -752,7 +752,7 @@ warn:
static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
{
- if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR))
+ if (!(vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR))
intel_pmu_release_guest_lbr_event(vcpu);
}
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 5c615e5845bf..4a6d9a17da23 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -2,6 +2,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
+#include <linux/kvm_irqfd.h>
#include <asm/irq_remapping.h>
#include <asm/cpu.h>
@@ -72,13 +73,10 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
/*
* If the vCPU wasn't on the wakeup list and wasn't migrated, then the
* full update can be skipped as neither the vector nor the destination
- * needs to be changed.
+ * needs to be changed. Clear SN even if there is no assigned device,
+ * again for simplicity.
*/
if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) {
- /*
- * Clear SN if it was set due to being preempted. Again, do
- * this even if there is no assigned device for simplicity.
- */
if (pi_test_and_clear_sn(pi_desc))
goto after_clear_sn;
return;
@@ -148,8 +146,13 @@ after_clear_sn:
static bool vmx_can_use_vtd_pi(struct kvm *kvm)
{
+ /*
+ * Note, reading the number of possible bypass IRQs can race with a
+ * bypass IRQ being attached to the VM. vmx_pi_start_bypass() ensures
+ * blockng vCPUs will see an elevated count or get KVM_REQ_UNBLOCK.
+ */
return irqchip_in_kernel(kvm) && kvm_arch_has_irq_bypass() &&
- kvm_arch_has_assigned_device(kvm);
+ READ_ONCE(kvm->arch.nr_possible_bypass_irqs);
}
/*
@@ -224,17 +227,23 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
if (!vmx_needs_pi_wakeup(vcpu))
return;
- if (kvm_vcpu_is_blocking(vcpu) &&
+ /*
+ * If the vCPU is blocking with IRQs enabled and ISN'T being preempted,
+ * enable the wakeup handler so that notification IRQ wakes the vCPU as
+ * expected. There is no need to enable the wakeup handler if the vCPU
+ * is preempted between setting its wait state and manually scheduling
+ * out, as the task is still runnable, i.e. doesn't need a wake event
+ * from KVM to be scheduled in.
+ *
+ * If the wakeup handler isn't being enabled, Suppress Notifications as
+ * the cost of propagating PIR.IRR to PID.ON is negligible compared to
+ * the cost of a spurious IRQ, and vCPU put/load is a slow path.
+ */
+ if (!vcpu->preempted && kvm_vcpu_is_blocking(vcpu) &&
((is_td_vcpu(vcpu) && tdx_interrupt_allowed(vcpu)) ||
(!is_td_vcpu(vcpu) && !vmx_interrupt_blocked(vcpu))))
pi_enable_wakeup_handler(vcpu);
-
- /*
- * Set SN when the vCPU is preempted. Note, the vCPU can both be seen
- * as blocking and preempted, e.g. if it's preempted between setting
- * its wait state and manually scheduling out.
- */
- if (vcpu->preempted)
+ else
pi_set_sn(pi_desc);
}
@@ -281,99 +290,30 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
/*
- * Bail out of the block loop if the VM has an assigned
- * device, but the blocking vCPU didn't reconfigure the
- * PI.NV to the wakeup vector, i.e. the assigned device
- * came along after the initial check in vmx_vcpu_pi_put().
+ * Kick all vCPUs when the first possible bypass IRQ is attached to a VM, as
+ * blocking vCPUs may scheduled out without reconfiguring PID.NV to the wakeup
+ * vector, i.e. if the bypass IRQ came along after vmx_vcpu_pi_put().
*/
-void vmx_pi_start_assignment(struct kvm *kvm)
+void vmx_pi_start_bypass(struct kvm *kvm)
{
- if (!kvm_arch_has_irq_bypass())
+ if (WARN_ON_ONCE(!vmx_can_use_vtd_pi(kvm)))
return;
kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK);
}
-/*
- * vmx_pi_update_irte - set IRTE for Posted-Interrupts
- *
- * @kvm: kvm
- * @host_irq: host irq of the interrupt
- * @guest_irq: gsi of the interrupt
- * @set: set or unset PI
- * returns 0 on success, < 0 on failure
- */
-int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set)
+int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+ unsigned int host_irq, uint32_t guest_irq,
+ struct kvm_vcpu *vcpu, u32 vector)
{
- struct kvm_kernel_irq_routing_entry *e;
- struct kvm_irq_routing_table *irq_rt;
- bool enable_remapped_mode = true;
- struct kvm_lapic_irq irq;
- struct kvm_vcpu *vcpu;
- struct vcpu_data vcpu_info;
- int idx, ret = 0;
-
- if (!vmx_can_use_vtd_pi(kvm))
- return 0;
-
- idx = srcu_read_lock(&kvm->irq_srcu);
- irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
- if (guest_irq >= irq_rt->nr_rt_entries ||
- hlist_empty(&irq_rt->map[guest_irq])) {
- pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
- guest_irq, irq_rt->nr_rt_entries);
- goto out;
+ if (vcpu) {
+ struct intel_iommu_pi_data pi_data = {
+ .pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)),
+ .vector = vector,
+ };
+
+ return irq_set_vcpu_affinity(host_irq, &pi_data);
+ } else {
+ return irq_set_vcpu_affinity(host_irq, NULL);
}
-
- hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
- if (e->type != KVM_IRQ_ROUTING_MSI)
- continue;
- /*
- * VT-d PI cannot support posting multicast/broadcast
- * interrupts to a vCPU, we still use interrupt remapping
- * for these kind of interrupts.
- *
- * For lowest-priority interrupts, we only support
- * those with single CPU as the destination, e.g. user
- * configures the interrupts via /proc/irq or uses
- * irqbalance to make the interrupts single-CPU.
- *
- * We will support full lowest-priority interrupt later.
- *
- * In addition, we can only inject generic interrupts using
- * the PI mechanism, refuse to route others through it.
- */
-
- kvm_set_msi_irq(kvm, e, &irq);
- if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
- !kvm_irq_is_postable(&irq))
- continue;
-
- vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
- vcpu_info.vector = irq.vector;
-
- trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
- vcpu_info.vector, vcpu_info.pi_desc_addr, set);
-
- if (!set)
- continue;
-
- enable_remapped_mode = false;
-
- ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
- if (ret < 0) {
- printk(KERN_INFO "%s: failed to update PI IRTE\n",
- __func__);
- goto out;
- }
- }
-
- if (enable_remapped_mode)
- ret = irq_set_vcpu_affinity(host_irq, NULL);
-
- ret = 0;
-out:
- srcu_read_unlock(&kvm->irq_srcu, idx);
- return ret;
}
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
index 80499ea0e674..a4af39948cf0 100644
--- a/arch/x86/kvm/vmx/posted_intr.h
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -3,6 +3,9 @@
#define __KVM_X86_VMX_POSTED_INTR_H
#include <linux/bitmap.h>
+#include <linux/find.h>
+#include <linux/kvm_host.h>
+
#include <asm/posted_intr.h>
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
@@ -11,9 +14,10 @@ void pi_wakeup_handler(void);
void __init pi_init_cpu(int cpu);
void pi_apicv_pre_state_restore(struct kvm_vcpu *vcpu);
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
-int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set);
-void vmx_pi_start_assignment(struct kvm *kvm);
+int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+ unsigned int host_irq, uint32_t guest_irq,
+ struct kvm_vcpu *vcpu, u32 vector);
+void vmx_pi_start_bypass(struct kvm *kvm);
static inline int pi_find_highest_vector(struct pi_desc *pi_desc)
{
diff --git a/arch/x86/kvm/vmx/run_flags.h b/arch/x86/kvm/vmx/run_flags.h
index 6a9bfdfbb6e5..2f20fb170def 100644
--- a/arch/x86/kvm/vmx/run_flags.h
+++ b/arch/x86/kvm/vmx/run_flags.h
@@ -2,10 +2,12 @@
#ifndef __KVM_X86_VMX_RUN_FLAGS_H
#define __KVM_X86_VMX_RUN_FLAGS_H
-#define VMX_RUN_VMRESUME_SHIFT 0
-#define VMX_RUN_SAVE_SPEC_CTRL_SHIFT 1
+#define VMX_RUN_VMRESUME_SHIFT 0
+#define VMX_RUN_SAVE_SPEC_CTRL_SHIFT 1
+#define VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO_SHIFT 2
-#define VMX_RUN_VMRESUME BIT(VMX_RUN_VMRESUME_SHIFT)
-#define VMX_RUN_SAVE_SPEC_CTRL BIT(VMX_RUN_SAVE_SPEC_CTRL_SHIFT)
+#define VMX_RUN_VMRESUME BIT(VMX_RUN_VMRESUME_SHIFT)
+#define VMX_RUN_SAVE_SPEC_CTRL BIT(VMX_RUN_SAVE_SPEC_CTRL_SHIFT)
+#define VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO BIT(VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO_SHIFT)
#endif /* __KVM_X86_VMX_RUN_FLAGS_H */
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index ec79aacc446f..66744f5768c8 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -743,7 +743,7 @@ bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu)
!to_tdx(vcpu)->vp_enter_args.r12;
}
-bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
+static bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
u64 vcpu_state_details;
@@ -783,8 +783,6 @@ void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
else
vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
- vt->host_debugctlmsr = get_debugctlmsr();
-
vt->guest_state_loaded = true;
}
@@ -1025,20 +1023,20 @@ static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu)
DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI | \
DEBUGCTLMSR_FREEZE_IN_SMM)
-fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
{
struct vcpu_tdx *tdx = to_tdx(vcpu);
struct vcpu_vt *vt = to_vt(vcpu);
/*
- * force_immediate_exit requires vCPU entering for events injection with
- * an immediately exit followed. But The TDX module doesn't guarantee
- * entry, it's already possible for KVM to _think_ it completely entry
- * to the guest without actually having done so.
- * Since KVM never needs to force an immediate exit for TDX, and can't
- * do direct injection, just warn on force_immediate_exit.
+ * WARN if KVM wants to force an immediate exit, as the TDX module does
+ * not guarantee entry into the guest, i.e. it's possible for KVM to
+ * _think_ it completed entry to the guest and forced an immediate exit
+ * without actually having done so. Luckily, KVM never needs to force
+ * an immediate exit for TDX (KVM can't do direct event injection, so
+ * just WARN and continue on.
*/
- WARN_ON_ONCE(force_immediate_exit);
+ WARN_ON_ONCE(run_flags);
/*
* Wait until retry of SEPT-zap-related SEAMCALL completes before
@@ -1048,7 +1046,7 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap)))
return EXIT_FASTPATH_EXIT_HANDLED;
- trace_kvm_entry(vcpu, force_immediate_exit);
+ trace_kvm_entry(vcpu, run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT);
if (pi_test_on(&vt->pi_desc)) {
apic->send_IPI_self(POSTED_INTR_VECTOR);
@@ -1060,8 +1058,8 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
tdx_vcpu_enter_exit(vcpu);
- if (vt->host_debugctlmsr & ~TDX_DEBUGCTL_PRESERVED)
- update_debugctlmsr(vt->host_debugctlmsr);
+ if (vcpu->arch.host_debugctl & ~TDX_DEBUGCTL_PRESERVED)
+ update_debugctlmsr(vcpu->arch.host_debugctl);
tdx_load_host_xsave_state(vcpu);
tdx->guest_entered = true;
@@ -1640,8 +1638,8 @@ static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn,
return 0;
}
-int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, kvm_pfn_t pfn)
+static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, kvm_pfn_t pfn)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
struct page *page = pfn_to_page(pfn);
@@ -1721,8 +1719,8 @@ static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
return 0;
}
-int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, void *private_spt)
+static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, void *private_spt)
{
int tdx_level = pg_level_to_tdx_sept_level(level);
gpa_t gpa = gfn_to_gpa(gfn);
@@ -1857,8 +1855,8 @@ static void tdx_track(struct kvm *kvm)
kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
}
-int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, void *private_spt)
+static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, void *private_spt)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
@@ -1880,8 +1878,8 @@ int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
return tdx_reclaim_page(virt_to_page(private_spt));
}
-int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, kvm_pfn_t pfn)
+static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, kvm_pfn_t pfn)
{
struct page *page = pfn_to_page(pfn);
int ret;
@@ -3605,10 +3603,14 @@ int __init tdx_bringup(void)
r = __tdx_bringup();
if (r) {
/*
- * Disable TDX only but don't fail to load module if
- * the TDX module could not be loaded. No need to print
- * message saying "module is not loaded" because it was
- * printed when the first SEAMCALL failed.
+ * Disable TDX only but don't fail to load module if the TDX
+ * module could not be loaded. No need to print message saying
+ * "module is not loaded" because it was printed when the first
+ * SEAMCALL failed. Don't bother unwinding the S-EPT hooks or
+ * vm_size, as kvm_x86_ops have already been finalized (and are
+ * intentionally not exported). The S-EPT code is unreachable,
+ * and allocating a few more bytes per VM in a should-be-rare
+ * failure scenario is a non-issue.
*/
if (r == -ENODEV)
goto success_disable_tdx;
@@ -3622,3 +3624,20 @@ success_disable_tdx:
enable_tdx = 0;
return 0;
}
+
+void __init tdx_hardware_setup(void)
+{
+ KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_tdx);
+
+ /*
+ * Note, if the TDX module can't be loaded, KVM TDX support will be
+ * disabled but KVM will continue loading (see tdx_bringup()).
+ */
+ vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, sizeof(struct kvm_tdx));
+
+ vt_x86_ops.link_external_spt = tdx_sept_link_private_spt;
+ vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
+ vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
+ vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
+ vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt;
+}
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index 51f98443e8a2..ca39a9391db1 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -8,6 +8,7 @@
#ifdef CONFIG_KVM_INTEL_TDX
#include "common.h"
+void tdx_hardware_setup(void);
int tdx_bringup(void);
void tdx_cleanup(void);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 191a9ed0da22..aa157fe5b7b3 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -75,6 +75,8 @@
#include "vmx_onhyperv.h"
#include "posted_intr.h"
+#include "mmu/spte.h"
+
MODULE_AUTHOR("Qumranet");
MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions");
MODULE_LICENSE("GPL");
@@ -113,8 +115,6 @@ static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, 0444);
module_param(enable_apicv, bool, 0444);
-
-bool __read_mostly enable_ipiv = true;
module_param(enable_ipiv, bool, 0444);
module_param(enable_device_posted_irqs, bool, 0444);
@@ -168,31 +168,6 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
RTIT_STATUS_BYTECNT))
/*
- * List of MSRs that can be directly passed to the guest.
- * In addition to these x2apic, PT and LBR MSRs are handled specially.
- */
-static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
- MSR_IA32_SPEC_CTRL,
- MSR_IA32_PRED_CMD,
- MSR_IA32_FLUSH_CMD,
- MSR_IA32_TSC,
-#ifdef CONFIG_X86_64
- MSR_FS_BASE,
- MSR_GS_BASE,
- MSR_KERNEL_GS_BASE,
- MSR_IA32_XFD,
- MSR_IA32_XFD_ERR,
-#endif
- MSR_IA32_SYSENTER_CS,
- MSR_IA32_SYSENTER_ESP,
- MSR_IA32_SYSENTER_EIP,
- MSR_CORE_C1_RES,
- MSR_CORE_C3_RESIDENCY,
- MSR_CORE_C6_RESIDENCY,
- MSR_CORE_C7_RESIDENCY,
-};
-
-/*
* These 2 parameters are used to config the controls for Pause-Loop Exiting:
* ple_gap: upper bound on the amount of time between two successive
* executions of PAUSE in a loop. Also indicate if ple enabled.
@@ -674,40 +649,6 @@ static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
return flexpriority_enabled && lapic_in_kernel(vcpu);
}
-static int vmx_get_passthrough_msr_slot(u32 msr)
-{
- int i;
-
- switch (msr) {
- case 0x800 ... 0x8ff:
- /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */
- return -ENOENT;
- case MSR_IA32_RTIT_STATUS:
- case MSR_IA32_RTIT_OUTPUT_BASE:
- case MSR_IA32_RTIT_OUTPUT_MASK:
- case MSR_IA32_RTIT_CR3_MATCH:
- case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
- /* PT MSRs. These are handled in pt_update_intercept_for_msr() */
- case MSR_LBR_SELECT:
- case MSR_LBR_TOS:
- case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31:
- case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31:
- case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31:
- case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8:
- case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8:
- /* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */
- return -ENOENT;
- }
-
- for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
- if (vmx_possible_passthrough_msrs[i] == msr)
- return i;
- }
-
- WARN(1, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr);
- return -ENOENT;
-}
-
struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
{
int i;
@@ -963,6 +904,10 @@ unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))
flags |= VMX_RUN_SAVE_SPEC_CTRL;
+ if (static_branch_unlikely(&cpu_buf_vm_clear) &&
+ kvm_vcpu_can_access_host_mmio(&vmx->vcpu))
+ flags |= VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO;
+
return flags;
}
@@ -2149,7 +2094,7 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
break;
case MSR_IA32_DEBUGCTLMSR:
- msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ msr_info->data = vmx_guest_debugctl_read();
break;
default:
find_uret_msr:
@@ -2174,7 +2119,7 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
return (unsigned long)data;
}
-static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
+u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
{
u64 debugctl = 0;
@@ -2186,9 +2131,25 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
(host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
+ if (boot_cpu_has(X86_FEATURE_RTM) &&
+ (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_RTM)))
+ debugctl |= DEBUGCTLMSR_RTM_DEBUG;
+
return debugctl;
}
+bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated)
+{
+ u64 invalid;
+
+ invalid = data & ~vmx_get_supported_debugctl(vcpu, host_initiated);
+ if (invalid & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) {
+ kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
+ invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR);
+ }
+ return !invalid;
+}
+
/*
* Writes msr value into the appropriate "register".
* Returns 0 on success, non-0 otherwise.
@@ -2257,29 +2218,22 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
vmcs_writel(GUEST_SYSENTER_ESP, data);
break;
- case MSR_IA32_DEBUGCTLMSR: {
- u64 invalid;
-
- invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
- if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
- kvm_pr_unimpl_wrmsr(vcpu, msr_index, data);
- data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
- invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
- }
-
- if (invalid)
+ case MSR_IA32_DEBUGCTLMSR:
+ if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated))
return 1;
+ data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
+
if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
VM_EXIT_SAVE_DEBUG_CONTROLS)
get_vmcs12(vcpu)->guest_ia32_debugctl = data;
- vmcs_write64(GUEST_IA32_DEBUGCTL, data);
+ vmx_guest_debugctl_write(vcpu, data);
+
if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
(data & DEBUGCTLMSR_LBR))
intel_pmu_create_guest_lbr_event(vcpu);
return 0;
- }
case MSR_IA32_BNDCFGS:
if (!kvm_mpx_supported() ||
(!msr_info->host_initiated &&
@@ -4013,76 +3967,29 @@ static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
vmx->nested.force_msr_bitmap_recalc = true;
}
-void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
+void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
- int idx;
if (!cpu_has_vmx_msr_bitmap())
return;
vmx_msr_bitmap_l01_changed(vmx);
- /*
- * Mark the desired intercept state in shadow bitmap, this is needed
- * for resync when the MSR filters change.
- */
- idx = vmx_get_passthrough_msr_slot(msr);
- if (idx >= 0) {
- if (type & MSR_TYPE_R)
- clear_bit(idx, vmx->shadow_msr_intercept.read);
- if (type & MSR_TYPE_W)
- clear_bit(idx, vmx->shadow_msr_intercept.write);
- }
-
- if ((type & MSR_TYPE_R) &&
- !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) {
- vmx_set_msr_bitmap_read(msr_bitmap, msr);
- type &= ~MSR_TYPE_R;
- }
-
- if ((type & MSR_TYPE_W) &&
- !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) {
- vmx_set_msr_bitmap_write(msr_bitmap, msr);
- type &= ~MSR_TYPE_W;
+ if (type & MSR_TYPE_R) {
+ if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
+ vmx_clear_msr_bitmap_read(msr_bitmap, msr);
+ else
+ vmx_set_msr_bitmap_read(msr_bitmap, msr);
}
- if (type & MSR_TYPE_R)
- vmx_clear_msr_bitmap_read(msr_bitmap, msr);
-
- if (type & MSR_TYPE_W)
- vmx_clear_msr_bitmap_write(msr_bitmap, msr);
-}
-
-void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
- int idx;
-
- if (!cpu_has_vmx_msr_bitmap())
- return;
-
- vmx_msr_bitmap_l01_changed(vmx);
-
- /*
- * Mark the desired intercept state in shadow bitmap, this is needed
- * for resync when the MSR filter changes.
- */
- idx = vmx_get_passthrough_msr_slot(msr);
- if (idx >= 0) {
- if (type & MSR_TYPE_R)
- set_bit(idx, vmx->shadow_msr_intercept.read);
- if (type & MSR_TYPE_W)
- set_bit(idx, vmx->shadow_msr_intercept.write);
+ if (type & MSR_TYPE_W) {
+ if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
+ vmx_clear_msr_bitmap_write(msr_bitmap, msr);
+ else
+ vmx_set_msr_bitmap_write(msr_bitmap, msr);
}
-
- if (type & MSR_TYPE_R)
- vmx_set_msr_bitmap_read(msr_bitmap, msr);
-
- if (type & MSR_TYPE_W)
- vmx_set_msr_bitmap_write(msr_bitmap, msr);
}
static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
@@ -4161,35 +4068,57 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
}
}
-void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
+void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 i;
-
if (!cpu_has_vmx_msr_bitmap())
return;
- /*
- * Redo intercept permissions for MSRs that KVM is passing through to
- * the guest. Disabling interception will check the new MSR filter and
- * ensure that KVM enables interception if usersepace wants to filter
- * the MSR. MSRs that KVM is already intercepting don't need to be
- * refreshed since KVM is going to intercept them regardless of what
- * userspace wants.
- */
- for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
- u32 msr = vmx_possible_passthrough_msrs[i];
-
- if (!test_bit(i, vmx->shadow_msr_intercept.read))
- vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R);
-
- if (!test_bit(i, vmx->shadow_msr_intercept.write))
- vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
+#ifdef CONFIG_X86_64
+ vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+#endif
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
+ if (kvm_cstate_in_guest(vcpu->kvm)) {
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
+ }
+ if (kvm_aperfmperf_in_guest(vcpu->kvm)) {
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R);
}
/* PT MSRs can be passed through iff PT is exposed to the guest. */
if (vmx_pt_mode_is_host_guest())
pt_update_intercept_for_msr(vcpu);
+
+ if (vcpu->arch.xfd_no_write_intercept)
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, MSR_TYPE_RW);
+
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
+ !to_vmx(vcpu)->spec_ctrl);
+
+ if (kvm_cpu_cap_has(X86_FEATURE_XFD))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD));
+
+ if (cpu_feature_enabled(X86_FEATURE_IBPB))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
+ !guest_has_pred_cmd_msr(vcpu));
+
+ if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
+
+ /*
+ * x2APIC and LBR MSR intercepts are modified on-demand and cannot be
+ * filtered by userspace.
+ */
}
static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
@@ -4790,7 +4719,8 @@ static void init_vmcs(struct vcpu_vmx *vmx)
vmcs_write32(GUEST_SYSENTER_CS, 0);
vmcs_writel(GUEST_SYSENTER_ESP, 0);
vmcs_writel(GUEST_SYSENTER_EIP, 0);
- vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+
+ vmx_guest_debugctl_write(&vmx->vcpu, 0);
if (cpu_has_vmx_tpr_shadow()) {
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
@@ -5606,12 +5536,6 @@ void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
set_debugreg(DR6_RESERVED, 6);
}
-void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
-{
- lockdep_assert_irqs_disabled();
- set_debugreg(vcpu->arch.dr6, 6);
-}
-
void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
{
vmcs_writel(GUEST_DR7, val);
@@ -7290,7 +7214,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
if (static_branch_unlikely(&vmx_l1d_should_flush))
vmx_l1d_flush(vcpu);
else if (static_branch_unlikely(&cpu_buf_vm_clear) &&
- kvm_arch_has_assigned_device(vcpu->kvm))
+ (flags & VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO))
x86_clear_cpu_buffers();
vmx_disable_fb_clear(vmx);
@@ -7323,8 +7247,9 @@ out:
guest_state_exit_irqoff();
}
-fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
{
+ bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long cr3, cr4;
@@ -7369,6 +7294,12 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
vcpu->arch.regs_dirty = 0;
+ if (run_flags & KVM_RUN_LOAD_GUEST_DR6)
+ set_debugreg(vcpu->arch.dr6, 6);
+
+ if (run_flags & KVM_RUN_LOAD_DEBUGCTL)
+ vmx_reload_guest_debugctl(vcpu);
+
/*
* Refresh vmcs.HOST_CR3 if necessary. This must be done immediately
* prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
@@ -7543,26 +7474,6 @@ int vmx_vcpu_create(struct kvm_vcpu *vcpu)
evmcs->hv_enlightenments_control.msr_bitmap = 1;
}
- /* The MSR bitmap starts with all ones */
- bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
- bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
-
- vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
-#ifdef CONFIG_X86_64
- vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
-#endif
- vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
- if (kvm_cstate_in_guest(vcpu->kvm)) {
- vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
- vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
- vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
- vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
- }
-
vmx->loaded_vmcs = &vmx->vmcs01;
if (cpu_need_virtualize_apic_accesses(vcpu)) {
@@ -7612,7 +7523,7 @@ free_vpid:
int vmx_vm_init(struct kvm *kvm)
{
if (!ple_gap)
- kvm->arch.pause_in_guest = true;
+ kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE);
if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
switch (l1tf_mitigation) {
@@ -7849,18 +7760,6 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
}
}
- if (kvm_cpu_cap_has(X86_FEATURE_XFD))
- vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
- !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD));
-
- if (boot_cpu_has(X86_FEATURE_IBPB))
- vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
- !guest_has_pred_cmd_msr(vcpu));
-
- if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
- vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
- !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
-
set_cr4_guest_host_mask(vmx);
vmx_write_encls_bitmap(vcpu, NULL);
@@ -7876,6 +7775,9 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vmx->msr_ia32_feature_control_valid_bits &=
~FEAT_CTL_SGX_LC_ENABLED;
+ /* Recalc MSR interception to account for feature changes. */
+ vmx_recalc_msr_intercepts(vcpu);
+
/* Refresh #PF interception to account for MAXPHYADDR changes. */
vmx_update_exception_bitmap(vcpu);
}
@@ -8650,6 +8552,8 @@ int __init vmx_init(void)
{
int r, cpu;
+ KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_vmx);
+
if (!kvm_is_vmx_supported())
return -EOPNOTSUPP;
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index b5758c33c60f..d3389baf3ab3 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -19,8 +19,6 @@
#include "../mmu.h"
#include "common.h"
-#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
-
#ifdef CONFIG_X86_64
#define MAX_NR_USER_RETURN_MSRS 7
#else
@@ -296,13 +294,6 @@ struct vcpu_vmx {
struct pt_desc pt_desc;
struct lbr_desc lbr_desc;
- /* Save desired MSR intercept (read: pass-through) state */
-#define MAX_POSSIBLE_PASSTHROUGH_MSRS 16
- struct {
- DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
- DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
- } shadow_msr_intercept;
-
/* ve_info must be page aligned. */
struct vmx_ve_information *ve_info;
};
@@ -395,24 +386,54 @@ bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs,
int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr);
void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu);
-void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type);
-void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type);
+void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set);
+
+static inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
+ u32 msr, int type)
+{
+ vmx_set_intercept_for_msr(vcpu, msr, type, false);
+}
+
+static inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
+ u32 msr, int type)
+{
+ vmx_set_intercept_for_msr(vcpu, msr, type, true);
+}
u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu);
u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu);
gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags);
-static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr,
- int type, bool value)
+void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
+
+u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated);
+bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated);
+
+#define VMX_HOST_OWNED_DEBUGCTL_BITS (DEBUGCTLMSR_FREEZE_IN_SMM)
+
+static inline void vmx_guest_debugctl_write(struct kvm_vcpu *vcpu, u64 val)
{
- if (value)
- vmx_enable_intercept_for_msr(vcpu, msr, type);
- else
- vmx_disable_intercept_for_msr(vcpu, msr, type);
+ WARN_ON_ONCE(val & VMX_HOST_OWNED_DEBUGCTL_BITS);
+
+ val |= vcpu->arch.host_debugctl & VMX_HOST_OWNED_DEBUGCTL_BITS;
+ vmcs_write64(GUEST_IA32_DEBUGCTL, val);
}
-void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
+static inline u64 vmx_guest_debugctl_read(void)
+{
+ return vmcs_read64(GUEST_IA32_DEBUGCTL) & ~VMX_HOST_OWNED_DEBUGCTL_BITS;
+}
+
+static inline void vmx_reload_guest_debugctl(struct kvm_vcpu *vcpu)
+{
+ u64 val = vmcs_read64(GUEST_IA32_DEBUGCTL);
+
+ if (!((val ^ vcpu->arch.host_debugctl) & VMX_HOST_OWNED_DEBUGCTL_BITS))
+ return;
+
+ vmx_guest_debugctl_write(vcpu, val & ~VMX_HOST_OWNED_DEBUGCTL_BITS);
+}
/*
* Note, early Intel manuals have the write-low and read-high bitmap offsets
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index b4596f651232..2b3424f638db 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -21,7 +21,7 @@ void vmx_vm_destroy(struct kvm *kvm);
int vmx_vcpu_precreate(struct kvm *kvm);
int vmx_vcpu_create(struct kvm_vcpu *vcpu);
int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu);
-fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit);
+fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags);
void vmx_vcpu_free(struct kvm_vcpu *vcpu);
void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
@@ -52,7 +52,7 @@ void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
int trig_mode, int vector);
void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu);
bool vmx_has_emulated_msr(struct kvm *kvm, u32 index);
-void vmx_msr_filter_changed(struct kvm_vcpu *vcpu);
+void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu);
void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
int vmx_get_feature_msr(u32 msr, u64 *data);
@@ -133,10 +133,9 @@ void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
void tdx_vcpu_free(struct kvm_vcpu *vcpu);
void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu);
-fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit);
+fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags);
void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
void tdx_vcpu_put(struct kvm_vcpu *vcpu);
-bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu);
int tdx_handle_exit(struct kvm_vcpu *vcpu,
enum exit_fastpath_completion fastpath);
@@ -151,15 +150,6 @@ int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
-int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, void *private_spt);
-int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, void *private_spt);
-int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, kvm_pfn_t pfn);
-int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, kvm_pfn_t pfn);
-
void tdx_flush_tlb_current(struct kvm_vcpu *vcpu);
void tdx_flush_tlb_all(struct kvm_vcpu *vcpu);
void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f2a57a1136d2..a1c49bc681c4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -226,6 +226,9 @@ EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
bool __read_mostly enable_apicv = true;
EXPORT_SYMBOL_GPL(enable_apicv);
+bool __read_mostly enable_ipiv = true;
+EXPORT_SYMBOL_GPL(enable_ipiv);
+
bool __read_mostly enable_device_posted_irqs = true;
EXPORT_SYMBOL_GPL(enable_device_posted_irqs);
@@ -4579,6 +4582,9 @@ static u64 kvm_get_allowed_disable_exits(void)
{
u64 r = KVM_X86_DISABLE_EXITS_PAUSE;
+ if (boot_cpu_has(X86_FEATURE_APERFMPERF))
+ r |= KVM_X86_DISABLE_EXITS_APERFMPERF;
+
if (!mitigate_smt_rsb) {
r |= KVM_X86_DISABLE_EXITS_HLT |
KVM_X86_DISABLE_EXITS_CSTATE;
@@ -4634,17 +4640,20 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_EXT_CPUID:
case KVM_CAP_EXT_EMUL_CPUID:
case KVM_CAP_CLOCKSOURCE:
+#ifdef CONFIG_KVM_IOAPIC
case KVM_CAP_PIT:
+ case KVM_CAP_PIT2:
+ case KVM_CAP_PIT_STATE2:
+ case KVM_CAP_REINJECT_CONTROL:
+#endif
case KVM_CAP_NOP_IO_DELAY:
case KVM_CAP_MP_STATE:
case KVM_CAP_SYNC_MMU:
case KVM_CAP_USER_NMI:
- case KVM_CAP_REINJECT_CONTROL:
case KVM_CAP_IRQ_INJECT_STATUS:
case KVM_CAP_IOEVENTFD:
case KVM_CAP_IOEVENTFD_NO_LENGTH:
- case KVM_CAP_PIT2:
- case KVM_CAP_PIT_STATE2:
+
case KVM_CAP_SET_IDENTITY_MAP_ADDR:
case KVM_CAP_VCPU_EVENTS:
#ifdef CONFIG_KVM_HYPERV
@@ -4985,11 +4994,6 @@ out:
return r;
}
-static void wbinvd_ipi(void *garbage)
-{
- wbinvd();
-}
-
static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
{
return kvm_arch_has_noncoherent_dma(vcpu->kvm);
@@ -5013,8 +5017,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (kvm_x86_call(has_wbinvd_exit)())
cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
- smp_call_function_single(vcpu->cpu,
- wbinvd_ipi, NULL, 1);
+ wbinvd_on_cpu(vcpu->cpu);
}
kvm_x86_call(vcpu_load)(vcpu, cpu);
@@ -5489,12 +5492,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
(events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
return -EINVAL;
- /* INITs are latched while in SMM */
- if (events->flags & KVM_VCPUEVENT_VALID_SMM &&
- (events->smi.smm || events->smi.pending) &&
- vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
- return -EINVAL;
-
process_nmi(vcpu);
/*
@@ -6401,135 +6398,6 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
return 0;
}
-static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
-{
- struct kvm_pic *pic = kvm->arch.vpic;
- int r;
-
- r = 0;
- switch (chip->chip_id) {
- case KVM_IRQCHIP_PIC_MASTER:
- memcpy(&chip->chip.pic, &pic->pics[0],
- sizeof(struct kvm_pic_state));
- break;
- case KVM_IRQCHIP_PIC_SLAVE:
- memcpy(&chip->chip.pic, &pic->pics[1],
- sizeof(struct kvm_pic_state));
- break;
- case KVM_IRQCHIP_IOAPIC:
- kvm_get_ioapic(kvm, &chip->chip.ioapic);
- break;
- default:
- r = -EINVAL;
- break;
- }
- return r;
-}
-
-static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
-{
- struct kvm_pic *pic = kvm->arch.vpic;
- int r;
-
- r = 0;
- switch (chip->chip_id) {
- case KVM_IRQCHIP_PIC_MASTER:
- spin_lock(&pic->lock);
- memcpy(&pic->pics[0], &chip->chip.pic,
- sizeof(struct kvm_pic_state));
- spin_unlock(&pic->lock);
- break;
- case KVM_IRQCHIP_PIC_SLAVE:
- spin_lock(&pic->lock);
- memcpy(&pic->pics[1], &chip->chip.pic,
- sizeof(struct kvm_pic_state));
- spin_unlock(&pic->lock);
- break;
- case KVM_IRQCHIP_IOAPIC:
- kvm_set_ioapic(kvm, &chip->chip.ioapic);
- break;
- default:
- r = -EINVAL;
- break;
- }
- kvm_pic_update_irq(pic);
- return r;
-}
-
-static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
-{
- struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
-
- BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
-
- mutex_lock(&kps->lock);
- memcpy(ps, &kps->channels, sizeof(*ps));
- mutex_unlock(&kps->lock);
- return 0;
-}
-
-static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
-{
- int i;
- struct kvm_pit *pit = kvm->arch.vpit;
-
- mutex_lock(&pit->pit_state.lock);
- memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
- for (i = 0; i < 3; i++)
- kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
- mutex_unlock(&pit->pit_state.lock);
- return 0;
-}
-
-static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
-{
- mutex_lock(&kvm->arch.vpit->pit_state.lock);
- memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
- sizeof(ps->channels));
- ps->flags = kvm->arch.vpit->pit_state.flags;
- mutex_unlock(&kvm->arch.vpit->pit_state.lock);
- memset(&ps->reserved, 0, sizeof(ps->reserved));
- return 0;
-}
-
-static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
-{
- int start = 0;
- int i;
- u32 prev_legacy, cur_legacy;
- struct kvm_pit *pit = kvm->arch.vpit;
-
- mutex_lock(&pit->pit_state.lock);
- prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
- cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
- if (!prev_legacy && cur_legacy)
- start = 1;
- memcpy(&pit->pit_state.channels, &ps->channels,
- sizeof(pit->pit_state.channels));
- pit->pit_state.flags = ps->flags;
- for (i = 0; i < 3; i++)
- kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
- start && i == 0);
- mutex_unlock(&pit->pit_state.lock);
- return 0;
-}
-
-static int kvm_vm_ioctl_reinject(struct kvm *kvm,
- struct kvm_reinject_control *control)
-{
- struct kvm_pit *pit = kvm->arch.vpit;
-
- /* pit->pit_state.lock was overloaded to prevent userspace from getting
- * an inconsistent state after running multiple KVM_REINJECT_CONTROL
- * ioctls in parallel. Use a separate lock if that ioctl isn't rare.
- */
- mutex_lock(&pit->pit_state.lock);
- kvm_pit_set_reinject(pit, control->pit_reinject);
- mutex_unlock(&pit->pit_state.lock);
-
- return 0;
-}
-
void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
@@ -6549,18 +6417,6 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
kvm_vcpu_kick(vcpu);
}
-int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
- bool line_status)
-{
- if (!irqchip_in_kernel(kvm))
- return -ENXIO;
-
- irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
- irq_event->irq, irq_event->level,
- line_status);
- return 0;
-}
-
int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
struct kvm_enable_cap *cap)
{
@@ -6625,17 +6481,11 @@ split_irqchip_unlock:
if (!mitigate_smt_rsb && boot_cpu_has_bug(X86_BUG_SMT_RSB) &&
cpu_smt_possible() &&
- (cap->args[0] & ~KVM_X86_DISABLE_EXITS_PAUSE))
+ (cap->args[0] & ~(KVM_X86_DISABLE_EXITS_PAUSE |
+ KVM_X86_DISABLE_EXITS_APERFMPERF)))
pr_warn_once(SMT_RSB_MSG);
- if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
- kvm->arch.pause_in_guest = true;
- if (cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT)
- kvm->arch.mwait_in_guest = true;
- if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT)
- kvm->arch.hlt_in_guest = true;
- if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE)
- kvm->arch.cstate_in_guest = true;
+ kvm_disable_exits(kvm, cap->args[0]);
r = 0;
disable_exits_unlock:
mutex_unlock(&kvm->lock);
@@ -7072,9 +6922,11 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
struct kvm *kvm = filp->private_data;
void __user *argp = (void __user *)arg;
int r = -ENOTTY;
+
+#ifdef CONFIG_KVM_IOAPIC
/*
* This union makes it completely explicit to gcc-3.x
- * that these two variables' stack usage should be
+ * that these three variables' stack usage should be
* combined, not added together.
*/
union {
@@ -7082,6 +6934,7 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
struct kvm_pit_state2 ps2;
struct kvm_pit_config pit_config;
} u;
+#endif
switch (ioctl) {
case KVM_SET_TSS_ADDR:
@@ -7105,6 +6958,7 @@ set_identity_unlock:
case KVM_SET_NR_MMU_PAGES:
r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
break;
+#ifdef CONFIG_KVM_IOAPIC
case KVM_CREATE_IRQCHIP: {
mutex_lock(&kvm->lock);
@@ -7126,7 +6980,7 @@ set_identity_unlock:
goto create_irqchip_unlock;
}
- r = kvm_setup_default_irq_routing(kvm);
+ r = kvm_setup_default_ioapic_and_pic_routing(kvm);
if (r) {
kvm_ioapic_destroy(kvm);
kvm_pic_destroy(kvm);
@@ -7174,7 +7028,7 @@ set_identity_unlock:
}
r = -ENXIO;
- if (!irqchip_kernel(kvm))
+ if (!irqchip_full(kvm))
goto get_irqchip_out;
r = kvm_vm_ioctl_get_irqchip(kvm, chip);
if (r)
@@ -7198,7 +7052,7 @@ set_identity_unlock:
}
r = -ENXIO;
- if (!irqchip_kernel(kvm))
+ if (!irqchip_full(kvm))
goto set_irqchip_out;
r = kvm_vm_ioctl_set_irqchip(kvm, chip);
set_irqchip_out:
@@ -7271,6 +7125,7 @@ set_pit2_out:
r = kvm_vm_ioctl_reinject(kvm, &control);
break;
}
+#endif
case KVM_SET_BOOT_CPU_ID:
r = 0;
mutex_lock(&kvm->lock);
@@ -7341,9 +7196,12 @@ set_pit2_out:
if (user_tsc_khz == 0)
user_tsc_khz = tsc_khz;
- WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz);
- r = 0;
-
+ mutex_lock(&kvm->lock);
+ if (!kvm->created_vcpus) {
+ WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz);
+ r = 0;
+ }
+ mutex_unlock(&kvm->lock);
goto out;
}
case KVM_GET_TSC_KHZ: {
@@ -10729,8 +10587,10 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
if (irqchip_split(vcpu->kvm))
kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
+#ifdef CONFIG_KVM_IOAPIC
else if (ioapic_in_kernel(vcpu->kvm))
kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
+#endif
if (is_guest_mode(vcpu))
vcpu->arch.load_eoi_exitmap_pending = true;
@@ -10784,6 +10644,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
dm_request_for_irq_injection(vcpu) &&
kvm_cpu_accept_dm_intr(vcpu);
fastpath_t exit_fastpath;
+ u64 run_flags, debug_ctl;
bool req_immediate_exit = false;
@@ -10931,8 +10792,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_vcpu_update_apicv(vcpu);
if (kvm_check_request(KVM_REQ_APF_READY, vcpu))
kvm_check_async_pf_completion(vcpu);
+
+ /*
+ * Recalc MSR intercepts as userspace may want to intercept
+ * accesses to MSRs that KVM would otherwise pass through to
+ * the guest.
+ */
if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu))
- kvm_x86_call(msr_filter_changed)(vcpu);
+ kvm_x86_call(recalc_msr_intercepts)(vcpu);
if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu))
kvm_x86_call(update_cpu_dirty_logging)(vcpu);
@@ -11028,8 +10895,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
goto cancel_injection;
}
- if (req_immediate_exit)
+ run_flags = 0;
+ if (req_immediate_exit) {
+ run_flags |= KVM_RUN_FORCE_IMMEDIATE_EXIT;
kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
fpregs_assert_state_consistent();
if (test_thread_flag(TIF_NEED_FPU_LOAD))
@@ -11047,12 +10917,22 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
set_debugreg(vcpu->arch.eff_db[3], 3);
/* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
- kvm_x86_call(set_dr6)(vcpu, vcpu->arch.dr6);
+ run_flags |= KVM_RUN_LOAD_GUEST_DR6;
} else if (unlikely(hw_breakpoint_active())) {
set_debugreg(DR7_FIXED_1, 7);
}
- vcpu->arch.host_debugctl = get_debugctlmsr();
+ /*
+ * Refresh the host DEBUGCTL snapshot after disabling IRQs, as DEBUGCTL
+ * can be modified in IRQ context, e.g. via SMP function calls. Inform
+ * vendor code if any host-owned bits were changed, e.g. so that the
+ * value loaded into hardware while running the guest can be updated.
+ */
+ debug_ctl = get_debugctlmsr();
+ if ((debug_ctl ^ vcpu->arch.host_debugctl) & kvm_x86_ops.HOST_OWNED_DEBUGCTL &&
+ !vcpu->arch.guest_state_protected)
+ run_flags |= KVM_RUN_LOAD_DEBUGCTL;
+ vcpu->arch.host_debugctl = debug_ctl;
guest_timing_enter_irqoff();
@@ -11066,8 +10946,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
(kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));
- exit_fastpath = kvm_x86_call(vcpu_run)(vcpu,
- req_immediate_exit);
+ exit_fastpath = kvm_x86_call(vcpu_run)(vcpu, run_flags);
if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
break;
@@ -11079,6 +10958,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
break;
}
+ run_flags = 0;
+
/* Note, VM-Exits that go down the "slow" path are accounted below. */
++vcpu->stat.exits;
}
@@ -11552,6 +11433,28 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
trace_kvm_fpu(0);
}
+static int kvm_x86_vcpu_pre_run(struct kvm_vcpu *vcpu)
+{
+ /*
+ * SIPI_RECEIVED is obsolete; KVM leaves the vCPU in Wait-For-SIPI and
+ * tracks the pending SIPI separately. SIPI_RECEIVED is still accepted
+ * by KVM_SET_VCPU_EVENTS for backwards compatibility, but should be
+ * converted to INIT_RECEIVED.
+ */
+ if (WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED))
+ return -EINVAL;
+
+ /*
+ * Disallow running the vCPU if userspace forced it into an impossible
+ * MP_STATE, e.g. if the vCPU is in WFS but SIPI is blocked.
+ */
+ if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED &&
+ !kvm_apic_init_sipi_allowed(vcpu))
+ return -EINVAL;
+
+ return kvm_x86_call(vcpu_pre_run)(vcpu);
+}
+
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
struct kvm_queued_exception *ex = &vcpu->arch.exception;
@@ -11654,7 +11557,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
goto out;
}
- r = kvm_x86_call(vcpu_pre_run)(vcpu);
+ r = kvm_x86_vcpu_pre_run(vcpu);
if (r <= 0)
goto out;
@@ -11898,21 +11801,16 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
}
/*
- * Pending INITs are reported using KVM_SET_VCPU_EVENTS, disallow
- * forcing the guest into INIT/SIPI if those events are supposed to be
- * blocked. KVM prioritizes SMI over INIT, so reject INIT/SIPI state
- * if an SMI is pending as well.
+ * SIPI_RECEIVED is obsolete and no longer used internally; KVM instead
+ * leaves the vCPU in INIT_RECIEVED (Wait-For-SIPI) and pends the SIPI.
+ * Translate SIPI_RECEIVED as appropriate for backwards compatibility.
*/
- if ((!kvm_apic_init_sipi_allowed(vcpu) || vcpu->arch.smi_pending) &&
- (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
- mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
- goto out;
-
if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
- kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED);
+ mp_state->mp_state = KVM_MP_STATE_INIT_RECEIVED;
set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
- } else
- kvm_set_mp_state(vcpu, mp_state->mp_state);
+ }
+
+ kvm_set_mp_state(vcpu, mp_state->mp_state);
kvm_make_request(KVM_REQ_EVENT, vcpu);
ret = 0;
@@ -12794,21 +12692,16 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
if (ret)
goto out;
- kvm_mmu_init_vm(kvm);
+ ret = kvm_mmu_init_vm(kvm);
+ if (ret)
+ goto out_cleanup_page_track;
ret = kvm_x86_call(vm_init)(kvm);
if (ret)
goto out_uninit_mmu;
- INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
atomic_set(&kvm->arch.noncoherent_dma_count, 0);
- /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
- set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
- /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
- set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
- &kvm->arch.irq_sources_bitmap);
-
raw_spin_lock_init(&kvm->arch.tsc_write_lock);
mutex_init(&kvm->arch.apic_map_lock);
seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock);
@@ -12847,6 +12740,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
out_uninit_mmu:
kvm_mmu_uninit_vm(kvm);
+out_cleanup_page_track:
kvm_page_track_cleanup(kvm);
out:
return ret;
@@ -12939,7 +12833,9 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm)
cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
+#ifdef CONFIG_KVM_IOAPIC
kvm_free_pit(kvm);
+#endif
kvm_mmu_pre_destroy_vm(kvm);
static_call_cond(kvm_x86_vm_pre_destroy)(kvm);
@@ -12963,8 +12859,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
}
kvm_destroy_vcpus(kvm);
kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
+#ifdef CONFIG_KVM_IOAPIC
kvm_pic_destroy(kvm);
kvm_ioapic_destroy(kvm);
+#endif
kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
kvm_mmu_uninit_vm(kvm);
@@ -13574,25 +13472,6 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
return kvm_lapic_enabled(vcpu) && apf_pageready_slot_free(vcpu);
}
-void kvm_arch_start_assignment(struct kvm *kvm)
-{
- if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1)
- kvm_x86_call(pi_start_assignment)(kvm);
-}
-EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
-
-void kvm_arch_end_assignment(struct kvm *kvm)
-{
- atomic_dec(&kvm->arch.assigned_device_count);
-}
-EXPORT_SYMBOL_GPL(kvm_arch_end_assignment);
-
-bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm)
-{
- return raw_atomic_read(&kvm->arch.assigned_device_count);
-}
-EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
-
static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm)
{
/*
@@ -13628,77 +13507,6 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
}
EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
-int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
- struct irq_bypass_producer *prod)
-{
- struct kvm_kernel_irqfd *irqfd =
- container_of(cons, struct kvm_kernel_irqfd, consumer);
- struct kvm *kvm = irqfd->kvm;
- int ret;
-
- kvm_arch_start_assignment(irqfd->kvm);
-
- spin_lock_irq(&kvm->irqfds.lock);
- irqfd->producer = prod;
-
- ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
- prod->irq, irqfd->gsi, 1);
- if (ret)
- kvm_arch_end_assignment(irqfd->kvm);
-
- spin_unlock_irq(&kvm->irqfds.lock);
-
-
- return ret;
-}
-
-void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
- struct irq_bypass_producer *prod)
-{
- int ret;
- struct kvm_kernel_irqfd *irqfd =
- container_of(cons, struct kvm_kernel_irqfd, consumer);
- struct kvm *kvm = irqfd->kvm;
-
- WARN_ON(irqfd->producer != prod);
-
- /*
- * When producer of consumer is unregistered, we change back to
- * remapped mode, so we can re-use the current implementation
- * when the irq is masked/disabled or the consumer side (KVM
- * int this case doesn't want to receive the interrupts.
- */
- spin_lock_irq(&kvm->irqfds.lock);
- irqfd->producer = NULL;
-
- ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
- prod->irq, irqfd->gsi, 0);
- if (ret)
- printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
- " fails: %d\n", irqfd->consumer.token, ret);
-
- spin_unlock_irq(&kvm->irqfds.lock);
-
-
- kvm_arch_end_assignment(irqfd->kvm);
-}
-
-int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set)
-{
- return kvm_x86_call(pi_update_irte)(kvm, host_irq, guest_irq, set);
-}
-
-bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
- struct kvm_kernel_irq_routing_entry *new)
-{
- if (old->type != KVM_IRQ_ROUTING_MSI ||
- new->type != KVM_IRQ_ROUTING_MSI)
- return true;
-
- return !!memcmp(&old->msi, &new->msi, sizeof(new->msi));
-}
-
bool kvm_vector_hashing_enabled(void)
{
return vector_hashing;
@@ -14098,7 +13906,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 832f0faf4779..bcfd9b719ada 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -55,6 +55,28 @@ struct kvm_host_values {
void kvm_spurious_fault(void);
+#define SIZE_OF_MEMSLOTS_HASHTABLE \
+ (sizeof(((struct kvm_memslots *)0)->id_hash) * 2 * KVM_MAX_NR_ADDRESS_SPACES)
+
+/* Sanity check the size of the memslot hash tables. */
+static_assert(SIZE_OF_MEMSLOTS_HASHTABLE ==
+ (1024 * (1 + IS_ENABLED(CONFIG_X86_64)) * (1 + IS_ENABLED(CONFIG_KVM_SMM))));
+
+/*
+ * Assert that "struct kvm_{svm,vmx,tdx}" is an order-0 or order-1 allocation.
+ * Spilling over to an order-2 allocation isn't fundamentally problematic, but
+ * isn't expected to happen in the foreseeable future (O(years)). Assert that
+ * the size is an order-0 allocation when ignoring the memslot hash tables, to
+ * help detect and debug unexpected size increases.
+ */
+#define KVM_SANITY_CHECK_VM_STRUCT_SIZE(x) \
+do { \
+ BUILD_BUG_ON(get_order(sizeof(struct x) - SIZE_OF_MEMSLOTS_HASHTABLE) && \
+ !IS_ENABLED(CONFIG_DEBUG_KERNEL) && !IS_ENABLED(CONFIG_KASAN)); \
+ BUILD_BUG_ON(get_order(sizeof(struct x)) > 1 && \
+ !IS_ENABLED(CONFIG_DEBUG_KERNEL) && !IS_ENABLED(CONFIG_KASAN)); \
+} while (0)
+
#define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \
({ \
bool failed = (consistency_check); \
@@ -499,24 +521,34 @@ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
__rem; \
})
+static inline void kvm_disable_exits(struct kvm *kvm, u64 mask)
+{
+ kvm->arch.disabled_exits |= mask;
+}
+
static inline bool kvm_mwait_in_guest(struct kvm *kvm)
{
- return kvm->arch.mwait_in_guest;
+ return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_MWAIT;
}
static inline bool kvm_hlt_in_guest(struct kvm *kvm)
{
- return kvm->arch.hlt_in_guest;
+ return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_HLT;
}
static inline bool kvm_pause_in_guest(struct kvm *kvm)
{
- return kvm->arch.pause_in_guest;
+ return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_PAUSE;
}
static inline bool kvm_cstate_in_guest(struct kvm *kvm)
{
- return kvm->arch.cstate_in_guest;
+ return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_CSTATE;
+}
+
+static inline bool kvm_aperfmperf_in_guest(struct kvm *kvm)
+{
+ return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_APERFMPERF;
}
static inline bool kvm_notify_vmexit_enabled(struct kvm *kvm)
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index 2e7923844afe..c09284302dd3 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -36,7 +36,6 @@
#include <linux/debugfs.h>
#include <linux/ioport.h>
#include <linux/kernel.h>
-#include <linux/pfn_t.h>
#include <linux/slab.h>
#include <linux/io.h>
#include <linux/mm.h>
diff --git a/arch/x86/mm/pgprot.c b/arch/x86/mm/pgprot.c
index c84bd9540b16..dc1afd5c839d 100644
--- a/arch/x86/mm/pgprot.c
+++ b/arch/x86/mm/pgprot.c
@@ -32,7 +32,7 @@ void add_encrypt_protection_map(void)
protection_map[i] = pgprot_encrypted(protection_map[i]);
}
-pgprot_t vm_get_page_prot(unsigned long vm_flags)
+pgprot_t vm_get_page_prot(vm_flags_t vm_flags)
{
unsigned long val = pgprot_val(protection_map[vm_flags &
(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]);
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 15672cb926fc..7e3fca164620 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -3501,13 +3501,6 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func
return emit_bpf_dispatcher(&prog, 0, num_funcs - 1, funcs, image, buf);
}
-static const char *bpf_get_prog_name(struct bpf_prog *prog)
-{
- if (prog->aux->ksym.prog)
- return prog->aux->ksym.name;
- return prog->aux->name;
-}
-
static void priv_stack_init_guard(void __percpu *priv_stack_ptr, int alloc_size)
{
int cpu, underflow_idx = (alloc_size - PRIV_STACK_GUARD_SZ) >> 3;
@@ -3531,7 +3524,7 @@ static void priv_stack_check_guard(void __percpu *priv_stack_ptr, int alloc_size
if (stack_ptr[0] != PRIV_STACK_GUARD_VAL ||
stack_ptr[underflow_idx] != PRIV_STACK_GUARD_VAL) {
pr_err("BPF private stack overflow/underflow detected for prog %sx\n",
- bpf_get_prog_name(prog));
+ bpf_jit_get_prog_name(prog));
break;
}
}
@@ -3845,7 +3838,6 @@ void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp
}
return;
#endif
- WARN(1, "verification of programs using bpf_throw should have failed\n");
}
void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,