summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kbuild3
-rw-r--r--arch/x86/Kconfig78
-rw-r--r--arch/x86/Kconfig.cpu17
-rw-r--r--arch/x86/Makefile20
-rw-r--r--arch/x86/boot/compressed/Makefile7
-rw-r--r--arch/x86/boot/compressed/efi_thunk_64.S16
-rw-r--r--arch/x86/boot/compressed/head_64.S8
-rw-r--r--arch/x86/boot/compressed/kaslr.c4
-rw-r--r--arch/x86/boot/compressed/mem_encrypt.S6
-rw-r--r--arch/x86/boot/compressed/misc.c3
-rw-r--r--arch/x86/boot/compressed/misc.h4
-rw-r--r--arch/x86/boot/compressed/pgtable_64.c2
-rw-r--r--arch/x86/boot/compressed/sev.c6
-rw-r--r--arch/x86/boot/genimage.sh15
-rw-r--r--arch/x86/boot/mtools.conf.in5
-rw-r--r--arch/x86/boot/string.h3
-rw-r--r--arch/x86/configs/i386_defconfig2
-rw-r--r--arch/x86/configs/x86_64_defconfig2
-rw-r--r--arch/x86/crypto/Makefile4
-rw-r--r--arch/x86/crypto/aegis128-aesni-asm.S48
-rw-r--r--arch/x86/crypto/aes_ctrby8_avx-x86_64.S2
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S56
-rw-r--r--arch/x86/crypto/aesni-intel_avx-x86_64.S40
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c6
-rw-r--r--arch/x86/crypto/blake2s-core.S4
-rw-r--r--arch/x86/crypto/blake2s-glue.c68
-rw-r--r--arch/x86/crypto/blake2s-shash.c77
-rw-r--r--arch/x86/crypto/blowfish-x86_64-asm_64.S12
-rw-r--r--arch/x86/crypto/camellia-aesni-avx-asm_64.S14
-rw-r--r--arch/x86/crypto/camellia-aesni-avx2-asm_64.S14
-rw-r--r--arch/x86/crypto/camellia-x86_64-asm_64.S12
-rw-r--r--arch/x86/crypto/cast5-avx-x86_64-asm_64.S12
-rw-r--r--arch/x86/crypto/cast6-avx-x86_64-asm_64.S10
-rw-r--r--arch/x86/crypto/chacha-avx2-x86_64.S6
-rw-r--r--arch/x86/crypto/chacha-avx512vl-x86_64.S6
-rw-r--r--arch/x86/crypto/chacha-ssse3-x86_64.S8
-rw-r--r--arch/x86/crypto/crc32-pclmul_asm.S2
-rw-r--r--arch/x86/crypto/crc32c-pcl-intel-asm_64.S2
-rw-r--r--arch/x86/crypto/crct10dif-pcl-asm_64.S2
-rw-r--r--arch/x86/crypto/curve25519-x86_64.c767
-rw-r--r--arch/x86/crypto/des3_ede-asm_64.S4
-rw-r--r--arch/x86/crypto/des3_ede_glue.c4
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_asm.S6
-rw-r--r--arch/x86/crypto/nh-avx2-x86_64.S2
-rw-r--r--arch/x86/crypto/nh-sse2-x86_64.S2
-rw-r--r--arch/x86/crypto/serpent-avx-x86_64-asm_64.S10
-rw-r--r--arch/x86/crypto/serpent-avx2-asm_64.S10
-rw-r--r--arch/x86/crypto/serpent-sse2-i586-asm_32.S6
-rw-r--r--arch/x86/crypto/serpent-sse2-x86_64-asm_64.S6
-rw-r--r--arch/x86/crypto/sha1_avx2_x86_64_asm.S2
-rw-r--r--arch/x86/crypto/sha1_ni_asm.S2
-rw-r--r--arch/x86/crypto/sha1_ssse3_asm.S2
-rw-r--r--arch/x86/crypto/sha256-avx-asm.S2
-rw-r--r--arch/x86/crypto/sha256-avx2-asm.S2
-rw-r--r--arch/x86/crypto/sha256-ssse3-asm.S2
-rw-r--r--arch/x86/crypto/sha256_ni_asm.S2
-rw-r--r--arch/x86/crypto/sha512-avx-asm.S2
-rw-r--r--arch/x86/crypto/sha512-avx2-asm.S2
-rw-r--r--arch/x86/crypto/sha512-ssse3-asm.S2
-rw-r--r--arch/x86/crypto/sm4-aesni-avx-asm_64.S18
-rw-r--r--arch/x86/crypto/sm4-aesni-avx2-asm_64.S14
-rw-r--r--arch/x86/crypto/twofish-avx-x86_64-asm_64.S10
-rw-r--r--arch/x86/crypto/twofish-i586-asm_32.S4
-rw-r--r--arch/x86/crypto/twofish-x86_64-asm_64-3way.S6
-rw-r--r--arch/x86/crypto/twofish-x86_64-asm_64.S4
-rw-r--r--arch/x86/entry/entry_32.S37
-rw-r--r--arch/x86/entry/entry_64.S56
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl2
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl2
-rw-r--r--arch/x86/entry/thunk_32.S2
-rw-r--r--arch/x86/entry/thunk_64.S2
-rw-r--r--arch/x86/entry/vdso/Makefile2
-rw-r--r--arch/x86/entry/vdso/vdso-layout.lds.S1
-rw-r--r--arch/x86/entry/vdso/vdso32/system_call.S2
-rw-r--r--arch/x86/entry/vdso/vsgx.S2
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_64.c3
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_emu_64.S6
-rw-r--r--arch/x86/events/amd/iommu.c2
-rw-r--r--arch/x86/events/core.c29
-rw-r--r--arch/x86/events/intel/bts.c6
-rw-r--r--arch/x86/events/intel/core.c95
-rw-r--r--arch/x86/events/intel/ds.c7
-rw-r--r--arch/x86/events/intel/lbr.c22
-rw-r--r--arch/x86/events/intel/uncore.c2
-rw-r--r--arch/x86/events/intel/uncore_discovery.h2
-rw-r--r--arch/x86/events/intel/uncore_snbep.c28
-rw-r--r--arch/x86/events/msr.c1
-rw-r--r--arch/x86/events/perf_event.h23
-rw-r--r--arch/x86/hyperv/Makefile2
-rw-r--r--arch/x86/hyperv/hv_apic.c20
-rw-r--r--arch/x86/hyperv/hv_init.c106
-rw-r--r--arch/x86/hyperv/irqdomain.c55
-rw-r--r--arch/x86/hyperv/ivm.c317
-rw-r--r--arch/x86/hyperv/mmu.c19
-rw-r--r--arch/x86/ia32/audit.c13
-rw-r--r--arch/x86/ia32/ia32_signal.c15
-rw-r--r--arch/x86/include/asm/GEN-for-each-reg.h14
-rw-r--r--arch/x86/include/asm/alternative.h1
-rw-r--r--arch/x86/include/asm/amd_nb.h1
-rw-r--r--arch/x86/include/asm/asm-prototypes.h18
-rw-r--r--arch/x86/include/asm/asm.h90
-rw-r--r--arch/x86/include/asm/barrier.h10
-rw-r--r--arch/x86/include/asm/cpu_entry_area.h8
-rw-r--r--arch/x86/include/asm/cpufeature.h13
-rw-r--r--arch/x86/include/asm/cpufeatures.h5
-rw-r--r--arch/x86/include/asm/efi.h16
-rw-r--r--arch/x86/include/asm/entry-common.h2
-rw-r--r--arch/x86/include/asm/extable.h44
-rw-r--r--arch/x86/include/asm/extable_fixup_types.h67
-rw-r--r--arch/x86/include/asm/fpu/api.h73
-rw-r--r--arch/x86/include/asm/fpu/internal.h540
-rw-r--r--arch/x86/include/asm/fpu/sched.h68
-rw-r--r--arch/x86/include/asm/fpu/signal.h10
-rw-r--r--arch/x86/include/asm/fpu/types.h246
-rw-r--r--arch/x86/include/asm/fpu/xcr.h23
-rw-r--r--arch/x86/include/asm/fpu/xstate.h91
-rw-r--r--arch/x86/include/asm/ftrace.h9
-rw-r--r--arch/x86/include/asm/futex.h28
-rw-r--r--arch/x86/include/asm/hyperv-tlfs.h50
-rw-r--r--arch/x86/include/asm/ia32.h2
-rw-r--r--arch/x86/include/asm/insn-eval.h16
-rw-r--r--arch/x86/include/asm/intel-family.h2
-rw-r--r--arch/x86/include/asm/io.h28
-rw-r--r--arch/x86/include/asm/irq_stack.h42
-rw-r--r--arch/x86/include/asm/irqflags.h7
-rw-r--r--arch/x86/include/asm/kexec.h2
-rw-r--r--arch/x86/include/asm/kprobes.h1
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h2
-rw-r--r--arch/x86/include/asm/kvm_host.h141
-rw-r--r--arch/x86/include/asm/kvm_page_track.h11
-rw-r--r--arch/x86/include/asm/kvm_para.h12
-rw-r--r--arch/x86/include/asm/linkage.h14
-rw-r--r--arch/x86/include/asm/mce.h40
-rw-r--r--arch/x86/include/asm/mem_encrypt.h16
-rw-r--r--arch/x86/include/asm/microcode.h3
-rw-r--r--arch/x86/include/asm/mmx.h15
-rw-r--r--arch/x86/include/asm/mshyperv.h77
-rw-r--r--arch/x86/include/asm/msr-index.h19
-rw-r--r--arch/x86/include/asm/msr.h30
-rw-r--r--arch/x86/include/asm/mtrr.h8
-rw-r--r--arch/x86/include/asm/nospec-branch.h72
-rw-r--r--arch/x86/include/asm/page_32.h14
-rw-r--r--arch/x86/include/asm/page_64.h1
-rw-r--r--arch/x86/include/asm/page_64_types.h2
-rw-r--r--arch/x86/include/asm/paravirt.h44
-rw-r--r--arch/x86/include/asm/paravirt_types.h3
-rw-r--r--arch/x86/include/asm/pgtable.h32
-rw-r--r--arch/x86/include/asm/pkru.h6
-rw-r--r--arch/x86/include/asm/processor.h28
-rw-r--r--arch/x86/include/asm/proto.h2
-rw-r--r--arch/x86/include/asm/ptrace.h2
-rw-r--r--arch/x86/include/asm/qspinlock.h1
-rw-r--r--arch/x86/include/asm/qspinlock_paravirt.h4
-rw-r--r--arch/x86/include/asm/realmode.h1
-rw-r--r--arch/x86/include/asm/required-features.h4
-rw-r--r--arch/x86/include/asm/segment.h11
-rw-r--r--arch/x86/include/asm/set_memory.h5
-rw-r--r--arch/x86/include/asm/sev-common.h66
-rw-r--r--arch/x86/include/asm/sev.h6
-rw-r--r--arch/x86/include/asm/sgx.h18
-rw-r--r--arch/x86/include/asm/smp.h8
-rw-r--r--arch/x86/include/asm/stacktrace.h10
-rw-r--r--arch/x86/include/asm/static_call.h3
-rw-r--r--arch/x86/include/asm/string_32.h33
-rw-r--r--arch/x86/include/asm/syscall.h33
-rw-r--r--arch/x86/include/asm/thread_info.h3
-rw-r--r--arch/x86/include/asm/tlbflush.h5
-rw-r--r--arch/x86/include/asm/topology.h5
-rw-r--r--arch/x86/include/asm/trace/fpu.h4
-rw-r--r--arch/x86/include/asm/traps.h6
-rw-r--r--arch/x86/include/asm/uaccess.h46
-rw-r--r--arch/x86/include/asm/unwind.h29
-rw-r--r--arch/x86/include/asm/unwind_hints.h5
-rw-r--r--arch/x86/include/asm/word-at-a-time.h66
-rw-r--r--arch/x86/include/asm/x86_init.h6
-rw-r--r--arch/x86/include/asm/xen/hypercall.h235
-rw-r--r--arch/x86/include/asm/xen/hypervisor.h13
-rw-r--r--arch/x86/include/asm/xen/page.h14
-rw-r--r--arch/x86/include/asm/xen/pci.h18
-rw-r--r--arch/x86/include/uapi/asm/kvm.h20
-rw-r--r--arch/x86/include/uapi/asm/kvm_para.h1
-rw-r--r--arch/x86/include/uapi/asm/prctl.h24
-rw-r--r--arch/x86/include/uapi/asm/sgx.h2
-rw-r--r--arch/x86/kernel/Makefile8
-rw-r--r--arch/x86/kernel/acpi/boot.c9
-rw-r--r--arch/x86/kernel/acpi/cstate.c15
-rw-r--r--arch/x86/kernel/acpi/sleep.c4
-rw-r--r--arch/x86/kernel/acpi/wakeup_32.S6
-rw-r--r--arch/x86/kernel/alternative.c242
-rw-r--r--arch/x86/kernel/amd_nb.c54
-rw-r--r--arch/x86/kernel/aperture_64.c13
-rw-r--r--arch/x86/kernel/apic/msi.c11
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c27
-rw-r--r--arch/x86/kernel/asm-offsets.c3
-rw-r--r--arch/x86/kernel/audit_64.c10
-rw-r--r--arch/x86/kernel/cc_platform.c85
-rw-r--r--arch/x86/kernel/cpu/Makefile1
-rw-r--r--arch/x86/kernel/cpu/amd.c2
-rw-r--r--arch/x86/kernel/cpu/bugs.c13
-rw-r--r--arch/x86/kernel/cpu/cacheinfo.c1
-rw-r--r--arch/x86/kernel/cpu/common.c69
-rw-r--r--arch/x86/kernel/cpu/cpu.h1
-rw-r--r--arch/x86/kernel/cpu/cpuid-deps.c3
-rw-r--r--arch/x86/kernel/cpu/hygon.c2
-rw-r--r--arch/x86/kernel/cpu/intel_epb.c45
-rw-r--r--arch/x86/kernel/cpu/mce/amd.c291
-rw-r--r--arch/x86/kernel/cpu/mce/core.c439
-rw-r--r--arch/x86/kernel/cpu/mce/inject.c46
-rw-r--r--arch/x86/kernel/cpu/mce/intel.c5
-rw-r--r--arch/x86/kernel/cpu/mce/internal.h71
-rw-r--r--arch/x86/kernel/cpu/mce/p5.c6
-rw-r--r--arch/x86/kernel/cpu/mce/severity.c64
-rw-r--r--arch/x86/kernel/cpu/mce/winchip.c6
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c14
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c17
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c9
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c40
-rw-r--r--arch/x86/kernel/cpu/resctrl/core.c6
-rw-r--r--arch/x86/kernel/cpu/resctrl/monitor.c2
-rw-r--r--arch/x86/kernel/cpu/sgx/encls.h36
-rw-r--r--arch/x86/kernel/cpu/sgx/main.c174
-rw-r--r--arch/x86/kernel/cpu/sgx/sgx.h8
-rw-r--r--arch/x86/kernel/cpu/sgx/virt.c65
-rw-r--r--arch/x86/kernel/cpu/vortex.c39
-rw-r--r--arch/x86/kernel/crash_dump_64.c4
-rw-r--r--arch/x86/kernel/devicetree.c10
-rw-r--r--arch/x86/kernel/doublefault_32.c3
-rw-r--r--arch/x86/kernel/dumpstack_64.c6
-rw-r--r--arch/x86/kernel/early-quirks.c7
-rw-r--r--arch/x86/kernel/fpu/bugs.c2
-rw-r--r--arch/x86/kernel/fpu/context.h83
-rw-r--r--arch/x86/kernel/fpu/core.c498
-rw-r--r--arch/x86/kernel/fpu/init.c76
-rw-r--r--arch/x86/kernel/fpu/internal.h28
-rw-r--r--arch/x86/kernel/fpu/legacy.h111
-rw-r--r--arch/x86/kernel/fpu/regset.c36
-rw-r--r--arch/x86/kernel/fpu/signal.c292
-rw-r--r--arch/x86/kernel/fpu/xstate.c939
-rw-r--r--arch/x86/kernel/fpu/xstate.h322
-rw-r--r--arch/x86/kernel/ftrace.c78
-rw-r--r--arch/x86/kernel/ftrace_32.S6
-rw-r--r--arch/x86/kernel/ftrace_64.S38
-rw-r--r--arch/x86/kernel/head64.c71
-rw-r--r--arch/x86/kernel/head_32.S2
-rw-r--r--arch/x86/kernel/head_64.S19
-rw-r--r--arch/x86/kernel/hpet.c81
-rw-r--r--arch/x86/kernel/irq.c4
-rw-r--r--arch/x86/kernel/irq_32.c2
-rw-r--r--arch/x86/kernel/irqflags.S4
-rw-r--r--arch/x86/kernel/itmt.c2
-rw-r--r--arch/x86/kernel/kprobes/core.c73
-rw-r--r--arch/x86/kernel/kprobes/ftrace.c2
-rw-r--r--arch/x86/kernel/kprobes/opt.c6
-rw-r--r--arch/x86/kernel/kvm.c118
-rw-r--r--arch/x86/kernel/kvmclock.c6
-rw-r--r--arch/x86/kernel/machine_kexec_64.c19
-rw-r--r--arch/x86/kernel/module.c16
-rw-r--r--arch/x86/kernel/paravirt.c61
-rw-r--r--arch/x86/kernel/pci-swiotlb.c9
-rw-r--r--arch/x86/kernel/probe_roms.c2
-rw-r--r--arch/x86/kernel/process.c100
-rw-r--r--arch/x86/kernel/process.h4
-rw-r--r--arch/x86/kernel/process_32.c5
-rw-r--r--arch/x86/kernel/process_64.c5
-rw-r--r--arch/x86/kernel/ptrace.c2
-rw-r--r--arch/x86/kernel/reboot.c12
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S10
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S12
-rw-r--r--arch/x86/kernel/setup.c17
-rw-r--r--arch/x86/kernel/setup_percpu.c2
-rw-r--r--arch/x86/kernel/sev-shared.c70
-rw-r--r--arch/x86/kernel/sev.c314
-rw-r--r--arch/x86/kernel/sev_verify_cbit.S2
-rw-r--r--arch/x86/kernel/signal.c83
-rw-r--r--arch/x86/kernel/smpboot.c80
-rw-r--r--arch/x86/kernel/static_call.c19
-rw-r--r--arch/x86/kernel/trace.c2
-rw-r--r--arch/x86/kernel/traps.c100
-rw-r--r--arch/x86/kernel/tsc.c28
-rw-r--r--arch/x86/kernel/tsc_sync.c41
-rw-r--r--arch/x86/kernel/umip.c8
-rw-r--r--arch/x86/kernel/unwind_frame.c3
-rw-r--r--arch/x86/kernel/unwind_guess.c3
-rw-r--r--arch/x86/kernel/unwind_orc.c23
-rw-r--r--arch/x86/kernel/verify_cpu.S4
-rw-r--r--arch/x86/kernel/vm86_32.c10
-rw-r--r--arch/x86/kernel/vmlinux.lds.S15
-rw-r--r--arch/x86/kernel/x86_init.c12
-rw-r--r--arch/x86/kvm/Kconfig7
-rw-r--r--arch/x86/kvm/Makefile7
-rw-r--r--arch/x86/kvm/cpuid.c169
-rw-r--r--arch/x86/kvm/cpuid.h2
-rw-r--r--arch/x86/kvm/debugfs.c9
-rw-r--r--arch/x86/kvm/emulate.c76
-rw-r--r--arch/x86/kvm/hyperv.c46
-rw-r--r--arch/x86/kvm/i8254.c2
-rw-r--r--arch/x86/kvm/i8259.c5
-rw-r--r--arch/x86/kvm/ioapic.c6
-rw-r--r--arch/x86/kvm/ioapic.h5
-rw-r--r--arch/x86/kvm/irq.h1
-rw-r--r--arch/x86/kvm/irq_comm.c19
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h20
-rw-r--r--arch/x86/kvm/kvm_emulate.h1
-rw-r--r--arch/x86/kvm/kvm_onhyperv.c3
-rw-r--r--arch/x86/kvm/lapic.c108
-rw-r--r--arch/x86/kvm/lapic.h2
-rw-r--r--arch/x86/kvm/mmu.h130
-rw-r--r--arch/x86/kvm/mmu/mmu.c952
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h28
-rw-r--r--arch/x86/kvm/mmu/mmutrace.h20
-rw-r--r--arch/x86/kvm/mmu/page_track.c49
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h217
-rw-r--r--arch/x86/kvm/mmu/spte.c37
-rw-r--r--arch/x86/kvm/mmu/spte.h21
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.c6
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.h6
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c188
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h11
-rw-r--r--arch/x86/kvm/pmu.c130
-rw-r--r--arch/x86/kvm/pmu.h9
-rw-r--r--arch/x86/kvm/svm/avic.c36
-rw-r--r--arch/x86/kvm/svm/nested.c314
-rw-r--r--arch/x86/kvm/svm/pmu.c30
-rw-r--r--arch/x86/kvm/svm/sev.c544
-rw-r--r--arch/x86/kvm/svm/svm.c282
-rw-r--r--arch/x86/kvm/svm/svm.h119
-rw-r--r--arch/x86/kvm/svm/svm_ops.h4
-rw-r--r--arch/x86/kvm/svm/vmenter.S4
-rw-r--r--arch/x86/kvm/trace.h33
-rw-r--r--arch/x86/kvm/vmx/capabilities.h9
-rw-r--r--arch/x86/kvm/vmx/evmcs.h4
-rw-r--r--arch/x86/kvm/vmx/nested.c439
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c62
-rw-r--r--arch/x86/kvm/vmx/posted_intr.c177
-rw-r--r--arch/x86/kvm/vmx/posted_intr.h6
-rw-r--r--arch/x86/kvm/vmx/sgx.c16
-rw-r--r--arch/x86/kvm/vmx/vmcs.h5
-rw-r--r--arch/x86/kvm/vmx/vmenter.S14
-rw-r--r--arch/x86/kvm/vmx/vmx.c598
-rw-r--r--arch/x86/kvm/vmx/vmx.h89
-rw-r--r--arch/x86/kvm/vmx/vmx_ops.h43
-rw-r--r--arch/x86/kvm/x86.c1919
-rw-r--r--arch/x86/kvm/x86.h39
-rw-r--r--arch/x86/kvm/xen.c380
-rw-r--r--arch/x86/kvm/xen.h9
-rw-r--r--arch/x86/lib/Makefile1
-rw-r--r--arch/x86/lib/atomic64_386_32.S86
-rw-r--r--arch/x86/lib/atomic64_cx8_32.S16
-rw-r--r--arch/x86/lib/checksum_32.S27
-rw-r--r--arch/x86/lib/clear_page_64.S6
-rw-r--r--arch/x86/lib/cmpxchg16b_emu.S4
-rw-r--r--arch/x86/lib/cmpxchg8b_emu.S4
-rw-r--r--arch/x86/lib/copy_mc_64.S24
-rw-r--r--arch/x86/lib/copy_page_64.S4
-rw-r--r--arch/x86/lib/copy_user_64.S66
-rw-r--r--arch/x86/lib/csum-copy_64.S2
-rw-r--r--arch/x86/lib/csum-partial_64.c183
-rw-r--r--arch/x86/lib/error-inject.c3
-rw-r--r--arch/x86/lib/getuser.S22
-rw-r--r--arch/x86/lib/hweight.S6
-rw-r--r--arch/x86/lib/insn-eval.c182
-rw-r--r--arch/x86/lib/insn.c5
-rw-r--r--arch/x86/lib/iomap_copy_64.S2
-rw-r--r--arch/x86/lib/kaslr.c18
-rw-r--r--arch/x86/lib/memcpy_32.c4
-rw-r--r--arch/x86/lib/memcpy_64.S12
-rw-r--r--arch/x86/lib/memmove_64.S4
-rw-r--r--arch/x86/lib/memset_64.S6
-rw-r--r--arch/x86/lib/mmx_32.c388
-rw-r--r--arch/x86/lib/msr-reg.S4
-rw-r--r--arch/x86/lib/putuser.S6
-rw-r--r--arch/x86/lib/retpoline.S60
-rw-r--r--arch/x86/lib/string_32.c1
-rw-r--r--arch/x86/lib/usercopy_32.c67
-rw-r--r--arch/x86/lib/usercopy_64.c8
-rw-r--r--arch/x86/math-emu/div_Xsig.S2
-rw-r--r--arch/x86/math-emu/div_small.S2
-rw-r--r--arch/x86/math-emu/fpu_aux.c2
-rw-r--r--arch/x86/math-emu/fpu_entry.c6
-rw-r--r--arch/x86/math-emu/fpu_system.h2
-rw-r--r--arch/x86/math-emu/mul_Xsig.S6
-rw-r--r--arch/x86/math-emu/polynom_Xsig.S2
-rw-r--r--arch/x86/math-emu/reg_norm.S6
-rw-r--r--arch/x86/math-emu/reg_round.S2
-rw-r--r--arch/x86/math-emu/reg_u_add.S2
-rw-r--r--arch/x86/math-emu/reg_u_div.S2
-rw-r--r--arch/x86/math-emu/reg_u_mul.S2
-rw-r--r--arch/x86/math-emu/reg_u_sub.S2
-rw-r--r--arch/x86/math-emu/round_Xsig.S4
-rw-r--r--arch/x86/math-emu/shr_Xsig.S8
-rw-r--r--arch/x86/math-emu/wm_shrx.S16
-rw-r--r--arch/x86/mm/Makefile7
-rw-r--r--arch/x86/mm/cpu_entry_area.c7
-rw-r--r--arch/x86/mm/extable.c216
-rw-r--r--arch/x86/mm/fault.c23
-rw-r--r--arch/x86/mm/init.c7
-rw-r--r--arch/x86/mm/init_32.c45
-rw-r--r--arch/x86/mm/init_64.c2
-rw-r--r--arch/x86/mm/ioremap.c18
-rw-r--r--arch/x86/mm/kasan_init_64.c4
-rw-r--r--arch/x86/mm/mem_encrypt.c426
-rw-r--r--arch/x86/mm/mem_encrypt_amd.c438
-rw-r--r--arch/x86/mm/mem_encrypt_boot.S4
-rw-r--r--arch/x86/mm/mem_encrypt_identity.c18
-rw-r--r--arch/x86/mm/numa.c2
-rw-r--r--arch/x86/mm/numa_emulation.c2
-rw-r--r--arch/x86/mm/pat/set_memory.c30
-rw-r--r--arch/x86/mm/tlb.c10
-rw-r--r--arch/x86/net/bpf_jit_comp.c446
-rw-r--r--arch/x86/net/bpf_jit_comp32.c26
-rw-r--r--arch/x86/pci/acpi.c2
-rw-r--r--arch/x86/pci/common.c2
-rw-r--r--arch/x86/pci/xen.c123
-rw-r--r--arch/x86/platform/ce4100/falconfalls.dts4
-rw-r--r--arch/x86/platform/efi/efi_64.c9
-rw-r--r--arch/x86/platform/efi/efi_stub_32.S2
-rw-r--r--arch/x86/platform/efi/efi_stub_64.S2
-rw-r--r--arch/x86/platform/efi/efi_thunk_64.S16
-rw-r--r--arch/x86/platform/efi/quirks.c3
-rw-r--r--arch/x86/platform/olpc/olpc.c2
-rw-r--r--arch/x86/platform/olpc/xo1-wakeup.S6
-rw-r--r--arch/x86/platform/pvh/enlighten.c12
-rw-r--r--arch/x86/power/cpu.c2
-rw-r--r--arch/x86/power/hibernate_asm_32.S4
-rw-r--r--arch/x86/power/hibernate_asm_64.S4
-rw-r--r--arch/x86/purgatory/Makefile2
-rw-r--r--arch/x86/realmode/init.c46
-rw-r--r--arch/x86/tools/relocs.c105
-rw-r--r--arch/x86/um/Makefile2
-rw-r--r--arch/x86/um/asm/barrier.h1
-rw-r--r--arch/x86/um/asm/segment.h8
-rw-r--r--arch/x86/um/checksum_32.S4
-rw-r--r--arch/x86/um/os-Linux/registers.c1
-rw-r--r--arch/x86/um/ptrace_32.c1
-rw-r--r--arch/x86/um/ptrace_64.c1
-rw-r--r--arch/x86/um/setjmp_32.S2
-rw-r--r--arch/x86/um/setjmp_64.S2
-rw-r--r--arch/x86/um/shared/sysdep/syscalls_64.h3
-rw-r--r--arch/x86/um/signal.c1
-rw-r--r--arch/x86/um/sys_call_table_32.c4
-rw-r--r--arch/x86/um/sys_call_table_64.c17
-rw-r--r--arch/x86/um/syscalls_64.c14
-rw-r--r--arch/x86/xen/Kconfig20
-rw-r--r--arch/x86/xen/Makefile2
-rw-r--r--arch/x86/xen/enlighten.c168
-rw-r--r--arch/x86/xen/enlighten_hvm.c6
-rw-r--r--arch/x86/xen/enlighten_pv.c140
-rw-r--r--arch/x86/xen/enlighten_pvh.c10
-rw-r--r--arch/x86/xen/irq.c73
-rw-r--r--arch/x86/xen/mmu_hvm.c37
-rw-r--r--arch/x86/xen/mmu_pv.c153
-rw-r--r--arch/x86/xen/p2m.c2
-rw-r--r--arch/x86/xen/pmu.c32
-rw-r--r--arch/x86/xen/setup.c16
-rw-r--r--arch/x86/xen/smp.c28
-rw-r--r--arch/x86/xen/smp_pv.c14
-rw-r--r--arch/x86/xen/vga.c12
-rw-r--r--arch/x86/xen/xen-asm.S107
-rw-r--r--arch/x86/xen/xen-head.S46
-rw-r--r--arch/x86/xen/xen-ops.h9
460 files changed, 13814 insertions, 9143 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index 30dec019756b..f384cb1a4f7a 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -25,3 +25,6 @@ obj-y += platform/
obj-y += net/
obj-$(CONFIG_KEXEC_FILE) += purgatory/
+
+# for cleaning
+subdir- += boot tools
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ab83c22d274e..407533c835fe 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -61,8 +61,9 @@ config X86
select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
select ARCH_32BIT_OFF_T if X86_32
select ARCH_CLOCKSOURCE_INIT
+ select ARCH_CORRECT_STACKTRACE_ON_KRETPROBE
select ARCH_ENABLE_HUGEPAGE_MIGRATION if X86_64 && HUGETLB_PAGE && MIGRATION
- select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64 || (X86_32 && HIGHMEM)
+ select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64
select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG
select ARCH_ENABLE_SPLIT_PMD_PTLOCK if (PGTABLE_LEVELS > 2) && (X86_64 || X86_PAE)
select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE
@@ -103,6 +104,7 @@ config X86
select ARCH_SUPPORTS_ACPI
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_DEBUG_PAGEALLOC
+ select ARCH_SUPPORTS_PAGE_TABLE_CHECK if X86_64
select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096
select ARCH_SUPPORTS_LTO_CLANG
@@ -125,6 +127,7 @@ config X86
select CLOCKSOURCE_VALIDATE_LAST_CYCLE
select CLOCKSOURCE_WATCHDOG
select DCACHE_WORD_ACCESS
+ select DYNAMIC_SIGFRAME
select EDAC_ATOMIC_SCRUB
select EDAC_SUPPORT
select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC)
@@ -190,6 +193,8 @@ config X86
select HAVE_DYNAMIC_FTRACE_WITH_REGS
select HAVE_DYNAMIC_FTRACE_WITH_ARGS if X86_64
select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+ select HAVE_SAMPLE_FTRACE_DIRECT if X86_64
+ select HAVE_SAMPLE_FTRACE_DIRECT_MULTI if X86_64
select HAVE_EBPF_JIT
select HAVE_EFFICIENT_UNALIGNED_ACCESS
select HAVE_EISA
@@ -197,7 +202,7 @@ config X86
select HAVE_FAST_GUP
select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE
select HAVE_FTRACE_MCOUNT_RECORD
- select HAVE_FUNCTION_GRAPH_TRACER
+ select HAVE_FUNCTION_GRAPH_TRACER if X86_32 || (X86_64 && DYNAMIC_FTRACE)
select HAVE_FUNCTION_TRACER
select HAVE_GCC_PLUGINS
select HAVE_HW_BREAKPOINT
@@ -265,6 +270,7 @@ config X86
select HAVE_ARCH_KCSAN if X86_64
select X86_FEATURE_NAMES if PROC_FS
select PROC_PID_ARCH_STATUS if PROC_FS
+ select HAVE_ARCH_NODE_DEV_GROUP if X86_SGX
imply IMA_SECURE_AND_OR_TRUSTED_BOOT if EFI
config INSTRUCTION_DECODER
@@ -468,6 +474,18 @@ config RETPOLINE
branches. Requires a compiler with -mindirect-branch=thunk-extern
support for full protection. The kernel may run slower.
+config CC_HAS_SLS
+ def_bool $(cc-option,-mharden-sls=all)
+
+config SLS
+ bool "Mitigate Straight-Line-Speculation"
+ depends on CC_HAS_SLS && X86_64
+ default n
+ help
+ Compile the kernel with straight-line-speculation options to guard
+ against straight line speculation. The kernel image might be slightly
+ larger.
+
config X86_CPU_RESCTRL
bool "x86 CPU resource control support"
depends on X86 && (CPU_SUP_INTEL || CPU_SUP_AMD)
@@ -610,9 +628,7 @@ config X86_INTEL_MID
depends on X86_IO_APIC
select I2C
select DW_APB_TIMER
- select APB_TIMER
select INTEL_SCU_PCI
- select MFD_INTEL_MSIC
help
Select to build a kernel capable of supporting Intel MID (Mobile
Internet Device) platform systems which do not have the PCI legacy
@@ -1001,6 +1017,17 @@ config NR_CPUS
This is purely to save memory: each supported CPU adds about 8KB
to the kernel image.
+config SCHED_CLUSTER
+ bool "Cluster scheduler support"
+ depends on SMP
+ default y
+ help
+ Cluster scheduler support improves the CPU scheduler's decision
+ making when dealing with machines that have clusters of CPUs.
+ Cluster usually means a couple of CPUs which are placed closely
+ by sharing mid-level caches, last-level cache tags or internal
+ busses.
+
config SCHED_SMT
def_bool y if SMP
@@ -1256,7 +1283,8 @@ config TOSHIBA
config I8K
tristate "Dell i8k legacy laptop support"
- select HWMON
+ depends on HWMON
+ depends on PROC_FS
select SENSORS_DELL_SMM
help
This option enables legacy /proc/i8k userspace interface in hwmon
@@ -1405,7 +1433,7 @@ config HIGHMEM4G
config HIGHMEM64G
bool "64GB"
- depends on !M486SX && !M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !WINCHIP3D && !MK6
+ depends on !M486SX && !M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !MWINCHIP3D && !MK6
select X86_PAE
help
Select this if you have a 32-bit processor and more than 4
@@ -1509,15 +1537,20 @@ config X86_CPA_STATISTICS
helps to determine the effectiveness of preserving large and huge
page mappings when mapping protections are changed.
+config X86_MEM_ENCRYPT
+ select ARCH_HAS_FORCE_DMA_UNENCRYPTED
+ select DYNAMIC_PHYSICAL_MASK
+ select ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS
+ def_bool n
+
config AMD_MEM_ENCRYPT
bool "AMD Secure Memory Encryption (SME) support"
depends on X86_64 && CPU_SUP_AMD
select DMA_COHERENT_POOL
- select DYNAMIC_PHYSICAL_MASK
select ARCH_USE_MEMREMAP_PROT
- select ARCH_HAS_FORCE_DMA_UNENCRYPTED
select INSTRUCTION_DECODER
- select ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS
+ select ARCH_HAS_CC_PLATFORM
+ select X86_MEM_ENCRYPT
help
Say yes to enable support for the encryption of system memory.
This requires an AMD processor that supports Secure Memory
@@ -1525,7 +1558,6 @@ config AMD_MEM_ENCRYPT
config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT
bool "Activate AMD Secure Memory Encryption (SME) by default"
- default y
depends on AMD_MEM_ENCRYPT
help
Say yes to have system memory encrypted by default if running on
@@ -1615,7 +1647,7 @@ config ARCH_SELECT_MEMORY_MODEL
config ARCH_MEMORY_PROBE
bool "Enable sysfs memory/probe interface"
- depends on X86_64 && MEMORY_HOTPLUG
+ depends on MEMORY_HOTPLUG
help
This option enables a sysfs memory/probe interface for testing.
See Documentation/admin-guide/mm/memory-hotplug.rst for more information.
@@ -1903,6 +1935,7 @@ config X86_SGX
select SRCU
select MMU_NOTIFIER
select NUMA_KEEP_MEMINFO if NUMA
+ select XARRAY_MULTI
help
Intel(R) Software Guard eXtensions (SGX) is a set of CPU instructions
that can be used by applications to set aside private regions of code
@@ -1918,6 +1951,7 @@ config EFI
depends on ACPI
select UCS2_STRING
select EFI_RUNTIME_WRAPPERS
+ select ARCH_USE_MEMREMAP_PROT
help
This enables the kernel to use EFI runtime services that are
available (such as the EFI variable services).
@@ -1931,7 +1965,7 @@ config EFI
config EFI_STUB
bool "EFI stub support"
- depends on EFI && !X86_USE_3DNOW
+ depends on EFI
depends on $(cc-option,-mabi=ms) || X86_32
select RELOCATABLE
help
@@ -2389,13 +2423,29 @@ config MODIFY_LDT_SYSCALL
Saying 'N' here may make sense for embedded or server kernels.
+config STRICT_SIGALTSTACK_SIZE
+ bool "Enforce strict size checking for sigaltstack"
+ depends on DYNAMIC_SIGFRAME
+ help
+ For historical reasons MINSIGSTKSZ is a constant which became
+ already too small with AVX512 support. Add a mechanism to
+ enforce strict checking of the sigaltstack size against the
+ real size of the FPU frame. This option enables the check
+ by default. It can also be controlled via the kernel command
+ line option 'strict_sas_size' independent of this config
+ switch. Enabling it might break existing applications which
+ allocate a too small sigaltstack but 'work' because they
+ never get a signal delivered.
+
+ Say 'N' unless you want to really enforce this check.
+
source "kernel/livepatch/Kconfig"
endmenu
config ARCH_HAS_ADD_PAGES
def_bool y
- depends on X86_64 && ARCH_ENABLE_MEMORY_HOTPLUG
+ depends on ARCH_ENABLE_MEMORY_HOTPLUG
config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
def_bool y
@@ -2832,8 +2882,6 @@ config HAVE_ATOMIC_IOMAP
def_bool y
depends on X86_32
-source "drivers/firmware/Kconfig"
-
source "arch/x86/kvm/Kconfig"
source "arch/x86/Kconfig.assembler"
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 814fe0d349b0..542377cd419d 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -342,10 +342,6 @@ config X86_USE_PPRO_CHECKSUM
def_bool y
depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM
-config X86_USE_3DNOW
- def_bool y
- depends on (MCYRIXIII || MK7 || MGEODE_LX) && !UML
-
#
# P6_NOPs are a relatively minor optimization that require a family >=
# 6 processor, except that it is broken on certain VIA chips.
@@ -508,3 +504,16 @@ config CPU_SUP_ZHAOXIN
CPU might render the kernel unbootable.
If unsure, say N.
+
+config CPU_SUP_VORTEX_32
+ default y
+ bool "Support Vortex processors" if PROCESSOR_SELECT
+ depends on X86_32
+ help
+ This enables detection, tunings and quirks for Vortex processors
+
+ You need this enabled if you want your kernel to run on a
+ Vortex CPU. Disabling this option on other types of CPUs
+ makes the kernel a tiny bit smaller.
+
+ If unsure, say N.
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 7488cfbbd2f6..e84cdd409b64 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -12,6 +12,18 @@ else
KBUILD_DEFCONFIG := $(ARCH)_defconfig
endif
+ifdef CONFIG_CC_IS_GCC
+RETPOLINE_CFLAGS := $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register)
+RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch-cs-prefix)
+RETPOLINE_VDSO_CFLAGS := $(call cc-option,-mindirect-branch=thunk-inline -mindirect-branch-register)
+endif
+ifdef CONFIG_CC_IS_CLANG
+RETPOLINE_CFLAGS := -mretpoline-external-thunk
+RETPOLINE_VDSO_CFLAGS := -mretpoline
+endif
+export RETPOLINE_CFLAGS
+export RETPOLINE_VDSO_CFLAGS
+
# For gcc stack alignment is specified with -mpreferred-stack-boundary,
# clang has the option -mstack-alignment for that purpose.
ifneq ($(call cc-option, -mpreferred-stack-boundary=4),)
@@ -179,6 +191,10 @@ ifdef CONFIG_RETPOLINE
endif
endif
+ifdef CONFIG_SLS
+ KBUILD_CFLAGS += -mharden-sls=all
+endif
+
KBUILD_LDFLAGS += -m elf_$(UTS_MACHINE)
ifdef CONFIG_LTO_CLANG
@@ -283,8 +299,6 @@ endif
archclean:
$(Q)rm -rf $(objtree)/arch/i386
$(Q)rm -rf $(objtree)/arch/x86_64
- $(Q)$(MAKE) $(clean)=$(boot)
- $(Q)$(MAKE) $(clean)=arch/x86/tools
define archhelp
echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)'
@@ -299,7 +313,7 @@ define archhelp
echo ' isoimage - Create a boot CD-ROM image (arch/x86/boot/image.iso)'
echo ' bzdisk/fdimage*/hdimage/isoimage also accept:'
echo ' FDARGS="..." arguments for the booted kernel'
- echo ' FDINITRD=file initrd for the booted kernel'
+ echo ' FDINITRD=file initrd for the booted kernel'
echo ''
echo ' kvm_guest.config - Enable Kconfig items for running this kernel as a KVM guest'
echo ' xen.config - Enable Kconfig items for running this kernel as a Xen guest'
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 431bf7f846c3..e11813646051 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -28,7 +28,11 @@ KCOV_INSTRUMENT := n
targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \
vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 vmlinux.bin.zst
-KBUILD_CFLAGS := -m$(BITS) -O2
+# CLANG_FLAGS must come before any cc-disable-warning or cc-option calls in
+# case of cross compiling, as it has the '--target=' flag, which is needed to
+# avoid errors with '-march=i386', and future flags may depend on the target to
+# be valid.
+KBUILD_CFLAGS := -m$(BITS) -O2 $(CLANG_FLAGS)
KBUILD_CFLAGS += -fno-strict-aliasing -fPIE
KBUILD_CFLAGS += -Wundef
KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
@@ -47,7 +51,6 @@ KBUILD_CFLAGS += -D__DISABLE_EXPORTS
# Disable relocation relaxation in case the link is not PIE.
KBUILD_CFLAGS += $(call as-option,-Wa$(comma)-mrelax-relocations=no)
KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h
-KBUILD_CFLAGS += $(CLANG_FLAGS)
# sev.c indirectly inludes inat-table.h which is generated during
# compilation and stored in $(objtree). Add the directory to the includes so
diff --git a/arch/x86/boot/compressed/efi_thunk_64.S b/arch/x86/boot/compressed/efi_thunk_64.S
index 8bb92e9f4e97..67e7edcdfea8 100644
--- a/arch/x86/boot/compressed/efi_thunk_64.S
+++ b/arch/x86/boot/compressed/efi_thunk_64.S
@@ -26,8 +26,6 @@ SYM_FUNC_START(__efi64_thunk)
push %rbp
push %rbx
- leaq 1f(%rip), %rbp
-
movl %ds, %eax
push %rax
movl %es, %eax
@@ -35,6 +33,11 @@ SYM_FUNC_START(__efi64_thunk)
movl %ss, %eax
push %rax
+ /* Copy args passed on stack */
+ movq 0x30(%rsp), %rbp
+ movq 0x38(%rsp), %rbx
+ movq 0x40(%rsp), %rax
+
/*
* Convert x86-64 ABI params to i386 ABI
*/
@@ -44,13 +47,18 @@ SYM_FUNC_START(__efi64_thunk)
movl %ecx, 0x8(%rsp)
movl %r8d, 0xc(%rsp)
movl %r9d, 0x10(%rsp)
+ movl %ebp, 0x14(%rsp)
+ movl %ebx, 0x18(%rsp)
+ movl %eax, 0x1c(%rsp)
- leaq 0x14(%rsp), %rbx
+ leaq 0x20(%rsp), %rbx
sgdt (%rbx)
addq $16, %rbx
sidt (%rbx)
+ leaq 1f(%rip), %rbp
+
/*
* Switch to IDT and GDT with 32-bit segments. This is the firmware GDT
* and IDT that was installed when the kernel started executing. The
@@ -93,7 +101,7 @@ SYM_FUNC_START(__efi64_thunk)
pop %rbx
pop %rbp
- ret
+ RET
SYM_FUNC_END(__efi64_thunk)
.code32
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 572c535cf45b..fd9441f40457 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -813,7 +813,7 @@ SYM_FUNC_START(efi32_pe_entry)
2: popl %edi // restore callee-save registers
popl %ebx
leave
- ret
+ RET
SYM_FUNC_END(efi32_pe_entry)
.section ".rodata"
@@ -868,7 +868,7 @@ SYM_FUNC_START(startup32_set_idt_entry)
pop %ecx
pop %ebx
- ret
+ RET
SYM_FUNC_END(startup32_set_idt_entry)
#endif
@@ -884,7 +884,7 @@ SYM_FUNC_START(startup32_load_idt)
movl %eax, rva(boot32_idt_desc+2)(%ebp)
lidt rva(boot32_idt_desc)(%ebp)
#endif
- ret
+ RET
SYM_FUNC_END(startup32_load_idt)
/*
@@ -954,7 +954,7 @@ SYM_FUNC_START(startup32_check_sev_cbit)
popl %ebx
popl %eax
#endif
- ret
+ RET
SYM_FUNC_END(startup32_check_sev_cbit)
/*
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index 67c3208b668a..411b268bc0a2 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -32,10 +32,6 @@
#include <generated/utsrelease.h>
#include <asm/efi.h>
-/* Macros used by the included decompressor code below. */
-#define STATIC
-#include <linux/decompress/mm.h>
-
#define _SETUP
#include <asm/setup.h> /* For COMMAND_LINE_SIZE */
#undef _SETUP
diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S
index c1e81a848b2a..a63424d13627 100644
--- a/arch/x86/boot/compressed/mem_encrypt.S
+++ b/arch/x86/boot/compressed/mem_encrypt.S
@@ -58,7 +58,7 @@ SYM_FUNC_START(get_sev_encryption_bit)
#endif /* CONFIG_AMD_MEM_ENCRYPT */
- ret
+ RET
SYM_FUNC_END(get_sev_encryption_bit)
/**
@@ -92,7 +92,7 @@ SYM_CODE_START_LOCAL(sev_es_req_cpuid)
/* All good - return success */
xorl %eax, %eax
1:
- ret
+ RET
2:
movl $-1, %eax
jmp 1b
@@ -221,7 +221,7 @@ SYM_FUNC_START(set_sev_encryption_mask)
#endif
xor %rax, %rax
- ret
+ RET
SYM_FUNC_END(set_sev_encryption_mask)
.data
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 743f13ea25c1..a4339cb2d247 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -28,6 +28,9 @@
/* Macros used by the included decompressor code below. */
#define STATIC static
+/* Define an externally visible malloc()/free(). */
+#define MALLOC_VISIBLE
+#include <linux/decompress/mm.h>
/*
* Provide definitions of memzero and memmove as some of the decompressors will
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 31139256859f..16ed360b6692 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -14,6 +14,8 @@
#undef CONFIG_KASAN
#undef CONFIG_KASAN_GENERIC
+#define __NO_FORTIFY
+
/* cpu_feature_enabled() cannot be used this early */
#define USE_EARLY_PGTABLE_L5
@@ -44,6 +46,8 @@ extern char _head[], _end[];
/* misc.c */
extern memptr free_mem_ptr;
extern memptr free_mem_end_ptr;
+void *malloc(int size);
+void free(void *where);
extern struct boot_params *boot_params;
void __putstr(const char *s);
void __puthex(unsigned long value);
diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c
index 2a78746f5a4c..a1733319a22a 100644
--- a/arch/x86/boot/compressed/pgtable_64.c
+++ b/arch/x86/boot/compressed/pgtable_64.c
@@ -1,3 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "misc.h"
#include <linux/efi.h>
#include <asm/e820/types.h>
#include <asm/processor.h>
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index 670e998fe930..28bcf04c022e 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -122,7 +122,7 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
static bool early_setup_sev_es(void)
{
if (!sev_es_negotiate_protocol())
- sev_es_terminate(GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED);
+ sev_es_terminate(GHCB_SEV_ES_PROT_UNSUPPORTED);
if (set_page_decrypted((unsigned long)&boot_ghcb_page))
return false;
@@ -175,7 +175,7 @@ void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code)
enum es_result result;
if (!boot_ghcb && !early_setup_sev_es())
- sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
+ sev_es_terminate(GHCB_SEV_ES_GEN_REQ);
vc_ghcb_invalidate(boot_ghcb);
result = vc_init_em_ctxt(&ctxt, regs, exit_code);
@@ -202,5 +202,5 @@ finish:
if (result == ES_OK)
vc_finish_insn(&ctxt);
else if (result != ES_RETRY)
- sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
+ sev_es_terminate(GHCB_SEV_ES_GEN_REQ);
}
diff --git a/arch/x86/boot/genimage.sh b/arch/x86/boot/genimage.sh
index 0673fdfc1a11..c9299aeb7333 100644
--- a/arch/x86/boot/genimage.sh
+++ b/arch/x86/boot/genimage.sh
@@ -120,12 +120,13 @@ efiarch() {
}
# Get the combined sizes in bytes of the files given, counting sparse
-# files as full length, and padding each file to a 4K block size
+# files as full length, and padding each file to cluster size
+cluster=16384
filesizes() {
local t=0
local s
for s in $(ls -lnL "$@" 2>/dev/null | awk '/^-/{ print $5; }'); do
- t=$((t + ((s+4095)/4096)*4096))
+ t=$((t + ((s+cluster-1)/cluster)*cluster))
done
echo $t
}
@@ -230,14 +231,14 @@ genhdimage() {
ptype='-T 0xef' # EFI system partition, no GPT
fi
sizes=$(filesizes "$FBZIMAGE" "${FDINITRDS[@]}" "$efishell")
- # Allow 1% + 1 MiB for filesystem and partition table overhead,
- # syslinux, and config files
+ # Allow 1% + 2 MiB for filesystem and partition table overhead,
+ # syslinux, and config files; this is probably excessive...
megs=$(((sizes + sizes/100 + 2*1024*1024 - 1)/(1024*1024)))
$dd if=/dev/zero of="$FIMAGE" bs=$((1024*1024)) count=$megs 2>/dev/null
- mpartition -I -c -s 32 -h 64 -t $megs $ptype -b 512 -a h:
+ mpartition -I -c -s 32 -h 64 $ptype -b 64 -a p:
$dd if="$mbr" of="$FIMAGE" bs=440 count=1 conv=notrunc 2>/dev/null
- mformat -v 'LINUX_BOOT' -s 32 -h 64 -t $megs h:
- syslinux --offset $((512*512)) "$FIMAGE"
+ mformat -v 'LINUX_BOOT' -s 32 -h 64 -c $((cluster/512)) -t $megs h:
+ syslinux --offset $((64*512)) "$FIMAGE"
do_mcopy h:
}
diff --git a/arch/x86/boot/mtools.conf.in b/arch/x86/boot/mtools.conf.in
index 9e2662d01364..174c60508766 100644
--- a/arch/x86/boot/mtools.conf.in
+++ b/arch/x86/boot/mtools.conf.in
@@ -14,7 +14,8 @@ drive v:
drive w:
file="@OBJ@/fdimage" cylinders=80 heads=2 sectors=36 filter
-# Hard disk
+# Hard disk (h: for the filesystem, p: for format - old mtools bug?)
drive h:
+ file="@OBJ@/hdimage" offset=32768 mformat_only
+drive p:
file="@OBJ@/hdimage" partition=1 mformat_only
-
diff --git a/arch/x86/boot/string.h b/arch/x86/boot/string.h
index a232da487cd2..e5d2c6b8c2f1 100644
--- a/arch/x86/boot/string.h
+++ b/arch/x86/boot/string.h
@@ -8,8 +8,10 @@
#undef memcmp
void *memcpy(void *dst, const void *src, size_t len);
+void *memmove(void *dst, const void *src, size_t len);
void *memset(void *dst, int c, size_t len);
int memcmp(const void *s1, const void *s2, size_t len);
+int bcmp(const void *s1, const void *s2, size_t len);
/* Access builtin version by default. */
#define memcpy(d,s,l) __builtin_memcpy(d,s,l)
@@ -25,6 +27,7 @@ extern size_t strnlen(const char *s, size_t maxlen);
extern unsigned int atou(const char *s);
extern unsigned long long simple_strtoull(const char *cp, char **endp,
unsigned int base);
+long simple_strtol(const char *cp, char **endp, unsigned int base);
int kstrtoull(const char *s, unsigned int base, unsigned long long *res);
int boot_kstrtoul(const char *s, unsigned int base, unsigned long *res);
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index e81885384f60..71124cf8630c 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1,4 +1,3 @@
-# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
CONFIG_AUDIT=y
@@ -262,3 +261,4 @@ CONFIG_BLK_DEV_IO_TRACE=y
CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
CONFIG_EARLY_PRINTK_DBGP=y
CONFIG_DEBUG_BOOT_PARAMS=y
+CONFIG_KALLSYMS_ALL=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index e8a7a0af2bda..92b1169ec90b 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1,4 +1,3 @@
-# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
CONFIG_AUDIT=y
@@ -258,3 +257,4 @@ CONFIG_BLK_DEV_IO_TRACE=y
CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
CONFIG_EARLY_PRINTK_DBGP=y
CONFIG_DEBUG_BOOT_PARAMS=y
+CONFIG_KALLSYMS_ALL=y
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index f307c93fc90a..c3af959648e6 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -62,7 +62,9 @@ obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o
-blake2s-x86_64-y := blake2s-core.o blake2s-glue.o
+blake2s-x86_64-y := blake2s-shash.o
+obj-$(if $(CONFIG_CRYPTO_BLAKE2S_X86),y) += libblake2s-x86_64.o
+libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
index 51d46d93efbc..b48ddebb4748 100644
--- a/arch/x86/crypto/aegis128-aesni-asm.S
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -122,7 +122,7 @@ SYM_FUNC_START_LOCAL(__load_partial)
pxor T0, MSG
.Lld_partial_8:
- ret
+ RET
SYM_FUNC_END(__load_partial)
/*
@@ -180,7 +180,7 @@ SYM_FUNC_START_LOCAL(__store_partial)
mov %r10b, (%r9)
.Lst_partial_1:
- ret
+ RET
SYM_FUNC_END(__store_partial)
/*
@@ -225,7 +225,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_init)
movdqu STATE4, 0x40(STATEP)
FRAME_END
- ret
+ RET
SYM_FUNC_END(crypto_aegis128_aesni_init)
/*
@@ -337,7 +337,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad)
movdqu STATE3, 0x30(STATEP)
movdqu STATE4, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lad_out_1:
movdqu STATE4, 0x00(STATEP)
@@ -346,7 +346,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad)
movdqu STATE2, 0x30(STATEP)
movdqu STATE3, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lad_out_2:
movdqu STATE3, 0x00(STATEP)
@@ -355,7 +355,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad)
movdqu STATE1, 0x30(STATEP)
movdqu STATE2, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lad_out_3:
movdqu STATE2, 0x00(STATEP)
@@ -364,7 +364,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad)
movdqu STATE0, 0x30(STATEP)
movdqu STATE1, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lad_out_4:
movdqu STATE1, 0x00(STATEP)
@@ -373,11 +373,11 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad)
movdqu STATE4, 0x30(STATEP)
movdqu STATE0, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lad_out:
FRAME_END
- ret
+ RET
SYM_FUNC_END(crypto_aegis128_aesni_ad)
.macro encrypt_block a s0 s1 s2 s3 s4 i
@@ -452,7 +452,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc)
movdqu STATE2, 0x30(STATEP)
movdqu STATE3, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lenc_out_1:
movdqu STATE3, 0x00(STATEP)
@@ -461,7 +461,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc)
movdqu STATE1, 0x30(STATEP)
movdqu STATE2, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lenc_out_2:
movdqu STATE2, 0x00(STATEP)
@@ -470,7 +470,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc)
movdqu STATE0, 0x30(STATEP)
movdqu STATE1, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lenc_out_3:
movdqu STATE1, 0x00(STATEP)
@@ -479,7 +479,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc)
movdqu STATE4, 0x30(STATEP)
movdqu STATE0, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lenc_out_4:
movdqu STATE0, 0x00(STATEP)
@@ -488,11 +488,11 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc)
movdqu STATE3, 0x30(STATEP)
movdqu STATE4, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lenc_out:
FRAME_END
- ret
+ RET
SYM_FUNC_END(crypto_aegis128_aesni_enc)
/*
@@ -532,7 +532,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc_tail)
movdqu STATE3, 0x40(STATEP)
FRAME_END
- ret
+ RET
SYM_FUNC_END(crypto_aegis128_aesni_enc_tail)
.macro decrypt_block a s0 s1 s2 s3 s4 i
@@ -606,7 +606,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec)
movdqu STATE2, 0x30(STATEP)
movdqu STATE3, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Ldec_out_1:
movdqu STATE3, 0x00(STATEP)
@@ -615,7 +615,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec)
movdqu STATE1, 0x30(STATEP)
movdqu STATE2, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Ldec_out_2:
movdqu STATE2, 0x00(STATEP)
@@ -624,7 +624,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec)
movdqu STATE0, 0x30(STATEP)
movdqu STATE1, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Ldec_out_3:
movdqu STATE1, 0x00(STATEP)
@@ -633,7 +633,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec)
movdqu STATE4, 0x30(STATEP)
movdqu STATE0, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Ldec_out_4:
movdqu STATE0, 0x00(STATEP)
@@ -642,11 +642,11 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec)
movdqu STATE3, 0x30(STATEP)
movdqu STATE4, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Ldec_out:
FRAME_END
- ret
+ RET
SYM_FUNC_END(crypto_aegis128_aesni_dec)
/*
@@ -696,7 +696,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec_tail)
movdqu STATE3, 0x40(STATEP)
FRAME_END
- ret
+ RET
SYM_FUNC_END(crypto_aegis128_aesni_dec_tail)
/*
@@ -743,5 +743,5 @@ SYM_FUNC_START(crypto_aegis128_aesni_final)
movdqu MSG, (%rsi)
FRAME_END
- ret
+ RET
SYM_FUNC_END(crypto_aegis128_aesni_final)
diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
index 3f0fc7dd87d7..c799838242a6 100644
--- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
+++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
@@ -525,7 +525,7 @@ ddq_add_8:
/* return updated IV */
vpshufb xbyteswap, xcounter, xcounter
vmovdqu xcounter, (p_iv)
- ret
+ RET
.endm
/*
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 4e3972570916..363699dd7220 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -1594,7 +1594,7 @@ SYM_FUNC_START(aesni_gcm_dec)
GCM_ENC_DEC dec
GCM_COMPLETE arg10, arg11
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_dec)
@@ -1683,7 +1683,7 @@ SYM_FUNC_START(aesni_gcm_enc)
GCM_COMPLETE arg10, arg11
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_enc)
/*****************************************************************************
@@ -1701,7 +1701,7 @@ SYM_FUNC_START(aesni_gcm_init)
FUNC_SAVE
GCM_INIT %arg3, %arg4,%arg5, %arg6
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_init)
/*****************************************************************************
@@ -1716,7 +1716,7 @@ SYM_FUNC_START(aesni_gcm_enc_update)
FUNC_SAVE
GCM_ENC_DEC enc
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_enc_update)
/*****************************************************************************
@@ -1731,7 +1731,7 @@ SYM_FUNC_START(aesni_gcm_dec_update)
FUNC_SAVE
GCM_ENC_DEC dec
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_dec_update)
/*****************************************************************************
@@ -1746,7 +1746,7 @@ SYM_FUNC_START(aesni_gcm_finalize)
FUNC_SAVE
GCM_COMPLETE %arg3 %arg4
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_finalize)
#endif
@@ -1762,7 +1762,7 @@ SYM_FUNC_START_LOCAL(_key_expansion_256a)
pxor %xmm1, %xmm0
movaps %xmm0, (TKEYP)
add $0x10, TKEYP
- ret
+ RET
SYM_FUNC_END(_key_expansion_256a)
SYM_FUNC_END_ALIAS(_key_expansion_128)
@@ -1787,7 +1787,7 @@ SYM_FUNC_START_LOCAL(_key_expansion_192a)
shufps $0b01001110, %xmm2, %xmm1
movaps %xmm1, 0x10(TKEYP)
add $0x20, TKEYP
- ret
+ RET
SYM_FUNC_END(_key_expansion_192a)
SYM_FUNC_START_LOCAL(_key_expansion_192b)
@@ -1806,7 +1806,7 @@ SYM_FUNC_START_LOCAL(_key_expansion_192b)
movaps %xmm0, (TKEYP)
add $0x10, TKEYP
- ret
+ RET
SYM_FUNC_END(_key_expansion_192b)
SYM_FUNC_START_LOCAL(_key_expansion_256b)
@@ -1818,7 +1818,7 @@ SYM_FUNC_START_LOCAL(_key_expansion_256b)
pxor %xmm1, %xmm2
movaps %xmm2, (TKEYP)
add $0x10, TKEYP
- ret
+ RET
SYM_FUNC_END(_key_expansion_256b)
/*
@@ -1933,7 +1933,7 @@ SYM_FUNC_START(aesni_set_key)
popl KEYP
#endif
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_set_key)
/*
@@ -1957,7 +1957,7 @@ SYM_FUNC_START(aesni_enc)
popl KEYP
#endif
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_enc)
/*
@@ -2014,7 +2014,7 @@ SYM_FUNC_START_LOCAL(_aesni_enc1)
aesenc KEY, STATE
movaps 0x70(TKEYP), KEY
aesenclast KEY, STATE
- ret
+ RET
SYM_FUNC_END(_aesni_enc1)
/*
@@ -2122,7 +2122,7 @@ SYM_FUNC_START_LOCAL(_aesni_enc4)
aesenclast KEY, STATE2
aesenclast KEY, STATE3
aesenclast KEY, STATE4
- ret
+ RET
SYM_FUNC_END(_aesni_enc4)
/*
@@ -2147,7 +2147,7 @@ SYM_FUNC_START(aesni_dec)
popl KEYP
#endif
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_dec)
/*
@@ -2204,7 +2204,7 @@ SYM_FUNC_START_LOCAL(_aesni_dec1)
aesdec KEY, STATE
movaps 0x70(TKEYP), KEY
aesdeclast KEY, STATE
- ret
+ RET
SYM_FUNC_END(_aesni_dec1)
/*
@@ -2312,7 +2312,7 @@ SYM_FUNC_START_LOCAL(_aesni_dec4)
aesdeclast KEY, STATE2
aesdeclast KEY, STATE3
aesdeclast KEY, STATE4
- ret
+ RET
SYM_FUNC_END(_aesni_dec4)
/*
@@ -2372,7 +2372,7 @@ SYM_FUNC_START(aesni_ecb_enc)
popl LEN
#endif
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_ecb_enc)
/*
@@ -2433,7 +2433,7 @@ SYM_FUNC_START(aesni_ecb_dec)
popl LEN
#endif
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_ecb_dec)
/*
@@ -2477,7 +2477,7 @@ SYM_FUNC_START(aesni_cbc_enc)
popl IVP
#endif
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_cbc_enc)
/*
@@ -2570,7 +2570,7 @@ SYM_FUNC_START(aesni_cbc_dec)
popl IVP
#endif
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_cbc_dec)
/*
@@ -2627,7 +2627,7 @@ SYM_FUNC_START(aesni_cts_cbc_enc)
popl IVP
#endif
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_cts_cbc_enc)
/*
@@ -2688,7 +2688,7 @@ SYM_FUNC_START(aesni_cts_cbc_dec)
popl IVP
#endif
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_cts_cbc_dec)
.pushsection .rodata
@@ -2725,7 +2725,7 @@ SYM_FUNC_START_LOCAL(_aesni_inc_init)
mov $1, TCTR_LOW
movq TCTR_LOW, INC
movq CTR, TCTR_LOW
- ret
+ RET
SYM_FUNC_END(_aesni_inc_init)
/*
@@ -2753,7 +2753,7 @@ SYM_FUNC_START_LOCAL(_aesni_inc)
.Linc_low:
movaps CTR, IV
pshufb BSWAP_MASK, IV
- ret
+ RET
SYM_FUNC_END(_aesni_inc)
/*
@@ -2816,7 +2816,7 @@ SYM_FUNC_START(aesni_ctr_enc)
movups IV, (IVP)
.Lctr_enc_just_ret:
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_ctr_enc)
#endif
@@ -2932,7 +2932,7 @@ SYM_FUNC_START(aesni_xts_encrypt)
popl IVP
#endif
FRAME_END
- ret
+ RET
.Lxts_enc_1x:
add $64, LEN
@@ -3092,7 +3092,7 @@ SYM_FUNC_START(aesni_xts_decrypt)
popl IVP
#endif
FRAME_END
- ret
+ RET
.Lxts_dec_1x:
add $64, LEN
diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index 98e3552b6e03..0852ab573fd3 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -1767,7 +1767,7 @@ SYM_FUNC_START(aesni_gcm_init_avx_gen2)
FUNC_SAVE
INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_init_avx_gen2)
###############################################################################
@@ -1788,15 +1788,15 @@ SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
# must be 192
GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
FUNC_RESTORE
- ret
+ RET
key_128_enc_update:
GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
FUNC_RESTORE
- ret
+ RET
key_256_enc_update:
GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)
###############################################################################
@@ -1817,15 +1817,15 @@ SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
# must be 192
GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
FUNC_RESTORE
- ret
+ RET
key_128_dec_update:
GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
FUNC_RESTORE
- ret
+ RET
key_256_dec_update:
GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)
###############################################################################
@@ -1846,15 +1846,15 @@ SYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
# must be 192
GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
FUNC_RESTORE
- ret
+ RET
key_128_finalize:
GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
FUNC_RESTORE
- ret
+ RET
key_256_finalize:
GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
###############################################################################
@@ -2735,7 +2735,7 @@ SYM_FUNC_START(aesni_gcm_init_avx_gen4)
FUNC_SAVE
INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_init_avx_gen4)
###############################################################################
@@ -2756,15 +2756,15 @@ SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)
# must be 192
GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
FUNC_RESTORE
- ret
+ RET
key_128_enc_update4:
GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
FUNC_RESTORE
- ret
+ RET
key_256_enc_update4:
GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)
###############################################################################
@@ -2785,15 +2785,15 @@ SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)
# must be 192
GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
FUNC_RESTORE
- ret
+ RET
key_128_dec_update4:
GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
FUNC_RESTORE
- ret
+ RET
key_256_dec_update4:
GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)
###############################################################################
@@ -2814,13 +2814,13 @@ SYM_FUNC_START(aesni_gcm_finalize_avx_gen4)
# must be 192
GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
FUNC_RESTORE
- ret
+ RET
key_128_finalize4:
GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
FUNC_RESTORE
- ret
+ RET
key_256_finalize4:
GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 0fc961bef299..41901ba9d3a2 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -866,7 +866,7 @@ static int xts_crypt(struct skcipher_request *req, bool encrypt)
req = &subreq;
err = skcipher_walk_virt(&walk, req, false);
- if (err)
+ if (!walk.nbytes)
return err;
} else {
tail = 0;
@@ -1107,7 +1107,7 @@ static struct aead_alg aesni_aeads[] = { {
.cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx),
- .cra_alignmask = AESNI_ALIGN - 1,
+ .cra_alignmask = 0,
.cra_module = THIS_MODULE,
},
}, {
@@ -1124,7 +1124,7 @@ static struct aead_alg aesni_aeads[] = { {
.cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct generic_gcmaes_ctx),
- .cra_alignmask = AESNI_ALIGN - 1,
+ .cra_alignmask = 0,
.cra_module = THIS_MODULE,
},
} };
diff --git a/arch/x86/crypto/blake2s-core.S b/arch/x86/crypto/blake2s-core.S
index 2ca79974f819..b50b35ff1fdb 100644
--- a/arch/x86/crypto/blake2s-core.S
+++ b/arch/x86/crypto/blake2s-core.S
@@ -171,7 +171,7 @@ SYM_FUNC_START(blake2s_compress_ssse3)
movdqu %xmm1,0x10(%rdi)
movdqu %xmm14,0x20(%rdi)
.Lendofloop:
- ret
+ RET
SYM_FUNC_END(blake2s_compress_ssse3)
#ifdef CONFIG_AS_AVX512
@@ -251,6 +251,6 @@ SYM_FUNC_START(blake2s_compress_avx512)
vmovdqu %xmm1,0x10(%rdi)
vmovdqu %xmm4,0x20(%rdi)
vzeroupper
- retq
+ RET
SYM_FUNC_END(blake2s_compress_avx512)
#endif /* CONFIG_AS_AVX512 */
diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c
index a40365ab301e..69853c13e8fb 100644
--- a/arch/x86/crypto/blake2s-glue.c
+++ b/arch/x86/crypto/blake2s-glue.c
@@ -5,7 +5,6 @@
#include <crypto/internal/blake2s.h>
#include <crypto/internal/simd.h>
-#include <crypto/internal/hash.h>
#include <linux/types.h>
#include <linux/jump_label.h>
@@ -28,9 +27,8 @@ asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
-void blake2s_compress_arch(struct blake2s_state *state,
- const u8 *block, size_t nblocks,
- const u32 inc)
+void blake2s_compress(struct blake2s_state *state, const u8 *block,
+ size_t nblocks, const u32 inc)
{
/* SIMD disables preemption, so relax after processing each page. */
BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
@@ -56,49 +54,12 @@ void blake2s_compress_arch(struct blake2s_state *state,
block += blocks * BLAKE2S_BLOCK_SIZE;
} while (nblocks);
}
-EXPORT_SYMBOL(blake2s_compress_arch);
-
-static int crypto_blake2s_update_x86(struct shash_desc *desc,
- const u8 *in, unsigned int inlen)
-{
- return crypto_blake2s_update(desc, in, inlen, blake2s_compress_arch);
-}
-
-static int crypto_blake2s_final_x86(struct shash_desc *desc, u8 *out)
-{
- return crypto_blake2s_final(desc, out, blake2s_compress_arch);
-}
-
-#define BLAKE2S_ALG(name, driver_name, digest_size) \
- { \
- .base.cra_name = name, \
- .base.cra_driver_name = driver_name, \
- .base.cra_priority = 200, \
- .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \
- .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, \
- .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), \
- .base.cra_module = THIS_MODULE, \
- .digestsize = digest_size, \
- .setkey = crypto_blake2s_setkey, \
- .init = crypto_blake2s_init, \
- .update = crypto_blake2s_update_x86, \
- .final = crypto_blake2s_final_x86, \
- .descsize = sizeof(struct blake2s_state), \
- }
-
-static struct shash_alg blake2s_algs[] = {
- BLAKE2S_ALG("blake2s-128", "blake2s-128-x86", BLAKE2S_128_HASH_SIZE),
- BLAKE2S_ALG("blake2s-160", "blake2s-160-x86", BLAKE2S_160_HASH_SIZE),
- BLAKE2S_ALG("blake2s-224", "blake2s-224-x86", BLAKE2S_224_HASH_SIZE),
- BLAKE2S_ALG("blake2s-256", "blake2s-256-x86", BLAKE2S_256_HASH_SIZE),
-};
+EXPORT_SYMBOL(blake2s_compress);
static int __init blake2s_mod_init(void)
{
- if (!boot_cpu_has(X86_FEATURE_SSSE3))
- return 0;
-
- static_branch_enable(&blake2s_use_ssse3);
+ if (boot_cpu_has(X86_FEATURE_SSSE3))
+ static_branch_enable(&blake2s_use_ssse3);
if (IS_ENABLED(CONFIG_AS_AVX512) &&
boot_cpu_has(X86_FEATURE_AVX) &&
@@ -109,26 +70,9 @@ static int __init blake2s_mod_init(void)
XFEATURE_MASK_AVX512, NULL))
static_branch_enable(&blake2s_use_avx512);
- return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
- crypto_register_shashes(blake2s_algs,
- ARRAY_SIZE(blake2s_algs)) : 0;
-}
-
-static void __exit blake2s_mod_exit(void)
-{
- if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3))
- crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
+ return 0;
}
module_init(blake2s_mod_init);
-module_exit(blake2s_mod_exit);
-MODULE_ALIAS_CRYPTO("blake2s-128");
-MODULE_ALIAS_CRYPTO("blake2s-128-x86");
-MODULE_ALIAS_CRYPTO("blake2s-160");
-MODULE_ALIAS_CRYPTO("blake2s-160-x86");
-MODULE_ALIAS_CRYPTO("blake2s-224");
-MODULE_ALIAS_CRYPTO("blake2s-224-x86");
-MODULE_ALIAS_CRYPTO("blake2s-256");
-MODULE_ALIAS_CRYPTO("blake2s-256-x86");
MODULE_LICENSE("GPL v2");
diff --git a/arch/x86/crypto/blake2s-shash.c b/arch/x86/crypto/blake2s-shash.c
new file mode 100644
index 000000000000..f9e2fecdb761
--- /dev/null
+++ b/arch/x86/crypto/blake2s-shash.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <crypto/internal/blake2s.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/hash.h>
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sizes.h>
+
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+
+static int crypto_blake2s_update_x86(struct shash_desc *desc,
+ const u8 *in, unsigned int inlen)
+{
+ return crypto_blake2s_update(desc, in, inlen, blake2s_compress);
+}
+
+static int crypto_blake2s_final_x86(struct shash_desc *desc, u8 *out)
+{
+ return crypto_blake2s_final(desc, out, blake2s_compress);
+}
+
+#define BLAKE2S_ALG(name, driver_name, digest_size) \
+ { \
+ .base.cra_name = name, \
+ .base.cra_driver_name = driver_name, \
+ .base.cra_priority = 200, \
+ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \
+ .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, \
+ .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), \
+ .base.cra_module = THIS_MODULE, \
+ .digestsize = digest_size, \
+ .setkey = crypto_blake2s_setkey, \
+ .init = crypto_blake2s_init, \
+ .update = crypto_blake2s_update_x86, \
+ .final = crypto_blake2s_final_x86, \
+ .descsize = sizeof(struct blake2s_state), \
+ }
+
+static struct shash_alg blake2s_algs[] = {
+ BLAKE2S_ALG("blake2s-128", "blake2s-128-x86", BLAKE2S_128_HASH_SIZE),
+ BLAKE2S_ALG("blake2s-160", "blake2s-160-x86", BLAKE2S_160_HASH_SIZE),
+ BLAKE2S_ALG("blake2s-224", "blake2s-224-x86", BLAKE2S_224_HASH_SIZE),
+ BLAKE2S_ALG("blake2s-256", "blake2s-256-x86", BLAKE2S_256_HASH_SIZE),
+};
+
+static int __init blake2s_mod_init(void)
+{
+ if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3))
+ return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
+ return 0;
+}
+
+static void __exit blake2s_mod_exit(void)
+{
+ if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3))
+ crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
+}
+
+module_init(blake2s_mod_init);
+module_exit(blake2s_mod_exit);
+
+MODULE_ALIAS_CRYPTO("blake2s-128");
+MODULE_ALIAS_CRYPTO("blake2s-128-x86");
+MODULE_ALIAS_CRYPTO("blake2s-160");
+MODULE_ALIAS_CRYPTO("blake2s-160-x86");
+MODULE_ALIAS_CRYPTO("blake2s-224");
+MODULE_ALIAS_CRYPTO("blake2s-224-x86");
+MODULE_ALIAS_CRYPTO("blake2s-256");
+MODULE_ALIAS_CRYPTO("blake2s-256-x86");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S
index 4222ac6d6584..802d71582689 100644
--- a/arch/x86/crypto/blowfish-x86_64-asm_64.S
+++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S
@@ -135,10 +135,10 @@ SYM_FUNC_START(__blowfish_enc_blk)
jnz .L__enc_xor;
write_block();
- ret;
+ RET;
.L__enc_xor:
xor_block();
- ret;
+ RET;
SYM_FUNC_END(__blowfish_enc_blk)
SYM_FUNC_START(blowfish_dec_blk)
@@ -170,7 +170,7 @@ SYM_FUNC_START(blowfish_dec_blk)
movq %r11, %r12;
- ret;
+ RET;
SYM_FUNC_END(blowfish_dec_blk)
/**********************************************************************
@@ -322,14 +322,14 @@ SYM_FUNC_START(__blowfish_enc_blk_4way)
popq %rbx;
popq %r12;
- ret;
+ RET;
.L__enc_xor4:
xor_block4();
popq %rbx;
popq %r12;
- ret;
+ RET;
SYM_FUNC_END(__blowfish_enc_blk_4way)
SYM_FUNC_START(blowfish_dec_blk_4way)
@@ -364,5 +364,5 @@ SYM_FUNC_START(blowfish_dec_blk_4way)
popq %rbx;
popq %r12;
- ret;
+ RET;
SYM_FUNC_END(blowfish_dec_blk_4way)
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
index e2a0e0f4bf9d..2e1658ddbe1a 100644
--- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
@@ -192,7 +192,7 @@ SYM_FUNC_START_LOCAL(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_c
roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
%rcx, (%r9));
- ret;
+ RET;
SYM_FUNC_END(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
.align 8
@@ -200,7 +200,7 @@ SYM_FUNC_START_LOCAL(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_a
roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3,
%xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11,
%rax, (%r9));
- ret;
+ RET;
SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
/*
@@ -778,7 +778,7 @@ SYM_FUNC_START_LOCAL(__camellia_enc_blk16)
%xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));
FRAME_END
- ret;
+ RET;
.align 8
.Lenc_max32:
@@ -865,7 +865,7 @@ SYM_FUNC_START_LOCAL(__camellia_dec_blk16)
%xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));
FRAME_END
- ret;
+ RET;
.align 8
.Ldec_max32:
@@ -906,7 +906,7 @@ SYM_FUNC_START(camellia_ecb_enc_16way)
%xmm8, %rsi);
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(camellia_ecb_enc_16way)
SYM_FUNC_START(camellia_ecb_dec_16way)
@@ -936,7 +936,7 @@ SYM_FUNC_START(camellia_ecb_dec_16way)
%xmm8, %rsi);
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(camellia_ecb_dec_16way)
SYM_FUNC_START(camellia_cbc_dec_16way)
@@ -987,5 +987,5 @@ SYM_FUNC_START(camellia_cbc_dec_16way)
%xmm8, %rsi);
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(camellia_cbc_dec_16way)
diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
index 706f70829a07..0e4e9abbf4de 100644
--- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
@@ -226,7 +226,7 @@ SYM_FUNC_START_LOCAL(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_c
roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
%rcx, (%r9));
- ret;
+ RET;
SYM_FUNC_END(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
.align 8
@@ -234,7 +234,7 @@ SYM_FUNC_START_LOCAL(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_a
roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
%ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
%rax, (%r9));
- ret;
+ RET;
SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
/*
@@ -814,7 +814,7 @@ SYM_FUNC_START_LOCAL(__camellia_enc_blk32)
%ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));
FRAME_END
- ret;
+ RET;
.align 8
.Lenc_max32:
@@ -901,7 +901,7 @@ SYM_FUNC_START_LOCAL(__camellia_dec_blk32)
%ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));
FRAME_END
- ret;
+ RET;
.align 8
.Ldec_max32:
@@ -946,7 +946,7 @@ SYM_FUNC_START(camellia_ecb_enc_32way)
vzeroupper;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(camellia_ecb_enc_32way)
SYM_FUNC_START(camellia_ecb_dec_32way)
@@ -980,7 +980,7 @@ SYM_FUNC_START(camellia_ecb_dec_32way)
vzeroupper;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(camellia_ecb_dec_32way)
SYM_FUNC_START(camellia_cbc_dec_32way)
@@ -1047,5 +1047,5 @@ SYM_FUNC_START(camellia_cbc_dec_32way)
addq $(16 * 32), %rsp;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(camellia_cbc_dec_32way)
diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S
index 1372e6408850..347c059f5940 100644
--- a/arch/x86/crypto/camellia-x86_64-asm_64.S
+++ b/arch/x86/crypto/camellia-x86_64-asm_64.S
@@ -213,13 +213,13 @@ SYM_FUNC_START(__camellia_enc_blk)
enc_outunpack(mov, RT1);
movq RR12, %r12;
- ret;
+ RET;
.L__enc_xor:
enc_outunpack(xor, RT1);
movq RR12, %r12;
- ret;
+ RET;
SYM_FUNC_END(__camellia_enc_blk)
SYM_FUNC_START(camellia_dec_blk)
@@ -257,7 +257,7 @@ SYM_FUNC_START(camellia_dec_blk)
dec_outunpack();
movq RR12, %r12;
- ret;
+ RET;
SYM_FUNC_END(camellia_dec_blk)
/**********************************************************************
@@ -448,14 +448,14 @@ SYM_FUNC_START(__camellia_enc_blk_2way)
movq RR12, %r12;
popq %rbx;
- ret;
+ RET;
.L__enc2_xor:
enc_outunpack2(xor, RT2);
movq RR12, %r12;
popq %rbx;
- ret;
+ RET;
SYM_FUNC_END(__camellia_enc_blk_2way)
SYM_FUNC_START(camellia_dec_blk_2way)
@@ -495,5 +495,5 @@ SYM_FUNC_START(camellia_dec_blk_2way)
movq RR12, %r12;
movq RXOR, %rbx;
- ret;
+ RET;
SYM_FUNC_END(camellia_dec_blk_2way)
diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
index 8a6181b08b59..b258af420c92 100644
--- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -279,7 +279,7 @@ SYM_FUNC_START_LOCAL(__cast5_enc_blk16)
outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
outunpack_blocks(RR4, RL4, RTMP, RX, RKM);
- ret;
+ RET;
SYM_FUNC_END(__cast5_enc_blk16)
.align 16
@@ -352,7 +352,7 @@ SYM_FUNC_START_LOCAL(__cast5_dec_blk16)
outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
outunpack_blocks(RR4, RL4, RTMP, RX, RKM);
- ret;
+ RET;
.L__skip_dec:
vpsrldq $4, RKR, RKR;
@@ -393,7 +393,7 @@ SYM_FUNC_START(cast5_ecb_enc_16way)
popq %r15;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(cast5_ecb_enc_16way)
SYM_FUNC_START(cast5_ecb_dec_16way)
@@ -431,7 +431,7 @@ SYM_FUNC_START(cast5_ecb_dec_16way)
popq %r15;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(cast5_ecb_dec_16way)
SYM_FUNC_START(cast5_cbc_dec_16way)
@@ -483,7 +483,7 @@ SYM_FUNC_START(cast5_cbc_dec_16way)
popq %r15;
popq %r12;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(cast5_cbc_dec_16way)
SYM_FUNC_START(cast5_ctr_16way)
@@ -559,5 +559,5 @@ SYM_FUNC_START(cast5_ctr_16way)
popq %r15;
popq %r12;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(cast5_ctr_16way)
diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
index fbddcecc3e3f..82b716fd5dba 100644
--- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -289,7 +289,7 @@ SYM_FUNC_START_LOCAL(__cast6_enc_blk8)
outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
- ret;
+ RET;
SYM_FUNC_END(__cast6_enc_blk8)
.align 8
@@ -336,7 +336,7 @@ SYM_FUNC_START_LOCAL(__cast6_dec_blk8)
outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
- ret;
+ RET;
SYM_FUNC_END(__cast6_dec_blk8)
SYM_FUNC_START(cast6_ecb_enc_8way)
@@ -359,7 +359,7 @@ SYM_FUNC_START(cast6_ecb_enc_8way)
popq %r15;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(cast6_ecb_enc_8way)
SYM_FUNC_START(cast6_ecb_dec_8way)
@@ -382,7 +382,7 @@ SYM_FUNC_START(cast6_ecb_dec_8way)
popq %r15;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(cast6_ecb_dec_8way)
SYM_FUNC_START(cast6_cbc_dec_8way)
@@ -408,5 +408,5 @@ SYM_FUNC_START(cast6_cbc_dec_8way)
popq %r15;
popq %r12;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(cast6_cbc_dec_8way)
diff --git a/arch/x86/crypto/chacha-avx2-x86_64.S b/arch/x86/crypto/chacha-avx2-x86_64.S
index ee9a40ab4109..f3d8fc018249 100644
--- a/arch/x86/crypto/chacha-avx2-x86_64.S
+++ b/arch/x86/crypto/chacha-avx2-x86_64.S
@@ -193,7 +193,7 @@ SYM_FUNC_START(chacha_2block_xor_avx2)
.Ldone2:
vzeroupper
- ret
+ RET
.Lxorpart2:
# xor remaining bytes from partial register into output
@@ -498,7 +498,7 @@ SYM_FUNC_START(chacha_4block_xor_avx2)
.Ldone4:
vzeroupper
- ret
+ RET
.Lxorpart4:
# xor remaining bytes from partial register into output
@@ -992,7 +992,7 @@ SYM_FUNC_START(chacha_8block_xor_avx2)
.Ldone8:
vzeroupper
lea -8(%r10),%rsp
- ret
+ RET
.Lxorpart8:
# xor remaining bytes from partial register into output
diff --git a/arch/x86/crypto/chacha-avx512vl-x86_64.S b/arch/x86/crypto/chacha-avx512vl-x86_64.S
index bb193fde123a..946f74dd6fba 100644
--- a/arch/x86/crypto/chacha-avx512vl-x86_64.S
+++ b/arch/x86/crypto/chacha-avx512vl-x86_64.S
@@ -166,7 +166,7 @@ SYM_FUNC_START(chacha_2block_xor_avx512vl)
.Ldone2:
vzeroupper
- ret
+ RET
.Lxorpart2:
# xor remaining bytes from partial register into output
@@ -432,7 +432,7 @@ SYM_FUNC_START(chacha_4block_xor_avx512vl)
.Ldone4:
vzeroupper
- ret
+ RET
.Lxorpart4:
# xor remaining bytes from partial register into output
@@ -812,7 +812,7 @@ SYM_FUNC_START(chacha_8block_xor_avx512vl)
.Ldone8:
vzeroupper
- ret
+ RET
.Lxorpart8:
# xor remaining bytes from partial register into output
diff --git a/arch/x86/crypto/chacha-ssse3-x86_64.S b/arch/x86/crypto/chacha-ssse3-x86_64.S
index ca1788bfee16..7111949cd5b9 100644
--- a/arch/x86/crypto/chacha-ssse3-x86_64.S
+++ b/arch/x86/crypto/chacha-ssse3-x86_64.S
@@ -108,7 +108,7 @@ SYM_FUNC_START_LOCAL(chacha_permute)
sub $2,%r8d
jnz .Ldoubleround
- ret
+ RET
SYM_FUNC_END(chacha_permute)
SYM_FUNC_START(chacha_block_xor_ssse3)
@@ -166,7 +166,7 @@ SYM_FUNC_START(chacha_block_xor_ssse3)
.Ldone:
FRAME_END
- ret
+ RET
.Lxorpart:
# xor remaining bytes from partial register into output
@@ -217,7 +217,7 @@ SYM_FUNC_START(hchacha_block_ssse3)
movdqu %xmm3,0x10(%rsi)
FRAME_END
- ret
+ RET
SYM_FUNC_END(hchacha_block_ssse3)
SYM_FUNC_START(chacha_4block_xor_ssse3)
@@ -762,7 +762,7 @@ SYM_FUNC_START(chacha_4block_xor_ssse3)
.Ldone4:
lea -8(%r10),%rsp
- ret
+ RET
.Lxorpart4:
# xor remaining bytes from partial register into output
diff --git a/arch/x86/crypto/crc32-pclmul_asm.S b/arch/x86/crypto/crc32-pclmul_asm.S
index 6e7d4c4d3208..c392a6edbfff 100644
--- a/arch/x86/crypto/crc32-pclmul_asm.S
+++ b/arch/x86/crypto/crc32-pclmul_asm.S
@@ -236,5 +236,5 @@ fold_64:
pxor %xmm2, %xmm1
pextrd $0x01, %xmm1, %eax
- ret
+ RET
SYM_FUNC_END(crc32_pclmul_le_16)
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index ac1f303eed0f..80c0d22fc42c 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -306,7 +306,7 @@ do_return:
popq %rsi
popq %rdi
popq %rbx
- ret
+ RET
SYM_FUNC_END(crc_pcl)
.section .rodata, "a", @progbits
diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S
index b2533d63030e..721474abfb71 100644
--- a/arch/x86/crypto/crct10dif-pcl-asm_64.S
+++ b/arch/x86/crypto/crct10dif-pcl-asm_64.S
@@ -257,7 +257,7 @@ SYM_FUNC_START(crc_t10dif_pcl)
# Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of xmm0.
pextrw $0, %xmm0, %eax
- ret
+ RET
.align 16
.Lless_than_256_bytes:
diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c
index 38caf61cd5b7..d55fa9e9b9e6 100644
--- a/arch/x86/crypto/curve25519-x86_64.c
+++ b/arch/x86/crypto/curve25519-x86_64.c
@@ -64,10 +64,9 @@ static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
/* Return the carry bit in a register */
" adcx %%r11, %1;"
- : "+&r" (f2), "=&r" (carry_r)
- : "r" (out), "r" (f1)
- : "%r8", "%r9", "%r10", "%r11", "memory", "cc"
- );
+ : "+&r"(f2), "=&r"(carry_r)
+ : "r"(out), "r"(f1)
+ : "%r8", "%r9", "%r10", "%r11", "memory", "cc");
return carry_r;
}
@@ -108,10 +107,9 @@ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
" cmovc %0, %%rax;"
" add %%rax, %%r8;"
" movq %%r8, 0(%1);"
- : "+&r" (f2)
- : "r" (out), "r" (f1)
- : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"
- );
+ : "+&r"(f2)
+ : "r"(out), "r"(f1)
+ : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
}
/* Computes the field subtraction of two field elements */
@@ -151,10 +149,9 @@ static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
" movq %%r9, 8(%0);"
" movq %%r10, 16(%0);"
" movq %%r11, 24(%0);"
- :
- : "r" (out), "r" (f1), "r" (f2)
- : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"
- );
+ :
+ : "r"(out), "r"(f1), "r"(f2)
+ : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
}
/* Computes a field multiplication: out <- f1 * f2
@@ -162,239 +159,400 @@ static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
{
asm volatile(
+
/* Compute the raw multiplication: tmp <- src1 * src2 */
/* Compute src1[0] * src2 */
- " movq 0(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 0(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
- " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
- " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
- " adox %%rdx, %%rax;"
+ " movq 0(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " movq %%r8, 0(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " movq %%r10, 8(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+
/* Compute src1[1] * src2 */
- " movq 8(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%0);"
- " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
- " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
- " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
+ " movq 8(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 8(%2), %%r8;"
+ " movq %%r8, 8(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 16(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
/* Compute src1[2] * src2 */
- " movq 16(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%0);"
- " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
- " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
- " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
+ " movq 16(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 16(%2), %%r8;"
+ " movq %%r8, 16(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 24(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
/* Compute src1[3] * src2 */
- " movq 24(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%0);"
- " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%0);" " mov $0, %%r8;"
- " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
- " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 56(%0);"
+ " movq 24(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 24(%2), %%r8;"
+ " movq %%r8, 24(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 32(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " movq %%rbx, 40(%2);"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " movq %%r14, 48(%2);"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+ " movq %%rax, 56(%2);"
+
/* Line up pointers */
- " mov %0, %1;"
" mov %2, %0;"
+ " mov %3, %2;"
/* Wrap the result back into the field */
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
- " mulxq 32(%1), %%r8, %%r13;"
- " xor %k3, %k3;"
- " adoxq 0(%1), %%r8;"
- " mulxq 40(%1), %%r9, %%rbx;"
+ " mulxq 32(%0), %%r8, %%r13;"
+ " xor %k1, %k1;"
+ " adoxq 0(%0), %%r8;"
+ " mulxq 40(%0), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
- " adoxq 8(%1), %%r9;"
- " mulxq 48(%1), %%r10, %%r13;"
+ " adoxq 8(%0), %%r9;"
+ " mulxq 48(%0), %%r10, %%r13;"
" adcx %%rbx, %%r10;"
- " adoxq 16(%1), %%r10;"
- " mulxq 56(%1), %%r11, %%rax;"
+ " adoxq 16(%0), %%r10;"
+ " mulxq 56(%0), %%r11, %%rax;"
" adcx %%r13, %%r11;"
- " adoxq 24(%1), %%r11;"
- " adcx %3, %%rax;"
- " adox %3, %%rax;"
+ " adoxq 24(%0), %%r11;"
+ " adcx %1, %%rax;"
+ " adox %1, %%rax;"
" imul %%rdx, %%rax;"
/* Step 2: Fold the carry back into dst */
" add %%rax, %%r8;"
- " adcx %3, %%r9;"
- " movq %%r9, 8(%0);"
- " adcx %3, %%r10;"
- " movq %%r10, 16(%0);"
- " adcx %3, %%r11;"
- " movq %%r11, 24(%0);"
+ " adcx %1, %%r9;"
+ " movq %%r9, 8(%2);"
+ " adcx %1, %%r10;"
+ " movq %%r10, 16(%2);"
+ " adcx %1, %%r11;"
+ " movq %%r11, 24(%2);"
/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
- " movq %%r8, 0(%0);"
- : "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
- :
- : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "memory", "cc"
- );
+ " movq %%r8, 0(%2);"
+ : "+&r"(f1), "+&r"(f2), "+&r"(tmp)
+ : "r"(out)
+ : "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13",
+ "%r14", "memory", "cc");
}
/* Computes two field multiplications:
- * out[0] <- f1[0] * f2[0]
- * out[1] <- f1[1] * f2[1]
- * Uses the 16-element buffer tmp for intermediate results. */
+ * out[0] <- f1[0] * f2[0]
+ * out[1] <- f1[1] * f2[1]
+ * Uses the 16-element buffer tmp for intermediate results: */
static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
{
asm volatile(
+
/* Compute the raw multiplication tmp[0] <- f1[0] * f2[0] */
/* Compute src1[0] * src2 */
- " movq 0(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 0(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
- " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
- " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
- " adox %%rdx, %%rax;"
+ " movq 0(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " movq %%r8, 0(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " movq %%r10, 8(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+
/* Compute src1[1] * src2 */
- " movq 8(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%0);"
- " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
- " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
- " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
+ " movq 8(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 8(%2), %%r8;"
+ " movq %%r8, 8(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 16(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
/* Compute src1[2] * src2 */
- " movq 16(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%0);"
- " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
- " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
- " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
+ " movq 16(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 16(%2), %%r8;"
+ " movq %%r8, 16(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 24(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
/* Compute src1[3] * src2 */
- " movq 24(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%0);"
- " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%0);" " mov $0, %%r8;"
- " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
- " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 56(%0);"
+ " movq 24(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 24(%2), %%r8;"
+ " movq %%r8, 24(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 32(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " movq %%rbx, 40(%2);"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " movq %%r14, 48(%2);"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+ " movq %%rax, 56(%2);"
/* Compute the raw multiplication tmp[1] <- f1[1] * f2[1] */
/* Compute src1[0] * src2 */
- " movq 32(%1), %%rdx;"
- " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 64(%0);"
- " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 72(%0);"
- " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
- " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
- " adox %%rdx, %%rax;"
+ " movq 32(%0), %%rdx;"
+ " mulxq 32(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " movq %%r8, 64(%2);"
+ " mulxq 40(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " movq %%r10, 72(%2);"
+ " mulxq 48(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " mulxq 56(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+
/* Compute src1[1] * src2 */
- " movq 40(%1), %%rdx;"
- " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 72(%0), %%r8;" " movq %%r8, 72(%0);"
- " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 80(%0);"
- " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
- " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
- " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
+ " movq 40(%0), %%rdx;"
+ " mulxq 32(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 72(%2), %%r8;"
+ " movq %%r8, 72(%2);"
+ " mulxq 40(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 80(%2);"
+ " mulxq 48(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 56(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
/* Compute src1[2] * src2 */
- " movq 48(%1), %%rdx;"
- " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 80(%0), %%r8;" " movq %%r8, 80(%0);"
- " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 88(%0);"
- " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
- " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
- " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
+ " movq 48(%0), %%rdx;"
+ " mulxq 32(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 80(%2), %%r8;"
+ " movq %%r8, 80(%2);"
+ " mulxq 40(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 88(%2);"
+ " mulxq 48(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 56(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
/* Compute src1[3] * src2 */
- " movq 56(%1), %%rdx;"
- " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 88(%0), %%r8;" " movq %%r8, 88(%0);"
- " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 96(%0);"
- " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 104(%0);" " mov $0, %%r8;"
- " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 112(%0);" " mov $0, %%rax;"
- " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 120(%0);"
+ " movq 56(%0), %%rdx;"
+ " mulxq 32(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 88(%2), %%r8;"
+ " movq %%r8, 88(%2);"
+ " mulxq 40(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 96(%2);"
+ " mulxq 48(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " movq %%rbx, 104(%2);"
+ " mov $0, %%r8;"
+ " mulxq 56(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " movq %%r14, 112(%2);"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+ " movq %%rax, 120(%2);"
+
/* Line up pointers */
- " mov %0, %1;"
" mov %2, %0;"
+ " mov %3, %2;"
/* Wrap the results back into the field */
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
- " mulxq 32(%1), %%r8, %%r13;"
- " xor %k3, %k3;"
- " adoxq 0(%1), %%r8;"
- " mulxq 40(%1), %%r9, %%rbx;"
+ " mulxq 32(%0), %%r8, %%r13;"
+ " xor %k1, %k1;"
+ " adoxq 0(%0), %%r8;"
+ " mulxq 40(%0), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
- " adoxq 8(%1), %%r9;"
- " mulxq 48(%1), %%r10, %%r13;"
+ " adoxq 8(%0), %%r9;"
+ " mulxq 48(%0), %%r10, %%r13;"
" adcx %%rbx, %%r10;"
- " adoxq 16(%1), %%r10;"
- " mulxq 56(%1), %%r11, %%rax;"
+ " adoxq 16(%0), %%r10;"
+ " mulxq 56(%0), %%r11, %%rax;"
" adcx %%r13, %%r11;"
- " adoxq 24(%1), %%r11;"
- " adcx %3, %%rax;"
- " adox %3, %%rax;"
+ " adoxq 24(%0), %%r11;"
+ " adcx %1, %%rax;"
+ " adox %1, %%rax;"
" imul %%rdx, %%rax;"
/* Step 2: Fold the carry back into dst */
" add %%rax, %%r8;"
- " adcx %3, %%r9;"
- " movq %%r9, 8(%0);"
- " adcx %3, %%r10;"
- " movq %%r10, 16(%0);"
- " adcx %3, %%r11;"
- " movq %%r11, 24(%0);"
+ " adcx %1, %%r9;"
+ " movq %%r9, 8(%2);"
+ " adcx %1, %%r10;"
+ " movq %%r10, 16(%2);"
+ " adcx %1, %%r11;"
+ " movq %%r11, 24(%2);"
/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
- " movq %%r8, 0(%0);"
+ " movq %%r8, 0(%2);"
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
- " mulxq 96(%1), %%r8, %%r13;"
- " xor %k3, %k3;"
- " adoxq 64(%1), %%r8;"
- " mulxq 104(%1), %%r9, %%rbx;"
+ " mulxq 96(%0), %%r8, %%r13;"
+ " xor %k1, %k1;"
+ " adoxq 64(%0), %%r8;"
+ " mulxq 104(%0), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
- " adoxq 72(%1), %%r9;"
- " mulxq 112(%1), %%r10, %%r13;"
+ " adoxq 72(%0), %%r9;"
+ " mulxq 112(%0), %%r10, %%r13;"
" adcx %%rbx, %%r10;"
- " adoxq 80(%1), %%r10;"
- " mulxq 120(%1), %%r11, %%rax;"
+ " adoxq 80(%0), %%r10;"
+ " mulxq 120(%0), %%r11, %%rax;"
" adcx %%r13, %%r11;"
- " adoxq 88(%1), %%r11;"
- " adcx %3, %%rax;"
- " adox %3, %%rax;"
+ " adoxq 88(%0), %%r11;"
+ " adcx %1, %%rax;"
+ " adox %1, %%rax;"
" imul %%rdx, %%rax;"
/* Step 2: Fold the carry back into dst */
" add %%rax, %%r8;"
- " adcx %3, %%r9;"
- " movq %%r9, 40(%0);"
- " adcx %3, %%r10;"
- " movq %%r10, 48(%0);"
- " adcx %3, %%r11;"
- " movq %%r11, 56(%0);"
+ " adcx %1, %%r9;"
+ " movq %%r9, 40(%2);"
+ " adcx %1, %%r10;"
+ " movq %%r10, 48(%2);"
+ " adcx %1, %%r11;"
+ " movq %%r11, 56(%2);"
/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
- " movq %%r8, 32(%0);"
- : "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
- :
- : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "memory", "cc"
- );
+ " movq %%r8, 32(%2);"
+ : "+&r"(f1), "+&r"(f2), "+&r"(tmp)
+ : "r"(out)
+ : "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13",
+ "%r14", "memory", "cc");
}
-/* Computes the field multiplication of four-element f1 with value in f2 */
+/* Computes the field multiplication of four-element f1 with value in f2
+ * Requires f2 to be smaller than 2^17 */
static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
{
register u64 f2_r asm("rdx") = f2;
asm volatile(
/* Compute the raw multiplication of f1*f2 */
- " mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */
- " mulxq 8(%2), %%r9, %%rbx;" /* f1[1]*f2 */
+ " mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */
+ " mulxq 8(%2), %%r9, %%rbx;" /* f1[1]*f2 */
" add %%rcx, %%r9;"
" mov $0, %%rcx;"
- " mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */
+ " mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */
" adcx %%rbx, %%r10;"
- " mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */
+ " mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */
" adcx %%r13, %%r11;"
" adcx %%rcx, %%rax;"
@@ -418,17 +576,17 @@ static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
" movq %%r8, 0(%1);"
- : "+&r" (f2_r)
- : "r" (out), "r" (f1)
- : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "memory", "cc"
- );
+ : "+&r"(f2_r)
+ : "r"(out), "r"(f1)
+ : "%rax", "%rbx", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r13",
+ "memory", "cc");
}
/* Computes p1 <- bit ? p2 : p1 in constant time */
static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
{
asm volatile(
- /* Invert the polarity of bit to match cmov expectations */
+ /* Transfer bit into CF flag */
" add $18446744073709551615, %0;"
/* cswap p1[0], p2[0] */
@@ -502,10 +660,9 @@ static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
" cmovc %%r10, %%r9;"
" movq %%r8, 56(%1);"
" movq %%r9, 56(%2);"
- : "+&r" (bit)
- : "r" (p1), "r" (p2)
- : "%r8", "%r9", "%r10", "memory", "cc"
- );
+ : "+&r"(bit)
+ : "r"(p1), "r"(p2)
+ : "%r8", "%r9", "%r10", "memory", "cc");
}
/* Computes the square of a field element: out <- f * f
@@ -516,15 +673,22 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
/* Compute the raw multiplication: tmp <- f * f */
/* Step 1: Compute all partial products */
- " movq 0(%1), %%rdx;" /* f[0] */
- " mulxq 8(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */
- " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
- " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
- " movq 24(%1), %%rdx;" /* f[3] */
- " mulxq 8(%1), %%r11, %%rbx;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
- " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%rbx;" /* f[2]*f[3] */
- " movq 8(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */
- " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
+ " movq 0(%0), %%rdx;" /* f[0] */
+ " mulxq 8(%0), %%r8, %%r14;"
+ " xor %%r15d, %%r15d;" /* f[1]*f[0] */
+ " mulxq 16(%0), %%r9, %%r10;"
+ " adcx %%r14, %%r9;" /* f[2]*f[0] */
+ " mulxq 24(%0), %%rax, %%rcx;"
+ " adcx %%rax, %%r10;" /* f[3]*f[0] */
+ " movq 24(%0), %%rdx;" /* f[3] */
+ " mulxq 8(%0), %%r11, %%rbx;"
+ " adcx %%rcx, %%r11;" /* f[1]*f[3] */
+ " mulxq 16(%0), %%rax, %%r13;"
+ " adcx %%rax, %%rbx;" /* f[2]*f[3] */
+ " movq 8(%0), %%rdx;"
+ " adcx %%r15, %%r13;" /* f1 */
+ " mulxq 16(%0), %%rax, %%rcx;"
+ " mov $0, %%r14;" /* f[2]*f[1] */
/* Step 2: Compute two parallel carry chains */
" xor %%r15d, %%r15d;"
@@ -542,39 +706,50 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
" adcx %%r14, %%r14;"
/* Step 3: Compute intermediate squares */
- " movq 0(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
- " movq %%rax, 0(%0);"
- " add %%rcx, %%r8;" " movq %%r8, 8(%0);"
- " movq 8(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
- " adcx %%rax, %%r9;" " movq %%r9, 16(%0);"
- " adcx %%rcx, %%r10;" " movq %%r10, 24(%0);"
- " movq 16(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
- " adcx %%rax, %%r11;" " movq %%r11, 32(%0);"
- " adcx %%rcx, %%rbx;" " movq %%rbx, 40(%0);"
- " movq 24(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
- " adcx %%rax, %%r13;" " movq %%r13, 48(%0);"
- " adcx %%rcx, %%r14;" " movq %%r14, 56(%0);"
+ " movq 0(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
+ " movq %%rax, 0(%1);"
+ " add %%rcx, %%r8;"
+ " movq %%r8, 8(%1);"
+ " movq 8(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
+ " adcx %%rax, %%r9;"
+ " movq %%r9, 16(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 24(%1);"
+ " movq 16(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
+ " adcx %%rax, %%r11;"
+ " movq %%r11, 32(%1);"
+ " adcx %%rcx, %%rbx;"
+ " movq %%rbx, 40(%1);"
+ " movq 24(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
+ " adcx %%rax, %%r13;"
+ " movq %%r13, 48(%1);"
+ " adcx %%rcx, %%r14;"
+ " movq %%r14, 56(%1);"
/* Line up pointers */
- " mov %0, %1;"
- " mov %2, %0;"
+ " mov %1, %0;"
+ " mov %2, %1;"
/* Wrap the result back into the field */
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
- " mulxq 32(%1), %%r8, %%r13;"
+ " mulxq 32(%0), %%r8, %%r13;"
" xor %%ecx, %%ecx;"
- " adoxq 0(%1), %%r8;"
- " mulxq 40(%1), %%r9, %%rbx;"
+ " adoxq 0(%0), %%r8;"
+ " mulxq 40(%0), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
- " adoxq 8(%1), %%r9;"
- " mulxq 48(%1), %%r10, %%r13;"
+ " adoxq 8(%0), %%r9;"
+ " mulxq 48(%0), %%r10, %%r13;"
" adcx %%rbx, %%r10;"
- " adoxq 16(%1), %%r10;"
- " mulxq 56(%1), %%r11, %%rax;"
+ " adoxq 16(%0), %%r10;"
+ " mulxq 56(%0), %%r11, %%rax;"
" adcx %%r13, %%r11;"
- " adoxq 24(%1), %%r11;"
+ " adoxq 24(%0), %%r11;"
" adcx %%rcx, %%rax;"
" adox %%rcx, %%rax;"
" imul %%rdx, %%rax;"
@@ -582,40 +757,47 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
/* Step 2: Fold the carry back into dst */
" add %%rax, %%r8;"
" adcx %%rcx, %%r9;"
- " movq %%r9, 8(%0);"
+ " movq %%r9, 8(%1);"
" adcx %%rcx, %%r10;"
- " movq %%r10, 16(%0);"
+ " movq %%r10, 16(%1);"
" adcx %%rcx, %%r11;"
- " movq %%r11, 24(%0);"
+ " movq %%r11, 24(%1);"
/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
- " movq %%r8, 0(%0);"
- : "+&r" (tmp), "+&r" (f), "+&r" (out)
- :
- : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "%r15", "memory", "cc"
- );
+ " movq %%r8, 0(%1);"
+ : "+&r"(f), "+&r"(tmp)
+ : "r"(out)
+ : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
+ "%r13", "%r14", "%r15", "memory", "cc");
}
/* Computes two field squarings:
- * out[0] <- f[0] * f[0]
- * out[1] <- f[1] * f[1]
+ * out[0] <- f[0] * f[0]
+ * out[1] <- f[1] * f[1]
* Uses the 16-element buffer tmp for intermediate results */
static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
{
asm volatile(
/* Step 1: Compute all partial products */
- " movq 0(%1), %%rdx;" /* f[0] */
- " mulxq 8(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */
- " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
- " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
- " movq 24(%1), %%rdx;" /* f[3] */
- " mulxq 8(%1), %%r11, %%rbx;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
- " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%rbx;" /* f[2]*f[3] */
- " movq 8(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */
- " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
+ " movq 0(%0), %%rdx;" /* f[0] */
+ " mulxq 8(%0), %%r8, %%r14;"
+ " xor %%r15d, %%r15d;" /* f[1]*f[0] */
+ " mulxq 16(%0), %%r9, %%r10;"
+ " adcx %%r14, %%r9;" /* f[2]*f[0] */
+ " mulxq 24(%0), %%rax, %%rcx;"
+ " adcx %%rax, %%r10;" /* f[3]*f[0] */
+ " movq 24(%0), %%rdx;" /* f[3] */
+ " mulxq 8(%0), %%r11, %%rbx;"
+ " adcx %%rcx, %%r11;" /* f[1]*f[3] */
+ " mulxq 16(%0), %%rax, %%r13;"
+ " adcx %%rax, %%rbx;" /* f[2]*f[3] */
+ " movq 8(%0), %%rdx;"
+ " adcx %%r15, %%r13;" /* f1 */
+ " mulxq 16(%0), %%rax, %%rcx;"
+ " mov $0, %%r14;" /* f[2]*f[1] */
/* Step 2: Compute two parallel carry chains */
" xor %%r15d, %%r15d;"
@@ -633,29 +815,47 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
" adcx %%r14, %%r14;"
/* Step 3: Compute intermediate squares */
- " movq 0(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
- " movq %%rax, 0(%0);"
- " add %%rcx, %%r8;" " movq %%r8, 8(%0);"
- " movq 8(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
- " adcx %%rax, %%r9;" " movq %%r9, 16(%0);"
- " adcx %%rcx, %%r10;" " movq %%r10, 24(%0);"
- " movq 16(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
- " adcx %%rax, %%r11;" " movq %%r11, 32(%0);"
- " adcx %%rcx, %%rbx;" " movq %%rbx, 40(%0);"
- " movq 24(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
- " adcx %%rax, %%r13;" " movq %%r13, 48(%0);"
- " adcx %%rcx, %%r14;" " movq %%r14, 56(%0);"
+ " movq 0(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
+ " movq %%rax, 0(%1);"
+ " add %%rcx, %%r8;"
+ " movq %%r8, 8(%1);"
+ " movq 8(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
+ " adcx %%rax, %%r9;"
+ " movq %%r9, 16(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 24(%1);"
+ " movq 16(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
+ " adcx %%rax, %%r11;"
+ " movq %%r11, 32(%1);"
+ " adcx %%rcx, %%rbx;"
+ " movq %%rbx, 40(%1);"
+ " movq 24(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
+ " adcx %%rax, %%r13;"
+ " movq %%r13, 48(%1);"
+ " adcx %%rcx, %%r14;"
+ " movq %%r14, 56(%1);"
/* Step 1: Compute all partial products */
- " movq 32(%1), %%rdx;" /* f[0] */
- " mulxq 40(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */
- " mulxq 48(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
- " mulxq 56(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
- " movq 56(%1), %%rdx;" /* f[3] */
- " mulxq 40(%1), %%r11, %%rbx;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
- " mulxq 48(%1), %%rax, %%r13;" " adcx %%rax, %%rbx;" /* f[2]*f[3] */
- " movq 40(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */
- " mulxq 48(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
+ " movq 32(%0), %%rdx;" /* f[0] */
+ " mulxq 40(%0), %%r8, %%r14;"
+ " xor %%r15d, %%r15d;" /* f[1]*f[0] */
+ " mulxq 48(%0), %%r9, %%r10;"
+ " adcx %%r14, %%r9;" /* f[2]*f[0] */
+ " mulxq 56(%0), %%rax, %%rcx;"
+ " adcx %%rax, %%r10;" /* f[3]*f[0] */
+ " movq 56(%0), %%rdx;" /* f[3] */
+ " mulxq 40(%0), %%r11, %%rbx;"
+ " adcx %%rcx, %%r11;" /* f[1]*f[3] */
+ " mulxq 48(%0), %%rax, %%r13;"
+ " adcx %%rax, %%rbx;" /* f[2]*f[3] */
+ " movq 40(%0), %%rdx;"
+ " adcx %%r15, %%r13;" /* f1 */
+ " mulxq 48(%0), %%rax, %%rcx;"
+ " mov $0, %%r14;" /* f[2]*f[1] */
/* Step 2: Compute two parallel carry chains */
" xor %%r15d, %%r15d;"
@@ -673,37 +873,48 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
" adcx %%r14, %%r14;"
/* Step 3: Compute intermediate squares */
- " movq 32(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
- " movq %%rax, 64(%0);"
- " add %%rcx, %%r8;" " movq %%r8, 72(%0);"
- " movq 40(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
- " adcx %%rax, %%r9;" " movq %%r9, 80(%0);"
- " adcx %%rcx, %%r10;" " movq %%r10, 88(%0);"
- " movq 48(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
- " adcx %%rax, %%r11;" " movq %%r11, 96(%0);"
- " adcx %%rcx, %%rbx;" " movq %%rbx, 104(%0);"
- " movq 56(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
- " adcx %%rax, %%r13;" " movq %%r13, 112(%0);"
- " adcx %%rcx, %%r14;" " movq %%r14, 120(%0);"
+ " movq 32(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
+ " movq %%rax, 64(%1);"
+ " add %%rcx, %%r8;"
+ " movq %%r8, 72(%1);"
+ " movq 40(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
+ " adcx %%rax, %%r9;"
+ " movq %%r9, 80(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 88(%1);"
+ " movq 48(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
+ " adcx %%rax, %%r11;"
+ " movq %%r11, 96(%1);"
+ " adcx %%rcx, %%rbx;"
+ " movq %%rbx, 104(%1);"
+ " movq 56(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
+ " adcx %%rax, %%r13;"
+ " movq %%r13, 112(%1);"
+ " adcx %%rcx, %%r14;"
+ " movq %%r14, 120(%1);"
/* Line up pointers */
- " mov %0, %1;"
- " mov %2, %0;"
+ " mov %1, %0;"
+ " mov %2, %1;"
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
- " mulxq 32(%1), %%r8, %%r13;"
+ " mulxq 32(%0), %%r8, %%r13;"
" xor %%ecx, %%ecx;"
- " adoxq 0(%1), %%r8;"
- " mulxq 40(%1), %%r9, %%rbx;"
+ " adoxq 0(%0), %%r8;"
+ " mulxq 40(%0), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
- " adoxq 8(%1), %%r9;"
- " mulxq 48(%1), %%r10, %%r13;"
+ " adoxq 8(%0), %%r9;"
+ " mulxq 48(%0), %%r10, %%r13;"
" adcx %%rbx, %%r10;"
- " adoxq 16(%1), %%r10;"
- " mulxq 56(%1), %%r11, %%rax;"
+ " adoxq 16(%0), %%r10;"
+ " mulxq 56(%0), %%r11, %%rax;"
" adcx %%r13, %%r11;"
- " adoxq 24(%1), %%r11;"
+ " adoxq 24(%0), %%r11;"
" adcx %%rcx, %%rax;"
" adox %%rcx, %%rax;"
" imul %%rdx, %%rax;"
@@ -711,32 +922,32 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
/* Step 2: Fold the carry back into dst */
" add %%rax, %%r8;"
" adcx %%rcx, %%r9;"
- " movq %%r9, 8(%0);"
+ " movq %%r9, 8(%1);"
" adcx %%rcx, %%r10;"
- " movq %%r10, 16(%0);"
+ " movq %%r10, 16(%1);"
" adcx %%rcx, %%r11;"
- " movq %%r11, 24(%0);"
+ " movq %%r11, 24(%1);"
/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
- " movq %%r8, 0(%0);"
+ " movq %%r8, 0(%1);"
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
- " mulxq 96(%1), %%r8, %%r13;"
+ " mulxq 96(%0), %%r8, %%r13;"
" xor %%ecx, %%ecx;"
- " adoxq 64(%1), %%r8;"
- " mulxq 104(%1), %%r9, %%rbx;"
+ " adoxq 64(%0), %%r8;"
+ " mulxq 104(%0), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
- " adoxq 72(%1), %%r9;"
- " mulxq 112(%1), %%r10, %%r13;"
+ " adoxq 72(%0), %%r9;"
+ " mulxq 112(%0), %%r10, %%r13;"
" adcx %%rbx, %%r10;"
- " adoxq 80(%1), %%r10;"
- " mulxq 120(%1), %%r11, %%rax;"
+ " adoxq 80(%0), %%r10;"
+ " mulxq 120(%0), %%r11, %%rax;"
" adcx %%r13, %%r11;"
- " adoxq 88(%1), %%r11;"
+ " adoxq 88(%0), %%r11;"
" adcx %%rcx, %%rax;"
" adox %%rcx, %%rax;"
" imul %%rdx, %%rax;"
@@ -744,21 +955,21 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
/* Step 2: Fold the carry back into dst */
" add %%rax, %%r8;"
" adcx %%rcx, %%r9;"
- " movq %%r9, 40(%0);"
+ " movq %%r9, 40(%1);"
" adcx %%rcx, %%r10;"
- " movq %%r10, 48(%0);"
+ " movq %%r10, 48(%1);"
" adcx %%rcx, %%r11;"
- " movq %%r11, 56(%0);"
+ " movq %%r11, 56(%1);"
/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
- " movq %%r8, 32(%0);"
- : "+&r" (tmp), "+&r" (f), "+&r" (out)
- :
- : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "%r15", "memory", "cc"
- );
+ " movq %%r8, 32(%1);"
+ : "+&r"(f), "+&r"(tmp)
+ : "r"(out)
+ : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
+ "%r13", "%r14", "%r15", "memory", "cc");
}
static void point_add_and_double(u64 *q, u64 *p01_tmp1, u64 *tmp2)
diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S
index fac0fdc3f25d..f4c760f4cade 100644
--- a/arch/x86/crypto/des3_ede-asm_64.S
+++ b/arch/x86/crypto/des3_ede-asm_64.S
@@ -243,7 +243,7 @@ SYM_FUNC_START(des3_ede_x86_64_crypt_blk)
popq %r12;
popq %rbx;
- ret;
+ RET;
SYM_FUNC_END(des3_ede_x86_64_crypt_blk)
/***********************************************************************
@@ -528,7 +528,7 @@ SYM_FUNC_START(des3_ede_x86_64_crypt_blk_3way)
popq %r12;
popq %rbx;
- ret;
+ RET;
SYM_FUNC_END(des3_ede_x86_64_crypt_blk_3way)
.section .rodata, "a", @progbits
diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c
index e7cb68a3db3b..787c234d2469 100644
--- a/arch/x86/crypto/des3_ede_glue.c
+++ b/arch/x86/crypto/des3_ede_glue.c
@@ -164,7 +164,7 @@ static int cbc_encrypt(struct skcipher_request *req)
err = skcipher_walk_virt(&walk, req, false);
- while ((nbytes = walk.nbytes)) {
+ while (walk.nbytes) {
nbytes = __cbc_encrypt(ctx, &walk);
err = skcipher_walk_done(&walk, nbytes);
}
@@ -243,7 +243,7 @@ static int cbc_decrypt(struct skcipher_request *req)
err = skcipher_walk_virt(&walk, req, false);
- while ((nbytes = walk.nbytes)) {
+ while (walk.nbytes) {
nbytes = __cbc_decrypt(ctx, &walk);
err = skcipher_walk_done(&walk, nbytes);
}
diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S
index 99ac25e18e09..2bf871899920 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_asm.S
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -85,7 +85,7 @@ SYM_FUNC_START_LOCAL(__clmul_gf128mul_ble)
psrlq $1, T2
pxor T2, T1
pxor T1, DATA
- ret
+ RET
SYM_FUNC_END(__clmul_gf128mul_ble)
/* void clmul_ghash_mul(char *dst, const u128 *shash) */
@@ -99,7 +99,7 @@ SYM_FUNC_START(clmul_ghash_mul)
pshufb BSWAP, DATA
movups DATA, (%rdi)
FRAME_END
- ret
+ RET
SYM_FUNC_END(clmul_ghash_mul)
/*
@@ -128,5 +128,5 @@ SYM_FUNC_START(clmul_ghash_update)
movups DATA, (%rdi)
.Lupdate_just_ret:
FRAME_END
- ret
+ RET
SYM_FUNC_END(clmul_ghash_update)
diff --git a/arch/x86/crypto/nh-avx2-x86_64.S b/arch/x86/crypto/nh-avx2-x86_64.S
index b22c7b936272..6a0b15e7196a 100644
--- a/arch/x86/crypto/nh-avx2-x86_64.S
+++ b/arch/x86/crypto/nh-avx2-x86_64.S
@@ -153,5 +153,5 @@ SYM_FUNC_START(nh_avx2)
vpaddq T1, T0, T0
vpaddq T4, T0, T0
vmovdqu T0, (HASH)
- ret
+ RET
SYM_FUNC_END(nh_avx2)
diff --git a/arch/x86/crypto/nh-sse2-x86_64.S b/arch/x86/crypto/nh-sse2-x86_64.S
index d7ae22dd6683..34c567bbcb4f 100644
--- a/arch/x86/crypto/nh-sse2-x86_64.S
+++ b/arch/x86/crypto/nh-sse2-x86_64.S
@@ -119,5 +119,5 @@ SYM_FUNC_START(nh_sse2)
paddq PASS2_SUMS, T1
movdqu T0, 0x00(HASH)
movdqu T1, 0x10(HASH)
- ret
+ RET
SYM_FUNC_END(nh_sse2)
diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
index b7ee24df7fba..82f2313f512b 100644
--- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
@@ -601,7 +601,7 @@ SYM_FUNC_START_LOCAL(__serpent_enc_blk8_avx)
write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
- ret;
+ RET;
SYM_FUNC_END(__serpent_enc_blk8_avx)
.align 8
@@ -655,7 +655,7 @@ SYM_FUNC_START_LOCAL(__serpent_dec_blk8_avx)
write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);
- ret;
+ RET;
SYM_FUNC_END(__serpent_dec_blk8_avx)
SYM_FUNC_START(serpent_ecb_enc_8way_avx)
@@ -673,7 +673,7 @@ SYM_FUNC_START(serpent_ecb_enc_8way_avx)
store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(serpent_ecb_enc_8way_avx)
SYM_FUNC_START(serpent_ecb_dec_8way_avx)
@@ -691,7 +691,7 @@ SYM_FUNC_START(serpent_ecb_dec_8way_avx)
store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(serpent_ecb_dec_8way_avx)
SYM_FUNC_START(serpent_cbc_dec_8way_avx)
@@ -709,5 +709,5 @@ SYM_FUNC_START(serpent_cbc_dec_8way_avx)
store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(serpent_cbc_dec_8way_avx)
diff --git a/arch/x86/crypto/serpent-avx2-asm_64.S b/arch/x86/crypto/serpent-avx2-asm_64.S
index 9161b6e441f3..8ea34c9b9316 100644
--- a/arch/x86/crypto/serpent-avx2-asm_64.S
+++ b/arch/x86/crypto/serpent-avx2-asm_64.S
@@ -601,7 +601,7 @@ SYM_FUNC_START_LOCAL(__serpent_enc_blk16)
write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
- ret;
+ RET;
SYM_FUNC_END(__serpent_enc_blk16)
.align 8
@@ -655,7 +655,7 @@ SYM_FUNC_START_LOCAL(__serpent_dec_blk16)
write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);
- ret;
+ RET;
SYM_FUNC_END(__serpent_dec_blk16)
SYM_FUNC_START(serpent_ecb_enc_16way)
@@ -677,7 +677,7 @@ SYM_FUNC_START(serpent_ecb_enc_16way)
vzeroupper;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(serpent_ecb_enc_16way)
SYM_FUNC_START(serpent_ecb_dec_16way)
@@ -699,7 +699,7 @@ SYM_FUNC_START(serpent_ecb_dec_16way)
vzeroupper;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(serpent_ecb_dec_16way)
SYM_FUNC_START(serpent_cbc_dec_16way)
@@ -722,5 +722,5 @@ SYM_FUNC_START(serpent_cbc_dec_16way)
vzeroupper;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(serpent_cbc_dec_16way)
diff --git a/arch/x86/crypto/serpent-sse2-i586-asm_32.S b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
index 6379b99cb722..8ccb03ad7cef 100644
--- a/arch/x86/crypto/serpent-sse2-i586-asm_32.S
+++ b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
@@ -553,12 +553,12 @@ SYM_FUNC_START(__serpent_enc_blk_4way)
write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
- ret;
+ RET;
.L__enc_xor4:
xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
- ret;
+ RET;
SYM_FUNC_END(__serpent_enc_blk_4way)
SYM_FUNC_START(serpent_dec_blk_4way)
@@ -612,5 +612,5 @@ SYM_FUNC_START(serpent_dec_blk_4way)
movl arg_dst(%esp), %eax;
write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA);
- ret;
+ RET;
SYM_FUNC_END(serpent_dec_blk_4way)
diff --git a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
index efb6dc17dc90..e0998a011d1d 100644
--- a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
+++ b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
@@ -675,13 +675,13 @@ SYM_FUNC_START(__serpent_enc_blk_8way)
write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
- ret;
+ RET;
.L__enc_xor8:
xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
- ret;
+ RET;
SYM_FUNC_END(__serpent_enc_blk_8way)
SYM_FUNC_START(serpent_dec_blk_8way)
@@ -735,5 +735,5 @@ SYM_FUNC_START(serpent_dec_blk_8way)
write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
- ret;
+ RET;
SYM_FUNC_END(serpent_dec_blk_8way)
diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
index 5eed620f4676..a96b2fd26dab 100644
--- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S
+++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
@@ -674,7 +674,7 @@ _loop3:
pop %r12
pop %rbx
- ret
+ RET
SYM_FUNC_END(\name)
.endm
diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S
index 5d8415f482bd..2f94ec0e763b 100644
--- a/arch/x86/crypto/sha1_ni_asm.S
+++ b/arch/x86/crypto/sha1_ni_asm.S
@@ -290,7 +290,7 @@ SYM_FUNC_START(sha1_ni_transform)
mov %rbp, %rsp
pop %rbp
- ret
+ RET
SYM_FUNC_END(sha1_ni_transform)
.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
index d25668d2a1e9..263f916362e0 100644
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ b/arch/x86/crypto/sha1_ssse3_asm.S
@@ -99,7 +99,7 @@
pop %rbp
pop %r12
pop %rbx
- ret
+ RET
SYM_FUNC_END(\name)
.endm
diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S
index 4739cd31b9db..3baa1ec39097 100644
--- a/arch/x86/crypto/sha256-avx-asm.S
+++ b/arch/x86/crypto/sha256-avx-asm.S
@@ -458,7 +458,7 @@ done_hash:
popq %r13
popq %r12
popq %rbx
- ret
+ RET
SYM_FUNC_END(sha256_transform_avx)
.section .rodata.cst256.K256, "aM", @progbits, 256
diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S
index 4087f7432a7e..9bcdbc47b8b4 100644
--- a/arch/x86/crypto/sha256-avx2-asm.S
+++ b/arch/x86/crypto/sha256-avx2-asm.S
@@ -710,7 +710,7 @@ done_hash:
popq %r13
popq %r12
popq %rbx
- ret
+ RET
SYM_FUNC_END(sha256_transform_rorx)
.section .rodata.cst512.K256, "aM", @progbits, 512
diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S
index ddfa863b4ee3..c4a5db612c32 100644
--- a/arch/x86/crypto/sha256-ssse3-asm.S
+++ b/arch/x86/crypto/sha256-ssse3-asm.S
@@ -472,7 +472,7 @@ done_hash:
popq %r12
popq %rbx
- ret
+ RET
SYM_FUNC_END(sha256_transform_ssse3)
.section .rodata.cst256.K256, "aM", @progbits, 256
diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S
index 7abade04a3a3..94d50dd27cb5 100644
--- a/arch/x86/crypto/sha256_ni_asm.S
+++ b/arch/x86/crypto/sha256_ni_asm.S
@@ -326,7 +326,7 @@ SYM_FUNC_START(sha256_ni_transform)
.Ldone_hash:
- ret
+ RET
SYM_FUNC_END(sha256_ni_transform)
.section .rodata.cst256.K256, "aM", @progbits, 256
diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S
index 3d8f0fd4eea8..1fefe6dd3a9e 100644
--- a/arch/x86/crypto/sha512-avx-asm.S
+++ b/arch/x86/crypto/sha512-avx-asm.S
@@ -361,7 +361,7 @@ updateblock:
pop %rbx
nowork:
- ret
+ RET
SYM_FUNC_END(sha512_transform_avx)
########################################################################
diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S
index 072cb0f0deae..5cdaab7d6901 100644
--- a/arch/x86/crypto/sha512-avx2-asm.S
+++ b/arch/x86/crypto/sha512-avx2-asm.S
@@ -679,7 +679,7 @@ done_hash:
pop %r12
pop %rbx
- ret
+ RET
SYM_FUNC_END(sha512_transform_rorx)
########################################################################
diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S
index bd51c9070bed..b84c22e06c5f 100644
--- a/arch/x86/crypto/sha512-ssse3-asm.S
+++ b/arch/x86/crypto/sha512-ssse3-asm.S
@@ -363,7 +363,7 @@ updateblock:
pop %rbx
nowork:
- ret
+ RET
SYM_FUNC_END(sha512_transform_ssse3)
########################################################################
diff --git a/arch/x86/crypto/sm4-aesni-avx-asm_64.S b/arch/x86/crypto/sm4-aesni-avx-asm_64.S
index 18d2f5199194..4767ab61ff48 100644
--- a/arch/x86/crypto/sm4-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/sm4-aesni-avx-asm_64.S
@@ -78,7 +78,7 @@
vpxor tmp0, x, x;
-.section .rodata.cst164, "aM", @progbits, 164
+.section .rodata.cst16, "aM", @progbits, 16
.align 16
/*
@@ -133,6 +133,10 @@
.L0f0f0f0f:
.long 0x0f0f0f0f
+/* 12 bytes, only for padding */
+.Lpadding_deadbeef:
+ .long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef
+
.text
.align 16
@@ -242,7 +246,7 @@ SYM_FUNC_START(sm4_aesni_avx_crypt4)
.Lblk4_store_output_done:
vzeroall;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(sm4_aesni_avx_crypt4)
.align 8
@@ -352,7 +356,7 @@ SYM_FUNC_START_LOCAL(__sm4_crypt_blk8)
vpshufb RTMP2, RB3, RB3;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(__sm4_crypt_blk8)
/*
@@ -408,7 +412,7 @@ SYM_FUNC_START(sm4_aesni_avx_crypt8)
.Lblk8_store_output_done:
vzeroall;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(sm4_aesni_avx_crypt8)
/*
@@ -483,7 +487,7 @@ SYM_FUNC_START(sm4_aesni_avx_ctr_enc_blk8)
vzeroall;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(sm4_aesni_avx_ctr_enc_blk8)
/*
@@ -533,7 +537,7 @@ SYM_FUNC_START(sm4_aesni_avx_cbc_dec_blk8)
vzeroall;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(sm4_aesni_avx_cbc_dec_blk8)
/*
@@ -586,5 +590,5 @@ SYM_FUNC_START(sm4_aesni_avx_cfb_dec_blk8)
vzeroall;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(sm4_aesni_avx_cfb_dec_blk8)
diff --git a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S
index d2ffd7f76ee2..4732fe8bb65b 100644
--- a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S
@@ -93,7 +93,7 @@
vpxor tmp0, x, x;
-.section .rodata.cst164, "aM", @progbits, 164
+.section .rodata.cst16, "aM", @progbits, 16
.align 16
/*
@@ -148,6 +148,10 @@
.L0f0f0f0f:
.long 0x0f0f0f0f
+/* 12 bytes, only for padding */
+.Lpadding_deadbeef:
+ .long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef
+
.text
.align 16
@@ -264,7 +268,7 @@ SYM_FUNC_START_LOCAL(__sm4_crypt_blk16)
vpshufb RTMP2, RB3, RB3;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(__sm4_crypt_blk16)
#define inc_le128(x, minus_one, tmp) \
@@ -383,7 +387,7 @@ SYM_FUNC_START(sm4_aesni_avx2_ctr_enc_blk16)
vzeroall;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(sm4_aesni_avx2_ctr_enc_blk16)
/*
@@ -437,7 +441,7 @@ SYM_FUNC_START(sm4_aesni_avx2_cbc_dec_blk16)
vzeroall;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(sm4_aesni_avx2_cbc_dec_blk16)
/*
@@ -493,5 +497,5 @@ SYM_FUNC_START(sm4_aesni_avx2_cfb_dec_blk16)
vzeroall;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(sm4_aesni_avx2_cfb_dec_blk16)
diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index 37e63b3c664e..31f9b2ec3857 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -267,7 +267,7 @@ SYM_FUNC_START_LOCAL(__twofish_enc_blk8)
outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
- ret;
+ RET;
SYM_FUNC_END(__twofish_enc_blk8)
.align 8
@@ -307,7 +307,7 @@ SYM_FUNC_START_LOCAL(__twofish_dec_blk8)
outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
- ret;
+ RET;
SYM_FUNC_END(__twofish_dec_blk8)
SYM_FUNC_START(twofish_ecb_enc_8way)
@@ -327,7 +327,7 @@ SYM_FUNC_START(twofish_ecb_enc_8way)
store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(twofish_ecb_enc_8way)
SYM_FUNC_START(twofish_ecb_dec_8way)
@@ -347,7 +347,7 @@ SYM_FUNC_START(twofish_ecb_dec_8way)
store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(twofish_ecb_dec_8way)
SYM_FUNC_START(twofish_cbc_dec_8way)
@@ -372,5 +372,5 @@ SYM_FUNC_START(twofish_cbc_dec_8way)
popq %r12;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(twofish_cbc_dec_8way)
diff --git a/arch/x86/crypto/twofish-i586-asm_32.S b/arch/x86/crypto/twofish-i586-asm_32.S
index a6f09e4f2e46..3abcad661884 100644
--- a/arch/x86/crypto/twofish-i586-asm_32.S
+++ b/arch/x86/crypto/twofish-i586-asm_32.S
@@ -260,7 +260,7 @@ SYM_FUNC_START(twofish_enc_blk)
pop %ebx
pop %ebp
mov $1, %eax
- ret
+ RET
SYM_FUNC_END(twofish_enc_blk)
SYM_FUNC_START(twofish_dec_blk)
@@ -317,5 +317,5 @@ SYM_FUNC_START(twofish_dec_blk)
pop %ebx
pop %ebp
mov $1, %eax
- ret
+ RET
SYM_FUNC_END(twofish_dec_blk)
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
index bca4cea757ce..d2288bf38a8a 100644
--- a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
+++ b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
@@ -258,7 +258,7 @@ SYM_FUNC_START(__twofish_enc_blk_3way)
popq %rbx;
popq %r12;
popq %r13;
- ret;
+ RET;
.L__enc_xor3:
outunpack_enc3(xor);
@@ -266,7 +266,7 @@ SYM_FUNC_START(__twofish_enc_blk_3way)
popq %rbx;
popq %r12;
popq %r13;
- ret;
+ RET;
SYM_FUNC_END(__twofish_enc_blk_3way)
SYM_FUNC_START(twofish_dec_blk_3way)
@@ -301,5 +301,5 @@ SYM_FUNC_START(twofish_dec_blk_3way)
popq %rbx;
popq %r12;
popq %r13;
- ret;
+ RET;
SYM_FUNC_END(twofish_dec_blk_3way)
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S
index d2e56232494a..775af290cd19 100644
--- a/arch/x86/crypto/twofish-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-x86_64-asm_64.S
@@ -252,7 +252,7 @@ SYM_FUNC_START(twofish_enc_blk)
popq R1
movl $1,%eax
- ret
+ RET
SYM_FUNC_END(twofish_enc_blk)
SYM_FUNC_START(twofish_dec_blk)
@@ -304,5 +304,5 @@ SYM_FUNC_START(twofish_dec_blk)
popq R1
movl $1,%eax
- ret
+ RET
SYM_FUNC_END(twofish_dec_blk)
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index ccb9d32768f3..a7ec22b1d06c 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -268,19 +268,16 @@
1: popl %ds
2: popl %es
3: popl %fs
- addl $(4 + \pop), %esp /* pop the unused "gs" slot */
+4: addl $(4 + \pop), %esp /* pop the unused "gs" slot */
IRET_FRAME
-.pushsection .fixup, "ax"
-4: movl $0, (%esp)
- jmp 1b
-5: movl $0, (%esp)
- jmp 2b
-6: movl $0, (%esp)
- jmp 3b
-.popsection
- _ASM_EXTABLE(1b, 4b)
- _ASM_EXTABLE(2b, 5b)
- _ASM_EXTABLE(3b, 6b)
+
+ /*
+ * There is no _ASM_EXTABLE_TYPE_REG() for ASM, however since this is
+ * ASM the registers are known and we can trivially hard-code them.
+ */
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_POP_ZERO|EX_REG_DS)
+ _ASM_EXTABLE_TYPE(2b, 3b, EX_TYPE_POP_ZERO|EX_REG_ES)
+ _ASM_EXTABLE_TYPE(3b, 4b, EX_TYPE_POP_ZERO|EX_REG_FS)
.endm
.macro RESTORE_ALL_NMI cr3_reg:req pop=0
@@ -740,7 +737,7 @@ SYM_FUNC_START(schedule_tail_wrapper)
popl %eax
FRAME_END
- ret
+ RET
SYM_FUNC_END(schedule_tail_wrapper)
.popsection
@@ -925,10 +922,8 @@ SYM_FUNC_START(entry_SYSENTER_32)
sti
sysexit
-.pushsection .fixup, "ax"
-2: movl $0, PT_FS(%esp)
- jmp 1b
-.popsection
+2: movl $0, PT_FS(%esp)
+ jmp 1b
_ASM_EXTABLE(1b, 2b)
.Lsysenter_fix_flags:
@@ -996,8 +991,7 @@ restore_all_switch_stack:
*/
iret
-.section .fixup, "ax"
-SYM_CODE_START(asm_iret_error)
+.Lasm_iret_error:
pushl $0 # no error code
pushl $iret_error
@@ -1014,9 +1008,8 @@ SYM_CODE_START(asm_iret_error)
#endif
jmp handle_exception
-SYM_CODE_END(asm_iret_error)
-.previous
- _ASM_EXTABLE(.Lirq_return, asm_iret_error)
+
+ _ASM_EXTABLE(.Lirq_return, .Lasm_iret_error)
SYM_FUNC_END(entry_INT80_32)
.macro FIXUP_ESPFIX_STACK
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index e38a4cf795d9..1ffdbfaad2e2 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -574,6 +574,10 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
ud2
1:
#endif
+#ifdef CONFIG_XEN_PV
+ ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
+#endif
+
POP_REGS pop_rdi=0
/*
@@ -734,14 +738,10 @@ SYM_FUNC_START(asm_load_gs_index)
2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE
swapgs
FRAME_END
- ret
-SYM_FUNC_END(asm_load_gs_index)
-EXPORT_SYMBOL(asm_load_gs_index)
+ RET
- _ASM_EXTABLE(.Lgs_change, .Lbad_gs)
- .section .fixup, "ax"
/* running with kernelgs */
-SYM_CODE_START_LOCAL_NOALIGN(.Lbad_gs)
+.Lbad_gs:
swapgs /* switch back to user gs */
.macro ZAP_GS
/* This can't be a string because the preprocessor needs to see it. */
@@ -752,8 +752,11 @@ SYM_CODE_START_LOCAL_NOALIGN(.Lbad_gs)
xorl %eax, %eax
movl %eax, %gs
jmp 2b
-SYM_CODE_END(.Lbad_gs)
- .previous
+
+ _ASM_EXTABLE(.Lgs_change, .Lbad_gs)
+
+SYM_FUNC_END(asm_load_gs_index)
+EXPORT_SYMBOL(asm_load_gs_index)
#ifdef CONFIG_XEN_PV
/*
@@ -885,11 +888,12 @@ SYM_CODE_START_LOCAL(paranoid_entry)
* is needed here.
*/
SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx
- ret
+ RET
.Lparanoid_entry_checkgs:
/* EBX = 1 -> kernel GSBASE active, no restore required */
movl $1, %ebx
+
/*
* The kernel-enforced convention is a negative GSBASE indicates
* a kernel value. No SWAPGS needed on entry and exit.
@@ -897,22 +901,15 @@ SYM_CODE_START_LOCAL(paranoid_entry)
movl $MSR_GS_BASE, %ecx
rdmsr
testl %edx, %edx
- jns .Lparanoid_entry_swapgs
- ret
+ js .Lparanoid_kernel_gsbase
-.Lparanoid_entry_swapgs:
+ /* EBX = 0 -> SWAPGS required on exit */
+ xorl %ebx, %ebx
swapgs
+.Lparanoid_kernel_gsbase:
- /*
- * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an
- * unconditional CR3 write, even in the PTI case. So do an lfence
- * to prevent GS speculation, regardless of whether PTI is enabled.
- */
FENCE_SWAPGS_KERNEL_ENTRY
-
- /* EBX = 0 -> SWAPGS required on exit */
- xorl %ebx, %ebx
- ret
+ RET
SYM_CODE_END(paranoid_entry)
/*
@@ -991,12 +988,7 @@ SYM_CODE_START_LOCAL(error_entry)
movq %rax, %rsp /* switch stack */
ENCODE_FRAME_POINTER
pushq %r12
- ret
-
-.Lerror_entry_done_lfence:
- FENCE_SWAPGS_KERNEL_ENTRY
-.Lerror_entry_done:
- ret
+ RET
/*
* There are two places in the kernel that can potentially fault with
@@ -1020,8 +1012,14 @@ SYM_CODE_START_LOCAL(error_entry)
* .Lgs_change's error handler with kernel gsbase.
*/
SWAPGS
- FENCE_SWAPGS_USER_ENTRY
- jmp .Lerror_entry_done
+
+ /*
+ * Issue an LFENCE to prevent GS speculation, regardless of whether it is a
+ * kernel or user gsbase.
+ */
+.Lerror_entry_done_lfence:
+ FENCE_SWAPGS_KERNEL_ENTRY
+ RET
.Lbstep_iret:
/* Fix truncated RIP */
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 960a021d543e..320480a8db4f 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -453,3 +453,5 @@
446 i386 landlock_restrict_self sys_landlock_restrict_self
447 i386 memfd_secret sys_memfd_secret
448 i386 process_mrelease sys_process_mrelease
+449 i386 futex_waitv sys_futex_waitv
+450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 18b5500ea8bf..c84d12608cd2 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -370,6 +370,8 @@
446 common landlock_restrict_self sys_landlock_restrict_self
447 common memfd_secret sys_memfd_secret
448 common process_mrelease sys_process_mrelease
+449 common futex_waitv sys_futex_waitv
+450 common set_mempolicy_home_node sys_set_mempolicy_home_node
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S
index f1f96d4d8cd6..7591bab060f7 100644
--- a/arch/x86/entry/thunk_32.S
+++ b/arch/x86/entry/thunk_32.S
@@ -24,7 +24,7 @@ SYM_CODE_START_NOALIGN(\name)
popl %edx
popl %ecx
popl %eax
- ret
+ RET
_ASM_NOKPROBE(\name)
SYM_CODE_END(\name)
.endm
diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S
index 496b11ec469d..505b488fcc65 100644
--- a/arch/x86/entry/thunk_64.S
+++ b/arch/x86/entry/thunk_64.S
@@ -50,7 +50,7 @@ SYM_CODE_START_LOCAL_NOALIGN(__thunk_restore)
popq %rsi
popq %rdi
popq %rbp
- ret
+ RET
_ASM_NOKPROBE(__thunk_restore)
SYM_CODE_END(__thunk_restore)
#endif
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index a2dddcc189f6..693f8b9031fb 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -172,7 +172,7 @@ $(obj)/vdso32.so.dbg: $(obj)/vdso32/vdso32.lds $(vobjs32) FORCE
# The DSO images are built using a special linker script.
#
quiet_cmd_vdso = VDSO $@
- cmd_vdso = $(LD) -nostdlib -o $@ \
+ cmd_vdso = $(LD) -o $@ \
$(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
-T $(filter %.lds,$^) $(filter %.o,$^) && \
sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@'
diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S
index dc8da7695859..bafa73f09e92 100644
--- a/arch/x86/entry/vdso/vdso-layout.lds.S
+++ b/arch/x86/entry/vdso/vdso-layout.lds.S
@@ -77,7 +77,6 @@ SECTIONS
.text : {
*(.text*)
- *(.fixup)
} :text =0x90909090,
diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S
index 6ddd7a937b3e..d33c6513fd2c 100644
--- a/arch/x86/entry/vdso/vdso32/system_call.S
+++ b/arch/x86/entry/vdso/vdso32/system_call.S
@@ -78,7 +78,7 @@ SYM_INNER_LABEL(int80_landing_pad, SYM_L_GLOBAL)
popl %ecx
CFI_RESTORE ecx
CFI_ADJUST_CFA_OFFSET -4
- ret
+ RET
CFI_ENDPROC
.size __kernel_vsyscall,.-__kernel_vsyscall
diff --git a/arch/x86/entry/vdso/vsgx.S b/arch/x86/entry/vdso/vsgx.S
index 99dafac992e2..d77d278ee9dd 100644
--- a/arch/x86/entry/vdso/vsgx.S
+++ b/arch/x86/entry/vdso/vsgx.S
@@ -81,7 +81,7 @@ SYM_FUNC_START(__vdso_sgx_enter_enclave)
pop %rbx
leave
.cfi_def_cfa %rsp, 8
- ret
+ RET
/* The out-of-line code runs with the pre-leave stack frame. */
.cfi_def_cfa %rbp, 16
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 1b40b9297083..fd2ee9408e91 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -226,7 +226,8 @@ bool emulate_vsyscall(unsigned long error_code,
if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
warn_bad_vsyscall(KERN_DEBUG, regs,
"seccomp tried to change syscall nr or ip");
- do_exit(SIGSYS);
+ force_exit_sig(SIGSYS);
+ return true;
}
regs->orig_ax = -1;
if (tmp)
diff --git a/arch/x86/entry/vsyscall/vsyscall_emu_64.S b/arch/x86/entry/vsyscall/vsyscall_emu_64.S
index 2e203f3a25a7..15e35159ebb6 100644
--- a/arch/x86/entry/vsyscall/vsyscall_emu_64.S
+++ b/arch/x86/entry/vsyscall/vsyscall_emu_64.S
@@ -19,17 +19,17 @@ __vsyscall_page:
mov $__NR_gettimeofday, %rax
syscall
- ret
+ RET
.balign 1024, 0xcc
mov $__NR_time, %rax
syscall
- ret
+ RET
.balign 1024, 0xcc
mov $__NR_getcpu, %rax
syscall
- ret
+ RET
.balign 4096, 0xcc
diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c
index 913745f1419b..b15f7b950d2e 100644
--- a/arch/x86/events/amd/iommu.c
+++ b/arch/x86/events/amd/iommu.c
@@ -161,7 +161,7 @@ static int get_next_avail_iommu_bnk_cntr(struct perf_event *event)
raw_spin_lock_irqsave(&piommu->lock, flags);
- for (bank = 0, shift = 0; bank < max_banks; bank++) {
+ for (bank = 0; bank < max_banks; bank++) {
for (cntr = 0; cntr < max_cntrs; cntr++) {
shift = bank + (bank*3) + cntr;
if (piommu->cntr_assign_mask & BIT_ULL(shift)) {
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 6dfa8ddaa60f..e686c5e0537b 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -66,6 +66,8 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_enable_all, *x86_pmu.enable_all);
DEFINE_STATIC_CALL_NULL(x86_pmu_enable, *x86_pmu.enable);
DEFINE_STATIC_CALL_NULL(x86_pmu_disable, *x86_pmu.disable);
+DEFINE_STATIC_CALL_NULL(x86_pmu_assign, *x86_pmu.assign);
+
DEFINE_STATIC_CALL_NULL(x86_pmu_add, *x86_pmu.add);
DEFINE_STATIC_CALL_NULL(x86_pmu_del, *x86_pmu.del);
DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read);
@@ -1215,6 +1217,8 @@ static inline void x86_assign_hw_event(struct perf_event *event,
hwc->last_cpu = smp_processor_id();
hwc->last_tag = ++cpuc->tags[i];
+ static_call_cond(x86_pmu_assign)(event, idx);
+
switch (hwc->idx) {
case INTEL_PMC_IDX_FIXED_BTS:
case INTEL_PMC_IDX_FIXED_VLBR:
@@ -2005,6 +2009,8 @@ static void x86_pmu_static_call_update(void)
static_call_update(x86_pmu_enable, x86_pmu.enable);
static_call_update(x86_pmu_disable, x86_pmu.disable);
+ static_call_update(x86_pmu_assign, x86_pmu.assign);
+
static_call_update(x86_pmu_add, x86_pmu.add);
static_call_update(x86_pmu_del, x86_pmu.del);
static_call_update(x86_pmu_read, x86_pmu.read);
@@ -2470,7 +2476,7 @@ static int x86_pmu_event_init(struct perf_event *event)
if (READ_ONCE(x86_pmu.attr_rdpmc) &&
!(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS))
- event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED;
+ event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT;
return err;
}
@@ -2504,7 +2510,7 @@ void perf_clear_dirty_counters(void)
static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
{
- if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+ if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT))
return;
/*
@@ -2525,7 +2531,7 @@ static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm)
{
- if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+ if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT))
return;
if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed))
@@ -2536,7 +2542,7 @@ static int x86_pmu_event_idx(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
- if (!(hwc->flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+ if (!(hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT))
return 0;
if (is_metric_idx(hwc->idx))
@@ -2719,7 +2725,7 @@ void arch_perf_update_userpage(struct perf_event *event,
userpg->cap_user_time = 0;
userpg->cap_user_time_zero = 0;
userpg->cap_user_rdpmc =
- !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED);
+ !!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT);
userpg->pmc_width = x86_pmu.cntval_bits;
if (!using_native_sched_clock() || !sched_clock_stable())
@@ -2765,7 +2771,7 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re
struct unwind_state state;
unsigned long addr;
- if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+ if (perf_guest_state()) {
/* TODO: We don't support guest os callchain now */
return;
}
@@ -2868,7 +2874,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs
struct stack_frame frame;
const struct stack_frame __user *fp;
- if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+ if (perf_guest_state()) {
/* TODO: We don't support guest os callchain now */
return;
}
@@ -2945,18 +2951,19 @@ static unsigned long code_segment_base(struct pt_regs *regs)
unsigned long perf_instruction_pointer(struct pt_regs *regs)
{
- if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
- return perf_guest_cbs->get_guest_ip();
+ if (perf_guest_state())
+ return perf_guest_get_ip();
return regs->ip + code_segment_base(regs);
}
unsigned long perf_misc_flags(struct pt_regs *regs)
{
+ unsigned int guest_state = perf_guest_state();
int misc = 0;
- if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
- if (perf_guest_cbs->is_user_mode())
+ if (guest_state) {
+ if (guest_state & PERF_GUEST_USER)
misc |= PERF_RECORD_MISC_GUEST_USER;
else
misc |= PERF_RECORD_MISC_GUEST_KERNEL;
diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index 6320d2cfd9d3..974e917e65b2 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -209,6 +209,12 @@ static void bts_update(struct bts_ctx *bts)
} else {
local_set(&buf->data_size, head);
}
+
+ /*
+ * Since BTS is coherent, just add compiler barrier to ensure
+ * BTS updating is ordered against bts::handle::event.
+ */
+ barrier();
}
static int
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 9a044438072b..fd9f908debe5 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -243,7 +243,8 @@ static struct extra_reg intel_skl_extra_regs[] __read_mostly = {
static struct event_constraint intel_icl_event_constraints[] = {
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
- FIXED_EVENT_CONSTRAINT(0x01c0, 0), /* INST_RETIRED.PREC_DIST */
+ FIXED_EVENT_CONSTRAINT(0x01c0, 0), /* old INST_RETIRED.PREC_DIST */
+ FIXED_EVENT_CONSTRAINT(0x0100, 0), /* INST_RETIRED.PREC_DIST */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */
@@ -288,7 +289,7 @@ static struct extra_reg intel_spr_extra_regs[] __read_mostly = {
static struct event_constraint intel_spr_event_constraints[] = {
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
- FIXED_EVENT_CONSTRAINT(0x01c0, 0), /* INST_RETIRED.PREC_DIST */
+ FIXED_EVENT_CONSTRAINT(0x0100, 0), /* INST_RETIRED.PREC_DIST */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */
@@ -2144,19 +2145,19 @@ static __initconst const u64 knl_hw_cache_extra_regs
* However, there are some cases which may change PEBS status, e.g. PMI
* throttle. The PEBS_ENABLE should be updated where the status changes.
*/
-static void __intel_pmu_disable_all(void)
+static __always_inline void __intel_pmu_disable_all(bool bts)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
- if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
+ if (bts && test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
intel_pmu_disable_bts();
}
-static void intel_pmu_disable_all(void)
+static __always_inline void intel_pmu_disable_all(void)
{
- __intel_pmu_disable_all();
+ __intel_pmu_disable_all(true);
intel_pmu_pebs_disable_all();
intel_pmu_lbr_disable_all();
}
@@ -2187,6 +2188,47 @@ static void intel_pmu_enable_all(int added)
__intel_pmu_enable_all(added, false);
}
+static noinline int
+__intel_pmu_snapshot_branch_stack(struct perf_branch_entry *entries,
+ unsigned int cnt, unsigned long flags)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ intel_pmu_lbr_read();
+ cnt = min_t(unsigned int, cnt, x86_pmu.lbr_nr);
+
+ memcpy(entries, cpuc->lbr_entries, sizeof(struct perf_branch_entry) * cnt);
+ intel_pmu_enable_all(0);
+ local_irq_restore(flags);
+ return cnt;
+}
+
+static int
+intel_pmu_snapshot_branch_stack(struct perf_branch_entry *entries, unsigned int cnt)
+{
+ unsigned long flags;
+
+ /* must not have branches... */
+ local_irq_save(flags);
+ __intel_pmu_disable_all(false); /* we don't care about BTS */
+ __intel_pmu_lbr_disable();
+ /* ... until here */
+ return __intel_pmu_snapshot_branch_stack(entries, cnt, flags);
+}
+
+static int
+intel_pmu_snapshot_arch_branch_stack(struct perf_branch_entry *entries, unsigned int cnt)
+{
+ unsigned long flags;
+
+ /* must not have branches... */
+ local_irq_save(flags);
+ __intel_pmu_disable_all(false); /* we don't care about BTS */
+ __intel_pmu_arch_lbr_disable();
+ /* ... until here */
+ return __intel_pmu_snapshot_branch_stack(entries, cnt, flags);
+}
+
/*
* Workaround for:
* Intel Errata AAK100 (model 26)
@@ -2403,6 +2445,12 @@ static void intel_pmu_disable_event(struct perf_event *event)
intel_pmu_pebs_disable(event);
}
+static void intel_pmu_assign_event(struct perf_event *event, int idx)
+{
+ if (is_pebs_pt(event))
+ perf_report_aux_output_id(event, idx);
+}
+
static void intel_pmu_del_event(struct perf_event *event)
{
if (needs_branch_stack(event))
@@ -2853,10 +2901,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
*/
if (__test_and_clear_bit(GLOBAL_STATUS_TRACE_TOPAPMI_BIT, (unsigned long *)&status)) {
handled++;
- if (unlikely(perf_guest_cbs && perf_guest_cbs->is_in_guest() &&
- perf_guest_cbs->handle_intel_pt_intr))
- perf_guest_cbs->handle_intel_pt_intr();
- else
+ if (!perf_guest_handle_intel_pt_intr())
intel_pt_interrupt();
}
@@ -2930,7 +2975,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
apic_write(APIC_LVTPC, APIC_DM_NMI);
intel_bts_disable_local();
cpuc->enabled = 0;
- __intel_pmu_disable_all();
+ __intel_pmu_disable_all(true);
handled = intel_pmu_drain_bts_buffer();
handled += intel_bts_interrupt();
status = intel_pmu_get_status();
@@ -2998,8 +3043,10 @@ intel_vlbr_constraints(struct perf_event *event)
{
struct event_constraint *c = &vlbr_constraint;
- if (unlikely(constraint_match(c, event->hw.config)))
+ if (unlikely(constraint_match(c, event->hw.config))) {
+ event->hw.flags |= c->flags;
return c;
+ }
return NULL;
}
@@ -4495,8 +4542,16 @@ static int intel_pmu_check_period(struct perf_event *event, u64 value)
return intel_pmu_has_bts_period(event, value) ? -EINVAL : 0;
}
+static void intel_aux_output_init(void)
+{
+ /* Refer also intel_pmu_aux_output_match() */
+ if (x86_pmu.intel_cap.pebs_output_pt_available)
+ x86_pmu.assign = intel_pmu_assign_event;
+}
+
static int intel_pmu_aux_output_match(struct perf_event *event)
{
+ /* intel_pmu_assign_event() is needed, refer intel_aux_output_init() */
if (!x86_pmu.intel_cap.pebs_output_pt_available)
return 0;
@@ -6284,9 +6339,21 @@ __init int intel_pmu_init(void)
x86_pmu.lbr_nr = 0;
}
- if (x86_pmu.lbr_nr)
+ if (x86_pmu.lbr_nr) {
pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr);
+ /* only support branch_stack snapshot for perfmon >= v2 */
+ if (x86_pmu.disable_all == intel_pmu_disable_all) {
+ if (boot_cpu_has(X86_FEATURE_ARCH_LBR)) {
+ static_call_update(perf_snapshot_branch_stack,
+ intel_pmu_snapshot_arch_branch_stack);
+ } else {
+ static_call_update(perf_snapshot_branch_stack,
+ intel_pmu_snapshot_branch_stack);
+ }
+ }
+ }
+
intel_pmu_check_extra_regs(x86_pmu.extra_regs);
/* Support full width counters using alternative MSR range */
@@ -6302,6 +6369,8 @@ __init int intel_pmu_init(void)
if (is_hybrid())
intel_pmu_check_hybrid_pmus((u64)fixed_mask);
+ intel_aux_output_init();
+
return 0;
}
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 8647713276a7..2e215369df4a 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -923,7 +923,8 @@ struct event_constraint intel_skl_pebs_event_constraints[] = {
};
struct event_constraint intel_icl_pebs_event_constraints[] = {
- INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x100000000ULL), /* old INST_RETIRED.PREC_DIST */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x0100, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */
INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL), /* SLOTS */
INTEL_PLD_CONSTRAINT(0x1cd, 0xff), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
@@ -943,7 +944,7 @@ struct event_constraint intel_icl_pebs_event_constraints[] = {
};
struct event_constraint intel_spr_pebs_event_constraints[] = {
- INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x100000000ULL),
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x100, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */
INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL),
INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xfe),
@@ -1301,7 +1302,7 @@ void intel_pmu_pebs_disable_all(void)
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (cpuc->pebs_enabled)
- wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
+ __intel_pmu_pebs_disable_all();
}
static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 9e6d6eaeb4cb..8043213b75a5 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -228,20 +228,6 @@ static void __intel_pmu_lbr_enable(bool pmi)
wrmsrl(MSR_ARCH_LBR_CTL, lbr_select | ARCH_LBR_CTL_LBREN);
}
-static void __intel_pmu_lbr_disable(void)
-{
- u64 debugctl;
-
- if (static_cpu_has(X86_FEATURE_ARCH_LBR)) {
- wrmsrl(MSR_ARCH_LBR_CTL, 0);
- return;
- }
-
- rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
- debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
- wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-}
-
void intel_pmu_lbr_reset_32(void)
{
int i;
@@ -279,6 +265,8 @@ void intel_pmu_lbr_reset(void)
cpuc->last_task_ctx = NULL;
cpuc->last_log_id = 0;
+ if (!static_cpu_has(X86_FEATURE_ARCH_LBR) && cpuc->lbr_select)
+ wrmsrl(MSR_LBR_SELECT, 0);
}
/*
@@ -779,8 +767,12 @@ void intel_pmu_lbr_disable_all(void)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
- if (cpuc->lbr_users && !vlbr_exclude_host())
+ if (cpuc->lbr_users && !vlbr_exclude_host()) {
+ if (static_cpu_has(X86_FEATURE_ARCH_LBR))
+ return __intel_pmu_arch_lbr_disable();
+
__intel_pmu_lbr_disable();
+ }
}
void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index c72e368dd164..f1ba6ab2e97e 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1187,7 +1187,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
* PCI slot and func to indicate the uncore box.
*/
if (id->driver_data & ~0xffff) {
- struct pci_driver *pci_drv = pdev->driver;
+ struct pci_driver *pci_drv = to_pci_driver(pdev->dev.driver);
pmu = uncore_pci_find_dev_pmu(pdev, pci_drv->id_table);
if (pmu == NULL)
diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h
index 7280c8a3c831..6d735611c281 100644
--- a/arch/x86/events/intel/uncore_discovery.h
+++ b/arch/x86/events/intel/uncore_discovery.h
@@ -30,7 +30,7 @@
#define uncore_discovery_invalid_unit(unit) \
- (!unit.table1 || !unit.ctl || !unit.table3 || \
+ (!unit.table1 || !unit.ctl || \
unit.table1 == -1ULL || unit.ctl == -1ULL || \
unit.table3 == -1ULL)
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 5ddc0f30db6f..3660f698fb2a 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -452,7 +452,7 @@
#define ICX_M3UPI_PCI_PMON_BOX_CTL 0xa0
/* ICX IMC */
-#define ICX_NUMBER_IMC_CHN 2
+#define ICX_NUMBER_IMC_CHN 3
#define ICX_IMC_MEM_STRIDE 0x4
/* SPR */
@@ -3608,6 +3608,9 @@ static int skx_cha_hw_config(struct intel_uncore_box *box, struct perf_event *ev
struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
struct extra_reg *er;
int idx = 0;
+ /* Any of the CHA events may be filtered by Thread/Core-ID.*/
+ if (event->hw.config & SNBEP_CBO_PMON_CTL_TID_EN)
+ idx = SKX_CHA_MSR_PMON_BOX_FILTER_TID;
for (er = skx_uncore_cha_extra_regs; er->msr; er++) {
if (er->event != (event->hw.config & er->config_mask))
@@ -3675,6 +3678,7 @@ static struct event_constraint skx_uncore_iio_constraints[] = {
UNCORE_EVENT_CONSTRAINT(0xc0, 0xc),
UNCORE_EVENT_CONSTRAINT(0xc5, 0xc),
UNCORE_EVENT_CONSTRAINT(0xd4, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0xd5, 0xc),
EVENT_CONSTRAINT_END
};
@@ -4525,6 +4529,13 @@ static void snr_iio_cleanup_mapping(struct intel_uncore_type *type)
pmu_iio_cleanup_mapping(type, &snr_iio_mapping_group);
}
+static struct event_constraint snr_uncore_iio_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x83, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0xc0, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0xd5, 0xc),
+ EVENT_CONSTRAINT_END
+};
+
static struct intel_uncore_type snr_uncore_iio = {
.name = "iio",
.num_counters = 4,
@@ -4536,6 +4547,7 @@ static struct intel_uncore_type snr_uncore_iio = {
.event_mask_ext = SNR_IIO_PMON_RAW_EVENT_MASK_EXT,
.box_ctl = SNR_IIO_MSR_PMON_BOX_CTL,
.msr_offset = SNR_IIO_MSR_OFFSET,
+ .constraints = snr_uncore_iio_constraints,
.ops = &ivbep_uncore_msr_ops,
.format_group = &snr_uncore_iio_format_group,
.attr_update = snr_iio_attr_update,
@@ -5076,8 +5088,10 @@ static struct event_constraint icx_uncore_iio_constraints[] = {
UNCORE_EVENT_CONSTRAINT(0x02, 0x3),
UNCORE_EVENT_CONSTRAINT(0x03, 0x3),
UNCORE_EVENT_CONSTRAINT(0x83, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x88, 0xc),
UNCORE_EVENT_CONSTRAINT(0xc0, 0xc),
UNCORE_EVENT_CONSTRAINT(0xc5, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0xd5, 0xc),
EVENT_CONSTRAINT_END
};
@@ -5463,7 +5477,7 @@ static struct intel_uncore_ops icx_uncore_mmio_ops = {
static struct intel_uncore_type icx_uncore_imc = {
.name = "imc",
.num_counters = 4,
- .num_boxes = 8,
+ .num_boxes = 12,
.perf_ctr_bits = 48,
.fixed_ctr_bits = 48,
.fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR,
@@ -5647,6 +5661,7 @@ static struct intel_uncore_type spr_uncore_chabox = {
.event_mask = SPR_CHA_PMON_EVENT_MASK,
.event_mask_ext = SPR_RAW_EVENT_MASK_EXT,
.num_shared_regs = 1,
+ .constraints = skx_uncore_chabox_constraints,
.ops = &spr_uncore_chabox_ops,
.format_group = &spr_uncore_chabox_format_group,
.attr_update = uncore_alias_groups,
@@ -5658,6 +5673,7 @@ static struct intel_uncore_type spr_uncore_iio = {
.event_mask_ext = SNR_IIO_PMON_RAW_EVENT_MASK_EXT,
.format_group = &snr_uncore_iio_format_group,
.attr_update = uncore_alias_groups,
+ .constraints = icx_uncore_iio_constraints,
};
static struct attribute *spr_uncore_raw_formats_attr[] = {
@@ -5686,9 +5702,16 @@ static struct intel_uncore_type spr_uncore_irp = {
};
+static struct event_constraint spr_uncore_m2pcie_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x14, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
static struct intel_uncore_type spr_uncore_m2pcie = {
SPR_UNCORE_COMMON_FORMAT(),
.name = "m2pcie",
+ .constraints = spr_uncore_m2pcie_constraints,
};
static struct intel_uncore_type spr_uncore_pcu = {
@@ -5765,6 +5788,7 @@ static struct intel_uncore_type spr_uncore_upi = {
static struct intel_uncore_type spr_uncore_m3upi = {
SPR_UNCORE_PCI_COMMON_FORMAT(),
.name = "m3upi",
+ .constraints = icx_uncore_m3upi_constraints,
};
static struct intel_uncore_type spr_uncore_mdf = {
diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
index c853b28efa33..96c775abe31f 100644
--- a/arch/x86/events/msr.c
+++ b/arch/x86/events/msr.c
@@ -68,6 +68,7 @@ static bool test_intel(int idx, void *data)
case INTEL_FAM6_BROADWELL_D:
case INTEL_FAM6_BROADWELL_G:
case INTEL_FAM6_BROADWELL_X:
+ case INTEL_FAM6_SAPPHIRERAPIDS_X:
case INTEL_FAM6_ATOM_SILVERMONT:
case INTEL_FAM6_ATOM_SILVERMONT_D:
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index e3ac05c97b5e..9d376e528dfc 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -14,6 +14,7 @@
#include <linux/perf_event.h>
+#include <asm/fpu/xstate.h>
#include <asm/intel_ds.h>
#include <asm/cpu.h>
@@ -73,7 +74,7 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode)
#define PERF_X86_EVENT_PEBS_NA_HSW 0x0010 /* haswell style datala, unknown */
#define PERF_X86_EVENT_EXCL 0x0020 /* HT exclusivity on counter */
#define PERF_X86_EVENT_DYNAMIC 0x0040 /* dynamic alloc'd constraint */
-#define PERF_X86_EVENT_RDPMC_ALLOWED 0x0080 /* grant rdpmc permission */
+
#define PERF_X86_EVENT_EXCL_ACCT 0x0100 /* accounted EXCL event */
#define PERF_X86_EVENT_AUTO_RELOAD 0x0200 /* use PEBS auto-reload */
#define PERF_X86_EVENT_LARGE_PEBS 0x0400 /* use large PEBS */
@@ -726,6 +727,7 @@ struct x86_pmu {
void (*enable_all)(int added);
void (*enable)(struct perf_event *);
void (*disable)(struct perf_event *);
+ void (*assign)(struct perf_event *event, int idx);
void (*add)(struct perf_event *);
void (*del)(struct perf_event *);
void (*read)(struct perf_event *event);
@@ -1240,6 +1242,25 @@ static inline bool intel_pmu_has_bts(struct perf_event *event)
return intel_pmu_has_bts_period(event, hwc->sample_period);
}
+static __always_inline void __intel_pmu_pebs_disable_all(void)
+{
+ wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
+}
+
+static __always_inline void __intel_pmu_arch_lbr_disable(void)
+{
+ wrmsrl(MSR_ARCH_LBR_CTL, 0);
+}
+
+static __always_inline void __intel_pmu_lbr_disable(void)
+{
+ u64 debugctl;
+
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+}
+
int intel_pmu_save_and_restart(struct perf_event *event);
struct event_constraint *
diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile
index 48e2c51464e8..5d2de10809ae 100644
--- a/arch/x86/hyperv/Makefile
+++ b/arch/x86/hyperv/Makefile
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: GPL-2.0-only
-obj-y := hv_init.o mmu.o nested.o irqdomain.o
+obj-y := hv_init.o mmu.o nested.o irqdomain.o ivm.o
obj-$(CONFIG_X86_64) += hv_apic.o hv_proc.o
ifdef CONFIG_X86_64
diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
index 32a1ad356c18..db2d92fb44da 100644
--- a/arch/x86/hyperv/hv_apic.c
+++ b/arch/x86/hyperv/hv_apic.c
@@ -122,17 +122,27 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector,
ipi_arg->reserved = 0;
ipi_arg->vp_set.valid_bank_mask = 0;
- if (!cpumask_equal(mask, cpu_present_mask)) {
+ /*
+ * Use HV_GENERIC_SET_ALL and avoid converting cpumask to VP_SET
+ * when the IPI is sent to all currently present CPUs.
+ */
+ if (!cpumask_equal(mask, cpu_present_mask) || exclude_self) {
ipi_arg->vp_set.format = HV_GENERIC_SET_SPARSE_4K;
if (exclude_self)
nr_bank = cpumask_to_vpset_noself(&(ipi_arg->vp_set), mask);
else
nr_bank = cpumask_to_vpset(&(ipi_arg->vp_set), mask);
- }
- if (nr_bank < 0)
- goto ipi_mask_ex_done;
- if (!nr_bank)
+
+ /*
+ * 'nr_bank <= 0' means some CPUs in cpumask can't be
+ * represented in VP_SET. Return an error and fall back to
+ * native (architectural) method of sending IPIs.
+ */
+ if (nr_bank <= 0)
+ goto ipi_mask_ex_done;
+ } else {
ipi_arg->vp_set.format = HV_GENERIC_SET_ALL;
+ }
status = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank,
ipi_arg, NULL);
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 708a2712a516..8b392b6b7b93 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -28,6 +28,7 @@
#include <linux/syscore_ops.h>
#include <clocksource/hyperv_timer.h>
#include <linux/highmem.h>
+#include <linux/swiotlb.h>
int hyperv_init_cpuhp;
u64 hv_current_partition_id = ~0ull;
@@ -36,12 +37,42 @@ EXPORT_SYMBOL_GPL(hv_current_partition_id);
void *hv_hypercall_pg;
EXPORT_SYMBOL_GPL(hv_hypercall_pg);
+union hv_ghcb * __percpu *hv_ghcb_pg;
+
/* Storage to save the hypercall page temporarily for hibernation */
static void *hv_hypercall_pg_saved;
struct hv_vp_assist_page **hv_vp_assist_page;
EXPORT_SYMBOL_GPL(hv_vp_assist_page);
+static int hyperv_init_ghcb(void)
+{
+ u64 ghcb_gpa;
+ void *ghcb_va;
+ void **ghcb_base;
+
+ if (!hv_isolation_type_snp())
+ return 0;
+
+ if (!hv_ghcb_pg)
+ return -EINVAL;
+
+ /*
+ * GHCB page is allocated by paravisor. The address
+ * returned by MSR_AMD64_SEV_ES_GHCB is above shared
+ * memory boundary and map it here.
+ */
+ rdmsrl(MSR_AMD64_SEV_ES_GHCB, ghcb_gpa);
+ ghcb_va = memremap(ghcb_gpa, HV_HYP_PAGE_SIZE, MEMREMAP_WB);
+ if (!ghcb_va)
+ return -ENOMEM;
+
+ ghcb_base = (void **)this_cpu_ptr(hv_ghcb_pg);
+ *ghcb_base = ghcb_va;
+
+ return 0;
+}
+
static int hv_cpu_init(unsigned int cpu)
{
union hv_vp_assist_msr_contents msr = { 0 };
@@ -85,7 +116,7 @@ static int hv_cpu_init(unsigned int cpu)
}
}
- return 0;
+ return hyperv_init_ghcb();
}
static void (*hv_reenlightenment_cb)(void);
@@ -139,7 +170,6 @@ void set_hv_tscchange_cb(void (*cb)(void))
struct hv_reenlightenment_control re_ctrl = {
.vector = HYPERV_REENLIGHTENMENT_VECTOR,
.enabled = 1,
- .target_vp = hv_vp_index[smp_processor_id()]
};
struct hv_tsc_emulation_control emu_ctrl = {.enabled = 1};
@@ -148,13 +178,20 @@ void set_hv_tscchange_cb(void (*cb)(void))
return;
}
+ if (!hv_vp_index)
+ return;
+
hv_reenlightenment_cb = cb;
/* Make sure callback is registered before we write to MSRs */
wmb();
+ re_ctrl.target_vp = hv_vp_index[get_cpu()];
+
wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl));
wrmsrl(HV_X64_MSR_TSC_EMULATION_CONTROL, *((u64 *)&emu_ctrl));
+
+ put_cpu();
}
EXPORT_SYMBOL_GPL(set_hv_tscchange_cb);
@@ -177,6 +214,14 @@ static int hv_cpu_die(unsigned int cpu)
{
struct hv_reenlightenment_control re_ctrl;
unsigned int new_cpu;
+ void **ghcb_va;
+
+ if (hv_ghcb_pg) {
+ ghcb_va = (void **)this_cpu_ptr(hv_ghcb_pg);
+ if (*ghcb_va)
+ memunmap(*ghcb_va);
+ *ghcb_va = NULL;
+ }
hv_common_cpu_die(cpu);
@@ -342,20 +387,13 @@ static void __init hv_get_partition_id(void)
*/
void __init hyperv_init(void)
{
- u64 guest_id, required_msrs;
+ u64 guest_id;
union hv_x64_msr_hypercall_contents hypercall_msr;
int cpuhp;
if (x86_hyper_type != X86_HYPER_MS_HYPERV)
return;
- /* Absolutely required MSRs */
- required_msrs = HV_MSR_HYPERCALL_AVAILABLE |
- HV_MSR_VP_INDEX_AVAILABLE;
-
- if ((ms_hyperv.features & required_msrs) != required_msrs)
- return;
-
if (hv_common_init())
return;
@@ -366,10 +404,16 @@ void __init hyperv_init(void)
goto common_free;
}
+ if (hv_isolation_type_snp()) {
+ hv_ghcb_pg = alloc_percpu(union hv_ghcb *);
+ if (!hv_ghcb_pg)
+ goto free_vp_assist_page;
+ }
+
cpuhp = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online",
hv_cpu_init, hv_cpu_die);
if (cpuhp < 0)
- goto free_vp_assist_page;
+ goto free_ghcb_page;
/*
* Setup the hypercall page and enable hypercalls.
@@ -379,14 +423,15 @@ void __init hyperv_init(void)
guest_id = generate_guest_id(0, LINUX_VERSION_CODE, 0);
wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id);
+ /* Hyper-V requires to write guest os id via ghcb in SNP IVM. */
+ hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, guest_id);
+
hv_hypercall_pg = __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START,
VMALLOC_END, GFP_KERNEL, PAGE_KERNEL_ROX,
VM_FLUSH_RESET_PERMS, NUMA_NO_NODE,
__builtin_return_address(0));
- if (hv_hypercall_pg == NULL) {
- wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
- goto remove_cpuhp_state;
- }
+ if (hv_hypercall_pg == NULL)
+ goto clean_guest_os_id;
rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
hypercall_msr.enable = 1;
@@ -454,10 +499,25 @@ void __init hyperv_init(void)
/* Query the VMs extended capability once, so that it can be cached. */
hv_query_ext_cap(0);
+
+#ifdef CONFIG_SWIOTLB
+ /*
+ * Swiotlb bounce buffer needs to be mapped in extra address
+ * space. Map function doesn't work in the early place and so
+ * call swiotlb_update_mem_attributes() here.
+ */
+ if (hv_is_isolation_supported())
+ swiotlb_update_mem_attributes();
+#endif
+
return;
-remove_cpuhp_state:
+clean_guest_os_id:
+ wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
+ hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, 0);
cpuhp_remove_state(cpuhp);
+free_ghcb_page:
+ free_percpu(hv_ghcb_pg);
free_vp_assist_page:
kfree(hv_vp_assist_page);
hv_vp_assist_page = NULL;
@@ -476,6 +536,7 @@ void hyperv_cleanup(void)
/* Reset our OS id */
wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
+ hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, 0);
/*
* Reset hypercall page reference before reset the page,
@@ -546,16 +607,3 @@ bool hv_is_hyperv_initialized(void)
return hypercall_msr.enable;
}
EXPORT_SYMBOL_GPL(hv_is_hyperv_initialized);
-
-enum hv_isolation_type hv_get_isolation_type(void)
-{
- if (!(ms_hyperv.priv_high & HV_ISOLATION))
- return HV_ISOLATION_TYPE_NONE;
- return FIELD_GET(HV_ISOLATION_TYPE, ms_hyperv.isolation_config_b);
-}
-EXPORT_SYMBOL_GPL(hv_get_isolation_type);
-
-bool hv_is_isolation_supported(void)
-{
- return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE;
-}
diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c
index 514fc64e23d5..7e0f6bedc248 100644
--- a/arch/x86/hyperv/irqdomain.c
+++ b/arch/x86/hyperv/irqdomain.c
@@ -253,64 +253,43 @@ static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry
return hv_unmap_interrupt(hv_build_pci_dev_id(dev).as_uint64, old_entry);
}
-static void hv_teardown_msi_irq_common(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
+static void hv_teardown_msi_irq(struct pci_dev *dev, struct irq_data *irqd)
{
- u64 status;
struct hv_interrupt_entry old_entry;
- struct irq_desc *desc;
- struct irq_data *data;
struct msi_msg msg;
+ u64 status;
- desc = irq_to_desc(irq);
- if (!desc) {
- pr_debug("%s: no irq desc\n", __func__);
- return;
- }
-
- data = &desc->irq_data;
- if (!data) {
- pr_debug("%s: no irq data\n", __func__);
- return;
- }
-
- if (!data->chip_data) {
+ if (!irqd->chip_data) {
pr_debug("%s: no chip data\n!", __func__);
return;
}
- old_entry = *(struct hv_interrupt_entry *)data->chip_data;
+ old_entry = *(struct hv_interrupt_entry *)irqd->chip_data;
entry_to_msi_msg(&old_entry, &msg);
- kfree(data->chip_data);
- data->chip_data = NULL;
+ kfree(irqd->chip_data);
+ irqd->chip_data = NULL;
status = hv_unmap_msi_interrupt(dev, &old_entry);
- if (status != HV_STATUS_SUCCESS) {
+ if (status != HV_STATUS_SUCCESS)
pr_err("%s: hypercall failed, status %lld\n", __func__, status);
- return;
- }
}
-static void hv_msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+static void hv_msi_free_irq(struct irq_domain *domain,
+ struct msi_domain_info *info, unsigned int virq)
{
- int i;
- struct msi_desc *entry;
- struct pci_dev *pdev;
+ struct irq_data *irqd = irq_get_irq_data(virq);
+ struct msi_desc *desc;
- if (WARN_ON_ONCE(!dev_is_pci(dev)))
+ if (!irqd)
return;
- pdev = to_pci_dev(dev);
+ desc = irq_data_get_msi_desc(irqd);
+ if (!desc || !desc->irq || WARN_ON_ONCE(!dev_is_pci(desc->dev)))
+ return;
- for_each_pci_msi_entry(entry, pdev) {
- if (entry->irq) {
- for (i = 0; i < entry->nvec_used; i++) {
- hv_teardown_msi_irq_common(pdev, entry, entry->irq + i);
- irq_domain_free_irqs(entry->irq + i, 1);
- }
- }
- }
+ hv_teardown_msi_irq(to_pci_dev(desc->dev), irqd);
}
/*
@@ -329,7 +308,7 @@ static struct irq_chip hv_pci_msi_controller = {
};
static struct msi_domain_ops pci_msi_domain_ops = {
- .domain_free_irqs = hv_msi_domain_free_irqs,
+ .msi_free = hv_msi_free_irq,
.msi_prepare = pci_msi_prepare,
};
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
new file mode 100644
index 000000000000..2b994117581e
--- /dev/null
+++ b/arch/x86/hyperv/ivm.c
@@ -0,0 +1,317 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Hyper-V Isolation VM interface with paravisor and hypervisor
+ *
+ * Author:
+ * Tianyu Lan <Tianyu.Lan@microsoft.com>
+ */
+
+#include <linux/bitfield.h>
+#include <linux/hyperv.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <asm/svm.h>
+#include <asm/sev.h>
+#include <asm/io.h>
+#include <asm/mshyperv.h>
+#include <asm/hypervisor.h>
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+
+#define GHCB_USAGE_HYPERV_CALL 1
+
+union hv_ghcb {
+ struct ghcb ghcb;
+ struct {
+ u64 hypercalldata[509];
+ u64 outputgpa;
+ union {
+ union {
+ struct {
+ u32 callcode : 16;
+ u32 isfast : 1;
+ u32 reserved1 : 14;
+ u32 isnested : 1;
+ u32 countofelements : 12;
+ u32 reserved2 : 4;
+ u32 repstartindex : 12;
+ u32 reserved3 : 4;
+ };
+ u64 asuint64;
+ } hypercallinput;
+ union {
+ struct {
+ u16 callstatus;
+ u16 reserved1;
+ u32 elementsprocessed : 12;
+ u32 reserved2 : 20;
+ };
+ u64 asunit64;
+ } hypercalloutput;
+ };
+ u64 reserved2;
+ } hypercall;
+} __packed __aligned(HV_HYP_PAGE_SIZE);
+
+u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size)
+{
+ union hv_ghcb *hv_ghcb;
+ void **ghcb_base;
+ unsigned long flags;
+ u64 status;
+
+ if (!hv_ghcb_pg)
+ return -EFAULT;
+
+ WARN_ON(in_nmi());
+
+ local_irq_save(flags);
+ ghcb_base = (void **)this_cpu_ptr(hv_ghcb_pg);
+ hv_ghcb = (union hv_ghcb *)*ghcb_base;
+ if (!hv_ghcb) {
+ local_irq_restore(flags);
+ return -EFAULT;
+ }
+
+ hv_ghcb->ghcb.protocol_version = GHCB_PROTOCOL_MAX;
+ hv_ghcb->ghcb.ghcb_usage = GHCB_USAGE_HYPERV_CALL;
+
+ hv_ghcb->hypercall.outputgpa = (u64)output;
+ hv_ghcb->hypercall.hypercallinput.asuint64 = 0;
+ hv_ghcb->hypercall.hypercallinput.callcode = control;
+
+ if (input_size)
+ memcpy(hv_ghcb->hypercall.hypercalldata, input, input_size);
+
+ VMGEXIT();
+
+ hv_ghcb->ghcb.ghcb_usage = 0xffffffff;
+ memset(hv_ghcb->ghcb.save.valid_bitmap, 0,
+ sizeof(hv_ghcb->ghcb.save.valid_bitmap));
+
+ status = hv_ghcb->hypercall.hypercalloutput.callstatus;
+
+ local_irq_restore(flags);
+
+ return status;
+}
+
+void hv_ghcb_msr_write(u64 msr, u64 value)
+{
+ union hv_ghcb *hv_ghcb;
+ void **ghcb_base;
+ unsigned long flags;
+ struct es_em_ctxt ctxt;
+
+ if (!hv_ghcb_pg)
+ return;
+
+ WARN_ON(in_nmi());
+
+ local_irq_save(flags);
+ ghcb_base = (void **)this_cpu_ptr(hv_ghcb_pg);
+ hv_ghcb = (union hv_ghcb *)*ghcb_base;
+ if (!hv_ghcb) {
+ local_irq_restore(flags);
+ return;
+ }
+
+ ghcb_set_rcx(&hv_ghcb->ghcb, msr);
+ ghcb_set_rax(&hv_ghcb->ghcb, lower_32_bits(value));
+ ghcb_set_rdx(&hv_ghcb->ghcb, upper_32_bits(value));
+
+ if (sev_es_ghcb_hv_call(&hv_ghcb->ghcb, false, &ctxt,
+ SVM_EXIT_MSR, 1, 0))
+ pr_warn("Fail to write msr via ghcb %llx.\n", msr);
+
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(hv_ghcb_msr_write);
+
+void hv_ghcb_msr_read(u64 msr, u64 *value)
+{
+ union hv_ghcb *hv_ghcb;
+ void **ghcb_base;
+ unsigned long flags;
+ struct es_em_ctxt ctxt;
+
+ /* Check size of union hv_ghcb here. */
+ BUILD_BUG_ON(sizeof(union hv_ghcb) != HV_HYP_PAGE_SIZE);
+
+ if (!hv_ghcb_pg)
+ return;
+
+ WARN_ON(in_nmi());
+
+ local_irq_save(flags);
+ ghcb_base = (void **)this_cpu_ptr(hv_ghcb_pg);
+ hv_ghcb = (union hv_ghcb *)*ghcb_base;
+ if (!hv_ghcb) {
+ local_irq_restore(flags);
+ return;
+ }
+
+ ghcb_set_rcx(&hv_ghcb->ghcb, msr);
+ if (sev_es_ghcb_hv_call(&hv_ghcb->ghcb, false, &ctxt,
+ SVM_EXIT_MSR, 0, 0))
+ pr_warn("Fail to read msr via ghcb %llx.\n", msr);
+ else
+ *value = (u64)lower_32_bits(hv_ghcb->ghcb.save.rax)
+ | ((u64)lower_32_bits(hv_ghcb->ghcb.save.rdx) << 32);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(hv_ghcb_msr_read);
+#endif
+
+enum hv_isolation_type hv_get_isolation_type(void)
+{
+ if (!(ms_hyperv.priv_high & HV_ISOLATION))
+ return HV_ISOLATION_TYPE_NONE;
+ return FIELD_GET(HV_ISOLATION_TYPE, ms_hyperv.isolation_config_b);
+}
+EXPORT_SYMBOL_GPL(hv_get_isolation_type);
+
+/*
+ * hv_is_isolation_supported - Check system runs in the Hyper-V
+ * isolation VM.
+ */
+bool hv_is_isolation_supported(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
+ return false;
+
+ if (!hypervisor_is_type(X86_HYPER_MS_HYPERV))
+ return false;
+
+ return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE;
+}
+
+DEFINE_STATIC_KEY_FALSE(isolation_type_snp);
+
+/*
+ * hv_isolation_type_snp - Check system runs in the AMD SEV-SNP based
+ * isolation VM.
+ */
+bool hv_isolation_type_snp(void)
+{
+ return static_branch_unlikely(&isolation_type_snp);
+}
+
+/*
+ * hv_mark_gpa_visibility - Set pages visible to host via hvcall.
+ *
+ * In Isolation VM, all guest memory is encrypted from host and guest
+ * needs to set memory visible to host via hvcall before sharing memory
+ * with host.
+ */
+static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
+ enum hv_mem_host_visibility visibility)
+{
+ struct hv_gpa_range_for_visibility **input_pcpu, *input;
+ u16 pages_processed;
+ u64 hv_status;
+ unsigned long flags;
+
+ /* no-op if partition isolation is not enabled */
+ if (!hv_is_isolation_supported())
+ return 0;
+
+ if (count > HV_MAX_MODIFY_GPA_REP_COUNT) {
+ pr_err("Hyper-V: GPA count:%d exceeds supported:%lu\n", count,
+ HV_MAX_MODIFY_GPA_REP_COUNT);
+ return -EINVAL;
+ }
+
+ local_irq_save(flags);
+ input_pcpu = (struct hv_gpa_range_for_visibility **)
+ this_cpu_ptr(hyperv_pcpu_input_arg);
+ input = *input_pcpu;
+ if (unlikely(!input)) {
+ local_irq_restore(flags);
+ return -EINVAL;
+ }
+
+ input->partition_id = HV_PARTITION_ID_SELF;
+ input->host_visibility = visibility;
+ input->reserved0 = 0;
+ input->reserved1 = 0;
+ memcpy((void *)input->gpa_page_list, pfn, count * sizeof(*pfn));
+ hv_status = hv_do_rep_hypercall(
+ HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY, count,
+ 0, input, &pages_processed);
+ local_irq_restore(flags);
+
+ if (hv_result_success(hv_status))
+ return 0;
+ else
+ return -EFAULT;
+}
+
+/*
+ * hv_set_mem_host_visibility - Set specified memory visible to host.
+ *
+ * In Isolation VM, all guest memory is encrypted from host and guest
+ * needs to set memory visible to host via hvcall before sharing memory
+ * with host. This function works as wrap of hv_mark_gpa_visibility()
+ * with memory base and size.
+ */
+int hv_set_mem_host_visibility(unsigned long kbuffer, int pagecount, bool visible)
+{
+ enum hv_mem_host_visibility visibility = visible ?
+ VMBUS_PAGE_VISIBLE_READ_WRITE : VMBUS_PAGE_NOT_VISIBLE;
+ u64 *pfn_array;
+ int ret = 0;
+ int i, pfn;
+
+ if (!hv_is_isolation_supported() || !hv_hypercall_pg)
+ return 0;
+
+ pfn_array = kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
+ if (!pfn_array)
+ return -ENOMEM;
+
+ for (i = 0, pfn = 0; i < pagecount; i++) {
+ pfn_array[pfn] = virt_to_hvpfn((void *)kbuffer + i * HV_HYP_PAGE_SIZE);
+ pfn++;
+
+ if (pfn == HV_MAX_MODIFY_GPA_REP_COUNT || i == pagecount - 1) {
+ ret = hv_mark_gpa_visibility(pfn, pfn_array,
+ visibility);
+ if (ret)
+ goto err_free_pfn_array;
+ pfn = 0;
+ }
+ }
+
+ err_free_pfn_array:
+ kfree(pfn_array);
+ return ret;
+}
+
+/*
+ * hv_map_memory - map memory to extra space in the AMD SEV-SNP Isolation VM.
+ */
+void *hv_map_memory(void *addr, unsigned long size)
+{
+ unsigned long *pfns = kcalloc(size / PAGE_SIZE,
+ sizeof(unsigned long), GFP_KERNEL);
+ void *vaddr;
+ int i;
+
+ if (!pfns)
+ return NULL;
+
+ for (i = 0; i < size / PAGE_SIZE; i++)
+ pfns[i] = vmalloc_to_pfn(addr + i * PAGE_SIZE) +
+ (ms_hyperv.shared_gpa_boundary >> PAGE_SHIFT);
+
+ vaddr = vmap_pfn(pfns, size / PAGE_SIZE, PAGE_KERNEL_IO);
+ kfree(pfns);
+
+ return vaddr;
+}
+
+void hv_unmap_memory(void *addr)
+{
+ vunmap(addr);
+}
diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
index bd13736d0c05..0ad2378fe6ad 100644
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -68,15 +68,6 @@ static void hyperv_flush_tlb_multi(const struct cpumask *cpus,
local_irq_save(flags);
- /*
- * Only check the mask _after_ interrupt has been disabled to avoid the
- * mask changing under our feet.
- */
- if (cpumask_empty(cpus)) {
- local_irq_restore(flags);
- return;
- }
-
flush_pcpu = (struct hv_tlb_flush **)
this_cpu_ptr(hyperv_pcpu_input_arg);
@@ -115,7 +106,9 @@ static void hyperv_flush_tlb_multi(const struct cpumask *cpus,
* must. We will also check all VP numbers when walking the
* supplied CPU set to remain correct in all cases.
*/
- if (hv_cpu_number_to_vp_number(cpumask_last(cpus)) >= 64)
+ cpu = cpumask_last(cpus);
+
+ if (cpu < nr_cpumask_bits && hv_cpu_number_to_vp_number(cpu) >= 64)
goto do_ex_hypercall;
for_each_cpu(cpu, cpus) {
@@ -131,6 +124,12 @@ static void hyperv_flush_tlb_multi(const struct cpumask *cpus,
__set_bit(vcpu, (unsigned long *)
&flush->processor_mask);
}
+
+ /* nothing to flush if 'processor_mask' ends up being empty */
+ if (!flush->processor_mask) {
+ local_irq_restore(flags);
+ return;
+ }
}
/*
diff --git a/arch/x86/ia32/audit.c b/arch/x86/ia32/audit.c
index 6efe6cb3768a..59e19549e759 100644
--- a/arch/x86/ia32/audit.c
+++ b/arch/x86/ia32/audit.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/audit_arch.h>
#include <asm/unistd_32.h>
#include <asm/audit.h>
@@ -31,15 +32,17 @@ int ia32_classify_syscall(unsigned syscall)
{
switch (syscall) {
case __NR_open:
- return 2;
+ return AUDITSC_OPEN;
case __NR_openat:
- return 3;
+ return AUDITSC_OPENAT;
case __NR_socketcall:
- return 4;
+ return AUDITSC_SOCKETCALL;
case __NR_execve:
case __NR_execveat:
- return 5;
+ return AUDITSC_EXECVE;
+ case __NR_openat2:
+ return AUDITSC_OPENAT2;
default:
- return 1;
+ return AUDITSC_COMPAT;
}
}
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 5e3d9b7fd5fb..c9c3859322fa 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -24,7 +24,6 @@
#include <linux/syscalls.h>
#include <asm/ucontext.h>
#include <linux/uaccess.h>
-#include <asm/fpu/internal.h>
#include <asm/fpu/signal.h>
#include <asm/ptrace.h>
#include <asm/ia32_unistd.h>
@@ -57,8 +56,8 @@ static inline void reload_segments(struct sigcontext_32 *sc)
/*
* Do a signal return; undo the signal stack.
*/
-static int ia32_restore_sigcontext(struct pt_regs *regs,
- struct sigcontext_32 __user *usc)
+static bool ia32_restore_sigcontext(struct pt_regs *regs,
+ struct sigcontext_32 __user *usc)
{
struct sigcontext_32 sc;
@@ -66,7 +65,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
current->restart_block.fn = do_no_restart_syscall;
if (unlikely(copy_from_user(&sc, usc, sizeof(sc))))
- return -EFAULT;
+ return false;
/* Get only the ia32 registers. */
regs->bx = sc.bx;
@@ -111,7 +110,7 @@ COMPAT_SYSCALL_DEFINE0(sigreturn)
set_current_blocked(&set);
- if (ia32_restore_sigcontext(regs, &frame->sc))
+ if (!ia32_restore_sigcontext(regs, &frame->sc))
goto badframe;
return regs->ax;
@@ -135,7 +134,7 @@ COMPAT_SYSCALL_DEFINE0(rt_sigreturn)
set_current_blocked(&set);
- if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext))
+ if (!ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext))
goto badframe;
if (compat_restore_altstack(&frame->uc.uc_stack))
@@ -220,8 +219,8 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs,
sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size);
*fpstate = (struct _fpstate_32 __user *) sp;
- if (copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned,
- math_size) < 0)
+ if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned,
+ math_size))
return (void __user *) -1L;
sp -= frame_size;
diff --git a/arch/x86/include/asm/GEN-for-each-reg.h b/arch/x86/include/asm/GEN-for-each-reg.h
index 1b07fb102c4e..07949102a08d 100644
--- a/arch/x86/include/asm/GEN-for-each-reg.h
+++ b/arch/x86/include/asm/GEN-for-each-reg.h
@@ -1,11 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * These are in machine order; things rely on that.
+ */
#ifdef CONFIG_64BIT
GEN(rax)
-GEN(rbx)
GEN(rcx)
GEN(rdx)
+GEN(rbx)
+GEN(rsp)
+GEN(rbp)
GEN(rsi)
GEN(rdi)
-GEN(rbp)
GEN(r8)
GEN(r9)
GEN(r10)
@@ -16,10 +21,11 @@ GEN(r14)
GEN(r15)
#else
GEN(eax)
-GEN(ebx)
GEN(ecx)
GEN(edx)
+GEN(ebx)
+GEN(esp)
+GEN(ebp)
GEN(esi)
GEN(edi)
-GEN(ebp)
#endif
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index a3c2315aca12..58eee6402832 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -75,6 +75,7 @@ extern int alternatives_patched;
extern void alternative_instructions(void);
extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
+extern void apply_retpolines(s32 *start, s32 *end);
struct module;
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 455066a06f60..00d1a400b7a1 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -24,7 +24,6 @@ extern int amd_set_subcaches(int, unsigned long);
extern int amd_smn_read(u16 node, u32 address, u32 *value);
extern int amd_smn_write(u16 node, u32 address, u32 value);
-extern int amd_df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo);
struct amd_l3_cache {
unsigned indices;
diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h
index 4cb726c71ed8..8f80de627c60 100644
--- a/arch/x86/include/asm/asm-prototypes.h
+++ b/arch/x86/include/asm/asm-prototypes.h
@@ -17,21 +17,3 @@
extern void cmpxchg8b_emu(void);
#endif
-#ifdef CONFIG_RETPOLINE
-
-#undef GEN
-#define GEN(reg) \
- extern asmlinkage void __x86_indirect_thunk_ ## reg (void);
-#include <asm/GEN-for-each-reg.h>
-
-#undef GEN
-#define GEN(reg) \
- extern asmlinkage void __x86_indirect_alt_call_ ## reg (void);
-#include <asm/GEN-for-each-reg.h>
-
-#undef GEN
-#define GEN(reg) \
- extern asmlinkage void __x86_indirect_alt_jmp_ ## reg (void);
-#include <asm/GEN-for-each-reg.h>
-
-#endif /* CONFIG_RETPOLINE */
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 3ad3da9a7d97..c878fed3056f 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -6,11 +6,13 @@
# define __ASM_FORM(x, ...) x,## __VA_ARGS__
# define __ASM_FORM_RAW(x, ...) x,## __VA_ARGS__
# define __ASM_FORM_COMMA(x, ...) x,## __VA_ARGS__,
+# define __ASM_REGPFX %
#else
#include <linux/stringify.h>
# define __ASM_FORM(x, ...) " " __stringify(x,##__VA_ARGS__) " "
# define __ASM_FORM_RAW(x, ...) __stringify(x,##__VA_ARGS__)
# define __ASM_FORM_COMMA(x, ...) " " __stringify(x,##__VA_ARGS__) ","
+# define __ASM_REGPFX %%
#endif
#define _ASM_BYTES(x, ...) __ASM_FORM(.byte x,##__VA_ARGS__ ;)
@@ -49,6 +51,9 @@
#define _ASM_SI __ASM_REG(si)
#define _ASM_DI __ASM_REG(di)
+/* Adds a (%rip) suffix on 64 bits only; for immediate memory references */
+#define _ASM_RIP(x) __ASM_SEL_RAW(x, x (__ASM_REGPFX rip))
+
#ifndef __x86_64__
/* 32 bit */
@@ -122,28 +127,19 @@
#ifdef __KERNEL__
+# include <asm/extable_fixup_types.h>
+
/* Exception table entry */
#ifdef __ASSEMBLY__
-# define _ASM_EXTABLE_HANDLE(from, to, handler) \
+
+# define _ASM_EXTABLE_TYPE(from, to, type) \
.pushsection "__ex_table","a" ; \
.balign 4 ; \
.long (from) - . ; \
.long (to) - . ; \
- .long (handler) - . ; \
+ .long type ; \
.popsection
-# define _ASM_EXTABLE(from, to) \
- _ASM_EXTABLE_HANDLE(from, to, ex_handler_default)
-
-# define _ASM_EXTABLE_UA(from, to) \
- _ASM_EXTABLE_HANDLE(from, to, ex_handler_uaccess)
-
-# define _ASM_EXTABLE_CPY(from, to) \
- _ASM_EXTABLE_HANDLE(from, to, ex_handler_copy)
-
-# define _ASM_EXTABLE_FAULT(from, to) \
- _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault)
-
# ifdef CONFIG_KPROBES
# define _ASM_NOKPROBE(entry) \
.pushsection "_kprobe_blacklist","aw" ; \
@@ -155,26 +151,51 @@
# endif
#else /* ! __ASSEMBLY__ */
-# define _EXPAND_EXTABLE_HANDLE(x) #x
-# define _ASM_EXTABLE_HANDLE(from, to, handler) \
+
+# define DEFINE_EXTABLE_TYPE_REG \
+ ".macro extable_type_reg type:req reg:req\n" \
+ ".set found, 0\n" \
+ ".set regnr, 0\n" \
+ ".irp rs,rax,rcx,rdx,rbx,rsp,rbp,rsi,rdi,r8,r9,r10,r11,r12,r13,r14,r15\n" \
+ ".ifc \\reg, %%\\rs\n" \
+ ".set found, found+1\n" \
+ ".long \\type + (regnr << 8)\n" \
+ ".endif\n" \
+ ".set regnr, regnr+1\n" \
+ ".endr\n" \
+ ".set regnr, 0\n" \
+ ".irp rs,eax,ecx,edx,ebx,esp,ebp,esi,edi,r8d,r9d,r10d,r11d,r12d,r13d,r14d,r15d\n" \
+ ".ifc \\reg, %%\\rs\n" \
+ ".set found, found+1\n" \
+ ".long \\type + (regnr << 8)\n" \
+ ".endif\n" \
+ ".set regnr, regnr+1\n" \
+ ".endr\n" \
+ ".if (found != 1)\n" \
+ ".error \"extable_type_reg: bad register argument\"\n" \
+ ".endif\n" \
+ ".endm\n"
+
+# define UNDEFINE_EXTABLE_TYPE_REG \
+ ".purgem extable_type_reg\n"
+
+# define _ASM_EXTABLE_TYPE(from, to, type) \
" .pushsection \"__ex_table\",\"a\"\n" \
" .balign 4\n" \
" .long (" #from ") - .\n" \
" .long (" #to ") - .\n" \
- " .long (" _EXPAND_EXTABLE_HANDLE(handler) ") - .\n" \
+ " .long " __stringify(type) " \n" \
" .popsection\n"
-# define _ASM_EXTABLE(from, to) \
- _ASM_EXTABLE_HANDLE(from, to, ex_handler_default)
-
-# define _ASM_EXTABLE_UA(from, to) \
- _ASM_EXTABLE_HANDLE(from, to, ex_handler_uaccess)
-
-# define _ASM_EXTABLE_CPY(from, to) \
- _ASM_EXTABLE_HANDLE(from, to, ex_handler_copy)
-
-# define _ASM_EXTABLE_FAULT(from, to) \
- _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault)
+# define _ASM_EXTABLE_TYPE_REG(from, to, type, reg) \
+ " .pushsection \"__ex_table\",\"a\"\n" \
+ " .balign 4\n" \
+ " .long (" #from ") - .\n" \
+ " .long (" #to ") - .\n" \
+ DEFINE_EXTABLE_TYPE_REG \
+ "extable_type_reg reg=" __stringify(reg) ", type=" __stringify(type) " \n"\
+ UNDEFINE_EXTABLE_TYPE_REG \
+ " .popsection\n"
/* For C file, we already have NOKPROBE_SYMBOL macro */
@@ -188,6 +209,17 @@ register unsigned long current_stack_pointer asm(_ASM_SP);
#define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer)
#endif /* __ASSEMBLY__ */
-#endif /* __KERNEL__ */
+#define _ASM_EXTABLE(from, to) \
+ _ASM_EXTABLE_TYPE(from, to, EX_TYPE_DEFAULT)
+#define _ASM_EXTABLE_UA(from, to) \
+ _ASM_EXTABLE_TYPE(from, to, EX_TYPE_UACCESS)
+
+#define _ASM_EXTABLE_CPY(from, to) \
+ _ASM_EXTABLE_TYPE(from, to, EX_TYPE_COPY)
+
+#define _ASM_EXTABLE_FAULT(from, to) \
+ _ASM_EXTABLE_TYPE(from, to, EX_TYPE_FAULT)
+
+#endif /* __KERNEL__ */
#endif /* _ASM_X86_ASM_H */
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 3ba772a69cc8..35389b2af88e 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -19,9 +19,9 @@
#define wmb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "sfence", \
X86_FEATURE_XMM2) ::: "memory", "cc")
#else
-#define mb() asm volatile("mfence":::"memory")
-#define rmb() asm volatile("lfence":::"memory")
-#define wmb() asm volatile("sfence" ::: "memory")
+#define __mb() asm volatile("mfence":::"memory")
+#define __rmb() asm volatile("lfence":::"memory")
+#define __wmb() asm volatile("sfence" ::: "memory")
#endif
/**
@@ -51,8 +51,8 @@ static inline unsigned long array_index_mask_nospec(unsigned long index,
/* Prevent speculative execution past this barrier. */
#define barrier_nospec() alternative("", "lfence", X86_FEATURE_LFENCE_RDTSC)
-#define dma_rmb() barrier()
-#define dma_wmb() barrier()
+#define __dma_rmb() barrier()
+#define __dma_wmb() barrier()
#define __smp_mb() asm volatile("lock; addl $0,-4(%%" _ASM_SP ")" ::: "memory", "cc")
diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
index 3d52b094850a..dd5ea1bdf04c 100644
--- a/arch/x86/include/asm/cpu_entry_area.h
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -10,6 +10,12 @@
#ifdef CONFIG_X86_64
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+#define VC_EXCEPTION_STKSZ EXCEPTION_STKSZ
+#else
+#define VC_EXCEPTION_STKSZ 0
+#endif
+
/* Macro to enforce the same ordering and stack sizes */
#define ESTACKS_MEMBERS(guardsize, optional_stack_size) \
char DF_stack_guard[guardsize]; \
@@ -28,7 +34,7 @@
/* The exception stacks' physical storage. No guard pages required */
struct exception_stacks {
- ESTACKS_MEMBERS(0, 0)
+ ESTACKS_MEMBERS(0, VC_EXCEPTION_STKSZ)
};
/* The effective cpu entry area mapping with guard pages. */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 16a51e7288d5..1261842d006c 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -173,20 +173,25 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
* means that the boot_cpu_has() variant is already fast enough for the
* majority of cases and you should stick to using it as it is generally
* only two instructions: a RIP-relative MOV and a TEST.
+ *
+ * Do not use an "m" constraint for [cap_byte] here: gcc doesn't know
+ * that this is only used on a fallback path and will sometimes cause
+ * it to manifest the address of boot_cpu_data in a register, fouling
+ * the mainline (post-initialization) code.
*/
static __always_inline bool _static_cpu_has(u16 bit)
{
asm_volatile_goto(
ALTERNATIVE_TERNARY("jmp 6f", %P[feature], "", "jmp %l[t_no]")
- ".section .altinstr_aux,\"ax\"\n"
+ ".pushsection .altinstr_aux,\"ax\"\n"
"6:\n"
- " testb %[bitnum],%[cap_byte]\n"
+ " testb %[bitnum]," _ASM_RIP(%P[cap_byte]) "\n"
" jnz %l[t_yes]\n"
" jmp %l[t_no]\n"
- ".previous\n"
+ ".popsection\n"
: : [feature] "i" (bit),
[bitnum] "i" (1 << (bit & 7)),
- [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3])
+ [cap_byte] "i" (&((const char *)boot_cpu_data.x86_capability)[bit >> 3])
: : t_yes, t_no);
t_yes:
return true;
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index d0ce5cfd3ac1..6db4e2932b3d 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -277,6 +277,7 @@
#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC instruction */
#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 instruction */
#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS instructions */
+#define X86_FEATURE_XFD (10*32+ 4) /* "" eXtended Feature Disabling */
/*
* Extended auxiliary flags: Linux defined - for features scattered in various
@@ -298,6 +299,9 @@
/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */
#define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */
+#define X86_FEATURE_AMX_BF16 (18*32+22) /* AMX bf16 Support */
+#define X86_FEATURE_AMX_TILE (18*32+24) /* AMX tile Support */
+#define X86_FEATURE_AMX_INT8 (18*32+25) /* AMX int8 Support */
/* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */
#define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */
@@ -313,6 +317,7 @@
#define X86_FEATURE_AMD_SSBD (13*32+24) /* "" Speculative Store Bypass Disable */
#define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */
#define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */
+#define X86_FEATURE_CPPC (13*32+27) /* Collaborative Processor Performance Control */
/* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */
#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 4d0b126835b8..03cb12775043 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -46,13 +46,14 @@ extern unsigned long efi_mixed_mode_stack_pa;
#define __efi_nargs(...) __efi_nargs_(__VA_ARGS__)
#define __efi_nargs_(...) __efi_nargs__(0, ##__VA_ARGS__, \
+ __efi_arg_sentinel(9), __efi_arg_sentinel(8), \
__efi_arg_sentinel(7), __efi_arg_sentinel(6), \
__efi_arg_sentinel(5), __efi_arg_sentinel(4), \
__efi_arg_sentinel(3), __efi_arg_sentinel(2), \
__efi_arg_sentinel(1), __efi_arg_sentinel(0))
-#define __efi_nargs__(_0, _1, _2, _3, _4, _5, _6, _7, n, ...) \
+#define __efi_nargs__(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, n, ...) \
__take_second_arg(n, \
- ({ BUILD_BUG_ON_MSG(1, "__efi_nargs limit exceeded"); 8; }))
+ ({ BUILD_BUG_ON_MSG(1, "__efi_nargs limit exceeded"); 10; }))
#define __efi_arg_sentinel(n) , n
/*
@@ -176,8 +177,9 @@ extern u64 efi_setup;
extern efi_status_t __efi64_thunk(u32, ...);
#define efi64_thunk(...) ({ \
- __efi_nargs_check(efi64_thunk, 6, __VA_ARGS__); \
- __efi64_thunk(__VA_ARGS__); \
+ u64 __pad[3]; /* must have space for 3 args on the stack */ \
+ __efi_nargs_check(efi64_thunk, 9, __VA_ARGS__); \
+ __efi64_thunk(__VA_ARGS__, __pad); \
})
static inline bool efi_is_mixed(void)
@@ -197,8 +199,6 @@ static inline bool efi_runtime_supported(void)
extern void parse_efi_setup(u64 phys_addr, u32 data_len);
-extern void efifb_setup_from_dmi(struct screen_info *si, const char *opt);
-
extern void efi_thunk_runtime_setup(void);
efi_status_t efi_set_virtual_address_map(unsigned long memory_map_size,
unsigned long descriptor_size,
@@ -308,6 +308,10 @@ static inline u32 efi64_convert_status(efi_status_t status)
#define __efi64_argmap_query_mode(gop, mode, size, info) \
((gop), (mode), efi64_zero_upper(size), efi64_zero_upper(info))
+/* TCG2 protocol */
+#define __efi64_argmap_hash_log_extend_event(prot, fl, addr, size, ev) \
+ ((prot), (fl), 0ULL, (u64)(addr), 0ULL, (u64)(size), 0ULL, ev)
+
/*
* The macros below handle the plumbing for the argument mapping. To add a
* mapping for a specific EFI method, simply define a macro
diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h
index 14ebd2196569..43184640b579 100644
--- a/arch/x86/include/asm/entry-common.h
+++ b/arch/x86/include/asm/entry-common.h
@@ -25,7 +25,7 @@ static __always_inline void arch_check_user_regs(struct pt_regs *regs)
* For !SMAP hardware we patch out CLAC on entry.
*/
if (boot_cpu_has(X86_FEATURE_SMAP) ||
- (IS_ENABLED(CONFIG_64_BIT) && boot_cpu_has(X86_FEATURE_XENPV)))
+ (IS_ENABLED(CONFIG_64BIT) && boot_cpu_has(X86_FEATURE_XENPV)))
mask |= X86_EFLAGS_AC;
WARN_ON_ONCE(flags & mask);
diff --git a/arch/x86/include/asm/extable.h b/arch/x86/include/asm/extable.h
index 1f0cbc52937c..155c991ba95e 100644
--- a/arch/x86/include/asm/extable.h
+++ b/arch/x86/include/asm/extable.h
@@ -1,12 +1,18 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_EXTABLE_H
#define _ASM_X86_EXTABLE_H
+
+#include <asm/extable_fixup_types.h>
+
/*
- * The exception table consists of triples of addresses relative to the
- * exception table entry itself. The first address is of an instruction
- * that is allowed to fault, the second is the target at which the program
- * should continue. The third is a handler function to deal with the fault
- * caused by the instruction in the first field.
+ * The exception table consists of two addresses relative to the
+ * exception table entry itself and a type selector field.
+ *
+ * The first address is of an instruction that is allowed to fault, the
+ * second is the target at which the program should continue.
+ *
+ * The type entry is used by fixup_exception() to select the handler to
+ * deal with the fault caused by the instruction in the first field.
*
* All the routines below use bits of fixup code that are out of line
* with the main instruction path. This means when everything is well,
@@ -15,7 +21,7 @@
*/
struct exception_table_entry {
- int insn, fixup, handler;
+ int insn, fixup, data;
};
struct pt_regs;
@@ -25,21 +31,27 @@ struct pt_regs;
do { \
(a)->fixup = (b)->fixup + (delta); \
(b)->fixup = (tmp).fixup - (delta); \
- (a)->handler = (b)->handler + (delta); \
- (b)->handler = (tmp).handler - (delta); \
+ (a)->data = (b)->data; \
+ (b)->data = (tmp).data; \
} while (0)
-enum handler_type {
- EX_HANDLER_NONE,
- EX_HANDLER_FAULT,
- EX_HANDLER_UACCESS,
- EX_HANDLER_OTHER
-};
-
extern int fixup_exception(struct pt_regs *regs, int trapnr,
unsigned long error_code, unsigned long fault_addr);
extern int fixup_bug(struct pt_regs *regs, int trapnr);
-extern enum handler_type ex_get_fault_handler_type(unsigned long ip);
+extern int ex_get_fixup_type(unsigned long ip);
extern void early_fixup_exception(struct pt_regs *regs, int trapnr);
+#ifdef CONFIG_X86_MCE
+extern void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr);
+#else
+static inline void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr) { }
+#endif
+
+#if defined(CONFIG_BPF_JIT) && defined(CONFIG_X86_64)
+bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs);
+#else
+static inline bool ex_handler_bpf(const struct exception_table_entry *x,
+ struct pt_regs *regs) { return false; }
+#endif
+
#endif
diff --git a/arch/x86/include/asm/extable_fixup_types.h b/arch/x86/include/asm/extable_fixup_types.h
new file mode 100644
index 000000000000..503622627400
--- /dev/null
+++ b/arch/x86/include/asm/extable_fixup_types.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_EXTABLE_FIXUP_TYPES_H
+#define _ASM_X86_EXTABLE_FIXUP_TYPES_H
+
+/*
+ * Our IMM is signed, as such it must live at the top end of the word. Also,
+ * since C99 hex constants are of ambigious type, force cast the mask to 'int'
+ * so that FIELD_GET() will DTRT and sign extend the value when it extracts it.
+ */
+#define EX_DATA_TYPE_MASK ((int)0x000000FF)
+#define EX_DATA_REG_MASK ((int)0x00000F00)
+#define EX_DATA_FLAG_MASK ((int)0x0000F000)
+#define EX_DATA_IMM_MASK ((int)0xFFFF0000)
+
+#define EX_DATA_REG_SHIFT 8
+#define EX_DATA_FLAG_SHIFT 12
+#define EX_DATA_IMM_SHIFT 16
+
+#define EX_DATA_REG(reg) ((reg) << EX_DATA_REG_SHIFT)
+#define EX_DATA_FLAG(flag) ((flag) << EX_DATA_FLAG_SHIFT)
+#define EX_DATA_IMM(imm) ((imm) << EX_DATA_IMM_SHIFT)
+
+/* segment regs */
+#define EX_REG_DS EX_DATA_REG(8)
+#define EX_REG_ES EX_DATA_REG(9)
+#define EX_REG_FS EX_DATA_REG(10)
+#define EX_REG_GS EX_DATA_REG(11)
+
+/* flags */
+#define EX_FLAG_CLEAR_AX EX_DATA_FLAG(1)
+#define EX_FLAG_CLEAR_DX EX_DATA_FLAG(2)
+#define EX_FLAG_CLEAR_AX_DX EX_DATA_FLAG(3)
+
+/* types */
+#define EX_TYPE_NONE 0
+#define EX_TYPE_DEFAULT 1
+#define EX_TYPE_FAULT 2
+#define EX_TYPE_UACCESS 3
+#define EX_TYPE_COPY 4
+#define EX_TYPE_CLEAR_FS 5
+#define EX_TYPE_FPU_RESTORE 6
+#define EX_TYPE_BPF 7
+#define EX_TYPE_WRMSR 8
+#define EX_TYPE_RDMSR 9
+#define EX_TYPE_WRMSR_SAFE 10 /* reg := -EIO */
+#define EX_TYPE_RDMSR_SAFE 11 /* reg := -EIO */
+#define EX_TYPE_WRMSR_IN_MCE 12
+#define EX_TYPE_RDMSR_IN_MCE 13
+#define EX_TYPE_DEFAULT_MCE_SAFE 14
+#define EX_TYPE_FAULT_MCE_SAFE 15
+
+#define EX_TYPE_POP_REG 16 /* sp += sizeof(long) */
+#define EX_TYPE_POP_ZERO (EX_TYPE_POP_REG | EX_DATA_IMM(0))
+
+#define EX_TYPE_IMM_REG 17 /* reg := (long)imm */
+#define EX_TYPE_EFAULT_REG (EX_TYPE_IMM_REG | EX_DATA_IMM(-EFAULT))
+#define EX_TYPE_ZERO_REG (EX_TYPE_IMM_REG | EX_DATA_IMM(0))
+#define EX_TYPE_ONE_REG (EX_TYPE_IMM_REG | EX_DATA_IMM(1))
+
+#define EX_TYPE_FAULT_SGX 18
+
+#define EX_TYPE_UCOPY_LEN 19 /* cx := reg + imm*cx */
+#define EX_TYPE_UCOPY_LEN1 (EX_TYPE_UCOPY_LEN | EX_DATA_IMM(1))
+#define EX_TYPE_UCOPY_LEN4 (EX_TYPE_UCOPY_LEN | EX_DATA_IMM(4))
+#define EX_TYPE_UCOPY_LEN8 (EX_TYPE_UCOPY_LEN | EX_DATA_IMM(8))
+
+#endif
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index 23bef08a8388..c83b3020350a 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -12,6 +12,8 @@
#define _ASM_X86_FPU_API_H
#include <linux/bottom_half.h>
+#include <asm/fpu/types.h>
+
/*
* Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It
* disables preemption so be careful if you intend to use it for long periods
@@ -48,9 +50,9 @@ static inline void kernel_fpu_begin(void)
}
/*
- * Use fpregs_lock() while editing CPU's FPU registers or fpu->state.
+ * Use fpregs_lock() while editing CPU's FPU registers or fpu->fpstate.
* A context switch will (and softirq might) save CPU's FPU registers to
- * fpu->state and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in
+ * fpu->fpstate.regs and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in
* a random state.
*
* local_bh_disable() protects against both preemption and soft interrupts
@@ -100,12 +102,67 @@ extern void switch_fpu_return(void);
*/
extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name);
-/*
- * Tasks that are not using SVA have mm->pasid set to zero to note that they
- * will not have the valid bit set in MSR_IA32_PASID while they are running.
- */
-#define PASID_DISABLED 0
+/* Trap handling */
+extern int fpu__exception_code(struct fpu *fpu, int trap_nr);
+extern void fpu_sync_fpstate(struct fpu *fpu);
+extern void fpu_reset_from_exception_fixup(void);
+
+/* Boot, hotplug and resume */
+extern void fpu__init_cpu(void);
+extern void fpu__init_system(struct cpuinfo_x86 *c);
+extern void fpu__init_check_bugs(void);
+extern void fpu__resume_cpu(void);
+
+#ifdef CONFIG_MATH_EMULATION
+extern void fpstate_init_soft(struct swregs_state *soft);
+#else
+static inline void fpstate_init_soft(struct swregs_state *soft) {}
+#endif
+
+/* State tracking */
+DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);
+
+/* Process cleanup */
+#ifdef CONFIG_X86_64
+extern void fpstate_free(struct fpu *fpu);
+#else
+static inline void fpstate_free(struct fpu *fpu) { }
+#endif
+
+/* fpstate-related functions which are exported to KVM */
+extern void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature);
+
+extern u64 xstate_get_guest_group_perm(void);
+
+/* KVM specific functions */
+extern bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu);
+extern void fpu_free_guest_fpstate(struct fpu_guest *gfpu);
+extern int fpu_swap_kvm_fpstate(struct fpu_guest *gfpu, bool enter_guest);
+extern int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures);
+
+#ifdef CONFIG_X86_64
+extern void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd);
+extern void fpu_sync_guest_vmexit_xfd_state(void);
+#else
+static inline void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) { }
+static inline void fpu_sync_guest_vmexit_xfd_state(void) { }
+#endif
+
+extern void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, unsigned int size, u32 pkru);
+extern int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, u64 xcr0, u32 *vpkru);
+
+static inline void fpstate_set_confidential(struct fpu_guest *gfpu)
+{
+ gfpu->fpstate->is_confidential = true;
+}
+
+static inline bool fpstate_is_confidential(struct fpu_guest *gfpu)
+{
+ return gfpu->fpstate->is_confidential;
+}
-static inline void update_pasid(void) { }
+/* prctl */
+struct task_struct;
+extern long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2);
#endif /* _ASM_X86_FPU_API_H */
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 5a18694a89b2..e69de29bb2d1 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -1,540 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 1994 Linus Torvalds
- *
- * Pentium III FXSR, SSE support
- * General FPU state handling cleanups
- * Gareth Hughes <gareth@valinux.com>, May 2000
- * x86-64 work by Andi Kleen 2002
- */
-
-#ifndef _ASM_X86_FPU_INTERNAL_H
-#define _ASM_X86_FPU_INTERNAL_H
-
-#include <linux/compat.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
-
-#include <asm/user.h>
-#include <asm/fpu/api.h>
-#include <asm/fpu/xstate.h>
-#include <asm/fpu/xcr.h>
-#include <asm/cpufeature.h>
-#include <asm/trace/fpu.h>
-
-/*
- * High level FPU state handling functions:
- */
-extern int fpu__restore_sig(void __user *buf, int ia32_frame);
-extern void fpu__drop(struct fpu *fpu);
-extern void fpu__clear_user_states(struct fpu *fpu);
-extern int fpu__exception_code(struct fpu *fpu, int trap_nr);
-
-extern void fpu_sync_fpstate(struct fpu *fpu);
-
-/* Clone and exit operations */
-extern int fpu_clone(struct task_struct *dst);
-extern void fpu_flush_thread(void);
-
-/*
- * Boot time FPU initialization functions:
- */
-extern void fpu__init_cpu(void);
-extern void fpu__init_system_xstate(void);
-extern void fpu__init_cpu_xstate(void);
-extern void fpu__init_system(struct cpuinfo_x86 *c);
-extern void fpu__init_check_bugs(void);
-extern void fpu__resume_cpu(void);
-
-/*
- * Debugging facility:
- */
-#ifdef CONFIG_X86_DEBUG_FPU
-# define WARN_ON_FPU(x) WARN_ON_ONCE(x)
-#else
-# define WARN_ON_FPU(x) ({ (void)(x); 0; })
-#endif
-
-/*
- * FPU related CPU feature flag helper routines:
- */
-static __always_inline __pure bool use_xsaveopt(void)
-{
- return static_cpu_has(X86_FEATURE_XSAVEOPT);
-}
-
-static __always_inline __pure bool use_xsave(void)
-{
- return static_cpu_has(X86_FEATURE_XSAVE);
-}
-
-static __always_inline __pure bool use_fxsr(void)
-{
- return static_cpu_has(X86_FEATURE_FXSR);
-}
-
-/*
- * fpstate handling functions:
- */
-
-extern union fpregs_state init_fpstate;
-
-extern void fpstate_init(union fpregs_state *state);
-#ifdef CONFIG_MATH_EMULATION
-extern void fpstate_init_soft(struct swregs_state *soft);
-#else
-static inline void fpstate_init_soft(struct swregs_state *soft) {}
-#endif
-extern void save_fpregs_to_fpstate(struct fpu *fpu);
-
-/* Returns 0 or the negated trap number, which results in -EFAULT for #PF */
-#define user_insn(insn, output, input...) \
-({ \
- int err; \
- \
- might_fault(); \
- \
- asm volatile(ASM_STAC "\n" \
- "1: " #insn "\n" \
- "2: " ASM_CLAC "\n" \
- ".section .fixup,\"ax\"\n" \
- "3: negl %%eax\n" \
- " jmp 2b\n" \
- ".previous\n" \
- _ASM_EXTABLE_FAULT(1b, 3b) \
- : [err] "=a" (err), output \
- : "0"(0), input); \
- err; \
-})
-
-#define kernel_insn_err(insn, output, input...) \
-({ \
- int err; \
- asm volatile("1:" #insn "\n\t" \
- "2:\n" \
- ".section .fixup,\"ax\"\n" \
- "3: movl $-1,%[err]\n" \
- " jmp 2b\n" \
- ".previous\n" \
- _ASM_EXTABLE(1b, 3b) \
- : [err] "=r" (err), output \
- : "0"(0), input); \
- err; \
-})
-
-#define kernel_insn(insn, output, input...) \
- asm volatile("1:" #insn "\n\t" \
- "2:\n" \
- _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_fprestore) \
- : output : input)
-
-static inline int fnsave_to_user_sigframe(struct fregs_state __user *fx)
-{
- return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx));
-}
-
-static inline int fxsave_to_user_sigframe(struct fxregs_state __user *fx)
-{
- if (IS_ENABLED(CONFIG_X86_32))
- return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx));
- else
- return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx));
-
-}
-
-static inline void fxrstor(struct fxregs_state *fx)
-{
- if (IS_ENABLED(CONFIG_X86_32))
- kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
- else
- kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
-}
-
-static inline int fxrstor_safe(struct fxregs_state *fx)
-{
- if (IS_ENABLED(CONFIG_X86_32))
- return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
- else
- return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
-}
-
-static inline int fxrstor_from_user_sigframe(struct fxregs_state __user *fx)
-{
- if (IS_ENABLED(CONFIG_X86_32))
- return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
- else
- return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
-}
-
-static inline void frstor(struct fregs_state *fx)
-{
- kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
-}
-
-static inline int frstor_safe(struct fregs_state *fx)
-{
- return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
-}
-
-static inline int frstor_from_user_sigframe(struct fregs_state __user *fx)
-{
- return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
-}
-
-static inline void fxsave(struct fxregs_state *fx)
-{
- if (IS_ENABLED(CONFIG_X86_32))
- asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx));
- else
- asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx));
-}
-
-/* These macros all use (%edi)/(%rdi) as the single memory argument. */
-#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27"
-#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37"
-#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f"
-#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f"
-#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f"
-
-/*
- * After this @err contains 0 on success or the negated trap number when
- * the operation raises an exception. For faults this results in -EFAULT.
- */
-#define XSTATE_OP(op, st, lmask, hmask, err) \
- asm volatile("1:" op "\n\t" \
- "xor %[err], %[err]\n" \
- "2:\n\t" \
- ".pushsection .fixup,\"ax\"\n\t" \
- "3: negl %%eax\n\t" \
- "jmp 2b\n\t" \
- ".popsection\n\t" \
- _ASM_EXTABLE_FAULT(1b, 3b) \
- : [err] "=a" (err) \
- : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
- : "memory")
-
-/*
- * If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact
- * format and supervisor states in addition to modified optimization in
- * XSAVEOPT.
- *
- * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT
- * supports modified optimization which is not supported by XSAVE.
- *
- * We use XSAVE as a fallback.
- *
- * The 661 label is defined in the ALTERNATIVE* macros as the address of the
- * original instruction which gets replaced. We need to use it here as the
- * address of the instruction where we might get an exception at.
- */
-#define XSTATE_XSAVE(st, lmask, hmask, err) \
- asm volatile(ALTERNATIVE_2(XSAVE, \
- XSAVEOPT, X86_FEATURE_XSAVEOPT, \
- XSAVES, X86_FEATURE_XSAVES) \
- "\n" \
- "xor %[err], %[err]\n" \
- "3:\n" \
- ".pushsection .fixup,\"ax\"\n" \
- "4: movl $-2, %[err]\n" \
- "jmp 3b\n" \
- ".popsection\n" \
- _ASM_EXTABLE(661b, 4b) \
- : [err] "=r" (err) \
- : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
- : "memory")
-
-/*
- * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact
- * XSAVE area format.
- */
-#define XSTATE_XRESTORE(st, lmask, hmask) \
- asm volatile(ALTERNATIVE(XRSTOR, \
- XRSTORS, X86_FEATURE_XSAVES) \
- "\n" \
- "3:\n" \
- _ASM_EXTABLE_HANDLE(661b, 3b, ex_handler_fprestore)\
- : \
- : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
- : "memory")
-
-/*
- * This function is called only during boot time when x86 caps are not set
- * up and alternative can not be used yet.
- */
-static inline void os_xrstor_booting(struct xregs_state *xstate)
-{
- u64 mask = xfeatures_mask_fpstate();
- u32 lmask = mask;
- u32 hmask = mask >> 32;
- int err;
-
- WARN_ON(system_state != SYSTEM_BOOTING);
-
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
- else
- XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
-
- /*
- * We should never fault when copying from a kernel buffer, and the FPU
- * state we set at boot time should be valid.
- */
- WARN_ON_FPU(err);
-}
-
-/*
- * Save processor xstate to xsave area.
- *
- * Uses either XSAVE or XSAVEOPT or XSAVES depending on the CPU features
- * and command line options. The choice is permanent until the next reboot.
- */
-static inline void os_xsave(struct xregs_state *xstate)
-{
- u64 mask = xfeatures_mask_all;
- u32 lmask = mask;
- u32 hmask = mask >> 32;
- int err;
-
- WARN_ON_FPU(!alternatives_patched);
-
- XSTATE_XSAVE(xstate, lmask, hmask, err);
-
- /* We should never fault when copying to a kernel buffer: */
- WARN_ON_FPU(err);
-}
-
-/*
- * Restore processor xstate from xsave area.
- *
- * Uses XRSTORS when XSAVES is used, XRSTOR otherwise.
- */
-static inline void os_xrstor(struct xregs_state *xstate, u64 mask)
-{
- u32 lmask = mask;
- u32 hmask = mask >> 32;
-
- XSTATE_XRESTORE(xstate, lmask, hmask);
-}
-
-/*
- * Save xstate to user space xsave area.
- *
- * We don't use modified optimization because xrstor/xrstors might track
- * a different application.
- *
- * We don't use compacted format xsave area for
- * backward compatibility for old applications which don't understand
- * compacted format of xsave area.
- */
-static inline int xsave_to_user_sigframe(struct xregs_state __user *buf)
-{
- /*
- * Include the features which are not xsaved/rstored by the kernel
- * internally, e.g. PKRU. That's user space ABI and also required
- * to allow the signal handler to modify PKRU.
- */
- u64 mask = xfeatures_mask_uabi();
- u32 lmask = mask;
- u32 hmask = mask >> 32;
- int err;
-
- /*
- * Clear the xsave header first, so that reserved fields are
- * initialized to zero.
- */
- err = __clear_user(&buf->header, sizeof(buf->header));
- if (unlikely(err))
- return -EFAULT;
-
- stac();
- XSTATE_OP(XSAVE, buf, lmask, hmask, err);
- clac();
-
- return err;
-}
-
-/*
- * Restore xstate from user space xsave area.
- */
-static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 mask)
-{
- struct xregs_state *xstate = ((__force struct xregs_state *)buf);
- u32 lmask = mask;
- u32 hmask = mask >> 32;
- int err;
-
- stac();
- XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
- clac();
-
- return err;
-}
-
-/*
- * Restore xstate from kernel space xsave area, return an error code instead of
- * an exception.
- */
-static inline int os_xrstor_safe(struct xregs_state *xstate, u64 mask)
-{
- u32 lmask = mask;
- u32 hmask = mask >> 32;
- int err;
-
- if (cpu_feature_enabled(X86_FEATURE_XSAVES))
- XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
- else
- XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
-
- return err;
-}
-
-extern void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask);
-
-static inline void restore_fpregs_from_fpstate(union fpregs_state *fpstate)
-{
- __restore_fpregs_from_fpstate(fpstate, xfeatures_mask_fpstate());
-}
-
-extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size);
-
-/*
- * FPU context switch related helper methods:
- */
-
-DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);
-
-/*
- * The in-register FPU state for an FPU context on a CPU is assumed to be
- * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx
- * matches the FPU.
- *
- * If the FPU register state is valid, the kernel can skip restoring the
- * FPU state from memory.
- *
- * Any code that clobbers the FPU registers or updates the in-memory
- * FPU state for a task MUST let the rest of the kernel know that the
- * FPU registers are no longer valid for this task.
- *
- * Either one of these invalidation functions is enough. Invalidate
- * a resource you control: CPU if using the CPU for something else
- * (with preemption disabled), FPU for the current task, or a task that
- * is prevented from running by the current task.
- */
-static inline void __cpu_invalidate_fpregs_state(void)
-{
- __this_cpu_write(fpu_fpregs_owner_ctx, NULL);
-}
-
-static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu)
-{
- fpu->last_cpu = -1;
-}
-
-static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu)
-{
- return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu;
-}
-
-/*
- * These generally need preemption protection to work,
- * do try to avoid using these on their own:
- */
-static inline void fpregs_deactivate(struct fpu *fpu)
-{
- this_cpu_write(fpu_fpregs_owner_ctx, NULL);
- trace_x86_fpu_regs_deactivated(fpu);
-}
-
-static inline void fpregs_activate(struct fpu *fpu)
-{
- this_cpu_write(fpu_fpregs_owner_ctx, fpu);
- trace_x86_fpu_regs_activated(fpu);
-}
-
-/* Internal helper for switch_fpu_return() and signal frame setup */
-static inline void fpregs_restore_userregs(void)
-{
- struct fpu *fpu = &current->thread.fpu;
- int cpu = smp_processor_id();
-
- if (WARN_ON_ONCE(current->flags & PF_KTHREAD))
- return;
-
- if (!fpregs_state_valid(fpu, cpu)) {
- u64 mask;
-
- /*
- * This restores _all_ xstate which has not been
- * established yet.
- *
- * If PKRU is enabled, then the PKRU value is already
- * correct because it was either set in switch_to() or in
- * flush_thread(). So it is excluded because it might be
- * not up to date in current->thread.fpu.xsave state.
- */
- mask = xfeatures_mask_restore_user() |
- xfeatures_mask_supervisor();
- __restore_fpregs_from_fpstate(&fpu->state, mask);
-
- fpregs_activate(fpu);
- fpu->last_cpu = cpu;
- }
- clear_thread_flag(TIF_NEED_FPU_LOAD);
-}
-
-/*
- * FPU state switching for scheduling.
- *
- * This is a two-stage process:
- *
- * - switch_fpu_prepare() saves the old state.
- * This is done within the context of the old process.
- *
- * - switch_fpu_finish() sets TIF_NEED_FPU_LOAD; the floating point state
- * will get loaded on return to userspace, or when the kernel needs it.
- *
- * If TIF_NEED_FPU_LOAD is cleared then the CPU's FPU registers
- * are saved in the current thread's FPU register state.
- *
- * If TIF_NEED_FPU_LOAD is set then CPU's FPU registers may not
- * hold current()'s FPU registers. It is required to load the
- * registers before returning to userland or using the content
- * otherwise.
- *
- * The FPU context is only stored/restored for a user task and
- * PF_KTHREAD is used to distinguish between kernel and user threads.
- */
-static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu)
-{
- if (static_cpu_has(X86_FEATURE_FPU) && !(current->flags & PF_KTHREAD)) {
- save_fpregs_to_fpstate(old_fpu);
- /*
- * The save operation preserved register state, so the
- * fpu_fpregs_owner_ctx is still @old_fpu. Store the
- * current CPU number in @old_fpu, so the next return
- * to user space can avoid the FPU register restore
- * when is returns on the same CPU and still owns the
- * context.
- */
- old_fpu->last_cpu = cpu;
-
- trace_x86_fpu_regs_deactivated(old_fpu);
- }
-}
-
-/*
- * Misc helper functions:
- */
-
-/*
- * Delay loading of the complete FPU state until the return to userland.
- * PKRU is handled separately.
- */
-static inline void switch_fpu_finish(struct fpu *new_fpu)
-{
- if (cpu_feature_enabled(X86_FEATURE_FPU))
- set_thread_flag(TIF_NEED_FPU_LOAD);
-}
-
-#endif /* _ASM_X86_FPU_INTERNAL_H */
diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h
new file mode 100644
index 000000000000..99a8820e8cc4
--- /dev/null
+++ b/arch/x86/include/asm/fpu/sched.h
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_FPU_SCHED_H
+#define _ASM_X86_FPU_SCHED_H
+
+#include <linux/sched.h>
+
+#include <asm/cpufeature.h>
+#include <asm/fpu/types.h>
+
+#include <asm/trace/fpu.h>
+
+extern void save_fpregs_to_fpstate(struct fpu *fpu);
+extern void fpu__drop(struct fpu *fpu);
+extern int fpu_clone(struct task_struct *dst, unsigned long clone_flags);
+extern void fpu_flush_thread(void);
+
+/*
+ * FPU state switching for scheduling.
+ *
+ * This is a two-stage process:
+ *
+ * - switch_fpu_prepare() saves the old state.
+ * This is done within the context of the old process.
+ *
+ * - switch_fpu_finish() sets TIF_NEED_FPU_LOAD; the floating point state
+ * will get loaded on return to userspace, or when the kernel needs it.
+ *
+ * If TIF_NEED_FPU_LOAD is cleared then the CPU's FPU registers
+ * are saved in the current thread's FPU register state.
+ *
+ * If TIF_NEED_FPU_LOAD is set then CPU's FPU registers may not
+ * hold current()'s FPU registers. It is required to load the
+ * registers before returning to userland or using the content
+ * otherwise.
+ *
+ * The FPU context is only stored/restored for a user task and
+ * PF_KTHREAD is used to distinguish between kernel and user threads.
+ */
+static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu)
+{
+ if (cpu_feature_enabled(X86_FEATURE_FPU) &&
+ !(current->flags & PF_KTHREAD)) {
+ save_fpregs_to_fpstate(old_fpu);
+ /*
+ * The save operation preserved register state, so the
+ * fpu_fpregs_owner_ctx is still @old_fpu. Store the
+ * current CPU number in @old_fpu, so the next return
+ * to user space can avoid the FPU register restore
+ * when is returns on the same CPU and still owns the
+ * context.
+ */
+ old_fpu->last_cpu = cpu;
+
+ trace_x86_fpu_regs_deactivated(old_fpu);
+ }
+}
+
+/*
+ * Delay loading of the complete FPU state until the return to userland.
+ * PKRU is handled separately.
+ */
+static inline void switch_fpu_finish(void)
+{
+ if (cpu_feature_enabled(X86_FEATURE_FPU))
+ set_thread_flag(TIF_NEED_FPU_LOAD);
+}
+
+#endif /* _ASM_X86_FPU_SCHED_H */
diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h
index 8b6631dffefd..e1c9df9102a5 100644
--- a/arch/x86/include/asm/fpu/signal.h
+++ b/arch/x86/include/asm/fpu/signal.h
@@ -5,6 +5,11 @@
#ifndef _ASM_X86_FPU_SIGNAL_H
#define _ASM_X86_FPU_SIGNAL_H
+#include <linux/compat.h>
+#include <linux/user.h>
+
+#include <asm/fpu/types.h>
+
#ifdef CONFIG_X86_64
# include <uapi/asm/sigcontext.h>
# include <asm/user32.h>
@@ -31,6 +36,9 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
unsigned long fpu__get_fpstate_size(void);
-extern void fpu__init_prepare_fx_sw_frame(void);
+extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size);
+extern void fpu__clear_user_states(struct fpu *fpu);
+extern bool fpu__restore_sig(void __user *buf, int ia32_frame);
+extern void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask);
#endif /* _ASM_X86_FPU_SIGNAL_H */
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index f5a38a5f3ae1..eb7cd1139d97 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -120,6 +120,9 @@ enum xfeature {
XFEATURE_RSRVD_COMP_13,
XFEATURE_RSRVD_COMP_14,
XFEATURE_LBR,
+ XFEATURE_RSRVD_COMP_16,
+ XFEATURE_XTILE_CFG,
+ XFEATURE_XTILE_DATA,
XFEATURE_MAX,
};
@@ -136,12 +139,21 @@ enum xfeature {
#define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU)
#define XFEATURE_MASK_PASID (1 << XFEATURE_PASID)
#define XFEATURE_MASK_LBR (1 << XFEATURE_LBR)
+#define XFEATURE_MASK_XTILE_CFG (1 << XFEATURE_XTILE_CFG)
+#define XFEATURE_MASK_XTILE_DATA (1 << XFEATURE_XTILE_DATA)
#define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE)
#define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK \
| XFEATURE_MASK_ZMM_Hi256 \
| XFEATURE_MASK_Hi16_ZMM)
+#ifdef CONFIG_X86_64
+# define XFEATURE_MASK_XTILE (XFEATURE_MASK_XTILE_DATA \
+ | XFEATURE_MASK_XTILE_CFG)
+#else
+# define XFEATURE_MASK_XTILE (0)
+#endif
+
#define FIRST_EXTENDED_XFEATURE XFEATURE_YMM
struct reg_128_bit {
@@ -153,6 +165,9 @@ struct reg_256_bit {
struct reg_512_bit {
u8 regbytes[512/8];
};
+struct reg_1024_byte {
+ u8 regbytes[1024];
+};
/*
* State component 2:
@@ -255,6 +270,23 @@ struct arch_lbr_state {
u64 ler_to;
u64 ler_info;
struct lbr_entry entries[];
+};
+
+/*
+ * State component 17: 64-byte tile configuration register.
+ */
+struct xtile_cfg {
+ u64 tcfg[8];
+} __packed;
+
+/*
+ * State component 18: 1KB tile data register.
+ * Each register represents 16 64-byte rows of the matrix
+ * data. But the number of registers depends on the actual
+ * implementation.
+ */
+struct xtile_data {
+ struct reg_1024_byte tmm;
} __packed;
/*
@@ -309,6 +341,93 @@ union fpregs_state {
u8 __padding[PAGE_SIZE];
};
+struct fpstate {
+ /* @kernel_size: The size of the kernel register image */
+ unsigned int size;
+
+ /* @user_size: The size in non-compacted UABI format */
+ unsigned int user_size;
+
+ /* @xfeatures: xfeatures for which the storage is sized */
+ u64 xfeatures;
+
+ /* @user_xfeatures: xfeatures valid in UABI buffers */
+ u64 user_xfeatures;
+
+ /* @xfd: xfeatures disabled to trap userspace use. */
+ u64 xfd;
+
+ /* @is_valloc: Indicator for dynamically allocated state */
+ unsigned int is_valloc : 1;
+
+ /* @is_guest: Indicator for guest state (KVM) */
+ unsigned int is_guest : 1;
+
+ /*
+ * @is_confidential: Indicator for KVM confidential mode.
+ * The FPU registers are restored by the
+ * vmentry firmware from encrypted guest
+ * memory. On vmexit the FPU registers are
+ * saved by firmware to encrypted guest memory
+ * and the registers are scrubbed before
+ * returning to the host. So there is no
+ * content which is worth saving and restoring.
+ * The fpstate has to be there so that
+ * preemption and softirq FPU usage works
+ * without special casing.
+ */
+ unsigned int is_confidential : 1;
+
+ /* @in_use: State is in use */
+ unsigned int in_use : 1;
+
+ /* @regs: The register state union for all supported formats */
+ union fpregs_state regs;
+
+ /* @regs is dynamically sized! Don't add anything after @regs! */
+} __aligned(64);
+
+#define FPU_GUEST_PERM_LOCKED BIT_ULL(63)
+
+struct fpu_state_perm {
+ /*
+ * @__state_perm:
+ *
+ * This bitmap indicates the permission for state components, which
+ * are available to a thread group. The permission prctl() sets the
+ * enabled state bits in thread_group_leader()->thread.fpu.
+ *
+ * All run time operations use the per thread information in the
+ * currently active fpu.fpstate which contains the xfeature masks
+ * and sizes for kernel and user space.
+ *
+ * This master permission field is only to be used when
+ * task.fpu.fpstate based checks fail to validate whether the task
+ * is allowed to expand it's xfeatures set which requires to
+ * allocate a larger sized fpstate buffer.
+ *
+ * Do not access this field directly. Use the provided helper
+ * function. Unlocked access is possible for quick checks.
+ */
+ u64 __state_perm;
+
+ /*
+ * @__state_size:
+ *
+ * The size required for @__state_perm. Only valid to access
+ * with sighand locked.
+ */
+ unsigned int __state_size;
+
+ /*
+ * @__user_state_size:
+ *
+ * The size required for @__state_perm user part. Only valid to
+ * access with sighand locked.
+ */
+ unsigned int __user_state_size;
+};
+
/*
* Highest level per task FPU state data structure that
* contains the FPU register state plus various FPU
@@ -337,19 +456,130 @@ struct fpu {
unsigned long avx512_timestamp;
/*
- * @state:
+ * @fpstate:
+ *
+ * Pointer to the active struct fpstate. Initialized to
+ * point at @__fpstate below.
+ */
+ struct fpstate *fpstate;
+
+ /*
+ * @__task_fpstate:
+ *
+ * Pointer to an inactive struct fpstate. Initialized to NULL. Is
+ * used only for KVM support to swap out the regular task fpstate.
+ */
+ struct fpstate *__task_fpstate;
+
+ /*
+ * @perm:
+ *
+ * Permission related information
+ */
+ struct fpu_state_perm perm;
+
+ /*
+ * @guest_perm:
+ *
+ * Permission related information for guest pseudo FPUs
+ */
+ struct fpu_state_perm guest_perm;
+
+ /*
+ * @__fpstate:
*
- * In-memory copy of all FPU registers that we save/restore
- * over context switches. If the task is using the FPU then
- * the registers in the FPU are more recent than this state
- * copy. If the task context-switches away then they get
- * saved here and represent the FPU state.
+ * Initial in-memory storage for FPU registers which are saved in
+ * context switch and when the kernel uses the FPU. The registers
+ * are restored from this storage on return to user space if they
+ * are not longer containing the tasks FPU register state.
*/
- union fpregs_state state;
+ struct fpstate __fpstate;
/*
- * WARNING: 'state' is dynamically-sized. Do not put
+ * WARNING: '__fpstate' is dynamically-sized. Do not put
* anything after it here.
*/
};
+/*
+ * Guest pseudo FPU container
+ */
+struct fpu_guest {
+ /*
+ * @xfeatures: xfeature bitmap of features which are
+ * currently enabled for the guest vCPU.
+ */
+ u64 xfeatures;
+
+ /*
+ * @perm: xfeature bitmap of features which are
+ * permitted to be enabled for the guest
+ * vCPU.
+ */
+ u64 perm;
+
+ /*
+ * @xfd_err: Save the guest value.
+ */
+ u64 xfd_err;
+
+ /*
+ * @uabi_size: Size required for save/restore
+ */
+ unsigned int uabi_size;
+
+ /*
+ * @fpstate: Pointer to the allocated guest fpstate
+ */
+ struct fpstate *fpstate;
+};
+
+/*
+ * FPU state configuration data. Initialized at boot time. Read only after init.
+ */
+struct fpu_state_config {
+ /*
+ * @max_size:
+ *
+ * The maximum size of the register state buffer. Includes all
+ * supported features except independent managed features.
+ */
+ unsigned int max_size;
+
+ /*
+ * @default_size:
+ *
+ * The default size of the register state buffer. Includes all
+ * supported features except independent managed features and
+ * features which have to be requested by user space before usage.
+ */
+ unsigned int default_size;
+
+ /*
+ * @max_features:
+ *
+ * The maximum supported features bitmap. Does not include
+ * independent managed features.
+ */
+ u64 max_features;
+
+ /*
+ * @default_features:
+ *
+ * The default supported features bitmap. Does not include
+ * independent managed features and features which have to
+ * be requested by user space before usage.
+ */
+ u64 default_features;
+ /*
+ * @legacy_features:
+ *
+ * Features which can be reported back to user space
+ * even without XSAVE support, i.e. legacy features FP + SSE
+ */
+ u64 legacy_features;
+};
+
+/* FPU state configuration information */
+extern struct fpu_state_config fpu_kernel_cfg, fpu_user_cfg;
+
#endif /* _ASM_X86_FPU_H */
diff --git a/arch/x86/include/asm/fpu/xcr.h b/arch/x86/include/asm/fpu/xcr.h
index 1c7ab8d95da5..9656a5bc6fea 100644
--- a/arch/x86/include/asm/fpu/xcr.h
+++ b/arch/x86/include/asm/fpu/xcr.h
@@ -2,18 +2,8 @@
#ifndef _ASM_X86_FPU_XCR_H
#define _ASM_X86_FPU_XCR_H
-/*
- * MXCSR and XCR definitions:
- */
-
-static inline void ldmxcsr(u32 mxcsr)
-{
- asm volatile("ldmxcsr %0" :: "m" (mxcsr));
-}
-
-extern unsigned int mxcsr_feature_mask;
-
#define XCR_XFEATURE_ENABLED_MASK 0x00000000
+#define XCR_XFEATURE_IN_USE_MASK 0x00000001
static inline u64 xgetbv(u32 index)
{
@@ -31,4 +21,15 @@ static inline void xsetbv(u32 index, u64 value)
asm volatile("xsetbv" :: "a" (eax), "d" (edx), "c" (index));
}
+/*
+ * Return a mask of xfeatures which are currently being tracked
+ * by the processor as being in the initial configuration.
+ *
+ * Callers should check X86_FEATURE_XGETBV1.
+ */
+static inline u64 xfeatures_in_use(void)
+{
+ return xgetbv(XCR_XFEATURE_IN_USE_MASK);
+}
+
#endif /* _ASM_X86_FPU_XCR_H */
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index 109dfcc75299..cd3dd170e23a 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -14,6 +14,8 @@
#define XSTATE_CPUID 0x0000000d
+#define TILE_CPUID 0x0000001d
+
#define FXSAVE_SIZE 512
#define XSAVE_HDR_SIZE 64
@@ -33,7 +35,8 @@
XFEATURE_MASK_Hi16_ZMM | \
XFEATURE_MASK_PKRU | \
XFEATURE_MASK_BNDREGS | \
- XFEATURE_MASK_BNDCSR)
+ XFEATURE_MASK_BNDCSR | \
+ XFEATURE_MASK_XTILE)
/*
* Features which are restored when returning to user space.
@@ -43,6 +46,9 @@
#define XFEATURE_MASK_USER_RESTORE \
(XFEATURE_MASK_USER_SUPPORTED & ~XFEATURE_MASK_PKRU)
+/* Features which are dynamically enabled for a process on request */
+#define XFEATURE_MASK_USER_DYNAMIC XFEATURE_MASK_XTILE_DATA
+
/* All currently supported supervisor features */
#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID)
@@ -78,78 +84,49 @@
XFEATURE_MASK_INDEPENDENT | \
XFEATURE_MASK_SUPERVISOR_UNSUPPORTED)
-#ifdef CONFIG_X86_64
-#define REX_PREFIX "0x48, "
-#else
-#define REX_PREFIX
-#endif
-
-extern u64 xfeatures_mask_all;
-
-static inline u64 xfeatures_mask_supervisor(void)
-{
- return xfeatures_mask_all & XFEATURE_MASK_SUPERVISOR_SUPPORTED;
-}
-
-/*
- * The xfeatures which are enabled in XCR0 and expected to be in ptrace
- * buffers and signal frames.
- */
-static inline u64 xfeatures_mask_uabi(void)
-{
- return xfeatures_mask_all & XFEATURE_MASK_USER_SUPPORTED;
-}
-
/*
- * The xfeatures which are restored by the kernel when returning to user
- * mode. This is not necessarily the same as xfeatures_mask_uabi() as the
- * kernel does not manage all XCR0 enabled features via xsave/xrstor as
- * some of them have to be switched eagerly on context switch and exec().
+ * The feature mask required to restore FPU state:
+ * - All user states which are not eagerly switched in switch_to()/exec()
+ * - The suporvisor states
*/
-static inline u64 xfeatures_mask_restore_user(void)
-{
- return xfeatures_mask_all & XFEATURE_MASK_USER_RESTORE;
-}
+#define XFEATURE_MASK_FPSTATE (XFEATURE_MASK_USER_RESTORE | \
+ XFEATURE_MASK_SUPERVISOR_SUPPORTED)
/*
- * Like xfeatures_mask_restore_user() but additionally restors the
- * supported supervisor states.
+ * Features in this mask have space allocated in the signal frame, but may not
+ * have that space initialized when the feature is in its init state.
*/
-static inline u64 xfeatures_mask_fpstate(void)
-{
- return xfeatures_mask_all & \
- (XFEATURE_MASK_USER_RESTORE | XFEATURE_MASK_SUPERVISOR_SUPPORTED);
-}
-
-static inline u64 xfeatures_mask_independent(void)
-{
- if (!boot_cpu_has(X86_FEATURE_ARCH_LBR))
- return XFEATURE_MASK_INDEPENDENT & ~XFEATURE_MASK_LBR;
-
- return XFEATURE_MASK_INDEPENDENT;
-}
+#define XFEATURE_MASK_SIGFRAME_INITOPT (XFEATURE_MASK_XTILE | \
+ XFEATURE_MASK_USER_DYNAMIC)
extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
extern void __init update_regset_xstate_info(unsigned int size,
u64 xstate_mask);
-void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr);
int xfeature_size(int xfeature_nr);
-int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf);
-int copy_sigframe_from_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf);
void xsaves(struct xregs_state *xsave, u64 mask);
void xrstors(struct xregs_state *xsave, u64 mask);
-enum xstate_copy_mode {
- XSTATE_COPY_FP,
- XSTATE_COPY_FX,
- XSTATE_COPY_XSAVE,
-};
+int xfd_enable_feature(u64 xfd_err);
+
+#ifdef CONFIG_X86_64
+DECLARE_STATIC_KEY_FALSE(__fpu_state_size_dynamic);
+#endif
-struct membuf;
-void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
- enum xstate_copy_mode mode);
+#ifdef CONFIG_X86_64
+DECLARE_STATIC_KEY_FALSE(__fpu_state_size_dynamic);
+
+static __always_inline __pure bool fpu_state_size_dynamic(void)
+{
+ return static_branch_unlikely(&__fpu_state_size_dynamic);
+}
+#else
+static __always_inline __pure bool fpu_state_size_dynamic(void)
+{
+ return false;
+}
+#endif
#endif
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 9f3130f40807..024d9797646e 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -57,6 +57,13 @@ arch_ftrace_get_regs(struct ftrace_regs *fregs)
#define ftrace_instruction_pointer_set(fregs, _ip) \
do { (fregs)->regs.ip = (_ip); } while (0)
+
+struct ftrace_ops;
+#define ftrace_graph_func ftrace_graph_func
+void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct ftrace_regs *fregs);
+#else
+#define FTRACE_GRAPH_TRAMP_ADDR FTRACE_GRAPH_ADDR
#endif
#ifdef CONFIG_DYNAMIC_FTRACE
@@ -65,8 +72,6 @@ struct dyn_arch_ftrace {
/* No extra data needed for x86 */
};
-#define FTRACE_GRAPH_TRAMP_ADDR FTRACE_GRAPH_ADDR
-
#endif /* CONFIG_DYNAMIC_FTRACE */
#endif /* __ASSEMBLY__ */
#endif /* CONFIG_FUNCTION_TRACER */
diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
index f9c00110a69a..99d345b686fa 100644
--- a/arch/x86/include/asm/futex.h
+++ b/arch/x86/include/asm/futex.h
@@ -17,13 +17,9 @@ do { \
int oldval = 0, ret; \
asm volatile("1:\t" insn "\n" \
"2:\n" \
- "\t.section .fixup,\"ax\"\n" \
- "3:\tmov\t%3, %1\n" \
- "\tjmp\t2b\n" \
- "\t.previous\n" \
- _ASM_EXTABLE_UA(1b, 3b) \
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %1) \
: "=r" (oldval), "=r" (ret), "+m" (*uaddr) \
- : "i" (-EFAULT), "0" (oparg), "1" (0)); \
+ : "0" (oparg), "1" (0)); \
if (ret) \
goto label; \
*oval = oldval; \
@@ -39,15 +35,11 @@ do { \
"3:\t" LOCK_PREFIX "cmpxchgl %3, %2\n" \
"\tjnz\t2b\n" \
"4:\n" \
- "\t.section .fixup,\"ax\"\n" \
- "5:\tmov\t%5, %1\n" \
- "\tjmp\t4b\n" \
- "\t.previous\n" \
- _ASM_EXTABLE_UA(1b, 5b) \
- _ASM_EXTABLE_UA(3b, 5b) \
+ _ASM_EXTABLE_TYPE_REG(1b, 4b, EX_TYPE_EFAULT_REG, %1) \
+ _ASM_EXTABLE_TYPE_REG(3b, 4b, EX_TYPE_EFAULT_REG, %1) \
: "=&a" (oldval), "=&r" (ret), \
"+m" (*uaddr), "=&r" (tem) \
- : "r" (oparg), "i" (-EFAULT), "1" (0)); \
+ : "r" (oparg), "1" (0)); \
if (ret) \
goto label; \
*oval = oldval; \
@@ -95,15 +87,11 @@ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
if (!user_access_begin(uaddr, sizeof(u32)))
return -EFAULT;
asm volatile("\n"
- "1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n"
+ "1:\t" LOCK_PREFIX "cmpxchgl %3, %2\n"
"2:\n"
- "\t.section .fixup, \"ax\"\n"
- "3:\tmov %3, %0\n"
- "\tjmp 2b\n"
- "\t.previous\n"
- _ASM_EXTABLE_UA(1b, 3b)
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %0) \
: "+r" (ret), "=a" (oldval), "+m" (*uaddr)
- : "i" (-EFAULT), "r" (newval), "1" (oldval)
+ : "r" (newval), "1" (oldval)
: "memory"
);
user_access_end();
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index 2322d6bd5883..0a9407dc0859 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -276,6 +276,23 @@ enum hv_isolation_type {
#define HV_X64_MSR_TIME_REF_COUNT HV_REGISTER_TIME_REF_COUNT
#define HV_X64_MSR_REFERENCE_TSC HV_REGISTER_REFERENCE_TSC
+/* Hyper-V memory host visibility */
+enum hv_mem_host_visibility {
+ VMBUS_PAGE_NOT_VISIBLE = 0,
+ VMBUS_PAGE_VISIBLE_READ_ONLY = 1,
+ VMBUS_PAGE_VISIBLE_READ_WRITE = 3
+};
+
+/* HvCallModifySparseGpaPageHostVisibility hypercall */
+#define HV_MAX_MODIFY_GPA_REP_COUNT ((PAGE_SIZE / sizeof(u64)) - 2)
+struct hv_gpa_range_for_visibility {
+ u64 partition_id;
+ u32 host_visibility:2;
+ u32 reserved0:30;
+ u32 reserved1;
+ u64 gpa_page_list[HV_MAX_MODIFY_GPA_REP_COUNT];
+} __packed;
+
/*
* Declare the MSR used to setup pages used to communicate with the hypervisor.
*/
@@ -585,6 +602,39 @@ enum hv_interrupt_type {
HV_X64_INTERRUPT_TYPE_MAXIMUM = 0x000A,
};
+union hv_msi_address_register {
+ u32 as_uint32;
+ struct {
+ u32 reserved1:2;
+ u32 destination_mode:1;
+ u32 redirection_hint:1;
+ u32 reserved2:8;
+ u32 destination_id:8;
+ u32 msi_base:12;
+ };
+} __packed;
+
+union hv_msi_data_register {
+ u32 as_uint32;
+ struct {
+ u32 vector:8;
+ u32 delivery_mode:3;
+ u32 reserved1:3;
+ u32 level_assert:1;
+ u32 trigger_mode:1;
+ u32 reserved2:16;
+ };
+} __packed;
+
+/* HvRetargetDeviceInterrupt hypercall */
+union hv_msi_entry {
+ u64 as_uint64;
+ struct {
+ union hv_msi_address_register address;
+ union hv_msi_data_register data;
+ } __packed;
+};
+
#include <asm-generic/hyperv-tlfs.h>
#endif
diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h
index 2c5f7861d373..fada857f0a1e 100644
--- a/arch/x86/include/asm/ia32.h
+++ b/arch/x86/include/asm/ia32.h
@@ -68,6 +68,6 @@ extern void ia32_pick_mmap_layout(struct mm_struct *mm);
#endif
-#endif /* !CONFIG_IA32_SUPPORT */
+#endif /* CONFIG_IA32_EMULATION */
#endif /* _ASM_X86_IA32_H */
diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h
index 91d7182ad2d6..f07faa61c7f3 100644
--- a/arch/x86/include/asm/insn-eval.h
+++ b/arch/x86/include/asm/insn-eval.h
@@ -15,12 +15,16 @@
#define INSN_CODE_SEG_OPND_SZ(params) (params & 0xf)
#define INSN_CODE_SEG_PARAMS(oper_sz, addr_sz) (oper_sz | (addr_sz << 4))
+int pt_regs_offset(struct pt_regs *regs, int regno);
+
bool insn_has_rep_prefix(struct insn *insn);
void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs);
int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs);
int insn_get_modrm_reg_off(struct insn *insn, struct pt_regs *regs);
+unsigned long *insn_get_modrm_reg_ptr(struct insn *insn, struct pt_regs *regs);
unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx);
int insn_get_code_seg_params(struct pt_regs *regs);
+int insn_get_effective_ip(struct pt_regs *regs, unsigned long *ip);
int insn_fetch_from_user(struct pt_regs *regs,
unsigned char buf[MAX_INSN_SIZE]);
int insn_fetch_from_user_inatomic(struct pt_regs *regs,
@@ -28,4 +32,16 @@ int insn_fetch_from_user_inatomic(struct pt_regs *regs,
bool insn_decode_from_regs(struct insn *insn, struct pt_regs *regs,
unsigned char buf[MAX_INSN_SIZE], int buf_size);
+enum mmio_type {
+ MMIO_DECODE_FAILED,
+ MMIO_WRITE,
+ MMIO_WRITE_IMM,
+ MMIO_READ,
+ MMIO_READ_ZERO_EXTEND,
+ MMIO_READ_SIGN_EXTEND,
+ MMIO_MOVS,
+};
+
+enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes);
+
#endif /* _ASM_X86_INSN_EVAL_H */
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index 27158436f322..048b6d5aff50 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -108,6 +108,8 @@
#define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */
#define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */
+#define INTEL_FAM6_RAPTORLAKE 0xB7
+
/* "Small Core" Processors (Atom) */
#define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 841a5d104afa..f6d91ecb8026 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -40,6 +40,7 @@
#include <linux/string.h>
#include <linux/compiler.h>
+#include <linux/cc_platform.h>
#include <asm/page.h>
#include <asm/early_ioremap.h>
#include <asm/pgtable_types.h>
@@ -256,21 +257,6 @@ static inline void slow_down_io(void)
#endif
-#ifdef CONFIG_AMD_MEM_ENCRYPT
-#include <linux/jump_label.h>
-
-extern struct static_key_false sev_enable_key;
-static inline bool sev_key_active(void)
-{
- return static_branch_unlikely(&sev_enable_key);
-}
-
-#else /* !CONFIG_AMD_MEM_ENCRYPT */
-
-static inline bool sev_key_active(void) { return false; }
-
-#endif /* CONFIG_AMD_MEM_ENCRYPT */
-
#define BUILDIO(bwl, bw, type) \
static inline void out##bwl(unsigned type value, int port) \
{ \
@@ -301,7 +287,7 @@ static inline unsigned type in##bwl##_p(int port) \
\
static inline void outs##bwl(int port, const void *addr, unsigned long count) \
{ \
- if (sev_key_active()) { \
+ if (cc_platform_has(CC_ATTR_GUEST_UNROLL_STRING_IO)) { \
unsigned type *value = (unsigned type *)addr; \
while (count) { \
out##bwl(*value, port); \
@@ -317,7 +303,7 @@ static inline void outs##bwl(int port, const void *addr, unsigned long count) \
\
static inline void ins##bwl(int port, void *addr, unsigned long count) \
{ \
- if (sev_key_active()) { \
+ if (cc_platform_has(CC_ATTR_GUEST_UNROLL_STRING_IO)) { \
unsigned type *value = (unsigned type *)addr; \
while (count) { \
*value = in##bwl(port); \
@@ -391,6 +377,7 @@ extern void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size)
#define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc
#endif
+#ifdef CONFIG_AMD_MEM_ENCRYPT
extern bool arch_memremap_can_ram_remap(resource_size_t offset,
unsigned long size,
unsigned long flags);
@@ -398,6 +385,13 @@ extern bool arch_memremap_can_ram_remap(resource_size_t offset,
extern bool phys_mem_access_encrypted(unsigned long phys_addr,
unsigned long size);
+#else
+static inline bool phys_mem_access_encrypted(unsigned long phys_addr,
+ unsigned long size)
+{
+ return true;
+}
+#endif
/**
* iosubmit_cmds512 - copy data to single MMIO location, in 512-bit units
diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h
index 562854c60808..ae9d40f6c706 100644
--- a/arch/x86/include/asm/irq_stack.h
+++ b/arch/x86/include/asm/irq_stack.h
@@ -58,7 +58,7 @@
* the output constraints to make the compiler aware that R11 cannot be
* reused after the asm() statement.
*
- * For builds with CONFIG_UNWIND_FRAME_POINTER ASM_CALL_CONSTRAINT is
+ * For builds with CONFIG_UNWINDER_FRAME_POINTER, ASM_CALL_CONSTRAINT is
* required as well as this prevents certain creative GCC variants from
* misplacing the ASM code.
*
@@ -77,11 +77,11 @@
* Function calls can clobber anything except the callee-saved
* registers. Tell the compiler.
*/
-#define call_on_irqstack(func, asm_call, argconstr...) \
+#define call_on_stack(stack, func, asm_call, argconstr...) \
{ \
register void *tos asm("r11"); \
\
- tos = ((void *)__this_cpu_read(hardirq_stack_ptr)); \
+ tos = ((void *)(stack)); \
\
asm_inline volatile( \
"movq %%rsp, (%[tos]) \n" \
@@ -98,6 +98,25 @@
); \
}
+#define ASM_CALL_ARG0 \
+ "call %P[__func] \n"
+
+#define ASM_CALL_ARG1 \
+ "movq %[arg1], %%rdi \n" \
+ ASM_CALL_ARG0
+
+#define ASM_CALL_ARG2 \
+ "movq %[arg2], %%rsi \n" \
+ ASM_CALL_ARG1
+
+#define ASM_CALL_ARG3 \
+ "movq %[arg3], %%rdx \n" \
+ ASM_CALL_ARG2
+
+#define call_on_irqstack(func, asm_call, argconstr...) \
+ call_on_stack(__this_cpu_read(hardirq_stack_ptr), \
+ func, asm_call, argconstr)
+
/* Macros to assert type correctness for run_*_on_irqstack macros */
#define assert_function_type(func, proto) \
static_assert(__builtin_types_compatible_p(typeof(&func), proto))
@@ -147,8 +166,7 @@
*/
#define ASM_CALL_SYSVEC \
"call irq_enter_rcu \n" \
- "movq %[arg1], %%rdi \n" \
- "call %P[__func] \n" \
+ ASM_CALL_ARG1 \
"call irq_exit_rcu \n"
#define SYSVEC_CONSTRAINTS , [arg1] "r" (regs)
@@ -168,12 +186,10 @@
*/
#define ASM_CALL_IRQ \
"call irq_enter_rcu \n" \
- "movq %[arg1], %%rdi \n" \
- "movl %[arg2], %%esi \n" \
- "call %P[__func] \n" \
+ ASM_CALL_ARG2 \
"call irq_exit_rcu \n"
-#define IRQ_CONSTRAINTS , [arg1] "r" (regs), [arg2] "r" (vector)
+#define IRQ_CONSTRAINTS , [arg1] "r" (regs), [arg2] "r" ((unsigned long)vector)
#define run_irq_on_irqstack_cond(func, regs, vector) \
{ \
@@ -185,9 +201,7 @@
IRQ_CONSTRAINTS, regs, vector); \
}
-#define ASM_CALL_SOFTIRQ \
- "call %P[__func] \n"
-
+#ifndef CONFIG_PREEMPT_RT
/*
* Macro to invoke __do_softirq on the irq stack. This is only called from
* task context when bottom halves are about to be reenabled and soft
@@ -197,10 +211,12 @@
#define do_softirq_own_stack() \
{ \
__this_cpu_write(hardirq_stack_inuse, true); \
- call_on_irqstack(__do_softirq, ASM_CALL_SOFTIRQ); \
+ call_on_irqstack(__do_softirq, ASM_CALL_ARG0); \
__this_cpu_write(hardirq_stack_inuse, false); \
}
+#endif
+
#else /* CONFIG_X86_64 */
/* System vector handlers always run on the stack they interrupted. */
#define run_sysvec_on_irqstack_cond(func, regs) \
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index c5ce9845c999..87761396e8cc 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -114,8 +114,6 @@ static __always_inline unsigned long arch_local_irq_save(void)
#define SAVE_FLAGS pushfq; popq %rax
#endif
-#define INTERRUPT_RETURN jmp native_iret
-
#endif
#endif /* __ASSEMBLY__ */
@@ -143,8 +141,13 @@ static __always_inline void arch_local_irq_restore(unsigned long flags)
#ifdef CONFIG_X86_64
#ifdef CONFIG_XEN_PV
#define SWAPGS ALTERNATIVE "swapgs", "", X86_FEATURE_XENPV
+#define INTERRUPT_RETURN \
+ ANNOTATE_RETPOLINE_SAFE; \
+ ALTERNATIVE_TERNARY("jmp *paravirt_iret(%rip);", \
+ X86_FEATURE_XENPV, "jmp xen_iret;", "jmp native_iret;")
#else
#define SWAPGS swapgs
+#define INTERRUPT_RETURN jmp native_iret
#endif
#endif
#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 0a6e34b07017..11b7c06e2828 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -129,7 +129,7 @@ relocate_kernel(unsigned long indirection_page,
unsigned long page_list,
unsigned long start_address,
unsigned int preserve_context,
- unsigned int sme_active);
+ unsigned int host_mem_enc_active);
#endif
#define ARCH_HAS_KIMAGE_ARCH
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index bd7f5886a789..71ea2eab43d5 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -49,7 +49,6 @@ extern __visible kprobe_opcode_t optprobe_template_end[];
extern const int kretprobe_blacklist_size;
void arch_remove_kprobe(struct kprobe *p);
-asmlinkage void kretprobe_trampoline(void);
extern void arch_kprobe_override_function(struct pt_regs *regs);
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index cefe1d81e2e8..f658bb4dbb74 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -35,6 +35,7 @@ KVM_X86_OP(get_cpl)
KVM_X86_OP(set_segment)
KVM_X86_OP_NULL(get_cs_db_l_bits)
KVM_X86_OP(set_cr0)
+KVM_X86_OP_NULL(post_set_cr3)
KVM_X86_OP(is_valid_cr4)
KVM_X86_OP(set_cr4)
KVM_X86_OP(set_efer)
@@ -47,6 +48,7 @@ KVM_X86_OP(set_dr7)
KVM_X86_OP(cache_reg)
KVM_X86_OP(get_rflags)
KVM_X86_OP(set_rflags)
+KVM_X86_OP(get_if_flag)
KVM_X86_OP(tlb_flush_all)
KVM_X86_OP(tlb_flush_current)
KVM_X86_OP_NULL(tlb_remote_flush)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f8f48a7ec577..0677b9ea01c9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -38,7 +38,6 @@
#define __KVM_HAVE_ARCH_VCPU_DEBUGFS
#define KVM_MAX_VCPUS 1024
-#define KVM_SOFT_MAX_VCPUS 710
/*
* In x86, the VCPU ID corresponds to the APIC ID, and APIC IDs
@@ -50,7 +49,7 @@
* so ratio of 4 should be enough.
*/
#define KVM_VCPU_ID_RATIO 4
-#define KVM_MAX_VCPU_ID (KVM_MAX_VCPUS * KVM_VCPU_ID_RATIO)
+#define KVM_MAX_VCPU_IDS (KVM_MAX_VCPUS * KVM_VCPU_ID_RATIO)
/* memory slots that are not exposed to userspace */
#define KVM_PRIVATE_MEM_SLOTS 3
@@ -98,7 +97,7 @@
KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_TLB_FLUSH_CURRENT KVM_ARCH_REQ(26)
#define KVM_REQ_TLB_FLUSH_GUEST \
- KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP)
+ KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_APF_READY KVM_ARCH_REQ(28)
#define KVM_REQ_MSR_FILTER_CHANGED KVM_ARCH_REQ(29)
#define KVM_REQ_UPDATE_CPU_DIRTY_LOGGING \
@@ -136,7 +135,7 @@
#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1))
#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
-#define KVM_PERMILLE_MMU_PAGES 20
+#define KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO 50
#define KVM_MIN_ALLOC_MMU_PAGES 64UL
#define KVM_MMU_HASH_SHIFT 12
#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT)
@@ -292,25 +291,31 @@ struct kvm_kernel_irq_routing_entry;
* the number of unique SPs that can theoretically be created is 2^n, where n
* is the number of bits that are used to compute the role.
*
- * But, even though there are 18 bits in the mask below, not all combinations
- * of modes and flags are possible. The maximum number of possible upper-level
- * shadow pages for a single gfn is in the neighborhood of 2^13.
+ * But, even though there are 19 bits in the mask below, not all combinations
+ * of modes and flags are possible:
*
- * - invalid shadow pages are not accounted.
- * - level is effectively limited to four combinations, not 16 as the number
- * bits would imply, as 4k SPs are not tracked (allowed to go unsync).
- * - level is effectively unused for non-PAE paging because there is exactly
- * one upper level (see 4k SP exception above).
- * - quadrant is used only for non-PAE paging and is exclusive with
- * gpte_is_8_bytes.
- * - execonly and ad_disabled are used only for nested EPT, which makes it
- * exclusive with quadrant.
+ * - invalid shadow pages are not accounted, so the bits are effectively 18
+ *
+ * - quadrant will only be used if has_4_byte_gpte=1 (non-PAE paging);
+ * execonly and ad_disabled are only used for nested EPT which has
+ * has_4_byte_gpte=0. Therefore, 2 bits are always unused.
+ *
+ * - the 4 bits of level are effectively limited to the values 2/3/4/5,
+ * as 4k SPs are not tracked (allowed to go unsync). In addition non-PAE
+ * paging has exactly one upper level, making level completely redundant
+ * when has_4_byte_gpte=1.
+ *
+ * - on top of this, smep_andnot_wp and smap_andnot_wp are only set if
+ * cr0_wp=0, therefore these three bits only give rise to 5 possibilities.
+ *
+ * Therefore, the maximum number of possible upper-level shadow pages for a
+ * single gfn is a bit less than 2^13.
*/
union kvm_mmu_page_role {
u32 word;
struct {
unsigned level:4;
- unsigned gpte_is_8_bytes:1;
+ unsigned has_4_byte_gpte:1;
unsigned quadrant:2;
unsigned direct:1;
unsigned access:3;
@@ -364,6 +369,7 @@ union kvm_mmu_extended_role {
unsigned int cr4_smap:1;
unsigned int cr4_smep:1;
unsigned int cr4_la57:1;
+ unsigned int efer_lma:1;
};
};
@@ -407,6 +413,7 @@ struct kvm_mmu_root_info {
#define KVM_HAVE_MMU_RWLOCK
struct kvm_mmu_page;
+struct kvm_page_fault;
/*
* x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit,
@@ -416,14 +423,12 @@ struct kvm_mmu_page;
struct kvm_mmu {
unsigned long (*get_guest_pgd)(struct kvm_vcpu *vcpu);
u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index);
- int (*page_fault)(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 err,
- bool prefault);
+ int (*page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
void (*inject_page_fault)(struct kvm_vcpu *vcpu,
struct x86_exception *fault);
- gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t gva_or_gpa,
- u32 access, struct x86_exception *exception);
- gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
- struct x86_exception *exception);
+ gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ gpa_t gva_or_gpa, u32 access,
+ struct x86_exception *exception);
int (*sync_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp);
void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa);
@@ -490,6 +495,7 @@ struct kvm_pmc {
*/
u64 current_config;
bool is_paused;
+ bool intr;
};
struct kvm_pmu {
@@ -499,7 +505,6 @@ struct kvm_pmu {
u64 fixed_ctr_ctrl;
u64 global_ctrl;
u64 global_status;
- u64 global_ovf_ctrl;
u64 counter_bitmask[2];
u64 global_ctrl_mask;
u64 global_ovf_ctrl_mask;
@@ -581,7 +586,6 @@ struct kvm_vcpu_hv {
struct kvm_hyperv_exit exit;
struct kvm_vcpu_hv_stimer stimer[HV_SYNIC_STIMER_COUNT];
DECLARE_BITMAP(stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT);
- cpumask_t tlb_flush;
bool enforce_cpuid;
struct {
u32 features_eax; /* HYPERV_CPUID_FEATURES.EAX */
@@ -606,6 +610,7 @@ struct kvm_vcpu_xen {
u64 last_steal;
u64 runstate_entry_time;
u64 runstate_times[4];
+ unsigned long evtchn_pending_sel;
};
struct kvm_vcpu_arch {
@@ -642,6 +647,7 @@ struct kvm_vcpu_arch {
u64 smi_count;
bool tpr_access_reporting;
bool xsaves_enabled;
+ bool xfd_no_write_intercept;
u64 ia32_xss;
u64 microcode_version;
u64 arch_capabilities;
@@ -691,18 +697,18 @@ struct kvm_vcpu_arch {
*
* Note that while the PKRU state lives inside the fpu registers,
* it is switched out separately at VMENTER and VMEXIT time. The
- * "guest_fpu" state here contains the guest FPU context, with the
+ * "guest_fpstate" state here contains the guest FPU context, with the
* host PRKU bits.
*/
- struct fpu *user_fpu;
- struct fpu *guest_fpu;
+ struct fpu_guest guest_fpu;
u64 xcr0;
u64 guest_supported_xcr0;
struct kvm_pio_request pio;
void *pio_data;
- void *guest_ins_data;
+ void *sev_pio_data;
+ unsigned sev_pio_count;
u8 event_exit_inst_len;
@@ -727,6 +733,7 @@ struct kvm_vcpu_arch {
int cpuid_nent;
struct kvm_cpuid_entry2 *cpuid_entries;
+ u32 kvm_cpuid_base;
u64 reserved_gpa_bits;
int maxphyaddr;
@@ -750,7 +757,7 @@ struct kvm_vcpu_arch {
u8 preempted;
u64 msr_val;
u64 last_steal;
- struct gfn_to_pfn_cache cache;
+ struct gfn_to_hva_cache cache;
} st;
u64 l1_tsc_offset;
@@ -775,6 +782,7 @@ struct kvm_vcpu_arch {
unsigned nmi_pending; /* NMI queued after currently running handler */
bool nmi_injected; /* Trying to inject an NMI this entry */
bool smi_pending; /* SMI queued after currently running handler */
+ u8 handling_intr_from_guest;
struct kvm_mtrr mtrr_state;
u64 pat;
@@ -1015,7 +1023,7 @@ struct msr_bitmap_range {
struct kvm_xen {
bool long_mode;
u8 upcall_vector;
- gfn_t shinfo_gfn;
+ struct gfn_to_pfn_cache shinfo_cache;
};
enum kvm_irqchip_mode {
@@ -1036,6 +1044,8 @@ struct kvm_x86_msr_filter {
#define APICV_INHIBIT_REASON_IRQWIN 3
#define APICV_INHIBIT_REASON_PIT_REINJ 4
#define APICV_INHIBIT_REASON_X2APIC 5
+#define APICV_INHIBIT_REASON_BLOCKIRQ 6
+#define APICV_INHIBIT_REASON_ABSENT 7
struct kvm_arch {
unsigned long n_used_mmu_pages;
@@ -1073,7 +1083,7 @@ struct kvm_arch {
atomic_t apic_map_dirty;
/* Protects apic_access_memslot_enabled and apicv_inhibit_reasons */
- struct mutex apicv_update_lock;
+ struct rw_semaphore apicv_update_lock;
bool apic_access_memslot_enabled;
unsigned long apicv_inhibit_reasons;
@@ -1087,17 +1097,23 @@ struct kvm_arch {
unsigned long irq_sources_bitmap;
s64 kvmclock_offset;
+
+ /*
+ * This also protects nr_vcpus_matched_tsc which is read from a
+ * preemption-disabled region, so it must be a raw spinlock.
+ */
raw_spinlock_t tsc_write_lock;
u64 last_tsc_nsec;
u64 last_tsc_write;
u32 last_tsc_khz;
+ u64 last_tsc_offset;
u64 cur_tsc_nsec;
u64 cur_tsc_write;
u64 cur_tsc_offset;
u64 cur_tsc_generation;
int nr_vcpus_matched_tsc;
- spinlock_t pvclock_gtod_sync_lock;
+ seqcount_raw_spinlock_t pvclock_sc;
bool use_master_clock;
u64 master_kernel_ns;
u64 master_cycle_now;
@@ -1207,10 +1223,11 @@ struct kvm_arch {
#endif /* CONFIG_X86_64 */
/*
- * If set, rmaps have been allocated for all memslots and should be
- * allocated for any newly created or modified memslots.
+ * If set, at least one shadow root has been allocated. This flag
+ * is used as one input when determining whether certain memslot
+ * related allocations are necessary.
*/
- bool memslots_have_rmaps;
+ bool shadow_root_allocated;
#if IS_ENABLED(CONFIG_HYPERV)
hpa_t hv_root_tdp;
@@ -1296,6 +1313,8 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
}
struct kvm_x86_ops {
+ const char *name;
+
int (*hardware_enable)(void);
void (*hardware_disable)(void);
void (*hardware_unsetup)(void);
@@ -1327,6 +1346,7 @@ struct kvm_x86_ops {
struct kvm_segment *var, int seg);
void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
+ void (*post_set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
bool (*is_valid_cr4)(struct kvm_vcpu *vcpu, unsigned long cr0);
void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
int (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
@@ -1339,6 +1359,7 @@ struct kvm_x86_ops {
void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
+ bool (*get_if_flag)(struct kvm_vcpu *vcpu);
void (*tlb_flush_all)(struct kvm_vcpu *vcpu);
void (*tlb_flush_current)(struct kvm_vcpu *vcpu);
@@ -1405,10 +1426,11 @@ struct kvm_x86_ops {
void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu, u64 multiplier);
/*
- * Retrieve somewhat arbitrary exit information. Intended to be used
- * only from within tracepoints to avoid VMREADs when tracing is off.
+ * Retrieve somewhat arbitrary exit information. Intended to
+ * be used only from within tracepoints or error paths.
*/
- void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
+ void (*get_exit_info)(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2,
u32 *exit_int_info, u32 *exit_int_info_err_code);
int (*check_intercept)(struct kvm_vcpu *vcpu,
@@ -1468,6 +1490,7 @@ struct kvm_x86_ops {
int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
+ int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
int (*get_msr_feature)(struct kvm_msr_entry *entry);
@@ -1506,6 +1529,7 @@ struct kvm_x86_init_ops {
int (*disabled_by_bios)(void);
int (*check_processor_compatibility)(void);
int (*hardware_setup)(void);
+ unsigned int (*handle_intel_pt_intr)(void);
struct kvm_x86_ops *runtime_ops;
};
@@ -1541,6 +1565,8 @@ static inline struct kvm *kvm_arch_alloc_vm(void)
{
return __vmalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
}
+
+#define __KVM_HAVE_ARCH_VM_FREE
void kvm_arch_free_vm(struct kvm *kvm);
#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB
@@ -1553,6 +1579,9 @@ static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm)
return -ENOTSUPP;
}
+#define kvm_arch_pmi_in_guest(vcpu) \
+ ((vcpu) && (vcpu)->arch.handling_intr_from_guest)
+
int kvm_mmu_module_init(void);
void kvm_mmu_module_exit(void);
@@ -1572,10 +1601,9 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
const struct kvm_memory_slot *memslot);
void kvm_mmu_zap_all(struct kvm *kvm);
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
-unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm);
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages);
-int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
+int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
const void *val, int bytes);
@@ -1625,7 +1653,8 @@ extern u64 kvm_mce_cap_supported;
*
* EMULTYPE_SKIP - Set when emulating solely to skip an instruction, i.e. to
* decode the instruction length. For use *only* by
- * kvm_x86_ops.skip_emulated_instruction() implementations.
+ * kvm_x86_ops.skip_emulated_instruction() implementations if
+ * EMULTYPE_COMPLETE_USER_EXIT is not set.
*
* EMULTYPE_ALLOW_RETRY_PF - Set when the emulator should resume the guest to
* retry native execution under certain conditions,
@@ -1645,6 +1674,10 @@ extern u64 kvm_mce_cap_supported;
*
* EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in which
* case the CR2/GPA value pass on the stack is valid.
+ *
+ * EMULTYPE_COMPLETE_USER_EXIT - Set when the emulator should update interruptibility
+ * state and inject single-step #DBs after skipping
+ * an instruction (after completing userspace I/O).
*/
#define EMULTYPE_NO_DECODE (1 << 0)
#define EMULTYPE_TRAP_UD (1 << 1)
@@ -1653,10 +1686,14 @@ extern u64 kvm_mce_cap_supported;
#define EMULTYPE_TRAP_UD_FORCED (1 << 4)
#define EMULTYPE_VMWARE_GP (1 << 5)
#define EMULTYPE_PF (1 << 6)
+#define EMULTYPE_COMPLETE_USER_EXIT (1 << 7)
int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
void *insn, int insn_len);
+void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu,
+ u64 *data, u8 ndata);
+void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu);
void kvm_enable_efer_bits(u64);
bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer);
@@ -1674,7 +1711,7 @@ int kvm_emulate_monitor(struct kvm_vcpu *vcpu);
int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in);
int kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
int kvm_emulate_halt(struct kvm_vcpu *vcpu);
-int kvm_vcpu_halt(struct kvm_vcpu *vcpu);
+int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu);
int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu);
int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
@@ -1685,8 +1722,6 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
int reason, bool has_error_code, u32 error_code);
-void kvm_free_guest_fpu(struct kvm_vcpu *vcpu);
-
void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0);
void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4);
int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
@@ -1715,9 +1750,6 @@ void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
struct x86_exception *fault);
-int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- gfn_t gfn, void *data, int offset, int len,
- u32 access);
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr);
@@ -1745,12 +1777,9 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu);
void kvm_update_dr7(struct kvm_vcpu *vcpu);
int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
-void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
ulong roots_to_free);
void kvm_mmu_free_guest_mode_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu);
-gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
- struct x86_exception *exception);
gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception);
gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
@@ -1866,7 +1895,6 @@ u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier);
unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu);
bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
-void kvm_make_mclock_inprogress_request(struct kvm *kvm);
void kvm_make_scan_ioapic_request(struct kvm *kvm);
void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
unsigned long *vcpu_bitmap);
@@ -1885,8 +1913,6 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu);
int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu);
-int kvm_is_in_guest(void);
-
void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
u32 size);
bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu);
@@ -1915,8 +1941,6 @@ static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
static_call_cond(kvm_x86_vcpu_unblocking)(vcpu);
}
-static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
-
static inline int kvm_cpu_get_apicid(int mps_cpu)
{
#ifdef CONFIG_X86_LOCAL_APIC
@@ -1935,6 +1959,9 @@ static inline int kvm_cpu_get_apicid(int mps_cpu)
int kvm_cpu_dirty_log_size(void);
-int alloc_all_memslots_rmaps(struct kvm *kvm);
+int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
+
+#define KVM_CLOCK_VALID_FLAGS \
+ (KVM_CLOCK_TSC_STABLE | KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC)
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h
index 6a5f3acf2b33..eb186bc57f6a 100644
--- a/arch/x86/include/asm/kvm_page_track.h
+++ b/arch/x86/include/asm/kvm_page_track.h
@@ -49,8 +49,12 @@ struct kvm_page_track_notifier_node {
int kvm_page_track_init(struct kvm *kvm);
void kvm_page_track_cleanup(struct kvm *kvm);
+bool kvm_page_track_write_tracking_enabled(struct kvm *kvm);
+int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot);
+
void kvm_page_track_free_memslot(struct kvm_memory_slot *slot);
-int kvm_page_track_create_memslot(struct kvm_memory_slot *slot,
+int kvm_page_track_create_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
unsigned long npages);
void kvm_slot_page_track_add_page(struct kvm *kvm,
@@ -59,8 +63,9 @@ void kvm_slot_page_track_add_page(struct kvm *kvm,
void kvm_slot_page_track_remove_page(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn,
enum kvm_page_track_mode mode);
-bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn,
- enum kvm_page_track_mode mode);
+bool kvm_slot_page_track_is_active(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ gfn_t gfn, enum kvm_page_track_mode mode);
void
kvm_page_track_register_notifier(struct kvm *kvm,
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 69299878b200..56935ebb1dfe 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -83,6 +83,18 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
return ret;
}
+static inline long kvm_sev_hypercall3(unsigned int nr, unsigned long p1,
+ unsigned long p2, unsigned long p3)
+{
+ long ret;
+
+ asm volatile("vmmcall"
+ : "=a"(ret)
+ : "a"(nr), "b"(p1), "c"(p2), "d"(p3)
+ : "memory");
+ return ret;
+}
+
#ifdef CONFIG_KVM_GUEST
void kvmclock_init(void);
void kvmclock_disable(void);
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index 365111789cc6..030907922bd0 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -18,6 +18,20 @@
#define __ALIGN_STR __stringify(__ALIGN)
#endif
+#ifdef CONFIG_SLS
+#define RET ret; int3
+#else
+#define RET ret
+#endif
+
+#else /* __ASSEMBLY__ */
+
+#ifdef CONFIG_SLS
+#define ASM_RET "ret; int3\n\t"
+#else
+#define ASM_RET "ret\n\t"
+#endif
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_LINKAGE_H */
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index da9321548f6f..cc73061e7255 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -205,28 +205,16 @@ struct cper_ia_proc_ctx;
int mcheck_init(void);
void mcheck_cpu_init(struct cpuinfo_x86 *c);
void mcheck_cpu_clear(struct cpuinfo_x86 *c);
-void mcheck_vendor_init_severity(void);
int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info,
u64 lapic_id);
#else
static inline int mcheck_init(void) { return 0; }
static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
static inline void mcheck_cpu_clear(struct cpuinfo_x86 *c) {}
-static inline void mcheck_vendor_init_severity(void) {}
static inline int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info,
u64 lapic_id) { return -EINVAL; }
#endif
-#ifdef CONFIG_X86_ANCIENT_MCE
-void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
-void winchip_mcheck_init(struct cpuinfo_x86 *c);
-static inline void enable_p5_mce(void) { mce_p5_enabled = 1; }
-#else
-static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {}
-static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
-static inline void enable_p5_mce(void) {}
-#endif
-
void mce_setup(struct mce *m);
void mce_log(struct mce *m);
DECLARE_PER_CPU(struct device *, mce_device);
@@ -325,31 +313,22 @@ enum smca_bank_types {
SMCA_SMU, /* System Management Unit */
SMCA_SMU_V2,
SMCA_MP5, /* Microprocessor 5 Unit */
+ SMCA_MPDMA, /* MPDMA Unit */
SMCA_NBIO, /* Northbridge IO Unit */
SMCA_PCIE, /* PCI Express Unit */
SMCA_PCIE_V2,
SMCA_XGMI_PCS, /* xGMI PCS Unit */
+ SMCA_NBIF, /* NBIF Unit */
+ SMCA_SHUB, /* System HUB Unit */
+ SMCA_SATA, /* SATA Unit */
+ SMCA_USB, /* USB Unit */
+ SMCA_GMI_PCS, /* GMI PCS Unit */
SMCA_XGMI_PHY, /* xGMI PHY Unit */
SMCA_WAFL_PHY, /* WAFL PHY Unit */
+ SMCA_GMI_PHY, /* GMI PHY Unit */
N_SMCA_BANK_TYPES
};
-#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype))
-
-struct smca_hwid {
- unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */
- u32 hwid_mcatype; /* (hwid,mcatype) tuple */
- u8 count; /* Number of instances. */
-};
-
-struct smca_bank {
- struct smca_hwid *hwid;
- u32 id; /* Value of MCA_IPID[InstanceId]. */
- u8 sysfs_id; /* Value used for sysfs name. */
-};
-
-extern struct smca_bank smca_banks[MAX_NR_BANKS];
-
extern const char *smca_get_long_name(enum smca_bank_types t);
extern bool amd_mce_is_memory_error(struct mce *m);
@@ -357,16 +336,13 @@ extern int mce_threshold_create_device(unsigned int cpu);
extern int mce_threshold_remove_device(unsigned int cpu);
void mce_amd_feature_init(struct cpuinfo_x86 *c);
-int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr);
-
+enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank);
#else
static inline int mce_threshold_create_device(unsigned int cpu) { return 0; };
static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; };
static inline bool amd_mce_is_memory_error(struct mce *m) { return false; };
static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
-static inline int
-umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) { return -EINVAL; };
#endif
static inline void mce_hygon_feature_init(struct cpuinfo_x86 *c) { return mce_amd_feature_init(c); }
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 9c80c68d75b5..e2c6f433ed10 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -13,6 +13,7 @@
#ifndef __ASSEMBLY__
#include <linux/init.h>
+#include <linux/cc_platform.h>
#include <asm/bootparam.h>
@@ -43,6 +44,8 @@ void __init sme_enable(struct boot_params *bp);
int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size);
int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size);
+void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages,
+ bool enc);
void __init mem_encrypt_free_decrypted_mem(void);
@@ -50,9 +53,6 @@ void __init mem_encrypt_free_decrypted_mem(void);
void __init mem_encrypt_init(void);
void __init sev_es_init_vc_handling(void);
-bool sme_active(void);
-bool sev_active(void);
-bool sev_es_active(void);
#define __bss_decrypted __section(".bss..decrypted")
@@ -75,14 +75,13 @@ static inline void __init sme_encrypt_kernel(struct boot_params *bp) { }
static inline void __init sme_enable(struct boot_params *bp) { }
static inline void sev_es_init_vc_handling(void) { }
-static inline bool sme_active(void) { return false; }
-static inline bool sev_active(void) { return false; }
-static inline bool sev_es_active(void) { return false; }
static inline int __init
early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; }
static inline int __init
early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; }
+static inline void __init
+early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, bool enc) {}
static inline void mem_encrypt_free_decrypted_mem(void) { }
@@ -101,11 +100,6 @@ static inline void mem_encrypt_free_decrypted_mem(void) { }
extern char __start_bss_decrypted[], __end_bss_decrypted[], __start_bss_decrypted_unused[];
-static inline bool mem_encrypt_active(void)
-{
- return sme_me_mask;
-}
-
static inline u64 sme_get_me_mask(void)
{
return sme_me_mask;
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index ab45a220fac4..d6bfdfb0f0af 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -130,14 +130,11 @@ static inline unsigned int x86_cpuid_family(void)
extern void __init load_ucode_bsp(void);
extern void load_ucode_ap(void);
void reload_early_microcode(void);
-extern bool get_builtin_firmware(struct cpio_data *cd, const char *name);
extern bool initrd_gone;
#else
static inline void __init load_ucode_bsp(void) { }
static inline void load_ucode_ap(void) { }
static inline void reload_early_microcode(void) { }
-static inline bool
-get_builtin_firmware(struct cpio_data *cd, const char *name) { return false; }
#endif
#endif /* _ASM_X86_MICROCODE_H */
diff --git a/arch/x86/include/asm/mmx.h b/arch/x86/include/asm/mmx.h
index f572d0f944bb..e69de29bb2d1 100644
--- a/arch/x86/include/asm/mmx.h
+++ b/arch/x86/include/asm/mmx.h
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_MMX_H
-#define _ASM_X86_MMX_H
-
-/*
- * MMX 3Dnow! helper operations
- */
-
-#include <linux/types.h>
-
-extern void *_mmx_memcpy(void *to, const void *from, size_t size);
-extern void mmx_clear_page(void *page);
-extern void mmx_copy_page(void *to, void *from);
-
-#endif /* _ASM_X86_MMX_H */
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index adccbc209169..a82f603d4312 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -11,23 +11,14 @@
#include <asm/paravirt.h>
#include <asm/mshyperv.h>
+union hv_ghcb;
+
+DECLARE_STATIC_KEY_FALSE(isolation_type_snp);
+
typedef int (*hyperv_fill_flush_list_func)(
struct hv_guest_mapping_flush_list *flush,
void *data);
-static inline void hv_set_register(unsigned int reg, u64 value)
-{
- wrmsrl(reg, value);
-}
-
-static inline u64 hv_get_register(unsigned int reg)
-{
- u64 value;
-
- rdmsrl(reg, value);
- return value;
-}
-
#define hv_get_raw_timer() rdtsc_ordered()
void hyperv_vector_handler(struct pt_regs *regs);
@@ -39,6 +30,8 @@ extern void *hv_hypercall_pg;
extern u64 hv_current_partition_id;
+extern union hv_ghcb * __percpu *hv_ghcb_pg;
+
int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages);
int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id);
int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags);
@@ -176,18 +169,55 @@ bool hv_vcpu_is_preempted(int vcpu);
static inline void hv_apic_init(void) {}
#endif
-static inline void hv_set_msi_entry_from_desc(union hv_msi_entry *msi_entry,
- struct msi_desc *msi_desc)
-{
- msi_entry->address.as_uint32 = msi_desc->msg.address_lo;
- msi_entry->data.as_uint32 = msi_desc->msg.data;
-}
-
struct irq_domain *hv_create_pci_msi_domain(void);
int hv_map_ioapic_interrupt(int ioapic_id, bool level, int vcpu, int vector,
struct hv_interrupt_entry *entry);
int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry);
+int hv_set_mem_host_visibility(unsigned long addr, int numpages, bool visible);
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+void hv_ghcb_msr_write(u64 msr, u64 value);
+void hv_ghcb_msr_read(u64 msr, u64 *value);
+#else
+static inline void hv_ghcb_msr_write(u64 msr, u64 value) {}
+static inline void hv_ghcb_msr_read(u64 msr, u64 *value) {}
+#endif
+
+extern bool hv_isolation_type_snp(void);
+
+static inline bool hv_is_synic_reg(unsigned int reg)
+{
+ if ((reg >= HV_REGISTER_SCONTROL) &&
+ (reg <= HV_REGISTER_SINT15))
+ return true;
+ return false;
+}
+
+static inline u64 hv_get_register(unsigned int reg)
+{
+ u64 value;
+
+ if (hv_is_synic_reg(reg) && hv_isolation_type_snp())
+ hv_ghcb_msr_read(reg, &value);
+ else
+ rdmsrl(reg, value);
+ return value;
+}
+
+static inline void hv_set_register(unsigned int reg, u64 value)
+{
+ if (hv_is_synic_reg(reg) && hv_isolation_type_snp()) {
+ hv_ghcb_msr_write(reg, value);
+
+ /* Write proxy bit via wrmsl instruction */
+ if (reg >= HV_REGISTER_SINT0 &&
+ reg <= HV_REGISTER_SINT15)
+ wrmsrl(reg, value | 1 << 20);
+ } else {
+ wrmsrl(reg, value);
+ }
+}
#else /* CONFIG_HYPERV */
static inline void hyperv_init(void) {}
@@ -205,6 +235,13 @@ static inline int hyperv_flush_guest_mapping_range(u64 as,
{
return -1;
}
+static inline void hv_set_register(unsigned int reg, u64 value) { }
+static inline u64 hv_get_register(unsigned int reg) { return 0; }
+static inline int hv_set_mem_host_visibility(unsigned long addr, int numpages,
+ bool visible)
+{
+ return -1;
+}
#endif /* CONFIG_HYPERV */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index a7c413432b33..3faf0f97edb1 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -486,6 +486,23 @@
#define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f
+/* AMD Collaborative Processor Performance Control MSRs */
+#define MSR_AMD_CPPC_CAP1 0xc00102b0
+#define MSR_AMD_CPPC_ENABLE 0xc00102b1
+#define MSR_AMD_CPPC_CAP2 0xc00102b2
+#define MSR_AMD_CPPC_REQ 0xc00102b3
+#define MSR_AMD_CPPC_STATUS 0xc00102b4
+
+#define AMD_CPPC_LOWEST_PERF(x) (((x) >> 0) & 0xff)
+#define AMD_CPPC_LOWNONLIN_PERF(x) (((x) >> 8) & 0xff)
+#define AMD_CPPC_NOMINAL_PERF(x) (((x) >> 16) & 0xff)
+#define AMD_CPPC_HIGHEST_PERF(x) (((x) >> 24) & 0xff)
+
+#define AMD_CPPC_MAX_PERF(x) (((x) & 0xff) << 0)
+#define AMD_CPPC_MIN_PERF(x) (((x) & 0xff) << 8)
+#define AMD_CPPC_DES_PERF(x) (((x) & 0xff) << 16)
+#define AMD_CPPC_ENERGY_PERF_PREF(x) (((x) & 0xff) << 24)
+
/* Fam 17h MSRs */
#define MSR_F17H_IRPERF 0xc00000e9
@@ -625,6 +642,8 @@
#define MSR_IA32_BNDCFGS_RSVD 0x00000ffc
+#define MSR_IA32_XFD 0x000001c4
+#define MSR_IA32_XFD_ERR 0x000001c5
#define MSR_IA32_XSS 0x00000da0
#define MSR_IA32_APICBASE 0x0000001b
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index a3f87f1015d3..d42e6c6b47b1 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -92,7 +92,7 @@ static __always_inline unsigned long long __rdmsr(unsigned int msr)
asm volatile("1: rdmsr\n"
"2:\n"
- _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_unsafe)
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR)
: EAX_EDX_RET(val, low, high) : "c" (msr));
return EAX_EDX_VAL(val, low, high);
@@ -102,7 +102,7 @@ static __always_inline void __wrmsr(unsigned int msr, u32 low, u32 high)
{
asm volatile("1: wrmsr\n"
"2:\n"
- _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_unsafe)
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR)
: : "c" (msr), "a"(low), "d" (high) : "memory");
}
@@ -137,17 +137,11 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr,
{
DECLARE_ARGS(val, low, high);
- asm volatile("2: rdmsr ; xor %[err],%[err]\n"
- "1:\n\t"
- ".section .fixup,\"ax\"\n\t"
- "3: mov %[fault],%[err]\n\t"
- "xorl %%eax, %%eax\n\t"
- "xorl %%edx, %%edx\n\t"
- "jmp 1b\n\t"
- ".previous\n\t"
- _ASM_EXTABLE(2b, 3b)
+ asm volatile("1: rdmsr ; xor %[err],%[err]\n"
+ "2:\n\t"
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_RDMSR_SAFE, %[err])
: [err] "=r" (*err), EAX_EDX_RET(val, low, high)
- : "c" (msr), [fault] "i" (-EIO));
+ : "c" (msr));
if (tracepoint_enabled(read_msr))
do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), *err);
return EAX_EDX_VAL(val, low, high);
@@ -169,15 +163,11 @@ native_write_msr_safe(unsigned int msr, u32 low, u32 high)
{
int err;
- asm volatile("2: wrmsr ; xor %[err],%[err]\n"
- "1:\n\t"
- ".section .fixup,\"ax\"\n\t"
- "3: mov %[fault],%[err] ; jmp 1b\n\t"
- ".previous\n\t"
- _ASM_EXTABLE(2b, 3b)
+ asm volatile("1: wrmsr ; xor %[err],%[err]\n"
+ "2:\n\t"
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_WRMSR_SAFE, %[err])
: [err] "=a" (err)
- : "c" (msr), "0" (low), "d" (high),
- [fault] "i" (-EIO)
+ : "c" (msr), "0" (low), "d" (high)
: "memory");
if (tracepoint_enabled(write_msr))
do_trace_write_msr(msr, ((u64)high << 32 | low), err);
diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h
index 829df26fd7a3..76d726074c16 100644
--- a/arch/x86/include/asm/mtrr.h
+++ b/arch/x86/include/asm/mtrr.h
@@ -24,8 +24,8 @@
#define _ASM_X86_MTRR_H
#include <uapi/asm/mtrr.h>
-#include <asm/memtype.h>
+void mtrr_bp_init(void);
/*
* The following functions are for use by other drivers that cannot use
@@ -43,7 +43,6 @@ extern int mtrr_del(int reg, unsigned long base, unsigned long size);
extern int mtrr_del_page(int reg, unsigned long base, unsigned long size);
extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi);
extern void mtrr_ap_init(void);
-extern void mtrr_bp_init(void);
extern void set_mtrr_aps_delayed_init(void);
extern void mtrr_aps_init(void);
extern void mtrr_bp_restore(void);
@@ -84,11 +83,6 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn)
static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
{
}
-static inline void mtrr_bp_init(void)
-{
- pat_disable("PAT support disabled because CONFIG_MTRR is disabled in the kernel.");
-}
-
#define mtrr_ap_init() do {} while (0)
#define set_mtrr_aps_delayed_init() do {} while (0)
#define mtrr_aps_init() do {} while (0)
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index ec2d5c8c6694..cc74dc584836 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -5,12 +5,15 @@
#include <linux/static_key.h>
#include <linux/objtool.h>
+#include <linux/linkage.h>
#include <asm/alternative.h>
#include <asm/cpufeatures.h>
#include <asm/msr-index.h>
#include <asm/unwind_hints.h>
+#define RETPOLINE_THUNK_SIZE 32
+
/*
* Fill the CPU return stack buffer.
*
@@ -118,6 +121,16 @@
".popsection\n\t"
#ifdef CONFIG_RETPOLINE
+
+typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE];
+
+#define GEN(reg) \
+ extern retpoline_thunk_t __x86_indirect_thunk_ ## reg;
+#include <asm/GEN-for-each-reg.h>
+#undef GEN
+
+extern retpoline_thunk_t __x86_indirect_thunk_array[];
+
#ifdef CONFIG_X86_64
/*
@@ -303,63 +316,4 @@ static inline void mds_idle_clear_cpu_buffers(void)
#endif /* __ASSEMBLY__ */
-/*
- * Below is used in the eBPF JIT compiler and emits the byte sequence
- * for the following assembly:
- *
- * With retpolines configured:
- *
- * callq do_rop
- * spec_trap:
- * pause
- * lfence
- * jmp spec_trap
- * do_rop:
- * mov %rcx,(%rsp) for x86_64
- * mov %edx,(%esp) for x86_32
- * retq
- *
- * Without retpolines configured:
- *
- * jmp *%rcx for x86_64
- * jmp *%edx for x86_32
- */
-#ifdef CONFIG_RETPOLINE
-# ifdef CONFIG_X86_64
-# define RETPOLINE_RCX_BPF_JIT_SIZE 17
-# define RETPOLINE_RCX_BPF_JIT() \
-do { \
- EMIT1_off32(0xE8, 7); /* callq do_rop */ \
- /* spec_trap: */ \
- EMIT2(0xF3, 0x90); /* pause */ \
- EMIT3(0x0F, 0xAE, 0xE8); /* lfence */ \
- EMIT2(0xEB, 0xF9); /* jmp spec_trap */ \
- /* do_rop: */ \
- EMIT4(0x48, 0x89, 0x0C, 0x24); /* mov %rcx,(%rsp) */ \
- EMIT1(0xC3); /* retq */ \
-} while (0)
-# else /* !CONFIG_X86_64 */
-# define RETPOLINE_EDX_BPF_JIT() \
-do { \
- EMIT1_off32(0xE8, 7); /* call do_rop */ \
- /* spec_trap: */ \
- EMIT2(0xF3, 0x90); /* pause */ \
- EMIT3(0x0F, 0xAE, 0xE8); /* lfence */ \
- EMIT2(0xEB, 0xF9); /* jmp spec_trap */ \
- /* do_rop: */ \
- EMIT3(0x89, 0x14, 0x24); /* mov %edx,(%esp) */ \
- EMIT1(0xC3); /* ret */ \
-} while (0)
-# endif
-#else /* !CONFIG_RETPOLINE */
-# ifdef CONFIG_X86_64
-# define RETPOLINE_RCX_BPF_JIT_SIZE 2
-# define RETPOLINE_RCX_BPF_JIT() \
- EMIT2(0xFF, 0xE1); /* jmp *%rcx */
-# else /* !CONFIG_X86_64 */
-# define RETPOLINE_EDX_BPF_JIT() \
- EMIT2(0xFF, 0xE2) /* jmp *%edx */
-# endif
-#endif
-
#endif /* _ASM_X86_NOSPEC_BRANCH_H_ */
diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h
index 94dbd51df58f..df42f8aa99e4 100644
--- a/arch/x86/include/asm/page_32.h
+++ b/arch/x86/include/asm/page_32.h
@@ -19,19 +19,6 @@ extern unsigned long __phys_addr(unsigned long);
#define pfn_valid(pfn) ((pfn) < max_mapnr)
#endif /* CONFIG_FLATMEM */
-#ifdef CONFIG_X86_USE_3DNOW
-#include <asm/mmx.h>
-
-static inline void clear_page(void *page)
-{
- mmx_clear_page(page);
-}
-
-static inline void copy_page(void *to, void *from)
-{
- mmx_copy_page(to, from);
-}
-#else /* !CONFIG_X86_USE_3DNOW */
#include <linux/string.h>
static inline void clear_page(void *page)
@@ -43,7 +30,6 @@ static inline void copy_page(void *to, void *from)
{
memcpy(to, from, PAGE_SIZE);
}
-#endif /* CONFIG_X86_3DNOW */
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PAGE_32_H */
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 4bde0dc66100..e9c86299b835 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -5,6 +5,7 @@
#include <asm/page_64_types.h>
#ifndef __ASSEMBLY__
+#include <asm/cpufeatures.h>
#include <asm/alternative.h>
/* duplicated to the one in bootmem.h */
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index a8d4ad856568..e9e2c3ba5923 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -15,7 +15,7 @@
#define THREAD_SIZE_ORDER (2 + KASAN_STACK_ORDER)
#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
-#define EXCEPTION_STACK_ORDER (0 + KASAN_STACK_ORDER)
+#define EXCEPTION_STACK_ORDER (1 + KASAN_STACK_ORDER)
#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
#define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER)
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index da3a1ac82be5..0d76502cc6f5 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -52,11 +52,11 @@ void __init paravirt_set_cap(void);
/* The paravirtualized I/O functions */
static inline void slow_down_io(void)
{
- pv_ops.cpu.io_delay();
+ PVOP_VCALL0(cpu.io_delay);
#ifdef REALLY_SLOW_IO
- pv_ops.cpu.io_delay();
- pv_ops.cpu.io_delay();
- pv_ops.cpu.io_delay();
+ PVOP_VCALL0(cpu.io_delay);
+ PVOP_VCALL0(cpu.io_delay);
+ PVOP_VCALL0(cpu.io_delay);
#endif
}
@@ -97,6 +97,12 @@ static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
PVOP_VCALL1(mmu.exit_mmap, mm);
}
+static inline void notify_page_enc_status_changed(unsigned long pfn,
+ int npages, bool enc)
+{
+ PVOP_VCALL3(mmu.notify_page_enc_status_changed, pfn, npages, enc);
+}
+
#ifdef CONFIG_PARAVIRT_XXL
static inline void load_sp0(unsigned long sp0)
{
@@ -113,12 +119,12 @@ static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
/*
* These special macros can be used to get or set a debugging register
*/
-static inline unsigned long paravirt_get_debugreg(int reg)
+static __always_inline unsigned long paravirt_get_debugreg(int reg)
{
return PVOP_CALL1(unsigned long, cpu.get_debugreg, reg);
}
#define get_debugreg(var, reg) var = paravirt_get_debugreg(reg)
-static inline void set_debugreg(unsigned long val, int reg)
+static __always_inline void set_debugreg(unsigned long val, int reg)
{
PVOP_VCALL2(cpu.set_debugreg, reg, val);
}
@@ -133,14 +139,14 @@ static inline void write_cr0(unsigned long x)
PVOP_VCALL1(cpu.write_cr0, x);
}
-static inline unsigned long read_cr2(void)
+static __always_inline unsigned long read_cr2(void)
{
return PVOP_ALT_CALLEE0(unsigned long, mmu.read_cr2,
"mov %%cr2, %%rax;",
ALT_NOT(X86_FEATURE_XENPV));
}
-static inline void write_cr2(unsigned long x)
+static __always_inline void write_cr2(unsigned long x)
{
PVOP_VCALL1(mmu.write_cr2, x);
}
@@ -653,10 +659,10 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu);
* functions.
*/
#define PV_THUNK_NAME(func) "__raw_callee_save_" #func
-#define PV_CALLEE_SAVE_REGS_THUNK(func) \
+#define __PV_CALLEE_SAVE_REGS_THUNK(func, section) \
extern typeof(func) __raw_callee_save_##func; \
\
- asm(".pushsection .text;" \
+ asm(".pushsection " section ", \"ax\";" \
".globl " PV_THUNK_NAME(func) ";" \
".type " PV_THUNK_NAME(func) ", @function;" \
PV_THUNK_NAME(func) ":" \
@@ -665,10 +671,13 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu);
"call " #func ";" \
PV_RESTORE_ALL_CALLER_REGS \
FRAME_END \
- "ret;" \
+ ASM_RET \
".size " PV_THUNK_NAME(func) ", .-" PV_THUNK_NAME(func) ";" \
".popsection")
+#define PV_CALLEE_SAVE_REGS_THUNK(func) \
+ __PV_CALLEE_SAVE_REGS_THUNK(func, ".text")
+
/* Get a reference to a callee-save function */
#define PV_CALLEE_SAVE(func) \
((struct paravirt_callee_save) { __raw_callee_save_##func })
@@ -678,23 +687,23 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu);
((struct paravirt_callee_save) { func })
#ifdef CONFIG_PARAVIRT_XXL
-static inline notrace unsigned long arch_local_save_flags(void)
+static __always_inline unsigned long arch_local_save_flags(void)
{
return PVOP_ALT_CALLEE0(unsigned long, irq.save_fl, "pushf; pop %%rax;",
ALT_NOT(X86_FEATURE_XENPV));
}
-static inline notrace void arch_local_irq_disable(void)
+static __always_inline void arch_local_irq_disable(void)
{
PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT(X86_FEATURE_XENPV));
}
-static inline notrace void arch_local_irq_enable(void)
+static __always_inline void arch_local_irq_enable(void)
{
PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT(X86_FEATURE_XENPV));
}
-static inline notrace unsigned long arch_local_irq_save(void)
+static __always_inline unsigned long arch_local_irq_save(void)
{
unsigned long f;
@@ -743,11 +752,6 @@ extern void default_banner(void);
#define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .quad, 8)
#define PARA_INDIRECT(addr) *addr(%rip)
-#define INTERRUPT_RETURN \
- ANNOTATE_RETPOLINE_SAFE; \
- ALTERNATIVE_TERNARY("jmp *paravirt_iret(%rip);", \
- X86_FEATURE_XENPV, "jmp xen_iret;", "jmp native_iret;")
-
#ifdef CONFIG_DEBUG_ENTRY
.macro PARA_IRQ_save_fl
PARA_SITE(PARA_PATCH(PV_IRQ_save_fl),
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index d9d6b0203ec4..a69012e1903f 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -168,6 +168,7 @@ struct pv_mmu_ops {
/* Hook for intercepting the destruction of an mm_struct. */
void (*exit_mmap)(struct mm_struct *mm);
+ void (*notify_page_enc_status_changed)(unsigned long pfn, int npages, bool enc);
#ifdef CONFIG_PARAVIRT_XXL
struct paravirt_callee_save read_cr2;
@@ -577,7 +578,9 @@ void paravirt_leave_lazy_mmu(void);
void paravirt_flush_lazy_mmu(void);
void _paravirt_nop(void);
+void paravirt_BUG(void);
u64 _paravirt_ident_64(u64);
+unsigned long paravirt_ret0(void);
#define paravirt_nop ((void *)_paravirt_nop)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 448cd01eb3ec..8a9432fb3802 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -22,10 +22,12 @@
#define pgprot_decrypted(prot) __pgprot(__sme_clr(pgprot_val(prot)))
#ifndef __ASSEMBLY__
+#include <linux/spinlock.h>
#include <asm/x86_init.h>
#include <asm/pkru.h>
#include <asm/fpu/api.h>
#include <asm-generic/pgtable_uffd.h>
+#include <linux/page_table_check.h>
extern pgd_t early_top_pgt[PTRS_PER_PGD];
bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
@@ -752,7 +754,7 @@ static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
return true;
if ((pte_flags(a) & _PAGE_PROTNONE) &&
- mm_tlb_flush_pending(mm))
+ atomic_read(&mm->tlb_flush_pending))
return true;
return false;
@@ -1006,18 +1008,21 @@ static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp)
static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte)
{
+ page_table_check_pte_set(mm, addr, ptep, pte);
set_pte(ptep, pte);
}
static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd)
{
+ page_table_check_pmd_set(mm, addr, pmdp, pmd);
set_pmd(pmdp, pmd);
}
static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
pud_t *pudp, pud_t pud)
{
+ page_table_check_pud_set(mm, addr, pudp, pud);
native_set_pud(pudp, pud);
}
@@ -1048,6 +1053,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
pte_t pte = native_ptep_get_and_clear(ptep);
+ page_table_check_pte_clear(mm, addr, pte);
return pte;
}
@@ -1063,12 +1069,23 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
* care about updates and native needs no locking
*/
pte = native_local_ptep_get_and_clear(ptep);
+ page_table_check_pte_clear(mm, addr, pte);
} else {
pte = ptep_get_and_clear(mm, addr, ptep);
}
return pte;
}
+#define __HAVE_ARCH_PTEP_CLEAR
+static inline void ptep_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+ if (IS_ENABLED(CONFIG_PAGE_TABLE_CHECK))
+ ptep_get_and_clear(mm, addr, ptep);
+ else
+ pte_clear(mm, addr, ptep);
+}
+
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
static inline void ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
@@ -1109,14 +1126,22 @@ static inline int pmd_write(pmd_t pmd)
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp)
{
- return native_pmdp_get_and_clear(pmdp);
+ pmd_t pmd = native_pmdp_get_and_clear(pmdp);
+
+ page_table_check_pmd_clear(mm, addr, pmd);
+
+ return pmd;
}
#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
unsigned long addr, pud_t *pudp)
{
- return native_pudp_get_and_clear(pudp);
+ pud_t pud = native_pudp_get_and_clear(pudp);
+
+ page_table_check_pud_clear(mm, addr, pud);
+
+ return pud;
}
#define __HAVE_ARCH_PMDP_SET_WRPROTECT
@@ -1137,6 +1162,7 @@ static inline int pud_write(pud_t pud)
static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp, pmd_t pmd)
{
+ page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd);
if (IS_ENABLED(CONFIG_SMP)) {
return xchg(pmdp, pmd);
} else {
diff --git a/arch/x86/include/asm/pkru.h b/arch/x86/include/asm/pkru.h
index ccc539faa5bb..74f0a2d34ffd 100644
--- a/arch/x86/include/asm/pkru.h
+++ b/arch/x86/include/asm/pkru.h
@@ -2,10 +2,10 @@
#ifndef _ASM_X86_PKRU_H
#define _ASM_X86_PKRU_H
-#include <asm/fpu/xstate.h>
+#include <asm/cpufeature.h>
-#define PKRU_AD_BIT 0x1
-#define PKRU_WD_BIT 0x2
+#define PKRU_AD_BIT 0x1u
+#define PKRU_WD_BIT 0x2u
#define PKRU_BITS_PER_PKEY 2
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 9ad2acaaae9b..2c5f12ae7d04 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -164,7 +164,8 @@ enum cpuid_regs_idx {
#define X86_VENDOR_NSC 8
#define X86_VENDOR_HYGON 9
#define X86_VENDOR_ZHAOXIN 10
-#define X86_VENDOR_NUM 11
+#define X86_VENDOR_VORTEX 11
+#define X86_VENDOR_NUM 12
#define X86_VENDOR_UNKNOWN 0xff
@@ -461,9 +462,6 @@ DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr);
#endif /* !X86_64 */
-extern unsigned int fpu_kernel_xstate_size;
-extern unsigned int fpu_user_xstate_size;
-
struct perf_event;
struct thread_struct {
@@ -518,6 +516,7 @@ struct thread_struct {
*/
unsigned long iopl_emul;
+ unsigned int iopl_warn:1;
unsigned int sig_on_uaccess_err:1;
/*
@@ -537,12 +536,12 @@ struct thread_struct {
*/
};
-/* Whitelist the FPU state from the task_struct for hardened usercopy. */
+extern void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size);
+
static inline void arch_thread_struct_whitelist(unsigned long *offset,
unsigned long *size)
{
- *offset = offsetof(struct thread_struct, fpu.state);
- *size = fpu_kernel_xstate_size;
+ fpu_thread_struct_whitelist(offset, size);
}
static inline void
@@ -589,7 +588,7 @@ static inline void load_sp0(unsigned long sp0)
/* Free all resources held by a thread. */
extern void release_thread(struct task_struct *);
-unsigned long get_wchan(struct task_struct *p);
+unsigned long __get_wchan(struct task_struct *p);
/*
* Generic CPUID function
@@ -807,11 +806,14 @@ static inline u32 amd_get_nodes_per_socket(void) { return 0; }
static inline u32 amd_get_highest_perf(void) { return 0; }
#endif
+#define for_each_possible_hypervisor_cpuid_base(function) \
+ for (function = 0x40000000; function < 0x40010000; function += 0x100)
+
static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves)
{
uint32_t base, eax, signature[3];
- for (base = 0x40000000; base < 0x40010000; base += 0x100) {
+ for_each_possible_hypervisor_cpuid_base(base) {
cpuid(base, &eax, &signature[0], &signature[1], &signature[2]);
if (!memcmp(sig, signature, 12) &&
@@ -853,4 +855,12 @@ enum mds_mitigations {
MDS_MITIGATION_VMWERV,
};
+#ifdef CONFIG_X86_SGX
+int arch_memory_failure(unsigned long pfn, int flags);
+#define arch_memory_failure arch_memory_failure
+
+bool arch_is_platform_page(u64 paddr);
+#define arch_is_platform_page arch_is_platform_page
+#endif
+
#endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 8c5d1910a848..feed36d44d04 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -40,6 +40,6 @@ void x86_report_nx(void);
extern int reboot_force;
long do_arch_prctl_common(struct task_struct *task, int option,
- unsigned long cpuid_enabled);
+ unsigned long arg2);
#endif /* _ASM_X86_PROTO_H */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index b94f615600d5..703663175a5a 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -181,7 +181,7 @@ static inline bool any_64bit_mode(struct pt_regs *regs)
#define current_user_stack_pointer() current_pt_regs()->sp
#define compat_user_stack_pointer() current_pt_regs()->sp
-static inline bool ip_within_syscall_gap(struct pt_regs *regs)
+static __always_inline bool ip_within_syscall_gap(struct pt_regs *regs)
{
bool ret = (regs->ip >= (unsigned long)entry_SYSCALL_64 &&
regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack);
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index d86ab942219c..d87451df480b 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -53,6 +53,7 @@ static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
static inline void queued_spin_unlock(struct qspinlock *lock)
{
+ kcsan_release();
pv_queued_spin_unlock(lock);
}
diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
index 159622ee0674..1474cf96251d 100644
--- a/arch/x86/include/asm/qspinlock_paravirt.h
+++ b/arch/x86/include/asm/qspinlock_paravirt.h
@@ -48,7 +48,7 @@ asm (".pushsection .text;"
"jne .slowpath;"
"pop %rdx;"
FRAME_END
- "ret;"
+ ASM_RET
".slowpath: "
"push %rsi;"
"movzbl %al,%esi;"
@@ -56,7 +56,7 @@ asm (".pushsection .text;"
"pop %rsi;"
"pop %rdx;"
FRAME_END
- "ret;"
+ ASM_RET
".size " PV_UNLOCK ", .-" PV_UNLOCK ";"
".popsection");
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index 5db5d083c873..331474b150f1 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -89,6 +89,7 @@ static inline void set_real_mode_mem(phys_addr_t mem)
}
void reserve_real_mode(void);
+void load_trampoline_pgtable(void);
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index b2d504f11937..aff774775c67 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -35,11 +35,7 @@
# define NEED_CMOV 0
#endif
-#ifdef CONFIG_X86_USE_3DNOW
-# define NEED_3DNOW (1<<(X86_FEATURE_3DNOW & 31))
-#else
# define NEED_3DNOW 0
-#endif
#if defined(CONFIG_X86_P6_NOP) || defined(CONFIG_X86_64)
# define NEED_NOPL (1<<(X86_FEATURE_NOPL & 31))
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 72044026eb3c..b228c9d44ee7 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -307,14 +307,7 @@ do { \
\
asm volatile(" \n" \
"1: movl %k0,%%" #seg " \n" \
- \
- ".section .fixup,\"ax\" \n" \
- "2: xorl %k0,%k0 \n" \
- " jmp 1b \n" \
- ".previous \n" \
- \
- _ASM_EXTABLE(1b, 2b) \
- \
+ _ASM_EXTABLE_TYPE_REG(1b, 1b, EX_TYPE_ZERO_REG, %k0)\
: "+r" (__val) : : "memory"); \
} while (0)
@@ -339,7 +332,7 @@ static inline void __loadsegment_fs(unsigned short value)
"1: movw %0, %%fs \n"
"2: \n"
- _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_clear_fs)
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_CLEAR_FS)
: : "rm" (value) : "memory");
}
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index 43fa081a1adb..ff0f2d90338a 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -2,6 +2,7 @@
#ifndef _ASM_X86_SET_MEMORY_H
#define _ASM_X86_SET_MEMORY_H
+#include <linux/mm.h>
#include <asm/page.h>
#include <asm-generic/set_memory.h>
@@ -83,6 +84,7 @@ int set_pages_rw(struct page *page, int numpages);
int set_direct_map_invalid_noflush(struct page *page);
int set_direct_map_default_noflush(struct page *page);
bool kernel_page_present(struct page *page);
+void notify_range_enc_status_changed(unsigned long vaddr, int npages, bool enc);
extern int kernel_set_to_readonly;
@@ -98,6 +100,9 @@ static inline int set_mce_nospec(unsigned long pfn, bool unmap)
unsigned long decoy_addr;
int rc;
+ /* SGX pages are not in the 1:1 map */
+ if (arch_is_platform_page(pfn << PAGE_SHIFT))
+ return 0;
/*
* We would like to just call:
* set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1);
diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h
index 2cef6c5a52c2..1b2fd32b42fe 100644
--- a/arch/x86/include/asm/sev-common.h
+++ b/arch/x86/include/asm/sev-common.h
@@ -18,20 +18,19 @@
/* SEV Information Request/Response */
#define GHCB_MSR_SEV_INFO_RESP 0x001
#define GHCB_MSR_SEV_INFO_REQ 0x002
-#define GHCB_MSR_VER_MAX_POS 48
-#define GHCB_MSR_VER_MAX_MASK 0xffff
-#define GHCB_MSR_VER_MIN_POS 32
-#define GHCB_MSR_VER_MIN_MASK 0xffff
-#define GHCB_MSR_CBIT_POS 24
-#define GHCB_MSR_CBIT_MASK 0xff
-#define GHCB_MSR_SEV_INFO(_max, _min, _cbit) \
- ((((_max) & GHCB_MSR_VER_MAX_MASK) << GHCB_MSR_VER_MAX_POS) | \
- (((_min) & GHCB_MSR_VER_MIN_MASK) << GHCB_MSR_VER_MIN_POS) | \
- (((_cbit) & GHCB_MSR_CBIT_MASK) << GHCB_MSR_CBIT_POS) | \
+
+#define GHCB_MSR_SEV_INFO(_max, _min, _cbit) \
+ /* GHCBData[63:48] */ \
+ ((((_max) & 0xffff) << 48) | \
+ /* GHCBData[47:32] */ \
+ (((_min) & 0xffff) << 32) | \
+ /* GHCBData[31:24] */ \
+ (((_cbit) & 0xff) << 24) | \
GHCB_MSR_SEV_INFO_RESP)
+
#define GHCB_MSR_INFO(v) ((v) & 0xfffUL)
-#define GHCB_MSR_PROTO_MAX(v) (((v) >> GHCB_MSR_VER_MAX_POS) & GHCB_MSR_VER_MAX_MASK)
-#define GHCB_MSR_PROTO_MIN(v) (((v) >> GHCB_MSR_VER_MIN_POS) & GHCB_MSR_VER_MIN_MASK)
+#define GHCB_MSR_PROTO_MAX(v) (((v) >> 48) & 0xffff)
+#define GHCB_MSR_PROTO_MIN(v) (((v) >> 32) & 0xffff)
/* CPUID Request/Response */
#define GHCB_MSR_CPUID_REQ 0x004
@@ -46,31 +45,48 @@
#define GHCB_CPUID_REQ_EBX 1
#define GHCB_CPUID_REQ_ECX 2
#define GHCB_CPUID_REQ_EDX 3
-#define GHCB_CPUID_REQ(fn, reg) \
- (GHCB_MSR_CPUID_REQ | \
- (((unsigned long)reg & GHCB_MSR_CPUID_REG_MASK) << GHCB_MSR_CPUID_REG_POS) | \
- (((unsigned long)fn) << GHCB_MSR_CPUID_FUNC_POS))
+#define GHCB_CPUID_REQ(fn, reg) \
+ /* GHCBData[11:0] */ \
+ (GHCB_MSR_CPUID_REQ | \
+ /* GHCBData[31:12] */ \
+ (((unsigned long)(reg) & 0x3) << 30) | \
+ /* GHCBData[63:32] */ \
+ (((unsigned long)fn) << 32))
/* AP Reset Hold */
-#define GHCB_MSR_AP_RESET_HOLD_REQ 0x006
-#define GHCB_MSR_AP_RESET_HOLD_RESP 0x007
+#define GHCB_MSR_AP_RESET_HOLD_REQ 0x006
+#define GHCB_MSR_AP_RESET_HOLD_RESP 0x007
/* GHCB Hypervisor Feature Request/Response */
-#define GHCB_MSR_HV_FT_REQ 0x080
-#define GHCB_MSR_HV_FT_RESP 0x081
+#define GHCB_MSR_HV_FT_REQ 0x080
+#define GHCB_MSR_HV_FT_RESP 0x081
#define GHCB_MSR_TERM_REQ 0x100
#define GHCB_MSR_TERM_REASON_SET_POS 12
#define GHCB_MSR_TERM_REASON_SET_MASK 0xf
#define GHCB_MSR_TERM_REASON_POS 16
#define GHCB_MSR_TERM_REASON_MASK 0xff
-#define GHCB_SEV_TERM_REASON(reason_set, reason_val) \
- (((((u64)reason_set) & GHCB_MSR_TERM_REASON_SET_MASK) << GHCB_MSR_TERM_REASON_SET_POS) | \
- ((((u64)reason_val) & GHCB_MSR_TERM_REASON_MASK) << GHCB_MSR_TERM_REASON_POS))
-#define GHCB_SEV_ES_REASON_GENERAL_REQUEST 0
-#define GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED 1
+#define GHCB_SEV_TERM_REASON(reason_set, reason_val) \
+ /* GHCBData[15:12] */ \
+ (((((u64)reason_set) & 0xf) << 12) | \
+ /* GHCBData[23:16] */ \
+ ((((u64)reason_val) & 0xff) << 16))
+
+#define GHCB_SEV_ES_GEN_REQ 0
+#define GHCB_SEV_ES_PROT_UNSUPPORTED 1
#define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK)
+/*
+ * Error codes related to GHCB input that can be communicated back to the guest
+ * by setting the lower 32-bits of the GHCB SW_EXITINFO1 field to 2.
+ */
+#define GHCB_ERR_NOT_REGISTERED 1
+#define GHCB_ERR_INVALID_USAGE 2
+#define GHCB_ERR_INVALID_SCRATCH_AREA 3
+#define GHCB_ERR_MISSING_INPUT 4
+#define GHCB_ERR_INVALID_INPUT 5
+#define GHCB_ERR_INVALID_EVENT 6
+
#endif
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index fa5cd05d3b5b..ec060c433589 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -53,6 +53,7 @@ static inline u64 lower_bits(u64 val, unsigned int bits)
struct real_mode_header;
enum stack_type;
+struct ghcb;
/* Early IDT entry points for #VC handler */
extern void vc_no_ghcb(void);
@@ -81,6 +82,11 @@ static __always_inline void sev_es_nmi_complete(void)
__sev_es_nmi_complete();
}
extern int __init sev_es_efi_map_ghcbs(pgd_t *pgd);
+extern enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb,
+ bool set_ghcb_msr,
+ struct es_em_ctxt *ctxt,
+ u64 exit_code, u64 exit_info_1,
+ u64 exit_info_2);
#else
static inline void sev_es_ist_enter(struct pt_regs *regs) { }
static inline void sev_es_ist_exit(void) { }
diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h
index 05f3e21f01a7..3f9334ef67cd 100644
--- a/arch/x86/include/asm/sgx.h
+++ b/arch/x86/include/asm/sgx.h
@@ -46,6 +46,24 @@ enum sgx_encls_function {
};
/**
+ * SGX_ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr
+ *
+ * ENCLS has its own (positive value) error codes and also generates
+ * ENCLS specific #GP and #PF faults. And the ENCLS values get munged
+ * with system error codes as everything percolates back up the stack.
+ * Unfortunately (for us), we need to precisely identify each unique
+ * error code, e.g. the action taken if EWB fails varies based on the
+ * type of fault and on the exact SGX error code, i.e. we can't simply
+ * convert all faults to -EFAULT.
+ *
+ * To make all three error types coexist, we set bit 30 to identify an
+ * ENCLS fault. Bit 31 (technically bits N:31) is used to differentiate
+ * between positive (faults and SGX error codes) and negative (system
+ * error codes) values.
+ */
+#define SGX_ENCLS_FAULT_FLAG 0x40000000
+
+/**
* enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV
* %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not
* been completed yet.
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 630ff08532be..81a0211a372d 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -16,7 +16,9 @@ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map);
/* cpus sharing the last level cache: */
DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
+DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map);
DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
+DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id);
DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
static inline struct cpumask *cpu_llc_shared_mask(int cpu)
@@ -24,6 +26,11 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu)
return per_cpu(cpu_llc_shared_map, cpu);
}
+static inline struct cpumask *cpu_l2c_shared_mask(int cpu)
+{
+ return per_cpu(cpu_l2c_shared_map, cpu);
+}
+
DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid);
DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid);
DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
@@ -119,6 +126,7 @@ static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
void cpu_disable_common(void);
void native_smp_prepare_boot_cpu(void);
+void smp_prepare_cpus_common(void);
void native_smp_prepare_cpus(unsigned int max_cpus);
void calculate_max_logical_packages(void);
void native_smp_cpus_done(unsigned int max_cpus);
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index f248eb2ac2d4..3881b5333eb8 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -38,6 +38,16 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
bool get_stack_info_noinstr(unsigned long *stack, struct task_struct *task,
struct stack_info *info);
+static __always_inline
+bool get_stack_guard_info(unsigned long *stack, struct stack_info *info)
+{
+ /* make sure it's not in the stack proper */
+ if (get_stack_info_noinstr(stack, current, info))
+ return false;
+ /* but if it is in the page below it, we hit a guard */
+ return get_stack_info_noinstr((void *)stack + PAGE_SIZE, current, info);
+}
+
const char *stack_type_name(enum stack_type type);
static inline bool on_stack(struct stack_info *info, void *addr, size_t len)
diff --git a/arch/x86/include/asm/static_call.h b/arch/x86/include/asm/static_call.h
index cbb67b6030f9..ed4f8bb6c2d9 100644
--- a/arch/x86/include/asm/static_call.h
+++ b/arch/x86/include/asm/static_call.h
@@ -27,6 +27,7 @@
".globl " STATIC_CALL_TRAMP_STR(name) " \n" \
STATIC_CALL_TRAMP_STR(name) ": \n" \
insns " \n" \
+ ".byte 0x53, 0x43, 0x54 \n" \
".type " STATIC_CALL_TRAMP_STR(name) ", @function \n" \
".size " STATIC_CALL_TRAMP_STR(name) ", . - " STATIC_CALL_TRAMP_STR(name) " \n" \
".popsection \n")
@@ -35,7 +36,7 @@
__ARCH_DEFINE_STATIC_CALL_TRAMP(name, ".byte 0xe9; .long " #func " - (. + 4)")
#define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) \
- __ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret; nop; nop; nop; nop")
+ __ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret; int3; nop; nop; nop")
#define ARCH_ADD_TRAMP_KEY(name) \
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index f74362b05619..32c0d981a82a 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -146,42 +146,9 @@ static __always_inline void *__constant_memcpy(void *to, const void *from,
extern void *memcpy(void *, const void *, size_t);
#ifndef CONFIG_FORTIFY_SOURCE
-#ifdef CONFIG_X86_USE_3DNOW
-
-#include <asm/mmx.h>
-
-/*
- * This CPU favours 3DNow strongly (eg AMD Athlon)
- */
-
-static inline void *__constant_memcpy3d(void *to, const void *from, size_t len)
-{
- if (len < 512)
- return __constant_memcpy(to, from, len);
- return _mmx_memcpy(to, from, len);
-}
-
-static inline void *__memcpy3d(void *to, const void *from, size_t len)
-{
- if (len < 512)
- return __memcpy(to, from, len);
- return _mmx_memcpy(to, from, len);
-}
-
-#define memcpy(t, f, n) \
- (__builtin_constant_p((n)) \
- ? __constant_memcpy3d((t), (f), (n)) \
- : __memcpy3d((t), (f), (n)))
-
-#else
-
-/*
- * No 3D Now!
- */
#define memcpy(t, f, n) __builtin_memcpy(t, f, n)
-#endif
#endif /* !CONFIG_FORTIFY_SOURCE */
#define __HAVE_ARCH_MEMMOVE
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index f7e2d82d24fb..5b85987a5e97 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -87,15 +87,6 @@ static inline void syscall_get_arguments(struct task_struct *task,
memcpy(args, &regs->bx, 6 * sizeof(args[0]));
}
-static inline void syscall_set_arguments(struct task_struct *task,
- struct pt_regs *regs,
- unsigned int i, unsigned int n,
- const unsigned long *args)
-{
- BUG_ON(i + n > 6);
- memcpy(&regs->bx + i, args, n * sizeof(args[0]));
-}
-
static inline int syscall_get_arch(struct task_struct *task)
{
return AUDIT_ARCH_I386;
@@ -127,30 +118,6 @@ static inline void syscall_get_arguments(struct task_struct *task,
}
}
-static inline void syscall_set_arguments(struct task_struct *task,
- struct pt_regs *regs,
- const unsigned long *args)
-{
-# ifdef CONFIG_IA32_EMULATION
- if (task->thread_info.status & TS_COMPAT) {
- regs->bx = *args++;
- regs->cx = *args++;
- regs->dx = *args++;
- regs->si = *args++;
- regs->di = *args++;
- regs->bp = *args;
- } else
-# endif
- {
- regs->di = *args++;
- regs->si = *args++;
- regs->dx = *args++;
- regs->r10 = *args++;
- regs->r8 = *args++;
- regs->r9 = *args;
- }
-}
-
static inline int syscall_get_arch(struct task_struct *task)
{
/* x32 tasks should be considered AUDIT_ARCH_X86_64. */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index cf132663c219..ebec69c35e95 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -57,6 +57,9 @@ struct thread_info {
unsigned long flags; /* low level flags */
unsigned long syscall_work; /* SYSCALL_WORK_ flags */
u32 status; /* thread synchronous flags */
+#ifdef CONFIG_SMP
+ u32 cpu; /* current CPU */
+#endif
};
#define INIT_THREAD_INFO(tsk) \
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index b587a9ee9cb2..98fa0a114074 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -261,4 +261,9 @@ extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
#endif /* !MODULE */
+static inline void __native_tlb_flush_global(unsigned long cr4)
+{
+ native_write_cr4(cr4 ^ X86_CR4_PGE);
+ native_write_cr4(cr4);
+}
#endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 9239399e5491..2f0b6be8eaab 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -103,6 +103,7 @@ static inline void setup_node_to_cpumask_map(void) { }
#include <asm-generic/topology.h>
extern const struct cpumask *cpu_coregroup_mask(int cpu);
+extern const struct cpumask *cpu_clustergroup_mask(int cpu);
#define topology_logical_package_id(cpu) (cpu_data(cpu).logical_proc_id)
#define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id)
@@ -113,7 +114,9 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu);
extern unsigned int __max_die_per_package;
#ifdef CONFIG_SMP
+#define topology_cluster_id(cpu) (per_cpu(cpu_l2c_id, cpu))
#define topology_die_cpumask(cpu) (per_cpu(cpu_die_map, cpu))
+#define topology_cluster_cpumask(cpu) (cpu_clustergroup_mask(cpu))
#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu))
#define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu))
@@ -218,7 +221,7 @@ static inline void arch_set_max_freq_ratio(bool turbo_disabled)
}
#endif
-#ifdef CONFIG_ACPI_CPPC_LIB
+#if defined(CONFIG_ACPI_CPPC_LIB) && defined(CONFIG_SMP)
void init_freq_invariance_cppc(void);
#define init_freq_invariance_cppc init_freq_invariance_cppc
#endif
diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h
index 879b77792f94..4645a6334063 100644
--- a/arch/x86/include/asm/trace/fpu.h
+++ b/arch/x86/include/asm/trace/fpu.h
@@ -22,8 +22,8 @@ DECLARE_EVENT_CLASS(x86_fpu,
__entry->fpu = fpu;
__entry->load_fpu = test_thread_flag(TIF_NEED_FPU_LOAD);
if (boot_cpu_has(X86_FEATURE_OSXSAVE)) {
- __entry->xfeatures = fpu->state.xsave.header.xfeatures;
- __entry->xcomp_bv = fpu->state.xsave.header.xcomp_bv;
+ __entry->xfeatures = fpu->fpstate->regs.xsave.header.xfeatures;
+ __entry->xcomp_bv = fpu->fpstate->regs.xsave.header.xcomp_bv;
}
),
TP_printk("x86/fpu: %p load: %d xfeatures: %llx xcomp_bv: %llx",
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 7f7200021bd1..6221be7cafc3 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -40,9 +40,9 @@ void math_emulate(struct math_emu_info *);
bool fault_in_kernel_space(unsigned long address);
#ifdef CONFIG_VMAP_STACK
-void __noreturn handle_stack_overflow(const char *message,
- struct pt_regs *regs,
- unsigned long fault_address);
+void __noreturn handle_stack_overflow(struct pt_regs *regs,
+ unsigned long fault_address,
+ struct stack_info *info);
#endif
#endif /* _ASM_X86_TRAPS_H */
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 5c95d242f38d..ac96f9b2d64b 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -314,11 +314,12 @@ do { \
do { \
__chk_user_ptr(ptr); \
switch (size) { \
- unsigned char x_u8__; \
- case 1: \
+ case 1: { \
+ unsigned char x_u8__; \
__get_user_asm(x_u8__, ptr, "b", "=q", label); \
(x) = x_u8__; \
break; \
+ } \
case 2: \
__get_user_asm(x, ptr, "w", "=r", label); \
break; \
@@ -351,24 +352,22 @@ do { \
"1: movl %[lowbits],%%eax\n" \
"2: movl %[highbits],%%edx\n" \
"3:\n" \
- ".section .fixup,\"ax\"\n" \
- "4: mov %[efault],%[errout]\n" \
- " xorl %%eax,%%eax\n" \
- " xorl %%edx,%%edx\n" \
- " jmp 3b\n" \
- ".previous\n" \
- _ASM_EXTABLE_UA(1b, 4b) \
- _ASM_EXTABLE_UA(2b, 4b) \
+ _ASM_EXTABLE_TYPE_REG(1b, 3b, EX_TYPE_EFAULT_REG | \
+ EX_FLAG_CLEAR_AX_DX, \
+ %[errout]) \
+ _ASM_EXTABLE_TYPE_REG(2b, 3b, EX_TYPE_EFAULT_REG | \
+ EX_FLAG_CLEAR_AX_DX, \
+ %[errout]) \
: [errout] "=r" (retval), \
[output] "=&A"(x) \
: [lowbits] "m" (__m(__ptr)), \
[highbits] "m" __m(((u32 __user *)(__ptr)) + 1), \
- [efault] "i" (-EFAULT), "0" (retval)); \
+ "0" (retval)); \
})
#else
#define __get_user_asm_u64(x, ptr, retval) \
- __get_user_asm(x, ptr, retval, "q", "=r")
+ __get_user_asm(x, ptr, retval, "q")
#endif
#define __get_user_size(x, ptr, size, retval) \
@@ -379,14 +378,14 @@ do { \
__chk_user_ptr(ptr); \
switch (size) { \
case 1: \
- __get_user_asm(x_u8__, ptr, retval, "b", "=q"); \
+ __get_user_asm(x_u8__, ptr, retval, "b"); \
(x) = x_u8__; \
break; \
case 2: \
- __get_user_asm(x, ptr, retval, "w", "=r"); \
+ __get_user_asm(x, ptr, retval, "w"); \
break; \
case 4: \
- __get_user_asm(x, ptr, retval, "l", "=r"); \
+ __get_user_asm(x, ptr, retval, "l"); \
break; \
case 8: \
__get_user_asm_u64(x, ptr, retval); \
@@ -396,22 +395,19 @@ do { \
} \
} while (0)
-#define __get_user_asm(x, addr, err, itype, ltype) \
+#define __get_user_asm(x, addr, err, itype) \
asm volatile("\n" \
"1: mov"itype" %[umem],%[output]\n" \
"2:\n" \
- ".section .fixup,\"ax\"\n" \
- "3: mov %[efault],%[errout]\n" \
- " xorl %k[output],%k[output]\n" \
- " jmp 2b\n" \
- ".previous\n" \
- _ASM_EXTABLE_UA(1b, 3b) \
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG | \
+ EX_FLAG_CLEAR_AX, \
+ %[errout]) \
: [errout] "=r" (err), \
- [output] ltype(x) \
+ [output] "=a" (x) \
: [umem] "m" (__m(addr)), \
- [efault] "i" (-EFAULT), "0" (err))
+ "0" (err))
-#endif // CONFIG_CC_ASM_GOTO_OUTPUT
+#endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT
/* FIXME: this hack is definitely wrong -AK */
struct __large_struct { unsigned long buf[100]; };
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
index 70fc159ebe69..2a1f8734416d 100644
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -4,6 +4,7 @@
#include <linux/sched.h>
#include <linux/ftrace.h>
+#include <linux/kprobes.h>
#include <asm/ptrace.h>
#include <asm/stacktrace.h>
@@ -15,6 +16,9 @@ struct unwind_state {
unsigned long stack_mask;
struct task_struct *task;
int graph_idx;
+#ifdef CONFIG_KRETPROBES
+ struct llist_node *kr_cur;
+#endif
bool error;
#if defined(CONFIG_UNWINDER_ORC)
bool signal, full_regs;
@@ -99,6 +103,31 @@ void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
void *orc, size_t orc_size) {}
#endif
+static inline
+unsigned long unwind_recover_kretprobe(struct unwind_state *state,
+ unsigned long addr, unsigned long *addr_p)
+{
+#ifdef CONFIG_KRETPROBES
+ return is_kretprobe_trampoline(addr) ?
+ kretprobe_find_ret_addr(state->task, addr_p, &state->kr_cur) :
+ addr;
+#else
+ return addr;
+#endif
+}
+
+/* Recover the return address modified by kretprobe and ftrace_graph. */
+static inline
+unsigned long unwind_recover_ret_addr(struct unwind_state *state,
+ unsigned long addr, unsigned long *addr_p)
+{
+ unsigned long ret;
+
+ ret = ftrace_graph_ret_addr(state->task, &state->graph_idx,
+ addr, addr_p);
+ return unwind_recover_kretprobe(state, ret, addr_p);
+}
+
/*
* This disables KASAN checking when reading a value from another task's stack,
* since the other task could be running on another CPU and could have poisoned
diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h
index 8e574c0afef8..8b33674288ea 100644
--- a/arch/x86/include/asm/unwind_hints.h
+++ b/arch/x86/include/asm/unwind_hints.h
@@ -52,6 +52,11 @@
UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=8 type=UNWIND_HINT_TYPE_FUNC
.endm
+#else
+
+#define UNWIND_HINT_FUNC \
+ UNWIND_HINT(ORC_REG_SP, 8, UNWIND_HINT_TYPE_FUNC, 0)
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_UNWIND_HINTS_H */
diff --git a/arch/x86/include/asm/word-at-a-time.h b/arch/x86/include/asm/word-at-a-time.h
index 06006b0351f3..8338b0432b50 100644
--- a/arch/x86/include/asm/word-at-a-time.h
+++ b/arch/x86/include/asm/word-at-a-time.h
@@ -77,30 +77,58 @@ static inline unsigned long find_zero(unsigned long mask)
* and the next page not being mapped, take the exception and
* return zeroes in the non-existing part.
*/
+#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+
static inline unsigned long load_unaligned_zeropad(const void *addr)
{
- unsigned long ret, dummy;
+ unsigned long offset, data;
+ unsigned long ret;
+
+ asm_volatile_goto(
+ "1: mov %[mem], %[ret]\n"
+
+ _ASM_EXTABLE(1b, %l[do_exception])
+
+ : [ret] "=r" (ret)
+ : [mem] "m" (*(unsigned long *)addr)
+ : : do_exception);
+
+ return ret;
+
+do_exception:
+ offset = (unsigned long)addr & (sizeof(long) - 1);
+ addr = (void *)((unsigned long)addr & ~(sizeof(long) - 1));
+ data = *(unsigned long *)addr;
+ ret = data >> offset * 8;
+
+ return ret;
+}
- asm(
- "1:\tmov %2,%0\n"
+#else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */
+
+static inline unsigned long load_unaligned_zeropad(const void *addr)
+{
+ unsigned long offset, data;
+ unsigned long ret, err = 0;
+
+ asm( "1: mov %[mem], %[ret]\n"
"2:\n"
- ".section .fixup,\"ax\"\n"
- "3:\t"
- "lea %2,%1\n\t"
- "and %3,%1\n\t"
- "mov (%1),%0\n\t"
- "leal %2,%%ecx\n\t"
- "andl %4,%%ecx\n\t"
- "shll $3,%%ecx\n\t"
- "shr %%cl,%0\n\t"
- "jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b)
- :"=&r" (ret),"=&c" (dummy)
- :"m" (*(unsigned long *)addr),
- "i" (-sizeof(unsigned long)),
- "i" (sizeof(unsigned long)-1));
+
+ _ASM_EXTABLE_FAULT(1b, 2b)
+
+ : [ret] "=&r" (ret), "+a" (err)
+ : [mem] "m" (*(unsigned long *)addr));
+
+ if (unlikely(err)) {
+ offset = (unsigned long)addr & (sizeof(long) - 1);
+ addr = (void *)((unsigned long)addr & ~(sizeof(long) - 1));
+ data = *(unsigned long *)addr;
+ ret = data >> offset * 8;
+ }
+
return ret;
}
+#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */
+
#endif /* _ASM_WORD_AT_A_TIME_H */
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 5c69f7eb5d47..22b7412c08f6 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -289,12 +289,6 @@ struct x86_platform_ops {
struct x86_hyper_runtime hyper;
};
-struct pci_dev;
-
-struct x86_msi_ops {
- void (*restore_msi_irqs)(struct pci_dev *dev);
-};
-
struct x86_apic_ops {
unsigned int (*io_apic_read) (unsigned int apic, unsigned int reg);
void (*restore)(void);
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 454b20815f35..e5e0fe10c692 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -248,6 +248,7 @@ privcmd_call(unsigned int call,
return res;
}
+#ifdef CONFIG_XEN_PV
static inline int
HYPERVISOR_set_trap_table(struct trap_info *table)
{
@@ -280,6 +281,107 @@ HYPERVISOR_callback_op(int cmd, void *arg)
return _hypercall2(int, callback_op, cmd, arg);
}
+static __always_inline int
+HYPERVISOR_set_debugreg(int reg, unsigned long value)
+{
+ return _hypercall2(int, set_debugreg, reg, value);
+}
+
+static __always_inline unsigned long
+HYPERVISOR_get_debugreg(int reg)
+{
+ return _hypercall1(unsigned long, get_debugreg, reg);
+}
+
+static inline int
+HYPERVISOR_update_descriptor(u64 ma, u64 desc)
+{
+ return _hypercall2(int, update_descriptor, ma, desc);
+}
+
+static inline int
+HYPERVISOR_update_va_mapping(unsigned long va, pte_t new_val,
+ unsigned long flags)
+{
+ return _hypercall3(int, update_va_mapping, va, new_val.pte, flags);
+}
+
+static inline int
+HYPERVISOR_set_segment_base(int reg, unsigned long value)
+{
+ return _hypercall2(int, set_segment_base, reg, value);
+}
+
+static inline void
+MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
+{
+ mcl->op = __HYPERVISOR_fpu_taskswitch;
+ mcl->args[0] = set;
+
+ trace_xen_mc_entry(mcl, 1);
+}
+
+static inline void
+MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va,
+ pte_t new_val, unsigned long flags)
+{
+ mcl->op = __HYPERVISOR_update_va_mapping;
+ mcl->args[0] = va;
+ mcl->args[1] = new_val.pte;
+ mcl->args[2] = flags;
+
+ trace_xen_mc_entry(mcl, 3);
+}
+
+static inline void
+MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
+ struct desc_struct desc)
+{
+ mcl->op = __HYPERVISOR_update_descriptor;
+ mcl->args[0] = maddr;
+ mcl->args[1] = *(unsigned long *)&desc;
+
+ trace_xen_mc_entry(mcl, 2);
+}
+
+static inline void
+MULTI_mmu_update(struct multicall_entry *mcl, struct mmu_update *req,
+ int count, int *success_count, domid_t domid)
+{
+ mcl->op = __HYPERVISOR_mmu_update;
+ mcl->args[0] = (unsigned long)req;
+ mcl->args[1] = count;
+ mcl->args[2] = (unsigned long)success_count;
+ mcl->args[3] = domid;
+
+ trace_xen_mc_entry(mcl, 4);
+}
+
+static inline void
+MULTI_mmuext_op(struct multicall_entry *mcl, struct mmuext_op *op, int count,
+ int *success_count, domid_t domid)
+{
+ mcl->op = __HYPERVISOR_mmuext_op;
+ mcl->args[0] = (unsigned long)op;
+ mcl->args[1] = count;
+ mcl->args[2] = (unsigned long)success_count;
+ mcl->args[3] = domid;
+
+ trace_xen_mc_entry(mcl, 4);
+}
+
+static inline void
+MULTI_stack_switch(struct multicall_entry *mcl,
+ unsigned long ss, unsigned long esp)
+{
+ mcl->op = __HYPERVISOR_stack_switch;
+ mcl->args[0] = ss;
+ mcl->args[1] = esp;
+
+ trace_xen_mc_entry(mcl, 2);
+}
+#endif
+
static inline int
HYPERVISOR_sched_op(int cmd, void *arg)
{
@@ -308,26 +410,6 @@ HYPERVISOR_platform_op(struct xen_platform_op *op)
return _hypercall1(int, platform_op, op);
}
-static inline int
-HYPERVISOR_set_debugreg(int reg, unsigned long value)
-{
- return _hypercall2(int, set_debugreg, reg, value);
-}
-
-static inline unsigned long
-HYPERVISOR_get_debugreg(int reg)
-{
- return _hypercall1(unsigned long, get_debugreg, reg);
-}
-
-static inline int
-HYPERVISOR_update_descriptor(u64 ma, u64 desc)
-{
- if (sizeof(u64) == sizeof(long))
- return _hypercall2(int, update_descriptor, ma, desc);
- return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32);
-}
-
static inline long
HYPERVISOR_memory_op(unsigned int cmd, void *arg)
{
@@ -341,24 +423,12 @@ HYPERVISOR_multicall(void *call_list, uint32_t nr_calls)
}
static inline int
-HYPERVISOR_update_va_mapping(unsigned long va, pte_t new_val,
- unsigned long flags)
-{
- if (sizeof(new_val) == sizeof(long))
- return _hypercall3(int, update_va_mapping, va,
- new_val.pte, flags);
- else
- return _hypercall4(int, update_va_mapping, va,
- new_val.pte, new_val.pte >> 32, flags);
-}
-
-static inline int
HYPERVISOR_event_channel_op(int cmd, void *arg)
{
return _hypercall2(int, event_channel_op, cmd, arg);
}
-static inline int
+static __always_inline int
HYPERVISOR_xen_version(int cmd, void *arg)
{
return _hypercall2(int, xen_version, cmd, arg);
@@ -394,14 +464,6 @@ HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_args)
return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
}
-#ifdef CONFIG_X86_64
-static inline int
-HYPERVISOR_set_segment_base(int reg, unsigned long value)
-{
- return _hypercall2(int, set_segment_base, reg, value);
-}
-#endif
-
static inline int
HYPERVISOR_suspend(unsigned long start_info_mfn)
{
@@ -423,13 +485,6 @@ HYPERVISOR_hvm_op(int op, void *arg)
}
static inline int
-HYPERVISOR_tmem_op(
- struct tmem_op *op)
-{
- return _hypercall1(int, tmem_op, op);
-}
-
-static inline int
HYPERVISOR_xenpmu_op(unsigned int op, void *arg)
{
return _hypercall2(int, xenpmu_op, op, arg);
@@ -446,88 +501,4 @@ HYPERVISOR_dm_op(
return ret;
}
-static inline void
-MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
-{
- mcl->op = __HYPERVISOR_fpu_taskswitch;
- mcl->args[0] = set;
-
- trace_xen_mc_entry(mcl, 1);
-}
-
-static inline void
-MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va,
- pte_t new_val, unsigned long flags)
-{
- mcl->op = __HYPERVISOR_update_va_mapping;
- mcl->args[0] = va;
- if (sizeof(new_val) == sizeof(long)) {
- mcl->args[1] = new_val.pte;
- mcl->args[2] = flags;
- } else {
- mcl->args[1] = new_val.pte;
- mcl->args[2] = new_val.pte >> 32;
- mcl->args[3] = flags;
- }
-
- trace_xen_mc_entry(mcl, sizeof(new_val) == sizeof(long) ? 3 : 4);
-}
-
-static inline void
-MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
- struct desc_struct desc)
-{
- mcl->op = __HYPERVISOR_update_descriptor;
- if (sizeof(maddr) == sizeof(long)) {
- mcl->args[0] = maddr;
- mcl->args[1] = *(unsigned long *)&desc;
- } else {
- u32 *p = (u32 *)&desc;
-
- mcl->args[0] = maddr;
- mcl->args[1] = maddr >> 32;
- mcl->args[2] = *p++;
- mcl->args[3] = *p;
- }
-
- trace_xen_mc_entry(mcl, sizeof(maddr) == sizeof(long) ? 2 : 4);
-}
-
-static inline void
-MULTI_mmu_update(struct multicall_entry *mcl, struct mmu_update *req,
- int count, int *success_count, domid_t domid)
-{
- mcl->op = __HYPERVISOR_mmu_update;
- mcl->args[0] = (unsigned long)req;
- mcl->args[1] = count;
- mcl->args[2] = (unsigned long)success_count;
- mcl->args[3] = domid;
-
- trace_xen_mc_entry(mcl, 4);
-}
-
-static inline void
-MULTI_mmuext_op(struct multicall_entry *mcl, struct mmuext_op *op, int count,
- int *success_count, domid_t domid)
-{
- mcl->op = __HYPERVISOR_mmuext_op;
- mcl->args[0] = (unsigned long)op;
- mcl->args[1] = count;
- mcl->args[2] = (unsigned long)success_count;
- mcl->args[3] = domid;
-
- trace_xen_mc_entry(mcl, 4);
-}
-
-static inline void
-MULTI_stack_switch(struct multicall_entry *mcl,
- unsigned long ss, unsigned long esp)
-{
- mcl->op = __HYPERVISOR_stack_switch;
- mcl->args[0] = ss;
- mcl->args[1] = esp;
-
- trace_xen_mc_entry(mcl, 2);
-}
-
#endif /* _ASM_X86_XEN_HYPERCALL_H */
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index ff4b52e37e60..1bf2ad34188a 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -57,9 +57,22 @@ static inline bool __init xen_x2apic_para_available(void)
}
#endif
+struct pci_dev;
+
+#ifdef CONFIG_XEN_PV_DOM0
+bool xen_initdom_restore_msi(struct pci_dev *dev);
+#else
+static inline bool xen_initdom_restore_msi(struct pci_dev *dev) { return true; }
+#endif
+
#ifdef CONFIG_HOTPLUG_CPU
void xen_arch_register_cpu(int num);
void xen_arch_unregister_cpu(int num);
#endif
+#ifdef CONFIG_PVH
+void __init xen_pvh_init(struct boot_params *boot_params);
+void __init mem_map_via_hcall(struct boot_params *boot_params_p);
+#endif
+
#endif /* _ASM_X86_XEN_HYPERVISOR_H */
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 1a162e559753..e989bc2269f5 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -96,11 +96,7 @@ static inline int xen_safe_write_ulong(unsigned long *addr, unsigned long val)
asm volatile("1: mov %[val], %[ptr]\n"
"2:\n"
- ".section .fixup, \"ax\"\n"
- "3: sub $1, %[ret]\n"
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b)
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %[ret])
: [ret] "+r" (ret), [ptr] "=m" (*addr)
: [val] "r" (val));
@@ -110,16 +106,12 @@ static inline int xen_safe_write_ulong(unsigned long *addr, unsigned long val)
static inline int xen_safe_read_ulong(const unsigned long *addr,
unsigned long *val)
{
- int ret = 0;
unsigned long rval = ~0ul;
+ int ret = 0;
asm volatile("1: mov %[ptr], %[rval]\n"
"2:\n"
- ".section .fixup, \"ax\"\n"
- "3: sub $1, %[ret]\n"
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b)
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %[ret])
: [ret] "+r" (ret), [rval] "+r" (rval)
: [ptr] "m" (*addr));
*val = rval;
diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h
index 3506d8c598c1..9015b888edd6 100644
--- a/arch/x86/include/asm/xen/pci.h
+++ b/arch/x86/include/asm/xen/pci.h
@@ -14,29 +14,13 @@ static inline int pci_xen_hvm_init(void)
return -1;
}
#endif
-#if defined(CONFIG_XEN_DOM0)
+#ifdef CONFIG_XEN_PV_DOM0
int __init pci_xen_initial_domain(void);
-int xen_find_device_domain_owner(struct pci_dev *dev);
-int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain);
-int xen_unregister_device_domain_owner(struct pci_dev *dev);
#else
static inline int __init pci_xen_initial_domain(void)
{
return -1;
}
-static inline int xen_find_device_domain_owner(struct pci_dev *dev)
-{
- return -1;
-}
-static inline int xen_register_device_domain_owner(struct pci_dev *dev,
- uint16_t domain)
-{
- return -1;
-}
-static inline int xen_unregister_device_domain_owner(struct pci_dev *dev)
-{
- return -1;
-}
#endif
#if defined(CONFIG_PCI_MSI)
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 2ef1f6513c68..2da3316bb559 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -373,9 +373,23 @@ struct kvm_debugregs {
__u64 reserved[9];
};
-/* for KVM_CAP_XSAVE */
+/* for KVM_CAP_XSAVE and KVM_CAP_XSAVE2 */
struct kvm_xsave {
+ /*
+ * KVM_GET_XSAVE2 and KVM_SET_XSAVE write and read as many bytes
+ * as are returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2)
+ * respectively, when invoked on the vm file descriptor.
+ *
+ * The size value returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2)
+ * will always be at least 4096. Currently, it is only greater
+ * than 4096 if a dynamic feature has been enabled with
+ * ``arch_prctl()``, but this may change in the future.
+ *
+ * The offsets of the state save areas in struct kvm_xsave follow
+ * the contents of CPUID leaf 0xD on the host.
+ */
__u32 region[1024];
+ __u32 extra[0];
};
#define KVM_MAX_XCRS 16
@@ -504,4 +518,8 @@ struct kvm_pmu_event_filter {
#define KVM_PMU_EVENT_ALLOW 0
#define KVM_PMU_EVENT_DENY 1
+/* for KVM_{GET,SET,HAS}_DEVICE_ATTR */
+#define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */
+#define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */
+
#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 5146bbab84d4..6e64b27b2c1e 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -8,6 +8,7 @@
* should be used to determine that a VM is running under KVM.
*/
#define KVM_CPUID_SIGNATURE 0x40000000
+#define KVM_SIGNATURE "KVMKVMKVM\0\0\0"
/* This CPUID returns two feature bitmaps in eax, edx. Before enabling
* a particular paravirtualization, the appropriate feature bit should
diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h
index 5a6aac9fa41f..500b96e71f18 100644
--- a/arch/x86/include/uapi/asm/prctl.h
+++ b/arch/x86/include/uapi/asm/prctl.h
@@ -2,16 +2,22 @@
#ifndef _ASM_X86_PRCTL_H
#define _ASM_X86_PRCTL_H
-#define ARCH_SET_GS 0x1001
-#define ARCH_SET_FS 0x1002
-#define ARCH_GET_FS 0x1003
-#define ARCH_GET_GS 0x1004
+#define ARCH_SET_GS 0x1001
+#define ARCH_SET_FS 0x1002
+#define ARCH_GET_FS 0x1003
+#define ARCH_GET_GS 0x1004
-#define ARCH_GET_CPUID 0x1011
-#define ARCH_SET_CPUID 0x1012
+#define ARCH_GET_CPUID 0x1011
+#define ARCH_SET_CPUID 0x1012
-#define ARCH_MAP_VDSO_X32 0x2001
-#define ARCH_MAP_VDSO_32 0x2002
-#define ARCH_MAP_VDSO_64 0x2003
+#define ARCH_GET_XCOMP_SUPP 0x1021
+#define ARCH_GET_XCOMP_PERM 0x1022
+#define ARCH_REQ_XCOMP_PERM 0x1023
+#define ARCH_GET_XCOMP_GUEST_PERM 0x1024
+#define ARCH_REQ_XCOMP_GUEST_PERM 0x1025
+
+#define ARCH_MAP_VDSO_X32 0x2001
+#define ARCH_MAP_VDSO_32 0x2002
+#define ARCH_MAP_VDSO_64 0x2003
#endif /* _ASM_X86_PRCTL_H */
diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h
index 9690d6899ad9..f4b81587e90b 100644
--- a/arch/x86/include/uapi/asm/sgx.h
+++ b/arch/x86/include/uapi/asm/sgx.h
@@ -27,6 +27,8 @@ enum sgx_page_flags {
_IOW(SGX_MAGIC, 0x02, struct sgx_enclave_init)
#define SGX_IOC_ENCLAVE_PROVISION \
_IOW(SGX_MAGIC, 0x03, struct sgx_enclave_provision)
+#define SGX_IOC_VEPC_REMOVE_ALL \
+ _IO(SGX_MAGIC, 0x04)
/**
* struct sgx_enclave_create - parameter structure for the
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8f4e8fa6ed75..6aef9ee28a39 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -21,6 +21,7 @@ CFLAGS_REMOVE_ftrace.o = -pg
CFLAGS_REMOVE_early_printk.o = -pg
CFLAGS_REMOVE_head64.o = -pg
CFLAGS_REMOVE_sev.o = -pg
+CFLAGS_REMOVE_cc_platform.o = -pg
endif
KASAN_SANITIZE_head$(BITS).o := n
@@ -29,6 +30,7 @@ KASAN_SANITIZE_dumpstack_$(BITS).o := n
KASAN_SANITIZE_stacktrace.o := n
KASAN_SANITIZE_paravirt.o := n
KASAN_SANITIZE_sev.o := n
+KASAN_SANITIZE_cc_platform.o := n
# With some compiler versions the generated code results in boot hangs, caused
# by several compilation units. To be safe, disable all instrumentation.
@@ -47,6 +49,7 @@ endif
KCOV_INSTRUMENT := n
CFLAGS_head$(BITS).o += -fno-stack-protector
+CFLAGS_cc_platform.o += -fno-stack-protector
CFLAGS_irq.o := -I $(srctree)/$(src)/../include/asm/trace
@@ -81,7 +84,7 @@ obj-$(CONFIG_IA32_EMULATION) += tls.o
obj-y += step.o
obj-$(CONFIG_INTEL_TXT) += tboot.o
obj-$(CONFIG_ISA_DMA_API) += i8237.o
-obj-$(CONFIG_STACKTRACE) += stacktrace.o
+obj-y += stacktrace.o
obj-y += cpu/
obj-y += acpi/
obj-y += reboot.o
@@ -147,6 +150,9 @@ obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o
obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev.o
+
+obj-$(CONFIG_ARCH_HAS_CC_PLATFORM) += cc_platform.o
+
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 14bcd59bcdee..5b6d1a95776f 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -62,6 +62,7 @@ int acpi_fix_pin2_polarity __initdata;
#ifdef CONFIG_X86_LOCAL_APIC
static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
+static bool acpi_support_online_capable;
#endif
#ifdef CONFIG_X86_IO_APIC
@@ -138,6 +139,8 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)
pr_debug("Local APIC address 0x%08x\n", madt->address);
}
+ if (madt->header.revision >= 5)
+ acpi_support_online_capable = true;
default_acpi_madt_oem_check(madt->header.oem_id,
madt->header.oem_table_id);
@@ -239,6 +242,12 @@ acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end)
if (processor->id == 0xff)
return 0;
+ /* don't register processors that can not be onlined */
+ if (acpi_support_online_capable &&
+ !(processor->lapic_flags & ACPI_MADT_ENABLED) &&
+ !(processor->lapic_flags & ACPI_MADT_ONLINE_CAPABLE))
+ return 0;
+
/*
* We need to register disabled CPU as well to permit
* counting disabled CPUs. This allows us to size
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 7de599eba7f0..7945eae5b315 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -79,6 +79,21 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
*/
flags->bm_control = 0;
}
+ if (c->x86_vendor == X86_VENDOR_AMD && c->x86 >= 0x17) {
+ /*
+ * For all AMD Zen or newer CPUs that support C3, caches
+ * should not be flushed by software while entering C3
+ * type state. Set bm->check to 1 so that kernel doesn't
+ * need to execute cache flush operation.
+ */
+ flags->bm_check = 1;
+ /*
+ * In current AMD C state implementation ARB_DIS is no longer
+ * used. So set bm_control to zero to indicate ARB_DIS is not
+ * required while entering C3 type state.
+ */
+ flags->bm_control = 0;
+ }
}
EXPORT_SYMBOL(acpi_processor_power_init_bm_check);
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 3f85fcae450c..1e97f944b47d 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -139,8 +139,10 @@ static int __init acpi_sleep_setup(char *str)
if (strncmp(str, "s3_beep", 7) == 0)
acpi_realmode_flags |= 4;
#ifdef CONFIG_HIBERNATION
+ if (strncmp(str, "s4_hwsig", 8) == 0)
+ acpi_check_s4_hw_signature(1);
if (strncmp(str, "s4_nohwsig", 10) == 0)
- acpi_no_s4_hw_signature();
+ acpi_check_s4_hw_signature(0);
#endif
if (strncmp(str, "nonvs", 5) == 0)
acpi_nvs_nosave();
diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
index daf88f8143c5..cf69081073b5 100644
--- a/arch/x86/kernel/acpi/wakeup_32.S
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -60,7 +60,7 @@ save_registers:
popl saved_context_eflags
movl $ret_point, saved_eip
- ret
+ RET
restore_registers:
@@ -70,7 +70,7 @@ restore_registers:
movl saved_context_edi, %edi
pushl saved_context_eflags
popfl
- ret
+ RET
SYM_CODE_START(do_suspend_lowlevel)
call save_processor_state
@@ -86,7 +86,7 @@ SYM_CODE_START(do_suspend_lowlevel)
ret_point:
call restore_registers
call restore_processor_state
- ret
+ RET
SYM_CODE_END(do_suspend_lowlevel)
.data
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index e9da3dc71254..5007c3ffe96f 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -29,6 +29,7 @@
#include <asm/io.h>
#include <asm/fixmap.h>
#include <asm/paravirt.h>
+#include <asm/asm-prototypes.h>
int __read_mostly alternatives_patched;
@@ -113,6 +114,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
}
}
+extern s32 __retpoline_sites[], __retpoline_sites_end[];
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
void text_poke_early(void *addr, const void *opcode, size_t len);
@@ -221,7 +223,7 @@ static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off)
* "noinline" to cause control flow change and thus invalidate I$ and
* cause refetch after modification.
*/
-static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr)
+static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
{
struct insn insn;
int i = 0;
@@ -239,11 +241,11 @@ static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *ins
* optimized.
*/
if (insn.length == 1 && insn.opcode.bytes[0] == 0x90)
- i += optimize_nops_range(instr, a->instrlen, i);
+ i += optimize_nops_range(instr, len, i);
else
i += insn.length;
- if (i >= a->instrlen)
+ if (i >= len)
return;
}
}
@@ -331,10 +333,185 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
text_poke_early(instr, insn_buff, insn_buff_sz);
next:
- optimize_nops(a, instr);
+ optimize_nops(instr, a->instrlen);
}
}
+#if defined(CONFIG_RETPOLINE) && defined(CONFIG_STACK_VALIDATION)
+
+/*
+ * CALL/JMP *%\reg
+ */
+static int emit_indirect(int op, int reg, u8 *bytes)
+{
+ int i = 0;
+ u8 modrm;
+
+ switch (op) {
+ case CALL_INSN_OPCODE:
+ modrm = 0x10; /* Reg = 2; CALL r/m */
+ break;
+
+ case JMP32_INSN_OPCODE:
+ modrm = 0x20; /* Reg = 4; JMP r/m */
+ break;
+
+ default:
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+
+ if (reg >= 8) {
+ bytes[i++] = 0x41; /* REX.B prefix */
+ reg -= 8;
+ }
+
+ modrm |= 0xc0; /* Mod = 3 */
+ modrm += reg;
+
+ bytes[i++] = 0xff; /* opcode */
+ bytes[i++] = modrm;
+
+ return i;
+}
+
+/*
+ * Rewrite the compiler generated retpoline thunk calls.
+ *
+ * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate
+ * indirect instructions, avoiding the extra indirection.
+ *
+ * For example, convert:
+ *
+ * CALL __x86_indirect_thunk_\reg
+ *
+ * into:
+ *
+ * CALL *%\reg
+ *
+ * It also tries to inline spectre_v2=retpoline,amd when size permits.
+ */
+static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
+{
+ retpoline_thunk_t *target;
+ int reg, ret, i = 0;
+ u8 op, cc;
+
+ target = addr + insn->length + insn->immediate.value;
+ reg = target - __x86_indirect_thunk_array;
+
+ if (WARN_ON_ONCE(reg & ~0xf))
+ return -1;
+
+ /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */
+ BUG_ON(reg == 4);
+
+ if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) &&
+ !cpu_feature_enabled(X86_FEATURE_RETPOLINE_AMD))
+ return -1;
+
+ op = insn->opcode.bytes[0];
+
+ /*
+ * Convert:
+ *
+ * Jcc.d32 __x86_indirect_thunk_\reg
+ *
+ * into:
+ *
+ * Jncc.d8 1f
+ * [ LFENCE ]
+ * JMP *%\reg
+ * [ NOP ]
+ * 1:
+ */
+ /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
+ if (op == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80) {
+ cc = insn->opcode.bytes[1] & 0xf;
+ cc ^= 1; /* invert condition */
+
+ bytes[i++] = 0x70 + cc; /* Jcc.d8 */
+ bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */
+
+ /* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */
+ op = JMP32_INSN_OPCODE;
+ }
+
+ /*
+ * For RETPOLINE_AMD: prepend the indirect CALL/JMP with an LFENCE.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_AMD)) {
+ bytes[i++] = 0x0f;
+ bytes[i++] = 0xae;
+ bytes[i++] = 0xe8; /* LFENCE */
+ }
+
+ ret = emit_indirect(op, reg, bytes + i);
+ if (ret < 0)
+ return ret;
+ i += ret;
+
+ for (; i < insn->length;)
+ bytes[i++] = BYTES_NOP1;
+
+ return i;
+}
+
+/*
+ * Generated by 'objtool --retpoline'.
+ */
+void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ struct insn insn;
+ int len, ret;
+ u8 bytes[16];
+ u8 op1, op2;
+
+ ret = insn_decode_kernel(&insn, addr);
+ if (WARN_ON_ONCE(ret < 0))
+ continue;
+
+ op1 = insn.opcode.bytes[0];
+ op2 = insn.opcode.bytes[1];
+
+ switch (op1) {
+ case CALL_INSN_OPCODE:
+ case JMP32_INSN_OPCODE:
+ break;
+
+ case 0x0f: /* escape */
+ if (op2 >= 0x80 && op2 <= 0x8f)
+ break;
+ fallthrough;
+ default:
+ WARN_ON_ONCE(1);
+ continue;
+ }
+
+ DPRINTK("retpoline at: %pS (%px) len: %d to: %pS",
+ addr, addr, insn.length,
+ addr + insn.length + insn.immediate.value);
+
+ len = patch_retpoline(addr, &insn, bytes);
+ if (len == insn.length) {
+ optimize_nops(bytes, len);
+ DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr);
+ DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
+ text_poke_early(addr, bytes, len);
+ }
+ }
+}
+
+#else /* !RETPOLINES || !CONFIG_STACK_VALIDATION */
+
+void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { }
+
+#endif /* CONFIG_RETPOLINE && CONFIG_STACK_VALIDATION */
+
#ifdef CONFIG_SMP
static void alternatives_smp_lock(const s32 *start, const s32 *end,
u8 *text, u8 *text_end)
@@ -537,7 +714,7 @@ asm (
" .type int3_magic, @function\n"
"int3_magic:\n"
" movl $1, (%" _ASM_ARG1 ")\n"
-" ret\n"
+ ASM_RET
" .size int3_magic, .-int3_magic\n"
" .popsection\n"
);
@@ -643,6 +820,12 @@ void __init alternative_instructions(void)
apply_paravirt(__parainstructions, __parainstructions_end);
/*
+ * Rewrite the retpolines, must be done before alternatives since
+ * those can rewrite the retpoline thunks.
+ */
+ apply_retpolines(__retpoline_sites, __retpoline_sites_end);
+
+ /*
* Then patch alternatives, such that those paravirt calls that are in
* alternatives can be overwritten by their immediate fragments.
*/
@@ -930,10 +1113,13 @@ void text_poke_sync(void)
}
struct text_poke_loc {
- s32 rel_addr; /* addr := _stext + rel_addr */
- s32 rel32;
+ /* addr := _stext + rel_addr */
+ s32 rel_addr;
+ s32 disp;
+ u8 len;
u8 opcode;
const u8 text[POKE_MAX_OPCODE_SIZE];
+ /* see text_poke_bp_batch() */
u8 old;
};
@@ -948,7 +1134,8 @@ static struct bp_patching_desc *bp_desc;
static __always_inline
struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp)
{
- struct bp_patching_desc *desc = __READ_ONCE(*descp); /* rcu_dereference */
+ /* rcu_dereference */
+ struct bp_patching_desc *desc = __READ_ONCE(*descp);
if (!desc || !arch_atomic_inc_not_zero(&desc->refs))
return NULL;
@@ -982,7 +1169,7 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
{
struct bp_patching_desc *desc;
struct text_poke_loc *tp;
- int len, ret = 0;
+ int ret = 0;
void *ip;
if (user_mode(regs))
@@ -1022,8 +1209,7 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
goto out_put;
}
- len = text_opcode_size(tp->opcode);
- ip += len;
+ ip += tp->len;
switch (tp->opcode) {
case INT3_INSN_OPCODE:
@@ -1038,12 +1224,12 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
break;
case CALL_INSN_OPCODE:
- int3_emulate_call(regs, (long)ip + tp->rel32);
+ int3_emulate_call(regs, (long)ip + tp->disp);
break;
case JMP32_INSN_OPCODE:
case JMP8_INSN_OPCODE:
- int3_emulate_jmp(regs, (long)ip + tp->rel32);
+ int3_emulate_jmp(regs, (long)ip + tp->disp);
break;
default:
@@ -1118,7 +1304,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
*/
for (do_sync = 0, i = 0; i < nr_entries; i++) {
u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, };
- int len = text_opcode_size(tp[i].opcode);
+ int len = tp[i].len;
if (len - INT3_INSN_SIZE > 0) {
memcpy(old + INT3_INSN_SIZE,
@@ -1195,21 +1381,37 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
const void *opcode, size_t len, const void *emulate)
{
struct insn insn;
- int ret;
+ int ret, i;
memcpy((void *)tp->text, opcode, len);
if (!emulate)
emulate = opcode;
ret = insn_decode_kernel(&insn, emulate);
-
BUG_ON(ret < 0);
- BUG_ON(len != insn.length);
tp->rel_addr = addr - (void *)_stext;
+ tp->len = len;
tp->opcode = insn.opcode.bytes[0];
switch (tp->opcode) {
+ case RET_INSN_OPCODE:
+ case JMP32_INSN_OPCODE:
+ case JMP8_INSN_OPCODE:
+ /*
+ * Control flow instructions without implied execution of the
+ * next instruction can be padded with INT3.
+ */
+ for (i = insn.length; i < len; i++)
+ BUG_ON(tp->text[i] != INT3_INSN_OPCODE);
+ break;
+
+ default:
+ BUG_ON(len != insn.length);
+ };
+
+
+ switch (tp->opcode) {
case INT3_INSN_OPCODE:
case RET_INSN_OPCODE:
break;
@@ -1217,7 +1419,7 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
case CALL_INSN_OPCODE:
case JMP32_INSN_OPCODE:
case JMP8_INSN_OPCODE:
- tp->rel32 = insn.immediate.value;
+ tp->disp = insn.immediate.value;
break;
default: /* assume NOP */
@@ -1225,13 +1427,13 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
case 2: /* NOP2 -- emulate as JMP8+0 */
BUG_ON(memcmp(emulate, x86_nops[len], len));
tp->opcode = JMP8_INSN_OPCODE;
- tp->rel32 = 0;
+ tp->disp = 0;
break;
case 5: /* NOP5 -- emulate as JMP32+0 */
BUG_ON(memcmp(emulate, x86_nops[len], len));
tp->opcode = JMP32_INSN_OPCODE;
- tp->rel32 = 0;
+ tp->disp = 0;
break;
default: /* unknown instruction */
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index c92c9c774c0e..020c906f7934 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -19,17 +19,19 @@
#define PCI_DEVICE_ID_AMD_17H_M10H_ROOT 0x15d0
#define PCI_DEVICE_ID_AMD_17H_M30H_ROOT 0x1480
#define PCI_DEVICE_ID_AMD_17H_M60H_ROOT 0x1630
+#define PCI_DEVICE_ID_AMD_19H_M10H_ROOT 0x14a4
#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464
#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec
#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494
#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c
#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444
#define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654
+#define PCI_DEVICE_ID_AMD_19H_M10H_DF_F4 0x14b1
#define PCI_DEVICE_ID_AMD_19H_M40H_ROOT 0x14b5
#define PCI_DEVICE_ID_AMD_19H_M40H_DF_F4 0x167d
#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e
-/* Protect the PCI config register pairs used for SMN and DF indirect access. */
+/* Protect the PCI config register pairs used for SMN. */
static DEFINE_MUTEX(smn_mutex);
static u32 *flush_words;
@@ -39,6 +41,7 @@ static const struct pci_device_id amd_root_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_ROOT) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_ROOT) },
{}
};
@@ -61,6 +64,7 @@ static const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F3) },
{}
@@ -78,6 +82,7 @@ static const struct pci_device_id amd_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },
@@ -182,53 +187,6 @@ int amd_smn_write(u16 node, u32 address, u32 value)
}
EXPORT_SYMBOL_GPL(amd_smn_write);
-/*
- * Data Fabric Indirect Access uses FICAA/FICAD.
- *
- * Fabric Indirect Configuration Access Address (FICAA): Constructed based
- * on the device's Instance Id and the PCI function and register offset of
- * the desired register.
- *
- * Fabric Indirect Configuration Access Data (FICAD): There are FICAD LO
- * and FICAD HI registers but so far we only need the LO register.
- */
-int amd_df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo)
-{
- struct pci_dev *F4;
- u32 ficaa;
- int err = -ENODEV;
-
- if (node >= amd_northbridges.num)
- goto out;
-
- F4 = node_to_amd_nb(node)->link;
- if (!F4)
- goto out;
-
- ficaa = 1;
- ficaa |= reg & 0x3FC;
- ficaa |= (func & 0x7) << 11;
- ficaa |= instance_id << 16;
-
- mutex_lock(&smn_mutex);
-
- err = pci_write_config_dword(F4, 0x5C, ficaa);
- if (err) {
- pr_warn("Error writing DF Indirect FICAA, FICAA=0x%x\n", ficaa);
- goto out_unlock;
- }
-
- err = pci_read_config_dword(F4, 0x98, lo);
- if (err)
- pr_warn("Error reading DF Indirect FICAD LO, FICAA=0x%x.\n", ficaa);
-
-out_unlock:
- mutex_unlock(&smn_mutex);
-
-out:
- return err;
-}
-EXPORT_SYMBOL_GPL(amd_df_indirect_read);
int amd_cache_northbridges(void)
{
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 10562885f5fc..af3ba08b684b 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -73,12 +73,23 @@ static int gart_mem_pfn_is_ram(unsigned long pfn)
(pfn >= aperture_pfn_start + aperture_page_count));
}
+#ifdef CONFIG_PROC_VMCORE
+static bool gart_oldmem_pfn_is_ram(struct vmcore_cb *cb, unsigned long pfn)
+{
+ return !!gart_mem_pfn_is_ram(pfn);
+}
+
+static struct vmcore_cb gart_vmcore_cb = {
+ .pfn_is_ram = gart_oldmem_pfn_is_ram,
+};
+#endif
+
static void __init exclude_from_core(u64 aper_base, u32 aper_order)
{
aperture_pfn_start = aper_base >> PAGE_SHIFT;
aperture_page_count = (32 * 1024 * 1024) << aper_order >> PAGE_SHIFT;
#ifdef CONFIG_PROC_VMCORE
- WARN_ON(register_oldmem_pfn_is_ram(&gart_mem_pfn_is_ram));
+ register_vmcore_cb(&gart_vmcore_cb);
#endif
#ifdef CONFIG_PROC_KCORE
WARN_ON(register_mem_pfn_is_ram(&gart_mem_pfn_is_ram));
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index dbacb9ec8843..7517eb05bdc1 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -19,6 +19,7 @@
#include <asm/hw_irq.h>
#include <asm/apic.h>
#include <asm/irq_remapping.h>
+#include <asm/xen/hypervisor.h>
struct irq_domain *x86_pci_msi_default_domain __ro_after_init;
@@ -159,11 +160,8 @@ static struct irq_chip pci_msi_controller = {
int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec,
msi_alloc_info_t *arg)
{
- struct pci_dev *pdev = to_pci_dev(dev);
- struct msi_desc *desc = first_pci_msi_entry(pdev);
-
init_irq_alloc_info(arg, NULL);
- if (desc->msi_attrib.is_msix) {
+ if (to_pci_dev(dev)->msix_enabled) {
arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX;
} else {
arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSI;
@@ -345,3 +343,8 @@ void dmar_free_hwirq(int irq)
irq_domain_free_irqs(irq, 1);
}
#endif
+
+bool arch_restore_msi_irqs(struct pci_dev *dev)
+{
+ return xen_initdom_restore_msi(dev);
+}
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index f4da9bb69a88..e696e22d0531 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -15,9 +15,15 @@ struct cluster_mask {
struct cpumask mask;
};
-static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
+/*
+ * __x2apic_send_IPI_mask() possibly needs to read
+ * x86_cpu_to_logical_apicid for all online cpus in a sequential way.
+ * Using per cpu variable would cost one cache line per cpu.
+ */
+static u32 *x86_cpu_to_logical_apicid __read_mostly;
+
static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
-static DEFINE_PER_CPU(struct cluster_mask *, cluster_masks);
+static DEFINE_PER_CPU_READ_MOSTLY(struct cluster_mask *, cluster_masks);
static struct cluster_mask *cluster_hotplug_mask;
static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
@@ -27,7 +33,7 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
static void x2apic_send_IPI(int cpu, int vector)
{
- u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu);
+ u32 dest = x86_cpu_to_logical_apicid[cpu];
/* x2apic MSRs are special and need a special fence: */
weak_wrmsr_fence();
@@ -58,7 +64,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
dest = 0;
for_each_cpu_and(clustercpu, tmpmsk, &cmsk->mask)
- dest |= per_cpu(x86_cpu_to_logical_apicid, clustercpu);
+ dest |= x86_cpu_to_logical_apicid[clustercpu];
if (!dest)
continue;
@@ -94,7 +100,7 @@ static void x2apic_send_IPI_all(int vector)
static u32 x2apic_calc_apicid(unsigned int cpu)
{
- return per_cpu(x86_cpu_to_logical_apicid, cpu);
+ return x86_cpu_to_logical_apicid[cpu];
}
static void init_x2apic_ldr(void)
@@ -103,7 +109,7 @@ static void init_x2apic_ldr(void)
u32 cluster, apicid = apic_read(APIC_LDR);
unsigned int cpu;
- this_cpu_write(x86_cpu_to_logical_apicid, apicid);
+ x86_cpu_to_logical_apicid[smp_processor_id()] = apicid;
if (cmsk)
goto update;
@@ -166,12 +172,21 @@ static int x2apic_dead_cpu(unsigned int dead_cpu)
static int x2apic_cluster_probe(void)
{
+ u32 slots;
+
if (!x2apic_mode)
return 0;
+ slots = max_t(u32, L1_CACHE_BYTES/sizeof(u32), nr_cpu_ids);
+ x86_cpu_to_logical_apicid = kcalloc(slots, sizeof(u32), GFP_KERNEL);
+ if (!x86_cpu_to_logical_apicid)
+ return 0;
+
if (cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "x86/x2apic:prepare",
x2apic_prepare_cpu, x2apic_dead_cpu) < 0) {
pr_err("Failed to register X2APIC_PREPARE\n");
+ kfree(x86_cpu_to_logical_apicid);
+ x86_cpu_to_logical_apicid = NULL;
return 0;
}
init_x2apic_ldr();
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index ecd3fd6993d1..9fb0a2f8b62a 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -38,9 +38,6 @@ static void __used common(void)
#endif
BLANK();
- OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
-
- BLANK();
OFFSET(pbe_address, pbe, address);
OFFSET(pbe_orig_address, pbe, orig_address);
OFFSET(pbe_next, pbe, next);
diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c
index 83d9cad4e68b..44c3601cfdc4 100644
--- a/arch/x86/kernel/audit_64.c
+++ b/arch/x86/kernel/audit_64.c
@@ -47,14 +47,16 @@ int audit_classify_syscall(int abi, unsigned syscall)
#endif
switch(syscall) {
case __NR_open:
- return 2;
+ return AUDITSC_OPEN;
case __NR_openat:
- return 3;
+ return AUDITSC_OPENAT;
case __NR_execve:
case __NR_execveat:
- return 5;
+ return AUDITSC_EXECVE;
+ case __NR_openat2:
+ return AUDITSC_OPENAT2;
default:
- return 0;
+ return AUDITSC_NATIVE;
}
}
diff --git a/arch/x86/kernel/cc_platform.c b/arch/x86/kernel/cc_platform.c
new file mode 100644
index 000000000000..6a6ffcd978f6
--- /dev/null
+++ b/arch/x86/kernel/cc_platform.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Confidential Computing Platform Capability checks
+ *
+ * Copyright (C) 2021 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ */
+
+#include <linux/export.h>
+#include <linux/cc_platform.h>
+#include <linux/mem_encrypt.h>
+
+#include <asm/mshyperv.h>
+#include <asm/processor.h>
+
+static bool __maybe_unused intel_cc_platform_has(enum cc_attr attr)
+{
+#ifdef CONFIG_INTEL_TDX_GUEST
+ return false;
+#else
+ return false;
+#endif
+}
+
+/*
+ * SME and SEV are very similar but they are not the same, so there are
+ * times that the kernel will need to distinguish between SME and SEV. The
+ * cc_platform_has() function is used for this. When a distinction isn't
+ * needed, the CC_ATTR_MEM_ENCRYPT attribute can be used.
+ *
+ * The trampoline code is a good example for this requirement. Before
+ * paging is activated, SME will access all memory as decrypted, but SEV
+ * will access all memory as encrypted. So, when APs are being brought
+ * up under SME the trampoline area cannot be encrypted, whereas under SEV
+ * the trampoline area must be encrypted.
+ */
+static bool amd_cc_platform_has(enum cc_attr attr)
+{
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ switch (attr) {
+ case CC_ATTR_MEM_ENCRYPT:
+ return sme_me_mask;
+
+ case CC_ATTR_HOST_MEM_ENCRYPT:
+ return sme_me_mask && !(sev_status & MSR_AMD64_SEV_ENABLED);
+
+ case CC_ATTR_GUEST_MEM_ENCRYPT:
+ return sev_status & MSR_AMD64_SEV_ENABLED;
+
+ case CC_ATTR_GUEST_STATE_ENCRYPT:
+ return sev_status & MSR_AMD64_SEV_ES_ENABLED;
+
+ /*
+ * With SEV, the rep string I/O instructions need to be unrolled
+ * but SEV-ES supports them through the #VC handler.
+ */
+ case CC_ATTR_GUEST_UNROLL_STRING_IO:
+ return (sev_status & MSR_AMD64_SEV_ENABLED) &&
+ !(sev_status & MSR_AMD64_SEV_ES_ENABLED);
+
+ default:
+ return false;
+ }
+#else
+ return false;
+#endif
+}
+
+static bool hyperv_cc_platform_has(enum cc_attr attr)
+{
+ return attr == CC_ATTR_GUEST_MEM_ENCRYPT;
+}
+
+bool cc_platform_has(enum cc_attr attr)
+{
+ if (sme_me_mask)
+ return amd_cc_platform_has(attr);
+
+ if (hv_is_isolation_supported())
+ return hyperv_cc_platform_has(attr);
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(cc_platform_has);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 637b499450d1..9661e3e802be 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -43,6 +43,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
obj-$(CONFIG_CPU_SUP_ZHAOXIN) += zhaoxin.o
+obj-$(CONFIG_CPU_SUP_VORTEX_32) += vortex.o
obj-$(CONFIG_X86_MCE) += mce/
obj-$(CONFIG_MTRR) += mtrr/
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 2131af9f2fa2..4edb6f0f628c 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -989,6 +989,8 @@ static void init_amd(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_IRPERF) &&
!cpu_has_amd_erratum(c, amd_erratum_1054))
msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT);
+
+ check_null_seg_clears_base(c);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index ecfca3bbcd96..1c1f218a701d 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -22,7 +22,7 @@
#include <asm/bugs.h>
#include <asm/processor.h>
#include <asm/processor-flags.h>
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
#include <asm/msr.h>
#include <asm/vmx.h>
#include <asm/paravirt.h>
@@ -758,11 +758,11 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
case SPECTRE_V2_USER_CMD_FORCE:
mode = SPECTRE_V2_USER_STRICT;
break;
+ case SPECTRE_V2_USER_CMD_AUTO:
case SPECTRE_V2_USER_CMD_PRCTL:
case SPECTRE_V2_USER_CMD_PRCTL_IBPB:
mode = SPECTRE_V2_USER_PRCTL;
break;
- case SPECTRE_V2_USER_CMD_AUTO:
case SPECTRE_V2_USER_CMD_SECCOMP:
case SPECTRE_V2_USER_CMD_SECCOMP_IBPB:
if (IS_ENABLED(CONFIG_SECCOMP))
@@ -882,13 +882,6 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
return SPECTRE_V2_CMD_AUTO;
}
- if (cmd == SPECTRE_V2_CMD_RETPOLINE_AMD &&
- boot_cpu_data.x86_vendor != X86_VENDOR_HYGON &&
- boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
- pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n");
- return SPECTRE_V2_CMD_AUTO;
- }
-
spec_v2_print_cond(mitigation_options[i].option,
mitigation_options[i].secure);
return cmd;
@@ -1169,7 +1162,6 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
return mode;
switch (cmd) {
- case SPEC_STORE_BYPASS_CMD_AUTO:
case SPEC_STORE_BYPASS_CMD_SECCOMP:
/*
* Choose prctl+seccomp as the default mode if seccomp is
@@ -1183,6 +1175,7 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
case SPEC_STORE_BYPASS_CMD_ON:
mode = SPEC_STORE_BYPASS_DISABLE;
break;
+ case SPEC_STORE_BYPASS_CMD_AUTO:
case SPEC_STORE_BYPASS_CMD_PRCTL:
mode = SPEC_STORE_BYPASS_PRCTL;
break;
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index b5e36bd0425b..fe98a1465be6 100644
--- a/arch/x86/kernel/cpu/cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -846,6 +846,7 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c)
l2 = new_l2;
#ifdef CONFIG_SMP
per_cpu(cpu_llc_id, cpu) = l2_id;
+ per_cpu(cpu_l2c_id, cpu) = l2_id;
#endif
}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 0f8885949e8c..7b8382c11788 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -42,7 +42,7 @@
#include <asm/setup.h>
#include <asm/apic.h>
#include <asm/desc.h>
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
#include <asm/mtrr.h>
#include <asm/hwcap2.h>
#include <linux/numa.h>
@@ -85,6 +85,9 @@ u16 get_llc_id(unsigned int cpu)
}
EXPORT_SYMBOL_GPL(get_llc_id);
+/* L2 cache ID of each logical CPU */
+DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id) = BAD_APICID;
+
/* correctly size the local cpu masks */
void __init setup_cpu_local_masks(void)
{
@@ -326,6 +329,7 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
#ifdef CONFIG_X86_SMAP
cr4_set_bits(X86_CR4_SMAP);
#else
+ clear_cpu_cap(c, X86_FEATURE_SMAP);
cr4_clear_bits(X86_CR4_SMAP);
#endif
}
@@ -380,7 +384,7 @@ set_register:
}
EXPORT_SYMBOL(native_write_cr0);
-void native_write_cr4(unsigned long val)
+void __no_profile native_write_cr4(unsigned long val)
{
unsigned long bits_changed = 0;
@@ -1044,6 +1048,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
VULNWL(CENTAUR, 5, X86_MODEL_ANY, NO_SPECULATION),
VULNWL(INTEL, 5, X86_MODEL_ANY, NO_SPECULATION),
VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION),
+ VULNWL(VORTEX, 5, X86_MODEL_ANY, NO_SPECULATION),
+ VULNWL(VORTEX, 6, X86_MODEL_ANY, NO_SPECULATION),
/* Intel Family 6 */
VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
@@ -1395,9 +1401,8 @@ void __init early_cpu_init(void)
early_identify_cpu(&boot_cpu_data);
}
-static void detect_null_seg_behavior(struct cpuinfo_x86 *c)
+static bool detect_null_seg_behavior(void)
{
-#ifdef CONFIG_X86_64
/*
* Empirically, writing zero to a segment selector on AMD does
* not clear the base, whereas writing zero to a segment
@@ -1418,10 +1423,43 @@ static void detect_null_seg_behavior(struct cpuinfo_x86 *c)
wrmsrl(MSR_FS_BASE, 1);
loadsegment(fs, 0);
rdmsrl(MSR_FS_BASE, tmp);
- if (tmp != 0)
- set_cpu_bug(c, X86_BUG_NULL_SEG);
wrmsrl(MSR_FS_BASE, old_base);
-#endif
+ return tmp == 0;
+}
+
+void check_null_seg_clears_base(struct cpuinfo_x86 *c)
+{
+ /* BUG_NULL_SEG is only relevant with 64bit userspace */
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return;
+
+ /* Zen3 CPUs advertise Null Selector Clears Base in CPUID. */
+ if (c->extended_cpuid_level >= 0x80000021 &&
+ cpuid_eax(0x80000021) & BIT(6))
+ return;
+
+ /*
+ * CPUID bit above wasn't set. If this kernel is still running
+ * as a HV guest, then the HV has decided not to advertize
+ * that CPUID bit for whatever reason. For example, one
+ * member of the migration pool might be vulnerable. Which
+ * means, the bug is present: set the BUG flag and return.
+ */
+ if (cpu_has(c, X86_FEATURE_HYPERVISOR)) {
+ set_cpu_bug(c, X86_BUG_NULL_SEG);
+ return;
+ }
+
+ /*
+ * Zen2 CPUs also have this behaviour, but no CPUID bit.
+ * 0x18 is the respective family for Hygon.
+ */
+ if ((c->x86 == 0x17 || c->x86 == 0x18) &&
+ detect_null_seg_behavior())
+ return;
+
+ /* All the remaining ones are affected */
+ set_cpu_bug(c, X86_BUG_NULL_SEG);
}
static void generic_identify(struct cpuinfo_x86 *c)
@@ -1457,8 +1495,6 @@ static void generic_identify(struct cpuinfo_x86 *c)
get_model_name(c); /* Default name */
- detect_null_seg_behavior(c);
-
/*
* ESPFIX is a strange bug. All real CPUs have it. Paravirt
* systems that run Linux at CPL > 0 may or may not have the
@@ -1751,6 +1787,17 @@ EXPORT_PER_CPU_SYMBOL(__preempt_count);
DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK;
+static void wrmsrl_cstar(unsigned long val)
+{
+ /*
+ * Intel CPUs do not support 32-bit SYSCALL. Writing to MSR_CSTAR
+ * is so far ignored by the CPU, but raises a #VE trap in a TDX
+ * guest. Avoid the pointless write on all Intel CPUs.
+ */
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ wrmsrl(MSR_CSTAR, val);
+}
+
/* May not be marked __init: used by software suspend */
void syscall_init(void)
{
@@ -1758,7 +1805,7 @@ void syscall_init(void)
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
#ifdef CONFIG_IA32_EMULATION
- wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
+ wrmsrl_cstar((unsigned long)entry_SYSCALL_compat);
/*
* This only works on Intel CPUs.
* On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
@@ -1770,7 +1817,7 @@ void syscall_init(void)
(unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
#else
- wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
+ wrmsrl_cstar((unsigned long)ignore_sysret);
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 95521302630d..ee6f23f7587d 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -75,6 +75,7 @@ extern int detect_extended_topology_early(struct cpuinfo_x86 *c);
extern int detect_extended_topology(struct cpuinfo_x86 *c);
extern int detect_ht_early(struct cpuinfo_x86 *c);
extern void detect_ht(struct cpuinfo_x86 *c);
+extern void check_null_seg_clears_base(struct cpuinfo_x86 *c);
unsigned int aperfmperf_get_khz(int cpu);
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index defda61f372d..c881bcafba7d 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -75,6 +75,9 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_SGX_LC, X86_FEATURE_SGX },
{ X86_FEATURE_SGX1, X86_FEATURE_SGX },
{ X86_FEATURE_SGX2, X86_FEATURE_SGX1 },
+ { X86_FEATURE_XFD, X86_FEATURE_XSAVES },
+ { X86_FEATURE_XFD, X86_FEATURE_XGETBV1 },
+ { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD },
{}
};
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index 6d50136f7ab9..3fcdda4c1e11 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -335,6 +335,8 @@ static void init_hygon(struct cpuinfo_x86 *c)
/* Hygon CPUs don't reset SS attributes on SYSRET, Xen does. */
if (!cpu_has(c, X86_FEATURE_XENPV))
set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
+
+ check_null_seg_clears_base(c);
}
static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c
index f4dd73396f28..fbaf12e43f41 100644
--- a/arch/x86/kernel/cpu/intel_epb.c
+++ b/arch/x86/kernel/cpu/intel_epb.c
@@ -16,6 +16,7 @@
#include <linux/syscore_ops.h>
#include <linux/pm.h>
+#include <asm/cpu_device_id.h>
#include <asm/cpufeature.h>
#include <asm/msr.h>
@@ -58,6 +59,22 @@ static DEFINE_PER_CPU(u8, saved_epb);
#define EPB_SAVED 0x10ULL
#define MAX_EPB EPB_MASK
+enum energy_perf_value_index {
+ EPB_INDEX_PERFORMANCE,
+ EPB_INDEX_BALANCE_PERFORMANCE,
+ EPB_INDEX_NORMAL,
+ EPB_INDEX_BALANCE_POWERSAVE,
+ EPB_INDEX_POWERSAVE,
+};
+
+static u8 energ_perf_values[] = {
+ [EPB_INDEX_PERFORMANCE] = ENERGY_PERF_BIAS_PERFORMANCE,
+ [EPB_INDEX_BALANCE_PERFORMANCE] = ENERGY_PERF_BIAS_BALANCE_PERFORMANCE,
+ [EPB_INDEX_NORMAL] = ENERGY_PERF_BIAS_NORMAL,
+ [EPB_INDEX_BALANCE_POWERSAVE] = ENERGY_PERF_BIAS_BALANCE_POWERSAVE,
+ [EPB_INDEX_POWERSAVE] = ENERGY_PERF_BIAS_POWERSAVE,
+};
+
static int intel_epb_save(void)
{
u64 epb;
@@ -90,7 +107,7 @@ static void intel_epb_restore(void)
*/
val = epb & EPB_MASK;
if (val == ENERGY_PERF_BIAS_PERFORMANCE) {
- val = ENERGY_PERF_BIAS_NORMAL;
+ val = energ_perf_values[EPB_INDEX_NORMAL];
pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n");
}
}
@@ -103,18 +120,11 @@ static struct syscore_ops intel_epb_syscore_ops = {
};
static const char * const energy_perf_strings[] = {
- "performance",
- "balance-performance",
- "normal",
- "balance-power",
- "power"
-};
-static const u8 energ_perf_values[] = {
- ENERGY_PERF_BIAS_PERFORMANCE,
- ENERGY_PERF_BIAS_BALANCE_PERFORMANCE,
- ENERGY_PERF_BIAS_NORMAL,
- ENERGY_PERF_BIAS_BALANCE_POWERSAVE,
- ENERGY_PERF_BIAS_POWERSAVE
+ [EPB_INDEX_PERFORMANCE] = "performance",
+ [EPB_INDEX_BALANCE_PERFORMANCE] = "balance-performance",
+ [EPB_INDEX_NORMAL] = "normal",
+ [EPB_INDEX_BALANCE_POWERSAVE] = "balance-power",
+ [EPB_INDEX_POWERSAVE] = "power",
};
static ssize_t energy_perf_bias_show(struct device *dev,
@@ -193,13 +203,22 @@ static int intel_epb_offline(unsigned int cpu)
return 0;
}
+static const struct x86_cpu_id intel_epb_normal[] = {
+ X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 7),
+ {}
+};
+
static __init int intel_epb_init(void)
{
+ const struct x86_cpu_id *id = x86_match_cpu(intel_epb_normal);
int ret;
if (!boot_cpu_has(X86_FEATURE_EPB))
return -ENODEV;
+ if (id)
+ energ_perf_values[EPB_INDEX_NORMAL] = id->driver_data;
+
ret = cpuhp_setup_state(CPUHP_AP_X86_INTEL_EPB_ONLINE,
"x86/intel/epb:online", intel_epb_online,
intel_epb_offline);
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 08831acc1d03..a1e2f41796dc 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -71,6 +71,22 @@ static const char * const smca_umc_block_names[] = {
"misc_umc"
};
+#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype))
+
+struct smca_hwid {
+ unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */
+ u32 hwid_mcatype; /* (hwid,mcatype) tuple */
+};
+
+struct smca_bank {
+ const struct smca_hwid *hwid;
+ u32 id; /* Value of MCA_IPID[InstanceId]. */
+ u8 sysfs_id; /* Value used for sysfs name. */
+};
+
+static DEFINE_PER_CPU_READ_MOSTLY(struct smca_bank[MAX_NR_BANKS], smca_banks);
+static DEFINE_PER_CPU_READ_MOSTLY(u8[N_SMCA_BANK_TYPES], smca_bank_counts);
+
struct smca_bank_name {
const char *name; /* Short name for sysfs */
const char *long_name; /* Long name for pretty-printing */
@@ -95,11 +111,18 @@ static struct smca_bank_name smca_names[] = {
[SMCA_PSP ... SMCA_PSP_V2] = { "psp", "Platform Security Processor" },
[SMCA_SMU ... SMCA_SMU_V2] = { "smu", "System Management Unit" },
[SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" },
+ [SMCA_MPDMA] = { "mpdma", "MPDMA Unit" },
[SMCA_NBIO] = { "nbio", "Northbridge IO Unit" },
[SMCA_PCIE ... SMCA_PCIE_V2] = { "pcie", "PCI Express Unit" },
[SMCA_XGMI_PCS] = { "xgmi_pcs", "Ext Global Memory Interconnect PCS Unit" },
+ [SMCA_NBIF] = { "nbif", "NBIF Unit" },
+ [SMCA_SHUB] = { "shub", "System Hub Unit" },
+ [SMCA_SATA] = { "sata", "SATA Unit" },
+ [SMCA_USB] = { "usb", "USB Unit" },
+ [SMCA_GMI_PCS] = { "gmi_pcs", "Global Memory Interconnect PCS Unit" },
[SMCA_XGMI_PHY] = { "xgmi_phy", "Ext Global Memory Interconnect PHY Unit" },
[SMCA_WAFL_PHY] = { "wafl_phy", "WAFL PHY Unit" },
+ [SMCA_GMI_PHY] = { "gmi_phy", "Global Memory Interconnect PHY Unit" },
};
static const char *smca_get_name(enum smca_bank_types t)
@@ -119,21 +142,22 @@ const char *smca_get_long_name(enum smca_bank_types t)
}
EXPORT_SYMBOL_GPL(smca_get_long_name);
-static enum smca_bank_types smca_get_bank_type(unsigned int bank)
+enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank)
{
struct smca_bank *b;
if (bank >= MAX_NR_BANKS)
return N_SMCA_BANK_TYPES;
- b = &smca_banks[bank];
+ b = &per_cpu(smca_banks, cpu)[bank];
if (!b->hwid)
return N_SMCA_BANK_TYPES;
return b->hwid->bank_type;
}
+EXPORT_SYMBOL_GPL(smca_get_bank_type);
-static struct smca_hwid smca_hwid_mcatypes[] = {
+static const struct smca_hwid smca_hwid_mcatypes[] = {
/* { bank_type, hwid_mcatype } */
/* Reserved type */
@@ -173,6 +197,9 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* Microprocessor 5 Unit MCA type */
{ SMCA_MP5, HWID_MCATYPE(0x01, 0x2) },
+ /* MPDMA MCA type */
+ { SMCA_MPDMA, HWID_MCATYPE(0x01, 0x3) },
+
/* Northbridge IO Unit MCA type */
{ SMCA_NBIO, HWID_MCATYPE(0x18, 0x0) },
@@ -180,19 +207,17 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
{ SMCA_PCIE, HWID_MCATYPE(0x46, 0x0) },
{ SMCA_PCIE_V2, HWID_MCATYPE(0x46, 0x1) },
- /* xGMI PCS MCA type */
{ SMCA_XGMI_PCS, HWID_MCATYPE(0x50, 0x0) },
-
- /* xGMI PHY MCA type */
+ { SMCA_NBIF, HWID_MCATYPE(0x6C, 0x0) },
+ { SMCA_SHUB, HWID_MCATYPE(0x80, 0x0) },
+ { SMCA_SATA, HWID_MCATYPE(0xA8, 0x0) },
+ { SMCA_USB, HWID_MCATYPE(0xAA, 0x0) },
+ { SMCA_GMI_PCS, HWID_MCATYPE(0x241, 0x0) },
{ SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0) },
-
- /* WAFL PHY MCA type */
{ SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0) },
+ { SMCA_GMI_PHY, HWID_MCATYPE(0x269, 0x0) },
};
-struct smca_bank smca_banks[MAX_NR_BANKS];
-EXPORT_SYMBOL_GPL(smca_banks);
-
/*
* In SMCA enabled processors, we can have multiple banks for a given IP type.
* So to define a unique name for each bank, we use a temp c-string to append
@@ -248,8 +273,9 @@ static void smca_set_misc_banks_map(unsigned int bank, unsigned int cpu)
static void smca_configure(unsigned int bank, unsigned int cpu)
{
+ u8 *bank_counts = this_cpu_ptr(smca_bank_counts);
+ const struct smca_hwid *s_hwid;
unsigned int i, hwid_mcatype;
- struct smca_hwid *s_hwid;
u32 high, low;
u32 smca_config = MSR_AMD64_SMCA_MCx_CONFIG(bank);
@@ -285,10 +311,6 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
smca_set_misc_banks_map(bank, cpu);
- /* Return early if this bank was already initialized. */
- if (smca_banks[bank].hwid && smca_banks[bank].hwid->hwid_mcatype != 0)
- return;
-
if (rdmsr_safe(MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) {
pr_warn("Failed to read MCA_IPID for bank %d\n", bank);
return;
@@ -299,10 +321,11 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
s_hwid = &smca_hwid_mcatypes[i];
+
if (hwid_mcatype == s_hwid->hwid_mcatype) {
- smca_banks[bank].hwid = s_hwid;
- smca_banks[bank].id = low;
- smca_banks[bank].sysfs_id = s_hwid->count++;
+ this_cpu_ptr(smca_banks)[bank].hwid = s_hwid;
+ this_cpu_ptr(smca_banks)[bank].id = low;
+ this_cpu_ptr(smca_banks)[bank].sysfs_id = bank_counts[s_hwid->bank_type]++;
break;
}
}
@@ -526,7 +549,7 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high,
/* Fall back to method we used for older processors: */
switch (block) {
case 0:
- addr = msr_ops.misc(bank);
+ addr = mca_msr_reg(bank, MCA_MISC);
break;
case 1:
offset = ((low & MASK_BLKPTR_LO) >> 21);
@@ -588,7 +611,7 @@ out:
bool amd_filter_mce(struct mce *m)
{
- enum smca_bank_types bank_type = smca_get_bank_type(m->bank);
+ enum smca_bank_types bank_type = smca_get_bank_type(m->extcpu, m->bank);
struct cpuinfo_x86 *c = &boot_cpu_data;
/* See Family 17h Models 10h-2Fh Erratum #1114. */
@@ -626,7 +649,7 @@ static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
} else if (c->x86 == 0x17 &&
(c->x86_model >= 0x10 && c->x86_model <= 0x2F)) {
- if (smca_get_bank_type(bank) != SMCA_IF)
+ if (smca_get_bank_type(smp_processor_id(), bank) != SMCA_IF)
return;
msrs[0] = MSR_AMD64_SMCA_MCx_MISC(bank);
@@ -688,213 +711,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
deferred_error_interrupt_enable(c);
}
-int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr)
-{
- u64 dram_base_addr, dram_limit_addr, dram_hole_base;
- /* We start from the normalized address */
- u64 ret_addr = norm_addr;
-
- u32 tmp;
-
- u8 die_id_shift, die_id_mask, socket_id_shift, socket_id_mask;
- u8 intlv_num_dies, intlv_num_chan, intlv_num_sockets;
- u8 intlv_addr_sel, intlv_addr_bit;
- u8 num_intlv_bits, hashed_bit;
- u8 lgcy_mmio_hole_en, base = 0;
- u8 cs_mask, cs_id = 0;
- bool hash_enabled = false;
-
- /* Read D18F0x1B4 (DramOffset), check if base 1 is used. */
- if (amd_df_indirect_read(nid, 0, 0x1B4, umc, &tmp))
- goto out_err;
-
- /* Remove HiAddrOffset from normalized address, if enabled: */
- if (tmp & BIT(0)) {
- u64 hi_addr_offset = (tmp & GENMASK_ULL(31, 20)) << 8;
-
- if (norm_addr >= hi_addr_offset) {
- ret_addr -= hi_addr_offset;
- base = 1;
- }
- }
-
- /* Read D18F0x110 (DramBaseAddress). */
- if (amd_df_indirect_read(nid, 0, 0x110 + (8 * base), umc, &tmp))
- goto out_err;
-
- /* Check if address range is valid. */
- if (!(tmp & BIT(0))) {
- pr_err("%s: Invalid DramBaseAddress range: 0x%x.\n",
- __func__, tmp);
- goto out_err;
- }
-
- lgcy_mmio_hole_en = tmp & BIT(1);
- intlv_num_chan = (tmp >> 4) & 0xF;
- intlv_addr_sel = (tmp >> 8) & 0x7;
- dram_base_addr = (tmp & GENMASK_ULL(31, 12)) << 16;
-
- /* {0, 1, 2, 3} map to address bits {8, 9, 10, 11} respectively */
- if (intlv_addr_sel > 3) {
- pr_err("%s: Invalid interleave address select %d.\n",
- __func__, intlv_addr_sel);
- goto out_err;
- }
-
- /* Read D18F0x114 (DramLimitAddress). */
- if (amd_df_indirect_read(nid, 0, 0x114 + (8 * base), umc, &tmp))
- goto out_err;
-
- intlv_num_sockets = (tmp >> 8) & 0x1;
- intlv_num_dies = (tmp >> 10) & 0x3;
- dram_limit_addr = ((tmp & GENMASK_ULL(31, 12)) << 16) | GENMASK_ULL(27, 0);
-
- intlv_addr_bit = intlv_addr_sel + 8;
-
- /* Re-use intlv_num_chan by setting it equal to log2(#channels) */
- switch (intlv_num_chan) {
- case 0: intlv_num_chan = 0; break;
- case 1: intlv_num_chan = 1; break;
- case 3: intlv_num_chan = 2; break;
- case 5: intlv_num_chan = 3; break;
- case 7: intlv_num_chan = 4; break;
-
- case 8: intlv_num_chan = 1;
- hash_enabled = true;
- break;
- default:
- pr_err("%s: Invalid number of interleaved channels %d.\n",
- __func__, intlv_num_chan);
- goto out_err;
- }
-
- num_intlv_bits = intlv_num_chan;
-
- if (intlv_num_dies > 2) {
- pr_err("%s: Invalid number of interleaved nodes/dies %d.\n",
- __func__, intlv_num_dies);
- goto out_err;
- }
-
- num_intlv_bits += intlv_num_dies;
-
- /* Add a bit if sockets are interleaved. */
- num_intlv_bits += intlv_num_sockets;
-
- /* Assert num_intlv_bits <= 4 */
- if (num_intlv_bits > 4) {
- pr_err("%s: Invalid interleave bits %d.\n",
- __func__, num_intlv_bits);
- goto out_err;
- }
-
- if (num_intlv_bits > 0) {
- u64 temp_addr_x, temp_addr_i, temp_addr_y;
- u8 die_id_bit, sock_id_bit, cs_fabric_id;
-
- /*
- * Read FabricBlockInstanceInformation3_CS[BlockFabricID].
- * This is the fabric id for this coherent slave. Use
- * umc/channel# as instance id of the coherent slave
- * for FICAA.
- */
- if (amd_df_indirect_read(nid, 0, 0x50, umc, &tmp))
- goto out_err;
-
- cs_fabric_id = (tmp >> 8) & 0xFF;
- die_id_bit = 0;
-
- /* If interleaved over more than 1 channel: */
- if (intlv_num_chan) {
- die_id_bit = intlv_num_chan;
- cs_mask = (1 << die_id_bit) - 1;
- cs_id = cs_fabric_id & cs_mask;
- }
-
- sock_id_bit = die_id_bit;
-
- /* Read D18F1x208 (SystemFabricIdMask). */
- if (intlv_num_dies || intlv_num_sockets)
- if (amd_df_indirect_read(nid, 1, 0x208, umc, &tmp))
- goto out_err;
-
- /* If interleaved over more than 1 die. */
- if (intlv_num_dies) {
- sock_id_bit = die_id_bit + intlv_num_dies;
- die_id_shift = (tmp >> 24) & 0xF;
- die_id_mask = (tmp >> 8) & 0xFF;
-
- cs_id |= ((cs_fabric_id & die_id_mask) >> die_id_shift) << die_id_bit;
- }
-
- /* If interleaved over more than 1 socket. */
- if (intlv_num_sockets) {
- socket_id_shift = (tmp >> 28) & 0xF;
- socket_id_mask = (tmp >> 16) & 0xFF;
-
- cs_id |= ((cs_fabric_id & socket_id_mask) >> socket_id_shift) << sock_id_bit;
- }
-
- /*
- * The pre-interleaved address consists of XXXXXXIIIYYYYY
- * where III is the ID for this CS, and XXXXXXYYYYY are the
- * address bits from the post-interleaved address.
- * "num_intlv_bits" has been calculated to tell us how many "I"
- * bits there are. "intlv_addr_bit" tells us how many "Y" bits
- * there are (where "I" starts).
- */
- temp_addr_y = ret_addr & GENMASK_ULL(intlv_addr_bit-1, 0);
- temp_addr_i = (cs_id << intlv_addr_bit);
- temp_addr_x = (ret_addr & GENMASK_ULL(63, intlv_addr_bit)) << num_intlv_bits;
- ret_addr = temp_addr_x | temp_addr_i | temp_addr_y;
- }
-
- /* Add dram base address */
- ret_addr += dram_base_addr;
-
- /* If legacy MMIO hole enabled */
- if (lgcy_mmio_hole_en) {
- if (amd_df_indirect_read(nid, 0, 0x104, umc, &tmp))
- goto out_err;
-
- dram_hole_base = tmp & GENMASK(31, 24);
- if (ret_addr >= dram_hole_base)
- ret_addr += (BIT_ULL(32) - dram_hole_base);
- }
-
- if (hash_enabled) {
- /* Save some parentheses and grab ls-bit at the end. */
- hashed_bit = (ret_addr >> 12) ^
- (ret_addr >> 18) ^
- (ret_addr >> 21) ^
- (ret_addr >> 30) ^
- cs_id;
-
- hashed_bit &= BIT(0);
-
- if (hashed_bit != ((ret_addr >> intlv_addr_bit) & BIT(0)))
- ret_addr ^= BIT(intlv_addr_bit);
- }
-
- /* Is calculated system address is above DRAM limit address? */
- if (ret_addr > dram_limit_addr)
- goto out_err;
-
- *sys_addr = ret_addr;
- return 0;
-
-out_err:
- return -EINVAL;
-}
-EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr);
-
bool amd_mce_is_memory_error(struct mce *m)
{
/* ErrCodeExt[20:16] */
u8 xec = (m->status >> 16) & 0x1f;
if (mce_flags.smca)
- return smca_get_bank_type(m->bank) == SMCA_UMC && xec == 0x0;
+ return smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC && xec == 0x0;
return m->bank == 4 && xec == 0x8;
}
@@ -978,8 +801,8 @@ static void log_error_deferred(unsigned int bank)
{
bool defrd;
- defrd = _log_error_bank(bank, msr_ops.status(bank),
- msr_ops.addr(bank), 0);
+ defrd = _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS),
+ mca_msr_reg(bank, MCA_ADDR), 0);
if (!mce_flags.smca)
return;
@@ -1009,7 +832,7 @@ static void amd_deferred_error_interrupt(void)
static void log_error_thresholding(unsigned int bank, u64 misc)
{
- _log_error_bank(bank, msr_ops.status(bank), msr_ops.addr(bank), misc);
+ _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS), mca_msr_reg(bank, MCA_ADDR), misc);
}
static void log_and_reset_block(struct threshold_block *block)
@@ -1210,7 +1033,7 @@ static struct kobj_type threshold_ktype = {
.release = threshold_block_release,
};
-static const char *get_name(unsigned int bank, struct threshold_block *b)
+static const char *get_name(unsigned int cpu, unsigned int bank, struct threshold_block *b)
{
enum smca_bank_types bank_type;
@@ -1221,7 +1044,7 @@ static const char *get_name(unsigned int bank, struct threshold_block *b)
return th_names[bank];
}
- bank_type = smca_get_bank_type(bank);
+ bank_type = smca_get_bank_type(cpu, bank);
if (bank_type >= N_SMCA_BANK_TYPES)
return NULL;
@@ -1231,12 +1054,12 @@ static const char *get_name(unsigned int bank, struct threshold_block *b)
return NULL;
}
- if (smca_banks[bank].hwid->count == 1)
+ if (per_cpu(smca_bank_counts, cpu)[bank_type] == 1)
return smca_get_name(bank_type);
snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN,
- "%s_%x", smca_get_name(bank_type),
- smca_banks[bank].sysfs_id);
+ "%s_%u", smca_get_name(bank_type),
+ per_cpu(smca_banks, cpu)[bank].sysfs_id);
return buf_mcatype;
}
@@ -1292,7 +1115,7 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb
else
tb->blocks = b;
- err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(bank, b));
+ err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(cpu, bank, b));
if (err)
goto out_free;
recurse:
@@ -1347,7 +1170,7 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
struct device *dev = this_cpu_read(mce_device);
struct amd_northbridge *nb = NULL;
struct threshold_bank *b = NULL;
- const char *name = get_name(bank, NULL);
+ const char *name = get_name(cpu, bank, NULL);
int err = 0;
if (!dev)
@@ -1397,7 +1220,7 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
}
}
- err = allocate_threshold_blocks(cpu, b, bank, 0, msr_ops.misc(bank));
+ err = allocate_threshold_blocks(cpu, b, bank, 0, mca_msr_reg(bank, MCA_MISC));
if (err)
goto out_kobj;
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 193204aee880..5818b837fd4d 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -99,7 +99,6 @@ struct mca_config mca_cfg __read_mostly = {
static DEFINE_PER_CPU(struct mce, mces_seen);
static unsigned long mce_need_notify;
-static int cpu_missing;
/*
* MCA banks polled by the period polling timer for corrected events.
@@ -121,8 +120,6 @@ mce_banks_t mce_banks_ce_disabled;
static struct work_struct mce_work;
static struct irq_work mce_irq_work;
-static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
-
/*
* CPU/chipset specific EDAC code can register a notifier call here to print
* MCE errors in a human-readable form.
@@ -130,7 +127,7 @@ static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
/* Do initial initialization of a struct mce */
-noinstr void mce_setup(struct mce *m)
+void mce_setup(struct mce *m)
{
memset(m, 0, sizeof(struct mce));
m->cpu = m->extcpu = smp_processor_id();
@@ -176,53 +173,27 @@ void mce_unregister_decode_chain(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
-static inline u32 ctl_reg(int bank)
-{
- return MSR_IA32_MCx_CTL(bank);
-}
-
-static inline u32 status_reg(int bank)
-{
- return MSR_IA32_MCx_STATUS(bank);
-}
-
-static inline u32 addr_reg(int bank)
-{
- return MSR_IA32_MCx_ADDR(bank);
-}
-
-static inline u32 misc_reg(int bank)
-{
- return MSR_IA32_MCx_MISC(bank);
-}
-
-static inline u32 smca_ctl_reg(int bank)
+u32 mca_msr_reg(int bank, enum mca_msr reg)
{
- return MSR_AMD64_SMCA_MCx_CTL(bank);
-}
-
-static inline u32 smca_status_reg(int bank)
-{
- return MSR_AMD64_SMCA_MCx_STATUS(bank);
-}
+ if (mce_flags.smca) {
+ switch (reg) {
+ case MCA_CTL: return MSR_AMD64_SMCA_MCx_CTL(bank);
+ case MCA_ADDR: return MSR_AMD64_SMCA_MCx_ADDR(bank);
+ case MCA_MISC: return MSR_AMD64_SMCA_MCx_MISC(bank);
+ case MCA_STATUS: return MSR_AMD64_SMCA_MCx_STATUS(bank);
+ }
+ }
-static inline u32 smca_addr_reg(int bank)
-{
- return MSR_AMD64_SMCA_MCx_ADDR(bank);
-}
+ switch (reg) {
+ case MCA_CTL: return MSR_IA32_MCx_CTL(bank);
+ case MCA_ADDR: return MSR_IA32_MCx_ADDR(bank);
+ case MCA_MISC: return MSR_IA32_MCx_MISC(bank);
+ case MCA_STATUS: return MSR_IA32_MCx_STATUS(bank);
+ }
-static inline u32 smca_misc_reg(int bank)
-{
- return MSR_AMD64_SMCA_MCx_MISC(bank);
+ return 0;
}
-struct mca_msr_regs msr_ops = {
- .ctl = ctl_reg,
- .status = status_reg,
- .addr = addr_reg,
- .misc = misc_reg
-};
-
static void __print_mce(struct mce *m)
{
pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
@@ -295,11 +266,17 @@ static void wait_for_panic(void)
panic("Panicing machine check CPU died");
}
-static void mce_panic(const char *msg, struct mce *final, char *exp)
+static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
{
- int apei_err = 0;
struct llist_node *pending;
struct mce_evt_llist *l;
+ int apei_err = 0;
+
+ /*
+ * Allow instrumentation around external facilities usage. Not that it
+ * matters a whole lot since the machine is going to panic anyway.
+ */
+ instrumentation_begin();
if (!fake_panic) {
/*
@@ -314,7 +291,7 @@ static void mce_panic(const char *msg, struct mce *final, char *exp)
} else {
/* Don't log too much for fake panic */
if (atomic_inc_return(&mce_fake_panicked) > 1)
- return;
+ goto out;
}
pending = mce_gen_pool_prepare_records();
/* First print corrected ones that are still unlogged */
@@ -342,8 +319,6 @@ static void mce_panic(const char *msg, struct mce *final, char *exp)
if (!apei_err)
apei_err = apei_write_mce(final);
}
- if (cpu_missing)
- pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
if (exp)
pr_emerg(HW_ERR "Machine check: %s\n", exp);
if (!fake_panic) {
@@ -352,6 +327,9 @@ static void mce_panic(const char *msg, struct mce *final, char *exp)
panic(msg);
} else
pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
+
+out:
+ instrumentation_end();
}
/* Support code for software error injection */
@@ -362,24 +340,27 @@ static int msr_to_offset(u32 msr)
if (msr == mca_cfg.rip_msr)
return offsetof(struct mce, ip);
- if (msr == msr_ops.status(bank))
+ if (msr == mca_msr_reg(bank, MCA_STATUS))
return offsetof(struct mce, status);
- if (msr == msr_ops.addr(bank))
+ if (msr == mca_msr_reg(bank, MCA_ADDR))
return offsetof(struct mce, addr);
- if (msr == msr_ops.misc(bank))
+ if (msr == mca_msr_reg(bank, MCA_MISC))
return offsetof(struct mce, misc);
if (msr == MSR_IA32_MCG_STATUS)
return offsetof(struct mce, mcgstatus);
return -1;
}
-__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr,
- unsigned long error_code,
- unsigned long fault_addr)
+void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr)
{
- pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
- (unsigned int)regs->cx, regs->ip, (void *)regs->ip);
+ if (wrmsr) {
+ pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
+ (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax,
+ regs->ip, (void *)regs->ip);
+ } else {
+ pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
+ (unsigned int)regs->cx, regs->ip, (void *)regs->ip);
+ }
show_stack_regs(regs);
@@ -387,12 +368,10 @@ __visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup,
while (true)
cpu_relax();
-
- return true;
}
/* MSR access wrappers used for error injection */
-static noinstr u64 mce_rdmsrl(u32 msr)
+noinstr u64 mce_rdmsrl(u32 msr)
{
DECLARE_ARGS(val, low, high);
@@ -420,32 +399,13 @@ static noinstr u64 mce_rdmsrl(u32 msr)
*/
asm volatile("1: rdmsr\n"
"2:\n"
- _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_fault)
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR_IN_MCE)
: EAX_EDX_RET(val, low, high) : "c" (msr));
return EAX_EDX_VAL(val, low, high);
}
-__visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr,
- unsigned long error_code,
- unsigned long fault_addr)
-{
- pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
- (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax,
- regs->ip, (void *)regs->ip);
-
- show_stack_regs(regs);
-
- panic("MCA architectural violation!\n");
-
- while (true)
- cpu_relax();
-
- return true;
-}
-
static noinstr void mce_wrmsrl(u32 msr, u64 v)
{
u32 low, high;
@@ -470,7 +430,7 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v)
/* See comment in mce_rdmsrl() */
asm volatile("1: wrmsr\n"
"2:\n"
- _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_fault)
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR_IN_MCE)
: : "c" (msr), "a"(low), "d" (high) : "memory");
}
@@ -479,9 +439,15 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v)
* check into our "mce" struct so that we can use it later to assess
* the severity of the problem as we read per-bank specific details.
*/
-static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
+static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs)
{
+ /*
+ * Enable instrumentation around mce_setup() which calls external
+ * facilities.
+ */
+ instrumentation_begin();
mce_setup(m);
+ instrumentation_end();
m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
if (regs) {
@@ -682,13 +648,13 @@ static struct notifier_block mce_default_nb = {
/*
* Read ADDR and MISC registers.
*/
-static void mce_read_aux(struct mce *m, int i)
+static noinstr void mce_read_aux(struct mce *m, int i)
{
if (m->status & MCI_STATUS_MISCV)
- m->misc = mce_rdmsrl(msr_ops.misc(i));
+ m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC));
if (m->status & MCI_STATUS_ADDRV) {
- m->addr = mce_rdmsrl(msr_ops.addr(i));
+ m->addr = mce_rdmsrl(mca_msr_reg(i, MCA_ADDR));
/*
* Mask the reported address by the reported granularity.
@@ -758,7 +724,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
m.bank = i;
barrier();
- m.status = mce_rdmsrl(msr_ops.status(i));
+ m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
/* If this entry is not valid, ignore it */
if (!(m.status & MCI_STATUS_VAL))
@@ -826,7 +792,7 @@ clear_it:
/*
* Clear state for this bank.
*/
- mce_wrmsrl(msr_ops.status(i), 0);
+ mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
}
/*
@@ -841,6 +807,34 @@ clear_it:
EXPORT_SYMBOL_GPL(machine_check_poll);
/*
+ * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
+ * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
+ * Vol 3B Table 15-20). But this confuses both the code that determines
+ * whether the machine check occurred in kernel or user mode, and also
+ * the severity assessment code. Pretend that EIPV was set, and take the
+ * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
+ */
+static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
+{
+ if (bank != 0)
+ return;
+ if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
+ return;
+ if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
+ MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
+ MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
+ MCACOD)) !=
+ (MCI_STATUS_UC|MCI_STATUS_EN|
+ MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
+ MCI_STATUS_AR|MCACOD_INSTR))
+ return;
+
+ m->mcgstatus |= MCG_STATUS_EIPV;
+ m->ip = regs->ip;
+ m->cs = regs->cs;
+}
+
+/*
* Do a quick check if any of the events requires a panic.
* This decides if we keep the events around or clear them.
*/
@@ -851,13 +845,13 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
int i;
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
- m->status = mce_rdmsrl(msr_ops.status(i));
+ m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
if (!(m->status & MCI_STATUS_VAL))
continue;
__set_bit(i, validp);
- if (quirk_no_way_out)
- quirk_no_way_out(i, m, regs);
+ if (mce_flags.snb_ifu_quirk)
+ quirk_sandybridge_ifu(i, m, regs);
m->bank = i;
if (mce_severity(m, regs, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
@@ -889,8 +883,13 @@ static cpumask_t mce_missing_cpus = CPU_MASK_ALL;
/*
* Check if a timeout waiting for other CPUs happened.
*/
-static int mce_timed_out(u64 *t, const char *msg)
+static noinstr int mce_timed_out(u64 *t, const char *msg)
{
+ int ret = 0;
+
+ /* Enable instrumentation around calls to external facilities */
+ instrumentation_begin();
+
/*
* The others already did panic for some reason.
* Bail out like in a timeout.
@@ -909,13 +908,17 @@ static int mce_timed_out(u64 *t, const char *msg)
cpumask_pr_args(&mce_missing_cpus));
mce_panic(msg, NULL, NULL);
}
- cpu_missing = 1;
- return 1;
+ ret = 1;
+ goto out;
}
*t -= SPINUNIT;
+
out:
touch_nmi_watchdog();
- return 0;
+
+ instrumentation_end();
+
+ return ret;
}
/*
@@ -1004,14 +1007,13 @@ static atomic_t global_nwo;
* in the entry order.
* TBD double check parallel CPU hotunplug
*/
-static int mce_start(int *no_way_out)
+static noinstr int mce_start(int *no_way_out)
{
- int order;
- int cpus = num_online_cpus();
u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
+ int order, ret = -1;
if (!timeout)
- return -1;
+ return ret;
atomic_add(*no_way_out, &global_nwo);
/*
@@ -1021,14 +1023,17 @@ static int mce_start(int *no_way_out)
order = atomic_inc_return(&mce_callin);
cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus);
+ /* Enable instrumentation around calls to external facilities */
+ instrumentation_begin();
+
/*
* Wait for everyone.
*/
- while (atomic_read(&mce_callin) != cpus) {
+ while (atomic_read(&mce_callin) != num_online_cpus()) {
if (mce_timed_out(&timeout,
"Timeout: Not all CPUs entered broadcast exception handler")) {
atomic_set(&global_nwo, 0);
- return -1;
+ goto out;
}
ndelay(SPINUNIT);
}
@@ -1054,7 +1059,7 @@ static int mce_start(int *no_way_out)
if (mce_timed_out(&timeout,
"Timeout: Subject CPUs unable to finish machine check processing")) {
atomic_set(&global_nwo, 0);
- return -1;
+ goto out;
}
ndelay(SPINUNIT);
}
@@ -1065,17 +1070,25 @@ static int mce_start(int *no_way_out)
*/
*no_way_out = atomic_read(&global_nwo);
- return order;
+ ret = order;
+
+out:
+ instrumentation_end();
+
+ return ret;
}
/*
* Synchronize between CPUs after main scanning loop.
* This invokes the bulk of the Monarch processing.
*/
-static int mce_end(int order)
+static noinstr int mce_end(int order)
{
- int ret = -1;
u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
+ int ret = -1;
+
+ /* Allow instrumentation around external facilities. */
+ instrumentation_begin();
if (!timeout)
goto reset;
@@ -1088,14 +1101,11 @@ static int mce_end(int order)
atomic_inc(&mce_executing);
if (order == 1) {
- /* CHECKME: Can this race with a parallel hotplug? */
- int cpus = num_online_cpus();
-
/*
* Monarch: Wait for everyone to go through their scanning
* loops.
*/
- while (atomic_read(&mce_executing) <= cpus) {
+ while (atomic_read(&mce_executing) <= num_online_cpus()) {
if (mce_timed_out(&timeout,
"Timeout: Monarch CPU unable to finish machine check processing"))
goto reset;
@@ -1119,7 +1129,8 @@ static int mce_end(int order)
/*
* Don't reset anything. That's done by the Monarch.
*/
- return 0;
+ ret = 0;
+ goto out;
}
/*
@@ -1135,6 +1146,10 @@ reset:
* Let others run again.
*/
atomic_set(&mce_executing, 0);
+
+out:
+ instrumentation_end();
+
return ret;
}
@@ -1144,7 +1159,7 @@ static void mce_clear_state(unsigned long *toclear)
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
if (test_bit(i, toclear))
- mce_wrmsrl(msr_ops.status(i), 0);
+ mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
}
}
@@ -1183,13 +1198,14 @@ static noinstr bool mce_check_crashing_cpu(void)
return false;
}
-static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final,
- unsigned long *toclear, unsigned long *valid_banks,
- int no_way_out, int *worst)
+static __always_inline int
+__mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final,
+ unsigned long *toclear, unsigned long *valid_banks, int no_way_out,
+ int *worst)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
struct mca_config *cfg = &mca_cfg;
- int severity, i;
+ int severity, i, taint = 0;
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
__clear_bit(i, toclear);
@@ -1203,7 +1219,7 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin
m->addr = 0;
m->bank = i;
- m->status = mce_rdmsrl(msr_ops.status(i));
+ m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
if (!(m->status & MCI_STATUS_VAL))
continue;
@@ -1216,7 +1232,7 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin
continue;
/* Set taint even when machine check was not enabled. */
- add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+ taint++;
severity = mce_severity(m, regs, cfg->tolerant, NULL, true);
@@ -1239,7 +1255,13 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin
/* assuming valid severity level != 0 */
m->severity = severity;
+ /*
+ * Enable instrumentation around the mce_log() call which is
+ * done in #MC context, where instrumentation is disabled.
+ */
+ instrumentation_begin();
mce_log(m);
+ instrumentation_end();
if (severity > *worst) {
*final = *m;
@@ -1249,6 +1271,8 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin
/* mce_clear_state will clear *final, save locally for use later */
*m = *final;
+
+ return taint;
}
static void kill_me_now(struct callback_head *ch)
@@ -1272,7 +1296,7 @@ static void kill_me_maybe(struct callback_head *cb)
flags |= MF_MUST_KILL;
ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags);
- if (!ret && !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) {
+ if (!ret) {
set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
sync_core();
return;
@@ -1286,15 +1310,21 @@ static void kill_me_maybe(struct callback_head *cb)
if (ret == -EHWPOISON)
return;
- if (p->mce_vaddr != (void __user *)-1l) {
- force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT);
- } else {
- pr_err("Memory error not recovered");
- kill_me_now(cb);
- }
+ pr_err("Memory error not recovered");
+ kill_me_now(cb);
}
-static void queue_task_work(struct mce *m, char *msg, int kill_current_task)
+static void kill_me_never(struct callback_head *cb)
+{
+ struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
+
+ p->mce_count = 0;
+ pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr);
+ if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0))
+ set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
+}
+
+static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *))
{
int count = ++current->mce_count;
@@ -1304,11 +1334,7 @@ static void queue_task_work(struct mce *m, char *msg, int kill_current_task)
current->mce_kflags = m->kflags;
current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
current->mce_whole_page = whole_page(m);
-
- if (kill_current_task)
- current->mce_kill_me.func = kill_me_now;
- else
- current->mce_kill_me.func = kill_me_maybe;
+ current->mce_kill_me.func = func;
}
/* Ten is likely overkill. Don't expect more than two faults before task_work() */
@@ -1326,12 +1352,21 @@ static void queue_task_work(struct mce *m, char *msg, int kill_current_task)
task_work_add(current, &current->mce_kill_me, TWA_RESUME);
}
+/* Handle unconfigured int18 (should never happen) */
+static noinstr void unexpected_machine_check(struct pt_regs *regs)
+{
+ instrumentation_begin();
+ pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
+ smp_processor_id());
+ instrumentation_end();
+}
+
/*
- * The actual machine check handler. This only handles real
- * exceptions when something got corrupted coming in through int 18.
+ * The actual machine check handler. This only handles real exceptions when
+ * something got corrupted coming in through int 18.
*
- * This is executed in NMI context not subject to normal locking rules. This
- * implies that most kernel services cannot be safely used. Don't even
+ * This is executed in #MC context not subject to normal locking rules.
+ * This implies that most kernel services cannot be safely used. Don't even
* think about putting a printk in there!
*
* On Intel systems this is entered on all CPUs in parallel through
@@ -1343,39 +1378,54 @@ static void queue_task_work(struct mce *m, char *msg, int kill_current_task)
* issues: if the machine check was due to a failure of the memory
* backing the user stack, tracing that reads the user stack will cause
* potentially infinite recursion.
+ *
+ * Currently, the #MC handler calls out to a number of external facilities
+ * and, therefore, allows instrumentation around them. The optimal thing to
+ * have would be to do the absolutely minimal work required in #MC context
+ * and have instrumentation disabled only around that. Further processing can
+ * then happen in process context where instrumentation is allowed. Achieving
+ * that requires careful auditing and modifications. Until then, the code
+ * allows instrumentation temporarily, where required. *
*/
noinstr void do_machine_check(struct pt_regs *regs)
{
- DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
- DECLARE_BITMAP(toclear, MAX_NR_BANKS);
+ int worst = 0, order, no_way_out, kill_current_task, lmce, taint = 0;
+ DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { 0 };
+ DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { 0 };
struct mca_config *cfg = &mca_cfg;
struct mce m, *final;
char *msg = NULL;
- int worst = 0;
+
+ if (unlikely(mce_flags.p5))
+ return pentium_machine_check(regs);
+ else if (unlikely(mce_flags.winchip))
+ return winchip_machine_check(regs);
+ else if (unlikely(!mca_cfg.initialized))
+ return unexpected_machine_check(regs);
/*
* Establish sequential order between the CPUs entering the machine
* check handler.
*/
- int order = -1;
+ order = -1;
/*
* If no_way_out gets set, there is no safe way to recover from this
* MCE. If mca_cfg.tolerant is cranked up, we'll try anyway.
*/
- int no_way_out = 0;
+ no_way_out = 0;
/*
* If kill_current_task is not set, there might be a way to recover from this
* error.
*/
- int kill_current_task = 0;
+ kill_current_task = 0;
/*
* MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
* on Intel.
*/
- int lmce = 1;
+ lmce = 1;
this_cpu_inc(mce_exception_count);
@@ -1385,7 +1435,6 @@ noinstr void do_machine_check(struct pt_regs *regs)
final = this_cpu_ptr(&mces_seen);
*final = m;
- memset(valid_banks, 0, sizeof(valid_banks));
no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
barrier();
@@ -1419,7 +1468,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
order = mce_start(&no_way_out);
}
- __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst);
+ taint = __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst);
if (!no_way_out)
mce_clear_state(toclear);
@@ -1451,6 +1500,16 @@ noinstr void do_machine_check(struct pt_regs *regs)
}
}
+ /*
+ * Enable instrumentation around the external facilities like task_work_add()
+ * (via queue_task_work()), fixup_exception() etc. For now, that is. Fixing this
+ * properly would need a lot more involved reorganization.
+ */
+ instrumentation_begin();
+
+ if (taint)
+ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
if (worst != MCE_AR_SEVERITY && !kill_current_task)
goto out;
@@ -1459,7 +1518,10 @@ noinstr void do_machine_check(struct pt_regs *regs)
/* If this triggers there is no way to recover. Die hard. */
BUG_ON(!on_thread_stack() || !user_mode(regs));
- queue_task_work(&m, msg, kill_current_task);
+ if (kill_current_task)
+ queue_task_work(&m, msg, kill_me_now);
+ else
+ queue_task_work(&m, msg, kill_me_maybe);
} else {
/*
@@ -1477,9 +1539,12 @@ noinstr void do_machine_check(struct pt_regs *regs)
}
if (m.kflags & MCE_IN_KERNEL_COPYIN)
- queue_task_work(&m, msg, kill_current_task);
+ queue_task_work(&m, msg, kill_me_never);
}
+
out:
+ instrumentation_end();
+
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
}
EXPORT_SYMBOL_GPL(do_machine_check);
@@ -1687,8 +1752,8 @@ static void __mcheck_cpu_init_clear_banks(void)
if (!b->init)
continue;
- wrmsrl(msr_ops.ctl(i), b->ctl);
- wrmsrl(msr_ops.status(i), 0);
+ wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl);
+ wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
}
}
@@ -1714,39 +1779,11 @@ static void __mcheck_cpu_check_banks(void)
if (!b->init)
continue;
- rdmsrl(msr_ops.ctl(i), msrval);
+ rdmsrl(mca_msr_reg(i, MCA_CTL), msrval);
b->init = !!msrval;
}
}
-/*
- * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
- * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
- * Vol 3B Table 15-20). But this confuses both the code that determines
- * whether the machine check occurred in kernel or user mode, and also
- * the severity assessment code. Pretend that EIPV was set, and take the
- * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
- */
-static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
-{
- if (bank != 0)
- return;
- if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
- return;
- if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
- MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
- MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
- MCACOD)) !=
- (MCI_STATUS_UC|MCI_STATUS_EN|
- MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
- MCI_STATUS_AR|MCACOD_INSTR))
- return;
-
- m->mcgstatus |= MCG_STATUS_EIPV;
- m->ip = regs->ip;
- m->cs = regs->cs;
-}
-
/* Add per CPU specific workarounds here */
static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
{
@@ -1820,7 +1857,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
cfg->bootlog = 0;
if (c->x86 == 6 && c->x86_model == 45)
- quirk_no_way_out = quirk_sandybridge_ifu;
+ mce_flags.snb_ifu_quirk = 1;
}
if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
@@ -1850,9 +1887,11 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
intel_p5_mcheck_init(c);
+ mce_flags.p5 = 1;
return 1;
case X86_VENDOR_CENTAUR:
winchip_mcheck_init(c);
+ mce_flags.winchip = 1;
return 1;
default:
return 0;
@@ -1871,13 +1910,6 @@ static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR);
mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA);
mce_flags.amd_threshold = 1;
-
- if (mce_flags.smca) {
- msr_ops.ctl = smca_ctl_reg;
- msr_ops.status = smca_status_reg;
- msr_ops.addr = smca_addr_reg;
- msr_ops.misc = smca_misc_reg;
- }
}
}
@@ -2007,18 +2039,6 @@ bool filter_mce(struct mce *m)
return false;
}
-/* Handle unconfigured int18 (should never happen) */
-static noinstr void unexpected_machine_check(struct pt_regs *regs)
-{
- instrumentation_begin();
- pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
- smp_processor_id());
- instrumentation_end();
-}
-
-/* Call the installed machine check handler for this CPU setup. */
-void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check;
-
static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
{
irqentry_state_t irq_state;
@@ -2029,31 +2049,22 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
* Only required when from kernel mode. See
* mce_check_crashing_cpu() for details.
*/
- if (machine_check_vector == do_machine_check &&
- mce_check_crashing_cpu())
+ if (mca_cfg.initialized && mce_check_crashing_cpu())
return;
irq_state = irqentry_nmi_enter(regs);
- /*
- * The call targets are marked noinstr, but objtool can't figure
- * that out because it's an indirect call. Annotate it.
- */
- instrumentation_begin();
- machine_check_vector(regs);
+ do_machine_check(regs);
- instrumentation_end();
irqentry_nmi_exit(regs, irq_state);
}
static __always_inline void exc_machine_check_user(struct pt_regs *regs)
{
irqentry_enter_from_user_mode(regs);
- instrumentation_begin();
- machine_check_vector(regs);
+ do_machine_check(regs);
- instrumentation_end();
irqentry_exit_to_user_mode(regs);
}
@@ -2120,7 +2131,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
return;
}
- machine_check_vector = do_machine_check;
+ mca_cfg.initialized = 1;
__mcheck_cpu_init_early(c);
__mcheck_cpu_init_generic();
@@ -2228,7 +2239,6 @@ int __init mcheck_init(void)
mce_register_decode_chain(&early_nb);
mce_register_decode_chain(&mce_uc_nb);
mce_register_decode_chain(&mce_default_nb);
- mcheck_vendor_init_severity();
INIT_WORK(&mce_work, mce_gen_pool_process);
init_irq_work(&mce_irq_work, mce_irq_work_cb);
@@ -2253,7 +2263,7 @@ static void mce_disable_error_reporting(void)
struct mce_bank *b = &mce_banks[i];
if (b->init)
- wrmsrl(msr_ops.ctl(i), 0);
+ wrmsrl(mca_msr_reg(i, MCA_CTL), 0);
}
return;
}
@@ -2605,7 +2615,7 @@ static void mce_reenable_cpu(void)
struct mce_bank *b = &mce_banks[i];
if (b->init)
- wrmsrl(msr_ops.ctl(i), b->ctl);
+ wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl);
}
}
@@ -2754,7 +2764,6 @@ struct dentry *mce_get_debugfs_dir(void)
static void mce_reset(void)
{
- cpu_missing = 0;
atomic_set(&mce_fake_panicked, 0);
atomic_set(&mce_executing, 0);
atomic_set(&mce_callin, 0);
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
index 0bfc14041bbb..5fbd7ffb3233 100644
--- a/arch/x86/kernel/cpu/mce/inject.c
+++ b/arch/x86/kernel/cpu/mce/inject.c
@@ -74,7 +74,6 @@ MCE_INJECT_SET(status);
MCE_INJECT_SET(misc);
MCE_INJECT_SET(addr);
MCE_INJECT_SET(synd);
-MCE_INJECT_SET(ipid);
#define MCE_INJECT_GET(reg) \
static int inj_##reg##_get(void *data, u64 *val) \
@@ -95,6 +94,20 @@ DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n");
DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n");
DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n");
DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n");
+
+/* Use the user provided IPID value on a sw injection. */
+static int inj_ipid_set(void *data, u64 val)
+{
+ struct mce *m = (struct mce *)data;
+
+ if (cpu_feature_enabled(X86_FEATURE_SMCA)) {
+ if (inj_type == SW_INJ)
+ m->ipid = val;
+ }
+
+ return 0;
+}
+
DEFINE_SIMPLE_ATTRIBUTE(ipid_fops, inj_ipid_get, inj_ipid_set, "%llx\n");
static void setup_inj_struct(struct mce *m)
@@ -350,7 +363,7 @@ static ssize_t flags_write(struct file *filp, const char __user *ubuf,
char buf[MAX_FLAG_OPT_SIZE], *__buf;
int err;
- if (cnt > MAX_FLAG_OPT_SIZE)
+ if (!cnt || cnt > MAX_FLAG_OPT_SIZE)
return -EINVAL;
if (copy_from_user(&buf, ubuf, cnt))
@@ -490,6 +503,8 @@ static void do_inject(void)
i_mce.tsc = rdtsc_ordered();
+ i_mce.status |= MCI_STATUS_VAL;
+
if (i_mce.misc)
i_mce.status |= MCI_STATUS_MISCV;
@@ -577,6 +592,33 @@ static int inj_bank_set(void *data, u64 val)
}
m->bank = val;
+
+ /*
+ * sw-only injection allows to write arbitrary values into the MCA
+ * registers because it tests only the decoding paths.
+ */
+ if (inj_type == SW_INJ)
+ goto inject;
+
+ /*
+ * Read IPID value to determine if a bank is populated on the target
+ * CPU.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_SMCA)) {
+ u64 ipid;
+
+ if (rdmsrl_on_cpu(m->extcpu, MSR_AMD64_SMCA_MCx_IPID(val), &ipid)) {
+ pr_err("Error reading IPID on CPU%d\n", m->extcpu);
+ return -EINVAL;
+ }
+
+ if (!ipid) {
+ pr_err("Cannot inject into unpopulated bank %llu\n", val);
+ return -ENODEV;
+ }
+ }
+
+inject:
do_inject();
/* Reset injection struct */
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index acfd5d9f93c6..bb9a46a804bf 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -547,12 +547,13 @@ bool intel_filter_mce(struct mce *m)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
- /* MCE errata HSD131, HSM142, HSW131, BDM48, and HSM142 */
+ /* MCE errata HSD131, HSM142, HSW131, BDM48, HSM142 and SKX37 */
if ((c->x86 == 6) &&
((c->x86_model == INTEL_FAM6_HASWELL) ||
(c->x86_model == INTEL_FAM6_HASWELL_L) ||
(c->x86_model == INTEL_FAM6_BROADWELL) ||
- (c->x86_model == INTEL_FAM6_HASWELL_G)) &&
+ (c->x86_model == INTEL_FAM6_HASWELL_G) ||
+ (c->x86_model == INTEL_FAM6_SKYLAKE_X)) &&
(m->bank == 0) &&
((m->status & 0xa0000000ffffffff) == 0x80000000000f0005))
return true;
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 88dcc79cfb07..52c633950b38 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -8,9 +8,6 @@
#include <linux/device.h>
#include <asm/mce.h>
-/* Pointer to the installed machine check handler for this CPU setup. */
-extern void (*machine_check_vector)(struct pt_regs *);
-
enum severity_level {
MCE_NO_SEVERITY,
MCE_DEFERRED_SEVERITY,
@@ -38,8 +35,7 @@ int mce_gen_pool_add(struct mce *mce);
int mce_gen_pool_init(void);
struct llist_node *mce_gen_pool_prepare_records(void);
-extern int (*mce_severity)(struct mce *a, struct pt_regs *regs,
- int tolerant, char **msg, bool is_excp);
+int mce_severity(struct mce *a, struct pt_regs *regs, int tolerant, char **msg, bool is_excp);
struct dentry *mce_get_debugfs_dir(void);
extern mce_banks_t mce_banks_ce_disabled;
@@ -61,7 +57,7 @@ static inline void cmci_disable_bank(int bank) { }
static inline void intel_init_cmci(void) { }
static inline void intel_init_lmce(void) { }
static inline void intel_clear_lmce(void) { }
-static inline bool intel_filter_mce(struct mce *m) { return false; };
+static inline bool intel_filter_mce(struct mce *m) { return false; }
#endif
void mce_timer_kick(unsigned long interval);
@@ -117,23 +113,25 @@ static inline void mce_unregister_injector_chain(struct notifier_block *nb) { }
#endif
struct mca_config {
- bool dont_log_ce;
- bool cmci_disabled;
- bool ignore_ce;
- bool print_all;
-
__u64 lmce_disabled : 1,
disabled : 1,
ser : 1,
recovery : 1,
bios_cmci_threshold : 1,
- __reserved : 59;
+ /* Proper #MC exception handler is set */
+ initialized : 1,
+ __reserved : 58;
+
+ bool dont_log_ce;
+ bool cmci_disabled;
+ bool ignore_ce;
+ bool print_all;
- s8 bootlog;
int tolerant;
int monarch_timeout;
int panic_timeout;
u32 rip_msr;
+ s8 bootlog;
};
extern struct mca_config mca_cfg;
@@ -163,19 +161,28 @@ struct mce_vendor_flags {
/* AMD-style error thresholding banks present. */
amd_threshold : 1,
- __reserved_0 : 60;
+ /* Pentium, family 5-style MCA */
+ p5 : 1,
+
+ /* Centaur Winchip C6-style MCA */
+ winchip : 1,
+
+ /* SandyBridge IFU quirk */
+ snb_ifu_quirk : 1,
+
+ __reserved_0 : 57;
};
extern struct mce_vendor_flags mce_flags;
-struct mca_msr_regs {
- u32 (*ctl) (int bank);
- u32 (*status) (int bank);
- u32 (*addr) (int bank);
- u32 (*misc) (int bank);
+enum mca_msr {
+ MCA_CTL,
+ MCA_STATUS,
+ MCA_ADDR,
+ MCA_MISC,
};
-extern struct mca_msr_regs msr_ops;
+u32 mca_msr_reg(int bank, enum mca_msr reg);
/* Decide whether to add MCE record to MCE event pool or filter it out. */
extern bool filter_mce(struct mce *m);
@@ -183,17 +190,23 @@ extern bool filter_mce(struct mce *m);
#ifdef CONFIG_X86_MCE_AMD
extern bool amd_filter_mce(struct mce *m);
#else
-static inline bool amd_filter_mce(struct mce *m) { return false; };
+static inline bool amd_filter_mce(struct mce *m) { return false; }
#endif
-__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr,
- unsigned long error_code,
- unsigned long fault_addr);
+#ifdef CONFIG_X86_ANCIENT_MCE
+void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
+void winchip_mcheck_init(struct cpuinfo_x86 *c);
+noinstr void pentium_machine_check(struct pt_regs *regs);
+noinstr void winchip_machine_check(struct pt_regs *regs);
+static inline void enable_p5_mce(void) { mce_p5_enabled = 1; }
+#else
+static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {}
+static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
+static inline void enable_p5_mce(void) {}
+static inline void pentium_machine_check(struct pt_regs *regs) {}
+static inline void winchip_machine_check(struct pt_regs *regs) {}
+#endif
-__visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr,
- unsigned long error_code,
- unsigned long fault_addr);
+noinstr u64 mce_rdmsrl(u32 msr);
#endif /* __X86_MCE_INTERNAL_H__ */
diff --git a/arch/x86/kernel/cpu/mce/p5.c b/arch/x86/kernel/cpu/mce/p5.c
index 19e90cae8e97..2272ad53fc33 100644
--- a/arch/x86/kernel/cpu/mce/p5.c
+++ b/arch/x86/kernel/cpu/mce/p5.c
@@ -21,7 +21,7 @@
int mce_p5_enabled __read_mostly;
/* Machine check handler for Pentium class Intel CPUs: */
-static noinstr void pentium_machine_check(struct pt_regs *regs)
+noinstr void pentium_machine_check(struct pt_regs *regs)
{
u32 loaddr, hi, lotype;
@@ -54,10 +54,6 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
if (!cpu_has(c, X86_FEATURE_MCE))
return;
- machine_check_vector = pentium_machine_check;
- /* Make sure the vector pointer is visible before we enable MCEs: */
- wmb();
-
/* Read registers before enabling: */
rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
rdmsr(MSR_IA32_P5_MC_TYPE, l, h);
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
index 17e631443116..7aa2bda93cbb 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -222,6 +222,9 @@ static bool is_copy_from_user(struct pt_regs *regs)
struct insn insn;
int ret;
+ if (!regs)
+ return false;
+
if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, MAX_INSN_SIZE))
return false;
@@ -263,33 +266,44 @@ static bool is_copy_from_user(struct pt_regs *regs)
* distinguish an exception taken in user from from one
* taken in the kernel.
*/
-static int error_context(struct mce *m, struct pt_regs *regs)
+static noinstr int error_context(struct mce *m, struct pt_regs *regs)
{
- enum handler_type t;
+ int fixup_type;
+ bool copy_user;
if ((m->cs & 3) == 3)
return IN_USER;
+
if (!mc_recoverable(m->mcgstatus))
return IN_KERNEL;
- t = ex_get_fault_handler_type(m->ip);
- if (t == EX_HANDLER_FAULT) {
- m->kflags |= MCE_IN_KERNEL_RECOV;
- return IN_KERNEL_RECOV;
- }
- if (t == EX_HANDLER_UACCESS && regs && is_copy_from_user(regs)) {
- m->kflags |= MCE_IN_KERNEL_RECOV;
+ /* Allow instrumentation around external facilities usage. */
+ instrumentation_begin();
+ fixup_type = ex_get_fixup_type(m->ip);
+ copy_user = is_copy_from_user(regs);
+ instrumentation_end();
+
+ switch (fixup_type) {
+ case EX_TYPE_UACCESS:
+ case EX_TYPE_COPY:
+ if (!copy_user)
+ return IN_KERNEL;
m->kflags |= MCE_IN_KERNEL_COPYIN;
+ fallthrough;
+
+ case EX_TYPE_FAULT_MCE_SAFE:
+ case EX_TYPE_DEFAULT_MCE_SAFE:
+ m->kflags |= MCE_IN_KERNEL_RECOV;
return IN_KERNEL_RECOV;
- }
- return IN_KERNEL;
+ default:
+ return IN_KERNEL;
+ }
}
static int mce_severity_amd_smca(struct mce *m, enum context err_ctx)
{
- u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
- u32 low, high;
+ u64 mcx_cfg;
/*
* We need to look at the following bits:
@@ -300,11 +314,10 @@ static int mce_severity_amd_smca(struct mce *m, enum context err_ctx)
if (!mce_flags.succor)
return MCE_PANIC_SEVERITY;
- if (rdmsr_safe(addr, &low, &high))
- return MCE_PANIC_SEVERITY;
+ mcx_cfg = mce_rdmsrl(MSR_AMD64_SMCA_MCx_CONFIG(m->bank));
/* TCC (Task context corrupt). If set and if IN_KERNEL, panic. */
- if ((low & MCI_CONFIG_MCAX) &&
+ if ((mcx_cfg & MCI_CONFIG_MCAX) &&
(m->status & MCI_STATUS_TCC) &&
(err_ctx == IN_KERNEL))
return MCE_PANIC_SEVERITY;
@@ -317,8 +330,8 @@ static int mce_severity_amd_smca(struct mce *m, enum context err_ctx)
* See AMD Error Scope Hierarchy table in a newer BKDG. For example
* 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features"
*/
-static int mce_severity_amd(struct mce *m, struct pt_regs *regs, int tolerant,
- char **msg, bool is_excp)
+static noinstr int mce_severity_amd(struct mce *m, struct pt_regs *regs, int tolerant,
+ char **msg, bool is_excp)
{
enum context ctx = error_context(m, regs);
@@ -370,8 +383,8 @@ static int mce_severity_amd(struct mce *m, struct pt_regs *regs, int tolerant,
return MCE_KEEP_SEVERITY;
}
-static int mce_severity_intel(struct mce *m, struct pt_regs *regs,
- int tolerant, char **msg, bool is_excp)
+static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs,
+ int tolerant, char **msg, bool is_excp)
{
enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
enum context ctx = error_context(m, regs);
@@ -407,15 +420,14 @@ static int mce_severity_intel(struct mce *m, struct pt_regs *regs,
}
}
-/* Default to mce_severity_intel */
-int (*mce_severity)(struct mce *m, struct pt_regs *regs, int tolerant, char **msg, bool is_excp) =
- mce_severity_intel;
-
-void __init mcheck_vendor_init_severity(void)
+int noinstr mce_severity(struct mce *m, struct pt_regs *regs, int tolerant, char **msg,
+ bool is_excp)
{
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
- mce_severity = mce_severity_amd;
+ return mce_severity_amd(m, regs, tolerant, msg, is_excp);
+ else
+ return mce_severity_intel(m, regs, tolerant, msg, is_excp);
}
#ifdef CONFIG_DEBUG_FS
diff --git a/arch/x86/kernel/cpu/mce/winchip.c b/arch/x86/kernel/cpu/mce/winchip.c
index 9c9f0abd2d7f..6c99f2941909 100644
--- a/arch/x86/kernel/cpu/mce/winchip.c
+++ b/arch/x86/kernel/cpu/mce/winchip.c
@@ -17,7 +17,7 @@
#include "internal.h"
/* Machine check handler for WinChip C6: */
-static noinstr void winchip_machine_check(struct pt_regs *regs)
+noinstr void winchip_machine_check(struct pt_regs *regs)
{
instrumentation_begin();
pr_emerg("CPU0: Machine Check Exception.\n");
@@ -30,10 +30,6 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c)
{
u32 lo, hi;
- machine_check_vector = winchip_machine_check;
- /* Make sure the vector pointer is visible before we enable MCEs: */
- wmb();
-
rdmsr(MSR_IDT_FCR1, lo, hi);
lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */
lo &= ~(1<<4); /* Enable MCE */
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 3d4a48336084..8b2fcdfa6d31 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -456,17 +456,23 @@ apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_p
static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family)
{
-#ifdef CONFIG_X86_64
char fw_name[36] = "amd-ucode/microcode_amd.bin";
+ struct firmware fw;
+
+ if (IS_ENABLED(CONFIG_X86_32))
+ return false;
if (family >= 0x15)
snprintf(fw_name, sizeof(fw_name),
"amd-ucode/microcode_amd_fam%.2xh.bin", family);
- return get_builtin_firmware(cp, fw_name);
-#else
+ if (firmware_request_builtin(&fw, fw_name)) {
+ cp->size = fw.size;
+ cp->data = (void *)fw.data;
+ return true;
+ }
+
return false;
-#endif
}
static void __load_ucode_amd(unsigned int cpuid_1_eax, struct cpio_data *ret)
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index efb69be41ab1..f955d25076ba 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -140,23 +140,6 @@ static bool __init check_loader_disabled_bsp(void)
return *res;
}
-extern struct builtin_fw __start_builtin_fw[];
-extern struct builtin_fw __end_builtin_fw[];
-
-bool get_builtin_firmware(struct cpio_data *cd, const char *name)
-{
- struct builtin_fw *b_fw;
-
- for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) {
- if (!strcmp(name, b_fw->name)) {
- cd->size = b_fw->size;
- cd->data = b_fw->data;
- return true;
- }
- }
- return false;
-}
-
void __init load_ucode_bsp(void)
{
unsigned int cpuid_1_eax;
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 7e8e07bddd5f..d28a9f8f3fec 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -456,6 +456,7 @@ static void save_mc_for_early(struct ucode_cpu_info *uci, u8 *mc, unsigned int s
static bool load_builtin_intel_microcode(struct cpio_data *cp)
{
unsigned int eax = 1, ebx, ecx = 0, edx;
+ struct firmware fw;
char name[30];
if (IS_ENABLED(CONFIG_X86_32))
@@ -466,7 +467,13 @@ static bool load_builtin_intel_microcode(struct cpio_data *cp)
sprintf(name, "intel-ucode/%02x-%02x-%02x",
x86_family(eax), x86_model(eax), x86_stepping(eax));
- return get_builtin_firmware(cp, name);
+ if (firmware_request_builtin(&fw, name)) {
+ cp->size = fw.size;
+ cp->data = (void *)fw.data;
+ return true;
+ }
+
+ return false;
}
/*
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index e095c28d27ae..5a99f993e639 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -18,6 +18,7 @@
#include <linux/kexec.h>
#include <linux/i8253.h>
#include <linux/random.h>
+#include <linux/swiotlb.h>
#include <asm/processor.h>
#include <asm/hypervisor.h>
#include <asm/hyperv-tlfs.h>
@@ -79,7 +80,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0)
inc_irq_stat(hyperv_stimer0_count);
if (hv_stimer0_handler)
hv_stimer0_handler();
- add_interrupt_randomness(HYPERV_STIMER0_VECTOR, 0);
+ add_interrupt_randomness(HYPERV_STIMER0_VECTOR);
ack_APIC_irq();
set_irq_regs(old_regs);
@@ -163,12 +164,22 @@ static uint32_t __init ms_hyperv_platform(void)
cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS,
&eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]);
- if (eax >= HYPERV_CPUID_MIN &&
- eax <= HYPERV_CPUID_MAX &&
- !memcmp("Microsoft Hv", hyp_signature, 12))
- return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
+ if (eax < HYPERV_CPUID_MIN || eax > HYPERV_CPUID_MAX ||
+ memcmp("Microsoft Hv", hyp_signature, 12))
+ return 0;
- return 0;
+ /* HYPERCALL and VP_INDEX MSRs are mandatory for all features. */
+ eax = cpuid_eax(HYPERV_CPUID_FEATURES);
+ if (!(eax & HV_MSR_HYPERCALL_AVAILABLE)) {
+ pr_warn("x86/hyperv: HYPERCALL MSR not available.\n");
+ return 0;
+ }
+ if (!(eax & HV_MSR_VP_INDEX_AVAILABLE)) {
+ pr_warn("x86/hyperv: VP_INDEX MSR not available.\n");
+ return 0;
+ }
+
+ return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
}
static unsigned char hv_get_nmi_reason(void)
@@ -313,9 +324,26 @@ static void __init ms_hyperv_init_platform(void)
if (ms_hyperv.priv_high & HV_ISOLATION) {
ms_hyperv.isolation_config_a = cpuid_eax(HYPERV_CPUID_ISOLATION_CONFIG);
ms_hyperv.isolation_config_b = cpuid_ebx(HYPERV_CPUID_ISOLATION_CONFIG);
+ ms_hyperv.shared_gpa_boundary =
+ BIT_ULL(ms_hyperv.shared_gpa_boundary_bits);
pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n",
ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b);
+
+ if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) {
+ static_branch_enable(&isolation_type_snp);
+#ifdef CONFIG_SWIOTLB
+ swiotlb_unencrypted_base = ms_hyperv.shared_gpa_boundary;
+#endif
+ }
+
+#ifdef CONFIG_SWIOTLB
+ /*
+ * Enable swiotlb force mode in Isolation VM to
+ * use swiotlb bounce buffer for dma transaction.
+ */
+ swiotlb_force = SWIOTLB_FORCE;
+#endif
}
if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) {
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 4b8813bafffd..bb1c3f5f60c8 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -527,12 +527,14 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
rdt_domain_reconfigure_cdp(r);
if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
- kfree(d);
+ kfree(hw_dom);
return;
}
if (r->mon_capable && domain_setup_mon_state(r, d)) {
- kfree(d);
+ kfree(hw_dom->ctrl_val);
+ kfree(hw_dom->mbps_val);
+ kfree(hw_dom);
return;
}
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index c9f0f3d63f75..eaf25a234ff5 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -282,7 +282,7 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
u64 shift = 64 - width, chunks;
chunks = (cur_msr << shift) - (prev_msr << shift);
- return chunks >>= shift;
+ return chunks >> shift;
}
static u64 __mon_event_count(u32 rmid, struct rmid_read *rr)
diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h
index 9b204843b78d..fa04a73daf9c 100644
--- a/arch/x86/kernel/cpu/sgx/encls.h
+++ b/arch/x86/kernel/cpu/sgx/encls.h
@@ -11,26 +11,8 @@
#include <asm/traps.h>
#include "sgx.h"
-/**
- * ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr
- *
- * ENCLS has its own (positive value) error codes and also generates
- * ENCLS specific #GP and #PF faults. And the ENCLS values get munged
- * with system error codes as everything percolates back up the stack.
- * Unfortunately (for us), we need to precisely identify each unique
- * error code, e.g. the action taken if EWB fails varies based on the
- * type of fault and on the exact SGX error code, i.e. we can't simply
- * convert all faults to -EFAULT.
- *
- * To make all three error types coexist, we set bit 30 to identify an
- * ENCLS fault. Bit 31 (technically bits N:31) is used to differentiate
- * between positive (faults and SGX error codes) and negative (system
- * error codes) values.
- */
-#define ENCLS_FAULT_FLAG 0x40000000
-
/* Retrieve the encoded trapnr from the specified return code. */
-#define ENCLS_TRAPNR(r) ((r) & ~ENCLS_FAULT_FLAG)
+#define ENCLS_TRAPNR(r) ((r) & ~SGX_ENCLS_FAULT_FLAG)
/* Issue a WARN() about an ENCLS function. */
#define ENCLS_WARN(r, name) { \
@@ -50,7 +32,7 @@
*/
static inline bool encls_faulted(int ret)
{
- return ret & ENCLS_FAULT_FLAG;
+ return ret & SGX_ENCLS_FAULT_FLAG;
}
/**
@@ -88,11 +70,7 @@ static inline bool encls_failed(int ret)
asm volatile( \
"1: .byte 0x0f, 0x01, 0xcf;\n\t" \
"2:\n" \
- ".section .fixup,\"ax\"\n" \
- "3: orl $"__stringify(ENCLS_FAULT_FLAG)",%%eax\n" \
- " jmp 2b\n" \
- ".previous\n" \
- _ASM_EXTABLE_FAULT(1b, 3b) \
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_SGX) \
: "=a"(ret) \
: "a"(rax), inputs \
: "memory", "cc"); \
@@ -127,7 +105,7 @@ static inline bool encls_failed(int ret)
*
* Return:
* 0 on success,
- * trapnr with ENCLS_FAULT_FLAG set on fault
+ * trapnr with SGX_ENCLS_FAULT_FLAG set on fault
*/
#define __encls_N(rax, rbx_out, inputs...) \
({ \
@@ -136,11 +114,7 @@ static inline bool encls_failed(int ret)
"1: .byte 0x0f, 0x01, 0xcf;\n\t" \
" xor %%eax,%%eax;\n" \
"2:\n" \
- ".section .fixup,\"ax\"\n" \
- "3: orl $"__stringify(ENCLS_FAULT_FLAG)",%%eax\n" \
- " jmp 2b\n" \
- ".previous\n" \
- _ASM_EXTABLE_FAULT(1b, 3b) \
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_SGX) \
: "=a"(ret), "=b"(rbx_out) \
: "a"(rax), inputs \
: "memory"); \
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 63d3de02bbcc..4b41efc9e367 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -6,11 +6,13 @@
#include <linux/highmem.h>
#include <linux/kthread.h>
#include <linux/miscdevice.h>
+#include <linux/node.h>
#include <linux/pagemap.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
+#include <linux/sysfs.h>
#include <asm/sgx.h>
#include "driver.h"
#include "encl.h"
@@ -20,6 +22,7 @@ struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
static int sgx_nr_epc_sections;
static struct task_struct *ksgxd_tsk;
static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
+static DEFINE_XARRAY(sgx_epc_address_space);
/*
* These variables are part of the state of the reclaimer, and must be accessed
@@ -28,8 +31,7 @@ static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
static LIST_HEAD(sgx_active_page_list);
static DEFINE_SPINLOCK(sgx_reclaimer_lock);
-/* The free page list lock protected variables prepend the lock. */
-static unsigned long sgx_nr_free_pages;
+static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0);
/* Nodes with one or more EPC sections. */
static nodemask_t sgx_numa_mask;
@@ -61,6 +63,24 @@ static void __sgx_sanitize_pages(struct list_head *dirty_page_list)
page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);
+ /*
+ * Checking page->poison without holding the node->lock
+ * is racy, but losing the race (i.e. poison is set just
+ * after the check) just means __eremove() will be uselessly
+ * called for a page that sgx_free_epc_page() will put onto
+ * the node->sgx_poison_page_list later.
+ */
+ if (page->poison) {
+ struct sgx_epc_section *section = &sgx_epc_sections[page->section];
+ struct sgx_numa_node *node = section->node;
+
+ spin_lock(&node->lock);
+ list_move(&page->list, &node->sgx_poison_page_list);
+ spin_unlock(&node->lock);
+
+ continue;
+ }
+
ret = __eremove(sgx_get_epc_virt_addr(page));
if (!ret) {
/*
@@ -403,14 +423,15 @@ skip:
spin_lock(&node->lock);
list_add_tail(&epc_page->list, &node->free_page_list);
- sgx_nr_free_pages++;
spin_unlock(&node->lock);
+ atomic_long_inc(&sgx_nr_free_pages);
}
}
static bool sgx_should_reclaim(unsigned long watermark)
{
- return sgx_nr_free_pages < watermark && !list_empty(&sgx_active_page_list);
+ return atomic_long_read(&sgx_nr_free_pages) < watermark &&
+ !list_empty(&sgx_active_page_list);
}
static int ksgxd(void *p)
@@ -471,9 +492,10 @@ static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
list_del_init(&page->list);
- sgx_nr_free_pages--;
+ page->flags = 0;
spin_unlock(&node->lock);
+ atomic_long_dec(&sgx_nr_free_pages);
return page;
}
@@ -624,10 +646,15 @@ void sgx_free_epc_page(struct sgx_epc_page *page)
spin_lock(&node->lock);
- list_add_tail(&page->list, &node->free_page_list);
- sgx_nr_free_pages++;
+ page->owner = NULL;
+ if (page->poison)
+ list_add(&page->list, &node->sgx_poison_page_list);
+ else
+ list_add_tail(&page->list, &node->free_page_list);
+ page->flags = SGX_EPC_PAGE_IS_FREE;
spin_unlock(&node->lock);
+ atomic_long_inc(&sgx_nr_free_pages);
}
static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
@@ -648,17 +675,102 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
}
section->phys_addr = phys_addr;
+ xa_store_range(&sgx_epc_address_space, section->phys_addr,
+ phys_addr + size - 1, section, GFP_KERNEL);
for (i = 0; i < nr_pages; i++) {
section->pages[i].section = index;
section->pages[i].flags = 0;
section->pages[i].owner = NULL;
+ section->pages[i].poison = 0;
list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
}
return true;
}
+bool arch_is_platform_page(u64 paddr)
+{
+ return !!xa_load(&sgx_epc_address_space, paddr);
+}
+EXPORT_SYMBOL_GPL(arch_is_platform_page);
+
+static struct sgx_epc_page *sgx_paddr_to_page(u64 paddr)
+{
+ struct sgx_epc_section *section;
+
+ section = xa_load(&sgx_epc_address_space, paddr);
+ if (!section)
+ return NULL;
+
+ return &section->pages[PFN_DOWN(paddr - section->phys_addr)];
+}
+
+/*
+ * Called in process context to handle a hardware reported
+ * error in an SGX EPC page.
+ * If the MF_ACTION_REQUIRED bit is set in flags, then the
+ * context is the task that consumed the poison data. Otherwise
+ * this is called from a kernel thread unrelated to the page.
+ */
+int arch_memory_failure(unsigned long pfn, int flags)
+{
+ struct sgx_epc_page *page = sgx_paddr_to_page(pfn << PAGE_SHIFT);
+ struct sgx_epc_section *section;
+ struct sgx_numa_node *node;
+
+ /*
+ * mm/memory-failure.c calls this routine for all errors
+ * where there isn't a "struct page" for the address. But that
+ * includes other address ranges besides SGX.
+ */
+ if (!page)
+ return -ENXIO;
+
+ /*
+ * If poison was consumed synchronously. Send a SIGBUS to
+ * the task. Hardware has already exited the SGX enclave and
+ * will not allow re-entry to an enclave that has a memory
+ * error. The signal may help the task understand why the
+ * enclave is broken.
+ */
+ if (flags & MF_ACTION_REQUIRED)
+ force_sig(SIGBUS);
+
+ section = &sgx_epc_sections[page->section];
+ node = section->node;
+
+ spin_lock(&node->lock);
+
+ /* Already poisoned? Nothing more to do */
+ if (page->poison)
+ goto out;
+
+ page->poison = 1;
+
+ /*
+ * If the page is on a free list, move it to the per-node
+ * poison page list.
+ */
+ if (page->flags & SGX_EPC_PAGE_IS_FREE) {
+ list_move(&page->list, &node->sgx_poison_page_list);
+ goto out;
+ }
+
+ /*
+ * TBD: Add additional plumbing to enable pre-emptive
+ * action for asynchronous poison notification. Until
+ * then just hope that the poison:
+ * a) is not accessed - sgx_free_epc_page() will deal with it
+ * when the user gives it back
+ * b) results in a recoverable machine check rather than
+ * a fatal one
+ */
+out:
+ spin_unlock(&node->lock);
+ return 0;
+}
+
/**
* A section metric is concatenated in a way that @low bits 12-31 define the
* bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the
@@ -670,6 +782,48 @@ static inline u64 __init sgx_calc_section_metric(u64 low, u64 high)
((high & GENMASK_ULL(19, 0)) << 32);
}
+#ifdef CONFIG_NUMA
+static ssize_t sgx_total_bytes_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lu\n", sgx_numa_nodes[dev->id].size);
+}
+static DEVICE_ATTR_RO(sgx_total_bytes);
+
+static umode_t arch_node_attr_is_visible(struct kobject *kobj,
+ struct attribute *attr, int idx)
+{
+ /* Make all x86/ attributes invisible when SGX is not initialized: */
+ if (nodes_empty(sgx_numa_mask))
+ return 0;
+
+ return attr->mode;
+}
+
+static struct attribute *arch_node_dev_attrs[] = {
+ &dev_attr_sgx_total_bytes.attr,
+ NULL,
+};
+
+const struct attribute_group arch_node_dev_group = {
+ .name = "x86",
+ .attrs = arch_node_dev_attrs,
+ .is_visible = arch_node_attr_is_visible,
+};
+
+static void __init arch_update_sysfs_visibility(int nid)
+{
+ struct node *node = node_devices[nid];
+ int ret;
+
+ ret = sysfs_update_group(&node->dev.kobj, &arch_node_dev_group);
+
+ if (ret)
+ pr_err("sysfs update failed (%d), files may be invisible", ret);
+}
+#else /* !CONFIG_NUMA */
+static void __init arch_update_sysfs_visibility(int nid) {}
+#endif
+
static bool __init sgx_page_cache_init(void)
{
u32 eax, ebx, ecx, edx, type;
@@ -713,10 +867,16 @@ static bool __init sgx_page_cache_init(void)
if (!node_isset(nid, sgx_numa_mask)) {
spin_lock_init(&sgx_numa_nodes[nid].lock);
INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list);
+ INIT_LIST_HEAD(&sgx_numa_nodes[nid].sgx_poison_page_list);
node_set(nid, sgx_numa_mask);
+ sgx_numa_nodes[nid].size = 0;
+
+ /* Make SGX-specific node sysfs files visible: */
+ arch_update_sysfs_visibility(nid);
}
sgx_epc_sections[i].node = &sgx_numa_nodes[nid];
+ sgx_numa_nodes[nid].size += size;
sgx_nr_epc_sections++;
}
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index 4628acec0009..0f17def9fe6f 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -26,9 +26,13 @@
/* Pages, which are being tracked by the page reclaimer. */
#define SGX_EPC_PAGE_RECLAIMER_TRACKED BIT(0)
+/* Pages on free list */
+#define SGX_EPC_PAGE_IS_FREE BIT(1)
+
struct sgx_epc_page {
unsigned int section;
- unsigned int flags;
+ u16 flags;
+ u16 poison;
struct sgx_encl_page *owner;
struct list_head list;
};
@@ -39,6 +43,8 @@ struct sgx_epc_page {
*/
struct sgx_numa_node {
struct list_head free_page_list;
+ struct list_head sgx_poison_page_list;
+ unsigned long size;
spinlock_t lock;
};
diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
index 64511c4a5200..6a77a14eee38 100644
--- a/arch/x86/kernel/cpu/sgx/virt.c
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -111,10 +111,8 @@ static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}
-static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
+static int sgx_vepc_remove_page(struct sgx_epc_page *epc_page)
{
- int ret;
-
/*
* Take a previously guest-owned EPC page and return it to the
* general EPC page pool.
@@ -124,7 +122,12 @@ static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
* case that a guest properly EREMOVE'd this page, a superfluous
* EREMOVE is harmless.
*/
- ret = __eremove(sgx_get_epc_virt_addr(epc_page));
+ return __eremove(sgx_get_epc_virt_addr(epc_page));
+}
+
+static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
+{
+ int ret = sgx_vepc_remove_page(epc_page);
if (ret) {
/*
* Only SGX_CHILD_PRESENT is expected, which is because of
@@ -144,10 +147,44 @@ static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
}
sgx_free_epc_page(epc_page);
-
return 0;
}
+static long sgx_vepc_remove_all(struct sgx_vepc *vepc)
+{
+ struct sgx_epc_page *entry;
+ unsigned long index;
+ long failures = 0;
+
+ xa_for_each(&vepc->page_array, index, entry) {
+ int ret = sgx_vepc_remove_page(entry);
+ if (ret) {
+ if (ret == SGX_CHILD_PRESENT) {
+ /* The page is a SECS, userspace will retry. */
+ failures++;
+ } else {
+ /*
+ * Report errors due to #GP or SGX_ENCLAVE_ACT; do not
+ * WARN, as userspace can induce said failures by
+ * calling the ioctl concurrently on multiple vEPCs or
+ * while one or more CPUs is running the enclave. Only
+ * a #PF on EREMOVE indicates a kernel/hardware issue.
+ */
+ WARN_ON_ONCE(encls_faulted(ret) &&
+ ENCLS_TRAPNR(ret) != X86_TRAP_GP);
+ return -EBUSY;
+ }
+ }
+ cond_resched();
+ }
+
+ /*
+ * Return the number of SECS pages that failed to be removed, so
+ * userspace knows that it has to retry.
+ */
+ return failures;
+}
+
static int sgx_vepc_release(struct inode *inode, struct file *file)
{
struct sgx_vepc *vepc = file->private_data;
@@ -233,9 +270,27 @@ static int sgx_vepc_open(struct inode *inode, struct file *file)
return 0;
}
+static long sgx_vepc_ioctl(struct file *file,
+ unsigned int cmd, unsigned long arg)
+{
+ struct sgx_vepc *vepc = file->private_data;
+
+ switch (cmd) {
+ case SGX_IOC_VEPC_REMOVE_ALL:
+ if (arg)
+ return -EINVAL;
+ return sgx_vepc_remove_all(vepc);
+
+ default:
+ return -ENOTTY;
+ }
+}
+
static const struct file_operations sgx_vepc_fops = {
.owner = THIS_MODULE,
.open = sgx_vepc_open,
+ .unlocked_ioctl = sgx_vepc_ioctl,
+ .compat_ioctl = sgx_vepc_ioctl,
.release = sgx_vepc_release,
.mmap = sgx_vepc_mmap,
};
diff --git a/arch/x86/kernel/cpu/vortex.c b/arch/x86/kernel/cpu/vortex.c
new file mode 100644
index 000000000000..e2685470ba94
--- /dev/null
+++ b/arch/x86/kernel/cpu/vortex.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <asm/processor.h>
+#include "cpu.h"
+
+/*
+ * No special init required for Vortex processors.
+ */
+
+static const struct cpu_dev vortex_cpu_dev = {
+ .c_vendor = "Vortex",
+ .c_ident = { "Vortex86 SoC" },
+ .legacy_models = {
+ {
+ .family = 5,
+ .model_names = {
+ [2] = "Vortex86DX",
+ [8] = "Vortex86MX",
+ },
+ },
+ {
+ .family = 6,
+ .model_names = {
+ /*
+ * Both the Vortex86EX and the Vortex86EX2
+ * have the same family and model id.
+ *
+ * However, the -EX2 supports the product name
+ * CPUID call, so this name will only be used
+ * for the -EX, which does not.
+ */
+ [0] = "Vortex86EX",
+ },
+ },
+ },
+ .c_x86_vendor = X86_VENDOR_VORTEX,
+};
+
+cpu_dev_register(vortex_cpu_dev);
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 045e82e8945b..a7f617a3981d 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -10,6 +10,7 @@
#include <linux/crash_dump.h>
#include <linux/uaccess.h>
#include <linux/io.h>
+#include <linux/cc_platform.h>
static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
unsigned long offset, int userbuf,
@@ -73,5 +74,6 @@ ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos)
{
- return read_from_oldmem(buf, count, ppos, 0, sev_active());
+ return read_from_oldmem(buf, count, ppos, 0,
+ cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT));
}
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 6a4cb71c2498..5cd51f25f446 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -31,11 +31,6 @@ char __initdata cmd_line[COMMAND_LINE_SIZE];
int __initdata of_ioapic;
-void __init early_init_dt_scan_chosen_arch(unsigned long node)
-{
- BUG();
-}
-
void __init early_init_dt_add_memory_arch(u64 base, u64 size)
{
BUG();
@@ -139,12 +134,11 @@ static void __init dtb_cpu_setup(void)
{
struct device_node *dn;
u32 apic_id, version;
- int ret;
version = GET_APIC_VERSION(apic_read(APIC_LVR));
for_each_of_cpu_node(dn) {
- ret = of_property_read_u32(dn, "reg", &apic_id);
- if (ret < 0) {
+ apic_id = of_get_cpu_hwid(dn, 0);
+ if (apic_id == ~0U) {
pr_warn("%pOF: missing local APIC ID\n", dn);
continue;
}
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
index d1d49e3d536b..3b58d8703094 100644
--- a/arch/x86/kernel/doublefault_32.c
+++ b/arch/x86/kernel/doublefault_32.c
@@ -77,9 +77,6 @@ asmlinkage noinstr void __noreturn doublefault_shim(void)
* some way to reconstruct CR3. We could make a credible guess based
* on cpu_tlbstate, but that would be racy and would not account for
* PTI.
- *
- * Instead, don't bother. We can return through
- * rewind_stack_do_exit() instead.
*/
panic("cannot return from double fault\n");
}
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 5601b95944fa..6c5defd6569a 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -32,9 +32,15 @@ const char *stack_type_name(enum stack_type type)
{
BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
+ if (type == STACK_TYPE_TASK)
+ return "TASK";
+
if (type == STACK_TYPE_IRQ)
return "IRQ";
+ if (type == STACK_TYPE_SOFTIRQ)
+ return "SOFTIRQ";
+
if (type == STACK_TYPE_ENTRY) {
/*
* On 64-bit, we have a generic entry stack that we
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 38837dad46e6..fd2d3ab38ebb 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -554,6 +554,7 @@ static const struct pci_device_id intel_early_ids[] __initconst = {
INTEL_RKL_IDS(&gen11_early_ops),
INTEL_ADLS_IDS(&gen11_early_ops),
INTEL_ADLP_IDS(&gen11_early_ops),
+ INTEL_RPLS_IDS(&gen11_early_ops),
};
struct resource intel_graphics_stolen_res __ro_after_init = DEFINE_RES_MEM(0, 0);
@@ -714,12 +715,6 @@ static struct chipset early_qrk[] __initdata = {
*/
{ PCI_VENDOR_ID_INTEL, 0x0f00,
PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
- { PCI_VENDOR_ID_INTEL, 0x3e20,
- PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
- { PCI_VENDOR_ID_INTEL, 0x3ec4,
- PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
- { PCI_VENDOR_ID_INTEL, 0x8a12,
- PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
{ PCI_VENDOR_ID_BROADCOM, 0x4331,
PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset},
{}
diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c
index 2954fab15e51..794e70151203 100644
--- a/arch/x86/kernel/fpu/bugs.c
+++ b/arch/x86/kernel/fpu/bugs.c
@@ -2,7 +2,7 @@
/*
* x86 FPU bug checks:
*/
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
/*
* Boot time CPU/FPU FDIV bug detection code:
diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h
new file mode 100644
index 000000000000..958accf2ccf0
--- /dev/null
+++ b/arch/x86/kernel/fpu/context.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __X86_KERNEL_FPU_CONTEXT_H
+#define __X86_KERNEL_FPU_CONTEXT_H
+
+#include <asm/fpu/xstate.h>
+#include <asm/trace/fpu.h>
+
+/* Functions related to FPU context tracking */
+
+/*
+ * The in-register FPU state for an FPU context on a CPU is assumed to be
+ * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx
+ * matches the FPU.
+ *
+ * If the FPU register state is valid, the kernel can skip restoring the
+ * FPU state from memory.
+ *
+ * Any code that clobbers the FPU registers or updates the in-memory
+ * FPU state for a task MUST let the rest of the kernel know that the
+ * FPU registers are no longer valid for this task.
+ *
+ * Either one of these invalidation functions is enough. Invalidate
+ * a resource you control: CPU if using the CPU for something else
+ * (with preemption disabled), FPU for the current task, or a task that
+ * is prevented from running by the current task.
+ */
+static inline void __cpu_invalidate_fpregs_state(void)
+{
+ __this_cpu_write(fpu_fpregs_owner_ctx, NULL);
+}
+
+static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu)
+{
+ fpu->last_cpu = -1;
+}
+
+static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu)
+{
+ return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu;
+}
+
+static inline void fpregs_deactivate(struct fpu *fpu)
+{
+ __this_cpu_write(fpu_fpregs_owner_ctx, NULL);
+ trace_x86_fpu_regs_deactivated(fpu);
+}
+
+static inline void fpregs_activate(struct fpu *fpu)
+{
+ __this_cpu_write(fpu_fpregs_owner_ctx, fpu);
+ trace_x86_fpu_regs_activated(fpu);
+}
+
+/* Internal helper for switch_fpu_return() and signal frame setup */
+static inline void fpregs_restore_userregs(void)
+{
+ struct fpu *fpu = &current->thread.fpu;
+ int cpu = smp_processor_id();
+
+ if (WARN_ON_ONCE(current->flags & PF_KTHREAD))
+ return;
+
+ if (!fpregs_state_valid(fpu, cpu)) {
+ /*
+ * This restores _all_ xstate which has not been
+ * established yet.
+ *
+ * If PKRU is enabled, then the PKRU value is already
+ * correct because it was either set in switch_to() or in
+ * flush_thread(). So it is excluded because it might be
+ * not up to date in current->thread.fpu.xsave state.
+ *
+ * XFD state is handled in restore_fpregs_from_fpstate().
+ */
+ restore_fpregs_from_fpstate(fpu->fpstate, XFEATURE_MASK_FPSTATE);
+
+ fpregs_activate(fpu);
+ fpu->last_cpu = cpu;
+ }
+ clear_thread_flag(TIF_NEED_FPU_LOAD);
+}
+
+#endif
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 7ada7bd03a32..8dea01ffc5c1 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -6,8 +6,9 @@
* General FPU state handling cleanups
* Gareth Hughes <gareth@valinux.com>, May 2000
*/
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
#include <asm/fpu/regset.h>
+#include <asm/fpu/sched.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/types.h>
#include <asm/traps.h>
@@ -15,15 +16,30 @@
#include <linux/hardirq.h>
#include <linux/pkeys.h>
+#include <linux/vmalloc.h>
+
+#include "context.h"
+#include "internal.h"
+#include "legacy.h"
+#include "xstate.h"
#define CREATE_TRACE_POINTS
#include <asm/trace/fpu.h>
+#ifdef CONFIG_X86_64
+DEFINE_STATIC_KEY_FALSE(__fpu_state_size_dynamic);
+DEFINE_PER_CPU(u64, xfd_state);
+#endif
+
+/* The FPU state configuration data for kernel and user space */
+struct fpu_state_config fpu_kernel_cfg __ro_after_init;
+struct fpu_state_config fpu_user_cfg __ro_after_init;
+
/*
* Represents the initial FPU state. It's mostly (but not completely) zeroes,
* depending on the FPU hardware format:
*/
-union fpregs_state init_fpstate __ro_after_init;
+struct fpstate init_fpstate __ro_after_init;
/*
* Track whether the kernel is using the FPU state
@@ -83,7 +99,20 @@ bool irq_fpu_usable(void)
EXPORT_SYMBOL(irq_fpu_usable);
/*
- * Save the FPU register state in fpu->state. The register state is
+ * Track AVX512 state use because it is known to slow the max clock
+ * speed of the core.
+ */
+static void update_avx_timestamp(struct fpu *fpu)
+{
+
+#define AVX512_TRACKING_MASK (XFEATURE_MASK_ZMM_Hi256 | XFEATURE_MASK_Hi16_ZMM)
+
+ if (fpu->fpstate->regs.xsave.header.xfeatures & AVX512_TRACKING_MASK)
+ fpu->avx512_timestamp = jiffies;
+}
+
+/*
+ * Save the FPU register state in fpu->fpstate->regs. The register state is
* preserved.
*
* Must be called with fpregs_lock() held.
@@ -99,19 +128,13 @@ EXPORT_SYMBOL(irq_fpu_usable);
void save_fpregs_to_fpstate(struct fpu *fpu)
{
if (likely(use_xsave())) {
- os_xsave(&fpu->state.xsave);
-
- /*
- * AVX512 state is tracked here because its use is
- * known to slow the max clock speed of the core.
- */
- if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512)
- fpu->avx512_timestamp = jiffies;
+ os_xsave(fpu->fpstate);
+ update_avx_timestamp(fpu);
return;
}
if (likely(use_fxsr())) {
- fxsave(&fpu->state.fxsave);
+ fxsave(&fpu->fpstate->regs.fxsave);
return;
}
@@ -119,12 +142,11 @@ void save_fpregs_to_fpstate(struct fpu *fpu)
* Legacy FPU register saving, FNSAVE always clears FPU registers,
* so we have to reload them from the memory state.
*/
- asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave));
- frstor(&fpu->state.fsave);
+ asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->fpstate->regs.fsave));
+ frstor(&fpu->fpstate->regs.fsave);
}
-EXPORT_SYMBOL(save_fpregs_to_fpstate);
-void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask)
+void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask)
{
/*
* AMD K7/K8 and later CPUs up to Zen don't save/restore
@@ -141,15 +163,265 @@ void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask)
}
if (use_xsave()) {
- os_xrstor(&fpstate->xsave, mask);
+ /*
+ * Dynamically enabled features are enabled in XCR0, but
+ * usage requires also that the corresponding bits in XFD
+ * are cleared. If the bits are set then using a related
+ * instruction will raise #NM. This allows to do the
+ * allocation of the larger FPU buffer lazy from #NM or if
+ * the task has no permission to kill it which would happen
+ * via #UD if the feature is disabled in XCR0.
+ *
+ * XFD state is following the same life time rules as
+ * XSTATE and to restore state correctly XFD has to be
+ * updated before XRSTORS otherwise the component would
+ * stay in or go into init state even if the bits are set
+ * in fpstate::regs::xsave::xfeatures.
+ */
+ xfd_update_state(fpstate);
+
+ /*
+ * Restoring state always needs to modify all features
+ * which are in @mask even if the current task cannot use
+ * extended features.
+ *
+ * So fpstate->xfeatures cannot be used here, because then
+ * a feature for which the task has no permission but was
+ * used by the previous task would not go into init state.
+ */
+ mask = fpu_kernel_cfg.max_features & mask;
+
+ os_xrstor(fpstate, mask);
} else {
if (use_fxsr())
- fxrstor(&fpstate->fxsave);
+ fxrstor(&fpstate->regs.fxsave);
else
- frstor(&fpstate->fsave);
+ frstor(&fpstate->regs.fsave);
+ }
+}
+
+void fpu_reset_from_exception_fixup(void)
+{
+ restore_fpregs_from_fpstate(&init_fpstate, XFEATURE_MASK_FPSTATE);
+}
+
+#if IS_ENABLED(CONFIG_KVM)
+static void __fpstate_reset(struct fpstate *fpstate, u64 xfd);
+
+static void fpu_init_guest_permissions(struct fpu_guest *gfpu)
+{
+ struct fpu_state_perm *fpuperm;
+ u64 perm;
+
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return;
+
+ spin_lock_irq(&current->sighand->siglock);
+ fpuperm = &current->group_leader->thread.fpu.guest_perm;
+ perm = fpuperm->__state_perm;
+
+ /* First fpstate allocation locks down permissions. */
+ WRITE_ONCE(fpuperm->__state_perm, perm | FPU_GUEST_PERM_LOCKED);
+
+ spin_unlock_irq(&current->sighand->siglock);
+
+ gfpu->perm = perm & ~FPU_GUEST_PERM_LOCKED;
+}
+
+bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu)
+{
+ struct fpstate *fpstate;
+ unsigned int size;
+
+ size = fpu_user_cfg.default_size + ALIGN(offsetof(struct fpstate, regs), 64);
+ fpstate = vzalloc(size);
+ if (!fpstate)
+ return false;
+
+ /* Leave xfd to 0 (the reset value defined by spec) */
+ __fpstate_reset(fpstate, 0);
+ fpstate_init_user(fpstate);
+ fpstate->is_valloc = true;
+ fpstate->is_guest = true;
+
+ gfpu->fpstate = fpstate;
+ gfpu->xfeatures = fpu_user_cfg.default_features;
+ gfpu->perm = fpu_user_cfg.default_features;
+ gfpu->uabi_size = fpu_user_cfg.default_size;
+ fpu_init_guest_permissions(gfpu);
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(fpu_alloc_guest_fpstate);
+
+void fpu_free_guest_fpstate(struct fpu_guest *gfpu)
+{
+ struct fpstate *fps = gfpu->fpstate;
+
+ if (!fps)
+ return;
+
+ if (WARN_ON_ONCE(!fps->is_valloc || !fps->is_guest || fps->in_use))
+ return;
+
+ gfpu->fpstate = NULL;
+ vfree(fps);
+}
+EXPORT_SYMBOL_GPL(fpu_free_guest_fpstate);
+
+/*
+ * fpu_enable_guest_xfd_features - Check xfeatures against guest perm and enable
+ * @guest_fpu: Pointer to the guest FPU container
+ * @xfeatures: Features requested by guest CPUID
+ *
+ * Enable all dynamic xfeatures according to guest perm and requested CPUID.
+ *
+ * Return: 0 on success, error code otherwise
+ */
+int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures)
+{
+ lockdep_assert_preemption_enabled();
+
+ /* Nothing to do if all requested features are already enabled. */
+ xfeatures &= ~guest_fpu->xfeatures;
+ if (!xfeatures)
+ return 0;
+
+ return __xfd_enable_feature(xfeatures, guest_fpu);
+}
+EXPORT_SYMBOL_GPL(fpu_enable_guest_xfd_features);
+
+#ifdef CONFIG_X86_64
+void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd)
+{
+ fpregs_lock();
+ guest_fpu->fpstate->xfd = xfd;
+ if (guest_fpu->fpstate->in_use)
+ xfd_update_state(guest_fpu->fpstate);
+ fpregs_unlock();
+}
+EXPORT_SYMBOL_GPL(fpu_update_guest_xfd);
+
+/**
+ * fpu_sync_guest_vmexit_xfd_state - Synchronize XFD MSR and software state
+ *
+ * Must be invoked from KVM after a VMEXIT before enabling interrupts when
+ * XFD write emulation is disabled. This is required because the guest can
+ * freely modify XFD and the state at VMEXIT is not guaranteed to be the
+ * same as the state on VMENTER. So software state has to be udpated before
+ * any operation which depends on it can take place.
+ *
+ * Note: It can be invoked unconditionally even when write emulation is
+ * enabled for the price of a then pointless MSR read.
+ */
+void fpu_sync_guest_vmexit_xfd_state(void)
+{
+ struct fpstate *fps = current->thread.fpu.fpstate;
+
+ lockdep_assert_irqs_disabled();
+ if (fpu_state_size_dynamic()) {
+ rdmsrl(MSR_IA32_XFD, fps->xfd);
+ __this_cpu_write(xfd_state, fps->xfd);
+ }
+}
+EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state);
+#endif /* CONFIG_X86_64 */
+
+int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest)
+{
+ struct fpstate *guest_fps = guest_fpu->fpstate;
+ struct fpu *fpu = &current->thread.fpu;
+ struct fpstate *cur_fps = fpu->fpstate;
+
+ fpregs_lock();
+ if (!cur_fps->is_confidential && !test_thread_flag(TIF_NEED_FPU_LOAD))
+ save_fpregs_to_fpstate(fpu);
+
+ /* Swap fpstate */
+ if (enter_guest) {
+ fpu->__task_fpstate = cur_fps;
+ fpu->fpstate = guest_fps;
+ guest_fps->in_use = true;
+ } else {
+ guest_fps->in_use = false;
+ fpu->fpstate = fpu->__task_fpstate;
+ fpu->__task_fpstate = NULL;
}
+
+ cur_fps = fpu->fpstate;
+
+ if (!cur_fps->is_confidential) {
+ /* Includes XFD update */
+ restore_fpregs_from_fpstate(cur_fps, XFEATURE_MASK_FPSTATE);
+ } else {
+ /*
+ * XSTATE is restored by firmware from encrypted
+ * memory. Make sure XFD state is correct while
+ * running with guest fpstate
+ */
+ xfd_update_state(cur_fps);
+ }
+
+ fpregs_mark_activate();
+ fpregs_unlock();
+ return 0;
}
-EXPORT_SYMBOL_GPL(__restore_fpregs_from_fpstate);
+EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate);
+
+void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf,
+ unsigned int size, u32 pkru)
+{
+ struct fpstate *kstate = gfpu->fpstate;
+ union fpregs_state *ustate = buf;
+ struct membuf mb = { .p = buf, .left = size };
+
+ if (cpu_feature_enabled(X86_FEATURE_XSAVE)) {
+ __copy_xstate_to_uabi_buf(mb, kstate, pkru, XSTATE_COPY_XSAVE);
+ } else {
+ memcpy(&ustate->fxsave, &kstate->regs.fxsave,
+ sizeof(ustate->fxsave));
+ /* Make it restorable on a XSAVE enabled host */
+ ustate->xsave.header.xfeatures = XFEATURE_MASK_FPSSE;
+ }
+}
+EXPORT_SYMBOL_GPL(fpu_copy_guest_fpstate_to_uabi);
+
+int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf,
+ u64 xcr0, u32 *vpkru)
+{
+ struct fpstate *kstate = gfpu->fpstate;
+ const union fpregs_state *ustate = buf;
+ struct pkru_state *xpkru;
+ int ret;
+
+ if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) {
+ if (ustate->xsave.header.xfeatures & ~XFEATURE_MASK_FPSSE)
+ return -EINVAL;
+ if (ustate->fxsave.mxcsr & ~mxcsr_feature_mask)
+ return -EINVAL;
+ memcpy(&kstate->regs.fxsave, &ustate->fxsave, sizeof(ustate->fxsave));
+ return 0;
+ }
+
+ if (ustate->xsave.header.xfeatures & ~xcr0)
+ return -EINVAL;
+
+ ret = copy_uabi_from_kernel_to_xstate(kstate, ustate);
+ if (ret)
+ return ret;
+
+ /* Retrieve PKRU if not in init state */
+ if (kstate->regs.xsave.header.xfeatures & XFEATURE_MASK_PKRU) {
+ xpkru = get_xsave_addr(&kstate->regs.xsave, XFEATURE_PKRU);
+ *vpkru = xpkru->pkru;
+ }
+
+ /* Ensure that XCOMP_BV is set up for XSAVES */
+ xstate_init_xcomp_bv(&kstate->regs.xsave, kstate->xfeatures);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate);
+#endif /* CONFIG_KVM */
void kernel_fpu_begin_mask(unsigned int kfpu_mask)
{
@@ -203,52 +475,91 @@ void fpu_sync_fpstate(struct fpu *fpu)
fpregs_unlock();
}
-static inline void fpstate_init_xstate(struct xregs_state *xsave)
+static inline unsigned int init_fpstate_copy_size(void)
{
- /*
- * XRSTORS requires these bits set in xcomp_bv, or it will
- * trigger #GP:
- */
- xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask_all;
+ if (!use_xsave())
+ return fpu_kernel_cfg.default_size;
+
+ /* XSAVE(S) just needs the legacy and the xstate header part */
+ return sizeof(init_fpstate.regs.xsave);
}
-static inline void fpstate_init_fxstate(struct fxregs_state *fx)
+static inline void fpstate_init_fxstate(struct fpstate *fpstate)
{
- fx->cwd = 0x37f;
- fx->mxcsr = MXCSR_DEFAULT;
+ fpstate->regs.fxsave.cwd = 0x37f;
+ fpstate->regs.fxsave.mxcsr = MXCSR_DEFAULT;
}
/*
* Legacy x87 fpstate state init:
*/
-static inline void fpstate_init_fstate(struct fregs_state *fp)
+static inline void fpstate_init_fstate(struct fpstate *fpstate)
{
- fp->cwd = 0xffff037fu;
- fp->swd = 0xffff0000u;
- fp->twd = 0xffffffffu;
- fp->fos = 0xffff0000u;
+ fpstate->regs.fsave.cwd = 0xffff037fu;
+ fpstate->regs.fsave.swd = 0xffff0000u;
+ fpstate->regs.fsave.twd = 0xffffffffu;
+ fpstate->regs.fsave.fos = 0xffff0000u;
}
-void fpstate_init(union fpregs_state *state)
+/*
+ * Used in two places:
+ * 1) Early boot to setup init_fpstate for non XSAVE systems
+ * 2) fpu_init_fpstate_user() which is invoked from KVM
+ */
+void fpstate_init_user(struct fpstate *fpstate)
{
- if (!static_cpu_has(X86_FEATURE_FPU)) {
- fpstate_init_soft(&state->soft);
+ if (!cpu_feature_enabled(X86_FEATURE_FPU)) {
+ fpstate_init_soft(&fpstate->regs.soft);
return;
}
- memset(state, 0, fpu_kernel_xstate_size);
+ xstate_init_xcomp_bv(&fpstate->regs.xsave, fpstate->xfeatures);
- if (static_cpu_has(X86_FEATURE_XSAVES))
- fpstate_init_xstate(&state->xsave);
- if (static_cpu_has(X86_FEATURE_FXSR))
- fpstate_init_fxstate(&state->fxsave);
+ if (cpu_feature_enabled(X86_FEATURE_FXSR))
+ fpstate_init_fxstate(fpstate);
else
- fpstate_init_fstate(&state->fsave);
+ fpstate_init_fstate(fpstate);
+}
+
+static void __fpstate_reset(struct fpstate *fpstate, u64 xfd)
+{
+ /* Initialize sizes and feature masks */
+ fpstate->size = fpu_kernel_cfg.default_size;
+ fpstate->user_size = fpu_user_cfg.default_size;
+ fpstate->xfeatures = fpu_kernel_cfg.default_features;
+ fpstate->user_xfeatures = fpu_user_cfg.default_features;
+ fpstate->xfd = xfd;
+}
+
+void fpstate_reset(struct fpu *fpu)
+{
+ /* Set the fpstate pointer to the default fpstate */
+ fpu->fpstate = &fpu->__fpstate;
+ __fpstate_reset(fpu->fpstate, init_fpstate.xfd);
+
+ /* Initialize the permission related info in fpu */
+ fpu->perm.__state_perm = fpu_kernel_cfg.default_features;
+ fpu->perm.__state_size = fpu_kernel_cfg.default_size;
+ fpu->perm.__user_state_size = fpu_user_cfg.default_size;
+ /* Same defaults for guests */
+ fpu->guest_perm = fpu->perm;
+}
+
+static inline void fpu_inherit_perms(struct fpu *dst_fpu)
+{
+ if (fpu_state_size_dynamic()) {
+ struct fpu *src_fpu = &current->group_leader->thread.fpu;
+
+ spin_lock_irq(&current->sighand->siglock);
+ /* Fork also inherits the permissions of the parent */
+ dst_fpu->perm = src_fpu->perm;
+ dst_fpu->guest_perm = src_fpu->guest_perm;
+ spin_unlock_irq(&current->sighand->siglock);
+ }
}
-EXPORT_SYMBOL_GPL(fpstate_init);
/* Clone current's FPU state on fork */
-int fpu_clone(struct task_struct *dst)
+int fpu_clone(struct task_struct *dst, unsigned long clone_flags)
{
struct fpu *src_fpu = &current->thread.fpu;
struct fpu *dst_fpu = &dst->thread.fpu;
@@ -256,30 +567,51 @@ int fpu_clone(struct task_struct *dst)
/* The new task's FPU state cannot be valid in the hardware. */
dst_fpu->last_cpu = -1;
+ fpstate_reset(dst_fpu);
+
if (!cpu_feature_enabled(X86_FEATURE_FPU))
return 0;
/*
- * Don't let 'init optimized' areas of the XSAVE area
- * leak into the child task:
+ * Enforce reload for user space tasks and prevent kernel threads
+ * from trying to save the FPU registers on context switch.
*/
- memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size);
+ set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD);
/*
- * If the FPU registers are not owned by current just memcpy() the
- * state. Otherwise save the FPU registers directly into the
- * child's FPU context, without any memory-to-memory copying.
+ * No FPU state inheritance for kernel threads and IO
+ * worker threads.
+ */
+ if (dst->flags & (PF_KTHREAD | PF_IO_WORKER)) {
+ /* Clear out the minimal state */
+ memcpy(&dst_fpu->fpstate->regs, &init_fpstate.regs,
+ init_fpstate_copy_size());
+ return 0;
+ }
+
+ /*
+ * If a new feature is added, ensure all dynamic features are
+ * caller-saved from here!
+ */
+ BUILD_BUG_ON(XFEATURE_MASK_USER_DYNAMIC != XFEATURE_MASK_XTILE_DATA);
+
+ /*
+ * Save the default portion of the current FPU state into the
+ * clone. Assume all dynamic features to be defined as caller-
+ * saved, which enables skipping both the expansion of fpstate
+ * and the copying of any dynamic state.
+ *
+ * Do not use memcpy() when TIF_NEED_FPU_LOAD is set because
+ * copying is not valid when current uses non-default states.
*/
fpregs_lock();
if (test_thread_flag(TIF_NEED_FPU_LOAD))
- memcpy(&dst_fpu->state, &src_fpu->state, fpu_kernel_xstate_size);
-
- else
- save_fpregs_to_fpstate(dst_fpu);
+ fpregs_restore_userregs();
+ save_fpregs_to_fpstate(dst_fpu);
+ if (!(clone_flags & CLONE_THREAD))
+ fpu_inherit_perms(dst_fpu);
fpregs_unlock();
- set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD);
-
trace_x86_fpu_copy_src(src_fpu);
trace_x86_fpu_copy_dst(dst_fpu);
@@ -287,6 +619,16 @@ int fpu_clone(struct task_struct *dst)
}
/*
+ * Whitelist the FPU register state embedded into task_struct for hardened
+ * usercopy.
+ */
+void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size)
+{
+ *offset = offsetof(struct thread_struct, fpu.__fpstate.regs);
+ *size = fpu_kernel_cfg.default_size;
+}
+
+/*
* Drops current FPU state: deactivates the fpregs and
* the fpstate. NOTE: it still leaves previous contents
* in the fpregs in the eager-FPU case.
@@ -319,28 +661,19 @@ void fpu__drop(struct fpu *fpu)
static inline void restore_fpregs_from_init_fpstate(u64 features_mask)
{
if (use_xsave())
- os_xrstor(&init_fpstate.xsave, features_mask);
+ os_xrstor(&init_fpstate, features_mask);
else if (use_fxsr())
- fxrstor(&init_fpstate.fxsave);
+ fxrstor(&init_fpstate.regs.fxsave);
else
- frstor(&init_fpstate.fsave);
+ frstor(&init_fpstate.regs.fsave);
pkru_write_default();
}
-static inline unsigned int init_fpstate_copy_size(void)
-{
- if (!use_xsave())
- return fpu_kernel_xstate_size;
-
- /* XSAVE(S) just needs the legacy and the xstate header part */
- return sizeof(init_fpstate.xsave);
-}
-
/*
* Reset current->fpu memory state to the init values.
*/
-static void fpu_reset_fpstate(void)
+static void fpu_reset_fpregs(void)
{
struct fpu *fpu = &current->thread.fpu;
@@ -359,7 +692,7 @@ static void fpu_reset_fpstate(void)
* user space as PKRU is eagerly written in switch_to() and
* flush_thread().
*/
- memcpy(&fpu->state, &init_fpstate, init_fpstate_copy_size());
+ memcpy(&fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size());
set_thread_flag(TIF_NEED_FPU_LOAD);
fpregs_unlock();
}
@@ -375,7 +708,7 @@ void fpu__clear_user_states(struct fpu *fpu)
fpregs_lock();
if (!cpu_feature_enabled(X86_FEATURE_FPU)) {
- fpu_reset_fpstate();
+ fpu_reset_fpregs();
fpregs_unlock();
return;
}
@@ -385,12 +718,11 @@ void fpu__clear_user_states(struct fpu *fpu)
* corresponding registers.
*/
if (xfeatures_mask_supervisor() &&
- !fpregs_state_valid(fpu, smp_processor_id())) {
- os_xrstor(&fpu->state.xsave, xfeatures_mask_supervisor());
- }
+ !fpregs_state_valid(fpu, smp_processor_id()))
+ os_xrstor_supervisor(fpu->fpstate);
/* Reset user states in registers. */
- restore_fpregs_from_init_fpstate(xfeatures_mask_restore_user());
+ restore_fpregs_from_init_fpstate(XFEATURE_MASK_USER_RESTORE);
/*
* Now all FPU registers have their desired values. Inform the FPU
@@ -405,7 +737,8 @@ void fpu__clear_user_states(struct fpu *fpu)
void fpu_flush_thread(void)
{
- fpu_reset_fpstate();
+ fpstate_reset(&current->thread.fpu);
+ fpu_reset_fpregs();
}
/*
* Load FPU context before returning to userspace.
@@ -445,7 +778,6 @@ void fpregs_mark_activate(void)
fpu->last_cpu = smp_processor_id();
clear_thread_flag(TIF_NEED_FPU_LOAD);
}
-EXPORT_SYMBOL_GPL(fpregs_mark_activate);
/*
* x87 math exception handling:
@@ -468,11 +800,11 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr)
* fully reproduce the context of the exception.
*/
if (boot_cpu_has(X86_FEATURE_FXSR)) {
- cwd = fpu->state.fxsave.cwd;
- swd = fpu->state.fxsave.swd;
+ cwd = fpu->fpstate->regs.fxsave.cwd;
+ swd = fpu->fpstate->regs.fxsave.swd;
} else {
- cwd = (unsigned short)fpu->state.fsave.cwd;
- swd = (unsigned short)fpu->state.fsave.swd;
+ cwd = (unsigned short)fpu->fpstate->regs.fsave.cwd;
+ swd = (unsigned short)fpu->fpstate->regs.fsave.swd;
}
err = swd & ~cwd;
@@ -486,7 +818,7 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr)
unsigned short mxcsr = MXCSR_DEFAULT;
if (boot_cpu_has(X86_FEATURE_XMM))
- mxcsr = fpu->state.fxsave.mxcsr;
+ mxcsr = fpu->fpstate->regs.fxsave.mxcsr;
err = ~(mxcsr >> 7) & mxcsr;
}
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 64e29927cc32..621f4b6cac4a 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -2,7 +2,7 @@
/*
* x86 FPU boot time init code:
*/
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
#include <asm/tlbflush.h>
#include <asm/setup.h>
@@ -10,6 +10,10 @@
#include <linux/sched/task.h>
#include <linux/init.h>
+#include "internal.h"
+#include "legacy.h"
+#include "xstate.h"
+
/*
* Initialize the registers found in all CPUs, CR0 and CR4:
*/
@@ -34,7 +38,7 @@ static void fpu__init_cpu_generic(void)
/* Flush out any pending x87 state: */
#ifdef CONFIG_MATH_EMULATION
if (!boot_cpu_has(X86_FEATURE_FPU))
- fpstate_init_soft(&current->thread.fpu.state.soft);
+ fpstate_init_soft(&current->thread.fpu.fpstate->regs.soft);
else
#endif
asm volatile ("fninit");
@@ -121,23 +125,14 @@ static void __init fpu__init_system_mxcsr(void)
static void __init fpu__init_system_generic(void)
{
/*
- * Set up the legacy init FPU context. (xstate init might overwrite this
- * with a more modern format, if the CPU supports it.)
+ * Set up the legacy init FPU context. Will be updated when the
+ * CPU supports XSAVE[S].
*/
- fpstate_init(&init_fpstate);
+ fpstate_init_user(&init_fpstate);
fpu__init_system_mxcsr();
}
-/*
- * Size of the FPU context state. All tasks in the system use the
- * same context size, regardless of what portion they use.
- * This is inherent to the XSAVE architecture which puts all state
- * components into a single, continuous memory block:
- */
-unsigned int fpu_kernel_xstate_size __ro_after_init;
-EXPORT_SYMBOL_GPL(fpu_kernel_xstate_size);
-
/* Get alignment of the TYPE. */
#define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test)
@@ -162,13 +157,13 @@ static void __init fpu__init_task_struct_size(void)
* Subtract off the static size of the register state.
* It potentially has a bunch of padding.
*/
- task_size -= sizeof(((struct task_struct *)0)->thread.fpu.state);
+ task_size -= sizeof(current->thread.fpu.__fpstate.regs);
/*
* Add back the dynamically-calculated register state
* size.
*/
- task_size += fpu_kernel_xstate_size;
+ task_size += fpu_kernel_cfg.default_size;
/*
* We dynamically size 'struct fpu', so we require that
@@ -177,7 +172,7 @@ static void __init fpu__init_task_struct_size(void)
* you hit a compile error here, check the structure to
* see if something got added to the end.
*/
- CHECK_MEMBER_AT_END_OF(struct fpu, state);
+ CHECK_MEMBER_AT_END_OF(struct fpu, __fpstate);
CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu);
CHECK_MEMBER_AT_END_OF(struct task_struct, thread);
@@ -192,37 +187,34 @@ static void __init fpu__init_task_struct_size(void)
*/
static void __init fpu__init_system_xstate_size_legacy(void)
{
- static int on_boot_cpu __initdata = 1;
-
- WARN_ON_FPU(!on_boot_cpu);
- on_boot_cpu = 0;
+ unsigned int size;
/*
- * Note that xstate sizes might be overwritten later during
- * fpu__init_system_xstate().
+ * Note that the size configuration might be overwritten later
+ * during fpu__init_system_xstate().
*/
-
- if (!boot_cpu_has(X86_FEATURE_FPU)) {
- fpu_kernel_xstate_size = sizeof(struct swregs_state);
+ if (!cpu_feature_enabled(X86_FEATURE_FPU)) {
+ size = sizeof(struct swregs_state);
+ } else if (cpu_feature_enabled(X86_FEATURE_FXSR)) {
+ size = sizeof(struct fxregs_state);
+ fpu_user_cfg.legacy_features = XFEATURE_MASK_FPSSE;
} else {
- if (boot_cpu_has(X86_FEATURE_FXSR))
- fpu_kernel_xstate_size =
- sizeof(struct fxregs_state);
- else
- fpu_kernel_xstate_size =
- sizeof(struct fregs_state);
+ size = sizeof(struct fregs_state);
+ fpu_user_cfg.legacy_features = XFEATURE_MASK_FP;
}
- fpu_user_xstate_size = fpu_kernel_xstate_size;
+ fpu_kernel_cfg.max_size = size;
+ fpu_kernel_cfg.default_size = size;
+ fpu_user_cfg.max_size = size;
+ fpu_user_cfg.default_size = size;
+ fpstate_reset(&current->thread.fpu);
}
-/* Legacy code to initialize eager fpu mode. */
-static void __init fpu__init_system_ctx_switch(void)
+static void __init fpu__init_init_fpstate(void)
{
- static bool on_boot_cpu __initdata = 1;
-
- WARN_ON_FPU(!on_boot_cpu);
- on_boot_cpu = 0;
+ /* Bring init_fpstate size and features up to date */
+ init_fpstate.size = fpu_kernel_cfg.max_size;
+ init_fpstate.xfeatures = fpu_kernel_cfg.max_features;
}
/*
@@ -231,6 +223,7 @@ static void __init fpu__init_system_ctx_switch(void)
*/
void __init fpu__init_system(struct cpuinfo_x86 *c)
{
+ fpstate_reset(&current->thread.fpu);
fpu__init_system_early_generic(c);
/*
@@ -241,8 +234,7 @@ void __init fpu__init_system(struct cpuinfo_x86 *c)
fpu__init_system_generic();
fpu__init_system_xstate_size_legacy();
- fpu__init_system_xstate();
+ fpu__init_system_xstate(fpu_kernel_cfg.max_size);
fpu__init_task_struct_size();
-
- fpu__init_system_ctx_switch();
+ fpu__init_init_fpstate();
}
diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h
new file mode 100644
index 000000000000..dbdb31f55fc7
--- /dev/null
+++ b/arch/x86/kernel/fpu/internal.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __X86_KERNEL_FPU_INTERNAL_H
+#define __X86_KERNEL_FPU_INTERNAL_H
+
+extern struct fpstate init_fpstate;
+
+/* CPU feature check wrappers */
+static __always_inline __pure bool use_xsave(void)
+{
+ return cpu_feature_enabled(X86_FEATURE_XSAVE);
+}
+
+static __always_inline __pure bool use_fxsr(void)
+{
+ return cpu_feature_enabled(X86_FEATURE_FXSR);
+}
+
+#ifdef CONFIG_X86_DEBUG_FPU
+# define WARN_ON_FPU(x) WARN_ON_ONCE(x)
+#else
+# define WARN_ON_FPU(x) ({ (void)(x); 0; })
+#endif
+
+/* Used in init.c */
+extern void fpstate_init_user(struct fpstate *fpstate);
+extern void fpstate_reset(struct fpu *fpu);
+
+#endif
diff --git a/arch/x86/kernel/fpu/legacy.h b/arch/x86/kernel/fpu/legacy.h
new file mode 100644
index 000000000000..098f367bb8a7
--- /dev/null
+++ b/arch/x86/kernel/fpu/legacy.h
@@ -0,0 +1,111 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __X86_KERNEL_FPU_LEGACY_H
+#define __X86_KERNEL_FPU_LEGACY_H
+
+#include <asm/fpu/types.h>
+
+extern unsigned int mxcsr_feature_mask;
+
+static inline void ldmxcsr(u32 mxcsr)
+{
+ asm volatile("ldmxcsr %0" :: "m" (mxcsr));
+}
+
+/*
+ * Returns 0 on success or the trap number when the operation raises an
+ * exception.
+ */
+#define user_insn(insn, output, input...) \
+({ \
+ int err; \
+ \
+ might_fault(); \
+ \
+ asm volatile(ASM_STAC "\n" \
+ "1: " #insn "\n" \
+ "2: " ASM_CLAC "\n" \
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \
+ : [err] "=a" (err), output \
+ : "0"(0), input); \
+ err; \
+})
+
+#define kernel_insn_err(insn, output, input...) \
+({ \
+ int err; \
+ asm volatile("1:" #insn "\n\t" \
+ "2:\n" \
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %[err]) \
+ : [err] "=r" (err), output \
+ : "0"(0), input); \
+ err; \
+})
+
+#define kernel_insn(insn, output, input...) \
+ asm volatile("1:" #insn "\n\t" \
+ "2:\n" \
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FPU_RESTORE) \
+ : output : input)
+
+static inline int fnsave_to_user_sigframe(struct fregs_state __user *fx)
+{
+ return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx));
+}
+
+static inline int fxsave_to_user_sigframe(struct fxregs_state __user *fx)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx));
+ else
+ return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx));
+
+}
+
+static inline void fxrstor(struct fxregs_state *fx)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+ else
+ kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline int fxrstor_safe(struct fxregs_state *fx)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+ else
+ return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline int fxrstor_from_user_sigframe(struct fxregs_state __user *fx)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+ else
+ return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline void frstor(struct fregs_state *fx)
+{
+ kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline int frstor_safe(struct fregs_state *fx)
+{
+ return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline int frstor_from_user_sigframe(struct fregs_state __user *fx)
+{
+ return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline void fxsave(struct fxregs_state *fx)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx));
+ else
+ asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx));
+}
+
+#endif
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index 66ed317ebc0d..437d7c930c0b 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -5,10 +5,14 @@
#include <linux/sched/task_stack.h>
#include <linux/vmalloc.h>
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/regset.h>
-#include <asm/fpu/xstate.h>
+
+#include "context.h"
+#include "internal.h"
+#include "legacy.h"
+#include "xstate.h"
/*
* The xstateregs_active() routine is the same as the regset_fpregs_active() routine,
@@ -74,8 +78,8 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
sync_fpstate(fpu);
if (!use_xsave()) {
- return membuf_write(&to, &fpu->state.fxsave,
- sizeof(fpu->state.fxsave));
+ return membuf_write(&to, &fpu->fpstate->regs.fxsave,
+ sizeof(fpu->fpstate->regs.fxsave));
}
copy_xstate_to_uabi_buf(to, target, XSTATE_COPY_FX);
@@ -110,15 +114,15 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
fpu_force_restore(fpu);
/* Copy the state */
- memcpy(&fpu->state.fxsave, &newstate, sizeof(newstate));
+ memcpy(&fpu->fpstate->regs.fxsave, &newstate, sizeof(newstate));
/* Clear xmm8..15 */
- BUILD_BUG_ON(sizeof(fpu->state.fxsave.xmm_space) != 16 * 16);
- memset(&fpu->state.fxsave.xmm_space[8], 0, 8 * 16);
+ BUILD_BUG_ON(sizeof(fpu->__fpstate.regs.fxsave.xmm_space) != 16 * 16);
+ memset(&fpu->fpstate->regs.fxsave.xmm_space[8], 0, 8 * 16);
/* Mark FP and SSE as in use when XSAVE is enabled */
if (use_xsave())
- fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE;
+ fpu->fpstate->regs.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE;
return 0;
}
@@ -149,7 +153,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
/*
* A whole standard-format XSAVE buffer is needed:
*/
- if (pos != 0 || count != fpu_user_xstate_size)
+ if (pos != 0 || count != fpu_user_cfg.max_size)
return -EFAULT;
if (!kbuf) {
@@ -164,7 +168,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
}
fpu_force_restore(fpu);
- ret = copy_uabi_from_kernel_to_xstate(&fpu->state.xsave, kbuf ?: tmpbuf);
+ ret = copy_uabi_from_kernel_to_xstate(fpu->fpstate, kbuf ?: tmpbuf);
out:
vfree(tmpbuf);
@@ -283,7 +287,7 @@ static void __convert_from_fxsr(struct user_i387_ia32_struct *env,
void
convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
{
- __convert_from_fxsr(env, tsk, &tsk->thread.fpu.state.fxsave);
+ __convert_from_fxsr(env, tsk, &tsk->thread.fpu.fpstate->regs.fxsave);
}
void convert_to_fxsr(struct fxregs_state *fxsave,
@@ -326,7 +330,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
return fpregs_soft_get(target, regset, to);
if (!cpu_feature_enabled(X86_FEATURE_FXSR)) {
- return membuf_write(&to, &fpu->state.fsave,
+ return membuf_write(&to, &fpu->fpstate->regs.fsave,
sizeof(struct fregs_state));
}
@@ -337,7 +341,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
copy_xstate_to_uabi_buf(mb, target, XSTATE_COPY_FP);
fx = &fxsave;
} else {
- fx = &fpu->state.fxsave;
+ fx = &fpu->fpstate->regs.fxsave;
}
__convert_from_fxsr(&env, target, fx);
@@ -366,16 +370,16 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
fpu_force_restore(fpu);
if (cpu_feature_enabled(X86_FEATURE_FXSR))
- convert_to_fxsr(&fpu->state.fxsave, &env);
+ convert_to_fxsr(&fpu->fpstate->regs.fxsave, &env);
else
- memcpy(&fpu->state.fsave, &env, sizeof(env));
+ memcpy(&fpu->fpstate->regs.fsave, &env, sizeof(env));
/*
* Update the header bit in the xsave header, indicating the
* presence of FP.
*/
if (cpu_feature_enabled(X86_FEATURE_XSAVE))
- fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FP;
+ fpu->fpstate->regs.xsave.header.xfeatures |= XFEATURE_MASK_FP;
return 0;
}
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 445c57c9c539..91d4b6de58ab 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -7,23 +7,25 @@
#include <linux/cpu.h>
#include <linux/pagemap.h>
-#include <asm/fpu/internal.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/regset.h>
#include <asm/fpu/xstate.h>
#include <asm/sigframe.h>
+#include <asm/trapnr.h>
#include <asm/trace/fpu.h>
-static struct _fpx_sw_bytes fx_sw_reserved __ro_after_init;
-static struct _fpx_sw_bytes fx_sw_reserved_ia32 __ro_after_init;
+#include "context.h"
+#include "internal.h"
+#include "legacy.h"
+#include "xstate.h"
/*
* Check for the presence of extended state information in the
* user fpstate pointer in the sigcontext.
*/
-static inline int check_xstate_in_sigframe(struct fxregs_state __user *fxbuf,
- struct _fpx_sw_bytes *fx_sw)
+static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf,
+ struct _fpx_sw_bytes *fx_sw)
{
int min_xstate_size = sizeof(struct fxregs_state) +
sizeof(struct xstate_header);
@@ -31,12 +33,12 @@ static inline int check_xstate_in_sigframe(struct fxregs_state __user *fxbuf,
unsigned int magic2;
if (__copy_from_user(fx_sw, &fxbuf->sw_reserved[0], sizeof(*fx_sw)))
- return -EFAULT;
+ return false;
/* Check for the first magic field and other error scenarios. */
if (fx_sw->magic1 != FP_XSTATE_MAGIC1 ||
fx_sw->xstate_size < min_xstate_size ||
- fx_sw->xstate_size > fpu_user_xstate_size ||
+ fx_sw->xstate_size > current->thread.fpu.fpstate->user_size ||
fx_sw->xstate_size > fx_sw->extended_size)
goto setfx;
@@ -47,10 +49,10 @@ static inline int check_xstate_in_sigframe(struct fxregs_state __user *fxbuf,
* in the memory layout.
*/
if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size)))
- return -EFAULT;
+ return false;
if (likely(magic2 == FP_XSTATE_MAGIC2))
- return 0;
+ return true;
setfx:
trace_x86_fpu_xstate_check_failed(&current->thread.fpu);
@@ -58,22 +60,22 @@ setfx:
fx_sw->magic1 = 0;
fx_sw->xstate_size = sizeof(struct fxregs_state);
fx_sw->xfeatures = XFEATURE_MASK_FPSSE;
- return 0;
+ return true;
}
/*
* Signal frame handlers.
*/
-static inline int save_fsave_header(struct task_struct *tsk, void __user *buf)
+static inline bool save_fsave_header(struct task_struct *tsk, void __user *buf)
{
if (use_fxsr()) {
- struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
+ struct xregs_state *xsave = &tsk->thread.fpu.fpstate->regs.xsave;
struct user_i387_ia32_struct env;
struct _fpstate_32 __user *fp = buf;
fpregs_lock();
if (!test_thread_flag(TIF_NEED_FPU_LOAD))
- fxsave(&tsk->thread.fpu.state.fxsave);
+ fxsave(&tsk->thread.fpu.fpstate->regs.fxsave);
fpregs_unlock();
convert_from_fxsr(&env, tsk);
@@ -81,33 +83,54 @@ static inline int save_fsave_header(struct task_struct *tsk, void __user *buf)
if (__copy_to_user(buf, &env, sizeof(env)) ||
__put_user(xsave->i387.swd, &fp->status) ||
__put_user(X86_FXSR_MAGIC, &fp->magic))
- return -1;
+ return false;
} else {
struct fregs_state __user *fp = buf;
u32 swd;
+
if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status))
- return -1;
+ return false;
}
- return 0;
+ return true;
+}
+
+/*
+ * Prepare the SW reserved portion of the fxsave memory layout, indicating
+ * the presence of the extended state information in the memory layout
+ * pointed to by the fpstate pointer in the sigcontext.
+ * This is saved when ever the FP and extended state context is
+ * saved on the user stack during the signal handler delivery to the user.
+ */
+static inline void save_sw_bytes(struct _fpx_sw_bytes *sw_bytes, bool ia32_frame,
+ struct fpstate *fpstate)
+{
+ sw_bytes->magic1 = FP_XSTATE_MAGIC1;
+ sw_bytes->extended_size = fpstate->user_size + FP_XSTATE_MAGIC2_SIZE;
+ sw_bytes->xfeatures = fpstate->user_xfeatures;
+ sw_bytes->xstate_size = fpstate->user_size;
+
+ if (ia32_frame)
+ sw_bytes->extended_size += sizeof(struct fregs_state);
}
-static inline int save_xstate_epilog(void __user *buf, int ia32_frame)
+static inline bool save_xstate_epilog(void __user *buf, int ia32_frame,
+ struct fpstate *fpstate)
{
struct xregs_state __user *x = buf;
- struct _fpx_sw_bytes *sw_bytes;
+ struct _fpx_sw_bytes sw_bytes = {};
u32 xfeatures;
int err;
/* Setup the bytes not touched by the [f]xsave and reserved for SW. */
- sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved;
- err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes));
+ save_sw_bytes(&sw_bytes, ia32_frame, fpstate);
+ err = __copy_to_user(&x->i387.sw_reserved, &sw_bytes, sizeof(sw_bytes));
if (!use_xsave())
- return err;
+ return !err;
err |= __put_user(FP_XSTATE_MAGIC2,
- (__u32 __user *)(buf + fpu_user_xstate_size));
+ (__u32 __user *)(buf + fpstate->user_size));
/*
* Read the xfeatures which we copied (directly from the cpu or
@@ -130,23 +153,17 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame)
err |= __put_user(xfeatures, (__u32 __user *)&x->header.xfeatures);
- return err;
+ return !err;
}
static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
{
- int err;
-
if (use_xsave())
- err = xsave_to_user_sigframe(buf);
- else if (use_fxsr())
- err = fxsave_to_user_sigframe((struct fxregs_state __user *) buf);
+ return xsave_to_user_sigframe(buf);
+ if (use_fxsr())
+ return fxsave_to_user_sigframe((struct fxregs_state __user *) buf);
else
- err = fnsave_to_user_sigframe((struct fregs_state __user *) buf);
-
- if (unlikely(err) && __clear_user(buf, fpu_user_xstate_size))
- err = -EFAULT;
- return err;
+ return fnsave_to_user_sigframe((struct fregs_state __user *) buf);
}
/*
@@ -159,10 +176,8 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
* buf == buf_fx for 64-bit frames and 32-bit fsave frame.
* buf != buf_fx for 32-bit frames with fxstate.
*
- * Try to save it directly to the user frame with disabled page fault handler.
- * If this fails then do the slow path where the FPU state is first saved to
- * task's fpu->state and then copy it to the user frame pointed to by the
- * aligned pointer 'buf_fx'.
+ * Save it directly to the user frame with disabled page fault handler. If
+ * that faults, try to clear the frame which handles the page fault.
*
* If this is a 32-bit frame with fxstate, put a fsave header before
* the aligned state at 'buf_fx'.
@@ -170,10 +185,11 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
* For [f]xsave state, update the SW reserved fields in the [f]xsave frame
* indicating the absence/presence of the extended state to the user.
*/
-int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
+bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
{
struct task_struct *tsk = current;
- int ia32_fxstate = (buf != buf_fx);
+ struct fpstate *fpstate = tsk->thread.fpu.fpstate;
+ bool ia32_fxstate = (buf != buf_fx);
int ret;
ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) ||
@@ -181,13 +197,25 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
if (!static_cpu_has(X86_FEATURE_FPU)) {
struct user_i387_ia32_struct fp;
+
fpregs_soft_get(current, NULL, (struct membuf){.p = &fp,
.left = sizeof(fp)});
- return copy_to_user(buf, &fp, sizeof(fp)) ? -EFAULT : 0;
+ return !copy_to_user(buf, &fp, sizeof(fp));
}
if (!access_ok(buf, size))
- return -EACCES;
+ return false;
+
+ if (use_xsave()) {
+ struct xregs_state __user *xbuf = buf_fx;
+
+ /*
+ * Clear the xsave header first, so that reserved fields are
+ * initialized to zero.
+ */
+ if (__clear_user(&xbuf->header, sizeof(xbuf->header)))
+ return false;
+ }
retry:
/*
* Load the FPU registers if they are not valid for the current task.
@@ -205,26 +233,26 @@ retry:
fpregs_unlock();
if (ret) {
- if (!fault_in_pages_writeable(buf_fx, fpu_user_xstate_size))
+ if (!__clear_user(buf_fx, fpstate->user_size))
goto retry;
- return -EFAULT;
+ return false;
}
/* Save the fsave header for the 32-bit frames. */
- if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf))
- return -1;
+ if ((ia32_fxstate || !use_fxsr()) && !save_fsave_header(tsk, buf))
+ return false;
- if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate))
- return -1;
+ if (use_fxsr() && !save_xstate_epilog(buf_fx, ia32_fxstate, fpstate))
+ return false;
- return 0;
+ return true;
}
-static int __restore_fpregs_from_user(void __user *buf, u64 xrestore,
- bool fx_only)
+static int __restore_fpregs_from_user(void __user *buf, u64 ufeatures,
+ u64 xrestore, bool fx_only)
{
if (use_xsave()) {
- u64 init_bv = xfeatures_mask_uabi() & ~xrestore;
+ u64 init_bv = ufeatures & ~xrestore;
int ret;
if (likely(!fx_only))
@@ -233,7 +261,7 @@ static int __restore_fpregs_from_user(void __user *buf, u64 xrestore,
ret = fxrstor_from_user_sigframe(buf);
if (!ret && unlikely(init_bv))
- os_xrstor(&init_fpstate.xsave, init_bv);
+ os_xrstor(&init_fpstate, init_bv);
return ret;
} else if (use_fxsr()) {
return fxrstor_from_user_sigframe(buf);
@@ -246,16 +274,19 @@ static int __restore_fpregs_from_user(void __user *buf, u64 xrestore,
* Attempt to restore the FPU registers directly from user memory.
* Pagefaults are handled and any errors returned are fatal.
*/
-static int restore_fpregs_from_user(void __user *buf, u64 xrestore,
- bool fx_only, unsigned int size)
+static bool restore_fpregs_from_user(void __user *buf, u64 xrestore,
+ bool fx_only, unsigned int size)
{
struct fpu *fpu = &current->thread.fpu;
int ret;
retry:
fpregs_lock();
+ /* Ensure that XFD is up to date */
+ xfd_update_state(fpu->fpstate);
pagefault_disable();
- ret = __restore_fpregs_from_user(buf, xrestore, fx_only);
+ ret = __restore_fpregs_from_user(buf, fpu->fpstate->user_xfeatures,
+ xrestore, fx_only);
pagefault_enable();
if (unlikely(ret)) {
@@ -275,13 +306,12 @@ retry:
fpregs_unlock();
/* Try to handle #PF, but anything else is fatal. */
- if (ret != -EFAULT)
- return -EINVAL;
+ if (ret != X86_TRAP_PF)
+ return false;
- ret = fault_in_pages_readable(buf, size);
- if (!ret)
+ if (!fault_in_readable(buf, size))
goto retry;
- return ret;
+ return false;
}
/*
@@ -294,45 +324,40 @@ retry:
* been restored from a user buffer directly.
*/
if (test_thread_flag(TIF_NEED_FPU_LOAD) && xfeatures_mask_supervisor())
- os_xrstor(&fpu->state.xsave, xfeatures_mask_supervisor());
+ os_xrstor_supervisor(fpu->fpstate);
fpregs_mark_activate();
fpregs_unlock();
- return 0;
+ return true;
}
-static int __fpu_restore_sig(void __user *buf, void __user *buf_fx,
- bool ia32_fxstate)
+static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx,
+ bool ia32_fxstate)
{
- int state_size = fpu_kernel_xstate_size;
struct task_struct *tsk = current;
struct fpu *fpu = &tsk->thread.fpu;
struct user_i387_ia32_struct env;
+ bool success, fx_only = false;
+ union fpregs_state *fpregs;
+ unsigned int state_size;
u64 user_xfeatures = 0;
- bool fx_only = false;
- int ret;
if (use_xsave()) {
struct _fpx_sw_bytes fx_sw_user;
- ret = check_xstate_in_sigframe(buf_fx, &fx_sw_user);
- if (unlikely(ret))
- return ret;
+ if (!check_xstate_in_sigframe(buf_fx, &fx_sw_user))
+ return false;
fx_only = !fx_sw_user.magic1;
state_size = fx_sw_user.xstate_size;
user_xfeatures = fx_sw_user.xfeatures;
} else {
user_xfeatures = XFEATURE_MASK_FPSSE;
+ state_size = fpu->fpstate->user_size;
}
if (likely(!ia32_fxstate)) {
- /*
- * Attempt to restore the FPU registers directly from user
- * memory. For that to succeed, the user access cannot cause page
- * faults. If it does, fall back to the slow path below, going
- * through the kernel buffer with the enabled pagefault handler.
- */
+ /* Restore the FPU registers directly from user memory. */
return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only,
state_size);
}
@@ -342,9 +367,8 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx,
* to be ignored for histerical raisins. The legacy state is folded
* in once the larger state has been copied.
*/
- ret = __copy_from_user(&env, buf, sizeof(env));
- if (ret)
- return ret;
+ if (__copy_from_user(&env, buf, sizeof(env)))
+ return false;
/*
* By setting TIF_NEED_FPU_LOAD it is ensured that our xstate is
@@ -363,33 +387,38 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx,
* the right place in memory. It's ia32 mode. Shrug.
*/
if (xfeatures_mask_supervisor())
- os_xsave(&fpu->state.xsave);
+ os_xsave(fpu->fpstate);
set_thread_flag(TIF_NEED_FPU_LOAD);
}
__fpu_invalidate_fpregs_state(fpu);
__cpu_invalidate_fpregs_state();
fpregs_unlock();
+ fpregs = &fpu->fpstate->regs;
if (use_xsave() && !fx_only) {
- ret = copy_sigframe_from_user_to_xstate(&fpu->state.xsave, buf_fx);
- if (ret)
- return ret;
+ if (copy_sigframe_from_user_to_xstate(fpu->fpstate, buf_fx))
+ return false;
} else {
- if (__copy_from_user(&fpu->state.fxsave, buf_fx,
- sizeof(fpu->state.fxsave)))
- return -EFAULT;
-
- /* Reject invalid MXCSR values. */
- if (fpu->state.fxsave.mxcsr & ~mxcsr_feature_mask)
- return -EINVAL;
+ if (__copy_from_user(&fpregs->fxsave, buf_fx,
+ sizeof(fpregs->fxsave)))
+ return false;
+
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ /* Reject invalid MXCSR values. */
+ if (fpregs->fxsave.mxcsr & ~mxcsr_feature_mask)
+ return false;
+ } else {
+ /* Mask invalid bits out for historical reasons (broken hardware). */
+ fpregs->fxsave.mxcsr &= mxcsr_feature_mask;
+ }
/* Enforce XFEATURE_MASK_FPSSE when XSAVE is enabled */
if (use_xsave())
- fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE;
+ fpregs->xsave.header.xfeatures |= XFEATURE_MASK_FPSSE;
}
/* Fold the legacy FP storage */
- convert_to_fxsr(&fpu->state.fxsave, &env);
+ convert_to_fxsr(&fpregs->fxsave, &env);
fpregs_lock();
if (use_xsave()) {
@@ -404,40 +433,45 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx,
*/
u64 mask = user_xfeatures | xfeatures_mask_supervisor();
- fpu->state.xsave.header.xfeatures &= mask;
- ret = os_xrstor_safe(&fpu->state.xsave, xfeatures_mask_all);
+ fpregs->xsave.header.xfeatures &= mask;
+ success = !os_xrstor_safe(fpu->fpstate,
+ fpu_kernel_cfg.max_features);
} else {
- ret = fxrstor_safe(&fpu->state.fxsave);
+ success = !fxrstor_safe(&fpregs->fxsave);
}
- if (likely(!ret))
+ if (likely(success))
fpregs_mark_activate();
fpregs_unlock();
- return ret;
+ return success;
}
-static inline int xstate_sigframe_size(void)
+
+static inline unsigned int xstate_sigframe_size(struct fpstate *fpstate)
{
- return use_xsave() ? fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE :
- fpu_user_xstate_size;
+ unsigned int size = fpstate->user_size;
+
+ return use_xsave() ? size + FP_XSTATE_MAGIC2_SIZE : size;
}
/*
* Restore FPU state from a sigframe:
*/
-int fpu__restore_sig(void __user *buf, int ia32_frame)
+bool fpu__restore_sig(void __user *buf, int ia32_frame)
{
- unsigned int size = xstate_sigframe_size();
struct fpu *fpu = &current->thread.fpu;
void __user *buf_fx = buf;
bool ia32_fxstate = false;
- int ret;
+ bool success = false;
+ unsigned int size;
if (unlikely(!buf)) {
fpu__clear_user_states(fpu);
- return 0;
+ return true;
}
+ size = xstate_sigframe_size(fpu->fpstate);
+
ia32_frame &= (IS_ENABLED(CONFIG_X86_32) ||
IS_ENABLED(CONFIG_IA32_EMULATION));
@@ -451,30 +485,28 @@ int fpu__restore_sig(void __user *buf, int ia32_frame)
ia32_fxstate = true;
}
- if (!access_ok(buf, size)) {
- ret = -EACCES;
+ if (!access_ok(buf, size))
goto out;
- }
if (!IS_ENABLED(CONFIG_X86_64) && !cpu_feature_enabled(X86_FEATURE_FPU)) {
- ret = fpregs_soft_set(current, NULL, 0,
- sizeof(struct user_i387_ia32_struct),
- NULL, buf);
+ success = !fpregs_soft_set(current, NULL, 0,
+ sizeof(struct user_i387_ia32_struct),
+ NULL, buf);
} else {
- ret = __fpu_restore_sig(buf, buf_fx, ia32_fxstate);
+ success = __fpu_restore_sig(buf, buf_fx, ia32_fxstate);
}
out:
- if (unlikely(ret))
+ if (unlikely(!success))
fpu__clear_user_states(fpu);
- return ret;
+ return success;
}
unsigned long
fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
unsigned long *buf_fx, unsigned long *size)
{
- unsigned long frame_size = xstate_sigframe_size();
+ unsigned long frame_size = xstate_sigframe_size(current->thread.fpu.fpstate);
*buf_fx = sp = round_down(sp - frame_size, 64);
if (ia32_frame && use_fxsr()) {
@@ -487,9 +519,12 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
return sp;
}
-unsigned long fpu__get_fpstate_size(void)
+unsigned long __init fpu__get_fpstate_size(void)
{
- unsigned long ret = xstate_sigframe_size();
+ unsigned long ret = fpu_user_cfg.max_size;
+
+ if (use_xsave())
+ ret += FP_XSTATE_MAGIC2_SIZE;
/*
* This space is needed on (most) 32-bit kernels, or when a 32-bit
@@ -505,28 +540,3 @@ unsigned long fpu__get_fpstate_size(void)
return ret;
}
-/*
- * Prepare the SW reserved portion of the fxsave memory layout, indicating
- * the presence of the extended state information in the memory layout
- * pointed by the fpstate pointer in the sigcontext.
- * This will be saved when ever the FP and extended state context is
- * saved on the user stack during the signal handler delivery to the user.
- */
-void fpu__init_prepare_fx_sw_frame(void)
-{
- int size = fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE;
-
- fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
- fx_sw_reserved.extended_size = size;
- fx_sw_reserved.xfeatures = xfeatures_mask_uabi();
- fx_sw_reserved.xstate_size = fpu_user_xstate_size;
-
- if (IS_ENABLED(CONFIG_IA32_EMULATION) ||
- IS_ENABLED(CONFIG_X86_32)) {
- int fsave_header_size = sizeof(struct fregs_state);
-
- fx_sw_reserved_ia32 = fx_sw_reserved;
- fx_sw_reserved_ia32.extended_size = size + fsave_header_size;
- }
-}
-
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index c8def1b7f8fb..02b3ddaf4f75 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -4,21 +4,33 @@
*
* Author: Suresh Siddha <suresh.b.siddha@intel.com>
*/
+#include <linux/bitops.h>
#include <linux/compat.h>
#include <linux/cpu.h>
#include <linux/mman.h>
+#include <linux/nospec.h>
#include <linux/pkeys.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
+#include <linux/vmalloc.h>
#include <asm/fpu/api.h>
-#include <asm/fpu/internal.h>
-#include <asm/fpu/signal.h>
#include <asm/fpu/regset.h>
-#include <asm/fpu/xstate.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/xcr.h>
#include <asm/tlbflush.h>
-#include <asm/cpufeature.h>
+#include <asm/prctl.h>
+#include <asm/elf.h>
+
+#include "context.h"
+#include "internal.h"
+#include "legacy.h"
+#include "xstate.h"
+
+#define for_each_extended_xfeature(bit, mask) \
+ (bit) = FIRST_EXTENDED_XFEATURE; \
+ for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask))
/*
* Although we spell it out in here, the Processor Trace
@@ -39,29 +51,32 @@ static const char *xfeature_names[] =
"Protection Keys User registers",
"PASID state",
"unknown xstate feature" ,
+ "unknown xstate feature" ,
+ "unknown xstate feature" ,
+ "unknown xstate feature" ,
+ "unknown xstate feature" ,
+ "unknown xstate feature" ,
+ "AMX Tile config" ,
+ "AMX Tile data" ,
+ "unknown xstate feature" ,
};
-static short xsave_cpuid_features[] __initdata = {
- X86_FEATURE_FPU,
- X86_FEATURE_XMM,
- X86_FEATURE_AVX,
- X86_FEATURE_MPX,
- X86_FEATURE_MPX,
- X86_FEATURE_AVX512F,
- X86_FEATURE_AVX512F,
- X86_FEATURE_AVX512F,
- X86_FEATURE_INTEL_PT,
- X86_FEATURE_PKU,
- X86_FEATURE_ENQCMD,
+static unsigned short xsave_cpuid_features[] __initdata = {
+ [XFEATURE_FP] = X86_FEATURE_FPU,
+ [XFEATURE_SSE] = X86_FEATURE_XMM,
+ [XFEATURE_YMM] = X86_FEATURE_AVX,
+ [XFEATURE_BNDREGS] = X86_FEATURE_MPX,
+ [XFEATURE_BNDCSR] = X86_FEATURE_MPX,
+ [XFEATURE_OPMASK] = X86_FEATURE_AVX512F,
+ [XFEATURE_ZMM_Hi256] = X86_FEATURE_AVX512F,
+ [XFEATURE_Hi16_ZMM] = X86_FEATURE_AVX512F,
+ [XFEATURE_PT_UNIMPLEMENTED_SO_FAR] = X86_FEATURE_INTEL_PT,
+ [XFEATURE_PKRU] = X86_FEATURE_PKU,
+ [XFEATURE_PASID] = X86_FEATURE_ENQCMD,
+ [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE,
+ [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE,
};
-/*
- * This represents the full set of bits that should ever be set in a kernel
- * XSAVE buffer, both supervisor and user xstates.
- */
-u64 xfeatures_mask_all __ro_after_init;
-EXPORT_SYMBOL_GPL(xfeatures_mask_all);
-
static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init =
{ [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init =
@@ -72,20 +87,13 @@ static unsigned int xstate_supervisor_only_offsets[XFEATURE_MAX] __ro_after_init
{ [ 0 ... XFEATURE_MAX - 1] = -1};
/*
- * The XSAVE area of kernel can be in standard or compacted format;
- * it is always in standard format for user mode. This is the user
- * mode standard format size used for signal and ptrace frames.
- */
-unsigned int fpu_user_xstate_size __ro_after_init;
-
-/*
* Return whether the system supports a given xfeature.
*
* Also return the name of the (most advanced) feature that the caller requested:
*/
int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
{
- u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask_all;
+ u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features;
if (unlikely(feature_name)) {
long xfeature_idx, max_idx;
@@ -135,17 +143,26 @@ static bool xfeature_is_supervisor(int xfeature_nr)
*/
void fpu__init_cpu_xstate(void)
{
- if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask_all)
+ if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features)
return;
cr4_set_bits(X86_CR4_OSXSAVE);
/*
+ * Must happen after CR4 setup and before xsetbv() to allow KVM
+ * lazy passthrough. Write independent of the dynamic state static
+ * key as that does not work on the boot CPU. This also ensures
+ * that any stale state is wiped out from XFD.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_XFD))
+ wrmsrl(MSR_IA32_XFD, init_fpstate.xfd);
+
+ /*
* XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features
* managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user
* states can be set here.
*/
- xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_uabi());
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
/*
* MSR_IA32_XSS sets supervisor states managed by XSAVES.
@@ -158,7 +175,7 @@ void fpu__init_cpu_xstate(void)
static bool xfeature_enabled(enum xfeature xfeature)
{
- return xfeatures_mask_all & BIT_ULL(xfeature);
+ return fpu_kernel_cfg.max_features & BIT_ULL(xfeature);
}
/*
@@ -184,10 +201,7 @@ static void __init setup_xstate_features(void)
xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state,
xmm_space);
- for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
- if (!xfeature_enabled(i))
- continue;
-
+ for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
xstate_sizes[i] = eax;
@@ -236,6 +250,8 @@ static void __init print_xstate_features(void)
print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
print_xstate_feature(XFEATURE_MASK_PKRU);
print_xstate_feature(XFEATURE_MASK_PASID);
+ print_xstate_feature(XFEATURE_MASK_XTILE_CFG);
+ print_xstate_feature(XFEATURE_MASK_XTILE_DATA);
}
/*
@@ -291,20 +307,15 @@ static void __init setup_xstate_comp_offsets(void)
xstate_comp_offsets[XFEATURE_SSE] = offsetof(struct fxregs_state,
xmm_space);
- if (!boot_cpu_has(X86_FEATURE_XSAVES)) {
- for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
- if (xfeature_enabled(i))
- xstate_comp_offsets[i] = xstate_offsets[i];
- }
+ if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) {
+ for_each_extended_xfeature(i, fpu_kernel_cfg.max_features)
+ xstate_comp_offsets[i] = xstate_offsets[i];
return;
}
next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE;
- for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
- if (!xfeature_enabled(i))
- continue;
-
+ for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
if (xfeature_is_aligned(i))
next_offset = ALIGN(next_offset, 64);
@@ -328,8 +339,8 @@ static void __init setup_supervisor_only_offsets(void)
next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE;
- for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
- if (!xfeature_enabled(i) || !xfeature_is_supervisor(i))
+ for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
+ if (!xfeature_is_supervisor(i))
continue;
if (xfeature_is_aligned(i))
@@ -347,15 +358,36 @@ static void __init print_xstate_offset_size(void)
{
int i;
- for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
- if (!xfeature_enabled(i))
- continue;
+ for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n",
i, xstate_comp_offsets[i], i, xstate_sizes[i]);
}
}
/*
+ * This function is called only during boot time when x86 caps are not set
+ * up and alternative can not be used yet.
+ */
+static __init void os_xrstor_booting(struct xregs_state *xstate)
+{
+ u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE;
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+ int err;
+
+ if (cpu_feature_enabled(X86_FEATURE_XSAVES))
+ XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
+ else
+ XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
+
+ /*
+ * We should never fault when copying from a kernel buffer, and the FPU
+ * state we set at boot time should be valid.
+ */
+ WARN_ON_FPU(err);
+}
+
+/*
* All supported features have either init state all zeros or are
* handled in setup_init_fpu() individually. This is an explicit
* feature list and does not use XFEATURE_MASK*SUPPORTED to catch
@@ -372,36 +404,30 @@ static void __init print_xstate_offset_size(void)
XFEATURE_MASK_PKRU | \
XFEATURE_MASK_BNDREGS | \
XFEATURE_MASK_BNDCSR | \
- XFEATURE_MASK_PASID)
+ XFEATURE_MASK_PASID | \
+ XFEATURE_MASK_XTILE)
/*
* setup the xstate image representing the init state
*/
static void __init setup_init_fpu_buf(void)
{
- static int on_boot_cpu __initdata = 1;
-
BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED |
XFEATURE_MASK_SUPERVISOR_SUPPORTED) !=
XFEATURES_INIT_FPSTATE_HANDLED);
- WARN_ON_FPU(!on_boot_cpu);
- on_boot_cpu = 0;
-
if (!boot_cpu_has(X86_FEATURE_XSAVE))
return;
setup_xstate_features();
print_xstate_features();
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- init_fpstate.xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT |
- xfeatures_mask_all;
+ xstate_init_xcomp_bv(&init_fpstate.regs.xsave, fpu_kernel_cfg.max_features);
/*
* Init all the features state with header.xfeatures being 0x0
*/
- os_xrstor_booting(&init_fpstate.xsave);
+ os_xrstor_booting(&init_fpstate.regs.xsave);
/*
* All components are now in init state. Read the state back so
@@ -419,7 +445,7 @@ static void __init setup_init_fpu_buf(void)
* state is all zeroes or if not to add the necessary handling
* here.
*/
- fxsave(&init_fpstate.fxsave);
+ fxsave(&init_fpstate.regs.fxsave);
}
static int xfeature_uncompacted_offset(int xfeature_nr)
@@ -451,10 +477,11 @@ int xfeature_size(int xfeature_nr)
}
/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
-static int validate_user_xstate_header(const struct xstate_header *hdr)
+static int validate_user_xstate_header(const struct xstate_header *hdr,
+ struct fpstate *fpstate)
{
/* No unknown or supervisor features may be set */
- if (hdr->xfeatures & ~xfeatures_mask_uabi())
+ if (hdr->xfeatures & ~fpstate->user_xfeatures)
return -EINVAL;
/* Userspace must use the uncompacted format */
@@ -474,7 +501,7 @@ static int validate_user_xstate_header(const struct xstate_header *hdr)
return 0;
}
-static void __xstate_dump_leaves(void)
+static void __init __xstate_dump_leaves(void)
{
int i;
u32 eax, ebx, ecx, edx;
@@ -509,12 +536,73 @@ static void __xstate_dump_leaves(void)
} \
} while (0)
+/**
+ * check_xtile_data_against_struct - Check tile data state size.
+ *
+ * Calculate the state size by multiplying the single tile size which is
+ * recorded in a C struct, and the number of tiles that the CPU informs.
+ * Compare the provided size with the calculation.
+ *
+ * @size: The tile data state size
+ *
+ * Returns: 0 on success, -EINVAL on mismatch.
+ */
+static int __init check_xtile_data_against_struct(int size)
+{
+ u32 max_palid, palid, state_size;
+ u32 eax, ebx, ecx, edx;
+ u16 max_tile;
+
+ /*
+ * Check the maximum palette id:
+ * eax: the highest numbered palette subleaf.
+ */
+ cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx);
+
+ /*
+ * Cross-check each tile size and find the maximum number of
+ * supported tiles.
+ */
+ for (palid = 1, max_tile = 0; palid <= max_palid; palid++) {
+ u16 tile_size, max;
+
+ /*
+ * Check the tile size info:
+ * eax[31:16]: bytes per title
+ * ebx[31:16]: the max names (or max number of tiles)
+ */
+ cpuid_count(TILE_CPUID, palid, &eax, &ebx, &edx, &edx);
+ tile_size = eax >> 16;
+ max = ebx >> 16;
+
+ if (tile_size != sizeof(struct xtile_data)) {
+ pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n",
+ __stringify(XFEATURE_XTILE_DATA),
+ sizeof(struct xtile_data), tile_size);
+ __xstate_dump_leaves();
+ return -EINVAL;
+ }
+
+ if (max > max_tile)
+ max_tile = max;
+ }
+
+ state_size = sizeof(struct xtile_data) * max_tile;
+ if (size != state_size) {
+ pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n",
+ __stringify(XFEATURE_XTILE_DATA), state_size, size);
+ __xstate_dump_leaves();
+ return -EINVAL;
+ }
+ return 0;
+}
+
/*
* We have a C struct for each 'xstate'. We need to ensure
* that our software representation matches what the CPU
* tells us about the state's size.
*/
-static void check_xstate_against_struct(int nr)
+static bool __init check_xstate_against_struct(int nr)
{
/*
* Ask the CPU for the size of the state.
@@ -532,6 +620,11 @@ static void check_xstate_against_struct(int nr)
XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state);
XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state);
XCHECK_SZ(sz, nr, XFEATURE_PASID, struct ia32_pasid_state);
+ XCHECK_SZ(sz, nr, XFEATURE_XTILE_CFG, struct xtile_cfg);
+
+ /* The tile data size varies between implementations. */
+ if (nr == XFEATURE_XTILE_DATA)
+ check_xtile_data_against_struct(sz);
/*
* Make *SURE* to add any feature numbers in below if
@@ -541,10 +634,39 @@ static void check_xstate_against_struct(int nr)
if ((nr < XFEATURE_YMM) ||
(nr >= XFEATURE_MAX) ||
(nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) ||
- ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_LBR))) {
+ ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_RSRVD_COMP_16))) {
WARN_ONCE(1, "no structure for xstate: %d\n", nr);
XSTATE_WARN_ON(1);
+ return false;
}
+ return true;
+}
+
+static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted)
+{
+ unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+ int i;
+
+ for_each_extended_xfeature(i, xfeatures) {
+ /* Align from the end of the previous feature */
+ if (xfeature_is_aligned(i))
+ size = ALIGN(size, 64);
+ /*
+ * In compacted format the enabled features are packed,
+ * i.e. disabled features do not occupy space.
+ *
+ * In non-compacted format the offsets are fixed and
+ * disabled states still occupy space in the memory buffer.
+ */
+ if (!compacted)
+ size = xfeature_uncompacted_offset(i);
+ /*
+ * Add the feature size even for non-compacted format
+ * to make the end result correct
+ */
+ size += xfeature_size(i);
+ }
+ return size;
}
/*
@@ -556,44 +678,29 @@ static void check_xstate_against_struct(int nr)
* covered by these checks. Only the size of the buffer for task->fpu
* is checked here.
*/
-static void do_extra_xstate_size_checks(void)
+static bool __init paranoid_xstate_size_valid(unsigned int kernel_size)
{
- int paranoid_xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+ bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES);
+ unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
int i;
- for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
- if (!xfeature_enabled(i))
- continue;
-
- check_xstate_against_struct(i);
+ for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
+ if (!check_xstate_against_struct(i))
+ return false;
/*
* Supervisor state components can be managed only by
* XSAVES.
*/
- if (!cpu_feature_enabled(X86_FEATURE_XSAVES))
- XSTATE_WARN_ON(xfeature_is_supervisor(i));
-
- /* Align from the end of the previous feature */
- if (xfeature_is_aligned(i))
- paranoid_xstate_size = ALIGN(paranoid_xstate_size, 64);
- /*
- * The offset of a given state in the non-compacted
- * format is given to us in a CPUID leaf. We check
- * them for being ordered (increasing offsets) in
- * setup_xstate_features(). XSAVES uses compacted format.
- */
- if (!cpu_feature_enabled(X86_FEATURE_XSAVES))
- paranoid_xstate_size = xfeature_uncompacted_offset(i);
- /*
- * The compacted-format offset always depends on where
- * the previous state ended.
- */
- paranoid_xstate_size += xfeature_size(i);
+ if (!compacted && xfeature_is_supervisor(i)) {
+ XSTATE_WARN_ON(1);
+ return false;
+ }
}
- XSTATE_WARN_ON(paranoid_xstate_size != fpu_kernel_xstate_size);
+ size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted);
+ XSTATE_WARN_ON(size != kernel_size);
+ return size == kernel_size;
}
-
/*
* Get total size of enabled xstates in XCR0 | IA32_XSS.
*
@@ -644,7 +751,7 @@ static unsigned int __init get_xsaves_size_no_independent(void)
return size;
}
-static unsigned int __init get_xsave_size(void)
+static unsigned int __init get_xsave_size_user(void)
{
unsigned int eax, ebx, ecx, edx;
/*
@@ -662,44 +769,54 @@ static unsigned int __init get_xsave_size(void)
* Will the runtime-enumerated 'xstate_size' fit in the init
* task's statically-allocated buffer?
*/
-static bool is_supported_xstate_size(unsigned int test_xstate_size)
+static bool __init is_supported_xstate_size(unsigned int test_xstate_size)
{
- if (test_xstate_size <= sizeof(union fpregs_state))
+ if (test_xstate_size <= sizeof(init_fpstate.regs))
return true;
pr_warn("x86/fpu: xstate buffer too small (%zu < %d), disabling xsave\n",
- sizeof(union fpregs_state), test_xstate_size);
+ sizeof(init_fpstate.regs), test_xstate_size);
return false;
}
static int __init init_xstate_size(void)
{
/* Recompute the context size for enabled features: */
- unsigned int possible_xstate_size;
- unsigned int xsave_size;
+ unsigned int user_size, kernel_size, kernel_default_size;
+ bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES);
- xsave_size = get_xsave_size();
+ /* Uncompacted user space size */
+ user_size = get_xsave_size_user();
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- possible_xstate_size = get_xsaves_size_no_independent();
+ /*
+ * XSAVES kernel size includes supervisor states and
+ * uses compacted format when available.
+ *
+ * XSAVE does not support supervisor states so
+ * kernel and user size is identical.
+ */
+ if (compacted)
+ kernel_size = get_xsaves_size_no_independent();
else
- possible_xstate_size = xsave_size;
+ kernel_size = user_size;
+
+ kernel_default_size =
+ xstate_calculate_size(fpu_kernel_cfg.default_features, compacted);
- /* Ensure we have the space to store all enabled: */
- if (!is_supported_xstate_size(possible_xstate_size))
+ /* Ensure we have the space to store all default enabled features. */
+ if (!is_supported_xstate_size(kernel_default_size))
return -EINVAL;
- /*
- * The size is OK, we are definitely going to use xsave,
- * make it known to the world that we need more space.
- */
- fpu_kernel_xstate_size = possible_xstate_size;
- do_extra_xstate_size_checks();
+ if (!paranoid_xstate_size_valid(kernel_size))
+ return -EINVAL;
+
+ fpu_kernel_cfg.max_size = kernel_size;
+ fpu_user_cfg.max_size = user_size;
+
+ fpu_kernel_cfg.default_size = kernel_default_size;
+ fpu_user_cfg.default_size =
+ xstate_calculate_size(fpu_user_cfg.default_features, false);
- /*
- * User space is always in standard format.
- */
- fpu_user_xstate_size = xsave_size;
return 0;
}
@@ -707,28 +824,38 @@ static int __init init_xstate_size(void)
* We enabled the XSAVE hardware, but something went wrong and
* we can not use it. Disable it.
*/
-static void fpu__init_disable_system_xstate(void)
+static void __init fpu__init_disable_system_xstate(unsigned int legacy_size)
{
- xfeatures_mask_all = 0;
+ fpu_kernel_cfg.max_features = 0;
cr4_clear_bits(X86_CR4_OSXSAVE);
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+
+ /* Restore the legacy size.*/
+ fpu_kernel_cfg.max_size = legacy_size;
+ fpu_kernel_cfg.default_size = legacy_size;
+ fpu_user_cfg.max_size = legacy_size;
+ fpu_user_cfg.default_size = legacy_size;
+
+ /*
+ * Prevent enabling the static branch which enables writes to the
+ * XFD MSR.
+ */
+ init_fpstate.xfd = 0;
+
+ fpstate_reset(&current->thread.fpu);
}
/*
* Enable and initialize the xsave feature.
* Called once per system bootup.
*/
-void __init fpu__init_system_xstate(void)
+void __init fpu__init_system_xstate(unsigned int legacy_size)
{
unsigned int eax, ebx, ecx, edx;
- static int on_boot_cpu __initdata = 1;
u64 xfeatures;
int err;
int i;
- WARN_ON_FPU(!on_boot_cpu);
- on_boot_cpu = 0;
-
if (!boot_cpu_has(X86_FEATURE_FPU)) {
pr_info("x86/fpu: No FPU detected\n");
return;
@@ -749,22 +876,22 @@ void __init fpu__init_system_xstate(void)
* Find user xstates supported by the processor.
*/
cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
- xfeatures_mask_all = eax + ((u64)edx << 32);
+ fpu_kernel_cfg.max_features = eax + ((u64)edx << 32);
/*
* Find supervisor xstates supported by the processor.
*/
cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
- xfeatures_mask_all |= ecx + ((u64)edx << 32);
+ fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32);
- if ((xfeatures_mask_uabi() & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
+ if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
/*
* This indicates that something really unexpected happened
* with the enumeration. Disable XSAVE and try to continue
* booting without it. This is too early to BUG().
*/
pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n",
- xfeatures_mask_all);
+ fpu_kernel_cfg.max_features);
goto out_disable;
}
@@ -772,15 +899,39 @@ void __init fpu__init_system_xstate(void)
* Clear XSAVE features that are disabled in the normal CPUID.
*/
for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
- if (!boot_cpu_has(xsave_cpuid_features[i]))
- xfeatures_mask_all &= ~BIT_ULL(i);
+ unsigned short cid = xsave_cpuid_features[i];
+
+ /* Careful: X86_FEATURE_FPU is 0! */
+ if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid))
+ fpu_kernel_cfg.max_features &= ~BIT_ULL(i);
}
- xfeatures_mask_all &= XFEATURE_MASK_USER_SUPPORTED |
+ if (!cpu_feature_enabled(X86_FEATURE_XFD))
+ fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC;
+
+ fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED |
XFEATURE_MASK_SUPERVISOR_SUPPORTED;
+ fpu_user_cfg.max_features = fpu_kernel_cfg.max_features;
+ fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
+
+ /* Clean out dynamic features from default */
+ fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features;
+ fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;
+
+ fpu_user_cfg.default_features = fpu_user_cfg.max_features;
+ fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;
+
/* Store it for paranoia check at the end */
- xfeatures = xfeatures_mask_all;
+ xfeatures = fpu_kernel_cfg.max_features;
+
+ /*
+ * Initialize the default XFD state in initfp_state and enable the
+ * dynamic sizing mechanism if dynamic states are available. The
+ * static key cannot be enabled here because this runs before
+ * jump_label_init(). This is delayed to an initcall.
+ */
+ init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC;
/* Enable xstate instructions to be able to continue with initialization: */
fpu__init_cpu_xstate();
@@ -788,13 +939,16 @@ void __init fpu__init_system_xstate(void)
if (err)
goto out_disable;
+ /* Reset the state for the current task */
+ fpstate_reset(&current->thread.fpu);
+
/*
* Update info used for ptrace frames; use standard-format size and no
* supervisor xstates:
*/
- update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask_uabi());
+ update_regset_xstate_info(fpu_user_cfg.max_size,
+ fpu_user_cfg.max_features);
- fpu__init_prepare_fx_sw_frame();
setup_init_fpu_buf();
setup_xstate_comp_offsets();
setup_supervisor_only_offsets();
@@ -803,22 +957,22 @@ void __init fpu__init_system_xstate(void)
* Paranoia check whether something in the setup modified the
* xfeatures mask.
*/
- if (xfeatures != xfeatures_mask_all) {
+ if (xfeatures != fpu_kernel_cfg.max_features) {
pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n",
- xfeatures, xfeatures_mask_all);
+ xfeatures, fpu_kernel_cfg.max_features);
goto out_disable;
}
print_xstate_offset_size();
pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
- xfeatures_mask_all,
- fpu_kernel_xstate_size,
+ fpu_kernel_cfg.max_features,
+ fpu_kernel_cfg.max_size,
boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard");
return;
out_disable:
/* something went wrong, try to boot without any XSAVE support */
- fpu__init_disable_system_xstate();
+ fpu__init_disable_system_xstate(legacy_size);
}
/*
@@ -830,7 +984,7 @@ void fpu__resume_cpu(void)
* Restore XCR0 on xsave capable CPUs:
*/
if (cpu_feature_enabled(X86_FEATURE_XSAVE))
- xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_uabi());
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
/*
* Restore IA32_XSS. The same CPUID bit enumerates support
@@ -840,6 +994,9 @@ void fpu__resume_cpu(void)
wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
xfeatures_mask_independent());
}
+
+ if (fpu_state_size_dynamic())
+ wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd);
}
/*
@@ -886,7 +1043,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
* We should not ever be requesting features that we
* have not enabled.
*/
- WARN_ONCE(!(xfeatures_mask_all & BIT_ULL(xfeature_nr)),
+ WARN_ONCE(!(fpu_kernel_cfg.max_features & BIT_ULL(xfeature_nr)),
"get of unsupported state");
/*
* This assumes the last 'xsave*' instruction to
@@ -904,7 +1061,6 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
return __raw_xsave_addr(xsave, xfeature_nr);
}
-EXPORT_SYMBOL_GPL(get_xsave_addr);
#ifdef CONFIG_ARCH_HAS_PKEYS
@@ -961,9 +1117,10 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate,
}
/**
- * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
+ * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
* @to: membuf descriptor
- * @tsk: The task from which to copy the saved xstate
+ * @fpstate: The fpstate buffer from which to copy
+ * @pkru_val: The PKRU value to store in the PKRU component
* @copy_mode: The requested copy mode
*
* Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
@@ -972,14 +1129,15 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate,
*
* It supports partial copy but @to.pos always starts from zero.
*/
-void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
- enum xstate_copy_mode copy_mode)
+void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
+ u32 pkru_val, enum xstate_copy_mode copy_mode)
{
const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr);
- struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
- struct xregs_state *xinit = &init_fpstate.xsave;
+ struct xregs_state *xinit = &init_fpstate.regs.xsave;
+ struct xregs_state *xsave = &fpstate->regs.xsave;
struct xstate_header header;
unsigned int zerofrom;
+ u64 mask;
int i;
memset(&header, 0, sizeof(header));
@@ -996,7 +1154,7 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
break;
case XSTATE_COPY_XSAVE:
- header.xfeatures &= xfeatures_mask_uabi();
+ header.xfeatures &= fpstate->user_xfeatures;
break;
}
@@ -1033,17 +1191,15 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
zerofrom = offsetof(struct xregs_state, extended_state_area);
- for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
- /*
- * The ptrace buffer is in non-compacted XSAVE format.
- * In non-compacted format disabled features still occupy
- * state space, but there is no state to copy from in the
- * compacted init_fpstate. The gap tracking will zero this
- * later.
- */
- if (!(xfeatures_mask_uabi() & BIT_ULL(i)))
- continue;
+ /*
+ * The ptrace buffer is in non-compacted XSAVE format. In
+ * non-compacted format disabled features still occupy state space,
+ * but there is no state to copy from in the compacted
+ * init_fpstate. The gap tracking will zero these states.
+ */
+ mask = fpstate->user_xfeatures;
+ for_each_extended_xfeature(i, mask) {
/*
* If there was a feature or alignment gap, zero the space
* in the destination buffer.
@@ -1055,10 +1211,9 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
struct pkru_state pkru = {0};
/*
* PKRU is not necessarily up to date in the
- * thread's XSAVE buffer. Fill this part from the
- * per-thread storage.
+ * XSAVE buffer. Use the provided value.
*/
- pkru.pkru = tsk->thread.pkru;
+ pkru.pkru = pkru_val;
membuf_write(&to, &pkru, sizeof(pkru));
} else {
copy_feature(header.xfeatures & BIT_ULL(i), &to,
@@ -1078,6 +1233,25 @@ out:
membuf_zero(&to, to.left);
}
+/**
+ * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
+ * @to: membuf descriptor
+ * @tsk: The task from which to copy the saved xstate
+ * @copy_mode: The requested copy mode
+ *
+ * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
+ * format, i.e. from the kernel internal hardware dependent storage format
+ * to the requested @mode. UABI XSTATE is always uncompacted!
+ *
+ * It supports partial copy but @to.pos always starts from zero.
+ */
+void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
+ enum xstate_copy_mode copy_mode)
+{
+ __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate,
+ tsk->thread.pkru, copy_mode);
+}
+
static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size,
const void *kbuf, const void __user *ubuf)
{
@@ -1091,9 +1265,10 @@ static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size,
}
-static int copy_uabi_to_xstate(struct xregs_state *xsave, const void *kbuf,
+static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
const void __user *ubuf)
{
+ struct xregs_state *xsave = &fpstate->regs.xsave;
unsigned int offset, size;
struct xstate_header hdr;
u64 mask;
@@ -1103,7 +1278,7 @@ static int copy_uabi_to_xstate(struct xregs_state *xsave, const void *kbuf,
if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf))
return -EFAULT;
- if (validate_user_xstate_header(&hdr))
+ if (validate_user_xstate_header(&hdr, fpstate))
return -EINVAL;
/* Validate MXCSR when any of the related features is in use */
@@ -1156,12 +1331,11 @@ static int copy_uabi_to_xstate(struct xregs_state *xsave, const void *kbuf,
/*
* Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S]
- * format and copy to the target thread. This is called from
- * xstateregs_set().
+ * format and copy to the target thread. Used by ptrace and KVM.
*/
-int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf)
+int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf)
{
- return copy_uabi_to_xstate(xsave, kbuf, NULL);
+ return copy_uabi_to_xstate(fpstate, kbuf, NULL);
}
/*
@@ -1169,26 +1343,20 @@ int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf)
* XSAVE[S] format and copy to the target thread. This is called from the
* sigreturn() and rt_sigreturn() system calls.
*/
-int copy_sigframe_from_user_to_xstate(struct xregs_state *xsave,
+int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate,
const void __user *ubuf)
{
- return copy_uabi_to_xstate(xsave, NULL, ubuf);
+ return copy_uabi_to_xstate(fpstate, NULL, ubuf);
}
-static bool validate_xsaves_xrstors(u64 mask)
+static bool validate_independent_components(u64 mask)
{
u64 xchk;
if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES)))
return false;
- /*
- * Validate that this is either a task->fpstate related component
- * subset or an independent one.
- */
- if (mask & xfeatures_mask_independent())
- xchk = ~xfeatures_mask_independent();
- else
- xchk = ~xfeatures_mask_all;
+
+ xchk = ~xfeatures_mask_independent();
if (WARN_ON_ONCE(!mask || mask & xchk))
return false;
@@ -1206,14 +1374,13 @@ static bool validate_xsaves_xrstors(u64 mask)
* buffer should be zeroed otherwise a consecutive XRSTORS from that buffer
* can #GP.
*
- * The feature mask must either be a subset of the independent features or
- * a subset of the task->fpstate related features.
+ * The feature mask must be a subset of the independent features.
*/
void xsaves(struct xregs_state *xstate, u64 mask)
{
int err;
- if (!validate_xsaves_xrstors(mask))
+ if (!validate_independent_components(mask))
return;
XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err);
@@ -1231,20 +1398,420 @@ void xsaves(struct xregs_state *xstate, u64 mask)
* Proper usage is to restore the state which was saved with
* xsaves() into @xstate.
*
- * The feature mask must either be a subset of the independent features or
- * a subset of the task->fpstate related features.
+ * The feature mask must be a subset of the independent features.
*/
void xrstors(struct xregs_state *xstate, u64 mask)
{
int err;
- if (!validate_xsaves_xrstors(mask))
+ if (!validate_independent_components(mask))
return;
XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err);
WARN_ON_ONCE(err);
}
+#if IS_ENABLED(CONFIG_KVM)
+void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature)
+{
+ void *addr = get_xsave_addr(&fps->regs.xsave, xfeature);
+
+ if (addr)
+ memset(addr, 0, xstate_sizes[xfeature]);
+}
+EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component);
+#endif
+
+#ifdef CONFIG_X86_64
+
+#ifdef CONFIG_X86_DEBUG_FPU
+/*
+ * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask
+ * can safely operate on the @fpstate buffer.
+ */
+static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor)
+{
+ u64 xfd = __this_cpu_read(xfd_state);
+
+ if (fpstate->xfd == xfd)
+ return true;
+
+ /*
+ * The XFD MSR does not match fpstate->xfd. That's invalid when
+ * the passed in fpstate is current's fpstate.
+ */
+ if (fpstate->xfd == current->thread.fpu.fpstate->xfd)
+ return false;
+
+ /*
+ * XRSTOR(S) from init_fpstate are always correct as it will just
+ * bring all components into init state and not read from the
+ * buffer. XSAVE(S) raises #PF after init.
+ */
+ if (fpstate == &init_fpstate)
+ return rstor;
+
+ /*
+ * XSAVE(S): clone(), fpu_swap_kvm_fpu()
+ * XRSTORS(S): fpu_swap_kvm_fpu()
+ */
+
+ /*
+ * No XSAVE/XRSTOR instructions (except XSAVE itself) touch
+ * the buffer area for XFD-disabled state components.
+ */
+ mask &= ~xfd;
+
+ /*
+ * Remove features which are valid in fpstate. They
+ * have space allocated in fpstate.
+ */
+ mask &= ~fpstate->xfeatures;
+
+ /*
+ * Any remaining state components in 'mask' might be written
+ * by XSAVE/XRSTOR. Fail validation it found.
+ */
+ return !mask;
+}
+
+void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor)
+{
+ WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor));
+}
+#endif /* CONFIG_X86_DEBUG_FPU */
+
+static int __init xfd_update_static_branch(void)
+{
+ /*
+ * If init_fpstate.xfd has bits set then dynamic features are
+ * available and the dynamic sizing must be enabled.
+ */
+ if (init_fpstate.xfd)
+ static_branch_enable(&__fpu_state_size_dynamic);
+ return 0;
+}
+arch_initcall(xfd_update_static_branch)
+
+void fpstate_free(struct fpu *fpu)
+{
+ if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate)
+ vfree(fpu->fpstate);
+}
+
+/**
+ * fpstate_realloc - Reallocate struct fpstate for the requested new features
+ *
+ * @xfeatures: A bitmap of xstate features which extend the enabled features
+ * of that task
+ * @ksize: The required size for the kernel buffer
+ * @usize: The required size for user space buffers
+ * @guest_fpu: Pointer to a guest FPU container. NULL for host allocations
+ *
+ * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer
+ * terminates quickly, vfree()-induced IPIs may be a concern, but tasks
+ * with large states are likely to live longer.
+ *
+ * Returns: 0 on success, -ENOMEM on allocation error.
+ */
+static int fpstate_realloc(u64 xfeatures, unsigned int ksize,
+ unsigned int usize, struct fpu_guest *guest_fpu)
+{
+ struct fpu *fpu = &current->thread.fpu;
+ struct fpstate *curfps, *newfps = NULL;
+ unsigned int fpsize;
+ bool in_use;
+
+ fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64);
+
+ newfps = vzalloc(fpsize);
+ if (!newfps)
+ return -ENOMEM;
+ newfps->size = ksize;
+ newfps->user_size = usize;
+ newfps->is_valloc = true;
+
+ /*
+ * When a guest FPU is supplied, use @guest_fpu->fpstate
+ * as reference independent whether it is in use or not.
+ */
+ curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate;
+
+ /* Determine whether @curfps is the active fpstate */
+ in_use = fpu->fpstate == curfps;
+
+ if (guest_fpu) {
+ newfps->is_guest = true;
+ newfps->is_confidential = curfps->is_confidential;
+ newfps->in_use = curfps->in_use;
+ guest_fpu->xfeatures |= xfeatures;
+ guest_fpu->uabi_size = usize;
+ }
+
+ fpregs_lock();
+ /*
+ * If @curfps is in use, ensure that the current state is in the
+ * registers before swapping fpstate as that might invalidate it
+ * due to layout changes.
+ */
+ if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD))
+ fpregs_restore_userregs();
+
+ newfps->xfeatures = curfps->xfeatures | xfeatures;
+ newfps->user_xfeatures = curfps->user_xfeatures | xfeatures;
+ newfps->xfd = curfps->xfd & ~xfeatures;
+
+ /* Do the final updates within the locked region */
+ xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures);
+
+ if (guest_fpu) {
+ guest_fpu->fpstate = newfps;
+ /* If curfps is active, update the FPU fpstate pointer */
+ if (in_use)
+ fpu->fpstate = newfps;
+ } else {
+ fpu->fpstate = newfps;
+ }
+
+ if (in_use)
+ xfd_update_state(fpu->fpstate);
+ fpregs_unlock();
+
+ /* Only free valloc'ed state */
+ if (curfps && curfps->is_valloc)
+ vfree(curfps);
+
+ return 0;
+}
+
+static int validate_sigaltstack(unsigned int usize)
+{
+ struct task_struct *thread, *leader = current->group_leader;
+ unsigned long framesize = get_sigframe_size();
+
+ lockdep_assert_held(&current->sighand->siglock);
+
+ /* get_sigframe_size() is based on fpu_user_cfg.max_size */
+ framesize -= fpu_user_cfg.max_size;
+ framesize += usize;
+ for_each_thread(leader, thread) {
+ if (thread->sas_ss_size && thread->sas_ss_size < framesize)
+ return -ENOSPC;
+ }
+ return 0;
+}
+
+static int __xstate_request_perm(u64 permitted, u64 requested, bool guest)
+{
+ /*
+ * This deliberately does not exclude !XSAVES as we still might
+ * decide to optionally context switch XCR0 or talk the silicon
+ * vendors into extending XFD for the pre AMX states, especially
+ * AVX512.
+ */
+ bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES);
+ struct fpu *fpu = &current->group_leader->thread.fpu;
+ struct fpu_state_perm *perm;
+ unsigned int ksize, usize;
+ u64 mask;
+ int ret = 0;
+
+ /* Check whether fully enabled */
+ if ((permitted & requested) == requested)
+ return 0;
+
+ /* Calculate the resulting kernel state size */
+ mask = permitted | requested;
+ ksize = xstate_calculate_size(mask, compacted);
+
+ /* Calculate the resulting user state size */
+ mask &= XFEATURE_MASK_USER_SUPPORTED;
+ usize = xstate_calculate_size(mask, false);
+
+ if (!guest) {
+ ret = validate_sigaltstack(usize);
+ if (ret)
+ return ret;
+ }
+
+ perm = guest ? &fpu->guest_perm : &fpu->perm;
+ /* Pairs with the READ_ONCE() in xstate_get_group_perm() */
+ WRITE_ONCE(perm->__state_perm, requested);
+ /* Protected by sighand lock */
+ perm->__state_size = ksize;
+ perm->__user_state_size = usize;
+ return ret;
+}
+
+/*
+ * Permissions array to map facilities with more than one component
+ */
+static const u64 xstate_prctl_req[XFEATURE_MAX] = {
+ [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA,
+};
+
+static int xstate_request_perm(unsigned long idx, bool guest)
+{
+ u64 permitted, requested;
+ int ret;
+
+ if (idx >= XFEATURE_MAX)
+ return -EINVAL;
+
+ /*
+ * Look up the facility mask which can require more than
+ * one xstate component.
+ */
+ idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req));
+ requested = xstate_prctl_req[idx];
+ if (!requested)
+ return -EOPNOTSUPP;
+
+ if ((fpu_user_cfg.max_features & requested) != requested)
+ return -EOPNOTSUPP;
+
+ /* Lockless quick check */
+ permitted = xstate_get_group_perm(guest);
+ if ((permitted & requested) == requested)
+ return 0;
+
+ /* Protect against concurrent modifications */
+ spin_lock_irq(&current->sighand->siglock);
+ permitted = xstate_get_group_perm(guest);
+
+ /* First vCPU allocation locks the permissions. */
+ if (guest && (permitted & FPU_GUEST_PERM_LOCKED))
+ ret = -EBUSY;
+ else
+ ret = __xstate_request_perm(permitted, requested, guest);
+ spin_unlock_irq(&current->sighand->siglock);
+ return ret;
+}
+
+int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu)
+{
+ u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC;
+ struct fpu_state_perm *perm;
+ unsigned int ksize, usize;
+ struct fpu *fpu;
+
+ if (!xfd_event) {
+ if (!guest_fpu)
+ pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err);
+ return 0;
+ }
+
+ /* Protect against concurrent modifications */
+ spin_lock_irq(&current->sighand->siglock);
+
+ /* If not permitted let it die */
+ if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) {
+ spin_unlock_irq(&current->sighand->siglock);
+ return -EPERM;
+ }
+
+ fpu = &current->group_leader->thread.fpu;
+ perm = guest_fpu ? &fpu->guest_perm : &fpu->perm;
+ ksize = perm->__state_size;
+ usize = perm->__user_state_size;
+
+ /*
+ * The feature is permitted. State size is sufficient. Dropping
+ * the lock is safe here even if more features are added from
+ * another task, the retrieved buffer sizes are valid for the
+ * currently requested feature(s).
+ */
+ spin_unlock_irq(&current->sighand->siglock);
+
+ /*
+ * Try to allocate a new fpstate. If that fails there is no way
+ * out.
+ */
+ if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu))
+ return -EFAULT;
+ return 0;
+}
+
+int xfd_enable_feature(u64 xfd_err)
+{
+ return __xfd_enable_feature(xfd_err, NULL);
+}
+
+#else /* CONFIG_X86_64 */
+static inline int xstate_request_perm(unsigned long idx, bool guest)
+{
+ return -EPERM;
+}
+#endif /* !CONFIG_X86_64 */
+
+u64 xstate_get_guest_group_perm(void)
+{
+ return xstate_get_group_perm(true);
+}
+EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm);
+
+/**
+ * fpu_xstate_prctl - xstate permission operations
+ * @tsk: Redundant pointer to current
+ * @option: A subfunction of arch_prctl()
+ * @arg2: option argument
+ * Return: 0 if successful; otherwise, an error code
+ *
+ * Option arguments:
+ *
+ * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info
+ * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info
+ * ARCH_REQ_XCOMP_PERM: Facility number requested
+ *
+ * For facilities which require more than one XSTATE component, the request
+ * must be the highest state component number related to that facility,
+ * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and
+ * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18).
+ */
+long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2)
+{
+ u64 __user *uptr = (u64 __user *)arg2;
+ u64 permitted, supported;
+ unsigned long idx = arg2;
+ bool guest = false;
+
+ if (tsk != current)
+ return -EPERM;
+
+ switch (option) {
+ case ARCH_GET_XCOMP_SUPP:
+ supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features;
+ return put_user(supported, uptr);
+
+ case ARCH_GET_XCOMP_PERM:
+ /*
+ * Lockless snapshot as it can also change right after the
+ * dropping the lock.
+ */
+ permitted = xstate_get_host_group_perm();
+ permitted &= XFEATURE_MASK_USER_SUPPORTED;
+ return put_user(permitted, uptr);
+
+ case ARCH_GET_XCOMP_GUEST_PERM:
+ permitted = xstate_get_guest_group_perm();
+ permitted &= XFEATURE_MASK_USER_SUPPORTED;
+ return put_user(permitted, uptr);
+
+ case ARCH_REQ_XCOMP_GUEST_PERM:
+ guest = true;
+ fallthrough;
+
+ case ARCH_REQ_XCOMP_PERM:
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return -EOPNOTSUPP;
+
+ return xstate_request_perm(idx, guest);
+
+ default:
+ return -EINVAL;
+ }
+}
+
#ifdef CONFIG_PROC_PID_ARCH_STATUS
/*
* Report the amount of time elapsed in millisecond since last AVX512
diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h
new file mode 100644
index 000000000000..d22ace092ca2
--- /dev/null
+++ b/arch/x86/kernel/fpu/xstate.h
@@ -0,0 +1,322 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __X86_KERNEL_FPU_XSTATE_H
+#define __X86_KERNEL_FPU_XSTATE_H
+
+#include <asm/cpufeature.h>
+#include <asm/fpu/xstate.h>
+#include <asm/fpu/xcr.h>
+
+#ifdef CONFIG_X86_64
+DECLARE_PER_CPU(u64, xfd_state);
+#endif
+
+static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask)
+{
+ /*
+ * XRSTORS requires these bits set in xcomp_bv, or it will
+ * trigger #GP:
+ */
+ if (cpu_feature_enabled(X86_FEATURE_XSAVES))
+ xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT;
+}
+
+static inline u64 xstate_get_group_perm(bool guest)
+{
+ struct fpu *fpu = &current->group_leader->thread.fpu;
+ struct fpu_state_perm *perm;
+
+ /* Pairs with WRITE_ONCE() in xstate_request_perm() */
+ perm = guest ? &fpu->guest_perm : &fpu->perm;
+ return READ_ONCE(perm->__state_perm);
+}
+
+static inline u64 xstate_get_host_group_perm(void)
+{
+ return xstate_get_group_perm(false);
+}
+
+enum xstate_copy_mode {
+ XSTATE_COPY_FP,
+ XSTATE_COPY_FX,
+ XSTATE_COPY_XSAVE,
+};
+
+struct membuf;
+extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
+ u32 pkru_val, enum xstate_copy_mode copy_mode);
+extern void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
+ enum xstate_copy_mode mode);
+extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf);
+extern int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate, const void __user *ubuf);
+
+
+extern void fpu__init_cpu_xstate(void);
+extern void fpu__init_system_xstate(unsigned int legacy_size);
+
+extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr);
+
+static inline u64 xfeatures_mask_supervisor(void)
+{
+ return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED;
+}
+
+static inline u64 xfeatures_mask_independent(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR))
+ return XFEATURE_MASK_INDEPENDENT & ~XFEATURE_MASK_LBR;
+
+ return XFEATURE_MASK_INDEPENDENT;
+}
+
+/* XSAVE/XRSTOR wrapper functions */
+
+#ifdef CONFIG_X86_64
+#define REX_PREFIX "0x48, "
+#else
+#define REX_PREFIX
+#endif
+
+/* These macros all use (%edi)/(%rdi) as the single memory argument. */
+#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27"
+#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37"
+#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f"
+#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f"
+#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f"
+
+/*
+ * After this @err contains 0 on success or the trap number when the
+ * operation raises an exception.
+ */
+#define XSTATE_OP(op, st, lmask, hmask, err) \
+ asm volatile("1:" op "\n\t" \
+ "xor %[err], %[err]\n" \
+ "2:\n\t" \
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \
+ : [err] "=a" (err) \
+ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
+ : "memory")
+
+/*
+ * If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact
+ * format and supervisor states in addition to modified optimization in
+ * XSAVEOPT.
+ *
+ * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT
+ * supports modified optimization which is not supported by XSAVE.
+ *
+ * We use XSAVE as a fallback.
+ *
+ * The 661 label is defined in the ALTERNATIVE* macros as the address of the
+ * original instruction which gets replaced. We need to use it here as the
+ * address of the instruction where we might get an exception at.
+ */
+#define XSTATE_XSAVE(st, lmask, hmask, err) \
+ asm volatile(ALTERNATIVE_2(XSAVE, \
+ XSAVEOPT, X86_FEATURE_XSAVEOPT, \
+ XSAVES, X86_FEATURE_XSAVES) \
+ "\n" \
+ "xor %[err], %[err]\n" \
+ "3:\n" \
+ _ASM_EXTABLE_TYPE_REG(661b, 3b, EX_TYPE_EFAULT_REG, %[err]) \
+ : [err] "=r" (err) \
+ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
+ : "memory")
+
+/*
+ * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact
+ * XSAVE area format.
+ */
+#define XSTATE_XRESTORE(st, lmask, hmask) \
+ asm volatile(ALTERNATIVE(XRSTOR, \
+ XRSTORS, X86_FEATURE_XSAVES) \
+ "\n" \
+ "3:\n" \
+ _ASM_EXTABLE_TYPE(661b, 3b, EX_TYPE_FPU_RESTORE) \
+ : \
+ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
+ : "memory")
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_X86_DEBUG_FPU)
+extern void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor);
+#else
+static inline void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) { }
+#endif
+
+#ifdef CONFIG_X86_64
+static inline void xfd_update_state(struct fpstate *fpstate)
+{
+ if (fpu_state_size_dynamic()) {
+ u64 xfd = fpstate->xfd;
+
+ if (__this_cpu_read(xfd_state) != xfd) {
+ wrmsrl(MSR_IA32_XFD, xfd);
+ __this_cpu_write(xfd_state, xfd);
+ }
+ }
+}
+
+extern int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu);
+#else
+static inline void xfd_update_state(struct fpstate *fpstate) { }
+
+static inline int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu) {
+ return -EPERM;
+}
+#endif
+
+/*
+ * Save processor xstate to xsave area.
+ *
+ * Uses either XSAVE or XSAVEOPT or XSAVES depending on the CPU features
+ * and command line options. The choice is permanent until the next reboot.
+ */
+static inline void os_xsave(struct fpstate *fpstate)
+{
+ u64 mask = fpstate->xfeatures;
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+ int err;
+
+ WARN_ON_FPU(!alternatives_patched);
+ xfd_validate_state(fpstate, mask, false);
+
+ XSTATE_XSAVE(&fpstate->regs.xsave, lmask, hmask, err);
+
+ /* We should never fault when copying to a kernel buffer: */
+ WARN_ON_FPU(err);
+}
+
+/*
+ * Restore processor xstate from xsave area.
+ *
+ * Uses XRSTORS when XSAVES is used, XRSTOR otherwise.
+ */
+static inline void os_xrstor(struct fpstate *fpstate, u64 mask)
+{
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+
+ xfd_validate_state(fpstate, mask, true);
+ XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask);
+}
+
+/* Restore of supervisor state. Does not require XFD */
+static inline void os_xrstor_supervisor(struct fpstate *fpstate)
+{
+ u64 mask = xfeatures_mask_supervisor();
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+
+ XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask);
+}
+
+/*
+ * XSAVE itself always writes all requested xfeatures. Removing features
+ * from the request bitmap reduces the features which are written.
+ * Generate a mask of features which must be written to a sigframe. The
+ * unset features can be optimized away and not written.
+ *
+ * This optimization is user-visible. Only use for states where
+ * uninitialized sigframe contents are tolerable, like dynamic features.
+ *
+ * Users of buffers produced with this optimization must check XSTATE_BV
+ * to determine which features have been optimized out.
+ */
+static inline u64 xfeatures_need_sigframe_write(void)
+{
+ u64 xfeaures_to_write;
+
+ /* In-use features must be written: */
+ xfeaures_to_write = xfeatures_in_use();
+
+ /* Also write all non-optimizable sigframe features: */
+ xfeaures_to_write |= XFEATURE_MASK_USER_SUPPORTED &
+ ~XFEATURE_MASK_SIGFRAME_INITOPT;
+
+ return xfeaures_to_write;
+}
+
+/*
+ * Save xstate to user space xsave area.
+ *
+ * We don't use modified optimization because xrstor/xrstors might track
+ * a different application.
+ *
+ * We don't use compacted format xsave area for backward compatibility for
+ * old applications which don't understand the compacted format of the
+ * xsave area.
+ *
+ * The caller has to zero buf::header before calling this because XSAVE*
+ * does not touch the reserved fields in the header.
+ */
+static inline int xsave_to_user_sigframe(struct xregs_state __user *buf)
+{
+ /*
+ * Include the features which are not xsaved/rstored by the kernel
+ * internally, e.g. PKRU. That's user space ABI and also required
+ * to allow the signal handler to modify PKRU.
+ */
+ struct fpstate *fpstate = current->thread.fpu.fpstate;
+ u64 mask = fpstate->user_xfeatures;
+ u32 lmask;
+ u32 hmask;
+ int err;
+
+ /* Optimize away writing unnecessary xfeatures: */
+ if (fpu_state_size_dynamic())
+ mask &= xfeatures_need_sigframe_write();
+
+ lmask = mask;
+ hmask = mask >> 32;
+ xfd_validate_state(fpstate, mask, false);
+
+ stac();
+ XSTATE_OP(XSAVE, buf, lmask, hmask, err);
+ clac();
+
+ return err;
+}
+
+/*
+ * Restore xstate from user space xsave area.
+ */
+static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 mask)
+{
+ struct xregs_state *xstate = ((__force struct xregs_state *)buf);
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+ int err;
+
+ xfd_validate_state(current->thread.fpu.fpstate, mask, true);
+
+ stac();
+ XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
+ clac();
+
+ return err;
+}
+
+/*
+ * Restore xstate from kernel space xsave area, return an error code instead of
+ * an exception.
+ */
+static inline int os_xrstor_safe(struct fpstate *fpstate, u64 mask)
+{
+ struct xregs_state *xstate = &fpstate->regs.xsave;
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+ int err;
+
+ /* Ensure that XFD is up to date */
+ xfd_update_state(fpstate);
+
+ if (cpu_feature_enabled(X86_FEATURE_XSAVES))
+ XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
+ else
+ XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
+
+ return err;
+}
+
+
+#endif
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 1b3ce3b4a2a2..7cc540e6de0c 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -252,11 +252,6 @@ void arch_ftrace_update_code(int command)
ftrace_modify_all_code(command);
}
-int __init ftrace_dyn_arch_init(void)
-{
- return 0;
-}
-
/* Currently only x86_64 supports dynamic trampolines */
#ifdef CONFIG_X86_64
@@ -308,7 +303,7 @@ union ftrace_op_code_union {
} __attribute__((packed));
};
-#define RET_SIZE 1
+#define RET_SIZE 1 + IS_ENABLED(CONFIG_SLS)
static unsigned long
create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
@@ -527,7 +522,7 @@ static void *addr_from_call(void *ptr)
return ptr + CALL_INSN_SIZE + call.disp;
}
-void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
+void prepare_ftrace_return(unsigned long ip, unsigned long *parent,
unsigned long frame_pointer);
/*
@@ -541,7 +536,8 @@ static void *static_tramp_func(struct ftrace_ops *ops, struct dyn_ftrace *rec)
void *ptr;
if (ops && ops->trampoline) {
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+#if !defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS) && \
+ defined(CONFIG_FUNCTION_GRAPH_TRACER)
/*
* We only know about function graph tracer setting as static
* trampoline.
@@ -589,8 +585,9 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
#ifdef CONFIG_DYNAMIC_FTRACE
-extern void ftrace_graph_call(void);
+#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
+extern void ftrace_graph_call(void);
static const char *ftrace_jmp_replace(unsigned long ip, unsigned long addr)
{
return text_gen_insn(JMP32_INSN_OPCODE, (void *)ip, (void *)addr);
@@ -618,19 +615,28 @@ int ftrace_disable_ftrace_graph_caller(void)
return ftrace_mod_jmp(ip, &ftrace_stub);
}
+#else /* !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */
+int ftrace_enable_ftrace_graph_caller(void)
+{
+ return 0;
+}
+int ftrace_disable_ftrace_graph_caller(void)
+{
+ return 0;
+}
+#endif /* CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */
#endif /* !CONFIG_DYNAMIC_FTRACE */
/*
* Hook the return address and push it in the stack of return addrs
* in current thread info.
*/
-void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
+void prepare_ftrace_return(unsigned long ip, unsigned long *parent,
unsigned long frame_pointer)
{
unsigned long return_hooker = (unsigned long)&return_to_handler;
- unsigned long old;
- int faulted;
+ int bit;
/*
* When resuming from suspend-to-ram, this function can be indirectly
@@ -650,37 +656,25 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
if (unlikely(atomic_read(&current->tracing_graph_pause)))
return;
- /*
- * Protect against fault, even if it shouldn't
- * happen. This tool is too much intrusive to
- * ignore such a protection.
- */
- asm volatile(
- "1: " _ASM_MOV " (%[parent]), %[old]\n"
- "2: " _ASM_MOV " %[return_hooker], (%[parent])\n"
- " movl $0, %[faulted]\n"
- "3:\n"
-
- ".section .fixup, \"ax\"\n"
- "4: movl $1, %[faulted]\n"
- " jmp 3b\n"
- ".previous\n"
-
- _ASM_EXTABLE(1b, 4b)
- _ASM_EXTABLE(2b, 4b)
-
- : [old] "=&r" (old), [faulted] "=r" (faulted)
- : [parent] "r" (parent), [return_hooker] "r" (return_hooker)
- : "memory"
- );
-
- if (unlikely(faulted)) {
- ftrace_graph_stop();
- WARN_ON(1);
+ bit = ftrace_test_recursion_trylock(ip, *parent);
+ if (bit < 0)
return;
- }
- if (function_graph_enter(old, self_addr, frame_pointer, parent))
- *parent = old;
+ if (!function_graph_enter(*parent, ip, frame_pointer, parent))
+ *parent = return_hooker;
+
+ ftrace_test_recursion_unlock(bit);
}
+
+#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
+void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct ftrace_regs *fregs)
+{
+ struct pt_regs *regs = &fregs->regs;
+ unsigned long *stack = (unsigned long *)kernel_stack_pointer(regs);
+
+ prepare_ftrace_return(ip, (unsigned long *)stack, 0);
+}
+#endif
+
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S
index e405fe1a8bf4..a0ed0e4a2c0c 100644
--- a/arch/x86/kernel/ftrace_32.S
+++ b/arch/x86/kernel/ftrace_32.S
@@ -19,7 +19,7 @@
#endif
SYM_FUNC_START(__fentry__)
- ret
+ RET
SYM_FUNC_END(__fentry__)
EXPORT_SYMBOL(__fentry__)
@@ -84,7 +84,7 @@ ftrace_graph_call:
/* This is weak to keep gas from relaxing the jumps */
SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK)
- ret
+ RET
SYM_CODE_END(ftrace_caller)
SYM_CODE_START(ftrace_regs_caller)
@@ -177,7 +177,7 @@ SYM_CODE_START(ftrace_graph_caller)
popl %edx
popl %ecx
popl %eax
- ret
+ RET
SYM_CODE_END(ftrace_graph_caller)
.globl return_to_handler
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 7c273846c687..11ac028e30e4 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -132,7 +132,7 @@
#ifdef CONFIG_DYNAMIC_FTRACE
SYM_FUNC_START(__fentry__)
- retq
+ RET
SYM_FUNC_END(__fentry__)
EXPORT_SYMBOL(__fentry__)
@@ -174,18 +174,13 @@ SYM_INNER_LABEL(ftrace_caller_end, SYM_L_GLOBAL)
SYM_FUNC_END(ftrace_caller);
SYM_FUNC_START(ftrace_epilogue)
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL)
- jmp ftrace_stub
-#endif
-
/*
* This is weak to keep gas from relaxing the jumps.
- * It is also used to copy the retq for trampolines.
+ * It is also used to copy the RET for trampolines.
*/
SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK)
UNWIND_HINT_FUNC
- retq
+ RET
SYM_FUNC_END(ftrace_epilogue)
SYM_FUNC_START(ftrace_regs_caller)
@@ -251,7 +246,6 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
* If ORIG_RAX is anything but zero, make this a call to that.
* See arch_ftrace_set_direct_caller().
*/
- movq ORIG_RAX(%rsp), %rax
testq %rax, %rax
SYM_INNER_LABEL(ftrace_regs_caller_jmp, SYM_L_GLOBAL)
jnz 1f
@@ -289,17 +283,8 @@ SYM_FUNC_START(__fentry__)
cmpq $ftrace_stub, ftrace_trace_function
jnz trace
-fgraph_trace:
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- cmpq $ftrace_stub, ftrace_graph_return
- jnz ftrace_graph_caller
-
- cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
- jnz ftrace_graph_caller
-#endif
-
SYM_INNER_LABEL(ftrace_stub, SYM_L_GLOBAL)
- retq
+ RET
trace:
/* save_mcount_regs fills in first two parameters */
@@ -315,25 +300,12 @@ trace:
CALL_NOSPEC r8
restore_mcount_regs
- jmp fgraph_trace
+ jmp ftrace_stub
SYM_FUNC_END(__fentry__)
EXPORT_SYMBOL(__fentry__)
#endif /* CONFIG_DYNAMIC_FTRACE */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-SYM_FUNC_START(ftrace_graph_caller)
- /* Saves rbp into %rdx and fills first parameter */
- save_mcount_regs
-
- leaq MCOUNT_REG_SIZE+8(%rsp), %rsi
- movq $0, %rdx /* No framepointers needed */
- call prepare_ftrace_return
-
- restore_mcount_regs
-
- retq
-SYM_FUNC_END(ftrace_graph_caller)
-
SYM_FUNC_START(return_to_handler)
subq $24, %rsp
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index de01903c3735..de563db9cdcd 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -19,7 +19,7 @@
#include <linux/start_kernel.h>
#include <linux/io.h>
#include <linux/memblock.h>
-#include <linux/mem_encrypt.h>
+#include <linux/cc_platform.h>
#include <linux/pgtable.h>
#include <asm/processor.h>
@@ -126,6 +126,36 @@ static bool __head check_la57_support(unsigned long physaddr)
}
#endif
+static unsigned long sme_postprocess_startup(struct boot_params *bp, pmdval_t *pmd)
+{
+ unsigned long vaddr, vaddr_end;
+ int i;
+
+ /* Encrypt the kernel and related (if SME is active) */
+ sme_encrypt_kernel(bp);
+
+ /*
+ * Clear the memory encryption mask from the .bss..decrypted section.
+ * The bss section will be memset to zero later in the initialization so
+ * there is no need to zero it after changing the memory encryption
+ * attribute.
+ */
+ if (sme_get_me_mask()) {
+ vaddr = (unsigned long)__start_bss_decrypted;
+ vaddr_end = (unsigned long)__end_bss_decrypted;
+ for (; vaddr < vaddr_end; vaddr += PMD_SIZE) {
+ i = pmd_index(vaddr);
+ pmd[i] -= sme_get_me_mask();
+ }
+ }
+
+ /*
+ * Return the SME encryption mask (if SME is active) to be used as a
+ * modifier for the initial pgdir entry programmed into CR3.
+ */
+ return sme_get_me_mask();
+}
+
/* Code in __startup_64() can be relocated during execution, but the compiler
* doesn't have to generate PC-relative relocations when accessing globals from
* that function. Clang actually does not generate them, which leads to
@@ -135,7 +165,6 @@ static bool __head check_la57_support(unsigned long physaddr)
unsigned long __head __startup_64(unsigned long physaddr,
struct boot_params *bp)
{
- unsigned long vaddr, vaddr_end;
unsigned long load_delta, *p;
unsigned long pgtable_flags;
pgdval_t *pgd;
@@ -276,29 +305,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
*/
*fixup_long(&phys_base, physaddr) += load_delta - sme_get_me_mask();
- /* Encrypt the kernel and related (if SME is active) */
- sme_encrypt_kernel(bp);
-
- /*
- * Clear the memory encryption mask from the .bss..decrypted section.
- * The bss section will be memset to zero later in the initialization so
- * there is no need to zero it after changing the memory encryption
- * attribute.
- */
- if (mem_encrypt_active()) {
- vaddr = (unsigned long)__start_bss_decrypted;
- vaddr_end = (unsigned long)__end_bss_decrypted;
- for (; vaddr < vaddr_end; vaddr += PMD_SIZE) {
- i = pmd_index(vaddr);
- pmd[i] -= sme_get_me_mask();
- }
- }
-
- /*
- * Return the SME encryption mask (if SME is active) to be used as a
- * modifier for the initial pgdir entry programmed into CR3.
- */
- return sme_get_me_mask();
+ return sme_postprocess_startup(bp, pmd);
}
unsigned long __startup_secondary_64(void)
@@ -480,6 +487,10 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
clear_bss();
+ /*
+ * This needs to happen *before* kasan_early_init() because latter maps stuff
+ * into that page.
+ */
clear_page(init_top_pgt);
/*
@@ -491,6 +502,16 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
kasan_early_init();
+ /*
+ * Flush global TLB entries which could be left over from the trampoline page
+ * table.
+ *
+ * This needs to happen *after* kasan_early_init() as KASAN-enabled .configs
+ * instrument native_write_cr4() so KASAN must be initialized for that
+ * instrumentation to work.
+ */
+ __native_tlb_flush_global(this_cpu_read(cpu_tlbstate.cr4));
+
idt_setup_early_handler();
copy_bootdata(__va(real_mode_data));
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index d8c64dab0efe..eb8656bac99b 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -340,7 +340,7 @@ SYM_FUNC_END(startup_32_smp)
__INIT
setup_once:
andl $0,setup_once_ref /* Once is enough, thanks */
- ret
+ RET
SYM_FUNC_START(early_idt_handler_array)
# 36(%esp) %eflags
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index d8b3ebd2bb85..9c63fc5988cd 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -166,9 +166,26 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
call sev_verify_cbit
popq %rsi
- /* Switch to new page-table */
+ /*
+ * Switch to new page-table
+ *
+ * For the boot CPU this switches to early_top_pgt which still has the
+ * indentity mappings present. The secondary CPUs will switch to the
+ * init_top_pgt here, away from the trampoline_pgd and unmap the
+ * indentity mapped ranges.
+ */
movq %rax, %cr3
+ /*
+ * Do a global TLB flush after the CR3 switch to make sure the TLB
+ * entries from the identity mapping are flushed.
+ */
+ movq %cr4, %rcx
+ movq %rcx, %rax
+ xorq $X86_CR4_PGE, %rcx
+ movq %rcx, %cr4
+ movq %rax, %cr4
+
/* Ensure I am executing from virtual addresses */
movq $1f, %rax
ANNOTATE_RETPOLINE_SAFE
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 42fc41dd0e1f..882213df3713 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -10,6 +10,7 @@
#include <asm/irq_remapping.h>
#include <asm/hpet.h>
#include <asm/time.h>
+#include <asm/mwait.h>
#undef pr_fmt
#define pr_fmt(fmt) "hpet: " fmt
@@ -916,6 +917,83 @@ static bool __init hpet_counting(void)
return false;
}
+static bool __init mwait_pc10_supported(void)
+{
+ unsigned int eax, ebx, ecx, mwait_substates;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return false;
+
+ if (!cpu_feature_enabled(X86_FEATURE_MWAIT))
+ return false;
+
+ if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
+ return false;
+
+ cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates);
+
+ return (ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) &&
+ (ecx & CPUID5_ECX_INTERRUPT_BREAK) &&
+ (mwait_substates & (0xF << 28));
+}
+
+/*
+ * Check whether the system supports PC10. If so force disable HPET as that
+ * stops counting in PC10. This check is overbroad as it does not take any
+ * of the following into account:
+ *
+ * - ACPI tables
+ * - Enablement of intel_idle
+ * - Command line arguments which limit intel_idle C-state support
+ *
+ * That's perfectly fine. HPET is a piece of hardware designed by committee
+ * and the only reasons why it is still in use on modern systems is the
+ * fact that it is impossible to reliably query TSC and CPU frequency via
+ * CPUID or firmware.
+ *
+ * If HPET is functional it is useful for calibrating TSC, but this can be
+ * done via PMTIMER as well which seems to be the last remaining timer on
+ * X86/INTEL platforms that has not been completely wreckaged by feature
+ * creep.
+ *
+ * In theory HPET support should be removed altogether, but there are older
+ * systems out there which depend on it because TSC and APIC timer are
+ * dysfunctional in deeper C-states.
+ *
+ * It's only 20 years now that hardware people have been asked to provide
+ * reliable and discoverable facilities which can be used for timekeeping
+ * and per CPU timer interrupts.
+ *
+ * The probability that this problem is going to be solved in the
+ * forseeable future is close to zero, so the kernel has to be cluttered
+ * with heuristics to keep up with the ever growing amount of hardware and
+ * firmware trainwrecks. Hopefully some day hardware people will understand
+ * that the approach of "This can be fixed in software" is not sustainable.
+ * Hope dies last...
+ */
+static bool __init hpet_is_pc10_damaged(void)
+{
+ unsigned long long pcfg;
+
+ /* Check whether PC10 substates are supported */
+ if (!mwait_pc10_supported())
+ return false;
+
+ /* Check whether PC10 is enabled in PKG C-state limit */
+ rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, pcfg);
+ if ((pcfg & 0xF) < 8)
+ return false;
+
+ if (hpet_force_user) {
+ pr_warn("HPET force enabled via command line, but dysfunctional in PC10.\n");
+ return false;
+ }
+
+ pr_info("HPET dysfunctional in PC10. Force disabled.\n");
+ boot_hpet_disable = true;
+ return true;
+}
+
/**
* hpet_enable - Try to setup the HPET timer. Returns 1 on success.
*/
@@ -929,6 +1007,9 @@ int __init hpet_enable(void)
if (!is_hpet_capable())
return 0;
+ if (hpet_is_pc10_damaged())
+ return 0;
+
hpet_set_mapping();
if (!hpet_virt_address)
return 0;
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index e28f6a5d14f1..766ffe3ba313 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -291,8 +291,10 @@ void kvm_set_posted_intr_wakeup_handler(void (*handler)(void))
{
if (handler)
kvm_posted_intr_wakeup_handler = handler;
- else
+ else {
kvm_posted_intr_wakeup_handler = dummy_handler;
+ synchronize_rcu();
+ }
}
EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler);
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 044902d5a3c4..e5dd6da78713 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -132,6 +132,7 @@ int irq_init_percpu_irqstack(unsigned int cpu)
return 0;
}
+#ifndef CONFIG_PREEMPT_RT
void do_softirq_own_stack(void)
{
struct irq_stack *irqstk;
@@ -148,6 +149,7 @@ void do_softirq_own_stack(void)
call_on_stack(__do_softirq, isp);
}
+#endif
void __handle_irq(struct irq_desc *desc, struct pt_regs *regs)
{
diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S
index 8ef35063964b..aaf9e776f323 100644
--- a/arch/x86/kernel/irqflags.S
+++ b/arch/x86/kernel/irqflags.S
@@ -7,9 +7,11 @@
/*
* unsigned long native_save_fl(void)
*/
+.pushsection .noinstr.text, "ax"
SYM_FUNC_START(native_save_fl)
pushf
pop %_ASM_AX
- ret
+ RET
SYM_FUNC_END(native_save_fl)
+.popsection
EXPORT_SYMBOL(native_save_fl)
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
index 1afbdd1dd777..9ff480e94511 100644
--- a/arch/x86/kernel/itmt.c
+++ b/arch/x86/kernel/itmt.c
@@ -198,7 +198,7 @@ void sched_set_itmt_core_prio(int prio, int core_cpu)
* of the priority chain and only used when
* all other high priority cpus are out of capacity.
*/
- smt_prio = prio * smp_num_siblings / i;
+ smt_prio = prio * smp_num_siblings / (i * i);
per_cpu(sched_core_priority, cpu) = smt_prio;
i++;
}
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index b6e046e4b289..6290712cb36d 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -809,7 +809,7 @@ void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
ri->fp = sara;
/* Replace the return addr with trampoline addr */
- *sara = (unsigned long) &kretprobe_trampoline;
+ *sara = (unsigned long) &__kretprobe_trampoline;
}
NOKPROBE_SYMBOL(arch_prepare_kretprobe);
@@ -1019,52 +1019,91 @@ NOKPROBE_SYMBOL(kprobe_int3_handler);
*/
asm(
".text\n"
- ".global kretprobe_trampoline\n"
- ".type kretprobe_trampoline, @function\n"
- "kretprobe_trampoline:\n"
- /* We don't bother saving the ss register */
+ ".global __kretprobe_trampoline\n"
+ ".type __kretprobe_trampoline, @function\n"
+ "__kretprobe_trampoline:\n"
#ifdef CONFIG_X86_64
+ /* Push a fake return address to tell the unwinder it's a kretprobe. */
+ " pushq $__kretprobe_trampoline\n"
+ UNWIND_HINT_FUNC
+ /* Save the 'sp - 8', this will be fixed later. */
" pushq %rsp\n"
" pushfq\n"
SAVE_REGS_STRING
" movq %rsp, %rdi\n"
" call trampoline_handler\n"
- /* Replace saved sp with true return address. */
- " movq %rax, 19*8(%rsp)\n"
RESTORE_REGS_STRING
+ /* In trampoline_handler(), 'regs->flags' is copied to 'regs->sp'. */
+ " addq $8, %rsp\n"
" popfq\n"
#else
+ /* Push a fake return address to tell the unwinder it's a kretprobe. */
+ " pushl $__kretprobe_trampoline\n"
+ UNWIND_HINT_FUNC
+ /* Save the 'sp - 4', this will be fixed later. */
" pushl %esp\n"
" pushfl\n"
SAVE_REGS_STRING
" movl %esp, %eax\n"
" call trampoline_handler\n"
- /* Replace saved sp with true return address. */
- " movl %eax, 15*4(%esp)\n"
RESTORE_REGS_STRING
+ /* In trampoline_handler(), 'regs->flags' is copied to 'regs->sp'. */
+ " addl $4, %esp\n"
" popfl\n"
#endif
- " ret\n"
- ".size kretprobe_trampoline, .-kretprobe_trampoline\n"
+ ASM_RET
+ ".size __kretprobe_trampoline, .-__kretprobe_trampoline\n"
);
-NOKPROBE_SYMBOL(kretprobe_trampoline);
-STACK_FRAME_NON_STANDARD(kretprobe_trampoline);
+NOKPROBE_SYMBOL(__kretprobe_trampoline);
+/*
+ * __kretprobe_trampoline() skips updating frame pointer. The frame pointer
+ * saved in trampoline_handler() points to the real caller function's
+ * frame pointer. Thus the __kretprobe_trampoline() doesn't have a
+ * standard stack frame with CONFIG_FRAME_POINTER=y.
+ * Let's mark it non-standard function. Anyway, FP unwinder can correctly
+ * unwind without the hint.
+ */
+STACK_FRAME_NON_STANDARD_FP(__kretprobe_trampoline);
+
+/* This is called from kretprobe_trampoline_handler(). */
+void arch_kretprobe_fixup_return(struct pt_regs *regs,
+ kprobe_opcode_t *correct_ret_addr)
+{
+ unsigned long *frame_pointer = &regs->sp + 1;
+ /* Replace fake return address with real one. */
+ *frame_pointer = (unsigned long)correct_ret_addr;
+}
/*
- * Called from kretprobe_trampoline
+ * Called from __kretprobe_trampoline
*/
-__used __visible void *trampoline_handler(struct pt_regs *regs)
+__used __visible void trampoline_handler(struct pt_regs *regs)
{
+ unsigned long *frame_pointer;
+
/* fixup registers */
regs->cs = __KERNEL_CS;
#ifdef CONFIG_X86_32
regs->gs = 0;
#endif
- regs->ip = (unsigned long)&kretprobe_trampoline;
+ regs->ip = (unsigned long)&__kretprobe_trampoline;
regs->orig_ax = ~0UL;
+ regs->sp += sizeof(long);
+ frame_pointer = &regs->sp + 1;
- return (void *)kretprobe_trampoline_handler(regs, &kretprobe_trampoline, &regs->sp);
+ /*
+ * The return address at 'frame_pointer' is recovered by the
+ * arch_kretprobe_fixup_return() which called from the
+ * kretprobe_trampoline_handler().
+ */
+ kretprobe_trampoline_handler(regs, frame_pointer);
+
+ /*
+ * Copy FLAGS to 'pt_regs::sp' so that __kretprobe_trapmoline()
+ * can do RET right after POPF.
+ */
+ regs->sp = regs->flags;
}
NOKPROBE_SYMBOL(trampoline_handler);
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 596de2f6d3a5..dd2ec14adb77 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -25,7 +25,6 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
if (bit < 0)
return;
- preempt_disable_notrace();
p = get_kprobe((kprobe_opcode_t *)ip);
if (unlikely(!p) || kprobe_disabled(p))
goto out;
@@ -59,7 +58,6 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
__this_cpu_write(current_kprobe, NULL);
}
out:
- preempt_enable_notrace();
ftrace_test_recursion_unlock(bit);
}
NOKPROBE_SYMBOL(kprobe_ftrace_handler);
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 71425ebba98a..b4a54a52aa59 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -367,10 +367,10 @@ int arch_check_optimized_kprobe(struct optimized_kprobe *op)
/* Check the addr is within the optimized instructions. */
int arch_within_optimized_kprobe(struct optimized_kprobe *op,
- unsigned long addr)
+ kprobe_opcode_t *addr)
{
- return ((unsigned long)op->kp.addr <= addr &&
- (unsigned long)op->kp.addr + op->optinsn.size > addr);
+ return (op->kp.addr <= addr &&
+ op->kp.addr + op->optinsn.size > addr);
}
/* Free optimized instruction slot */
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index b656456c3a94..a438217cbfac 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -27,6 +27,8 @@
#include <linux/nmi.h>
#include <linux/swait.h>
#include <linux/syscore_ops.h>
+#include <linux/cc_platform.h>
+#include <linux/efi.h>
#include <asm/timer.h>
#include <asm/cpu.h>
#include <asm/traps.h>
@@ -40,6 +42,7 @@
#include <asm/ptrace.h>
#include <asm/reboot.h>
#include <asm/svm.h>
+#include <asm/e820/api.h>
DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
@@ -310,7 +313,7 @@ static void kvm_register_steal_time(void)
return;
wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
- pr_info("stealtime: cpu %d, msr %llx\n", cpu,
+ pr_debug("stealtime: cpu %d, msr %llx\n", cpu,
(unsigned long long) slow_virt_to_phys(st));
}
@@ -347,7 +350,7 @@ static void kvm_guest_cpu_init(void)
wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
__this_cpu_write(apf_reason.enabled, 1);
- pr_info("setup async PF for cpu %d\n", smp_processor_id());
+ pr_debug("setup async PF for cpu %d\n", smp_processor_id());
}
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
@@ -373,7 +376,7 @@ static void kvm_pv_disable_apf(void)
wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
__this_cpu_write(apf_reason.enabled, 0);
- pr_info("disable async PF for cpu %d\n", smp_processor_id());
+ pr_debug("disable async PF for cpu %d\n", smp_processor_id());
}
static void kvm_disable_steal_time(void)
@@ -418,7 +421,7 @@ static void __init sev_map_percpu_data(void)
{
int cpu;
- if (!sev_active())
+ if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
return;
for_each_possible_cpu(cpu) {
@@ -433,6 +436,8 @@ static void kvm_guest_cpu_offline(bool shutdown)
kvm_disable_steal_time();
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
wrmsrl(MSR_KVM_PV_EOI_EN, 0);
+ if (kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL))
+ wrmsrl(MSR_KVM_MIGRATION_CONTROL, 0);
kvm_pv_disable_apf();
if (!shutdown)
apf_task_wake_all();
@@ -547,6 +552,55 @@ static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector)
__send_ipi_mask(local_mask, vector);
}
+static int __init setup_efi_kvm_sev_migration(void)
+{
+ efi_char16_t efi_sev_live_migration_enabled[] = L"SevLiveMigrationEnabled";
+ efi_guid_t efi_variable_guid = AMD_SEV_MEM_ENCRYPT_GUID;
+ efi_status_t status;
+ unsigned long size;
+ bool enabled;
+
+ if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) ||
+ !kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL))
+ return 0;
+
+ if (!efi_enabled(EFI_BOOT))
+ return 0;
+
+ if (!efi_enabled(EFI_RUNTIME_SERVICES)) {
+ pr_info("%s : EFI runtime services are not enabled\n", __func__);
+ return 0;
+ }
+
+ size = sizeof(enabled);
+
+ /* Get variable contents into buffer */
+ status = efi.get_variable(efi_sev_live_migration_enabled,
+ &efi_variable_guid, NULL, &size, &enabled);
+
+ if (status == EFI_NOT_FOUND) {
+ pr_info("%s : EFI live migration variable not found\n", __func__);
+ return 0;
+ }
+
+ if (status != EFI_SUCCESS) {
+ pr_info("%s : EFI variable retrieval failed\n", __func__);
+ return 0;
+ }
+
+ if (enabled == 0) {
+ pr_info("%s: live migration disabled in EFI\n", __func__);
+ return 0;
+ }
+
+ pr_info("%s : live migration enabled in EFI\n", __func__);
+ wrmsrl(MSR_KVM_MIGRATION_CONTROL, KVM_MIGRATION_READY);
+
+ return 1;
+}
+
+late_initcall(setup_efi_kvm_sev_migration);
+
/*
* Set the IPI entry points
*/
@@ -755,7 +809,7 @@ static noinline uint32_t __kvm_cpuid_base(void)
return 0; /* So we don't blow up on old processors */
if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
- return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0);
+ return hypervisor_cpuid_base(KVM_SIGNATURE, 0);
return 0;
}
@@ -805,8 +859,62 @@ static bool __init kvm_msi_ext_dest_id(void)
return kvm_para_has_feature(KVM_FEATURE_MSI_EXT_DEST_ID);
}
+static void kvm_sev_hc_page_enc_status(unsigned long pfn, int npages, bool enc)
+{
+ kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, pfn << PAGE_SHIFT, npages,
+ KVM_MAP_GPA_RANGE_ENC_STAT(enc) | KVM_MAP_GPA_RANGE_PAGE_SZ_4K);
+}
+
static void __init kvm_init_platform(void)
{
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) &&
+ kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) {
+ unsigned long nr_pages;
+ int i;
+
+ pv_ops.mmu.notify_page_enc_status_changed =
+ kvm_sev_hc_page_enc_status;
+
+ /*
+ * Reset the host's shared pages list related to kernel
+ * specific page encryption status settings before we load a
+ * new kernel by kexec. Reset the page encryption status
+ * during early boot intead of just before kexec to avoid SMP
+ * races during kvm_pv_guest_cpu_reboot().
+ * NOTE: We cannot reset the complete shared pages list
+ * here as we need to retain the UEFI/OVMF firmware
+ * specific settings.
+ */
+
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ struct e820_entry *entry = &e820_table->entries[i];
+
+ if (entry->type != E820_TYPE_RAM)
+ continue;
+
+ nr_pages = DIV_ROUND_UP(entry->size, PAGE_SIZE);
+
+ kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, entry->addr,
+ nr_pages,
+ KVM_MAP_GPA_RANGE_ENCRYPTED | KVM_MAP_GPA_RANGE_PAGE_SZ_4K);
+ }
+
+ /*
+ * Ensure that _bss_decrypted section is marked as decrypted in the
+ * shared pages list.
+ */
+ nr_pages = DIV_ROUND_UP(__end_bss_decrypted - __start_bss_decrypted,
+ PAGE_SIZE);
+ early_set_mem_enc_dec_hypercall((unsigned long)__start_bss_decrypted,
+ nr_pages, 0);
+
+ /*
+ * If not booted using EFI, enable Live migration support.
+ */
+ if (!efi_enabled(EFI_BOOT))
+ wrmsrl(MSR_KVM_MIGRATION_CONTROL,
+ KVM_MIGRATION_READY);
+ }
kvmclock_init();
x86_platform.apic_post_init = kvm_apic_init;
}
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 73c74b961d0f..a35cbf9107af 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -16,9 +16,9 @@
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/set_memory.h>
+#include <linux/cc_platform.h>
#include <asm/hypervisor.h>
-#include <asm/mem_encrypt.h>
#include <asm/x86_init.h>
#include <asm/kvmclock.h>
@@ -174,7 +174,7 @@ static void kvm_register_clock(char *txt)
pa = slow_virt_to_phys(&src->pvti) | 0x01ULL;
wrmsrl(msr_kvm_system_time, pa);
- pr_info("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt);
+ pr_debug("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt);
}
static void kvm_save_sched_clock_state(void)
@@ -223,7 +223,7 @@ static void __init kvmclock_init_mem(void)
* hvclock is shared between the guest and the hypervisor, must
* be mapped decrypted.
*/
- if (sev_active()) {
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
r = set_memory_decrypted((unsigned long) hvclock_mem,
1UL << order);
if (r) {
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 131f30fdcfbd..f5da4a18070a 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -17,6 +17,7 @@
#include <linux/suspend.h>
#include <linux/vmalloc.h>
#include <linux/efi.h>
+#include <linux/cc_platform.h>
#include <asm/init.h>
#include <asm/tlbflush.h>
@@ -166,7 +167,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
}
pte = pte_offset_kernel(pmd, vaddr);
- if (sev_active())
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
prot = PAGE_KERNEL_EXEC;
set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
@@ -206,7 +207,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
level4p = (pgd_t *)__va(start_pgtable);
clear_page(level4p);
- if (sev_active()) {
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
info.page_flag |= _PAGE_ENC;
info.kernpg_flag |= _PAGE_ENC;
}
@@ -358,7 +359,7 @@ void machine_kexec(struct kimage *image)
(unsigned long)page_list,
image->start,
image->preserve_context,
- sme_active());
+ cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT));
#ifdef CONFIG_KEXEC_JUMP
if (image->preserve_context)
@@ -569,12 +570,12 @@ void arch_kexec_unprotect_crashkres(void)
*/
int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
{
- if (sev_active())
+ if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
return 0;
/*
- * If SME is active we need to be sure that kexec pages are
- * not encrypted because when we boot to the new kernel the
+ * If host memory encryption is active we need to be sure that kexec
+ * pages are not encrypted because when we boot to the new kernel the
* pages won't be accessed encrypted (initially).
*/
return set_memory_decrypted((unsigned long)vaddr, pages);
@@ -582,12 +583,12 @@ int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
{
- if (sev_active())
+ if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
return;
/*
- * If SME is active we need to reset the pages back to being
- * an encrypted mapping before freeing them.
+ * If host memory encryption is active we need to reset the pages back
+ * to being an encrypted mapping before freeing them.
*/
set_memory_encrypted((unsigned long)vaddr, pages);
}
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 5e9a34b5bd74..95fa745e310a 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -67,6 +67,7 @@ static unsigned long int get_module_load_offset(void)
void *module_alloc(unsigned long size)
{
+ gfp_t gfp_mask = GFP_KERNEL;
void *p;
if (PAGE_ALIGN(size) > MODULES_LEN)
@@ -74,10 +75,10 @@ void *module_alloc(unsigned long size)
p = __vmalloc_node_range(size, MODULE_ALIGN,
MODULES_VADDR + get_module_load_offset(),
- MODULES_END, GFP_KERNEL,
- PAGE_KERNEL, 0, NUMA_NO_NODE,
+ MODULES_END, gfp_mask,
+ PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE,
__builtin_return_address(0));
- if (p && (kasan_module_alloc(p, size) < 0)) {
+ if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) {
vfree(p);
return NULL;
}
@@ -251,7 +252,8 @@ int module_finalize(const Elf_Ehdr *hdr,
struct module *me)
{
const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
- *para = NULL, *orc = NULL, *orc_ip = NULL;
+ *para = NULL, *orc = NULL, *orc_ip = NULL,
+ *retpolines = NULL;
char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
@@ -267,8 +269,14 @@ int module_finalize(const Elf_Ehdr *hdr,
orc = s;
if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name))
orc_ip = s;
+ if (!strcmp(".retpoline_sites", secstrings + s->sh_name))
+ retpolines = s;
}
+ if (retpolines) {
+ void *rseg = (void *)retpolines->sh_addr;
+ apply_retpolines(rseg, rseg + retpolines->sh_size);
+ }
if (alt) {
/* patch .altinstructions */
void *aseg = (void *)alt->sh_addr;
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 04cafc057bed..4420499f7bb4 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -41,11 +41,22 @@ extern void _paravirt_nop(void);
asm (".pushsection .entry.text, \"ax\"\n"
".global _paravirt_nop\n"
"_paravirt_nop:\n\t"
- "ret\n\t"
+ ASM_RET
".size _paravirt_nop, . - _paravirt_nop\n\t"
".type _paravirt_nop, @function\n\t"
".popsection");
+/* stub always returning 0. */
+asm (".pushsection .entry.text, \"ax\"\n"
+ ".global paravirt_ret0\n"
+ "paravirt_ret0:\n\t"
+ "xor %" _ASM_AX ", %" _ASM_AX ";\n\t"
+ ASM_RET
+ ".size paravirt_ret0, . - paravirt_ret0\n\t"
+ ".type paravirt_ret0, @function\n\t"
+ ".popsection");
+
+
void __init default_banner(void)
{
printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
@@ -53,7 +64,7 @@ void __init default_banner(void)
}
/* Undefined instruction for dealing with missing ops pointers. */
-static void paravirt_BUG(void)
+noinstr void paravirt_BUG(void)
{
BUG();
}
@@ -218,6 +229,36 @@ void paravirt_end_context_switch(struct task_struct *next)
if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES))
arch_enter_lazy_mmu_mode();
}
+
+static noinstr unsigned long pv_native_read_cr2(void)
+{
+ return native_read_cr2();
+}
+
+static noinstr void pv_native_write_cr2(unsigned long val)
+{
+ native_write_cr2(val);
+}
+
+static noinstr unsigned long pv_native_get_debugreg(int regno)
+{
+ return native_get_debugreg(regno);
+}
+
+static noinstr void pv_native_set_debugreg(int regno, unsigned long val)
+{
+ native_set_debugreg(regno, val);
+}
+
+static noinstr void pv_native_irq_enable(void)
+{
+ native_irq_enable();
+}
+
+static noinstr void pv_native_irq_disable(void)
+{
+ native_irq_disable();
+}
#endif
enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
@@ -244,8 +285,8 @@ struct paravirt_patch_template pv_ops = {
#ifdef CONFIG_PARAVIRT_XXL
.cpu.cpuid = native_cpuid,
- .cpu.get_debugreg = native_get_debugreg,
- .cpu.set_debugreg = native_set_debugreg,
+ .cpu.get_debugreg = pv_native_get_debugreg,
+ .cpu.set_debugreg = pv_native_set_debugreg,
.cpu.read_cr0 = native_read_cr0,
.cpu.write_cr0 = native_write_cr0,
.cpu.write_cr4 = native_write_cr4,
@@ -281,8 +322,8 @@ struct paravirt_patch_template pv_ops = {
/* Irq ops. */
.irq.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
- .irq.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
- .irq.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
+ .irq.irq_disable = __PV_IS_CALLEE_SAVE(pv_native_irq_disable),
+ .irq.irq_enable = __PV_IS_CALLEE_SAVE(pv_native_irq_enable),
.irq.safe_halt = native_safe_halt,
.irq.halt = native_halt,
#endif /* CONFIG_PARAVIRT_XXL */
@@ -296,10 +337,11 @@ struct paravirt_patch_template pv_ops = {
(void (*)(struct mmu_gather *, void *))tlb_remove_page,
.mmu.exit_mmap = paravirt_nop,
+ .mmu.notify_page_enc_status_changed = paravirt_nop,
#ifdef CONFIG_PARAVIRT_XXL
- .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(native_read_cr2),
- .mmu.write_cr2 = native_write_cr2,
+ .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(pv_native_read_cr2),
+ .mmu.write_cr2 = pv_native_write_cr2,
.mmu.read_cr3 = __native_read_cr3,
.mmu.write_cr3 = native_write_cr3,
@@ -371,9 +413,6 @@ struct paravirt_patch_template pv_ops = {
};
#ifdef CONFIG_PARAVIRT_XXL
-/* At this point, native_get/set_debugreg has real function entries */
-NOKPROBE_SYMBOL(native_get_debugreg);
-NOKPROBE_SYMBOL(native_set_debugreg);
NOKPROBE_SYMBOL(native_load_idt);
void (*paravirt_iret)(void) = native_iret;
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index c2cfa5e7c152..814ab46a0dad 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -6,7 +6,7 @@
#include <linux/swiotlb.h>
#include <linux/memblock.h>
#include <linux/dma-direct.h>
-#include <linux/mem_encrypt.h>
+#include <linux/cc_platform.h>
#include <asm/iommu.h>
#include <asm/swiotlb.h>
@@ -45,11 +45,10 @@ int __init pci_swiotlb_detect_4gb(void)
swiotlb = 1;
/*
- * If SME is active then swiotlb will be set to 1 so that bounce
- * buffers are allocated and used for devices that do not support
- * the addressing range required for the encryption mask.
+ * Set swiotlb to 1 so that bounce buffers are allocated and used for
+ * devices that can't support DMA to encrypted memory.
*/
- if (sme_active())
+ if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
swiotlb = 1;
return swiotlb;
diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c
index 9e1def3744f2..36e84d904260 100644
--- a/arch/x86/kernel/probe_roms.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -80,7 +80,7 @@ static struct resource video_rom_resource = {
*/
static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short device)
{
- struct pci_driver *drv = pdev->driver;
+ struct pci_driver *drv = to_pci_driver(pdev->dev.driver);
const struct pci_device_id *id;
if (pdev->vendor == vendor && pdev->device == device)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 1d9463e3096b..81d8ef036637 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -30,7 +30,9 @@
#include <asm/apic.h>
#include <linux/uaccess.h>
#include <asm/mwait.h>
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
+#include <asm/fpu/sched.h>
+#include <asm/fpu/xstate.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>
#include <asm/tlbflush.h>
@@ -43,6 +45,7 @@
#include <asm/io_bitmap.h>
#include <asm/proto.h>
#include <asm/frame.h>
+#include <asm/unwind.h>
#include "process.h"
@@ -87,9 +90,20 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
#ifdef CONFIG_VM86
dst->thread.vm86 = NULL;
#endif
- return fpu_clone(dst);
+ /* Drop the copied pointer to current's fpstate */
+ dst->thread.fpu.fpstate = NULL;
+
+ return 0;
}
+#ifdef CONFIG_X86_64
+void arch_release_task_struct(struct task_struct *tsk)
+{
+ if (fpu_state_size_dynamic())
+ fpstate_free(&tsk->thread.fpu);
+}
+#endif
+
/*
* Free thread data structures etc..
*/
@@ -132,6 +146,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
frame->ret_addr = (unsigned long) ret_from_fork;
p->thread.sp = (unsigned long) fork_frame;
p->thread.io_bitmap = NULL;
+ p->thread.iopl_warn = 0;
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
#ifdef CONFIG_X86_64
@@ -154,6 +169,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
frame->flags = X86_EFLAGS_FIXED;
#endif
+ fpu_clone(p, clone_flags);
+
/* Kernel thread ? */
if (unlikely(p->flags & PF_KTHREAD)) {
p->thread.pkru = pkru_get_init_value();
@@ -348,7 +365,7 @@ void arch_setup_new_exec(void)
clear_thread_flag(TIF_SSBD);
task_clear_spec_ssb_disable(current);
task_clear_spec_ssb_noexec(current);
- speculation_ctrl_update(task_thread_info(current)->flags);
+ speculation_ctrl_update(read_thread_flags());
}
}
@@ -600,7 +617,7 @@ static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk)
clear_tsk_thread_flag(tsk, TIF_SPEC_IB);
}
/* Return the updated threadinfo flags*/
- return task_thread_info(tsk)->flags;
+ return read_task_thread_flags(tsk);
}
void speculation_ctrl_update(unsigned long tif)
@@ -636,8 +653,8 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
{
unsigned long tifp, tifn;
- tifn = READ_ONCE(task_thread_info(next_p)->flags);
- tifp = READ_ONCE(task_thread_info(prev_p)->flags);
+ tifn = read_task_thread_flags(next_p);
+ tifp = read_task_thread_flags(prev_p);
switch_to_bitmap(tifp);
@@ -942,70 +959,43 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
* because the task might wake up and we might look at a stack
* changing under us.
*/
-unsigned long get_wchan(struct task_struct *p)
+unsigned long __get_wchan(struct task_struct *p)
{
- unsigned long start, bottom, top, sp, fp, ip, ret = 0;
- int count = 0;
-
- if (p == current || task_is_running(p))
- return 0;
+ struct unwind_state state;
+ unsigned long addr = 0;
if (!try_get_task_stack(p))
return 0;
- start = (unsigned long)task_stack_page(p);
- if (!start)
- goto out;
-
- /*
- * Layout of the stack page:
- *
- * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long)
- * PADDING
- * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING
- * stack
- * ----------- bottom = start
- *
- * The tasks stack pointer points at the location where the
- * framepointer is stored. The data on the stack is:
- * ... IP FP ... IP FP
- *
- * We need to read FP and IP, so we need to adjust the upper
- * bound by another unsigned long.
- */
- top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
- top -= 2 * sizeof(unsigned long);
- bottom = start;
-
- sp = READ_ONCE(p->thread.sp);
- if (sp < bottom || sp > top)
- goto out;
-
- fp = READ_ONCE_NOCHECK(((struct inactive_task_frame *)sp)->bp);
- do {
- if (fp < bottom || fp > top)
- goto out;
- ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long)));
- if (!in_sched_functions(ip)) {
- ret = ip;
- goto out;
- }
- fp = READ_ONCE_NOCHECK(*(unsigned long *)fp);
- } while (count++ < 16 && !task_is_running(p));
+ for (unwind_start(&state, p, NULL, NULL); !unwind_done(&state);
+ unwind_next_frame(&state)) {
+ addr = unwind_get_return_address(&state);
+ if (!addr)
+ break;
+ if (in_sched_functions(addr))
+ continue;
+ break;
+ }
-out:
put_task_stack(p);
- return ret;
+
+ return addr;
}
long do_arch_prctl_common(struct task_struct *task, int option,
- unsigned long cpuid_enabled)
+ unsigned long arg2)
{
switch (option) {
case ARCH_GET_CPUID:
return get_cpuid_mode();
case ARCH_SET_CPUID:
- return set_cpuid_mode(task, cpuid_enabled);
+ return set_cpuid_mode(task, arg2);
+ case ARCH_GET_XCOMP_SUPP:
+ case ARCH_GET_XCOMP_PERM:
+ case ARCH_REQ_XCOMP_PERM:
+ case ARCH_GET_XCOMP_GUEST_PERM:
+ case ARCH_REQ_XCOMP_GUEST_PERM:
+ return fpu_xstate_prctl(task, option, arg2);
}
return -EINVAL;
diff --git a/arch/x86/kernel/process.h b/arch/x86/kernel/process.h
index 1d0797b2338a..76b547b83232 100644
--- a/arch/x86/kernel/process.h
+++ b/arch/x86/kernel/process.h
@@ -13,8 +13,8 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p);
static inline void switch_to_extra(struct task_struct *prev,
struct task_struct *next)
{
- unsigned long next_tif = task_thread_info(next)->flags;
- unsigned long prev_tif = task_thread_info(prev)->flags;
+ unsigned long next_tif = read_task_thread_flags(next);
+ unsigned long prev_tif = read_task_thread_flags(prev);
if (IS_ENABLED(CONFIG_SMP)) {
/*
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 4f2f54e1281c..26edb1cd07a4 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -41,7 +41,7 @@
#include <asm/ldt.h>
#include <asm/processor.h>
-#include <asm/fpu/internal.h>
+#include <asm/fpu/sched.h>
#include <asm/desc.h>
#include <linux/err.h>
@@ -160,7 +160,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct thread_struct *prev = &prev_p->thread,
*next = &next_p->thread;
struct fpu *prev_fpu = &prev->fpu;
- struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
@@ -213,7 +212,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
this_cpu_write(current_task, next_p);
- switch_fpu_finish(next_fpu);
+ switch_fpu_finish();
/* Load the Intel cache allocation PQR MSR. */
resctrl_sched_in();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ec0d836a13b1..3402edec236c 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -42,7 +42,7 @@
#include <asm/processor.h>
#include <asm/pkru.h>
-#include <asm/fpu/internal.h>
+#include <asm/fpu/sched.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
@@ -559,7 +559,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct thread_struct *prev = &prev_p->thread;
struct thread_struct *next = &next_p->thread;
struct fpu *prev_fpu = &prev->fpu;
- struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
@@ -620,7 +619,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
this_cpu_write(current_task, next_p);
this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
- switch_fpu_finish(next_fpu);
+ switch_fpu_finish();
/* Reload sp0. */
update_task_stack(next_p);
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 4c208ea3bd9f..6d2244c94799 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -29,9 +29,9 @@
#include <linux/uaccess.h>
#include <asm/processor.h>
-#include <asm/fpu/internal.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/regset.h>
+#include <asm/fpu/xstate.h>
#include <asm/debugreg.h>
#include <asm/ldt.h>
#include <asm/desc.h>
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 0a40df66a40d..fa700b46588e 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -113,17 +113,9 @@ void __noreturn machine_real_restart(unsigned int type)
spin_unlock(&rtc_lock);
/*
- * Switch back to the initial page table.
+ * Switch to the trampoline page table.
*/
-#ifdef CONFIG_X86_32
- load_cr3(initial_page_table);
-#else
- write_cr3(real_mode_header->trampoline_pgd);
-
- /* Exiting long mode will fail if CR4.PCIDE is set. */
- if (boot_cpu_has(X86_FEATURE_PCID))
- cr4_clear_bits(X86_CR4_PCIDE);
-#endif
+ load_trampoline_pgtable();
/* Jump to the identity-mapped low memory code */
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index f469153eca8a..fcc8a7699103 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -91,7 +91,7 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
movl %edi, %eax
addl $(identity_mapped - relocate_kernel), %eax
pushl %eax
- ret
+ RET
SYM_CODE_END(relocate_kernel)
SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
@@ -159,7 +159,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
xorl %edx, %edx
xorl %esi, %esi
xorl %ebp, %ebp
- ret
+ RET
1:
popl %edx
movl CP_PA_SWAP_PAGE(%edi), %esp
@@ -190,7 +190,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
movl %edi, %eax
addl $(virtual_mapped - relocate_kernel), %eax
pushl %eax
- ret
+ RET
SYM_CODE_END(identity_mapped)
SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
@@ -208,7 +208,7 @@ SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
popl %edi
popl %esi
popl %ebx
- ret
+ RET
SYM_CODE_END(virtual_mapped)
/* Do the copies */
@@ -271,7 +271,7 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
popl %edi
popl %ebx
popl %ebp
- ret
+ RET
SYM_CODE_END(swap_pages)
.globl kexec_control_code_size
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index c53271aebb64..399f075ccdc4 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -47,7 +47,7 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
* %rsi page_list
* %rdx start address
* %rcx preserve_context
- * %r8 sme_active
+ * %r8 host_mem_enc_active
*/
/* Save the CPU context, used for jumping back */
@@ -104,7 +104,7 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
/* jump to identity mapped page */
addq $(identity_mapped - relocate_kernel), %r8
pushq %r8
- ret
+ RET
SYM_CODE_END(relocate_kernel)
SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
@@ -191,7 +191,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
xorl %r14d, %r14d
xorl %r15d, %r15d
- ret
+ RET
1:
popq %rdx
@@ -210,7 +210,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
call swap_pages
movq $virtual_mapped, %rax
pushq %rax
- ret
+ RET
SYM_CODE_END(identity_mapped)
SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
@@ -231,7 +231,7 @@ SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
popq %r12
popq %rbp
popq %rbx
- ret
+ RET
SYM_CODE_END(virtual_mapped)
/* Do the copies */
@@ -288,7 +288,7 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
lea PAGE_SIZE(%rax), %rsi
jmp 0b
3:
- ret
+ RET
SYM_CODE_END(swap_pages)
.globl kexec_control_code_size
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 40ed44ead063..f7a132eb794d 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -40,6 +40,7 @@
#include <asm/kasan.h>
#include <asm/kaslr.h>
#include <asm/mce.h>
+#include <asm/memtype.h>
#include <asm/mtrr.h>
#include <asm/realmode.h>
#include <asm/olpc_ofw.h>
@@ -322,7 +323,7 @@ static void __init reserve_initrd(void)
relocate_initrd();
- memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
+ memblock_phys_free(ramdisk_image, ramdisk_end - ramdisk_image);
}
#else
@@ -521,7 +522,7 @@ static void __init reserve_crashkernel(void)
}
if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) {
- memblock_free(crash_base, crash_size);
+ memblock_phys_free(crash_base, crash_size);
return;
}
@@ -713,9 +714,6 @@ static void __init early_reserve_memory(void)
early_reserve_initrd();
- if (efi_enabled(EFI_BOOT))
- efi_memblock_x86_reserve_range();
-
memblock_x86_reserve_range_setup_data();
reserve_ibft_region();
@@ -890,6 +888,9 @@ void __init setup_arch(char **cmdline_p)
parse_early_param();
+ if (efi_enabled(EFI_BOOT))
+ efi_memblock_x86_reserve_range();
+
#ifdef CONFIG_MEMORY_HOTPLUG
/*
* Memory used by the kernel cannot be hot-removed because Linux
@@ -967,7 +968,11 @@ void __init setup_arch(char **cmdline_p)
max_pfn = e820__end_of_ram_pfn();
/* update e820 for memory not covered by WB MTRRs */
- mtrr_bp_init();
+ if (IS_ENABLED(CONFIG_MTRR))
+ mtrr_bp_init();
+ else
+ pat_disable("PAT support disabled because CONFIG_MTRR is disabled in the kernel.");
+
if (mtrr_trim_uncached_memory(max_pfn))
max_pfn = e820__end_of_ram_pfn();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 5afd98559193..7b65275544b2 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -135,7 +135,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
static void __init pcpu_fc_free(void *ptr, size_t size)
{
- memblock_free_ptr(ptr, size);
+ memblock_free(ptr, size);
}
static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c
index 9f90f460a28c..ce987688bbc0 100644
--- a/arch/x86/kernel/sev-shared.c
+++ b/arch/x86/kernel/sev-shared.c
@@ -64,7 +64,7 @@ static bool sev_es_negotiate_protocol(void)
static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb)
{
ghcb->save.sw_exit_code = 0;
- memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
+ __builtin_memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
}
static bool vc_decoding_needed(unsigned long exit_code)
@@ -94,25 +94,15 @@ static void vc_finish_insn(struct es_em_ctxt *ctxt)
ctxt->regs->ip += ctxt->insn.length;
}
-static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt,
- u64 exit_code, u64 exit_info_1,
- u64 exit_info_2)
+static enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
{
- enum es_result ret;
-
- /* Fill in protocol and format specifiers */
- ghcb->protocol_version = GHCB_PROTOCOL_MAX;
- ghcb->ghcb_usage = GHCB_DEFAULT_USAGE;
-
- ghcb_set_sw_exit_code(ghcb, exit_code);
- ghcb_set_sw_exit_info_1(ghcb, exit_info_1);
- ghcb_set_sw_exit_info_2(ghcb, exit_info_2);
+ u32 ret;
- sev_es_wr_ghcb_msr(__pa(ghcb));
- VMGEXIT();
+ ret = ghcb->save.sw_exit_info_1 & GENMASK_ULL(31, 0);
+ if (!ret)
+ return ES_OK;
- if ((ghcb->save.sw_exit_info_1 & 0xffffffff) == 1) {
+ if (ret == 1) {
u64 info = ghcb->save.sw_exit_info_2;
unsigned long v;
@@ -124,17 +114,40 @@ static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb,
((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) &&
((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) {
ctxt->fi.vector = v;
+
if (info & SVM_EVTINJ_VALID_ERR)
ctxt->fi.error_code = info >> 32;
- ret = ES_EXCEPTION;
- } else {
- ret = ES_VMM_ERROR;
+
+ return ES_EXCEPTION;
}
- } else {
- ret = ES_OK;
}
- return ret;
+ return ES_VMM_ERROR;
+}
+
+enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, bool set_ghcb_msr,
+ struct es_em_ctxt *ctxt, u64 exit_code,
+ u64 exit_info_1, u64 exit_info_2)
+{
+ /* Fill in protocol and format specifiers */
+ ghcb->protocol_version = GHCB_PROTOCOL_MAX;
+ ghcb->ghcb_usage = GHCB_DEFAULT_USAGE;
+
+ ghcb_set_sw_exit_code(ghcb, exit_code);
+ ghcb_set_sw_exit_info_1(ghcb, exit_info_1);
+ ghcb_set_sw_exit_info_2(ghcb, exit_info_2);
+
+ /*
+ * Hyper-V unenlightened guests use a paravisor for communicating and
+ * GHCB pages are being allocated and set up by that paravisor. Linux
+ * should not change the GHCB page's physical address.
+ */
+ if (set_ghcb_msr)
+ sev_es_wr_ghcb_msr(__pa(ghcb));
+
+ VMGEXIT();
+
+ return verify_exception_info(ghcb, ctxt);
}
/*
@@ -208,7 +221,7 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
fail:
/* Terminate the guest */
- sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
+ sev_es_terminate(GHCB_SEV_ES_GEN_REQ);
}
static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt,
@@ -411,7 +424,7 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
*/
sw_scratch = __pa(ghcb) + offsetof(struct ghcb, shared_buffer);
ghcb_set_sw_scratch(ghcb, sw_scratch);
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO,
+ ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_IOIO,
exit_info_1, exit_info_2);
if (ret != ES_OK)
return ret;
@@ -453,7 +466,8 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
ghcb_set_rax(ghcb, rax);
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, exit_info_1, 0);
+ ret = sev_es_ghcb_hv_call(ghcb, true, ctxt,
+ SVM_EXIT_IOIO, exit_info_1, 0);
if (ret != ES_OK)
return ret;
@@ -484,7 +498,7 @@ static enum es_result vc_handle_cpuid(struct ghcb *ghcb,
/* xgetbv will cause #GP - use reset value for xcr0 */
ghcb_set_xcr0(ghcb, 1);
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0);
+ ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_CPUID, 0, 0);
if (ret != ES_OK)
return ret;
@@ -509,7 +523,7 @@ static enum es_result vc_handle_rdtsc(struct ghcb *ghcb,
bool rdtscp = (exit_code == SVM_EXIT_RDTSCP);
enum es_result ret;
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0);
+ ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, exit_code, 0, 0);
if (ret != ES_OK)
return ret;
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index a6895e440bc3..e6d316a01fdd 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -11,7 +11,7 @@
#include <linux/sched/debug.h> /* For show_regs() */
#include <linux/percpu-defs.h>
-#include <linux/mem_encrypt.h>
+#include <linux/cc_platform.h>
#include <linux/printk.h>
#include <linux/mm_types.h>
#include <linux/set_memory.h>
@@ -23,9 +23,10 @@
#include <asm/stacktrace.h>
#include <asm/sev.h>
#include <asm/insn-eval.h>
-#include <asm/fpu/internal.h>
+#include <asm/fpu/xcr.h>
#include <asm/processor.h>
#include <asm/realmode.h>
+#include <asm/setup.h>
#include <asm/traps.h>
#include <asm/svm.h>
#include <asm/smp.h>
@@ -46,16 +47,6 @@ static struct ghcb __initdata *boot_ghcb;
struct sev_es_runtime_data {
struct ghcb ghcb_page;
- /* Physical storage for the per-CPU IST stack of the #VC handler */
- char ist_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE);
-
- /*
- * Physical storage for the per-CPU fall-back stack of the #VC handler.
- * The fall-back stack is used when it is not safe to switch back to the
- * interrupted stack in the #VC entry code.
- */
- char fallback_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE);
-
/*
* Reserve one page per CPU as backup storage for the unencrypted GHCB.
* It is needed when an NMI happens while the #VC handler uses the real
@@ -96,30 +87,6 @@ struct ghcb_state {
static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data);
DEFINE_STATIC_KEY_FALSE(sev_es_enable_key);
-/* Needed in vc_early_forward_exception */
-void do_early_exception(struct pt_regs *regs, int trapnr);
-
-static void __init setup_vc_stacks(int cpu)
-{
- struct sev_es_runtime_data *data;
- struct cpu_entry_area *cea;
- unsigned long vaddr;
- phys_addr_t pa;
-
- data = per_cpu(runtime_data, cpu);
- cea = get_cpu_entry_area(cpu);
-
- /* Map #VC IST stack */
- vaddr = CEA_ESTACK_BOT(&cea->estacks, VC);
- pa = __pa(data->ist_stack);
- cea_set_pte((void *)vaddr, pa, PAGE_KERNEL);
-
- /* Map VC fall-back stack */
- vaddr = CEA_ESTACK_BOT(&cea->estacks, VC2);
- pa = __pa(data->fallback_stack);
- cea_set_pte((void *)vaddr, pa, PAGE_KERNEL);
-}
-
static __always_inline bool on_vc_stack(struct pt_regs *regs)
{
unsigned long sp = regs->sp;
@@ -240,9 +207,6 @@ static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state)
return ghcb;
}
-/* Needed in vc_early_forward_exception */
-void do_early_exception(struct pt_regs *regs, int trapnr);
-
static inline u64 sev_es_rd_ghcb_msr(void)
{
return __rdmsr(MSR_AMD64_SEV_ES_GHCB);
@@ -325,11 +289,6 @@ static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
char *dst, char *buf, size_t size)
{
unsigned long error_code = X86_PF_PROT | X86_PF_WRITE;
- char __user *target = (char __user *)dst;
- u64 d8;
- u32 d4;
- u16 d2;
- u8 d1;
/*
* This function uses __put_user() independent of whether kernel or user
@@ -351,26 +310,42 @@ static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
* instructions here would cause infinite nesting.
*/
switch (size) {
- case 1:
+ case 1: {
+ u8 d1;
+ u8 __user *target = (u8 __user *)dst;
+
memcpy(&d1, buf, 1);
if (__put_user(d1, target))
goto fault;
break;
- case 2:
+ }
+ case 2: {
+ u16 d2;
+ u16 __user *target = (u16 __user *)dst;
+
memcpy(&d2, buf, 2);
if (__put_user(d2, target))
goto fault;
break;
- case 4:
+ }
+ case 4: {
+ u32 d4;
+ u32 __user *target = (u32 __user *)dst;
+
memcpy(&d4, buf, 4);
if (__put_user(d4, target))
goto fault;
break;
- case 8:
+ }
+ case 8: {
+ u64 d8;
+ u64 __user *target = (u64 __user *)dst;
+
memcpy(&d8, buf, 8);
if (__put_user(d8, target))
goto fault;
break;
+ }
default:
WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
return ES_UNSUPPORTED;
@@ -393,11 +368,6 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
char *src, char *buf, size_t size)
{
unsigned long error_code = X86_PF_PROT;
- char __user *s = (char __user *)src;
- u64 d8;
- u32 d4;
- u16 d2;
- u8 d1;
/*
* This function uses __get_user() independent of whether kernel or user
@@ -419,26 +389,41 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
* instructions here would cause infinite nesting.
*/
switch (size) {
- case 1:
+ case 1: {
+ u8 d1;
+ u8 __user *s = (u8 __user *)src;
+
if (__get_user(d1, s))
goto fault;
memcpy(buf, &d1, 1);
break;
- case 2:
+ }
+ case 2: {
+ u16 d2;
+ u16 __user *s = (u16 __user *)src;
+
if (__get_user(d2, s))
goto fault;
memcpy(buf, &d2, 2);
break;
- case 4:
+ }
+ case 4: {
+ u32 d4;
+ u32 __user *s = (u32 __user *)src;
+
if (__get_user(d4, s))
goto fault;
memcpy(buf, &d4, 4);
break;
- case 8:
+ }
+ case 8: {
+ u64 d8;
+ u64 __user *s = (u64 __user *)src;
if (__get_user(d8, s))
goto fault;
memcpy(buf, &d8, 8);
break;
+ }
default:
WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
return ES_UNSUPPORTED;
@@ -615,7 +600,7 @@ int __init sev_es_efi_map_ghcbs(pgd_t *pgd)
int cpu;
u64 pfn;
- if (!sev_es_active())
+ if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
return 0;
pflags = _PAGE_NX | _PAGE_RW;
@@ -648,7 +633,8 @@ static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
ghcb_set_rdx(ghcb, regs->dx);
}
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, exit_info_1, 0);
+ ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_MSR,
+ exit_info_1, 0);
if ((ret == ES_OK) && (!exit_info_1)) {
regs->ax = ghcb->save.rax;
@@ -774,7 +760,7 @@ void __init sev_es_init_vc_handling(void)
BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE);
- if (!sev_es_active())
+ if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
return;
if (!sev_es_check_cpu_features())
@@ -787,7 +773,6 @@ void __init sev_es_init_vc_handling(void)
for_each_possible_cpu(cpu) {
alloc_runtime_data(cpu);
init_ghcb(cpu);
- setup_vc_stacks(cpu);
}
sev_es_setup_play_dead();
@@ -807,22 +792,6 @@ static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt)
do_early_exception(ctxt->regs, trapnr);
}
-static long *vc_insn_get_reg(struct es_em_ctxt *ctxt)
-{
- long *reg_array;
- int offset;
-
- reg_array = (long *)ctxt->regs;
- offset = insn_get_modrm_reg_off(&ctxt->insn, ctxt->regs);
-
- if (offset < 0)
- return NULL;
-
- offset /= sizeof(long);
-
- return reg_array + offset;
-}
-
static long *vc_insn_get_rm(struct es_em_ctxt *ctxt)
{
long *reg_array;
@@ -867,77 +836,7 @@ static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer));
- return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2);
-}
-
-static enum es_result vc_handle_mmio_twobyte_ops(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt)
-{
- struct insn *insn = &ctxt->insn;
- unsigned int bytes = 0;
- enum es_result ret;
- int sign_byte;
- long *reg_data;
-
- switch (insn->opcode.bytes[1]) {
- /* MMIO Read w/ zero-extension */
- case 0xb6:
- bytes = 1;
- fallthrough;
- case 0xb7:
- if (!bytes)
- bytes = 2;
-
- ret = vc_do_mmio(ghcb, ctxt, bytes, true);
- if (ret)
- break;
-
- /* Zero extend based on operand size */
- reg_data = vc_insn_get_reg(ctxt);
- if (!reg_data)
- return ES_DECODE_FAILED;
-
- memset(reg_data, 0, insn->opnd_bytes);
-
- memcpy(reg_data, ghcb->shared_buffer, bytes);
- break;
-
- /* MMIO Read w/ sign-extension */
- case 0xbe:
- bytes = 1;
- fallthrough;
- case 0xbf:
- if (!bytes)
- bytes = 2;
-
- ret = vc_do_mmio(ghcb, ctxt, bytes, true);
- if (ret)
- break;
-
- /* Sign extend based on operand size */
- reg_data = vc_insn_get_reg(ctxt);
- if (!reg_data)
- return ES_DECODE_FAILED;
-
- if (bytes == 1) {
- u8 *val = (u8 *)ghcb->shared_buffer;
-
- sign_byte = (*val & 0x80) ? 0xff : 0x00;
- } else {
- u16 *val = (u16 *)ghcb->shared_buffer;
-
- sign_byte = (*val & 0x8000) ? 0xff : 0x00;
- }
- memset(reg_data, sign_byte, insn->opnd_bytes);
-
- memcpy(reg_data, ghcb->shared_buffer, bytes);
- break;
-
- default:
- ret = ES_UNSUPPORTED;
- }
-
- return ret;
+ return sev_es_ghcb_hv_call(ghcb, true, ctxt, exit_code, exit_info_1, exit_info_2);
}
/*
@@ -1007,83 +906,79 @@ static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt,
return ES_RETRY;
}
-static enum es_result vc_handle_mmio(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt)
+static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
{
struct insn *insn = &ctxt->insn;
unsigned int bytes = 0;
+ enum mmio_type mmio;
enum es_result ret;
+ u8 sign_byte;
long *reg_data;
- switch (insn->opcode.bytes[0]) {
- /* MMIO Write */
- case 0x88:
- bytes = 1;
- fallthrough;
- case 0x89:
- if (!bytes)
- bytes = insn->opnd_bytes;
+ mmio = insn_decode_mmio(insn, &bytes);
+ if (mmio == MMIO_DECODE_FAILED)
+ return ES_DECODE_FAILED;
- reg_data = vc_insn_get_reg(ctxt);
+ if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) {
+ reg_data = insn_get_modrm_reg_ptr(insn, ctxt->regs);
if (!reg_data)
return ES_DECODE_FAILED;
+ }
+ switch (mmio) {
+ case MMIO_WRITE:
memcpy(ghcb->shared_buffer, reg_data, bytes);
-
ret = vc_do_mmio(ghcb, ctxt, bytes, false);
break;
-
- case 0xc6:
- bytes = 1;
- fallthrough;
- case 0xc7:
- if (!bytes)
- bytes = insn->opnd_bytes;
-
+ case MMIO_WRITE_IMM:
memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes);
-
ret = vc_do_mmio(ghcb, ctxt, bytes, false);
break;
-
- /* MMIO Read */
- case 0x8a:
- bytes = 1;
- fallthrough;
- case 0x8b:
- if (!bytes)
- bytes = insn->opnd_bytes;
-
+ case MMIO_READ:
ret = vc_do_mmio(ghcb, ctxt, bytes, true);
if (ret)
break;
- reg_data = vc_insn_get_reg(ctxt);
- if (!reg_data)
- return ES_DECODE_FAILED;
-
/* Zero-extend for 32-bit operation */
if (bytes == 4)
*reg_data = 0;
memcpy(reg_data, ghcb->shared_buffer, bytes);
break;
+ case MMIO_READ_ZERO_EXTEND:
+ ret = vc_do_mmio(ghcb, ctxt, bytes, true);
+ if (ret)
+ break;
- /* MOVS instruction */
- case 0xa4:
- bytes = 1;
- fallthrough;
- case 0xa5:
- if (!bytes)
- bytes = insn->opnd_bytes;
+ /* Zero extend based on operand size */
+ memset(reg_data, 0, insn->opnd_bytes);
+ memcpy(reg_data, ghcb->shared_buffer, bytes);
+ break;
+ case MMIO_READ_SIGN_EXTEND:
+ ret = vc_do_mmio(ghcb, ctxt, bytes, true);
+ if (ret)
+ break;
- ret = vc_handle_mmio_movs(ctxt, bytes);
+ if (bytes == 1) {
+ u8 *val = (u8 *)ghcb->shared_buffer;
+
+ sign_byte = (*val & 0x80) ? 0xff : 0x00;
+ } else {
+ u16 *val = (u16 *)ghcb->shared_buffer;
+
+ sign_byte = (*val & 0x8000) ? 0xff : 0x00;
+ }
+
+ /* Sign extend based on operand size */
+ memset(reg_data, sign_byte, insn->opnd_bytes);
+ memcpy(reg_data, ghcb->shared_buffer, bytes);
break;
- /* Two-Byte Opcodes */
- case 0x0f:
- ret = vc_handle_mmio_twobyte_ops(ghcb, ctxt);
+ case MMIO_MOVS:
+ ret = vc_handle_mmio_movs(ctxt, bytes);
break;
default:
ret = ES_UNSUPPORTED;
+ break;
}
return ret;
@@ -1117,7 +1012,7 @@ static enum es_result vc_handle_dr7_write(struct ghcb *ghcb,
/* Using a value of 0 for ExitInfo1 means RAX holds the value */
ghcb_set_rax(ghcb, val);
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0);
+ ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_WRITE_DR7, 0, 0);
if (ret != ES_OK)
return ret;
@@ -1147,7 +1042,7 @@ static enum es_result vc_handle_dr7_read(struct ghcb *ghcb,
static enum es_result vc_handle_wbinvd(struct ghcb *ghcb,
struct es_em_ctxt *ctxt)
{
- return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0);
+ return sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_WBINVD, 0, 0);
}
static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
@@ -1156,7 +1051,7 @@ static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt
ghcb_set_rcx(ghcb, ctxt->regs->cx);
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0);
+ ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_RDPMC, 0, 0);
if (ret != ES_OK)
return ret;
@@ -1197,7 +1092,7 @@ static enum es_result vc_handle_vmmcall(struct ghcb *ghcb,
if (x86_platform.hyper.sev_es_hcall_prepare)
x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs);
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0);
+ ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_VMMCALL, 0, 0);
if (ret != ES_OK)
return ret;
@@ -1319,13 +1214,26 @@ static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt)
}
}
-static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs)
+static __always_inline bool is_vc2_stack(unsigned long sp)
{
- unsigned long sp = (unsigned long)regs;
-
return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2));
}
+static __always_inline bool vc_from_invalid_context(struct pt_regs *regs)
+{
+ unsigned long sp, prev_sp;
+
+ sp = (unsigned long)regs;
+ prev_sp = regs->sp;
+
+ /*
+ * If the code was already executing on the VC2 stack when the #VC
+ * happened, let it proceed to the normal handling routine. This way the
+ * code executing on the VC2 stack can cause #VC exceptions to get handled.
+ */
+ return is_vc2_stack(sp) && !is_vc2_stack(prev_sp);
+}
+
static bool vc_raw_handle_exception(struct pt_regs *regs, unsigned long error_code)
{
struct ghcb_state state;
@@ -1406,7 +1314,7 @@ DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication)
* But keep this here in case the noinstr annotations are violated due
* to bug elsewhere.
*/
- if (unlikely(on_vc_fallback_stack(regs))) {
+ if (unlikely(vc_from_invalid_context(regs))) {
instrumentation_begin();
panic("Can't handle #VC exception from unsupported context\n");
instrumentation_end();
@@ -1429,7 +1337,7 @@ DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication)
show_regs(regs);
/* Ask hypervisor to sev_es_terminate */
- sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
+ sev_es_terminate(GHCB_SEV_ES_GEN_REQ);
/* If that fails and we get here - just panic */
panic("Returned from Terminate-Request to Hypervisor\n");
@@ -1477,7 +1385,7 @@ bool __init handle_vc_boot_ghcb(struct pt_regs *regs)
/* Do initial setup or terminate the guest */
if (unlikely(boot_ghcb == NULL && !sev_es_setup_ghcb()))
- sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
+ sev_es_terminate(GHCB_SEV_ES_GEN_REQ);
vc_ghcb_invalidate(boot_ghcb);
diff --git a/arch/x86/kernel/sev_verify_cbit.S b/arch/x86/kernel/sev_verify_cbit.S
index ee04941a6546..3355e27c69eb 100644
--- a/arch/x86/kernel/sev_verify_cbit.S
+++ b/arch/x86/kernel/sev_verify_cbit.S
@@ -85,5 +85,5 @@ SYM_FUNC_START(sev_verify_cbit)
#endif
/* Return page-table pointer */
movq %rdi, %rax
- ret
+ RET
SYM_FUNC_END(sev_verify_cbit)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index f4d21e470083..ec71e06ae364 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -15,6 +15,7 @@
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/kernel.h>
+#include <linux/kstrtox.h>
#include <linux/errno.h>
#include <linux/wait.h>
#include <linux/tracehook.h>
@@ -30,8 +31,8 @@
#include <asm/processor.h>
#include <asm/ucontext.h>
-#include <asm/fpu/internal.h>
#include <asm/fpu/signal.h>
+#include <asm/fpu/xstate.h>
#include <asm/vdso.h>
#include <asm/mce.h>
#include <asm/sighandling.h>
@@ -41,6 +42,7 @@
#include <linux/compat.h>
#include <asm/proto.h>
#include <asm/ia32_unistd.h>
+#include <asm/fpu/xstate.h>
#endif /* CONFIG_X86_64 */
#include <asm/syscall.h>
@@ -79,9 +81,9 @@ static void force_valid_ss(struct pt_regs *regs)
# define CONTEXT_COPY_SIZE sizeof(struct sigcontext)
#endif
-static int restore_sigcontext(struct pt_regs *regs,
- struct sigcontext __user *usc,
- unsigned long uc_flags)
+static bool restore_sigcontext(struct pt_regs *regs,
+ struct sigcontext __user *usc,
+ unsigned long uc_flags)
{
struct sigcontext sc;
@@ -89,7 +91,7 @@ static int restore_sigcontext(struct pt_regs *regs,
current->restart_block.fn = do_no_restart_syscall;
if (copy_from_user(&sc, usc, CONTEXT_COPY_SIZE))
- return -EFAULT;
+ return false;
#ifdef CONFIG_X86_32
set_user_gs(regs, sc.gs);
@@ -244,7 +246,6 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
unsigned long math_size = 0;
unsigned long sp = regs->sp;
unsigned long buf_fx = 0;
- int ret;
/* redzone */
if (IS_ENABLED(CONFIG_X86_64))
@@ -292,8 +293,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
}
/* save i387 and extended state */
- ret = copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size);
- if (ret < 0)
+ if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size))
return (void __user *)-1L;
return (void __user *)sp;
@@ -643,7 +643,7 @@ SYSCALL_DEFINE0(sigreturn)
* x86_32 has no uc_flags bits relevant to restore_sigcontext.
* Save a few cycles by skipping the __get_user.
*/
- if (restore_sigcontext(regs, &frame->sc, 0))
+ if (!restore_sigcontext(regs, &frame->sc, 0))
goto badframe;
return regs->ax;
@@ -671,7 +671,7 @@ SYSCALL_DEFINE0(rt_sigreturn)
set_current_blocked(&set);
- if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
+ if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
goto badframe;
if (restore_altstack(&frame->uc.uc_stack))
@@ -721,12 +721,15 @@ badframe:
/* max_frame_size tells userspace the worst case signal stack size. */
static unsigned long __ro_after_init max_frame_size;
+static unsigned int __ro_after_init fpu_default_state_size;
void __init init_sigframe_size(void)
{
+ fpu_default_state_size = fpu__get_fpstate_size();
+
max_frame_size = MAX_FRAME_SIGINFO_UCTXT_SIZE + MAX_FRAME_PADDING;
- max_frame_size += fpu__get_fpstate_size() + MAX_XSAVE_PADDING;
+ max_frame_size += fpu_default_state_size + MAX_XSAVE_PADDING;
/* Userspace expects an aligned size. */
max_frame_size = round_up(max_frame_size, FRAME_ALIGNMENT);
@@ -910,6 +913,62 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
force_sig(SIGSEGV);
}
+#ifdef CONFIG_DYNAMIC_SIGFRAME
+#ifdef CONFIG_STRICT_SIGALTSTACK_SIZE
+static bool strict_sigaltstack_size __ro_after_init = true;
+#else
+static bool strict_sigaltstack_size __ro_after_init = false;
+#endif
+
+static int __init strict_sas_size(char *arg)
+{
+ return kstrtobool(arg, &strict_sigaltstack_size);
+}
+__setup("strict_sas_size", strict_sas_size);
+
+/*
+ * MINSIGSTKSZ is 2048 and can't be changed despite the fact that AVX512
+ * exceeds that size already. As such programs might never use the
+ * sigaltstack they just continued to work. While always checking against
+ * the real size would be correct, this might be considered a regression.
+ *
+ * Therefore avoid the sanity check, unless enforced by kernel
+ * configuration or command line option.
+ *
+ * When dynamic FPU features are supported, the check is also enforced when
+ * the task has permissions to use dynamic features. Tasks which have no
+ * permission are checked against the size of the non-dynamic feature set
+ * if strict checking is enabled. This avoids forcing all tasks on the
+ * system to allocate large sigaltstacks even if they are never going
+ * to use a dynamic feature. As this is serialized via sighand::siglock
+ * any permission request for a dynamic feature either happened already
+ * or will see the newly install sigaltstack size in the permission checks.
+ */
+bool sigaltstack_size_valid(size_t ss_size)
+{
+ unsigned long fsize = max_frame_size - fpu_default_state_size;
+ u64 mask;
+
+ lockdep_assert_held(&current->sighand->siglock);
+
+ if (!fpu_state_size_dynamic() && !strict_sigaltstack_size)
+ return true;
+
+ fsize += current->group_leader->thread.fpu.perm.__user_state_size;
+ if (likely(ss_size > fsize))
+ return true;
+
+ if (strict_sigaltstack_size)
+ return ss_size > fsize;
+
+ mask = current->group_leader->thread.fpu.perm.__state_perm;
+ if (mask & XFEATURE_MASK_USER_DYNAMIC)
+ return ss_size > fsize;
+
+ return true;
+}
+#endif /* CONFIG_DYNAMIC_SIGFRAME */
+
#ifdef CONFIG_X86_X32_ABI
COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn)
{
@@ -929,7 +988,7 @@ COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn)
set_current_blocked(&set);
- if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
+ if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
goto badframe;
if (compat_restore_altstack(&frame->uc.uc_stack))
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 85f6e242b6b4..617012f4619f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -70,7 +70,7 @@
#include <asm/mwait.h>
#include <asm/apic.h>
#include <asm/io_apic.h>
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
#include <asm/setup.h>
#include <asm/uv/uv.h>
#include <linux/mc146818rtc.h>
@@ -101,6 +101,8 @@ EXPORT_PER_CPU_SYMBOL(cpu_die_map);
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map);
+
/* Per CPU bogomips and other parameters */
DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
EXPORT_PER_CPU_SYMBOL(cpu_info);
@@ -464,6 +466,21 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
return false;
}
+static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+ int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+ /* If the arch didn't set up l2c_id, fall back to SMT */
+ if (per_cpu(cpu_l2c_id, cpu1) == BAD_APICID)
+ return match_smt(c, o);
+
+ /* Do not match if L2 cache id does not match: */
+ if (per_cpu(cpu_l2c_id, cpu1) != per_cpu(cpu_l2c_id, cpu2))
+ return false;
+
+ return topology_sane(c, o, "l2c");
+}
+
/*
* Unlike the other levels, we do not enforce keeping a
* multicore group inside a NUMA node. If this happens, we will
@@ -523,7 +540,7 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
}
-#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
+#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_CLUSTER) || defined(CONFIG_SCHED_MC)
static inline int x86_sched_itmt_flags(void)
{
return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0;
@@ -541,15 +558,35 @@ static int x86_smt_flags(void)
return cpu_smt_flags() | x86_sched_itmt_flags();
}
#endif
+#ifdef CONFIG_SCHED_CLUSTER
+static int x86_cluster_flags(void)
+{
+ return cpu_cluster_flags() | x86_sched_itmt_flags();
+}
+#endif
#endif
static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
#endif
+#ifdef CONFIG_SCHED_CLUSTER
+ { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) },
+#endif
+#ifdef CONFIG_SCHED_MC
+ { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
+#endif
+ { NULL, },
+};
+
+static struct sched_domain_topology_level x86_hybrid_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+ { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
+#endif
#ifdef CONFIG_SCHED_MC
{ cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
#endif
+ { cpu_cpu_mask, SD_INIT_NAME(DIE) },
{ NULL, },
};
@@ -557,6 +594,9 @@ static struct sched_domain_topology_level x86_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
#endif
+#ifdef CONFIG_SCHED_CLUSTER
+ { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) },
+#endif
#ifdef CONFIG_SCHED_MC
{ cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
#endif
@@ -584,6 +624,7 @@ void set_cpu_sibling_map(int cpu)
if (!has_mp) {
cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu));
cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
+ cpumask_set_cpu(cpu, cpu_l2c_shared_mask(cpu));
cpumask_set_cpu(cpu, topology_core_cpumask(cpu));
cpumask_set_cpu(cpu, topology_die_cpumask(cpu));
c->booted_cores = 1;
@@ -602,6 +643,9 @@ void set_cpu_sibling_map(int cpu)
if ((i == cpu) || (has_mp && match_llc(c, o)))
link_mask(cpu_llc_shared_mask, cpu, i);
+ if ((i == cpu) || (has_mp && match_l2c(c, o)))
+ link_mask(cpu_l2c_shared_mask, cpu, i);
+
if ((i == cpu) || (has_mp && match_die(c, o)))
link_mask(topology_die_cpumask, cpu, i);
}
@@ -652,6 +696,11 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
return cpu_llc_shared_mask(cpu);
}
+const struct cpumask *cpu_clustergroup_mask(int cpu)
+{
+ return cpu_l2c_shared_mask(cpu);
+}
+
static void impress_friends(void)
{
int cpu;
@@ -1312,12 +1361,7 @@ static void __init smp_get_logical_apicid(void)
cpu0_logical_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
}
-/*
- * Prepare for SMP bootup.
- * @max_cpus: configured maximum number of CPUs, It is a legacy parameter
- * for common interface support.
- */
-void __init native_smp_prepare_cpus(unsigned int max_cpus)
+void __init smp_prepare_cpus_common(void)
{
unsigned int i;
@@ -1335,6 +1379,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL);
zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
+ zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL);
}
/*
@@ -1347,6 +1392,17 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
set_sched_topology(x86_topology);
set_cpu_sibling_map(0);
+}
+
+/*
+ * Prepare for SMP bootup.
+ * @max_cpus: configured maximum number of CPUs, It is a legacy parameter
+ * for common interface support.
+ */
+void __init native_smp_prepare_cpus(unsigned int max_cpus)
+{
+ smp_prepare_cpus_common();
+
init_freq_invariance(false, false);
smp_sanity_check();
@@ -1424,8 +1480,11 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
calculate_max_logical_packages();
+ /* XXX for now assume numa-in-package and hybrid don't overlap */
if (x86_has_numa_in_package)
set_sched_topology(x86_numa_in_package_topology);
+ if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
+ set_sched_topology(x86_hybrid_topology);
nmi_selftest();
impress_friends();
@@ -1564,7 +1623,10 @@ static void remove_siblinginfo(int cpu)
for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling));
+ for_each_cpu(sibling, cpu_l2c_shared_mask(cpu))
+ cpumask_clear_cpu(cpu, cpu_l2c_shared_mask(sibling));
cpumask_clear(cpu_llc_shared_mask(cpu));
+ cpumask_clear(cpu_l2c_shared_mask(cpu));
cpumask_clear(topology_sibling_cpumask(cpu));
cpumask_clear(topology_core_cpumask(cpu));
cpumask_clear(topology_die_cpumask(cpu));
@@ -2166,7 +2228,7 @@ DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
void arch_scale_freq_tick(void)
{
- u64 freq_scale = SCHED_CAPACITY_SCALE;
+ u64 freq_scale;
u64 aperf, mperf;
u64 acnt, mcnt;
diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index ea028e736831..531fb4cbb63f 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -17,6 +17,8 @@ enum insn_type {
*/
static const u8 xor5rax[] = { 0x66, 0x66, 0x48, 0x31, 0xc0 };
+static const u8 retinsn[] = { RET_INSN_OPCODE, 0xcc, 0xcc, 0xcc, 0xcc };
+
static void __ref __static_call_transform(void *insn, enum insn_type type, void *func)
{
const void *emulate = NULL;
@@ -42,8 +44,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, void
break;
case RET:
- code = text_gen_insn(RET_INSN_OPCODE, insn, func);
- size = RET_INSN_SIZE;
+ code = &retinsn;
break;
}
@@ -56,10 +57,15 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, void
text_poke_bp(insn, code, size, emulate);
}
-static void __static_call_validate(void *insn, bool tail)
+static void __static_call_validate(void *insn, bool tail, bool tramp)
{
u8 opcode = *(u8 *)insn;
+ if (tramp && memcmp(insn+5, "SCT", 3)) {
+ pr_err("trampoline signature fail");
+ BUG();
+ }
+
if (tail) {
if (opcode == JMP32_INSN_OPCODE ||
opcode == RET_INSN_OPCODE)
@@ -74,7 +80,8 @@ static void __static_call_validate(void *insn, bool tail)
/*
* If we ever trigger this, our text is corrupt, we'll probably not live long.
*/
- WARN_ONCE(1, "unexpected static_call insn opcode 0x%x at %pS\n", opcode, insn);
+ pr_err("unexpected static_call insn opcode 0x%x at %pS\n", opcode, insn);
+ BUG();
}
static inline enum insn_type __sc_insn(bool null, bool tail)
@@ -97,12 +104,12 @@ void arch_static_call_transform(void *site, void *tramp, void *func, bool tail)
mutex_lock(&text_mutex);
if (tramp) {
- __static_call_validate(tramp, true);
+ __static_call_validate(tramp, true, true);
__static_call_transform(tramp, __sc_insn(!func, true), func);
}
if (IS_ENABLED(CONFIG_HAVE_STATIC_CALL_INLINE) && site) {
- __static_call_validate(site, tail);
+ __static_call_validate(site, tail, false);
__static_call_transform(site, __sc_insn(!func, tail), func);
}
diff --git a/arch/x86/kernel/trace.c b/arch/x86/kernel/trace.c
index 6b73b6f92ad3..8322e8352777 100644
--- a/arch/x86/kernel/trace.c
+++ b/arch/x86/kernel/trace.c
@@ -231,4 +231,4 @@ void osnoise_arch_unregister(void)
unregister_trace_local_timer_exit(trace_intel_irq_exit, "local_timer");
unregister_trace_local_timer_entry(trace_intel_irq_entry, NULL);
}
-#endif /* CONFIG_OSNOISE_TRAECR && CONFIG_X86_LOCAL_APIC */
+#endif /* CONFIG_OSNOISE_TRACER && CONFIG_X86_LOCAL_APIC */
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a58800973aed..c9d566dcf89a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -48,7 +48,7 @@
#include <asm/ftrace.h>
#include <asm/traps.h>
#include <asm/desc.h>
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
#include <asm/cpu.h>
#include <asm/cpu_entry_area.h>
#include <asm/mce.h>
@@ -313,17 +313,19 @@ out:
}
#ifdef CONFIG_VMAP_STACK
-__visible void __noreturn handle_stack_overflow(const char *message,
- struct pt_regs *regs,
- unsigned long fault_address)
+__visible void __noreturn handle_stack_overflow(struct pt_regs *regs,
+ unsigned long fault_address,
+ struct stack_info *info)
{
- printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n",
- (void *)fault_address, current->stack,
- (char *)current->stack + THREAD_SIZE - 1);
- die(message, regs, 0);
+ const char *name = stack_type_name(info->type);
+
+ printk(KERN_EMERG "BUG: %s stack guard page was hit at %p (stack is %p..%p)\n",
+ name, (void *)fault_address, info->begin, info->end);
+
+ die("stack guard page", regs, 0);
/* Be absolutely certain we don't return. */
- panic("%s", message);
+ panic("%s stack guard hit", name);
}
#endif
@@ -353,6 +355,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
#ifdef CONFIG_VMAP_STACK
unsigned long address = read_cr2();
+ struct stack_info info;
#endif
#ifdef CONFIG_X86_ESPFIX64
@@ -455,10 +458,8 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
* stack even if the actual trigger for the double fault was
* something else.
*/
- if ((unsigned long)task_stack_page(tsk) - 1 - address < PAGE_SIZE) {
- handle_stack_overflow("kernel stack overflow (double-fault)",
- regs, address);
- }
+ if (get_stack_guard_info((void *)address, &info))
+ handle_stack_overflow(regs, address, &info);
#endif
pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
@@ -528,6 +529,36 @@ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs,
#define GPFSTR "general protection fault"
+static bool fixup_iopl_exception(struct pt_regs *regs)
+{
+ struct thread_struct *t = &current->thread;
+ unsigned char byte;
+ unsigned long ip;
+
+ if (!IS_ENABLED(CONFIG_X86_IOPL_IOPERM) || t->iopl_emul != 3)
+ return false;
+
+ if (insn_get_effective_ip(regs, &ip))
+ return false;
+
+ if (get_user(byte, (const char __user *)ip))
+ return false;
+
+ if (byte != 0xfa && byte != 0xfb)
+ return false;
+
+ if (!t->iopl_warn && printk_ratelimit()) {
+ pr_err("%s[%d] attempts to use CLI/STI, pretending it's a NOP, ip:%lx",
+ current->comm, task_pid_nr(current), ip);
+ print_vma_addr(KERN_CONT " in ", ip);
+ pr_cont("\n");
+ t->iopl_warn = 1;
+ }
+
+ regs->ip += 1;
+ return true;
+}
+
DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
{
char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR;
@@ -553,6 +584,9 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
tsk = current;
if (user_mode(regs)) {
+ if (fixup_iopl_exception(regs))
+ goto exit;
+
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_GP;
@@ -709,7 +743,7 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r
stack = (unsigned long *)sp;
if (!get_stack_info_noinstr(stack, current, &info) || info.type == STACK_TYPE_ENTRY ||
- info.type >= STACK_TYPE_EXCEPTION_LAST)
+ info.type > STACK_TYPE_EXCEPTION_LAST)
sp = __this_cpu_ist_top_va(VC2);
sync:
@@ -1108,10 +1142,48 @@ DEFINE_IDTENTRY(exc_spurious_interrupt_bug)
*/
}
+static bool handle_xfd_event(struct pt_regs *regs)
+{
+ u64 xfd_err;
+ int err;
+
+ if (!IS_ENABLED(CONFIG_X86_64) || !cpu_feature_enabled(X86_FEATURE_XFD))
+ return false;
+
+ rdmsrl(MSR_IA32_XFD_ERR, xfd_err);
+ if (!xfd_err)
+ return false;
+
+ wrmsrl(MSR_IA32_XFD_ERR, 0);
+
+ /* Die if that happens in kernel space */
+ if (WARN_ON(!user_mode(regs)))
+ return false;
+
+ local_irq_enable();
+
+ err = xfd_enable_feature(xfd_err);
+
+ switch (err) {
+ case -EPERM:
+ force_sig_fault(SIGILL, ILL_ILLOPC, error_get_trap_addr(regs));
+ break;
+ case -EFAULT:
+ force_sig(SIGSEGV);
+ break;
+ }
+
+ local_irq_disable();
+ return true;
+}
+
DEFINE_IDTENTRY(exc_device_not_available)
{
unsigned long cr0 = read_cr0();
+ if (handle_xfd_event(regs))
+ return;
+
#ifdef CONFIG_MATH_EMULATION
if (!boot_cpu_has(X86_FEATURE_FPU) && (cr0 & X86_CR0_EM)) {
struct math_emu_info info = { };
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 2e076a459a0c..a698196377be 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1180,6 +1180,12 @@ void mark_tsc_unstable(char *reason)
EXPORT_SYMBOL_GPL(mark_tsc_unstable);
+static void __init tsc_disable_clocksource_watchdog(void)
+{
+ clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+ clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+}
+
static void __init check_system_tsc_reliable(void)
{
#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
@@ -1196,6 +1202,23 @@ static void __init check_system_tsc_reliable(void)
#endif
if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
tsc_clocksource_reliable = 1;
+
+ /*
+ * Disable the clocksource watchdog when the system has:
+ * - TSC running at constant frequency
+ * - TSC which does not stop in C-States
+ * - the TSC_ADJUST register which allows to detect even minimal
+ * modifications
+ * - not more than two sockets. As the number of sockets cannot be
+ * evaluated at the early boot stage where this has to be
+ * invoked, check the number of online memory nodes as a
+ * fallback solution which is an reasonable estimate.
+ */
+ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
+ boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
+ boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
+ nr_online_nodes <= 2)
+ tsc_disable_clocksource_watchdog();
}
/*
@@ -1387,9 +1410,6 @@ static int __init init_tsc_clocksource(void)
if (tsc_unstable)
goto unreg;
- if (tsc_clocksource_reliable || no_tsc_watchdog)
- clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
-
if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;
@@ -1527,7 +1547,7 @@ void __init tsc_init(void)
}
if (tsc_clocksource_reliable || no_tsc_watchdog)
- clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+ tsc_disable_clocksource_watchdog();
clocksource_register_khz(&clocksource_tsc_early, tsc_khz);
detect_art();
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 50a4515fe0ad..9452dc9664b5 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -30,6 +30,7 @@ struct tsc_adjust {
};
static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust);
+static struct timer_list tsc_sync_check_timer;
/*
* TSC's on different sockets may be reset asynchronously.
@@ -77,6 +78,46 @@ void tsc_verify_tsc_adjust(bool resume)
}
}
+/*
+ * Normally the tsc_sync will be checked every time system enters idle
+ * state, but there is still caveat that a system won't enter idle,
+ * either because it's too busy or configured purposely to not enter
+ * idle.
+ *
+ * So setup a periodic timer (every 10 minutes) to make sure the check
+ * is always on.
+ */
+
+#define SYNC_CHECK_INTERVAL (HZ * 600)
+
+static void tsc_sync_check_timer_fn(struct timer_list *unused)
+{
+ int next_cpu;
+
+ tsc_verify_tsc_adjust(false);
+
+ /* Run the check for all onlined CPUs in turn */
+ next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
+ if (next_cpu >= nr_cpu_ids)
+ next_cpu = cpumask_first(cpu_online_mask);
+
+ tsc_sync_check_timer.expires += SYNC_CHECK_INTERVAL;
+ add_timer_on(&tsc_sync_check_timer, next_cpu);
+}
+
+static int __init start_sync_check_timer(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_TSC_ADJUST) || tsc_clocksource_reliable)
+ return 0;
+
+ timer_setup(&tsc_sync_check_timer, tsc_sync_check_timer_fn, 0);
+ tsc_sync_check_timer.expires = jiffies + SYNC_CHECK_INTERVAL;
+ add_timer(&tsc_sync_check_timer);
+
+ return 0;
+}
+late_initcall(start_sync_check_timer);
+
static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval,
unsigned int cpu, bool bootcpu)
{
diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c
index 576b47e7523d..5a4b21389b1d 100644
--- a/arch/x86/kernel/umip.c
+++ b/arch/x86/kernel/umip.c
@@ -92,8 +92,8 @@ static const char * const umip_insns[5] = {
#define umip_pr_err(regs, fmt, ...) \
umip_printk(regs, KERN_ERR, fmt, ##__VA_ARGS__)
-#define umip_pr_warn(regs, fmt, ...) \
- umip_printk(regs, KERN_WARNING, fmt, ##__VA_ARGS__)
+#define umip_pr_debug(regs, fmt, ...) \
+ umip_printk(regs, KERN_DEBUG, fmt, ##__VA_ARGS__)
/**
* umip_printk() - Print a rate-limited message
@@ -361,10 +361,10 @@ bool fixup_umip_exception(struct pt_regs *regs)
if (umip_inst < 0)
return false;
- umip_pr_warn(regs, "%s instruction cannot be used by applications.\n",
+ umip_pr_debug(regs, "%s instruction cannot be used by applications.\n",
umip_insns[umip_inst]);
- umip_pr_warn(regs, "For now, expensive software emulation returns the result.\n");
+ umip_pr_debug(regs, "For now, expensive software emulation returns the result.\n");
if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size,
user_64bit_mode(regs)))
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index d7c44b257f7f..8e1c50c86e5d 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -240,8 +240,7 @@ static bool update_stack_state(struct unwind_state *state,
else {
addr_p = unwind_get_return_address_ptr(state);
addr = READ_ONCE_TASK_STACK(state->task, *addr_p);
- state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx,
- addr, addr_p);
+ state->ip = unwind_recover_ret_addr(state, addr, addr_p);
}
/* Save the original stack pointer for unwind_dump(): */
diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c
index c49f10ffd8cd..884d68a6e714 100644
--- a/arch/x86/kernel/unwind_guess.c
+++ b/arch/x86/kernel/unwind_guess.c
@@ -15,8 +15,7 @@ unsigned long unwind_get_return_address(struct unwind_state *state)
addr = READ_ONCE_NOCHECK(*state->sp);
- return ftrace_graph_ret_addr(state->task, &state->graph_idx,
- addr, state->sp);
+ return unwind_recover_ret_addr(state, addr, state->sp);
}
EXPORT_SYMBOL_GPL(unwind_get_return_address);
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index a1202536fc57..2de3c8c5eba9 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -175,7 +175,7 @@ static struct orc_entry *orc_find(unsigned long ip)
}
/* vmlinux .init slow lookup: */
- if (init_kernel_text(ip))
+ if (is_kernel_inittext(ip))
return __orc_find(__start_orc_unwind_ip, __start_orc_unwind,
__stop_orc_unwind_ip - __start_orc_unwind_ip, ip);
@@ -534,9 +534,8 @@ bool unwind_next_frame(struct unwind_state *state)
if (!deref_stack_reg(state, ip_p, &state->ip))
goto err;
- state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx,
- state->ip, (void *)ip_p);
-
+ state->ip = unwind_recover_ret_addr(state, state->ip,
+ (unsigned long *)ip_p);
state->sp = sp;
state->regs = NULL;
state->prev_regs = NULL;
@@ -549,7 +548,18 @@ bool unwind_next_frame(struct unwind_state *state)
(void *)orig_ip);
goto err;
}
-
+ /*
+ * There is a small chance to interrupt at the entry of
+ * __kretprobe_trampoline() where the ORC info doesn't exist.
+ * That point is right after the RET to __kretprobe_trampoline()
+ * which was modified return address.
+ * At that point, the @addr_p of the unwind_recover_kretprobe()
+ * (this has to point the address of the stack entry storing
+ * the modified return address) must be "SP - (a stack entry)"
+ * because SP is incremented by the RET.
+ */
+ state->ip = unwind_recover_kretprobe(state, state->ip,
+ (unsigned long *)(state->sp - sizeof(long)));
state->regs = (struct pt_regs *)sp;
state->prev_regs = NULL;
state->full_regs = true;
@@ -562,6 +572,9 @@ bool unwind_next_frame(struct unwind_state *state)
(void *)orig_ip);
goto err;
}
+ /* See UNWIND_HINT_TYPE_REGS case comment. */
+ state->ip = unwind_recover_kretprobe(state, state->ip,
+ (unsigned long *)(state->sp - sizeof(long)));
if (state->full_regs)
state->prev_regs = state->regs;
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index 641f0fe1e5b4..1258a5872d12 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -132,9 +132,9 @@ SYM_FUNC_START_LOCAL(verify_cpu)
.Lverify_cpu_no_longmode:
popf # Restore caller passed flags
movl $1,%eax
- ret
+ RET
.Lverify_cpu_sse_ok:
popf # Restore caller passed flags
xorl %eax, %eax
- ret
+ RET
SYM_FUNC_END(verify_cpu)
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index e5a7a10a0164..c21bcd668284 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -106,10 +106,8 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
*/
local_irq_enable();
- if (!vm86 || !vm86->user_vm86) {
- pr_alert("no user_vm86: BAD\n");
- do_exit(SIGSEGV);
- }
+ BUG_ON(!vm86);
+
set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->veflags_mask);
user = vm86->user_vm86;
@@ -142,6 +140,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
user_access_end();
+exit_vm86:
preempt_disable();
tsk->thread.sp0 = vm86->saved_sp0;
tsk->thread.sysenter_cs = __KERNEL_CS;
@@ -161,7 +160,8 @@ Efault_end:
user_access_end();
Efault:
pr_alert("could not access userspace vm86 info\n");
- do_exit(SIGSEGV);
+ force_exit_sig(SIGSEGV);
+ goto exit_vm86;
}
static int do_vm86_irq_handling(int subfunction, int irqnumber);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index efd9e9ea17f2..27f830345b6f 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -137,7 +137,6 @@ SECTIONS
ALIGN_ENTRY_TEXT_END
SOFTIRQENTRY_TEXT
STATIC_CALL_TEXT
- *(.fixup)
*(.gnu.warning)
#ifdef CONFIG_RETPOLINE
@@ -272,6 +271,20 @@ SECTIONS
__parainstructions_end = .;
}
+#ifdef CONFIG_RETPOLINE
+ /*
+ * List of instructions that call/jmp/jcc to retpoline thunks
+ * __x86_indirect_thunk_*(). These instructions can be patched along
+ * with alternatives, after which the section can be freed.
+ */
+ . = ALIGN(8);
+ .retpoline_sites : AT(ADDR(.retpoline_sites) - LOAD_OFFSET) {
+ __retpoline_sites = .;
+ *(.retpoline_sites)
+ __retpoline_sites_end = .;
+ }
+#endif
+
/*
* struct alt_inst entries. From the header (alternative.h):
* "Alternative instructions for different CPU types or capabilities"
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 8b395821cb8d..7d20c1d34a3c 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -145,18 +145,6 @@ struct x86_platform_ops x86_platform __ro_after_init = {
EXPORT_SYMBOL_GPL(x86_platform);
-#if defined(CONFIG_PCI_MSI)
-struct x86_msi_ops x86_msi __ro_after_init = {
- .restore_msi_irqs = default_restore_msi_irqs,
-};
-
-/* MSI arch specific hooks */
-void arch_restore_msi_irqs(struct pci_dev *dev)
-{
- x86_msi.restore_msi_irqs(dev);
-}
-#endif
-
struct x86_apic_ops x86_apic_ops __ro_after_init = {
.io_apic_read = native_io_apic_read,
.restore = native_restore_boot_irq_mode,
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index ac69894eab88..2b1548da00eb 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -26,7 +26,9 @@ config KVM
select PREEMPT_NOTIFIERS
select MMU_NOTIFIER
select HAVE_KVM_IRQCHIP
+ select HAVE_KVM_PFNCACHE
select HAVE_KVM_IRQFD
+ select HAVE_KVM_DIRTY_RING
select IRQ_BYPASS_MANAGER
select HAVE_KVM_IRQ_BYPASS
select HAVE_KVM_IRQ_ROUTING
@@ -36,6 +38,7 @@ config KVM
select KVM_MMIO
select SCHED_INFO
select PERF_EVENTS
+ select GUEST_PERF_EVENTS
select HAVE_KVM_MSI
select HAVE_KVM_CPU_RELAX_INTERCEPT
select HAVE_KVM_NO_POLL
@@ -43,6 +46,7 @@ config KVM
select KVM_GENERIC_DIRTYLOG_READ_PROTECT
select KVM_VFIO
select SRCU
+ select INTERVAL_TREE
select HAVE_KVM_PM_NOTIFIER if PM
help
Support hosting fully virtualized guest machines using hardware
@@ -129,4 +133,7 @@ config KVM_MMU_AUDIT
This option adds a R/W kVM module parameter 'mmu_audit', which allows
auditing of KVM MMU events at runtime.
+config KVM_EXTERNAL_WRITE_TRACKING
+ bool
+
endif # VIRTUALIZATION
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 75dfd27b6e8a..30f244b64523 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,12 +7,7 @@ ifeq ($(CONFIG_FRAME_POINTER),y)
OBJECT_FILES_NON_STANDARD_vmenter.o := y
endif
-KVM := ../../../virt/kvm
-
-kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
- $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o \
- $(KVM)/dirty_ring.o $(KVM)/binary_stats.o
-kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
+include $(srctree)/virt/kvm/Makefile.kvm
kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \
i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 751aa85a3001..c55e57b30e81 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -32,7 +32,7 @@
u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
EXPORT_SYMBOL_GPL(kvm_cpu_caps);
-static u32 xstate_required_size(u64 xstate_bv, bool compacted)
+u32 xstate_required_size(u64 xstate_bv, bool compacted)
{
int feature_bit = 0;
u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
@@ -42,7 +42,11 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted)
if (xstate_bv & 0x1) {
u32 eax, ebx, ecx, edx, offset;
cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx);
- offset = compacted ? ret : ebx;
+ /* ECX[1]: 64B alignment in compacted form */
+ if (compacted)
+ offset = (ecx & 0x2) ? ALIGN(ret, 64) : ret;
+ else
+ offset = ebx;
ret = max(ret, offset + eax);
}
@@ -53,9 +57,16 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted)
return ret;
}
+/*
+ * This one is tied to SSB in the user API, and not
+ * visible in /proc/cpuinfo.
+ */
+#define KVM_X86_FEATURE_PSFD (13*32+28) /* Predictive Store Forwarding Disable */
+
#define F feature_bit
#define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0)
+
static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index)
{
@@ -73,9 +84,12 @@ static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
return NULL;
}
-static int kvm_check_cpuid(struct kvm_cpuid_entry2 *entries, int nent)
+static int kvm_check_cpuid(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid_entry2 *entries,
+ int nent)
{
struct kvm_cpuid_entry2 *best;
+ u64 xfeatures;
/*
* The existing code assumes virtual address is 48-bit or 57-bit in the
@@ -89,14 +103,61 @@ static int kvm_check_cpuid(struct kvm_cpuid_entry2 *entries, int nent)
return -EINVAL;
}
- return 0;
+ /*
+ * Exposing dynamic xfeatures to the guest requires additional
+ * enabling in the FPU, e.g. to expand the guest XSAVE state size.
+ */
+ best = cpuid_entry2_find(entries, nent, 0xd, 0);
+ if (!best)
+ return 0;
+
+ xfeatures = best->eax | ((u64)best->edx << 32);
+ xfeatures &= XFEATURE_MASK_USER_DYNAMIC;
+ if (!xfeatures)
+ return 0;
+
+ return fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, xfeatures);
}
-void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
+static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu)
{
- struct kvm_cpuid_entry2 *best;
+ u32 function;
+ struct kvm_cpuid_entry2 *entry;
+
+ vcpu->arch.kvm_cpuid_base = 0;
+
+ for_each_possible_hypervisor_cpuid_base(function) {
+ entry = kvm_find_cpuid_entry(vcpu, function, 0);
+
+ if (entry) {
+ u32 signature[3];
+
+ signature[0] = entry->ebx;
+ signature[1] = entry->ecx;
+ signature[2] = entry->edx;
+
+ BUILD_BUG_ON(sizeof(signature) > sizeof(KVM_SIGNATURE));
+ if (!memcmp(signature, KVM_SIGNATURE, sizeof(signature))) {
+ vcpu->arch.kvm_cpuid_base = function;
+ break;
+ }
+ }
+ }
+}
- best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0);
+static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu)
+{
+ u32 base = vcpu->arch.kvm_cpuid_base;
+
+ if (!base)
+ return NULL;
+
+ return kvm_find_cpuid_entry(vcpu, base | KVM_CPUID_FEATURES, 0);
+}
+
+void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best = kvm_find_kvm_cpuid_features(vcpu);
/*
* save the feature bitmap to avoid cpuid lookup for every PV
@@ -135,7 +196,7 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
cpuid_entry_has(best, X86_FEATURE_XSAVEC)))
best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
- best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0);
+ best = kvm_find_kvm_cpuid_features(vcpu);
if (kvm_hlt_in_guest(vcpu->kvm) && best &&
(best->eax & (1 << KVM_FEATURE_PV_UNHALT)))
best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);
@@ -232,6 +293,26 @@ u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu)
return rsvd_bits(cpuid_maxphyaddr(vcpu), 63);
}
+static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
+ int nent)
+{
+ int r;
+
+ r = kvm_check_cpuid(vcpu, e2, nent);
+ if (r)
+ return r;
+
+ kvfree(vcpu->arch.cpuid_entries);
+ vcpu->arch.cpuid_entries = e2;
+ vcpu->arch.cpuid_nent = nent;
+
+ kvm_update_kvm_cpuid_base(vcpu);
+ kvm_update_cpuid_runtime(vcpu);
+ kvm_vcpu_after_set_cpuid(vcpu);
+
+ return 0;
+}
+
/* when an old userspace process fills a new kernel module */
int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
struct kvm_cpuid *cpuid,
@@ -268,18 +349,9 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
e2[i].padding[2] = 0;
}
- r = kvm_check_cpuid(e2, cpuid->nent);
- if (r) {
+ r = kvm_set_cpuid(vcpu, e2, cpuid->nent);
+ if (r)
kvfree(e2);
- goto out_free_cpuid;
- }
-
- kvfree(vcpu->arch.cpuid_entries);
- vcpu->arch.cpuid_entries = e2;
- vcpu->arch.cpuid_nent = cpuid->nent;
-
- kvm_update_cpuid_runtime(vcpu);
- kvm_vcpu_after_set_cpuid(vcpu);
out_free_cpuid:
kvfree(e);
@@ -303,20 +375,11 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
return PTR_ERR(e2);
}
- r = kvm_check_cpuid(e2, cpuid->nent);
- if (r) {
+ r = kvm_set_cpuid(vcpu, e2, cpuid->nent);
+ if (r)
kvfree(e2);
- return r;
- }
- kvfree(vcpu->arch.cpuid_entries);
- vcpu->arch.cpuid_entries = e2;
- vcpu->arch.cpuid_nent = cpuid->nent;
-
- kvm_update_cpuid_runtime(vcpu);
- kvm_vcpu_after_set_cpuid(vcpu);
-
- return 0;
+ return r;
}
int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
@@ -379,9 +442,11 @@ void kvm_set_cpu_caps(void)
#ifdef CONFIG_X86_64
unsigned int f_gbpages = F(GBPAGES);
unsigned int f_lm = F(LM);
+ unsigned int f_xfd = F(XFD);
#else
unsigned int f_gbpages = 0;
unsigned int f_lm = 0;
+ unsigned int f_xfd = 0;
#endif
memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps));
@@ -449,7 +514,8 @@ void kvm_set_cpu_caps(void)
F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) |
- F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16)
+ F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) |
+ F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16)
);
/* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */
@@ -468,7 +534,7 @@ void kvm_set_cpu_caps(void)
);
kvm_cpu_cap_mask(CPUID_D_1_EAX,
- F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES)
+ F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) | f_xfd
);
kvm_cpu_cap_init_scattered(CPUID_12_EAX,
@@ -480,7 +546,7 @@ void kvm_set_cpu_caps(void)
F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) |
- F(TOPOEXT) | F(PERFCTR_CORE)
+ F(TOPOEXT) | 0 /* PERFCTR_CORE */
);
kvm_cpu_cap_mask(CPUID_8000_0001_EDX,
@@ -500,7 +566,8 @@ void kvm_set_cpu_caps(void)
kvm_cpu_cap_mask(CPUID_8000_0008_EBX,
F(CLZERO) | F(XSAVEERPTR) |
F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
- F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON)
+ F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) |
+ __feature_bit(KVM_X86_FEATURE_PSFD)
);
/*
@@ -593,6 +660,8 @@ static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array,
case 0x14:
case 0x17:
case 0x18:
+ case 0x1d:
+ case 0x1e:
case 0x1f:
case 0x8000001d:
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
@@ -767,11 +836,13 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
goto out;
}
break;
- case 0xd:
- entry->eax &= supported_xcr0;
+ case 0xd: {
+ u64 guest_perm = xstate_get_guest_group_perm();
+
+ entry->eax &= supported_xcr0 & guest_perm;
entry->ebx = xstate_required_size(supported_xcr0, false);
entry->ecx = entry->ebx;
- entry->edx &= supported_xcr0 >> 32;
+ entry->edx &= (supported_xcr0 & guest_perm) >> 32;
if (!supported_xcr0)
break;
@@ -818,6 +889,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->edx = 0;
}
break;
+ }
case 0x12:
/* Intel SGX */
if (!kvm_cpu_cap_has(X86_FEATURE_SGX)) {
@@ -862,9 +934,26 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
goto out;
}
break;
+ /* Intel AMX TILE */
+ case 0x1d:
+ if (!kvm_cpu_cap_has(X86_FEATURE_AMX_TILE)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
+ for (i = 1, max_idx = entry->eax; i <= max_idx; ++i) {
+ if (!do_host_cpuid(array, function, i))
+ goto out;
+ }
+ break;
+ case 0x1e: /* TMUL information */
+ if (!kvm_cpu_cap_has(X86_FEATURE_AMX_TILE)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+ break;
case KVM_CPUID_SIGNATURE: {
- static const char signature[12] = "KVMKVMKVM\0\0";
- const u32 *sigptr = (const u32 *)signature;
+ const u32 *sigptr = (const u32 *)KVM_SIGNATURE;
entry->eax = KVM_CPUID_FEATURES;
entry->ebx = sigptr[0];
entry->ecx = sigptr[1];
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index c99edfff7f82..8a770b481d9d 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -30,6 +30,8 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
u32 *ecx, u32 *edx, bool exact_only);
+u32 xstate_required_size(u64 xstate_bv, bool compacted);
+
int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
index 54a83a744538..9240b3b7f8dd 100644
--- a/arch/x86/kvm/debugfs.c
+++ b/arch/x86/kvm/debugfs.c
@@ -95,6 +95,9 @@ static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v)
unsigned int *log[KVM_NR_PAGE_SIZES], *cur;
int i, j, k, l, ret;
+ if (!kvm_memslots_have_rmaps(kvm))
+ return 0;
+
ret = -ENOMEM;
memset(log, 0, sizeof(log));
for (i = 0; i < KVM_NR_PAGE_SIZES; i++) {
@@ -107,9 +110,10 @@ static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v)
write_lock(&kvm->mmu_lock);
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ int bkt;
+
slots = __kvm_memslots(kvm, i);
- for (j = 0; j < slots->used_slots; j++) {
- slot = &slots->memslots[j];
+ kvm_for_each_memslot(slot, bkt, slots)
for (k = 0; k < KVM_NR_PAGE_SIZES; k++) {
rmap = slot->arch.rmap[k];
lpage_size = kvm_mmu_slot_lpages(slot, k + 1);
@@ -121,7 +125,6 @@ static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v)
cur[index]++;
}
}
- }
}
write_unlock(&kvm->mmu_lock);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 9a144ca8e146..5719d8cfdbd9 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -175,6 +175,7 @@
#define No16 ((u64)1 << 53) /* No 16 bit operand */
#define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */
#define TwoMemOp ((u64)1 << 55) /* Instruction has two memory operand */
+#define IsBranch ((u64)1 << 56) /* Instruction is considered a branch. */
#define DstXacc (DstAccLo | SrcAccHi | SrcWrite)
@@ -191,8 +192,9 @@
#define FASTOP_SIZE 8
struct opcode {
- u64 flags : 56;
- u64 intercept : 8;
+ u64 flags;
+ u8 intercept;
+ u8 pad[7];
union {
int (*execute)(struct x86_emulate_ctxt *ctxt);
const struct opcode *group;
@@ -315,7 +317,7 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop);
__FOP_FUNC(#name)
#define __FOP_RET(name) \
- "ret \n\t" \
+ "11: " ASM_RET \
".size " name ", .-" name "\n\t"
#define FOP_RET(name) \
@@ -344,7 +346,7 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop);
__FOP_RET(#op "_" #dst)
#define FOP1EEX(op, dst) \
- FOP1E(op, dst) _ASM_EXTABLE(10b, kvm_fastop_exception)
+ FOP1E(op, dst) _ASM_EXTABLE_TYPE_REG(10b, 11b, EX_TYPE_ZERO_REG, %%esi)
#define FASTOP1(op) \
FOP_START(op) \
@@ -434,10 +436,6 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop);
#op " %al \n\t" \
__FOP_RET(#op)
-asm(".pushsection .fixup, \"ax\"\n"
- "kvm_fastop_exception: xor %esi, %esi; ret\n"
- ".popsection");
-
FOP_START(setcc)
FOP_SETCC(seto)
FOP_SETCC(setno)
@@ -473,12 +471,8 @@ FOP_END;
\
asm volatile("1:" insn "\n" \
"2:\n" \
- ".pushsection .fixup, \"ax\"\n" \
- "3: movl $1, %[_fault]\n" \
- " jmp 2b\n" \
- ".popsection\n" \
- _ASM_EXTABLE(1b, 3b) \
- : [_fault] "+qm"(_fault) inoutclob ); \
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %[_fault]) \
+ : [_fault] "+r"(_fault) inoutclob ); \
\
_fault ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; \
})
@@ -4222,6 +4216,11 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
if (enable_vmware_backdoor && is_vmware_backdoor_pmc(rcx))
return X86EMUL_CONTINUE;
+ /*
+ * If CR4.PCE is set, the SDM requires CPL=0 or CR0.PE=0. The CR0.PE
+ * check however is unnecessary because CPL is always 0 outside
+ * protected mode.
+ */
if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
ctxt->ops->check_pmc(ctxt, rcx))
return emulate_gp(ctxt, 0);
@@ -4359,10 +4358,10 @@ static const struct opcode group4[] = {
static const struct opcode group5[] = {
F(DstMem | SrcNone | Lock, em_inc),
F(DstMem | SrcNone | Lock, em_dec),
- I(SrcMem | NearBranch, em_call_near_abs),
- I(SrcMemFAddr | ImplicitOps, em_call_far),
- I(SrcMem | NearBranch, em_jmp_abs),
- I(SrcMemFAddr | ImplicitOps, em_jmp_far),
+ I(SrcMem | NearBranch | IsBranch, em_call_near_abs),
+ I(SrcMemFAddr | ImplicitOps | IsBranch, em_call_far),
+ I(SrcMem | NearBranch | IsBranch, em_jmp_abs),
+ I(SrcMemFAddr | ImplicitOps | IsBranch, em_jmp_far),
I(SrcMem | Stack | TwoMemOp, em_push), D(Undefined),
};
@@ -4572,7 +4571,7 @@ static const struct opcode opcode_table[256] = {
I2bvIP(DstDI | SrcDX | Mov | String | Unaligned, em_in, ins, check_perm_in), /* insb, insw/insd */
I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */
/* 0x70 - 0x7F */
- X16(D(SrcImmByte | NearBranch)),
+ X16(D(SrcImmByte | NearBranch | IsBranch)),
/* 0x80 - 0x87 */
G(ByteOp | DstMem | SrcImm, group1),
G(DstMem | SrcImm, group1),
@@ -4591,7 +4590,7 @@ static const struct opcode opcode_table[256] = {
DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)),
/* 0x98 - 0x9F */
D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
- I(SrcImmFAddr | No64, em_call_far), N,
+ I(SrcImmFAddr | No64 | IsBranch, em_call_far), N,
II(ImplicitOps | Stack, em_pushf, pushf),
II(ImplicitOps | Stack, em_popf, popf),
I(ImplicitOps, em_sahf), I(ImplicitOps, em_lahf),
@@ -4611,17 +4610,19 @@ static const struct opcode opcode_table[256] = {
X8(I(DstReg | SrcImm64 | Mov, em_mov)),
/* 0xC0 - 0xC7 */
G(ByteOp | Src2ImmByte, group2), G(Src2ImmByte, group2),
- I(ImplicitOps | NearBranch | SrcImmU16, em_ret_near_imm),
- I(ImplicitOps | NearBranch, em_ret),
+ I(ImplicitOps | NearBranch | SrcImmU16 | IsBranch, em_ret_near_imm),
+ I(ImplicitOps | NearBranch | IsBranch, em_ret),
I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg),
I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg),
G(ByteOp, group11), G(0, group11),
/* 0xC8 - 0xCF */
- I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave),
- I(ImplicitOps | SrcImmU16, em_ret_far_imm),
- I(ImplicitOps, em_ret_far),
- D(ImplicitOps), DI(SrcImmByte, intn),
- D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret),
+ I(Stack | SrcImmU16 | Src2ImmByte | IsBranch, em_enter),
+ I(Stack | IsBranch, em_leave),
+ I(ImplicitOps | SrcImmU16 | IsBranch, em_ret_far_imm),
+ I(ImplicitOps | IsBranch, em_ret_far),
+ D(ImplicitOps | IsBranch), DI(SrcImmByte | IsBranch, intn),
+ D(ImplicitOps | No64 | IsBranch),
+ II(ImplicitOps | IsBranch, em_iret, iret),
/* 0xD0 - 0xD7 */
G(Src2One | ByteOp, group2), G(Src2One, group2),
G(Src2CL | ByteOp, group2), G(Src2CL, group2),
@@ -4632,14 +4633,15 @@ static const struct opcode opcode_table[256] = {
/* 0xD8 - 0xDF */
N, E(0, &escape_d9), N, E(0, &escape_db), N, E(0, &escape_dd), N, N,
/* 0xE0 - 0xE7 */
- X3(I(SrcImmByte | NearBranch, em_loop)),
- I(SrcImmByte | NearBranch, em_jcxz),
+ X3(I(SrcImmByte | NearBranch | IsBranch, em_loop)),
+ I(SrcImmByte | NearBranch | IsBranch, em_jcxz),
I2bvIP(SrcImmUByte | DstAcc, em_in, in, check_perm_in),
I2bvIP(SrcAcc | DstImmUByte, em_out, out, check_perm_out),
/* 0xE8 - 0xEF */
- I(SrcImm | NearBranch, em_call), D(SrcImm | ImplicitOps | NearBranch),
- I(SrcImmFAddr | No64, em_jmp_far),
- D(SrcImmByte | ImplicitOps | NearBranch),
+ I(SrcImm | NearBranch | IsBranch, em_call),
+ D(SrcImm | ImplicitOps | NearBranch | IsBranch),
+ I(SrcImmFAddr | No64 | IsBranch, em_jmp_far),
+ D(SrcImmByte | ImplicitOps | NearBranch | IsBranch),
I2bvIP(SrcDX | DstAcc, em_in, in, check_perm_in),
I2bvIP(SrcAcc | DstDX, em_out, out, check_perm_out),
/* 0xF0 - 0xF7 */
@@ -4655,7 +4657,7 @@ static const struct opcode opcode_table[256] = {
static const struct opcode twobyte_table[256] = {
/* 0x00 - 0x0F */
G(0, group6), GD(0, &group7), N, N,
- N, I(ImplicitOps | EmulateOnUD, em_syscall),
+ N, I(ImplicitOps | EmulateOnUD | IsBranch, em_syscall),
II(ImplicitOps | Priv, em_clts, clts), N,
DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N,
N, D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N,
@@ -4686,8 +4688,8 @@ static const struct opcode twobyte_table[256] = {
IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
II(ImplicitOps | Priv, em_rdmsr, rdmsr),
IIP(ImplicitOps, em_rdpmc, rdpmc, check_rdpmc),
- I(ImplicitOps | EmulateOnUD, em_sysenter),
- I(ImplicitOps | Priv | EmulateOnUD, em_sysexit),
+ I(ImplicitOps | EmulateOnUD | IsBranch, em_sysenter),
+ I(ImplicitOps | Priv | EmulateOnUD | IsBranch, em_sysexit),
N, N,
N, N, N, N, N, N, N, N,
/* 0x40 - 0x4F */
@@ -4705,7 +4707,7 @@ static const struct opcode twobyte_table[256] = {
N, N, N, N,
N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_6f_0f_7f),
/* 0x80 - 0x8F */
- X16(D(SrcImm | NearBranch)),
+ X16(D(SrcImm | NearBranch | IsBranch)),
/* 0x90 - 0x9F */
X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
/* 0xA0 - 0xA7 */
@@ -5219,6 +5221,8 @@ done_prefixes:
ctxt->d |= opcode.flags;
}
+ ctxt->is_branch = opcode.flags & IsBranch;
+
/* Unrecognised? */
if (ctxt->d == 0)
return EMULATION_FAILED;
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index d5124b520f76..6e38a7d22e97 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -112,7 +112,7 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
if (!!auto_eoi_old == !!auto_eoi_new)
return;
- mutex_lock(&vcpu->kvm->arch.apicv_update_lock);
+ down_write(&vcpu->kvm->arch.apicv_update_lock);
if (auto_eoi_new)
hv->synic_auto_eoi_used++;
@@ -123,7 +123,7 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
!hv->synic_auto_eoi_used,
APICV_INHIBIT_REASON_HYPERV);
- mutex_unlock(&vcpu->kvm->arch.apicv_update_lock);
+ up_write(&vcpu->kvm->arch.apicv_update_lock);
}
static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
@@ -164,7 +164,7 @@ static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx)
{
struct kvm_vcpu *vcpu = NULL;
- int i;
+ unsigned long i;
if (vpidx >= KVM_MAX_VCPUS)
return NULL;
@@ -1472,7 +1472,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
if (!(data & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) {
hv_vcpu->hv_vapic = data;
- if (kvm_lapic_enable_pv_eoi(vcpu, 0, 0))
+ if (kvm_lapic_set_pv_eoi(vcpu, 0, 0))
return 1;
break;
}
@@ -1490,7 +1490,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
return 1;
hv_vcpu->hv_vapic = data;
kvm_vcpu_mark_page_dirty(vcpu, gfn);
- if (kvm_lapic_enable_pv_eoi(vcpu,
+ if (kvm_lapic_set_pv_eoi(vcpu,
gfn_to_gpa(gfn) | KVM_MSR_ENABLED,
sizeof(struct hv_vp_assist_page)))
return 1;
@@ -1716,7 +1716,8 @@ static __always_inline unsigned long *sparse_set_to_vcpu_mask(
{
struct kvm_hv *hv = to_kvm_hv(kvm);
struct kvm_vcpu *vcpu;
- int i, bank, sbank = 0;
+ int bank, sbank = 0;
+ unsigned long i;
memset(vp_bitmap, 0,
KVM_HV_MAX_SPARSE_VCPU_SET_BITS * sizeof(*vp_bitmap));
@@ -1754,7 +1755,6 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool
int i;
gpa_t gpa;
struct kvm *kvm = vcpu->kvm;
- struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
struct hv_tlb_flush_ex flush_ex;
struct hv_tlb_flush flush;
u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
@@ -1836,18 +1836,19 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool
}
}
- cpumask_clear(&hv_vcpu->tlb_flush);
-
- vcpu_mask = all_cpus ? NULL :
- sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask,
- vp_bitmap, vcpu_bitmap);
-
/*
* vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't
* analyze it here, flush TLB regardless of the specified address space.
*/
- kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST,
- NULL, vcpu_mask, &hv_vcpu->tlb_flush);
+ if (all_cpus) {
+ kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH_GUEST);
+ } else {
+ vcpu_mask = sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask,
+ vp_bitmap, vcpu_bitmap);
+
+ kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST,
+ vcpu_mask);
+ }
ret_success:
/* We always do full TLB flush, set 'Reps completed' = 'Rep Count' */
@@ -1863,7 +1864,7 @@ static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector,
.vector = vector
};
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
kvm_for_each_vcpu(i, vcpu, kvm) {
if (vcpu_bitmap && !test_bit(i, vcpu_bitmap))
@@ -1922,11 +1923,13 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool
all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL;
+ if (all_cpus)
+ goto check_and_send_ipi;
+
if (!sparse_banks_len)
goto ret_success;
- if (!all_cpus &&
- kvm_read_guest(kvm,
+ if (kvm_read_guest(kvm,
hc->ingpa + offsetof(struct hv_send_ipi_ex,
vp_set.bank_contents),
sparse_banks,
@@ -1934,6 +1937,7 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool
return HV_STATUS_INVALID_HYPERCALL_INPUT;
}
+check_and_send_ipi:
if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
@@ -2022,7 +2026,7 @@ static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
{
bool longmode;
- longmode = is_64_bit_mode(vcpu);
+ longmode = is_64_bit_hypercall(vcpu);
if (longmode)
kvm_rax_write(vcpu, result);
else {
@@ -2171,7 +2175,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
}
#ifdef CONFIG_X86_64
- if (is_64_bit_mode(vcpu)) {
+ if (is_64_bit_hypercall(vcpu)) {
hc.param = kvm_rcx_read(vcpu);
hc.ingpa = kvm_rdx_read(vcpu);
hc.outgpa = kvm_r8_read(vcpu);
@@ -2516,6 +2520,8 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
case HYPERV_CPUID_NESTED_FEATURES:
ent->eax = evmcs_ver;
+ if (evmcs_ver)
+ ent->eax |= HV_X64_NESTED_MSR_BITMAP;
break;
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 5a69cce4d72d..0b65a764ed3a 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -242,7 +242,7 @@ static void pit_do_work(struct kthread_work *work)
struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
struct kvm *kvm = pit->kvm;
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
struct kvm_kpit_state *ps = &pit->pit_state;
if (atomic_read(&ps->reinject) && !atomic_xchg(&ps->irq_ack, 0))
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 0b80263d46d8..814064d06016 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -50,7 +50,7 @@ static void pic_unlock(struct kvm_pic *s)
{
bool wakeup = s->wakeup_needed;
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
s->wakeup_needed = false;
@@ -270,7 +270,8 @@ int kvm_pic_read_irq(struct kvm *kvm)
static void kvm_pic_reset(struct kvm_kpic_state *s)
{
- int irq, i;
+ int irq;
+ unsigned long i;
struct kvm_vcpu *vcpu;
u8 edge_irr = s->irr & ~s->elcr;
bool found = false;
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 8c065da73f8e..decfa36b7891 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -96,7 +96,7 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic)
{
ioapic->rtc_status.pending_eoi = 0;
- bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_ID + 1);
+ bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_IDS);
}
static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic);
@@ -149,7 +149,7 @@ void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)
static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic)
{
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
if (RTC_GSI >= IOAPIC_NUM_PINS)
return;
@@ -184,7 +184,7 @@ static bool rtc_irq_check_coalesced(struct kvm_ioapic *ioapic)
static void ioapic_lazy_update_eoi(struct kvm_ioapic *ioapic, int irq)
{
- int i;
+ unsigned long i;
struct kvm_vcpu *vcpu;
union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq];
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index bbd4a5d18b5d..539333ac4b38 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -39,13 +39,13 @@ struct kvm_vcpu;
struct dest_map {
/* vcpu bitmap where IRQ has been sent */
- DECLARE_BITMAP(map, KVM_MAX_VCPU_ID + 1);
+ DECLARE_BITMAP(map, KVM_MAX_VCPU_IDS);
/*
* Vector sent to a given vcpu, only valid when
* the vcpu's bit in map is set
*/
- u8 vectors[KVM_MAX_VCPU_ID + 1];
+ u8 vectors[KVM_MAX_VCPU_IDS];
};
@@ -81,7 +81,6 @@ struct kvm_ioapic {
unsigned long irq_states[IOAPIC_NUM_PINS];
struct kvm_io_device dev;
struct kvm *kvm;
- void (*ack_notifier)(void *opaque, int irq);
spinlock_t lock;
struct rtc_status rtc_status;
struct delayed_work eoi_inject;
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 650642b18d15..c2d7cfe82d00 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -56,7 +56,6 @@ struct kvm_pic {
struct kvm_io_device dev_master;
struct kvm_io_device dev_slave;
struct kvm_io_device dev_elcr;
- void (*ack_notifier)(void *opaque, int irq);
unsigned long irq_states[PIC_NUM_PINS];
};
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index d5b72a08e566..6e0dab04320e 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -24,6 +24,7 @@
#include "hyperv.h"
#include "x86.h"
+#include "xen.h"
static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level,
@@ -45,9 +46,9 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq, struct dest_map *dest_map)
{
- int i, r = -1;
+ int r = -1;
struct kvm_vcpu *vcpu, *lowest = NULL;
- unsigned long dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
+ unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
unsigned int dest_vcpus = 0;
if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
@@ -175,6 +176,13 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
return r;
break;
+#ifdef CONFIG_KVM_XEN
+ case KVM_IRQ_ROUTING_XEN_EVTCHN:
+ if (!level)
+ return -1;
+
+ return kvm_xen_set_evtchn_fast(e, kvm);
+#endif
default:
break;
}
@@ -310,6 +318,10 @@ int kvm_set_routing_entry(struct kvm *kvm,
e->hv_sint.vcpu = ue->u.hv_sint.vcpu;
e->hv_sint.sint = ue->u.hv_sint.sint;
break;
+#ifdef CONFIG_KVM_XEN
+ case KVM_IRQ_ROUTING_XEN_EVTCHN:
+ return kvm_xen_setup_evtchn(kvm, e, ue);
+#endif
default:
return -EINVAL;
}
@@ -320,7 +332,8 @@ int kvm_set_routing_entry(struct kvm *kvm,
bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_vcpu **dest_vcpu)
{
- int i, r = 0;
+ int r = 0;
+ unsigned long i;
struct kvm_vcpu *vcpu;
if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu))
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 90e1ffdc05b7..3febc342360c 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -9,6 +9,12 @@
(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
| X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE)
+#define X86_CR0_PDPTR_BITS (X86_CR0_CD | X86_CR0_NW | X86_CR0_PG)
+#define X86_CR4_TLBFLUSH_BITS (X86_CR4_PGE | X86_CR4_PCIDE | X86_CR4_PAE | X86_CR4_SMEP)
+#define X86_CR4_PDPTR_BITS (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_SMEP)
+
+static_assert(!(KVM_POSSIBLE_CR0_GUEST_BITS & X86_CR0_PDPTR_BITS));
+
#define BUILD_KVM_GPR_ACCESSORS(lname, uname) \
static __always_inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)\
{ \
@@ -37,6 +43,13 @@ BUILD_KVM_GPR_ACCESSORS(r14, R14)
BUILD_KVM_GPR_ACCESSORS(r15, R15)
#endif
+/*
+ * avail dirty
+ * 0 0 register in VMCS/VMCB
+ * 0 1 *INVALID*
+ * 1 0 register in vcpu->arch
+ * 1 1 register in vcpu->arch, needs to be stored back
+ */
static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu,
enum kvm_reg reg)
{
@@ -55,13 +68,6 @@ static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu,
__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
}
-static inline void kvm_register_clear_available(struct kvm_vcpu *vcpu,
- enum kvm_reg reg)
-{
- __clear_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
- __clear_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
-}
-
static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
enum kvm_reg reg)
{
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 68b420289d7e..39eded2426ff 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -369,6 +369,7 @@ struct x86_emulate_ctxt {
struct fetch_cache fetch;
struct read_cache io_read;
struct read_cache mem_read;
+ bool is_branch;
};
/* Repeat String Operation Prefix */
diff --git a/arch/x86/kvm/kvm_onhyperv.c b/arch/x86/kvm/kvm_onhyperv.c
index c7db2df50a7a..b469f45e3fe4 100644
--- a/arch/x86/kvm/kvm_onhyperv.c
+++ b/arch/x86/kvm/kvm_onhyperv.c
@@ -33,7 +33,8 @@ int hv_remote_flush_tlb_with_range(struct kvm *kvm,
{
struct kvm_arch *kvm_arch = &kvm->arch;
struct kvm_vcpu *vcpu;
- int ret = 0, i, nr_unique_valid_roots;
+ int ret = 0, nr_unique_valid_roots;
+ unsigned long i;
hpa_t root;
spin_lock(&kvm_arch->hv_root_tdp_lock);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 76fb00921203..c5028e6b0f96 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -185,7 +185,7 @@ void kvm_recalculate_apic_map(struct kvm *kvm)
{
struct kvm_apic_map *new, *old = NULL;
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
u32 max_id = 255; /* enough space for any xAPIC ID */
/* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map. */
@@ -673,41 +673,40 @@ static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu)
return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED;
}
-static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu)
-{
- u8 val;
- if (pv_eoi_get_user(vcpu, &val) < 0) {
- printk(KERN_WARNING "Can't read EOI MSR value: 0x%llx\n",
- (unsigned long long)vcpu->arch.pv_eoi.msr_val);
- return false;
- }
- return val & KVM_PV_EOI_ENABLED;
-}
-
static void pv_eoi_set_pending(struct kvm_vcpu *vcpu)
{
- if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) {
- printk(KERN_WARNING "Can't set EOI MSR value: 0x%llx\n",
- (unsigned long long)vcpu->arch.pv_eoi.msr_val);
+ if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0)
return;
- }
+
__set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
}
-static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
+static bool pv_eoi_test_and_clr_pending(struct kvm_vcpu *vcpu)
{
- if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) {
- printk(KERN_WARNING "Can't clear EOI MSR value: 0x%llx\n",
- (unsigned long long)vcpu->arch.pv_eoi.msr_val);
- return;
- }
+ u8 val;
+
+ if (pv_eoi_get_user(vcpu, &val) < 0)
+ return false;
+
+ val &= KVM_PV_EOI_ENABLED;
+
+ if (val && pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0)
+ return false;
+
+ /*
+ * Clear pending bit in any case: it will be set again on vmentry.
+ * While this might not be ideal from performance point of view,
+ * this makes sure pv eoi is only enabled when we know it's safe.
+ */
__clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
+
+ return val;
}
static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr)
{
int highest_irr;
- if (apic->vcpu->arch.apicv_active)
+ if (kvm_x86_ops.sync_pir_to_irr)
highest_irr = static_call(kvm_x86_sync_pir_to_irr)(apic->vcpu);
else
highest_irr = apic_find_highest_irr(apic);
@@ -1101,6 +1100,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
kvm_lapic_set_irr(vector, apic);
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
+ } else {
+ trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode,
+ trig_mode, vector);
}
break;
@@ -1172,8 +1174,8 @@ void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_lapic *src = NULL;
struct kvm_apic_map *map;
struct kvm_vcpu *vcpu;
- unsigned long bitmap;
- int i, vcpu_idx;
+ unsigned long bitmap, i;
+ int vcpu_idx;
bool ret;
rcu_read_lock();
@@ -1931,7 +1933,7 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
/* If the preempt notifier has already run, it also called apic_timer_expired */
if (!apic->lapic_timer.hv_timer_in_use)
goto out;
- WARN_ON(rcuwait_active(&vcpu->wait));
+ WARN_ON(kvm_vcpu_is_blocking(vcpu));
apic_timer_expired(apic, false);
cancel_hv_timer(apic);
@@ -2321,13 +2323,14 @@ EXPORT_SYMBOL_GPL(kvm_apic_update_apicv);
void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct kvm_lapic *apic = vcpu->arch.apic;
+ u64 msr_val;
int i;
if (!init_event) {
- vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE |
- MSR_IA32_APICBASE_ENABLE;
+ msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
if (kvm_vcpu_is_reset_bsp(vcpu))
- vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
+ msr_val |= MSR_IA32_APICBASE_BSP;
+ kvm_lapic_set_base(vcpu, msr_val);
}
if (!apic)
@@ -2336,11 +2339,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
/* Stop the timer in case it's a reset to an active apic */
hrtimer_cancel(&apic->lapic_timer.timer);
- if (!init_event) {
- apic->base_address = APIC_DEFAULT_PHYS_BASE;
-
+ /* The xAPIC ID is set at RESET even if the APIC was already enabled. */
+ if (!init_event)
kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
- }
kvm_apic_set_version(apic->vcpu);
for (i = 0; i < KVM_APIC_LVT_NUM; i++)
@@ -2481,6 +2482,11 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
lapic_timer_advance_dynamic = false;
}
+ /*
+ * Stuff the APIC ENABLE bit in lieu of temporarily incrementing
+ * apic_hw_disabled; the full RESET value is set by kvm_lapic_reset().
+ */
+ vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */
kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
@@ -2673,7 +2679,6 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu,
struct kvm_lapic *apic)
{
- bool pending;
int vector;
/*
* PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host
@@ -2687,14 +2692,8 @@ static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu,
* -> host enabled PV EOI, guest executed EOI.
*/
BUG_ON(!pv_eoi_enabled(vcpu));
- pending = pv_eoi_get_pending(vcpu);
- /*
- * Clear pending bit in any case: it will be set again on vmentry.
- * While this might not be ideal from performance point of view,
- * this makes sure pv eoi is only enabled when we know it's safe.
- */
- pv_eoi_clr_pending(vcpu);
- if (pending)
+
+ if (pv_eoi_test_and_clr_pending(vcpu))
return;
vector = apic_set_eoi(apic);
trace_kvm_pv_eoi(apic, vector);
@@ -2852,25 +2851,30 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
return 0;
}
-int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len)
+int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len)
{
u64 addr = data & ~KVM_MSR_ENABLED;
struct gfn_to_hva_cache *ghc = &vcpu->arch.pv_eoi.data;
unsigned long new_len;
+ int ret;
if (!IS_ALIGNED(addr, 4))
return 1;
- vcpu->arch.pv_eoi.msr_val = data;
- if (!pv_eoi_enabled(vcpu))
- return 0;
+ if (data & KVM_MSR_ENABLED) {
+ if (addr == ghc->gpa && len <= ghc->len)
+ new_len = ghc->len;
+ else
+ new_len = len;
- if (addr == ghc->gpa && len <= ghc->len)
- new_len = ghc->len;
- else
- new_len = len;
+ ret = kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, addr, new_len);
+ if (ret)
+ return ret;
+ }
- return kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, addr, new_len);
+ vcpu->arch.pv_eoi.msr_val = data;
+
+ return 0;
}
int kvm_apic_accept_events(struct kvm_vcpu *vcpu)
@@ -2942,5 +2946,7 @@ int kvm_apic_accept_events(struct kvm_vcpu *vcpu)
void kvm_lapic_exit(void)
{
static_key_deferred_flush(&apic_hw_disabled);
+ WARN_ON(static_branch_unlikely(&apic_hw_disabled.key));
static_key_deferred_flush(&apic_sw_disabled);
+ WARN_ON(static_branch_unlikely(&apic_sw_disabled.key));
}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index d7c25d0c1354..2b44e533fc8d 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -127,7 +127,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
-int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len);
+int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len);
void kvm_lapic_exit(void);
#define VEC_POS(v) ((v) & (32 - 1))
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index e9688a9f7b57..e9fbb2c8bbe2 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -44,9 +44,8 @@
#define PT32_ROOT_LEVEL 2
#define PT32E_ROOT_LEVEL 3
-#define KVM_MMU_CR4_ROLE_BITS (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | \
- X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE | \
- X86_CR4_LA57)
+#define KVM_MMU_CR4_ROLE_BITS (X86_CR4_PSE | X86_CR4_PAE | X86_CR4_LA57 | \
+ X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE)
#define KVM_MMU_CR0_ROLE_BITS (X86_CR0_PG | X86_CR0_WP)
@@ -72,7 +71,8 @@ void kvm_init_mmu(struct kvm_vcpu *vcpu);
void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
unsigned long cr4, u64 efer, gpa_t nested_cr3);
void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
- bool accessed_dirty, gpa_t new_eptp);
+ int huge_page_level, bool accessed_dirty,
+ gpa_t new_eptp);
bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
u64 fault_address, char *insn, int insn_len);
@@ -80,6 +80,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu);
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
+void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu);
static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
{
@@ -114,17 +115,91 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
vcpu->arch.mmu->shadow_root_level);
}
-int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
- bool prefault);
+struct kvm_page_fault {
+ /* arguments to kvm_mmu_do_page_fault. */
+ const gpa_t addr;
+ const u32 error_code;
+ const bool prefetch;
+
+ /* Derived from error_code. */
+ const bool exec;
+ const bool write;
+ const bool present;
+ const bool rsvd;
+ const bool user;
+
+ /* Derived from mmu and global state. */
+ const bool is_tdp;
+ const bool nx_huge_page_workaround_enabled;
+
+ /*
+ * Whether a >4KB mapping can be created or is forbidden due to NX
+ * hugepages.
+ */
+ bool huge_page_disallowed;
+
+ /*
+ * Maximum page size that can be created for this fault; input to
+ * FNAME(fetch), __direct_map and kvm_tdp_mmu_map.
+ */
+ u8 max_level;
+
+ /*
+ * Page size that can be created based on the max_level and the
+ * page size used by the host mapping.
+ */
+ u8 req_level;
+
+ /*
+ * Page size that will be created based on the req_level and
+ * huge_page_disallowed.
+ */
+ u8 goal_level;
+
+ /* Shifted addr, or result of guest page table walk if addr is a gva. */
+ gfn_t gfn;
+
+ /* The memslot containing gfn. May be NULL. */
+ struct kvm_memory_slot *slot;
+
+ /* Outputs of kvm_faultin_pfn. */
+ kvm_pfn_t pfn;
+ hva_t hva;
+ bool map_writable;
+};
+
+int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
+
+extern int nx_huge_pages;
+static inline bool is_nx_huge_page_enabled(void)
+{
+ return READ_ONCE(nx_huge_pages);
+}
static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
- u32 err, bool prefault)
+ u32 err, bool prefetch)
{
+ struct kvm_page_fault fault = {
+ .addr = cr2_or_gpa,
+ .error_code = err,
+ .exec = err & PFERR_FETCH_MASK,
+ .write = err & PFERR_WRITE_MASK,
+ .present = err & PFERR_PRESENT_MASK,
+ .rsvd = err & PFERR_RSVD_MASK,
+ .user = err & PFERR_USER_MASK,
+ .prefetch = prefetch,
+ .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault),
+ .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(),
+
+ .max_level = KVM_MAX_HUGEPAGE_LEVEL,
+ .req_level = PG_LEVEL_4K,
+ .goal_level = PG_LEVEL_4K,
+ };
#ifdef CONFIG_RETPOLINE
- if (likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault))
- return kvm_tdp_page_fault(vcpu, cr2_or_gpa, err, prefault);
+ if (fault.is_tdp)
+ return kvm_tdp_page_fault(vcpu, &fault);
#endif
- return vcpu->arch.mmu->page_fault(vcpu, cr2_or_gpa, err, prefault);
+ return vcpu->arch.mmu->page_fault(vcpu, &fault);
}
/*
@@ -230,14 +305,26 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
int kvm_mmu_post_init_vm(struct kvm *kvm);
void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
-static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
+static inline bool kvm_shadow_root_allocated(struct kvm *kvm)
{
/*
- * Read memslot_have_rmaps before rmap pointers. Hence, threads reading
- * memslots_have_rmaps in any lock context are guaranteed to see the
- * pointers. Pairs with smp_store_release in alloc_all_memslots_rmaps.
+ * Read shadow_root_allocated before related pointers. Hence, threads
+ * reading shadow_root_allocated in any lock context are guaranteed to
+ * see the pointers. Pairs with smp_store_release in
+ * mmu_first_shadow_root_alloc.
*/
- return smp_load_acquire(&kvm->arch.memslots_have_rmaps);
+ return smp_load_acquire(&kvm->arch.shadow_root_allocated);
+}
+
+#ifdef CONFIG_X86_64
+static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; }
+#else
+static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; }
+#endif
+
+static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
+{
+ return !is_tdp_mmu_enabled(kvm) || kvm_shadow_root_allocated(kvm);
}
static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
@@ -265,4 +352,17 @@ static inline void kvm_update_page_stats(struct kvm *kvm, int level, int count)
{
atomic64_add(count, &kvm->stat.pages[level - 1]);
}
+
+gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
+ struct x86_exception *exception);
+
+static inline gpa_t kvm_translate_gpa(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu,
+ gpa_t gpa, u32 access,
+ struct x86_exception *exception)
+{
+ if (mmu != &vcpu->arch.nested_mmu)
+ return gpa;
+ return translate_nested_gpa(vcpu, gpa, access, exception);
+}
#endif
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 1a64ba5b9437..1d275e9d76b5 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -58,6 +58,7 @@
extern bool itlb_multihit_kvm_mitigation;
int __read_mostly nx_huge_pages = -1;
+static uint __read_mostly nx_huge_pages_recovery_period_ms;
#ifdef CONFIG_PREEMPT_RT
/* Recovery can cause latency spikes, disable it for PREEMPT_RT. */
static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
@@ -66,23 +67,26 @@ static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
#endif
static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
-static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp);
+static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp);
static const struct kernel_param_ops nx_huge_pages_ops = {
.set = set_nx_huge_pages,
.get = param_get_bool,
};
-static const struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = {
- .set = set_nx_huge_pages_recovery_ratio,
+static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = {
+ .set = set_nx_huge_pages_recovery_param,
.get = param_get_uint,
};
module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
__MODULE_PARM_TYPE(nx_huge_pages, "bool");
-module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops,
+module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_param_ops,
&nx_huge_pages_recovery_ratio, 0644);
__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
+module_param_cb(nx_huge_pages_recovery_period_ms, &nx_huge_pages_recovery_param_ops,
+ &nx_huge_pages_recovery_period_ms, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages_recovery_period_ms, "uint");
static bool __read_mostly force_flush_and_sync_on_reuse;
module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
@@ -331,12 +335,6 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
return likely(kvm_gen == spte_gen);
}
-static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
- struct x86_exception *exception)
-{
- return gpa;
-}
-
static int is_cpuid_PSE36(void)
{
return 1;
@@ -1071,20 +1069,6 @@ static bool rmap_can_add(struct kvm_vcpu *vcpu)
return kvm_mmu_memory_cache_nr_free_objects(mc);
}
-static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
-{
- struct kvm_memory_slot *slot;
- struct kvm_mmu_page *sp;
- struct kvm_rmap_head *rmap_head;
-
- sp = sptep_to_sp(spte);
- kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
- slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
- rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
- return pte_list_add(vcpu, spte, rmap_head);
-}
-
-
static void rmap_remove(struct kvm *kvm, u64 *spte)
{
struct kvm_memslots *slots;
@@ -1097,9 +1081,9 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
/*
- * Unlike rmap_add and rmap_recycle, rmap_remove does not run in the
- * context of a vCPU so have to determine which memslots to use based
- * on context information in sp->role.
+ * Unlike rmap_add, rmap_remove does not run in the context of a vCPU
+ * so we have to determine which memslots to use based on context
+ * information in sp->role.
*/
slots = kvm_memslots_for_spte_role(kvm, sp->role);
@@ -1464,7 +1448,7 @@ static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
{
u64 *sptep;
struct rmap_iterator iter;
- int need_flush = 0;
+ bool need_flush = false;
u64 new_spte;
kvm_pfn_t new_pfn;
@@ -1476,7 +1460,7 @@ restart:
rmap_printk("spte %p %llx gfn %llx (%d)\n",
sptep, *sptep, gfn, level);
- need_flush = 1;
+ need_flush = true;
if (pte_write(pte)) {
pte_list_remove(kvm, rmap_head, sptep);
@@ -1492,7 +1476,7 @@ restart:
if (need_flush && kvm_available_flush_tlb_with_range()) {
kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
- return 0;
+ return false;
}
return need_flush;
@@ -1592,7 +1576,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp);
if (is_tdp_mmu_enabled(kvm))
- flush |= kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
+ flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
return flush;
}
@@ -1633,25 +1617,29 @@ static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
for_each_rmap_spte(rmap_head, &iter, sptep)
if (is_accessed_spte(*sptep))
- return 1;
- return 0;
+ return true;
+ return false;
}
#define RMAP_RECYCLE_THRESHOLD 1000
-static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
+static void rmap_add(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
+ u64 *spte, gfn_t gfn)
{
- struct kvm_memory_slot *slot;
- struct kvm_rmap_head *rmap_head;
struct kvm_mmu_page *sp;
+ struct kvm_rmap_head *rmap_head;
+ int rmap_count;
sp = sptep_to_sp(spte);
- slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
+ rmap_count = pte_list_add(vcpu, spte, rmap_head);
- kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0));
- kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
- KVM_PAGES_PER_HPAGE(sp->role.level));
+ if (rmap_count > RMAP_RECYCLE_THRESHOLD) {
+ kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0));
+ kvm_flush_remote_tlbs_with_address(
+ vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
+ }
}
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
@@ -1795,7 +1783,7 @@ static void mark_unsync(u64 *spte)
static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp)
{
- return 0;
+ return -1;
}
#define KVM_PAGE_ARRAY_NR 16
@@ -1909,12 +1897,14 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
struct list_head *invalid_list)
{
- if (vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
+ int ret = vcpu->arch.mmu->sync_page(vcpu, sp);
+
+ if (ret < 0) {
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
return false;
}
- return true;
+ return !!ret;
}
static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
@@ -1931,17 +1921,6 @@ static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
return true;
}
-static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
- struct list_head *invalid_list,
- bool remote_flush, bool local_flush)
-{
- if (kvm_mmu_remote_flush_or_zap(vcpu->kvm, invalid_list, remote_flush))
- return;
-
- if (local_flush)
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
-}
-
#ifdef CONFIG_KVM_MMU_AUDIT
#include "mmu_audit.c"
#else
@@ -1951,7 +1930,11 @@ static void mmu_audit_disable(void) { }
static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- return sp->role.invalid ||
+ if (sp->role.invalid)
+ return true;
+
+ /* TDP MMU pages due not use the MMU generation. */
+ return !sp->tdp_mmu_page &&
unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
}
@@ -2044,7 +2027,7 @@ static int mmu_sync_children(struct kvm_vcpu *vcpu,
protected |= rmap_write_protect(vcpu, sp->gfn);
if (protected) {
- kvm_flush_remote_tlbs(vcpu->kvm);
+ kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true);
flush = false;
}
@@ -2054,7 +2037,7 @@ static int mmu_sync_children(struct kvm_vcpu *vcpu,
mmu_pages_clear_parents(&parents);
}
if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
- kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
+ kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
if (!can_yield) {
kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
return -EINTR;
@@ -2065,7 +2048,7 @@ static int mmu_sync_children(struct kvm_vcpu *vcpu,
}
}
- kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
+ kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
return 0;
}
@@ -2097,10 +2080,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
role = vcpu->arch.mmu->mmu_role.base;
role.level = level;
role.direct = direct;
- if (role.direct)
- role.gpte_is_8_bytes = true;
role.access = access;
- if (!direct_mmu && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
+ if (role.has_4_byte_gpte) {
quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
role.quadrant = quadrant;
@@ -2149,7 +2130,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
break;
WARN_ON(!list_empty(&invalid_list));
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+ kvm_flush_remote_tlbs(vcpu->kvm);
}
__clear_sp_write_flooding_count(sp);
@@ -2188,10 +2169,10 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato
iterator->shadow_addr = root;
iterator->level = vcpu->arch.mmu->shadow_root_level;
- if (iterator->level == PT64_ROOT_4LEVEL &&
+ if (iterator->level >= PT64_ROOT_4LEVEL &&
vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
!vcpu->arch.mmu->direct_map)
- --iterator->level;
+ iterator->level = PT32E_ROOT_LEVEL;
if (iterator->level == PT32E_ROOT_LEVEL) {
/*
@@ -2229,7 +2210,7 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
u64 spte)
{
- if (is_last_spte(spte, iterator->level)) {
+ if (!is_shadow_present_pte(spte) || is_last_spte(spte, iterator->level)) {
iterator->level = 0;
return;
}
@@ -2576,10 +2557,10 @@ static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
return r;
}
-static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
trace_kvm_mmu_unsync_page(sp);
- ++vcpu->kvm->stat.mmu_unsync;
+ ++kvm->stat.mmu_unsync;
sp->unsync = 1;
kvm_mmu_mark_parents_unsync(sp);
@@ -2591,7 +2572,8 @@ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
* were marked unsync (or if there is no shadow page), -EPERM if the SPTE must
* be write-protected.
*/
-int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync)
+int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
+ gfn_t gfn, bool can_unsync, bool prefetch)
{
struct kvm_mmu_page *sp;
bool locked = false;
@@ -2601,7 +2583,7 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync)
* track machinery is used to write-protect upper-level shadow pages,
* i.e. this guards the role.level == 4K assertion below!
*/
- if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
+ if (kvm_slot_page_track_is_active(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE))
return -EPERM;
/*
@@ -2610,13 +2592,16 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync)
* that case, KVM must complete emulation of the guest TLB flush before
* allowing shadow pages to become unsync (writable by the guest).
*/
- for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
+ for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
if (!can_unsync)
return -EPERM;
if (sp->unsync)
continue;
+ if (prefetch)
+ return -EEXIST;
+
/*
* TDP MMU page faults require an additional spinlock as they
* run with mmu_lock held for read, not write, and the unsync
@@ -2626,7 +2611,7 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync)
*/
if (!locked) {
locked = true;
- spin_lock(&vcpu->kvm->arch.mmu_unsync_pages_lock);
+ spin_lock(&kvm->arch.mmu_unsync_pages_lock);
/*
* Recheck after taking the spinlock, a different vCPU
@@ -2641,10 +2626,10 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync)
}
WARN_ON(sp->role.level != PG_LEVEL_4K);
- kvm_unsync_page(vcpu, sp);
+ kvm_unsync_page(kvm, sp);
}
if (locked)
- spin_unlock(&vcpu->kvm->arch.mmu_unsync_pages_lock);
+ spin_unlock(&kvm->arch.mmu_unsync_pages_lock);
/*
* We need to ensure that the marking of unsync pages is visible
@@ -2680,48 +2665,30 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync)
* (sp->unsync = true)
*
* The write barrier below ensures that 1.1 happens before 1.2 and thus
- * the situation in 2.4 does not arise. The implicit barrier in 2.2
- * pairs with this write barrier.
+ * the situation in 2.4 does not arise. It pairs with the read barrier
+ * in is_unsync_root(), placed between 2.1's load of SPTE.W and 2.3.
*/
smp_wmb();
return 0;
}
-static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
- unsigned int pte_access, int level,
- gfn_t gfn, kvm_pfn_t pfn, bool speculative,
- bool can_unsync, bool host_writable)
-{
- u64 spte;
- struct kvm_mmu_page *sp;
- int ret;
-
- sp = sptep_to_sp(sptep);
-
- ret = make_spte(vcpu, pte_access, level, gfn, pfn, *sptep, speculative,
- can_unsync, host_writable, sp_ad_disabled(sp), &spte);
-
- if (spte & PT_WRITABLE_MASK)
- kvm_vcpu_mark_page_dirty(vcpu, gfn);
-
- if (*sptep == spte)
- ret |= SET_SPTE_SPURIOUS;
- else if (mmu_spte_update(sptep, spte))
- ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
- return ret;
-}
-
-static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
- unsigned int pte_access, bool write_fault, int level,
- gfn_t gfn, kvm_pfn_t pfn, bool speculative,
- bool host_writable)
+static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
+ u64 *sptep, unsigned int pte_access, gfn_t gfn,
+ kvm_pfn_t pfn, struct kvm_page_fault *fault)
{
+ struct kvm_mmu_page *sp = sptep_to_sp(sptep);
+ int level = sp->role.level;
int was_rmapped = 0;
- int rmap_count;
- int set_spte_ret;
int ret = RET_PF_FIXED;
bool flush = false;
+ bool wrprot;
+ u64 spte;
+
+ /* Prefetching always gets a writable pfn. */
+ bool host_writable = !fault || fault->map_writable;
+ bool prefetch = !fault || fault->prefetch;
+ bool write_fault = fault && fault->write;
pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
*sptep, write_fault, gfn);
@@ -2752,52 +2719,36 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
was_rmapped = 1;
}
- set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn,
- speculative, true, host_writable);
- if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
+ wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch,
+ true, host_writable, &spte);
+
+ if (*sptep == spte) {
+ ret = RET_PF_SPURIOUS;
+ } else {
+ trace_kvm_mmu_set_spte(level, gfn, sptep);
+ flush |= mmu_spte_update(sptep, spte);
+ }
+
+ if (wrprot) {
if (write_fault)
ret = RET_PF_EMULATE;
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
- if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush)
+ if (flush)
kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
KVM_PAGES_PER_HPAGE(level));
- /*
- * The fault is fully spurious if and only if the new SPTE and old SPTE
- * are identical, and emulation is not required.
- */
- if ((set_spte_ret & SET_SPTE_SPURIOUS) && ret == RET_PF_FIXED) {
- WARN_ON_ONCE(!was_rmapped);
- return RET_PF_SPURIOUS;
- }
-
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
- trace_kvm_mmu_set_spte(level, gfn, sptep);
if (!was_rmapped) {
+ WARN_ON_ONCE(ret == RET_PF_SPURIOUS);
kvm_update_page_stats(vcpu->kvm, level, 1);
- rmap_count = rmap_add(vcpu, sptep, gfn);
- if (rmap_count > RMAP_RECYCLE_THRESHOLD)
- rmap_recycle(vcpu, sptep, gfn);
+ rmap_add(vcpu, slot, sptep, gfn);
}
return ret;
}
-static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
- bool no_dirty_log)
-{
- struct kvm_memory_slot *slot;
-
- slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
- if (!slot)
- return KVM_PFN_ERR_FAULT;
-
- return gfn_to_pfn_memslot_atomic(slot, gfn);
-}
-
static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp,
u64 *start, u64 *end)
@@ -2818,8 +2769,8 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
return -1;
for (i = 0; i < ret; i++, gfn++, start++) {
- mmu_set_spte(vcpu, start, access, false, sp->role.level, gfn,
- page_to_pfn(pages[i]), true, true);
+ mmu_set_spte(vcpu, slot, start, access, gfn,
+ page_to_pfn(pages[i]), NULL);
put_page(pages[i]);
}
@@ -2842,11 +2793,13 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
if (!start)
continue;
if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
- break;
+ return;
start = NULL;
} else if (!start)
start = spte;
}
+ if (start)
+ direct_pte_prefetch_many(vcpu, sp, start, spte);
}
static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
@@ -2924,52 +2877,46 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
return min(host_level, max_level);
}
-int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
- int max_level, kvm_pfn_t *pfnp,
- bool huge_page_disallowed, int *req_level)
+void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
- struct kvm_memory_slot *slot;
- kvm_pfn_t pfn = *pfnp;
+ struct kvm_memory_slot *slot = fault->slot;
kvm_pfn_t mask;
- int level;
- *req_level = PG_LEVEL_4K;
+ fault->huge_page_disallowed = fault->exec && fault->nx_huge_page_workaround_enabled;
- if (unlikely(max_level == PG_LEVEL_4K))
- return PG_LEVEL_4K;
+ if (unlikely(fault->max_level == PG_LEVEL_4K))
+ return;
- if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn))
- return PG_LEVEL_4K;
+ if (is_error_noslot_pfn(fault->pfn) || kvm_is_reserved_pfn(fault->pfn))
+ return;
- slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, true);
- if (!slot)
- return PG_LEVEL_4K;
+ if (kvm_slot_dirty_track_enabled(slot))
+ return;
/*
* Enforce the iTLB multihit workaround after capturing the requested
* level, which will be used to do precise, accurate accounting.
*/
- *req_level = level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level);
- if (level == PG_LEVEL_4K || huge_page_disallowed)
- return PG_LEVEL_4K;
+ fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot,
+ fault->gfn, fault->pfn,
+ fault->max_level);
+ if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
+ return;
/*
* mmu_notifier_retry() was successful and mmu_lock is held, so
* the pmd can't be split from under us.
*/
- mask = KVM_PAGES_PER_HPAGE(level) - 1;
- VM_BUG_ON((gfn & mask) != (pfn & mask));
- *pfnp = pfn & ~mask;
-
- return level;
+ fault->goal_level = fault->req_level;
+ mask = KVM_PAGES_PER_HPAGE(fault->goal_level) - 1;
+ VM_BUG_ON((fault->gfn & mask) != (fault->pfn & mask));
+ fault->pfn &= ~mask;
}
-void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level,
- kvm_pfn_t *pfnp, int *goal_levelp)
+void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level)
{
- int level = *goal_levelp;
-
- if (cur_level == level && level > PG_LEVEL_4K &&
+ if (cur_level > PG_LEVEL_4K &&
+ cur_level == fault->goal_level &&
is_shadow_present_pte(spte) &&
!is_large_pte(spte)) {
/*
@@ -2979,42 +2926,33 @@ void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level,
* patching back for them into pfn the next 9 bits of
* the address.
*/
- u64 page_mask = KVM_PAGES_PER_HPAGE(level) -
- KVM_PAGES_PER_HPAGE(level - 1);
- *pfnp |= gfn & page_mask;
- (*goal_levelp)--;
+ u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) -
+ KVM_PAGES_PER_HPAGE(cur_level - 1);
+ fault->pfn |= fault->gfn & page_mask;
+ fault->goal_level--;
}
}
-static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
- int map_writable, int max_level, kvm_pfn_t pfn,
- bool prefault, bool is_tdp)
+static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
- bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
- bool write = error_code & PFERR_WRITE_MASK;
- bool exec = error_code & PFERR_FETCH_MASK;
- bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
struct kvm_shadow_walk_iterator it;
struct kvm_mmu_page *sp;
- int level, req_level, ret;
- gfn_t gfn = gpa >> PAGE_SHIFT;
- gfn_t base_gfn = gfn;
+ int ret;
+ gfn_t base_gfn = fault->gfn;
- level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
- huge_page_disallowed, &req_level);
+ kvm_mmu_hugepage_adjust(vcpu, fault);
- trace_kvm_mmu_spte_requested(gpa, level, pfn);
- for_each_shadow_entry(vcpu, gpa, it) {
+ trace_kvm_mmu_spte_requested(fault);
+ for_each_shadow_entry(vcpu, fault->addr, it) {
/*
* We cannot overwrite existing page tables with an NX
* large page, as the leaf could be executable.
*/
- if (nx_huge_page_workaround_enabled)
- disallowed_hugepage_adjust(*it.sptep, gfn, it.level,
- &pfn, &level);
+ if (fault->nx_huge_page_workaround_enabled)
+ disallowed_hugepage_adjust(fault, *it.sptep, it.level);
- base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
- if (it.level == level)
+ base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+ if (it.level == fault->goal_level)
break;
drop_large_spte(vcpu, it.sptep);
@@ -3025,14 +2963,16 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
it.level - 1, true, ACC_ALL);
link_shadow_page(vcpu, it.sptep, sp);
- if (is_tdp && huge_page_disallowed &&
- req_level >= it.level)
+ if (fault->is_tdp && fault->huge_page_disallowed &&
+ fault->req_level >= it.level)
account_huge_nx_page(vcpu->kvm, sp);
}
- ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
- write, level, base_gfn, pfn, prefault,
- map_writable);
+ if (WARN_ON_ONCE(it.level != fault->goal_level))
+ return -EFAULT;
+
+ ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL,
+ base_gfn, fault->pfn, fault);
if (ret == RET_PF_SPURIOUS)
return ret;
@@ -3064,18 +3004,19 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
return -EFAULT;
}
-static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
- kvm_pfn_t pfn, unsigned int access,
- int *ret_val)
+static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
+ unsigned int access, int *ret_val)
{
/* The pfn is invalid, report the error! */
- if (unlikely(is_error_pfn(pfn))) {
- *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
+ if (unlikely(is_error_pfn(fault->pfn))) {
+ *ret_val = kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn);
return true;
}
- if (unlikely(is_noslot_pfn(pfn))) {
- vcpu_cache_mmio_info(vcpu, gva, gfn,
+ if (unlikely(!fault->slot)) {
+ gva_t gva = fault->is_tdp ? 0 : fault->addr;
+
+ vcpu_cache_mmio_info(vcpu, gva, fault->gfn,
access & shadow_mmio_access_mask);
/*
* If MMIO caching is disabled, emulate immediately without
@@ -3091,18 +3032,17 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
return false;
}
-static bool page_fault_can_be_fast(u32 error_code)
+static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
{
/*
* Do not fix the mmio spte with invalid generation number which
* need to be updated by slow page fault path.
*/
- if (unlikely(error_code & PFERR_RSVD_MASK))
+ if (fault->rsvd)
return false;
/* See if the page fault is due to an NX violation */
- if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))
- == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))))
+ if (unlikely(fault->exec && fault->present))
return false;
/*
@@ -3119,9 +3059,7 @@ static bool page_fault_can_be_fast(u32 error_code)
* accesses to a present page.
*/
- return shadow_acc_track_mask != 0 ||
- ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK))
- == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK));
+ return shadow_acc_track_mask != 0 || (fault->write && fault->present);
}
/*
@@ -3129,13 +3067,9 @@ static bool page_fault_can_be_fast(u32 error_code)
* someone else modified the SPTE from its original value.
*/
static bool
-fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
u64 *sptep, u64 old_spte, u64 new_spte)
{
- gfn_t gfn;
-
- WARN_ON(!sp->role.direct);
-
/*
* Theoretically we could also set dirty bit (and flush TLB) here in
* order to eliminate unnecessary PML logging. See comments in
@@ -3151,24 +3085,18 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
return false;
- if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) {
- /*
- * The gfn of direct spte is stable since it is
- * calculated by sp->gfn.
- */
- gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
- kvm_vcpu_mark_page_dirty(vcpu, gfn);
- }
+ if (is_writable_pte(new_spte) && !is_writable_pte(old_spte))
+ mark_page_dirty_in_slot(vcpu->kvm, fault->slot, fault->gfn);
return true;
}
-static bool is_access_allowed(u32 fault_err_code, u64 spte)
+static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte)
{
- if (fault_err_code & PFERR_FETCH_MASK)
+ if (fault->exec)
return is_executable_pte(spte);
- if (fault_err_code & PFERR_WRITE_MASK)
+ if (fault->write)
return is_writable_pte(spte);
/* Fault was on Read access */
@@ -3193,9 +3121,6 @@ static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte)
for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) {
sptep = iterator.sptep;
*spte = old_spte;
-
- if (!is_shadow_present_pte(old_spte))
- break;
}
return sptep;
@@ -3204,7 +3129,7 @@ static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte)
/*
* Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
*/
-static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code)
+static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
struct kvm_mmu_page *sp;
int ret = RET_PF_INVALID;
@@ -3212,7 +3137,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code)
u64 *sptep = NULL;
uint retry_count = 0;
- if (!page_fault_can_be_fast(error_code))
+ if (!page_fault_can_be_fast(fault))
return ret;
walk_shadow_page_lockless_begin(vcpu);
@@ -3221,9 +3146,9 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code)
u64 new_spte;
if (is_tdp_mmu(vcpu->arch.mmu))
- sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, gpa, &spte);
+ sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
else
- sptep = fast_pf_get_last_sptep(vcpu, gpa, &spte);
+ sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
if (!is_shadow_present_pte(spte))
break;
@@ -3242,7 +3167,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code)
* Need not check the access of upper level table entries since
* they are always ACC_ALL.
*/
- if (is_access_allowed(error_code, spte)) {
+ if (is_access_allowed(fault, spte)) {
ret = RET_PF_SPURIOUS;
break;
}
@@ -3257,28 +3182,28 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code)
* be removed in the fast path only if the SPTE was
* write-protected for dirty-logging or access tracking.
*/
- if ((error_code & PFERR_WRITE_MASK) &&
+ if (fault->write &&
spte_can_locklessly_be_made_writable(spte)) {
new_spte |= PT_WRITABLE_MASK;
/*
- * Do not fix write-permission on the large spte. Since
- * we only dirty the first page into the dirty-bitmap in
+ * Do not fix write-permission on the large spte when
+ * dirty logging is enabled. Since we only dirty the
+ * first page into the dirty-bitmap in
* fast_pf_fix_direct_spte(), other pages are missed
* if its slot has dirty logging enabled.
*
* Instead, we let the slow page fault path create a
* normal spte to fix the access.
- *
- * See the comments in kvm_arch_commit_memory_region().
*/
- if (sp->role.level > PG_LEVEL_4K)
+ if (sp->role.level > PG_LEVEL_4K &&
+ kvm_slot_dirty_track_enabled(fault->slot))
break;
}
/* Verify that the fault can be handled in the fast path */
if (new_spte == spte ||
- !is_access_allowed(error_code, new_spte))
+ !is_access_allowed(fault, new_spte))
break;
/*
@@ -3286,7 +3211,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code)
* since the gfn is not stable for indirect shadow page. See
* Documentation/virt/kvm/locking.rst to get more detail.
*/
- if (fast_pf_fix_direct_spte(vcpu, sp, sptep, spte, new_spte)) {
+ if (fast_pf_fix_direct_spte(vcpu, fault, sptep, spte, new_spte)) {
ret = RET_PF_FIXED;
break;
}
@@ -3299,7 +3224,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code)
} while (true);
- trace_fast_page_fault(vcpu, gpa, error_code, sptep, spte, ret);
+ trace_fast_page_fault(vcpu, fault, sptep, spte, ret);
walk_shadow_page_lockless_end(vcpu);
return ret;
@@ -3472,6 +3397,67 @@ out_unlock:
return r;
}
+static int mmu_first_shadow_root_alloc(struct kvm *kvm)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
+ int r = 0, i, bkt;
+
+ /*
+ * Check if this is the first shadow root being allocated before
+ * taking the lock.
+ */
+ if (kvm_shadow_root_allocated(kvm))
+ return 0;
+
+ mutex_lock(&kvm->slots_arch_lock);
+
+ /* Recheck, under the lock, whether this is the first shadow root. */
+ if (kvm_shadow_root_allocated(kvm))
+ goto out_unlock;
+
+ /*
+ * Check if anything actually needs to be allocated, e.g. all metadata
+ * will be allocated upfront if TDP is disabled.
+ */
+ if (kvm_memslots_have_rmaps(kvm) &&
+ kvm_page_track_write_tracking_enabled(kvm))
+ goto out_success;
+
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ slots = __kvm_memslots(kvm, i);
+ kvm_for_each_memslot(slot, bkt, slots) {
+ /*
+ * Both of these functions are no-ops if the target is
+ * already allocated, so unconditionally calling both
+ * is safe. Intentionally do NOT free allocations on
+ * failure to avoid having to track which allocations
+ * were made now versus when the memslot was created.
+ * The metadata is guaranteed to be freed when the slot
+ * is freed, and will be kept/used if userspace retries
+ * KVM_RUN instead of killing the VM.
+ */
+ r = memslot_rmap_alloc(slot, slot->npages);
+ if (r)
+ goto out_unlock;
+ r = kvm_page_track_write_tracking_alloc(slot);
+ if (r)
+ goto out_unlock;
+ }
+ }
+
+ /*
+ * Ensure that shadow_root_allocated becomes true strictly after
+ * all the related pointers are set.
+ */
+out_success:
+ smp_store_release(&kvm->arch.shadow_root_allocated, true);
+
+out_unlock:
+ mutex_unlock(&kvm->slots_arch_lock);
+ return r;
+}
+
static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
@@ -3502,7 +3488,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
}
}
- r = alloc_all_memslots_rmaps(vcpu->kvm);
+ r = mmu_first_shadow_root_alloc(vcpu->kvm);
if (r)
return r;
@@ -3653,6 +3639,33 @@ err_pml4:
#endif
}
+static bool is_unsync_root(hpa_t root)
+{
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(root))
+ return false;
+
+ /*
+ * The read barrier orders the CPU's read of SPTE.W during the page table
+ * walk before the reads of sp->unsync/sp->unsync_children here.
+ *
+ * Even if another CPU was marking the SP as unsync-ed simultaneously,
+ * any guest page table changes are not guaranteed to be visible anyway
+ * until this VCPU issues a TLB flush strictly after those changes are
+ * made. We only need to ensure that the other CPU sets these flags
+ * before any actual changes to the page tables are made. The comments
+ * in mmu_try_to_unsync_pages() describe what could go wrong if this
+ * requirement isn't satisfied.
+ */
+ smp_rmb();
+ sp = to_shadow_page(root);
+ if (sp->unsync || sp->unsync_children)
+ return true;
+
+ return false;
+}
+
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
{
int i;
@@ -3670,18 +3683,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
hpa_t root = vcpu->arch.mmu->root_hpa;
sp = to_shadow_page(root);
- /*
- * Even if another CPU was marking the SP as unsync-ed
- * simultaneously, any guest page table changes are not
- * guaranteed to be visible anyway until this VCPU issues a TLB
- * flush strictly after those changes are made. We only need to
- * ensure that the other CPU sets these flags before any actual
- * changes to the page tables are made. The comments in
- * mmu_try_to_unsync_pages() describe what could go wrong if
- * this requirement isn't satisfied.
- */
- if (!smp_load_acquire(&sp->unsync) &&
- !smp_load_acquire(&sp->unsync_children))
+ if (!is_unsync_root(root))
return;
write_lock(&vcpu->kvm->mmu_lock);
@@ -3711,21 +3713,26 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
write_unlock(&vcpu->kvm->mmu_lock);
}
-static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr,
- u32 access, struct x86_exception *exception)
+void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu)
{
- if (exception)
- exception->error_code = 0;
- return vaddr;
+ unsigned long roots_to_free = 0;
+ int i;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (is_unsync_root(vcpu->arch.mmu->prev_roots[i].hpa))
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+
+ /* sync prev_roots by simply freeing them */
+ kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
}
-static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr,
- u32 access,
- struct x86_exception *exception)
+static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ gpa_t vaddr, u32 access,
+ struct x86_exception *exception)
{
if (exception)
exception->error_code = 0;
- return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception);
+ return kvm_translate_gpa(vcpu, mmu, vaddr, access, exception);
}
static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
@@ -3763,9 +3770,6 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level
spte = mmu_spte_get_lockless(iterator.sptep);
sptes[leaf] = spte;
-
- if (!is_shadow_present_pte(spte))
- break;
}
return leaf;
@@ -3856,20 +3860,19 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
}
static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
- u32 error_code, gfn_t gfn)
+ struct kvm_page_fault *fault)
{
- if (unlikely(error_code & PFERR_RSVD_MASK))
+ if (unlikely(fault->rsvd))
return false;
- if (!(error_code & PFERR_PRESENT_MASK) ||
- !(error_code & PFERR_WRITE_MASK))
+ if (!fault->present || !fault->write)
return false;
/*
* guest is writing the page which is write tracked which can
* not be fixed by page fault handler.
*/
- if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
+ if (kvm_slot_page_track_is_active(vcpu->kvm, fault->slot, fault->gfn, KVM_PAGE_TRACK_WRITE))
return true;
return false;
@@ -3881,11 +3884,8 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
u64 spte;
walk_shadow_page_lockless_begin(vcpu);
- for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
+ for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
clear_sp_write_flooding_count(iterator.sptep);
- if (!is_shadow_present_pte(spte))
- break;
- }
walk_shadow_page_lockless_end(vcpu);
}
@@ -3903,11 +3903,9 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
}
-static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
- gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva,
- bool write, bool *writable, int *r)
+static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, int *r)
{
- struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ struct kvm_memory_slot *slot = fault->slot;
bool async;
/*
@@ -3921,8 +3919,9 @@ static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
if (!kvm_is_visible_memslot(slot)) {
/* Don't expose private memslots to L2. */
if (is_guest_mode(vcpu)) {
- *pfn = KVM_PFN_NOSLOT;
- *writable = false;
+ fault->slot = NULL;
+ fault->pfn = KVM_PFN_NOSLOT;
+ fault->map_writable = false;
return false;
}
/*
@@ -3939,46 +3938,74 @@ static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
}
async = false;
- *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async,
- write, writable, hva);
+ fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async,
+ fault->write, &fault->map_writable,
+ &fault->hva);
if (!async)
return false; /* *pfn has correct page already */
- if (!prefault && kvm_can_do_async_pf(vcpu)) {
- trace_kvm_try_async_get_page(cr2_or_gpa, gfn);
- if (kvm_find_async_pf_gfn(vcpu, gfn)) {
- trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn);
+ if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) {
+ trace_kvm_try_async_get_page(fault->addr, fault->gfn);
+ if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) {
+ trace_kvm_async_pf_doublefault(fault->addr, fault->gfn);
kvm_make_request(KVM_REQ_APF_HALT, vcpu);
goto out_retry;
- } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn))
+ } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn))
goto out_retry;
}
- *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL,
- write, writable, hva);
+ fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, NULL,
+ fault->write, &fault->map_writable,
+ &fault->hva);
+ return false;
out_retry:
*r = RET_PF_RETRY;
return true;
}
-static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
- bool prefault, int max_level, bool is_tdp)
+/*
+ * Returns true if the page fault is stale and needs to be retried, i.e. if the
+ * root was invalidated by a memslot update or a relevant mmu_notifier fired.
+ */
+static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault, int mmu_seq)
+{
+ struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root_hpa);
+
+ /* Special roots, e.g. pae_root, are not backed by shadow pages. */
+ if (sp && is_obsolete_sp(vcpu->kvm, sp))
+ return true;
+
+ /*
+ * Roots without an associated shadow page are considered invalid if
+ * there is a pending request to free obsolete roots. The request is
+ * only a hint that the current root _may_ be obsolete and needs to be
+ * reloaded, e.g. if the guest frees a PGD that KVM is tracking as a
+ * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs
+ * to reload even if no vCPU is actively using the root.
+ */
+ if (!sp && kvm_test_request(KVM_REQ_MMU_RELOAD, vcpu))
+ return true;
+
+ return fault->slot &&
+ mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
+}
+
+static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu);
- bool write = error_code & PFERR_WRITE_MASK;
- bool map_writable;
- gfn_t gfn = gpa >> PAGE_SHIFT;
unsigned long mmu_seq;
- kvm_pfn_t pfn;
- hva_t hva;
int r;
- if (page_fault_handle_page_track(vcpu, error_code, gfn))
+ fault->gfn = fault->addr >> PAGE_SHIFT;
+ fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn);
+
+ if (page_fault_handle_page_track(vcpu, fault))
return RET_PF_EMULATE;
- r = fast_page_fault(vcpu, gpa, error_code);
+ r = fast_page_fault(vcpu, fault);
if (r != RET_PF_INVALID)
return r;
@@ -3989,11 +4016,10 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- if (kvm_faultin_pfn(vcpu, prefault, gfn, gpa, &pfn, &hva,
- write, &map_writable, &r))
+ if (kvm_faultin_pfn(vcpu, fault, &r))
return r;
- if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r))
+ if (handle_abnormal_pfn(vcpu, fault, ACC_ALL, &r))
return r;
r = RET_PF_RETRY;
@@ -4003,36 +4029,35 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
else
write_lock(&vcpu->kvm->mmu_lock);
- if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva))
+ if (is_page_fault_stale(vcpu, fault, mmu_seq))
goto out_unlock;
+
r = make_mmu_pages_available(vcpu);
if (r)
goto out_unlock;
if (is_tdp_mmu_fault)
- r = kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable, max_level,
- pfn, prefault);
+ r = kvm_tdp_mmu_map(vcpu, fault);
else
- r = __direct_map(vcpu, gpa, error_code, map_writable, max_level, pfn,
- prefault, is_tdp);
+ r = __direct_map(vcpu, fault);
out_unlock:
if (is_tdp_mmu_fault)
read_unlock(&vcpu->kvm->mmu_lock);
else
write_unlock(&vcpu->kvm->mmu_lock);
- kvm_release_pfn_clean(pfn);
+ kvm_release_pfn_clean(fault->pfn);
return r;
}
-static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa,
- u32 error_code, bool prefault)
+static int nonpaging_page_fault(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
{
- pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code);
+ pgprintk("%s: gva %lx error %x\n", __func__, fault->addr, fault->error_code);
/* This path builds a PAE pagetable, we can map 2mb pages at maximum. */
- return direct_page_fault(vcpu, gpa & PAGE_MASK, error_code, prefault,
- PG_LEVEL_2M, false);
+ fault->max_level = PG_LEVEL_2M;
+ return direct_page_fault(vcpu, fault);
}
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
@@ -4068,23 +4093,19 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
}
EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
-int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
- bool prefault)
+int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
- int max_level;
-
- for (max_level = KVM_MAX_HUGEPAGE_LEVEL;
- max_level > PG_LEVEL_4K;
- max_level--) {
- int page_num = KVM_PAGES_PER_HPAGE(max_level);
- gfn_t base = (gpa >> PAGE_SHIFT) & ~(page_num - 1);
+ while (fault->max_level > PG_LEVEL_4K) {
+ int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
+ gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1);
if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
break;
+
+ --fault->max_level;
}
- return direct_page_fault(vcpu, gpa, error_code, prefault,
- max_level, true);
+ return direct_page_fault(vcpu, fault);
}
static void nonpaging_init_context(struct kvm_mmu *context)
@@ -4205,7 +4226,7 @@ static unsigned long get_cr3(struct kvm_vcpu *vcpu)
}
static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
- unsigned int access, int *nr_present)
+ unsigned int access)
{
if (unlikely(is_mmio_spte(*sptep))) {
if (gfn != get_mmio_spte_gfn(*sptep)) {
@@ -4213,7 +4234,6 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
return true;
}
- (*nr_present)++;
mark_mmio_spte(vcpu, sptep, gfn, access);
return true;
}
@@ -4352,22 +4372,28 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
static void
__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
- u64 pa_bits_rsvd, bool execonly)
+ u64 pa_bits_rsvd, bool execonly, int huge_page_level)
{
u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
+ u64 large_1g_rsvd = 0, large_2m_rsvd = 0;
u64 bad_mt_xwr;
+ if (huge_page_level < PG_LEVEL_1G)
+ large_1g_rsvd = rsvd_bits(7, 7);
+ if (huge_page_level < PG_LEVEL_2M)
+ large_2m_rsvd = rsvd_bits(7, 7);
+
rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7);
rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7);
- rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6);
- rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6);
+ rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd;
+ rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd;
rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
/* large page */
rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
- rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29);
- rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20);
+ rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd;
+ rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd;
rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */
@@ -4383,10 +4409,11 @@ __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
}
static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context, bool execonly)
+ struct kvm_mmu *context, bool execonly, int huge_page_level)
{
__reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
- vcpu->arch.reserved_gpa_bits, execonly);
+ vcpu->arch.reserved_gpa_bits, execonly,
+ huge_page_level);
}
static inline u64 reserved_hpa_bits(void)
@@ -4462,7 +4489,8 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
false, true);
else
__reset_rsvds_bits_mask_ept(shadow_zero_check,
- reserved_hpa_bits(), false);
+ reserved_hpa_bits(), false,
+ max_huge_page_level);
if (!shadow_me_mask)
return;
@@ -4482,7 +4510,8 @@ reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
struct kvm_mmu *context, bool execonly)
{
__reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
- reserved_hpa_bits(), execonly);
+ reserved_hpa_bits(), execonly,
+ max_huge_page_level);
}
#define BYTE_MASK(access) \
@@ -4596,10 +4625,10 @@ static void update_pkru_bitmask(struct kvm_mmu *mmu)
unsigned bit;
bool wp;
- if (!is_cr4_pke(mmu)) {
- mmu->pkru_mask = 0;
+ mmu->pkru_mask = 0;
+
+ if (!is_cr4_pke(mmu))
return;
- }
wp = is_cr0_wp(mmu);
@@ -4679,6 +4708,7 @@ static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu,
/* PKEY and LA57 are active iff long mode is active. */
ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
+ ext.efer_lma = ____is_efer_lma(regs);
}
ext.valid = 1;
@@ -4730,7 +4760,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
role.base.ad_disabled = (shadow_accessed_mask == 0);
role.base.level = kvm_mmu_get_tdp_level(vcpu);
role.base.direct = true;
- role.base.gpte_is_8_bytes = true;
+ role.base.has_4_byte_gpte = false;
return role;
}
@@ -4775,7 +4805,7 @@ kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu,
role.base.smep_andnot_wp = role.ext.cr4_smep && !____is_cr0_wp(regs);
role.base.smap_andnot_wp = role.ext.cr4_smap && !____is_cr0_wp(regs);
- role.base.gpte_is_8_bytes = ____is_cr0_pg(regs) && ____is_cr4_pae(regs);
+ role.base.has_4_byte_gpte = ____is_cr0_pg(regs) && !____is_cr4_pae(regs);
return role;
}
@@ -4851,7 +4881,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
struct kvm_mmu *context = &vcpu->arch.guest_mmu;
struct kvm_mmu_role_regs regs = {
.cr0 = cr0,
- .cr4 = cr4,
+ .cr4 = cr4 & ~X86_CR4_PKE,
.efer = efer,
};
union kvm_mmu_role new_role;
@@ -4874,7 +4904,7 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm;
role.base.level = level;
- role.base.gpte_is_8_bytes = true;
+ role.base.has_4_byte_gpte = false;
role.base.direct = false;
role.base.ad_disabled = !accessed_dirty;
role.base.guest_mode = true;
@@ -4889,7 +4919,8 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
}
void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
- bool accessed_dirty, gpa_t new_eptp)
+ int huge_page_level, bool accessed_dirty,
+ gpa_t new_eptp)
{
struct kvm_mmu *context = &vcpu->arch.guest_mmu;
u8 level = vmx_eptp_page_walk_level(new_eptp);
@@ -4915,8 +4946,8 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
context->direct_map = false;
update_permission_bitmask(context, true);
- update_pkru_bitmask(context);
- reset_rsvds_bits_mask_ept(vcpu, context, execonly);
+ context->pkru_mask = 0;
+ reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level);
reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
@@ -4980,13 +5011,13 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
* the gva_to_gpa functions between mmu and nested_mmu are swapped.
*/
if (!is_paging(vcpu))
- g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
+ g_context->gva_to_gpa = nonpaging_gva_to_gpa;
else if (is_long_mode(vcpu))
- g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
+ g_context->gva_to_gpa = paging64_gva_to_gpa;
else if (is_pae(vcpu))
- g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
+ g_context->gva_to_gpa = paging64_gva_to_gpa;
else
- g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
+ g_context->gva_to_gpa = paging32_gva_to_gpa;
reset_guest_paging_metadata(vcpu, g_context);
}
@@ -5021,6 +5052,14 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
/*
* Invalidate all MMU roles to force them to reinitialize as CPUID
* information is factored into reserved bit calculations.
+ *
+ * Correctly handling multiple vCPU models with respect to paging and
+ * physical address properties) in a single VM would require tracking
+ * all relevant CPUID information in kvm_mmu_page_role. That is very
+ * undesirable as it would increase the memory requirements for
+ * gfn_track (see struct kvm_mmu_page_role comments). For now that
+ * problem is swept under the rug; KVM's CPUID API is horrific and
+ * it's all but impossible to solve it without introducing a new API.
*/
vcpu->arch.root_mmu.mmu_role.ext.valid = 0;
vcpu->arch.guest_mmu.mmu_role.ext.valid = 0;
@@ -5028,24 +5067,10 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
kvm_mmu_reset_context(vcpu);
/*
- * KVM does not correctly handle changing guest CPUID after KVM_RUN, as
- * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
- * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
- * faults due to reusing SPs/SPTEs. Alert userspace, but otherwise
- * sweep the problem under the rug.
- *
- * KVM's horrific CPUID ABI makes the problem all but impossible to
- * solve, as correctly handling multiple vCPU models (with respect to
- * paging and physical address properties) in a single VM would require
- * tracking all relevant CPUID information in kvm_mmu_page_role. That
- * is very undesirable as it would double the memory requirements for
- * gfn_track (see struct kvm_mmu_page_role comments), and in practice
- * no sane VMM mucks with the core vCPU model on the fly.
+ * Changing guest CPUID after KVM_RUN is forbidden, see the comment in
+ * kvm_arch_vcpu_ioctl().
*/
- if (vcpu->arch.last_vmentry_cpu != -1) {
- pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} after KVM_RUN may cause guest instability\n");
- pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} will fail after KVM_RUN starting with Linux 5.16\n");
- }
+ KVM_BUG_ON(vcpu->arch.last_vmentry_cpu != -1, vcpu->kvm);
}
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@ -5157,7 +5182,7 @@ static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
gpa, bytes, sp->role.word);
offset = offset_in_page(gpa);
- pte_size = sp->role.gpte_is_8_bytes ? 8 : 4;
+ pte_size = sp->role.has_4_byte_gpte ? 4 : 8;
/*
* Sometimes, the OS only writes the last one bytes to update status
@@ -5181,7 +5206,7 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
page_offset = offset_in_page(gpa);
level = sp->role.level;
*nspte = 1;
- if (!sp->role.gpte_is_8_bytes) {
+ if (sp->role.has_4_byte_gpte) {
page_offset <<= 1; /* 32->64 */
/*
* A 32-bit pde maps 4MB while the shadow pdes map
@@ -5212,7 +5237,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
LIST_HEAD(invalid_list);
u64 entry, gentry, *spte;
int npte;
- bool remote_flush, local_flush;
+ bool flush = false;
/*
* If we don't have indirect shadow pages, it means no page is
@@ -5221,8 +5246,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
return;
- remote_flush = local_flush = false;
-
pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
/*
@@ -5251,18 +5274,17 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
if (!spte)
continue;
- local_flush = true;
while (npte--) {
entry = *spte;
mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL);
if (gentry && sp->role.level != PG_LEVEL_4K)
++vcpu->kvm->stat.mmu_pde_zapped;
if (need_remote_flush(entry, *spte))
- remote_flush = true;
+ flush = true;
++spte;
}
}
- kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
+ kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
write_unlock(&vcpu->kvm->mmu_lock);
}
@@ -5368,7 +5390,7 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
- kvm_mmu_invalidate_gva(vcpu, vcpu->arch.mmu, gva, INVALID_PAGE);
+ kvm_mmu_invalidate_gva(vcpu, vcpu->arch.walk_mmu, gva, INVALID_PAGE);
++vcpu->stat.invlpg;
}
EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
@@ -5473,8 +5495,8 @@ slot_handle_level(struct kvm *kvm, const struct kvm_memory_slot *memslot,
}
static __always_inline bool
-slot_handle_leaf(struct kvm *kvm, const struct kvm_memory_slot *memslot,
- slot_level_handler fn, bool flush_on_yield)
+slot_handle_level_4k(struct kvm *kvm, const struct kvm_memory_slot *memslot,
+ slot_level_handler fn, bool flush_on_yield)
{
return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
PG_LEVEL_4K, flush_on_yield);
@@ -5496,10 +5518,13 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
mmu->root_hpa = INVALID_PAGE;
mmu->root_pgd = 0;
- mmu->translate_gpa = translate_gpa;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
+ /* vcpu->arch.guest_mmu isn't used when !tdp_enabled. */
+ if (!tdp_enabled && mmu == &vcpu->arch.guest_mmu)
+ return 0;
+
/*
* When using PAE paging, the four PDPTEs are treated as 'root' pages,
* while the PDP table is a per-vCPU construct that's allocated at MMU
@@ -5509,7 +5534,7 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
* generally doesn't use PAE paging and can skip allocating the PDP
* table. The main exception, handled here, is SVM's 32-bit NPT. The
* other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit
- * KVM; that horror is handled on-demand by mmu_alloc_shadow_roots().
+ * KVM; that horror is handled on-demand by mmu_alloc_special_roots().
*/
if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
return 0;
@@ -5554,8 +5579,6 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
vcpu->arch.mmu = &vcpu->arch.root_mmu;
vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
- vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
-
ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu);
if (ret)
return ret;
@@ -5694,13 +5717,7 @@ void kvm_mmu_init_vm(struct kvm *kvm)
spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
- if (!kvm_mmu_init_tdp_mmu(kvm))
- /*
- * No smp_load/store wrappers needed here as we are in
- * VM init and there cannot be any memslots / other threads
- * accessing this struct kvm yet.
- */
- kvm->arch.memslots_have_rmaps = true;
+ kvm_mmu_init_tdp_mmu(kvm);
node->track_write = kvm_mmu_pte_write;
node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
@@ -5716,55 +5733,64 @@ void kvm_mmu_uninit_vm(struct kvm *kvm)
kvm_mmu_uninit_tdp_mmu(kvm);
}
+static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
+{
+ const struct kvm_memory_slot *memslot;
+ struct kvm_memslots *slots;
+ struct kvm_memslot_iter iter;
+ bool flush = false;
+ gfn_t start, end;
+ int i;
+
+ if (!kvm_memslots_have_rmaps(kvm))
+ return flush;
+
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ slots = __kvm_memslots(kvm, i);
+
+ kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) {
+ memslot = iter.slot;
+ start = max(gfn_start, memslot->base_gfn);
+ end = min(gfn_end, memslot->base_gfn + memslot->npages);
+ if (WARN_ON_ONCE(start >= end))
+ continue;
+
+ flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
+ PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
+ start, end - 1, true, flush);
+ }
+ }
+
+ return flush;
+}
+
/*
* Invalidate (zap) SPTEs that cover GFNs from gfn_start and up to gfn_end
* (not including it)
*/
void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
{
- struct kvm_memslots *slots;
- struct kvm_memory_slot *memslot;
+ bool flush;
int i;
- bool flush = false;
+
+ if (WARN_ON_ONCE(gfn_end <= gfn_start))
+ return;
write_lock(&kvm->mmu_lock);
kvm_inc_notifier_count(kvm, gfn_start, gfn_end);
- if (kvm_memslots_have_rmaps(kvm)) {
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
- slots = __kvm_memslots(kvm, i);
- kvm_for_each_memslot(memslot, slots) {
- gfn_t start, end;
-
- start = max(gfn_start, memslot->base_gfn);
- end = min(gfn_end, memslot->base_gfn + memslot->npages);
- if (start >= end)
- continue;
-
- flush = slot_handle_level_range(kvm,
- (const struct kvm_memory_slot *) memslot,
- kvm_zap_rmapp, PG_LEVEL_4K,
- KVM_MAX_HUGEPAGE_LEVEL, start,
- end - 1, true, flush);
- }
- }
- if (flush)
- kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
- gfn_end - gfn_start);
- }
+ flush = __kvm_zap_rmaps(kvm, gfn_start, gfn_end);
if (is_tdp_mmu_enabled(kvm)) {
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
gfn_end, flush);
- if (flush)
- kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
- gfn_end - gfn_start);
}
if (flush)
- kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);
+ kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
+ gfn_end - gfn_start);
kvm_dec_notifier_count(kvm, gfn_start, gfn_end);
@@ -5856,21 +5882,21 @@ restart:
void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *slot)
{
- bool flush = false;
-
if (kvm_memslots_have_rmaps(kvm)) {
write_lock(&kvm->mmu_lock);
- flush = slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
- if (flush)
+ /*
+ * Zap only 4k SPTEs since the legacy MMU only supports dirty
+ * logging at a 4k granularity and never creates collapsible
+ * 2m SPTEs during dirty logging.
+ */
+ if (slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true))
kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
write_unlock(&kvm->mmu_lock);
}
if (is_tdp_mmu_enabled(kvm)) {
read_lock(&kvm->mmu_lock);
- flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, flush);
- if (flush)
- kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
+ kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
read_unlock(&kvm->mmu_lock);
}
}
@@ -5897,8 +5923,11 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
if (kvm_memslots_have_rmaps(kvm)) {
write_lock(&kvm->mmu_lock);
- flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty,
- false);
+ /*
+ * Clear dirty bits only on 4k SPTEs since the legacy MMU only
+ * support dirty logging at a 4k granularity.
+ */
+ flush = slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false);
write_unlock(&kvm->mmu_lock);
}
@@ -6136,30 +6165,6 @@ out:
return ret;
}
-/*
- * Calculate mmu pages needed for kvm.
- */
-unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
-{
- unsigned long nr_mmu_pages;
- unsigned long nr_pages = 0;
- struct kvm_memslots *slots;
- struct kvm_memory_slot *memslot;
- int i;
-
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
- slots = __kvm_memslots(kvm, i);
-
- kvm_for_each_memslot(memslot, slots)
- nr_pages += memslot->npages;
- }
-
- nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
- nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
-
- return nr_mmu_pages;
-}
-
void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
kvm_mmu_unload(vcpu);
@@ -6176,18 +6181,47 @@ void kvm_mmu_module_exit(void)
mmu_audit_disable();
}
-static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp)
+/*
+ * Calculate the effective recovery period, accounting for '0' meaning "let KVM
+ * select a halving time of 1 hour". Returns true if recovery is enabled.
+ */
+static bool calc_nx_huge_pages_recovery_period(uint *period)
+{
+ /*
+ * Use READ_ONCE to get the params, this may be called outside of the
+ * param setters, e.g. by the kthread to compute its next timeout.
+ */
+ bool enabled = READ_ONCE(nx_huge_pages);
+ uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
+
+ if (!enabled || !ratio)
+ return false;
+
+ *period = READ_ONCE(nx_huge_pages_recovery_period_ms);
+ if (!*period) {
+ /* Make sure the period is not less than one second. */
+ ratio = min(ratio, 3600u);
+ *period = 60 * 60 * 1000 / ratio;
+ }
+ return true;
+}
+
+static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp)
{
- unsigned int old_val;
+ bool was_recovery_enabled, is_recovery_enabled;
+ uint old_period, new_period;
int err;
- old_val = nx_huge_pages_recovery_ratio;
+ was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period);
+
err = param_set_uint(val, kp);
if (err)
return err;
- if (READ_ONCE(nx_huge_pages) &&
- !old_val && nx_huge_pages_recovery_ratio) {
+ is_recovery_enabled = calc_nx_huge_pages_recovery_period(&new_period);
+
+ if (is_recovery_enabled &&
+ (!was_recovery_enabled || old_period > new_period)) {
struct kvm *kvm;
mutex_lock(&kvm_lock);
@@ -6250,9 +6284,13 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
static long get_nx_lpage_recovery_timeout(u64 start_time)
{
- return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio)
- ? start_time + 60 * HZ - get_jiffies_64()
- : MAX_SCHEDULE_TIMEOUT;
+ bool enabled;
+ uint period;
+
+ enabled = calc_nx_huge_pages_recovery_period(&period);
+
+ return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
+ : MAX_SCHEDULE_TIMEOUT;
}
static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index bf2bdbf333c2..da6166b5c377 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -104,7 +104,7 @@ static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
return kvm_mmu_role_as_id(sp->role);
}
-static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
+static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp)
{
/*
* When using the EPT page-modification log, the GPAs in the CPU dirty
@@ -112,19 +112,13 @@ static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
* on write protection to record dirty pages, which bypasses PML, since
* writes now result in a vmexit. Note, the check on CPU dirty logging
* being enabled is mandatory as the bits used to denote WP-only SPTEs
- * are reserved for NPT w/ PAE (32-bit KVM).
+ * are reserved for PAE paging (32-bit KVM).
*/
- return vcpu->arch.mmu == &vcpu->arch.guest_mmu &&
- kvm_x86_ops.cpu_dirty_log_size;
+ return kvm_x86_ops.cpu_dirty_log_size && sp->role.guest_mode;
}
-extern int nx_huge_pages;
-static inline bool is_nx_huge_page_enabled(void)
-{
- return READ_ONCE(nx_huge_pages);
-}
-
-int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync);
+int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
+ gfn_t gfn, bool can_unsync, bool prefetch);
void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn);
void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn);
@@ -155,19 +149,11 @@ enum {
RET_PF_SPURIOUS,
};
-/* Bits which may be returned by set_spte() */
-#define SET_SPTE_WRITE_PROTECTED_PT BIT(0)
-#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
-#define SET_SPTE_SPURIOUS BIT(2)
-
int kvm_mmu_max_mapping_level(struct kvm *kvm,
const struct kvm_memory_slot *slot, gfn_t gfn,
kvm_pfn_t pfn, int max_level);
-int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
- int max_level, kvm_pfn_t *pfnp,
- bool huge_page_disallowed, int *req_level);
-void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level,
- kvm_pfn_t *pfnp, int *goal_levelp);
+void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
+void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level);
void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index 2924a4081a19..de5e8e4e1aa7 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -35,7 +35,7 @@
" %snxe %sad root %u %s%c", \
__entry->mmu_valid_gen, \
__entry->gfn, role.level, \
- role.gpte_is_8_bytes ? 8 : 4, \
+ role.has_4_byte_gpte ? 4 : 8, \
role.quadrant, \
role.direct ? " direct" : "", \
access_str[role.access], \
@@ -252,9 +252,9 @@ TRACE_EVENT(
TRACE_EVENT(
fast_page_fault,
- TP_PROTO(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 error_code,
+ TP_PROTO(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
u64 *sptep, u64 old_spte, int ret),
- TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, ret),
+ TP_ARGS(vcpu, fault, sptep, old_spte, ret),
TP_STRUCT__entry(
__field(int, vcpu_id)
@@ -268,8 +268,8 @@ TRACE_EVENT(
TP_fast_assign(
__entry->vcpu_id = vcpu->vcpu_id;
- __entry->cr2_or_gpa = cr2_or_gpa;
- __entry->error_code = error_code;
+ __entry->cr2_or_gpa = fault->addr;
+ __entry->error_code = fault->error_code;
__entry->sptep = sptep;
__entry->old_spte = old_spte;
__entry->new_spte = *sptep;
@@ -367,8 +367,8 @@ TRACE_EVENT(
TRACE_EVENT(
kvm_mmu_spte_requested,
- TP_PROTO(gpa_t addr, int level, kvm_pfn_t pfn),
- TP_ARGS(addr, level, pfn),
+ TP_PROTO(struct kvm_page_fault *fault),
+ TP_ARGS(fault),
TP_STRUCT__entry(
__field(u64, gfn)
@@ -377,9 +377,9 @@ TRACE_EVENT(
),
TP_fast_assign(
- __entry->gfn = addr >> PAGE_SHIFT;
- __entry->pfn = pfn | (__entry->gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
- __entry->level = level;
+ __entry->gfn = fault->gfn;
+ __entry->pfn = fault->pfn | (fault->gfn & (KVM_PAGES_PER_HPAGE(fault->goal_level) - 1));
+ __entry->level = fault->goal_level;
),
TP_printk("gfn %llx pfn %llx level %d",
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index 21427e84a82e..68eb1fb548b6 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -19,6 +19,12 @@
#include "mmu.h"
#include "mmu_internal.h"
+bool kvm_page_track_write_tracking_enabled(struct kvm *kvm)
+{
+ return IS_ENABLED(CONFIG_KVM_EXTERNAL_WRITE_TRACKING) ||
+ !tdp_enabled || kvm_shadow_root_allocated(kvm);
+}
+
void kvm_page_track_free_memslot(struct kvm_memory_slot *slot)
{
int i;
@@ -29,12 +35,17 @@ void kvm_page_track_free_memslot(struct kvm_memory_slot *slot)
}
}
-int kvm_page_track_create_memslot(struct kvm_memory_slot *slot,
+int kvm_page_track_create_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
unsigned long npages)
{
- int i;
+ int i;
for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) {
+ if (i == KVM_PAGE_TRACK_WRITE &&
+ !kvm_page_track_write_tracking_enabled(kvm))
+ continue;
+
slot->arch.gfn_track[i] =
kvcalloc(npages, sizeof(*slot->arch.gfn_track[i]),
GFP_KERNEL_ACCOUNT);
@@ -57,6 +68,21 @@ static inline bool page_track_mode_is_valid(enum kvm_page_track_mode mode)
return true;
}
+int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot)
+{
+ unsigned short *gfn_track;
+
+ if (slot->arch.gfn_track[KVM_PAGE_TRACK_WRITE])
+ return 0;
+
+ gfn_track = kvcalloc(slot->npages, sizeof(*gfn_track), GFP_KERNEL_ACCOUNT);
+ if (gfn_track == NULL)
+ return -ENOMEM;
+
+ slot->arch.gfn_track[KVM_PAGE_TRACK_WRITE] = gfn_track;
+ return 0;
+}
+
static void update_gfn_track(struct kvm_memory_slot *slot, gfn_t gfn,
enum kvm_page_track_mode mode, short count)
{
@@ -92,6 +118,10 @@ void kvm_slot_page_track_add_page(struct kvm *kvm,
if (WARN_ON(!page_track_mode_is_valid(mode)))
return;
+ if (WARN_ON(mode == KVM_PAGE_TRACK_WRITE &&
+ !kvm_page_track_write_tracking_enabled(kvm)))
+ return;
+
update_gfn_track(slot, gfn, mode, 1);
/*
@@ -126,6 +156,10 @@ void kvm_slot_page_track_remove_page(struct kvm *kvm,
if (WARN_ON(!page_track_mode_is_valid(mode)))
return;
+ if (WARN_ON(mode == KVM_PAGE_TRACK_WRITE &&
+ !kvm_page_track_write_tracking_enabled(kvm)))
+ return;
+
update_gfn_track(slot, gfn, mode, -1);
/*
@@ -139,19 +173,22 @@ EXPORT_SYMBOL_GPL(kvm_slot_page_track_remove_page);
/*
* check if the corresponding access on the specified guest page is tracked.
*/
-bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn,
- enum kvm_page_track_mode mode)
+bool kvm_slot_page_track_is_active(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ gfn_t gfn, enum kvm_page_track_mode mode)
{
- struct kvm_memory_slot *slot;
int index;
if (WARN_ON(!page_track_mode_is_valid(mode)))
return false;
- slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
if (!slot)
return false;
+ if (mode == KVM_PAGE_TRACK_WRITE &&
+ !kvm_page_track_write_tracking_enabled(kvm))
+ return false;
+
index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K);
return !!READ_ONCE(slot->arch.gfn_track[mode][index]);
}
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 913d52a7923e..5b5bdac97c7b 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -403,9 +403,8 @@ retry_walk:
walker->table_gfn[walker->level - 1] = table_gfn;
walker->pte_gpa[walker->level - 1] = pte_gpa;
- real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
- nested_access,
- &walker->fault);
+ real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(table_gfn),
+ nested_access, &walker->fault);
/*
* FIXME: This can happen if emulation (for of an INS/OUTS
@@ -467,7 +466,7 @@ retry_walk:
if (PTTYPE == 32 && walker->level > PG_LEVEL_4K && is_cpuid_PSE36())
gfn += pse36_gfn_delta(pte);
- real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access, &walker->fault);
+ real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), access, &walker->fault);
if (real_gpa == UNMAPPED_GVA)
return 0;
@@ -547,20 +546,11 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
access);
}
-#if PTTYPE != PTTYPE_EPT
-static int FNAME(walk_addr_nested)(struct guest_walker *walker,
- struct kvm_vcpu *vcpu, gva_t addr,
- u32 access)
-{
- return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
- addr, access);
-}
-#endif
-
static bool
FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
u64 *spte, pt_element_t gpte, bool no_dirty_log)
{
+ struct kvm_memory_slot *slot;
unsigned pte_access;
gfn_t gfn;
kvm_pfn_t pfn;
@@ -573,30 +563,21 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
gfn = gpte_to_gfn(gpte);
pte_access = sp->role.access & FNAME(gpte_access)(gpte);
FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
- pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
+
+ slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn,
no_dirty_log && (pte_access & ACC_WRITE_MASK));
- if (is_error_pfn(pfn))
+ if (!slot)
return false;
- /*
- * we call mmu_set_spte() with host_writable = true because
- * pte_prefetch_gfn_to_pfn always gets a writable pfn.
- */
- mmu_set_spte(vcpu, spte, pte_access, false, PG_LEVEL_4K, gfn, pfn,
- true, true);
+ pfn = gfn_to_pfn_memslot_atomic(slot, gfn);
+ if (is_error_pfn(pfn))
+ return false;
+ mmu_set_spte(vcpu, slot, spte, pte_access, gfn, pfn, NULL);
kvm_release_pfn_clean(pfn);
return true;
}
-static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- u64 *spte, const void *pte)
-{
- pt_element_t gpte = *(const pt_element_t *)pte;
-
- FNAME(prefetch_gpte)(vcpu, sp, spte, gpte, false);
-}
-
static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
struct guest_walker *gw, int level)
{
@@ -663,21 +644,16 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
* If the guest tries to write a write-protected page, we need to
* emulate this operation, return 1 to indicate this case.
*/
-static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
- struct guest_walker *gw, u32 error_code,
- int max_level, kvm_pfn_t pfn, bool map_writable,
- bool prefault)
+static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
+ struct guest_walker *gw)
{
- bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
- bool write_fault = error_code & PFERR_WRITE_MASK;
- bool exec = error_code & PFERR_FETCH_MASK;
- bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
struct kvm_mmu_page *sp = NULL;
struct kvm_shadow_walk_iterator it;
unsigned int direct_access, access;
- int top_level, level, req_level, ret;
- gfn_t base_gfn = gw->gfn;
+ int top_level, ret;
+ gfn_t base_gfn = fault->gfn;
+ WARN_ON_ONCE(gw->gfn != base_gfn);
direct_access = gw->pte_access;
top_level = vcpu->arch.mmu->root_level;
@@ -695,7 +671,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
goto out_gpte_changed;
- for (shadow_walk_init(&it, vcpu, addr);
+ for (shadow_walk_init(&it, vcpu, fault->addr);
shadow_walk_okay(&it) && it.level > gw->level;
shadow_walk_next(&it)) {
gfn_t table_gfn;
@@ -707,7 +683,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
if (!is_shadow_present_pte(*it.sptep)) {
table_gfn = gw->table_gfn[it.level - 2];
access = gw->pt_access[it.level - 2];
- sp = kvm_mmu_get_page(vcpu, table_gfn, addr,
+ sp = kvm_mmu_get_page(vcpu, table_gfn, fault->addr,
it.level-1, false, access);
/*
* We must synchronize the pagetable before linking it
@@ -741,10 +717,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
link_shadow_page(vcpu, it.sptep, sp);
}
- level = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn,
- huge_page_disallowed, &req_level);
+ kvm_mmu_hugepage_adjust(vcpu, fault);
- trace_kvm_mmu_spte_requested(addr, gw->level, pfn);
+ trace_kvm_mmu_spte_requested(fault);
for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
clear_sp_write_flooding_count(it.sptep);
@@ -753,12 +728,11 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
* We cannot overwrite existing page tables with an NX
* large page, as the leaf could be executable.
*/
- if (nx_huge_page_workaround_enabled)
- disallowed_hugepage_adjust(*it.sptep, gw->gfn, it.level,
- &pfn, &level);
+ if (fault->nx_huge_page_workaround_enabled)
+ disallowed_hugepage_adjust(fault, *it.sptep, it.level);
- base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
- if (it.level == level)
+ base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+ if (it.level == fault->goal_level)
break;
validate_direct_spte(vcpu, it.sptep, direct_access);
@@ -766,16 +740,20 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
drop_large_spte(vcpu, it.sptep);
if (!is_shadow_present_pte(*it.sptep)) {
- sp = kvm_mmu_get_page(vcpu, base_gfn, addr,
+ sp = kvm_mmu_get_page(vcpu, base_gfn, fault->addr,
it.level - 1, true, direct_access);
link_shadow_page(vcpu, it.sptep, sp);
- if (huge_page_disallowed && req_level >= it.level)
+ if (fault->huge_page_disallowed &&
+ fault->req_level >= it.level)
account_huge_nx_page(vcpu->kvm, sp);
}
}
- ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
- it.level, base_gfn, pfn, prefault, map_writable);
+ if (WARN_ON_ONCE(it.level != fault->goal_level))
+ return -EFAULT;
+
+ ret = mmu_set_spte(vcpu, fault->slot, it.sptep, gw->pte_access,
+ base_gfn, fault->pfn, fault);
if (ret == RET_PF_SPURIOUS)
return ret;
@@ -841,45 +819,40 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
* Returns: 1 if we need to emulate the instruction, 0 otherwise, or
* a negative value on error.
*/
-static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
- bool prefault)
+static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
- bool write_fault = error_code & PFERR_WRITE_MASK;
- bool user_fault = error_code & PFERR_USER_MASK;
struct guest_walker walker;
int r;
- kvm_pfn_t pfn;
- hva_t hva;
unsigned long mmu_seq;
- bool map_writable, is_self_change_mapping;
- int max_level;
+ bool is_self_change_mapping;
- pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
+ pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code);
+ WARN_ON_ONCE(fault->is_tdp);
/*
+ * Look up the guest pte for the faulting address.
* If PFEC.RSVD is set, this is a shadow page fault.
* The bit needs to be cleared before walking guest page tables.
*/
- error_code &= ~PFERR_RSVD_MASK;
-
- /*
- * Look up the guest pte for the faulting address.
- */
- r = FNAME(walk_addr)(&walker, vcpu, addr, error_code);
+ r = FNAME(walk_addr)(&walker, vcpu, fault->addr,
+ fault->error_code & ~PFERR_RSVD_MASK);
/*
* The page is not mapped by the guest. Let the guest handle it.
*/
if (!r) {
pgprintk("%s: guest page fault\n", __func__);
- if (!prefault)
+ if (!fault->prefetch)
kvm_inject_emulated_page_fault(vcpu, &walker.fault);
return RET_PF_RETRY;
}
- if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) {
- shadow_page_table_clear_flood(vcpu, addr);
+ fault->gfn = walker.gfn;
+ fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn);
+
+ if (page_fault_handle_page_track(vcpu, fault)) {
+ shadow_page_table_clear_flood(vcpu, fault->addr);
return RET_PF_EMULATE;
}
@@ -890,29 +863,28 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
vcpu->arch.write_fault_to_shadow_pgtable = false;
is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
- &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
+ &walker, fault->user, &vcpu->arch.write_fault_to_shadow_pgtable);
if (is_self_change_mapping)
- max_level = PG_LEVEL_4K;
+ fault->max_level = PG_LEVEL_4K;
else
- max_level = walker.level;
+ fault->max_level = walker.level;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- if (kvm_faultin_pfn(vcpu, prefault, walker.gfn, addr, &pfn, &hva,
- write_fault, &map_writable, &r))
+ if (kvm_faultin_pfn(vcpu, fault, &r))
return r;
- if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
+ if (handle_abnormal_pfn(vcpu, fault, walker.pte_access, &r))
return r;
/*
* Do not change pte_access if the pfn is a mmio page, otherwise
* we will cache the incorrect access into mmio spte.
*/
- if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) &&
- !is_cr0_wp(vcpu->arch.mmu) && !user_fault && !is_noslot_pfn(pfn)) {
+ if (fault->write && !(walker.pte_access & ACC_WRITE_MASK) &&
+ !is_cr0_wp(vcpu->arch.mmu) && !fault->user && fault->slot) {
walker.pte_access |= ACC_WRITE_MASK;
walker.pte_access &= ~ACC_USER_MASK;
@@ -928,20 +900,20 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
r = RET_PF_RETRY;
write_lock(&vcpu->kvm->mmu_lock);
- if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva))
+
+ if (is_page_fault_stale(vcpu, fault, mmu_seq))
goto out_unlock;
kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
r = make_mmu_pages_available(vcpu);
if (r)
goto out_unlock;
- r = FNAME(fetch)(vcpu, addr, &walker, error_code, max_level, pfn,
- map_writable, prefault);
+ r = FNAME(fetch)(vcpu, fault, &walker);
kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
out_unlock:
write_unlock(&vcpu->kvm->mmu_lock);
- kvm_release_pfn_clean(pfn);
+ kvm_release_pfn_clean(fault->pfn);
return r;
}
@@ -1007,73 +979,57 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
sizeof(pt_element_t)))
break;
- FNAME(update_pte)(vcpu, sp, sptep, &gpte);
+ FNAME(prefetch_gpte)(vcpu, sp, sptep, gpte, false);
}
- if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
+ if (!sp->unsync_children)
break;
}
write_unlock(&vcpu->kvm->mmu_lock);
}
/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
-static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t addr, u32 access,
+static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ gpa_t addr, u32 access,
struct x86_exception *exception)
{
struct guest_walker walker;
gpa_t gpa = UNMAPPED_GVA;
int r;
- r = FNAME(walk_addr)(&walker, vcpu, addr, access);
-
- if (r) {
- gpa = gfn_to_gpa(walker.gfn);
- gpa |= addr & ~PAGE_MASK;
- } else if (exception)
- *exception = walker.fault;
-
- return gpa;
-}
-
-#if PTTYPE != PTTYPE_EPT
-/* Note, gva_to_gpa_nested() is only used to translate L2 GVAs. */
-static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr,
- u32 access,
- struct x86_exception *exception)
-{
- struct guest_walker walker;
- gpa_t gpa = UNMAPPED_GVA;
- int r;
-
#ifndef CONFIG_X86_64
/* A 64-bit GVA should be impossible on 32-bit KVM. */
- WARN_ON_ONCE(vaddr >> 32);
+ WARN_ON_ONCE((addr >> 32) && mmu == vcpu->arch.walk_mmu);
#endif
- r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access);
+ r = FNAME(walk_addr_generic)(&walker, vcpu, mmu, addr, access);
if (r) {
gpa = gfn_to_gpa(walker.gfn);
- gpa |= vaddr & ~PAGE_MASK;
+ gpa |= addr & ~PAGE_MASK;
} else if (exception)
*exception = walker.fault;
return gpa;
}
-#endif
/*
* Using the cached information from sp->gfns is safe because:
* - The spte has a reference to the struct page, so the pfn for a given gfn
* can't change unless all sptes pointing to it are nuked first.
+ *
+ * Returns
+ * < 0: the sp should be zapped
+ * 0: the sp is synced and no tlb flushing is required
+ * > 0: the sp is synced and tlb flushing is required
*/
static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
union kvm_mmu_page_role mmu_role = vcpu->arch.mmu->mmu_role.base;
- int i, nr_present = 0;
+ int i;
bool host_writable;
gpa_t first_pte_gpa;
- int set_spte_ret = 0;
+ bool flush = false;
/*
* Ignore various flags when verifying that it's safe to sync a shadow
@@ -1098,11 +1054,13 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
*/
if (WARN_ON_ONCE(sp->role.direct ||
(sp->role.word ^ mmu_role.word) & ~sync_role_ign.word))
- return 0;
+ return -1;
first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
+ u64 *sptep, spte;
+ struct kvm_memory_slot *slot;
unsigned pte_access;
pt_element_t gpte;
gpa_t pte_gpa;
@@ -1115,10 +1073,10 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
sizeof(pt_element_t)))
- return 0;
+ return -1;
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
- set_spte_ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
+ flush = true;
continue;
}
@@ -1127,30 +1085,27 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
pte_access &= FNAME(gpte_access)(gpte);
FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
- if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access,
- &nr_present))
+ if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access))
continue;
if (gfn != sp->gfns[i]) {
drop_spte(vcpu->kvm, &sp->spt[i]);
- set_spte_ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
+ flush = true;
continue;
}
- nr_present++;
-
- host_writable = sp->spt[i] & shadow_host_writable_mask;
+ sptep = &sp->spt[i];
+ spte = *sptep;
+ host_writable = spte & shadow_host_writable_mask;
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ make_spte(vcpu, sp, slot, pte_access, gfn,
+ spte_to_pfn(spte), spte, true, false,
+ host_writable, &spte);
- set_spte_ret |= set_spte(vcpu, &sp->spt[i],
- pte_access, PG_LEVEL_4K,
- gfn, spte_to_pfn(sp->spt[i]),
- true, false, host_writable);
+ flush |= mmu_spte_update(sptep, spte);
}
- if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH)
- kvm_flush_remote_tlbs(vcpu->kvm);
-
- return nr_present;
+ return flush;
}
#undef pt_element_t
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 3e97cdb13eb7..351b04ad62a1 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -16,6 +16,7 @@
#include "spte.h"
#include <asm/e820/api.h>
+#include <asm/memtype.h>
#include <asm/vmx.h>
static bool __read_mostly enable_mmio_caching = true;
@@ -89,17 +90,19 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
E820_TYPE_RAM);
}
-int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
- gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative,
- bool can_unsync, bool host_writable, bool ad_disabled,
- u64 *new_spte)
+bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ const struct kvm_memory_slot *slot,
+ unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
+ u64 old_spte, bool prefetch, bool can_unsync,
+ bool host_writable, u64 *new_spte)
{
+ int level = sp->role.level;
u64 spte = SPTE_MMU_PRESENT_MASK;
- int ret = 0;
+ bool wrprot = false;
- if (ad_disabled)
+ if (sp->role.ad_disabled)
spte |= SPTE_TDP_AD_DISABLED_MASK;
- else if (kvm_vcpu_ad_need_write_protect(vcpu))
+ else if (kvm_mmu_page_ad_need_write_protect(sp))
spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK;
/*
@@ -109,7 +112,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
* read access. See FNAME(gpte_access) in paging_tmpl.h.
*/
spte |= shadow_present_mask;
- if (!speculative)
+ if (!prefetch)
spte |= spte_shadow_accessed_mask(spte);
if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) &&
@@ -150,7 +153,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
* is responsibility of kvm_mmu_get_page / kvm_mmu_sync_roots.
* Same reasoning can be applied to dirty page accounting.
*/
- if (!can_unsync && is_writable_pte(old_spte))
+ if (is_writable_pte(old_spte))
goto out;
/*
@@ -159,10 +162,10 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
* e.g. it's write-tracked (upper-level SPs) or has one or more
* shadow pages and unsync'ing pages is not allowed.
*/
- if (mmu_try_to_unsync_pages(vcpu, gfn, can_unsync)) {
+ if (mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, can_unsync, prefetch)) {
pgprintk("%s: found shadow page for %llx, marking ro\n",
__func__, gfn);
- ret |= SET_SPTE_WRITE_PROTECTED_PT;
+ wrprot = true;
pte_access &= ~ACC_WRITE_MASK;
spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
}
@@ -171,16 +174,22 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
if (pte_access & ACC_WRITE_MASK)
spte |= spte_shadow_dirty_mask(spte);
- if (speculative)
+out:
+ if (prefetch)
spte = mark_spte_for_access_track(spte);
-out:
WARN_ONCE(is_rsvd_spte(&vcpu->arch.mmu->shadow_zero_check, spte, level),
"spte = 0x%llx, level = %d, rsvd bits = 0x%llx", spte, level,
get_rsvd_bits(&vcpu->arch.mmu->shadow_zero_check, spte, level));
+ if ((spte & PT_WRITABLE_MASK) && kvm_slot_dirty_track_enabled(slot)) {
+ /* Enforced by kvm_mmu_hugepage_adjust. */
+ WARN_ON(level > PG_LEVEL_4K);
+ mark_page_dirty_in_slot(vcpu->kvm, slot, gfn);
+ }
+
*new_spte = spte;
- return ret;
+ return wrprot;
}
u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index eb7b227fc6cf..a4af2a42695c 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -310,12 +310,7 @@ static inline bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check,
static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
u64 spte, int level)
{
- /*
- * Use a bitwise-OR instead of a logical-OR to aggregate the reserved
- * bits and EPT's invalid memtype/XWR checks to avoid an extra Jcc
- * (this is extremely unlikely to be short-circuited as true).
- */
- return __is_bad_mt_xwr(rsvd_check, spte) |
+ return __is_bad_mt_xwr(rsvd_check, spte) ||
__is_rsvd_bits_set(rsvd_check, spte, level);
}
@@ -334,15 +329,11 @@ static inline u64 get_mmio_spte_generation(u64 spte)
return gen;
}
-/* Bits which may be returned by set_spte() */
-#define SET_SPTE_WRITE_PROTECTED_PT BIT(0)
-#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
-#define SET_SPTE_SPURIOUS BIT(2)
-
-int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
- gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative,
- bool can_unsync, bool host_writable, bool ad_disabled,
- u64 *new_spte);
+bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ const struct kvm_memory_slot *slot,
+ unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
+ u64 old_spte, bool prefetch, bool can_unsync,
+ bool host_writable, u64 *new_spte);
u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled);
u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access);
u64 mark_spte_for_access_track(u64 spte);
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index b3ed302c1a35..caa96c270b95 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -26,6 +26,7 @@ static gfn_t round_gfn_for_level(gfn_t gfn, int level)
*/
void tdp_iter_restart(struct tdp_iter *iter)
{
+ iter->yielded = false;
iter->yielded_gfn = iter->next_last_level_gfn;
iter->level = iter->root_level;
@@ -160,6 +161,11 @@ static bool try_step_up(struct tdp_iter *iter)
*/
void tdp_iter_next(struct tdp_iter *iter)
{
+ if (iter->yielded) {
+ tdp_iter_restart(iter);
+ return;
+ }
+
if (try_step_down(iter))
return;
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index b1748b988d3a..e19cabbcb65c 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -45,6 +45,12 @@ struct tdp_iter {
* iterator walks off the end of the paging structure.
*/
bool valid;
+ /*
+ * True if KVM dropped mmu_lock and yielded in the middle of a walk, in
+ * which case tdp_iter_next() needs to restart the walk at the root
+ * level instead of advancing to the next entry.
+ */
+ bool yielded;
};
/*
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 64ccfc1fa553..7b1bc816b7c3 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -165,8 +165,9 @@ static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
role = vcpu->arch.mmu->mmu_role.base;
role.level = level;
role.direct = true;
- role.gpte_is_8_bytes = true;
+ role.has_4_byte_gpte = false;
role.access = ACC_ALL;
+ role.ad_disabled = !shadow_accessed_mask;
return role;
}
@@ -316,9 +317,6 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
int level = sp->role.level;
gfn_t base_gfn = sp->gfn;
- u64 old_child_spte;
- u64 *sptep;
- gfn_t gfn;
int i;
trace_kvm_mmu_prepare_zap_page(sp);
@@ -326,8 +324,9 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
tdp_mmu_unlink_page(kvm, sp, shared);
for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
- sptep = rcu_dereference(pt) + i;
- gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
+ u64 *sptep = rcu_dereference(pt) + i;
+ gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
+ u64 old_child_spte;
if (shared) {
/*
@@ -373,7 +372,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
shared);
}
- kvm_flush_remote_tlbs_with_address(kvm, gfn,
+ kvm_flush_remote_tlbs_with_address(kvm, base_gfn,
KVM_PAGES_PER_HPAGE(level + 1));
call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
@@ -489,8 +488,8 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
}
/*
- * tdp_mmu_set_spte_atomic_no_dirty_log - Set a TDP MMU SPTE atomically
- * and handle the associated bookkeeping, but do not mark the page dirty
+ * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
+ * and handle the associated bookkeeping. Do not mark the page dirty
* in KVM's dirty bitmaps.
*
* @kvm: kvm instance
@@ -499,10 +498,12 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
* Returns: true if the SPTE was set, false if it was not. If false is returned,
* this function will have no side-effects.
*/
-static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm,
- struct tdp_iter *iter,
- u64 new_spte)
+static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter,
+ u64 new_spte)
{
+ WARN_ON_ONCE(iter->yielded);
+
lockdep_assert_held_read(&kvm->mmu_lock);
/*
@@ -527,43 +528,6 @@ static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm,
return true;
}
-/*
- * tdp_mmu_map_set_spte_atomic - Set a leaf TDP MMU SPTE atomically to resolve a
- * TDP page fault.
- *
- * @vcpu: The vcpu instance that took the TDP page fault.
- * @iter: a tdp_iter instance currently on the SPTE that should be set
- * @new_spte: The value the SPTE should be set to
- *
- * Returns: true if the SPTE was set, false if it was not. If false is returned,
- * this function will have no side-effects.
- */
-static inline bool tdp_mmu_map_set_spte_atomic(struct kvm_vcpu *vcpu,
- struct tdp_iter *iter,
- u64 new_spte)
-{
- struct kvm *kvm = vcpu->kvm;
-
- if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte))
- return false;
-
- /*
- * Use kvm_vcpu_gfn_to_memslot() instead of going through
- * handle_changed_spte_dirty_log() to leverage vcpu->last_used_slot.
- */
- if (is_writable_pte(new_spte)) {
- struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, iter->gfn);
-
- if (slot && kvm_slot_dirty_track_enabled(slot)) {
- /* Enforced by kvm_mmu_hugepage_adjust. */
- WARN_ON_ONCE(iter->level > PG_LEVEL_4K);
- mark_page_dirty_in_slot(kvm, slot, iter->gfn);
- }
- }
-
- return true;
-}
-
static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
struct tdp_iter *iter)
{
@@ -573,7 +537,7 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
* immediately installing a present entry in its place
* before the TLBs are flushed.
*/
- if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, REMOVED_SPTE))
+ if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
return false;
kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
@@ -613,6 +577,8 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
u64 new_spte, bool record_acc_track,
bool record_dirty_log)
{
+ WARN_ON_ONCE(iter->yielded);
+
lockdep_assert_held_write(&kvm->mmu_lock);
/*
@@ -678,18 +644,19 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
* If this function should yield and flush is set, it will perform a remote
* TLB flush before yielding.
*
- * If this function yields, it will also reset the tdp_iter's walk over the
- * paging structure and the calling function should skip to the next
- * iteration to allow the iterator to continue its traversal from the
- * paging structure root.
+ * If this function yields, iter->yielded is set and the caller must skip to
+ * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
+ * over the paging structures to allow the iterator to continue its traversal
+ * from the paging structure root.
*
- * Return true if this function yielded and the iterator's traversal was reset.
- * Return false if a yield was not needed.
+ * Returns true if this function yielded.
*/
-static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
- struct tdp_iter *iter, bool flush,
- bool shared)
+static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
+ struct tdp_iter *iter,
+ bool flush, bool shared)
{
+ WARN_ON(iter->yielded);
+
/* Ensure forward progress has been made before yielding. */
if (iter->next_last_level_gfn == iter->yielded_gfn)
return false;
@@ -709,12 +676,10 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
WARN_ON(iter->gfn > iter->next_last_level_gfn);
- tdp_iter_restart(iter);
-
- return true;
+ iter->yielded = true;
}
- return false;
+ return iter->yielded;
}
/*
@@ -929,26 +894,26 @@ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
* Installs a last-level SPTE to handle a TDP page fault.
* (NPT/EPT violation/misconfiguration)
*/
-static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
- int map_writable,
- struct tdp_iter *iter,
- kvm_pfn_t pfn, bool prefault)
+static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault,
+ struct tdp_iter *iter)
{
+ struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
u64 new_spte;
int ret = RET_PF_FIXED;
- int make_spte_ret = 0;
+ bool wrprot = false;
- if (unlikely(is_noslot_pfn(pfn)))
+ WARN_ON(sp->role.level != fault->goal_level);
+ if (unlikely(!fault->slot))
new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
else
- make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
- pfn, iter->old_spte, prefault, true,
- map_writable, !shadow_accessed_mask,
- &new_spte);
+ wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
+ fault->pfn, iter->old_spte, fault->prefetch, true,
+ fault->map_writable, &new_spte);
if (new_spte == iter->old_spte)
ret = RET_PF_SPURIOUS;
- else if (!tdp_mmu_map_set_spte_atomic(vcpu, iter, new_spte))
+ else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
return RET_PF_RETRY;
/*
@@ -956,10 +921,9 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
* protected, emulation is needed. If the emulation was skipped,
* the vCPU would have the same fault again.
*/
- if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
- if (write)
+ if (wrprot) {
+ if (fault->write)
ret = RET_PF_EMULATE;
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
@@ -986,37 +950,26 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
* Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
* page tables and SPTEs to translate the faulting guest physical address.
*/
-int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
- int map_writable, int max_level, kvm_pfn_t pfn,
- bool prefault)
+int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
- bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
- bool write = error_code & PFERR_WRITE_MASK;
- bool exec = error_code & PFERR_FETCH_MASK;
- bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
struct kvm_mmu *mmu = vcpu->arch.mmu;
struct tdp_iter iter;
struct kvm_mmu_page *sp;
u64 *child_pt;
u64 new_spte;
int ret;
- gfn_t gfn = gpa >> PAGE_SHIFT;
- int level;
- int req_level;
- level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
- huge_page_disallowed, &req_level);
+ kvm_mmu_hugepage_adjust(vcpu, fault);
- trace_kvm_mmu_spte_requested(gpa, level, pfn);
+ trace_kvm_mmu_spte_requested(fault);
rcu_read_lock();
- tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
- if (nx_huge_page_workaround_enabled)
- disallowed_hugepage_adjust(iter.old_spte, gfn,
- iter.level, &pfn, &level);
+ tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
+ if (fault->nx_huge_page_workaround_enabled)
+ disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
- if (iter.level == level)
+ if (iter.level == fault->goal_level)
break;
/*
@@ -1052,10 +1005,10 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
new_spte = make_nonleaf_spte(child_pt,
!shadow_accessed_mask);
- if (tdp_mmu_set_spte_atomic_no_dirty_log(vcpu->kvm, &iter, new_spte)) {
+ if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, new_spte)) {
tdp_mmu_link_page(vcpu->kvm, sp,
- huge_page_disallowed &&
- req_level >= iter.level);
+ fault->huge_page_disallowed &&
+ fault->req_level >= iter.level);
trace_kvm_mmu_get_page(sp, true);
} else {
@@ -1065,13 +1018,12 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
}
}
- if (iter.level != level) {
+ if (iter.level != fault->goal_level) {
rcu_read_unlock();
return RET_PF_RETRY;
}
- ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
- pfn, prefault);
+ ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
rcu_read_unlock();
return ret;
@@ -1082,9 +1034,9 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
{
struct kvm_mmu_page *root;
- for_each_tdp_mmu_root(kvm, root, range->slot->as_id)
- flush |= zap_gfn_range(kvm, root, range->start, range->end,
- range->may_block, flush, false);
+ for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false)
+ flush = zap_gfn_range(kvm, root, range->start, range->end,
+ range->may_block, flush, false);
return flush;
}
@@ -1241,8 +1193,7 @@ retry:
new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
- if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
- new_spte)) {
+ if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) {
/*
* The iter must explicitly re-read the SPTE because
* the atomic cmpxchg failed.
@@ -1310,8 +1261,7 @@ retry:
continue;
}
- if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
- new_spte)) {
+ if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) {
/*
* The iter must explicitly re-read the SPTE because
* the atomic cmpxchg failed.
@@ -1415,10 +1365,9 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
* Clear leaf entries which could be replaced by large mappings, for
* GFNs within the slot.
*/
-static bool zap_collapsible_spte_range(struct kvm *kvm,
+static void zap_collapsible_spte_range(struct kvm *kvm,
struct kvm_mmu_page *root,
- const struct kvm_memory_slot *slot,
- bool flush)
+ const struct kvm_memory_slot *slot)
{
gfn_t start = slot->base_gfn;
gfn_t end = start + slot->npages;
@@ -1429,10 +1378,8 @@ static bool zap_collapsible_spte_range(struct kvm *kvm,
tdp_root_for_each_pte(iter, root, start, end) {
retry:
- if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
- flush = false;
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
continue;
- }
if (!is_shadow_present_pte(iter.old_spte) ||
!is_last_spte(iter.old_spte, iter.level))
@@ -1444,6 +1391,7 @@ retry:
pfn, PG_LEVEL_NUM))
continue;
+ /* Note, a successful atomic zap also does a remote TLB flush. */
if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
/*
* The iter must explicitly re-read the SPTE because
@@ -1452,30 +1400,24 @@ retry:
iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
goto retry;
}
- flush = true;
}
rcu_read_unlock();
-
- return flush;
}
/*
* Clear non-leaf entries (and free associated page tables) which could
* be replaced by large mappings, for GFNs within the slot.
*/
-bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
- const struct kvm_memory_slot *slot,
- bool flush)
+void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
+ const struct kvm_memory_slot *slot)
{
struct kvm_mmu_page *root;
lockdep_assert_held_read(&kvm->mmu_lock);
for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
- flush = zap_collapsible_spte_range(kvm, root, slot, flush);
-
- return flush;
+ zap_collapsible_spte_range(kvm, root, slot);
}
/*
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 358f447d4012..3899004a5d91 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -48,9 +48,7 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm);
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm);
-int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
- int map_writable, int max_level, kvm_pfn_t pfn,
- bool prefault);
+int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
bool flush);
@@ -66,9 +64,8 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn, unsigned long mask,
bool wrprot);
-bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
- const struct kvm_memory_slot *slot,
- bool flush);
+void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
+ const struct kvm_memory_slot *slot);
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn,
@@ -92,7 +89,6 @@ u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
#ifdef CONFIG_X86_64
bool kvm_mmu_init_tdp_mmu(struct kvm *kvm);
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
-static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; }
static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; }
static inline bool is_tdp_mmu(struct kvm_mmu *mmu)
@@ -114,7 +110,6 @@ static inline bool is_tdp_mmu(struct kvm_mmu *mmu)
#else
static inline bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return false; }
static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {}
-static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; }
static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; }
static inline bool is_tdp_mmu(struct kvm_mmu *mmu) { return false; }
#endif
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 0772bad9165c..261b39cbef6e 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -55,43 +55,41 @@ static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
kvm_pmu_deliver_pmi(vcpu);
}
-static void kvm_perf_overflow(struct perf_event *perf_event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
+static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
{
- struct kvm_pmc *pmc = perf_event->overflow_handler_context;
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
- if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) {
- __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
- kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
- }
+ /* Ignore counters that have been reprogrammed already. */
+ if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi))
+ return;
+
+ __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
+ kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
+
+ if (!pmc->intr)
+ return;
+
+ /*
+ * Inject PMI. If vcpu was in a guest mode during NMI PMI
+ * can be ejected on a guest mode re-entry. Otherwise we can't
+ * be sure that vcpu wasn't executing hlt instruction at the
+ * time of vmexit and is not going to re-enter guest mode until
+ * woken up. So we should wake it, but this is impossible from
+ * NMI context. Do it from irq work instead.
+ */
+ if (in_pmi && !kvm_handling_nmi_from_guest(pmc->vcpu))
+ irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
+ else
+ kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
}
-static void kvm_perf_overflow_intr(struct perf_event *perf_event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
+static void kvm_perf_overflow(struct perf_event *perf_event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
{
struct kvm_pmc *pmc = perf_event->overflow_handler_context;
- struct kvm_pmu *pmu = pmc_to_pmu(pmc);
-
- if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) {
- __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
- kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
- /*
- * Inject PMI. If vcpu was in a guest mode during NMI PMI
- * can be ejected on a guest mode re-entry. Otherwise we can't
- * be sure that vcpu wasn't executing hlt instruction at the
- * time of vmexit and is not going to re-enter guest mode until
- * woken up. So we should wake it, but this is impossible from
- * NMI context. Do it from irq work instead.
- */
- if (!kvm_is_in_guest())
- irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
- else
- kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
- }
+ __kvm_perf_overflow(pmc, true);
}
static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
@@ -126,7 +124,6 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
}
event = perf_event_create_kernel_counter(&attr, -1, current,
- intr ? kvm_perf_overflow_intr :
kvm_perf_overflow, pmc);
if (IS_ERR(event)) {
pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
@@ -138,6 +135,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
pmc_to_pmu(pmc)->event_count++;
clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
pmc->is_paused = false;
+ pmc->intr = intr;
}
static void pmc_pause_counter(struct kvm_pmc *pmc)
@@ -174,7 +172,6 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
{
unsigned config, type = PERF_TYPE_RAW;
- u8 event_select, unit_mask;
struct kvm *kvm = pmc->vcpu->kvm;
struct kvm_pmu_event_filter *filter;
int i;
@@ -206,17 +203,12 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
if (!allow_event)
return;
- event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
- unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
-
if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
ARCH_PERFMON_EVENTSEL_INV |
ARCH_PERFMON_EVENTSEL_CMASK |
HSW_IN_TX |
HSW_IN_TX_CHECKPOINTED))) {
- config = kvm_x86_ops.pmu_ops->find_arch_event(pmc_to_pmu(pmc),
- event_select,
- unit_mask);
+ config = kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc);
if (config != PERF_COUNT_HW_MAX)
type = PERF_TYPE_HARDWARE;
}
@@ -268,7 +260,7 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
pmc->current_config = (u64)ctrl;
pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE,
- kvm_x86_ops.pmu_ops->find_fixed_event(idx),
+ kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc),
!(en_field & 0x2), /* exclude user */
!(en_field & 0x1), /* exclude kernel */
pmi, false, false);
@@ -319,7 +311,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
}
/* check if idx is a valid index to access PMU */
-int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
+bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
{
return kvm_x86_ops.pmu_ops->is_valid_rdpmc_ecx(vcpu, idx);
}
@@ -490,6 +482,66 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
kvm_pmu_reset(vcpu);
}
+static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+ u64 prev_count;
+
+ prev_count = pmc->counter;
+ pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc);
+
+ reprogram_counter(pmu, pmc->idx);
+ if (pmc->counter < prev_count)
+ __kvm_perf_overflow(pmc, false);
+}
+
+static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
+ unsigned int perf_hw_id)
+{
+ u64 old_eventsel = pmc->eventsel;
+ unsigned int config;
+
+ pmc->eventsel &= (ARCH_PERFMON_EVENTSEL_EVENT | ARCH_PERFMON_EVENTSEL_UMASK);
+ config = kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc);
+ pmc->eventsel = old_eventsel;
+ return config == perf_hw_id;
+}
+
+static inline bool cpl_is_matched(struct kvm_pmc *pmc)
+{
+ bool select_os, select_user;
+ u64 config = pmc->current_config;
+
+ if (pmc_is_gp(pmc)) {
+ select_os = config & ARCH_PERFMON_EVENTSEL_OS;
+ select_user = config & ARCH_PERFMON_EVENTSEL_USR;
+ } else {
+ select_os = config & 0x1;
+ select_user = config & 0x2;
+ }
+
+ return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? select_os : select_user;
+}
+
+void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ int i;
+
+ for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) {
+ pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, i);
+
+ if (!pmc || !pmc_is_enabled(pmc) || !pmc_speculative_in_use(pmc))
+ continue;
+
+ /* Ignore checks for edge detect, pin control, invert and CMASK bits */
+ if (eventsel_match_perf_hw_id(pmc, perf_hw_id) && cpl_is_matched(pmc))
+ kvm_pmu_incr_counter(pmc);
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event);
+
int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
{
struct kvm_pmu_event_filter tmp, *filter;
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 0e4f2b1fa9fb..7a7b8d5b775e 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -24,15 +24,13 @@ struct kvm_event_hw_type_mapping {
};
struct kvm_pmu_ops {
- unsigned (*find_arch_event)(struct kvm_pmu *pmu, u8 event_select,
- u8 unit_mask);
- unsigned (*find_fixed_event)(int idx);
+ unsigned int (*pmc_perf_hw_id)(struct kvm_pmc *pmc);
bool (*pmc_is_enabled)(struct kvm_pmc *pmc);
struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx);
struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu,
unsigned int idx, u64 *mask);
struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, u32 msr);
- int (*is_valid_rdpmc_ecx)(struct kvm_vcpu *vcpu, unsigned int idx);
+ bool (*is_valid_rdpmc_ecx)(struct kvm_vcpu *vcpu, unsigned int idx);
bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr);
int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
@@ -149,7 +147,7 @@ void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx);
void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu);
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu);
int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
-int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx);
+bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx);
bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr);
int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
@@ -159,6 +157,7 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu);
void kvm_pmu_cleanup(struct kvm_vcpu *vcpu);
void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp);
+void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id);
bool is_vmware_backdoor_pmc(u32 pmc_idx);
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 8052d92069e0..0e5b49294086 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -293,7 +293,7 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
u32 icrl, u32 icrh)
{
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
kvm_for_each_vcpu(i, vcpu, kvm) {
bool m = kvm_apic_match_dest(vcpu, source,
@@ -675,10 +675,18 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
smp_mb__after_atomic();
if (avic_vcpu_is_running(vcpu)) {
- int cpuid = vcpu->cpu;
+ int cpu = READ_ONCE(vcpu->cpu);
- if (cpuid != get_cpu())
- wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpuid));
+ /*
+ * Note, the vCPU could get migrated to a different pCPU at any
+ * point, which could result in signalling the wrong/previous
+ * pCPU. But if that happens the vCPU is guaranteed to do a
+ * VMRUN (after being migrated) and thus will process pending
+ * interrupts, i.e. a doorbell is not needed (and the spurious
+ * one is harmless).
+ */
+ if (cpu != get_cpu())
+ wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
put_cpu();
} else
kvm_vcpu_wake_up(vcpu);
@@ -900,11 +908,13 @@ out:
bool svm_check_apicv_inhibit_reasons(ulong bit)
{
ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
+ BIT(APICV_INHIBIT_REASON_ABSENT) |
BIT(APICV_INHIBIT_REASON_HYPERV) |
BIT(APICV_INHIBIT_REASON_NESTED) |
BIT(APICV_INHIBIT_REASON_IRQWIN) |
BIT(APICV_INHIBIT_REASON_PIT_REINJ) |
- BIT(APICV_INHIBIT_REASON_X2APIC);
+ BIT(APICV_INHIBIT_REASON_X2APIC) |
+ BIT(APICV_INHIBIT_REASON_BLOCKIRQ);
return supported & BIT(bit);
}
@@ -988,16 +998,18 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu)
static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ int cpu = get_cpu();
+ WARN_ON(cpu != vcpu->cpu);
svm->avic_is_running = is_run;
- if (!kvm_vcpu_apicv_active(vcpu))
- return;
-
- if (is_run)
- avic_vcpu_load(vcpu, vcpu->cpu);
- else
- avic_vcpu_put(vcpu);
+ if (kvm_vcpu_apicv_active(vcpu)) {
+ if (is_run)
+ avic_vcpu_load(vcpu, cpu);
+ else
+ avic_vcpu_put(vcpu);
+ }
+ put_cpu();
}
void svm_vcpu_blocking(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 510b833cbd39..cf206855ebf0 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -58,8 +58,9 @@ static void svm_inject_page_fault_nested(struct kvm_vcpu *vcpu, struct x86_excep
struct vcpu_svm *svm = to_svm(vcpu);
WARN_ON(!is_guest_mode(vcpu));
- if (vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) &&
- !svm->nested.nested_run_pending) {
+ if (vmcb12_is_intercept(&svm->nested.ctl,
+ INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) &&
+ !svm->nested.nested_run_pending) {
svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR;
svm->vmcb->control.exit_code_hi = 0;
svm->vmcb->control.exit_info_1 = fault->error_code;
@@ -121,7 +122,8 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
void recalc_intercepts(struct vcpu_svm *svm)
{
- struct vmcb_control_area *c, *h, *g;
+ struct vmcb_control_area *c, *h;
+ struct vmcb_ctrl_area_cached *g;
unsigned int i;
vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
@@ -163,37 +165,6 @@ void recalc_intercepts(struct vcpu_svm *svm)
vmcb_set_intercept(c, INTERCEPT_VMSAVE);
}
-static void copy_vmcb_control_area(struct vmcb_control_area *dst,
- struct vmcb_control_area *from)
-{
- unsigned int i;
-
- for (i = 0; i < MAX_INTERCEPT; i++)
- dst->intercepts[i] = from->intercepts[i];
-
- dst->iopm_base_pa = from->iopm_base_pa;
- dst->msrpm_base_pa = from->msrpm_base_pa;
- dst->tsc_offset = from->tsc_offset;
- /* asid not copied, it is handled manually for svm->vmcb. */
- dst->tlb_ctl = from->tlb_ctl;
- dst->int_ctl = from->int_ctl;
- dst->int_vector = from->int_vector;
- dst->int_state = from->int_state;
- dst->exit_code = from->exit_code;
- dst->exit_code_hi = from->exit_code_hi;
- dst->exit_info_1 = from->exit_info_1;
- dst->exit_info_2 = from->exit_info_2;
- dst->exit_int_info = from->exit_int_info;
- dst->exit_int_info_err = from->exit_int_info_err;
- dst->nested_ctl = from->nested_ctl;
- dst->event_inj = from->event_inj;
- dst->event_inj_err = from->event_inj_err;
- dst->nested_cr3 = from->nested_cr3;
- dst->virt_ext = from->virt_ext;
- dst->pause_filter_count = from->pause_filter_count;
- dst->pause_filter_thresh = from->pause_filter_thresh;
-}
-
static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
{
/*
@@ -203,7 +174,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
*/
int i;
- if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
+ if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
return true;
for (i = 0; i < MSRPM_OFFSETS; i++) {
@@ -238,10 +209,22 @@ static bool nested_svm_check_bitmap_pa(struct kvm_vcpu *vcpu, u64 pa, u32 size)
kvm_vcpu_is_legal_gpa(vcpu, addr + size - 1);
}
-static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
- struct vmcb_control_area *control)
+static bool nested_svm_check_tlb_ctl(struct kvm_vcpu *vcpu, u8 tlb_ctl)
+{
+ /* Nested FLUSHBYASID is not supported yet. */
+ switch(tlb_ctl) {
+ case TLB_CONTROL_DO_NOTHING:
+ case TLB_CONTROL_FLUSH_ALL_ASID:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
+ struct vmcb_ctrl_area_cached *control)
{
- if (CC(!vmcb_is_intercept(control, INTERCEPT_VMRUN)))
+ if (CC(!vmcb12_is_intercept(control, INTERCEPT_VMRUN)))
return false;
if (CC(control->asid == 0))
@@ -257,12 +240,26 @@ static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
IOPM_SIZE)))
return false;
+ if (CC(!nested_svm_check_tlb_ctl(vcpu, control->tlb_ctl)))
+ return false;
+
return true;
}
-static bool nested_vmcb_check_cr3_cr4(struct kvm_vcpu *vcpu,
- struct vmcb_save_area *save)
+/* Common checks that apply to both L1 and L2 state. */
+static bool __nested_vmcb_check_save(struct kvm_vcpu *vcpu,
+ struct vmcb_save_area_cached *save)
{
+ if (CC(!(save->efer & EFER_SVME)))
+ return false;
+
+ if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) ||
+ CC(save->cr0 & ~0xffffffffULL))
+ return false;
+
+ if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7)))
+ return false;
+
/*
* These checks are also performed by KVM_SET_SREGS,
* except that EFER.LMA is not checked by SVM against
@@ -278,48 +275,90 @@ static bool nested_vmcb_check_cr3_cr4(struct kvm_vcpu *vcpu,
if (CC(!kvm_is_valid_cr4(vcpu, save->cr4)))
return false;
+ if (CC(!kvm_valid_efer(vcpu, save->efer)))
+ return false;
+
return true;
}
-/* Common checks that apply to both L1 and L2 state. */
-static bool nested_vmcb_valid_sregs(struct kvm_vcpu *vcpu,
- struct vmcb_save_area *save)
+static bool nested_vmcb_check_save(struct kvm_vcpu *vcpu)
{
- /*
- * FIXME: these should be done after copying the fields,
- * to avoid TOC/TOU races. For these save area checks
- * the possible damage is limited since kvm_set_cr0 and
- * kvm_set_cr4 handle failure; EFER_SVME is an exception
- * so it is force-set later in nested_prepare_vmcb_save.
- */
- if (CC(!(save->efer & EFER_SVME)))
- return false;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_save_area_cached *save = &svm->nested.save;
- if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) ||
- CC(save->cr0 & ~0xffffffffULL))
- return false;
+ return __nested_vmcb_check_save(vcpu, save);
+}
- if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7)))
- return false;
+static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_ctrl_area_cached *ctl = &svm->nested.ctl;
- if (!nested_vmcb_check_cr3_cr4(vcpu, save))
- return false;
+ return __nested_vmcb_check_controls(vcpu, ctl);
+}
- if (CC(!kvm_valid_efer(vcpu, save->efer)))
- return false;
+static
+void __nested_copy_vmcb_control_to_cache(struct vmcb_ctrl_area_cached *to,
+ struct vmcb_control_area *from)
+{
+ unsigned int i;
- return true;
+ for (i = 0; i < MAX_INTERCEPT; i++)
+ to->intercepts[i] = from->intercepts[i];
+
+ to->iopm_base_pa = from->iopm_base_pa;
+ to->msrpm_base_pa = from->msrpm_base_pa;
+ to->tsc_offset = from->tsc_offset;
+ to->tlb_ctl = from->tlb_ctl;
+ to->int_ctl = from->int_ctl;
+ to->int_vector = from->int_vector;
+ to->int_state = from->int_state;
+ to->exit_code = from->exit_code;
+ to->exit_code_hi = from->exit_code_hi;
+ to->exit_info_1 = from->exit_info_1;
+ to->exit_info_2 = from->exit_info_2;
+ to->exit_int_info = from->exit_int_info;
+ to->exit_int_info_err = from->exit_int_info_err;
+ to->nested_ctl = from->nested_ctl;
+ to->event_inj = from->event_inj;
+ to->event_inj_err = from->event_inj_err;
+ to->nested_cr3 = from->nested_cr3;
+ to->virt_ext = from->virt_ext;
+ to->pause_filter_count = from->pause_filter_count;
+ to->pause_filter_thresh = from->pause_filter_thresh;
+
+ /* Copy asid here because nested_vmcb_check_controls will check it. */
+ to->asid = from->asid;
+ to->msrpm_base_pa &= ~0x0fffULL;
+ to->iopm_base_pa &= ~0x0fffULL;
}
-void nested_load_control_from_vmcb12(struct vcpu_svm *svm,
- struct vmcb_control_area *control)
+void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm,
+ struct vmcb_control_area *control)
{
- copy_vmcb_control_area(&svm->nested.ctl, control);
+ __nested_copy_vmcb_control_to_cache(&svm->nested.ctl, control);
+}
- /* Copy it here because nested_svm_check_controls will check it. */
- svm->nested.ctl.asid = control->asid;
- svm->nested.ctl.msrpm_base_pa &= ~0x0fffULL;
- svm->nested.ctl.iopm_base_pa &= ~0x0fffULL;
+static void __nested_copy_vmcb_save_to_cache(struct vmcb_save_area_cached *to,
+ struct vmcb_save_area *from)
+{
+ /*
+ * Copy only fields that are validated, as we need them
+ * to avoid TOC/TOU races.
+ */
+ to->efer = from->efer;
+ to->cr0 = from->cr0;
+ to->cr3 = from->cr3;
+ to->cr4 = from->cr4;
+
+ to->dr6 = from->dr6;
+ to->dr7 = from->dr7;
+}
+
+void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm,
+ struct vmcb_save_area *save)
+{
+ __nested_copy_vmcb_save_to_cache(&svm->nested.save, save);
}
/*
@@ -422,14 +461,13 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
return -EINVAL;
if (reload_pdptrs && !nested_npt && is_pae_paging(vcpu) &&
- CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)))
+ CC(!load_pdptrs(vcpu, cr3)))
return -EINVAL;
if (!nested_npt)
kvm_mmu_new_pgd(vcpu, cr3);
vcpu->arch.cr3 = cr3;
- kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
kvm_init_mmu(vcpu);
@@ -475,15 +513,10 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);
- /*
- * Force-set EFER_SVME even though it is checked earlier on the
- * VMCB12, because the guest can flip the bit between the check
- * and now. Clearing EFER_SVME would call svm_free_nested.
- */
- svm_set_efer(&svm->vcpu, vmcb12->save.efer | EFER_SVME);
+ svm_set_efer(&svm->vcpu, svm->nested.save.efer);
- svm_set_cr0(&svm->vcpu, vmcb12->save.cr0);
- svm_set_cr4(&svm->vcpu, vmcb12->save.cr4);
+ svm_set_cr0(&svm->vcpu, svm->nested.save.cr0);
+ svm_set_cr4(&svm->vcpu, svm->nested.save.cr4);
svm->vcpu.arch.cr2 = vmcb12->save.cr2;
@@ -498,8 +531,8 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
/* These bits will be set properly on the first execution when new_vmc12 is true */
if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) {
- svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1;
- svm->vcpu.arch.dr6 = vmcb12->save.dr6 | DR6_ACTIVE_LOW;
+ svm->vmcb->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1;
+ svm->vcpu.arch.dr6 = svm->nested.save.dr6 | DR6_ACTIVE_LOW;
vmcb_mark_dirty(svm->vmcb, VMCB_DR);
}
}
@@ -538,8 +571,17 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
if (nested_npt_enabled(svm))
nested_svm_init_mmu_context(vcpu);
- svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset =
- vcpu->arch.l1_tsc_offset + svm->nested.ctl.tsc_offset;
+ vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
+ vcpu->arch.l1_tsc_offset,
+ svm->nested.ctl.tsc_offset,
+ svm->tsc_ratio_msr);
+
+ svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset;
+
+ if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) {
+ WARN_ON(!svm->tsc_scaling_enabled);
+ nested_svm_update_tsc_ratio_msr(vcpu);
+ }
svm->vmcb->control.int_ctl =
(svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) |
@@ -550,9 +592,6 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
svm->vmcb->control.event_inj = svm->nested.ctl.event_inj;
svm->vmcb->control.event_inj_err = svm->nested.ctl.event_inj_err;
- svm->vmcb->control.pause_filter_count = svm->nested.ctl.pause_filter_count;
- svm->vmcb->control.pause_filter_thresh = svm->nested.ctl.pause_filter_thresh;
-
nested_svm_transition_tlb_flush(vcpu);
/* Enter Guest-Mode */
@@ -607,7 +646,7 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
nested_vmcb02_prepare_control(svm);
nested_vmcb02_prepare_save(svm, vmcb12);
- ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3,
+ ret = nested_svm_load_cr3(&svm->vcpu, svm->nested.save.cr3,
nested_npt_enabled(svm), from_vmrun);
if (ret)
return ret;
@@ -657,10 +696,11 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
if (WARN_ON_ONCE(!svm->nested.initialized))
return -EINVAL;
- nested_load_control_from_vmcb12(svm, &vmcb12->control);
+ nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
+ nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
- if (!nested_vmcb_valid_sregs(vcpu, &vmcb12->save) ||
- !nested_vmcb_check_controls(vcpu, &svm->nested.ctl)) {
+ if (!nested_vmcb_check_save(vcpu) ||
+ !nested_vmcb_check_controls(vcpu)) {
vmcb12->control.exit_code = SVM_EXIT_ERR;
vmcb12->control.exit_code_hi = 0;
vmcb12->control.exit_info_1 = 0;
@@ -810,11 +850,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
vmcb12->control.event_inj = svm->nested.ctl.event_inj;
vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err;
- vmcb12->control.pause_filter_count =
- svm->vmcb->control.pause_filter_count;
- vmcb12->control.pause_filter_thresh =
- svm->vmcb->control.pause_filter_thresh;
-
nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
svm_switch_vmcb(svm, &svm->vmcb01);
@@ -832,6 +867,12 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}
+ if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) {
+ WARN_ON(!svm->tsc_scaling_enabled);
+ vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
+ svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio);
+ }
+
svm->nested.ctl.nested_cr3 = 0;
/*
@@ -966,7 +1007,7 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
u32 offset, msr, value;
int write, mask;
- if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
+ if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
return NESTED_EXIT_HOST;
msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
@@ -993,7 +1034,7 @@ static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
u8 start_bit;
u64 gpa;
- if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT)))
+ if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT)))
return NESTED_EXIT_HOST;
port = svm->vmcb->control.exit_info_1 >> 16;
@@ -1024,12 +1065,12 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
vmexit = nested_svm_intercept_ioio(svm);
break;
case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
- if (vmcb_is_intercept(&svm->nested.ctl, exit_code))
+ if (vmcb12_is_intercept(&svm->nested.ctl, exit_code))
vmexit = NESTED_EXIT_DONE;
break;
}
case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
- if (vmcb_is_intercept(&svm->nested.ctl, exit_code))
+ if (vmcb12_is_intercept(&svm->nested.ctl, exit_code))
vmexit = NESTED_EXIT_DONE;
break;
}
@@ -1047,7 +1088,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
break;
}
default: {
- if (vmcb_is_intercept(&svm->nested.ctl, exit_code))
+ if (vmcb12_is_intercept(&svm->nested.ctl, exit_code))
vmexit = NESTED_EXIT_DONE;
}
}
@@ -1125,7 +1166,7 @@ static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
static inline bool nested_exit_on_init(struct vcpu_svm *svm)
{
- return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INIT);
+ return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_INIT);
}
static int svm_check_nested_events(struct kvm_vcpu *vcpu)
@@ -1219,11 +1260,57 @@ int nested_svm_exit_special(struct vcpu_svm *svm)
return NESTED_EXIT_CONTINUE;
}
+void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ vcpu->arch.tsc_scaling_ratio =
+ kvm_calc_nested_tsc_multiplier(vcpu->arch.l1_tsc_scaling_ratio,
+ svm->tsc_ratio_msr);
+ svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio);
+}
+
+/* Inverse operation of nested_copy_vmcb_control_to_cache(). asid is copied too. */
+static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst,
+ struct vmcb_ctrl_area_cached *from)
+{
+ unsigned int i;
+
+ memset(dst, 0, sizeof(struct vmcb_control_area));
+
+ for (i = 0; i < MAX_INTERCEPT; i++)
+ dst->intercepts[i] = from->intercepts[i];
+
+ dst->iopm_base_pa = from->iopm_base_pa;
+ dst->msrpm_base_pa = from->msrpm_base_pa;
+ dst->tsc_offset = from->tsc_offset;
+ dst->asid = from->asid;
+ dst->tlb_ctl = from->tlb_ctl;
+ dst->int_ctl = from->int_ctl;
+ dst->int_vector = from->int_vector;
+ dst->int_state = from->int_state;
+ dst->exit_code = from->exit_code;
+ dst->exit_code_hi = from->exit_code_hi;
+ dst->exit_info_1 = from->exit_info_1;
+ dst->exit_info_2 = from->exit_info_2;
+ dst->exit_int_info = from->exit_int_info;
+ dst->exit_int_info_err = from->exit_int_info_err;
+ dst->nested_ctl = from->nested_ctl;
+ dst->event_inj = from->event_inj;
+ dst->event_inj_err = from->event_inj_err;
+ dst->nested_cr3 = from->nested_cr3;
+ dst->virt_ext = from->virt_ext;
+ dst->pause_filter_count = from->pause_filter_count;
+ dst->pause_filter_thresh = from->pause_filter_thresh;
+}
+
static int svm_get_nested_state(struct kvm_vcpu *vcpu,
struct kvm_nested_state __user *user_kvm_nested_state,
u32 user_data_size)
{
struct vcpu_svm *svm;
+ struct vmcb_control_area *ctl;
+ unsigned long r;
struct kvm_nested_state kvm_state = {
.flags = 0,
.format = KVM_STATE_NESTED_FORMAT_SVM,
@@ -1265,9 +1352,18 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu,
*/
if (clear_user(user_vmcb, KVM_STATE_NESTED_SVM_VMCB_SIZE))
return -EFAULT;
- if (copy_to_user(&user_vmcb->control, &svm->nested.ctl,
- sizeof(user_vmcb->control)))
+
+ ctl = kzalloc(sizeof(*ctl), GFP_KERNEL);
+ if (!ctl)
+ return -ENOMEM;
+
+ nested_copy_vmcb_cache_to_control(ctl, &svm->nested.ctl);
+ r = copy_to_user(&user_vmcb->control, ctl,
+ sizeof(user_vmcb->control));
+ kfree(ctl);
+ if (r)
return -EFAULT;
+
if (copy_to_user(&user_vmcb->save, &svm->vmcb01.ptr->save,
sizeof(user_vmcb->save)))
return -EFAULT;
@@ -1284,6 +1380,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
&user_kvm_nested_state->data.svm[0];
struct vmcb_control_area *ctl;
struct vmcb_save_area *save;
+ struct vmcb_save_area_cached save_cached;
+ struct vmcb_ctrl_area_cached ctl_cached;
unsigned long cr0;
int ret;
@@ -1336,7 +1434,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
goto out_free;
ret = -EINVAL;
- if (!nested_vmcb_check_controls(vcpu, ctl))
+ __nested_copy_vmcb_control_to_cache(&ctl_cached, ctl);
+ if (!__nested_vmcb_check_controls(vcpu, &ctl_cached))
goto out_free;
/*
@@ -1351,10 +1450,11 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
* Validate host state saved from before VMRUN (see
* nested_svm_check_permissions).
*/
+ __nested_copy_vmcb_save_to_cache(&save_cached, save);
if (!(save->cr0 & X86_CR0_PG) ||
!(save->cr0 & X86_CR0_PE) ||
(save->rflags & X86_EFLAGS_VM) ||
- !nested_vmcb_valid_sregs(vcpu, save))
+ !__nested_vmcb_check_save(vcpu, &save_cached))
goto out_free;
/*
@@ -1390,7 +1490,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;
svm_copy_vmrun_state(&svm->vmcb01.ptr->save, save);
- nested_load_control_from_vmcb12(svm, ctl);
+ nested_copy_vmcb_control_to_cache(svm, ctl);
svm_switch_vmcb(svm, &svm->nested.vmcb02);
nested_vmcb02_prepare_control(svm);
@@ -1417,7 +1517,7 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
* the guest CR3 might be restored prior to setting the nested
* state which can lead to a load of wrong PDPTRs.
*/
- if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3)))
+ if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
return false;
if (!nested_svm_vmrun_msrpm(svm)) {
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index fdf587f19c5f..12d8b301065a 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -16,6 +16,7 @@
#include "cpuid.h"
#include "lapic.h"
#include "pmu.h"
+#include "svm.h"
enum pmu_type {
PMU_TYPE_COUNTER = 0,
@@ -100,6 +101,9 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
{
struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
+ if (!pmu)
+ return NULL;
+
switch (msr) {
case MSR_F15H_PERF_CTL0:
case MSR_F15H_PERF_CTL1:
@@ -134,12 +138,16 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
return &pmu->gp_counters[msr_to_index(msr)];
}
-static unsigned amd_find_arch_event(struct kvm_pmu *pmu,
- u8 event_select,
- u8 unit_mask)
+static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc)
{
+ u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
+ u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
int i;
+ /* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */
+ if (WARN_ON(pmc_is_fixed(pmc)))
+ return PERF_COUNT_HW_MAX;
+
for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++)
if (amd_event_mapping[i].eventsel == event_select
&& amd_event_mapping[i].unit_mask == unit_mask)
@@ -151,12 +159,6 @@ static unsigned amd_find_arch_event(struct kvm_pmu *pmu,
return amd_event_mapping[i].event_type;
}
-/* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */
-static unsigned amd_find_fixed_event(int idx)
-{
- return PERF_COUNT_HW_MAX;
-}
-
/* check if a PMC is enabled by comparing it against global_ctrl bits. Because
* AMD CPU doesn't have global_ctrl MSR, all PMCs are enabled (return TRUE).
*/
@@ -181,14 +183,13 @@ static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
return get_gp_pmc_amd(pmu, base + pmc_idx, PMU_TYPE_COUNTER);
}
-/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */
-static int amd_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
+static bool amd_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
idx &= ~(3u << 30);
- return (idx >= pmu->nr_arch_gp_counters);
+ return idx < pmu->nr_arch_gp_counters;
}
/* idx is the ECX register of RDPMC instruction */
@@ -282,7 +283,7 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS;
pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1;
- pmu->reserved_bits = 0xffffffff00200000ull;
+ pmu->reserved_bits = 0xfffffff000280000ull;
pmu->version = 1;
/* not applicable to AMD; but clean them to prevent any fall out */
pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
@@ -320,8 +321,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu)
}
struct kvm_pmu_ops amd_pmu_ops = {
- .find_arch_event = amd_find_arch_event,
- .find_fixed_event = amd_find_fixed_event,
+ .pmc_perf_hw_id = amd_pmc_perf_hw_id,
.pmc_is_enabled = amd_pmc_is_enabled,
.pmc_idx_to_pmc = amd_pmc_idx_to_pmc,
.rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc,
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index c36b5fe4c27c..6a22798eaaee 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -17,10 +17,10 @@
#include <linux/misc_cgroup.h>
#include <linux/processor.h>
#include <linux/trace_events.h>
-#include <asm/fpu/internal.h>
#include <asm/pkru.h>
#include <asm/trapnr.h>
+#include <asm/fpu/xcr.h>
#include "x86.h"
#include "svm.h"
@@ -120,16 +120,26 @@ static bool __sev_recycle_asids(int min_asid, int max_asid)
return true;
}
+static int sev_misc_cg_try_charge(struct kvm_sev_info *sev)
+{
+ enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
+ return misc_cg_try_charge(type, sev->misc_cg, 1);
+}
+
+static void sev_misc_cg_uncharge(struct kvm_sev_info *sev)
+{
+ enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
+ misc_cg_uncharge(type, sev->misc_cg, 1);
+}
+
static int sev_asid_new(struct kvm_sev_info *sev)
{
int asid, min_asid, max_asid, ret;
bool retry = true;
- enum misc_res_type type;
- type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
WARN_ON(sev->misc_cg);
sev->misc_cg = get_current_misc_cg();
- ret = misc_cg_try_charge(type, sev->misc_cg, 1);
+ ret = sev_misc_cg_try_charge(sev);
if (ret) {
put_misc_cg(sev->misc_cg);
sev->misc_cg = NULL;
@@ -162,7 +172,7 @@ again:
return asid;
e_uncharge:
- misc_cg_uncharge(type, sev->misc_cg, 1);
+ sev_misc_cg_uncharge(sev);
put_misc_cg(sev->misc_cg);
sev->misc_cg = NULL;
return ret;
@@ -179,7 +189,6 @@ static void sev_asid_free(struct kvm_sev_info *sev)
{
struct svm_cpu_data *sd;
int cpu;
- enum misc_res_type type;
mutex_lock(&sev_bitmap_lock);
@@ -192,8 +201,7 @@ static void sev_asid_free(struct kvm_sev_info *sev)
mutex_unlock(&sev_bitmap_lock);
- type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
- misc_cg_uncharge(type, sev->misc_cg, 1);
+ sev_misc_cg_uncharge(sev);
put_misc_cg(sev->misc_cg);
sev->misc_cg = NULL;
}
@@ -229,7 +237,6 @@ static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- bool es_active = argp->id == KVM_SEV_ES_INIT;
int asid, ret;
if (kvm->created_vcpus)
@@ -239,7 +246,8 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (unlikely(sev->active))
return ret;
- sev->es_active = es_active;
+ sev->active = true;
+ sev->es_active = argp->id == KVM_SEV_ES_INIT;
asid = sev_asid_new(sev);
if (asid < 0)
goto e_no_asid;
@@ -249,8 +257,6 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (ret)
goto e_free;
- sev->active = true;
- sev->asid = asid;
INIT_LIST_HEAD(&sev->regions_list);
return 0;
@@ -260,6 +266,7 @@ e_free:
sev->asid = 0;
e_no_asid:
sev->es_active = false;
+ sev->active = false;
return ret;
}
@@ -590,7 +597,7 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm)
* traditional VMSA as it has been built so far (in prep
* for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state.
*/
- memcpy(svm->vmsa, save, sizeof(*save));
+ memcpy(svm->sev_es.vmsa, save, sizeof(*save));
return 0;
}
@@ -612,19 +619,25 @@ static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu,
* the VMSA memory content (i.e it will write the same memory region
* with the guest's key), so invalidate it first.
*/
- clflush_cache_range(svm->vmsa, PAGE_SIZE);
+ clflush_cache_range(svm->sev_es.vmsa, PAGE_SIZE);
vmsa.reserved = 0;
vmsa.handle = to_kvm_svm(kvm)->sev_info.handle;
- vmsa.address = __sme_pa(svm->vmsa);
+ vmsa.address = __sme_pa(svm->sev_es.vmsa);
vmsa.len = PAGE_SIZE;
- return sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error);
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error);
+ if (ret)
+ return ret;
+
+ vcpu->arch.guest_state_protected = true;
+ return 0;
}
static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_vcpu *vcpu;
- int i, ret;
+ unsigned long i;
+ int ret;
if (!sev_es_guest(kvm))
return -ENOTTY;
@@ -1479,6 +1492,13 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
goto e_free_trans;
}
+ /*
+ * Flush (on non-coherent CPUs) before RECEIVE_UPDATE_DATA, the PSP
+ * encrypts the written data with the guest's key, and the cache may
+ * contain dirty, unencrypted data.
+ */
+ sev_clflush_pages(guest_page, n);
+
/* The RECEIVE_UPDATE_DATA command requires C-bit to be always set. */
data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
data.guest_address |= sev_me_mask;
@@ -1510,7 +1530,7 @@ static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error);
}
-static bool cmd_allowed_from_miror(u32 cmd_id)
+static bool is_cmd_allowed_from_mirror(u32 cmd_id)
{
/*
* Allow mirrors VM to call KVM_SEV_LAUNCH_UPDATE_VMSA to enable SEV-ES
@@ -1524,6 +1544,223 @@ static bool cmd_allowed_from_miror(u32 cmd_id)
return false;
}
+static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
+{
+ struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info;
+ struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info;
+ int r = -EBUSY;
+
+ if (dst_kvm == src_kvm)
+ return -EINVAL;
+
+ /*
+ * Bail if these VMs are already involved in a migration to avoid
+ * deadlock between two VMs trying to migrate to/from each other.
+ */
+ if (atomic_cmpxchg_acquire(&dst_sev->migration_in_progress, 0, 1))
+ return -EBUSY;
+
+ if (atomic_cmpxchg_acquire(&src_sev->migration_in_progress, 0, 1))
+ goto release_dst;
+
+ r = -EINTR;
+ if (mutex_lock_killable(&dst_kvm->lock))
+ goto release_src;
+ if (mutex_lock_killable_nested(&src_kvm->lock, SINGLE_DEPTH_NESTING))
+ goto unlock_dst;
+ return 0;
+
+unlock_dst:
+ mutex_unlock(&dst_kvm->lock);
+release_src:
+ atomic_set_release(&src_sev->migration_in_progress, 0);
+release_dst:
+ atomic_set_release(&dst_sev->migration_in_progress, 0);
+ return r;
+}
+
+static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
+{
+ struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info;
+ struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info;
+
+ mutex_unlock(&dst_kvm->lock);
+ mutex_unlock(&src_kvm->lock);
+ atomic_set_release(&dst_sev->migration_in_progress, 0);
+ atomic_set_release(&src_sev->migration_in_progress, 0);
+}
+
+
+static int sev_lock_vcpus_for_migration(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu;
+ unsigned long i, j;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (mutex_lock_killable(&vcpu->mutex))
+ goto out_unlock;
+ }
+
+ return 0;
+
+out_unlock:
+ kvm_for_each_vcpu(j, vcpu, kvm) {
+ if (i == j)
+ break;
+
+ mutex_unlock(&vcpu->mutex);
+ }
+ return -EINTR;
+}
+
+static void sev_unlock_vcpus_for_migration(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ mutex_unlock(&vcpu->mutex);
+ }
+}
+
+static void sev_migrate_from(struct kvm_sev_info *dst,
+ struct kvm_sev_info *src)
+{
+ dst->active = true;
+ dst->asid = src->asid;
+ dst->handle = src->handle;
+ dst->pages_locked = src->pages_locked;
+ dst->enc_context_owner = src->enc_context_owner;
+
+ src->asid = 0;
+ src->active = false;
+ src->handle = 0;
+ src->pages_locked = 0;
+ src->enc_context_owner = NULL;
+
+ list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
+}
+
+static int sev_es_migrate_from(struct kvm *dst, struct kvm *src)
+{
+ unsigned long i;
+ struct kvm_vcpu *dst_vcpu, *src_vcpu;
+ struct vcpu_svm *dst_svm, *src_svm;
+
+ if (atomic_read(&src->online_vcpus) != atomic_read(&dst->online_vcpus))
+ return -EINVAL;
+
+ kvm_for_each_vcpu(i, src_vcpu, src) {
+ if (!src_vcpu->arch.guest_state_protected)
+ return -EINVAL;
+ }
+
+ kvm_for_each_vcpu(i, src_vcpu, src) {
+ src_svm = to_svm(src_vcpu);
+ dst_vcpu = kvm_get_vcpu(dst, i);
+ dst_svm = to_svm(dst_vcpu);
+
+ /*
+ * Transfer VMSA and GHCB state to the destination. Nullify and
+ * clear source fields as appropriate, the state now belongs to
+ * the destination.
+ */
+ memcpy(&dst_svm->sev_es, &src_svm->sev_es, sizeof(src_svm->sev_es));
+ dst_svm->vmcb->control.ghcb_gpa = src_svm->vmcb->control.ghcb_gpa;
+ dst_svm->vmcb->control.vmsa_pa = src_svm->vmcb->control.vmsa_pa;
+ dst_vcpu->arch.guest_state_protected = true;
+
+ memset(&src_svm->sev_es, 0, sizeof(src_svm->sev_es));
+ src_svm->vmcb->control.ghcb_gpa = INVALID_PAGE;
+ src_svm->vmcb->control.vmsa_pa = INVALID_PAGE;
+ src_vcpu->arch.guest_state_protected = false;
+ }
+ to_kvm_svm(src)->sev_info.es_active = false;
+ to_kvm_svm(dst)->sev_info.es_active = true;
+
+ return 0;
+}
+
+int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd)
+{
+ struct kvm_sev_info *dst_sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_info *src_sev, *cg_cleanup_sev;
+ struct file *source_kvm_file;
+ struct kvm *source_kvm;
+ bool charged = false;
+ int ret;
+
+ source_kvm_file = fget(source_fd);
+ if (!file_is_kvm(source_kvm_file)) {
+ ret = -EBADF;
+ goto out_fput;
+ }
+
+ source_kvm = source_kvm_file->private_data;
+ ret = sev_lock_two_vms(kvm, source_kvm);
+ if (ret)
+ goto out_fput;
+
+ if (sev_guest(kvm) || !sev_guest(source_kvm)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ src_sev = &to_kvm_svm(source_kvm)->sev_info;
+
+ /*
+ * VMs mirroring src's encryption context rely on it to keep the
+ * ASID allocated, but below we are clearing src_sev->asid.
+ */
+ if (src_sev->num_mirrored_vms) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ dst_sev->misc_cg = get_current_misc_cg();
+ cg_cleanup_sev = dst_sev;
+ if (dst_sev->misc_cg != src_sev->misc_cg) {
+ ret = sev_misc_cg_try_charge(dst_sev);
+ if (ret)
+ goto out_dst_cgroup;
+ charged = true;
+ }
+
+ ret = sev_lock_vcpus_for_migration(kvm);
+ if (ret)
+ goto out_dst_cgroup;
+ ret = sev_lock_vcpus_for_migration(source_kvm);
+ if (ret)
+ goto out_dst_vcpu;
+
+ if (sev_es_guest(source_kvm)) {
+ ret = sev_es_migrate_from(kvm, source_kvm);
+ if (ret)
+ goto out_source_vcpu;
+ }
+ sev_migrate_from(dst_sev, src_sev);
+ kvm_vm_dead(source_kvm);
+ cg_cleanup_sev = src_sev;
+ ret = 0;
+
+out_source_vcpu:
+ sev_unlock_vcpus_for_migration(source_kvm);
+out_dst_vcpu:
+ sev_unlock_vcpus_for_migration(kvm);
+out_dst_cgroup:
+ /* Operates on the source on success, on the destination on failure. */
+ if (charged)
+ sev_misc_cg_uncharge(cg_cleanup_sev);
+ put_misc_cg(cg_cleanup_sev->misc_cg);
+ cg_cleanup_sev->misc_cg = NULL;
+out_unlock:
+ sev_unlock_two_vms(kvm, source_kvm);
+out_fput:
+ if (source_kvm_file)
+ fput(source_kvm_file);
+ return ret;
+}
+
int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
{
struct kvm_sev_cmd sev_cmd;
@@ -1542,7 +1779,7 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
/* Only the enc_context_owner handles some memory enc operations. */
if (is_mirroring_enc_context(kvm) &&
- !cmd_allowed_from_miror(sev_cmd.id)) {
+ !is_cmd_allowed_from_mirror(sev_cmd.id)) {
r = -EINVAL;
goto out;
}
@@ -1739,71 +1976,60 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
{
struct file *source_kvm_file;
struct kvm *source_kvm;
- struct kvm_sev_info source_sev, *mirror_sev;
+ struct kvm_sev_info *source_sev, *mirror_sev;
int ret;
source_kvm_file = fget(source_fd);
if (!file_is_kvm(source_kvm_file)) {
ret = -EBADF;
- goto e_source_put;
+ goto e_source_fput;
}
source_kvm = source_kvm_file->private_data;
- mutex_lock(&source_kvm->lock);
-
- if (!sev_guest(source_kvm)) {
- ret = -EINVAL;
- goto e_source_unlock;
- }
+ ret = sev_lock_two_vms(kvm, source_kvm);
+ if (ret)
+ goto e_source_fput;
- /* Mirrors of mirrors should work, but let's not get silly */
- if (is_mirroring_enc_context(source_kvm) || source_kvm == kvm) {
+ /*
+ * Mirrors of mirrors should work, but let's not get silly. Also
+ * disallow out-of-band SEV/SEV-ES init if the target is already an
+ * SEV guest, or if vCPUs have been created. KVM relies on vCPUs being
+ * created after SEV/SEV-ES initialization, e.g. to init intercepts.
+ */
+ if (sev_guest(kvm) || !sev_guest(source_kvm) ||
+ is_mirroring_enc_context(source_kvm) || kvm->created_vcpus) {
ret = -EINVAL;
- goto e_source_unlock;
+ goto e_unlock;
}
- memcpy(&source_sev, &to_kvm_svm(source_kvm)->sev_info,
- sizeof(source_sev));
-
/*
* The mirror kvm holds an enc_context_owner ref so its asid can't
* disappear until we're done with it
*/
+ source_sev = &to_kvm_svm(source_kvm)->sev_info;
kvm_get_kvm(source_kvm);
-
- fput(source_kvm_file);
- mutex_unlock(&source_kvm->lock);
- mutex_lock(&kvm->lock);
-
- if (sev_guest(kvm)) {
- ret = -EINVAL;
- goto e_mirror_unlock;
- }
+ source_sev->num_mirrored_vms++;
/* Set enc_context_owner and copy its encryption context over */
mirror_sev = &to_kvm_svm(kvm)->sev_info;
mirror_sev->enc_context_owner = source_kvm;
mirror_sev->active = true;
- mirror_sev->asid = source_sev.asid;
- mirror_sev->fd = source_sev.fd;
- mirror_sev->es_active = source_sev.es_active;
- mirror_sev->handle = source_sev.handle;
+ mirror_sev->asid = source_sev->asid;
+ mirror_sev->fd = source_sev->fd;
+ mirror_sev->es_active = source_sev->es_active;
+ mirror_sev->handle = source_sev->handle;
+ INIT_LIST_HEAD(&mirror_sev->regions_list);
+ ret = 0;
+
/*
* Do not copy ap_jump_table. Since the mirror does not share the same
* KVM contexts as the original, and they may have different
* memory-views.
*/
- mutex_unlock(&kvm->lock);
- return 0;
-
-e_mirror_unlock:
- mutex_unlock(&kvm->lock);
- kvm_put_kvm(source_kvm);
- return ret;
-e_source_unlock:
- mutex_unlock(&source_kvm->lock);
-e_source_put:
+e_unlock:
+ sev_unlock_two_vms(kvm, source_kvm);
+e_source_fput:
if (source_kvm_file)
fput(source_kvm_file);
return ret;
@@ -1815,17 +2041,24 @@ void sev_vm_destroy(struct kvm *kvm)
struct list_head *head = &sev->regions_list;
struct list_head *pos, *q;
+ WARN_ON(sev->num_mirrored_vms);
+
if (!sev_guest(kvm))
return;
/* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */
if (is_mirroring_enc_context(kvm)) {
- kvm_put_kvm(sev->enc_context_owner);
+ struct kvm *owner_kvm = sev->enc_context_owner;
+ struct kvm_sev_info *owner_sev = &to_kvm_svm(owner_kvm)->sev_info;
+
+ mutex_lock(&owner_kvm->lock);
+ if (!WARN_ON(!owner_sev->num_mirrored_vms))
+ owner_sev->num_mirrored_vms--;
+ mutex_unlock(&owner_kvm->lock);
+ kvm_put_kvm(owner_kvm);
return;
}
- mutex_lock(&kvm->lock);
-
/*
* Ensure that all guest tagged cache entries are flushed before
* releasing the pages back to the system for use. CLFLUSH will
@@ -1845,8 +2078,6 @@ void sev_vm_destroy(struct kvm *kvm)
}
}
- mutex_unlock(&kvm->lock);
-
sev_unbind_asid(kvm, sev->handle);
sev_asid_free(sev);
}
@@ -2026,16 +2257,16 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu)
svm = to_svm(vcpu);
if (vcpu->arch.guest_state_protected)
- sev_flush_guest_memory(svm, svm->vmsa, PAGE_SIZE);
- __free_page(virt_to_page(svm->vmsa));
+ sev_flush_guest_memory(svm, svm->sev_es.vmsa, PAGE_SIZE);
+ __free_page(virt_to_page(svm->sev_es.vmsa));
- if (svm->ghcb_sa_free)
- kfree(svm->ghcb_sa);
+ if (svm->sev_es.ghcb_sa_free)
+ kvfree(svm->sev_es.ghcb_sa);
}
static void dump_ghcb(struct vcpu_svm *svm)
{
- struct ghcb *ghcb = svm->ghcb;
+ struct ghcb *ghcb = svm->sev_es.ghcb;
unsigned int nbits;
/* Re-use the dump_invalid_vmcb module parameter */
@@ -2061,7 +2292,7 @@ static void dump_ghcb(struct vcpu_svm *svm)
static void sev_es_sync_to_ghcb(struct vcpu_svm *svm)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
- struct ghcb *ghcb = svm->ghcb;
+ struct ghcb *ghcb = svm->sev_es.ghcb;
/*
* The GHCB protocol so far allows for the following data
@@ -2081,7 +2312,7 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
{
struct vmcb_control_area *control = &svm->vmcb->control;
struct kvm_vcpu *vcpu = &svm->vcpu;
- struct ghcb *ghcb = svm->ghcb;
+ struct ghcb *ghcb = svm->sev_es.ghcb;
u64 exit_code;
/*
@@ -2122,24 +2353,29 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
}
-static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
+static bool sev_es_validate_vmgexit(struct vcpu_svm *svm)
{
struct kvm_vcpu *vcpu;
struct ghcb *ghcb;
- u64 exit_code = 0;
-
- ghcb = svm->ghcb;
+ u64 exit_code;
+ u64 reason;
- /* Only GHCB Usage code 0 is supported */
- if (ghcb->ghcb_usage)
- goto vmgexit_err;
+ ghcb = svm->sev_es.ghcb;
/*
- * Retrieve the exit code now even though is may not be marked valid
+ * Retrieve the exit code now even though it may not be marked valid
* as it could help with debugging.
*/
exit_code = ghcb_get_sw_exit_code(ghcb);
+ /* Only GHCB Usage code 0 is supported */
+ if (ghcb->ghcb_usage) {
+ reason = GHCB_ERR_INVALID_USAGE;
+ goto vmgexit_err;
+ }
+
+ reason = GHCB_ERR_MISSING_INPUT;
+
if (!ghcb_sw_exit_code_is_valid(ghcb) ||
!ghcb_sw_exit_info_1_is_valid(ghcb) ||
!ghcb_sw_exit_info_2_is_valid(ghcb))
@@ -2218,61 +2454,66 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
case SVM_VMGEXIT_UNSUPPORTED_EVENT:
break;
default:
+ reason = GHCB_ERR_INVALID_EVENT;
goto vmgexit_err;
}
- return 0;
+ return true;
vmgexit_err:
vcpu = &svm->vcpu;
- if (ghcb->ghcb_usage) {
+ if (reason == GHCB_ERR_INVALID_USAGE) {
vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n",
ghcb->ghcb_usage);
+ } else if (reason == GHCB_ERR_INVALID_EVENT) {
+ vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n",
+ exit_code);
} else {
- vcpu_unimpl(vcpu, "vmgexit: exit reason %#llx is not valid\n",
+ vcpu_unimpl(vcpu, "vmgexit: exit code %#llx input is not valid\n",
exit_code);
dump_ghcb(svm);
}
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
- vcpu->run->internal.ndata = 2;
- vcpu->run->internal.data[0] = exit_code;
- vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
+ /* Clear the valid entries fields */
+ memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
- return -EINVAL;
+ ghcb_set_sw_exit_info_1(ghcb, 2);
+ ghcb_set_sw_exit_info_2(ghcb, reason);
+
+ return false;
}
void sev_es_unmap_ghcb(struct vcpu_svm *svm)
{
- if (!svm->ghcb)
+ if (!svm->sev_es.ghcb)
return;
- if (svm->ghcb_sa_free) {
+ if (svm->sev_es.ghcb_sa_free) {
/*
* The scratch area lives outside the GHCB, so there is a
* buffer that, depending on the operation performed, may
* need to be synced, then freed.
*/
- if (svm->ghcb_sa_sync) {
+ if (svm->sev_es.ghcb_sa_sync) {
kvm_write_guest(svm->vcpu.kvm,
- ghcb_get_sw_scratch(svm->ghcb),
- svm->ghcb_sa, svm->ghcb_sa_len);
- svm->ghcb_sa_sync = false;
+ ghcb_get_sw_scratch(svm->sev_es.ghcb),
+ svm->sev_es.ghcb_sa,
+ svm->sev_es.ghcb_sa_len);
+ svm->sev_es.ghcb_sa_sync = false;
}
- kfree(svm->ghcb_sa);
- svm->ghcb_sa = NULL;
- svm->ghcb_sa_free = false;
+ kvfree(svm->sev_es.ghcb_sa);
+ svm->sev_es.ghcb_sa = NULL;
+ svm->sev_es.ghcb_sa_free = false;
}
- trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, svm->ghcb);
+ trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, svm->sev_es.ghcb);
sev_es_sync_to_ghcb(svm);
- kvm_vcpu_unmap(&svm->vcpu, &svm->ghcb_map, true);
- svm->ghcb = NULL;
+ kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map, true);
+ svm->sev_es.ghcb = NULL;
}
void pre_sev_run(struct vcpu_svm *svm, int cpu)
@@ -2302,7 +2543,7 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu)
static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
{
struct vmcb_control_area *control = &svm->vmcb->control;
- struct ghcb *ghcb = svm->ghcb;
+ struct ghcb *ghcb = svm->sev_es.ghcb;
u64 ghcb_scratch_beg, ghcb_scratch_end;
u64 scratch_gpa_beg, scratch_gpa_end;
void *scratch_va;
@@ -2310,14 +2551,14 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
scratch_gpa_beg = ghcb_get_sw_scratch(ghcb);
if (!scratch_gpa_beg) {
pr_err("vmgexit: scratch gpa not provided\n");
- return false;
+ goto e_scratch;
}
scratch_gpa_end = scratch_gpa_beg + len;
if (scratch_gpa_end < scratch_gpa_beg) {
pr_err("vmgexit: scratch length (%#llx) not valid for scratch address (%#llx)\n",
len, scratch_gpa_beg);
- return false;
+ goto e_scratch;
}
if ((scratch_gpa_beg & PAGE_MASK) == control->ghcb_gpa) {
@@ -2335,10 +2576,10 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
scratch_gpa_end > ghcb_scratch_end) {
pr_err("vmgexit: scratch area is outside of GHCB shared buffer area (%#llx - %#llx)\n",
scratch_gpa_beg, scratch_gpa_end);
- return false;
+ goto e_scratch;
}
- scratch_va = (void *)svm->ghcb;
+ scratch_va = (void *)svm->sev_es.ghcb;
scratch_va += (scratch_gpa_beg - control->ghcb_gpa);
} else {
/*
@@ -2348,18 +2589,18 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
if (len > GHCB_SCRATCH_AREA_LIMIT) {
pr_err("vmgexit: scratch area exceeds KVM limits (%#llx requested, %#llx limit)\n",
len, GHCB_SCRATCH_AREA_LIMIT);
- return false;
+ goto e_scratch;
}
- scratch_va = kzalloc(len, GFP_KERNEL_ACCOUNT);
+ scratch_va = kvzalloc(len, GFP_KERNEL_ACCOUNT);
if (!scratch_va)
- return false;
+ goto e_scratch;
if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, len)) {
/* Unable to copy scratch area from guest */
pr_err("vmgexit: kvm_read_guest for scratch area failed\n");
- kfree(scratch_va);
- return false;
+ kvfree(scratch_va);
+ goto e_scratch;
}
/*
@@ -2368,14 +2609,20 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
* the vCPU next time (i.e. a read was requested so the data
* must be written back to the guest memory).
*/
- svm->ghcb_sa_sync = sync;
- svm->ghcb_sa_free = true;
+ svm->sev_es.ghcb_sa_sync = sync;
+ svm->sev_es.ghcb_sa_free = true;
}
- svm->ghcb_sa = scratch_va;
- svm->ghcb_sa_len = len;
+ svm->sev_es.ghcb_sa = scratch_va;
+ svm->sev_es.ghcb_sa_len = len;
return true;
+
+e_scratch:
+ ghcb_set_sw_exit_info_1(ghcb, 2);
+ ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_SCRATCH_AREA);
+
+ return false;
}
static void set_ghcb_msr_bits(struct vcpu_svm *svm, u64 value, u64 mask,
@@ -2426,7 +2673,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_CPUID);
if (!ret) {
- ret = -EINVAL;
+ /* Error, keep GHCB MSR value as-is */
break;
}
@@ -2462,10 +2709,13 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
GHCB_MSR_TERM_REASON_POS);
pr_info("SEV-ES guest requested termination: %#llx:%#llx\n",
reason_set, reason_code);
- fallthrough;
+
+ ret = -EINVAL;
+ break;
}
default:
- ret = -EINVAL;
+ /* Error, keep GHCB MSR value as-is */
+ break;
}
trace_kvm_vmgexit_msr_protocol_exit(svm->vcpu.vcpu_id,
@@ -2489,32 +2739,35 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
if (!ghcb_gpa) {
vcpu_unimpl(vcpu, "vmgexit: GHCB gpa is not set\n");
- return -EINVAL;
+
+ /* Without a GHCB, just return right back to the guest */
+ return 1;
}
- if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->ghcb_map)) {
+ if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->sev_es.ghcb_map)) {
/* Unable to map GHCB from guest */
vcpu_unimpl(vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n",
ghcb_gpa);
- return -EINVAL;
+
+ /* Without a GHCB, just return right back to the guest */
+ return 1;
}
- svm->ghcb = svm->ghcb_map.hva;
- ghcb = svm->ghcb_map.hva;
+ svm->sev_es.ghcb = svm->sev_es.ghcb_map.hva;
+ ghcb = svm->sev_es.ghcb_map.hva;
trace_kvm_vmgexit_enter(vcpu->vcpu_id, ghcb);
exit_code = ghcb_get_sw_exit_code(ghcb);
- ret = sev_es_validate_vmgexit(svm);
- if (ret)
- return ret;
+ if (!sev_es_validate_vmgexit(svm))
+ return 1;
sev_es_sync_from_ghcb(svm);
ghcb_set_sw_exit_info_1(ghcb, 0);
ghcb_set_sw_exit_info_2(ghcb, 0);
- ret = -EINVAL;
+ ret = 1;
switch (exit_code) {
case SVM_VMGEXIT_MMIO_READ:
if (!setup_vmgexit_scratch(svm, true, control->exit_info_2))
@@ -2523,7 +2776,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
ret = kvm_sev_es_mmio_read(vcpu,
control->exit_info_1,
control->exit_info_2,
- svm->ghcb_sa);
+ svm->sev_es.ghcb_sa);
break;
case SVM_VMGEXIT_MMIO_WRITE:
if (!setup_vmgexit_scratch(svm, false, control->exit_info_2))
@@ -2532,7 +2785,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
ret = kvm_sev_es_mmio_write(vcpu,
control->exit_info_1,
control->exit_info_2,
- svm->ghcb_sa);
+ svm->sev_es.ghcb_sa);
break;
case SVM_VMGEXIT_NMI_COMPLETE:
ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_IRET);
@@ -2555,20 +2808,17 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
default:
pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n",
control->exit_info_1);
- ghcb_set_sw_exit_info_1(ghcb, 1);
- ghcb_set_sw_exit_info_2(ghcb,
- X86_TRAP_UD |
- SVM_EVTINJ_TYPE_EXEPT |
- SVM_EVTINJ_VALID);
+ ghcb_set_sw_exit_info_1(ghcb, 2);
+ ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_INPUT);
}
- ret = 1;
break;
}
case SVM_VMGEXIT_UNSUPPORTED_EVENT:
vcpu_unimpl(vcpu,
"vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
control->exit_info_1, control->exit_info_2);
+ ret = -EINVAL;
break;
default:
ret = svm_invoke_exit_handler(vcpu, exit_code);
@@ -2579,11 +2829,21 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in)
{
- if (!setup_vmgexit_scratch(svm, in, svm->vmcb->control.exit_info_2))
+ int count;
+ int bytes;
+
+ if (svm->vmcb->control.exit_info_2 > INT_MAX)
return -EINVAL;
- return kvm_sev_es_string_io(&svm->vcpu, size, port,
- svm->ghcb_sa, svm->ghcb_sa_len, in);
+ count = svm->vmcb->control.exit_info_2;
+ if (unlikely(check_mul_overflow(count, size, &bytes)))
+ return -EINVAL;
+
+ if (!setup_vmgexit_scratch(svm, in, bytes))
+ return 1;
+
+ return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->sev_es.ghcb_sa,
+ count, in);
}
void sev_es_init_vmcb(struct vcpu_svm *svm)
@@ -2598,7 +2858,7 @@ void sev_es_init_vmcb(struct vcpu_svm *svm)
* VMCB page. Do not include the encryption mask on the VMSA physical
* address since hardware will access it using the guest key.
*/
- svm->vmcb->control.vmsa_pa = __pa(svm->vmsa);
+ svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa);
/* Can't intercept CR register access, HV can't modify CR registers */
svm_clr_intercept(svm, INTERCEPT_CR0_READ);
@@ -2631,11 +2891,11 @@ void sev_es_init_vmcb(struct vcpu_svm *svm)
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
}
-void sev_es_create_vcpu(struct vcpu_svm *svm)
+void sev_es_vcpu_reset(struct vcpu_svm *svm)
{
/*
- * Set the GHCB MSR value as per the GHCB specification when creating
- * a vCPU for an SEV-ES guest.
+ * Set the GHCB MSR value as per the GHCB specification when emulating
+ * vCPU RESET for an SEV-ES guest.
*/
set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX,
GHCB_VERSION_MIN,
@@ -2670,8 +2930,8 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
struct vcpu_svm *svm = to_svm(vcpu);
/* First SIPI: Use the values as initially set by the VMM */
- if (!svm->received_first_sipi) {
- svm->received_first_sipi = true;
+ if (!svm->sev_es.received_first_sipi) {
+ svm->sev_es.received_first_sipi = true;
return;
}
@@ -2680,8 +2940,8 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
* the guest will set the CS and RIP. Set SW_EXIT_INFO_2 to a
* non-zero value.
*/
- if (!svm->ghcb)
+ if (!svm->sev_es.ghcb)
return;
- ghcb_set_sw_exit_info_2(svm->ghcb, 1);
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1);
}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 989685098b3e..46bcc706f257 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -25,6 +25,7 @@
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/rwsem.h>
+#include <linux/cc_platform.h>
#include <asm/apic.h>
#include <asm/perf_event.h>
@@ -36,6 +37,7 @@
#include <asm/spec-ctrl.h>
#include <asm/cpu_device_id.h>
#include <asm/traps.h>
+#include <asm/fpu/api.h>
#include <asm/virtext.h>
#include "trace.h"
@@ -186,6 +188,17 @@ module_param(vls, int, 0444);
static int vgif = true;
module_param(vgif, int, 0444);
+/* enable/disable LBR virtualization */
+static int lbrv = true;
+module_param(lbrv, int, 0444);
+
+/* enable/disable PMU virtualization */
+bool pmu = true;
+module_param(pmu, bool, 0444);
+
+static int tsc_scaling = true;
+module_param(tsc_scaling, int, 0444);
+
/*
* enable / disable AVIC. Because the defaults differ for APICv
* support between VMX and SVM we cannot use module_param_named.
@@ -256,7 +269,7 @@ u32 svm_msrpm_offset(u32 msr)
#define MAX_INST_SIZE 15
-static int get_max_npt_level(void)
+static int get_npt_level(void)
{
#ifdef CONFIG_X86_64
return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
@@ -455,7 +468,7 @@ static int has_svm(void)
return 0;
}
- if (sev_active()) {
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
pr_info("KVM is unsupported when running as an SEV guest\n");
return 0;
}
@@ -466,7 +479,7 @@ static int has_svm(void)
static void svm_hardware_disable(void)
{
/* Make sure we clean up behind us */
- if (static_cpu_has(X86_FEATURE_TSCRATEMSR))
+ if (tsc_scaling)
wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
cpu_svm_disable();
@@ -509,6 +522,10 @@ static int svm_hardware_enable(void)
wrmsrl(MSR_VM_HSAVE_PA, __sme_page_pa(sd->save_area));
if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+ /*
+ * Set the default value, even if we don't use TSC scaling
+ * to avoid having stale value in the msr
+ */
wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
__this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT);
}
@@ -572,12 +589,10 @@ static int svm_cpu_init(int cpu)
if (!sd)
return ret;
sd->cpu = cpu;
- sd->save_area = alloc_page(GFP_KERNEL);
+ sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!sd->save_area)
goto free_cpu_data;
- clear_page(page_address(sd->save_area));
-
ret = sev_cpu_init(sd);
if (ret)
goto free_save_area;
@@ -929,6 +944,9 @@ static __init void svm_set_cpu_caps(void)
if (npt_enabled)
kvm_cpu_cap_set(X86_FEATURE_NPT);
+ if (tsc_scaling)
+ kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
+
/* Nested VM can receive #VMEXIT instead of triggering #GP */
kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
}
@@ -938,6 +956,10 @@ static __init void svm_set_cpu_caps(void)
boot_cpu_has(X86_FEATURE_AMD_SSBD))
kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
+ /* AMD PMU PERFCTR_CORE CPUID */
+ if (pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
+ kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE);
+
/* CPUID 0x8000001F (SME/SEV features) */
sev_set_cpu_caps();
}
@@ -976,10 +998,15 @@ static __init int svm_hardware_setup(void)
if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
kvm_enable_efer_bits(EFER_FFXSR);
- if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
- kvm_has_tsc_control = true;
- kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
- kvm_tsc_scaling_ratio_frac_bits = 32;
+ if (tsc_scaling) {
+ if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+ tsc_scaling = false;
+ } else {
+ pr_info("TSC scaling supported\n");
+ kvm_has_tsc_control = true;
+ kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
+ kvm_tsc_scaling_ratio_frac_bits = 32;
+ }
}
tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
@@ -1008,9 +1035,9 @@ static __init int svm_hardware_setup(void)
if (!boot_cpu_has(X86_FEATURE_NPT))
npt_enabled = false;
- /* Force VM NPT level equal to the host's max NPT level */
- kvm_configure_mmu(npt_enabled, get_max_npt_level(),
- get_max_npt_level(), PG_LEVEL_1G);
+ /* Force VM NPT level equal to the host's paging level */
+ kvm_configure_mmu(npt_enabled, get_npt_level(),
+ get_npt_level(), PG_LEVEL_1G);
pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
/* Note, SEV setup consumes npt_enabled. */
@@ -1059,6 +1086,16 @@ static __init int svm_hardware_setup(void)
pr_info("Virtual GIF supported\n");
}
+ if (lbrv) {
+ if (!boot_cpu_has(X86_FEATURE_LBRV))
+ lbrv = false;
+ else
+ pr_info("LBR virtualization supported\n");
+ }
+
+ if (!pmu)
+ pr_info("PMU virtualization is disabled\n");
+
svm_set_cpu_caps();
/*
@@ -1109,7 +1146,9 @@ static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
{
- return kvm_default_tsc_scaling_ratio;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return svm->tsc_ratio_msr;
}
static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
@@ -1121,7 +1160,7 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}
-static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
+void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
{
wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
}
@@ -1150,6 +1189,38 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
}
}
+static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (guest_cpuid_is_intel(vcpu)) {
+ /*
+ * We must intercept SYSENTER_EIP and SYSENTER_ESP
+ * accesses because the processor only stores 32 bits.
+ * For the same reason we cannot use virtual VMLOAD/VMSAVE.
+ */
+ svm_set_intercept(svm, INTERCEPT_VMLOAD);
+ svm_set_intercept(svm, INTERCEPT_VMSAVE);
+ svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
+ } else {
+ /*
+ * If hardware supports Virtual VMLOAD VMSAVE then enable it
+ * in VMCB and clear intercepts to avoid #VMEXIT.
+ */
+ if (vls) {
+ svm_clr_intercept(svm, INTERCEPT_VMLOAD);
+ svm_clr_intercept(svm, INTERCEPT_VMSAVE);
+ svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+ }
+ /* No need to intercept these MSRs */
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
+ }
+}
+
static void init_vmcb(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1296,11 +1367,25 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
}
svm_hv_init_vmcb(svm->vmcb);
+ init_vmcb_after_set_cpuid(vcpu);
vmcb_mark_all_dirty(svm->vmcb);
enable_gif(svm);
+}
+
+static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ svm_vcpu_init_msrpm(vcpu, svm->msrpm);
+
+ svm_init_osvw(vcpu);
+ vcpu->arch.microcode_version = 0x01000065;
+ svm->tsc_ratio_msr = kvm_default_tsc_scaling_ratio;
+
+ if (sev_es_guest(vcpu->kvm))
+ sev_es_vcpu_reset(svm);
}
static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -1311,6 +1396,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
svm->virt_spec_ctrl = 0;
init_vmcb(vcpu);
+
+ if (!init_event)
+ __svm_vcpu_reset(vcpu);
}
void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
@@ -1346,10 +1434,10 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
/*
* SEV-ES guests maintain an encrypted version of their FPU
* state which is restored and saved on VMRUN and VMEXIT.
- * Free the fpu structure to prevent KVM from attempting to
- * access the FPU state.
+ * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
+ * do xsave/xrstor on it.
*/
- kvm_free_guest_fpu(vcpu);
+ fpstate_set_confidential(&vcpu->arch.guest_fpu);
}
err = avic_init_vcpu(svm);
@@ -1370,24 +1458,13 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
svm->vmcb01.ptr = page_address(vmcb01_page);
svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
+ svm_switch_vmcb(svm, &svm->vmcb01);
if (vmsa_page)
- svm->vmsa = page_address(vmsa_page);
+ svm->sev_es.vmsa = page_address(vmsa_page);
svm->guest_state_loaded = false;
- svm_switch_vmcb(svm, &svm->vmcb01);
- init_vmcb(vcpu);
-
- svm_vcpu_init_msrpm(vcpu, svm->msrpm);
-
- svm_init_osvw(vcpu);
- vcpu->arch.microcode_version = 0x01000065;
-
- if (sev_es_guest(vcpu->kvm))
- /* Perform SEV-ES specific VMCB creation updates */
- sev_es_create_vcpu(svm);
-
return 0;
error_free_vmsa_page:
@@ -1447,7 +1524,7 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
vmsave(__sme_page_pa(sd->save_area));
}
- if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+ if (tsc_scaling) {
u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
__this_cpu_write(current_tsc_ratio, tsc_ratio);
@@ -1517,12 +1594,27 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
to_svm(vcpu)->vmcb->save.rflags = rflags;
}
+static bool svm_get_if_flag(struct kvm_vcpu *vcpu)
+{
+ struct vmcb *vmcb = to_svm(vcpu)->vmcb;
+
+ return sev_es_guest(vcpu->kvm)
+ ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK
+ : kvm_get_rflags(vcpu) & X86_EFLAGS_IF;
+}
+
static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
+ kvm_register_mark_available(vcpu, reg);
+
switch (reg) {
case VCPU_EXREG_PDPTR:
- BUG_ON(!npt_enabled);
- load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
+ /*
+ * When !npt_enabled, mmu->pdptrs[] is already available since
+ * it is always updated per SDM when moving to CRs.
+ */
+ if (npt_enabled)
+ load_pdptrs(vcpu, kvm_read_cr3(vcpu));
break;
default:
KVM_BUG_ON(1, vcpu->kvm);
@@ -1709,6 +1801,24 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
vmcb_mark_dirty(svm->vmcb, VMCB_DT);
}
+static void svm_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * For guests that don't set guest_state_protected, the cr3 update is
+ * handled via kvm_mmu_load() while entering the guest. For guests
+ * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to
+ * VMCB save area now, since the save area will become the initial
+ * contents of the VMSA, and future VMCB save area updates won't be
+ * seen.
+ */
+ if (sev_es_guest(vcpu->kvm)) {
+ svm->vmcb->save.cr3 = cr3;
+ vmcb_mark_dirty(svm->vmcb, VMCB_CR);
+ }
+}
+
void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -2440,7 +2550,7 @@ static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
bool ret = false;
if (!is_guest_mode(vcpu) ||
- (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
+ (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
return false;
cr0 &= ~SVM_CR0_SELECTIVE_MASK;
@@ -2657,6 +2767,11 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
struct vcpu_svm *svm = to_svm(vcpu);
switch (msr_info->index) {
+ case MSR_AMD64_TSC_RATIO:
+ if (!msr_info->host_initiated && !svm->tsc_scaling_enabled)
+ return 1;
+ msr_info->data = svm->tsc_ratio_msr;
+ break;
case MSR_STAR:
msr_info->data = svm->vmcb01.ptr->save.star;
break;
@@ -2762,11 +2877,11 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->ghcb))
+ if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb))
return kvm_complete_insn_gp(vcpu, err);
- ghcb_set_sw_exit_info_1(svm->ghcb, 1);
- ghcb_set_sw_exit_info_2(svm->ghcb,
+ ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1);
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb,
X86_TRAP_GP |
SVM_EVTINJ_TYPE_EXEPT |
SVM_EVTINJ_VALID);
@@ -2806,6 +2921,19 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
u32 ecx = msr->index;
u64 data = msr->data;
switch (ecx) {
+ case MSR_AMD64_TSC_RATIO:
+ if (!msr->host_initiated && !svm->tsc_scaling_enabled)
+ return 1;
+
+ if (data & TSC_RATIO_RSVD)
+ return 1;
+
+ svm->tsc_ratio_msr = data;
+
+ if (svm->tsc_scaling_enabled && is_guest_mode(vcpu))
+ nested_svm_update_tsc_ratio_msr(vcpu);
+
+ break;
case MSR_IA32_CR_PAT:
if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
return 1;
@@ -2918,7 +3046,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
svm->tsc_aux = data;
break;
case MSR_IA32_DEBUGCTLMSR:
- if (!boot_cpu_has(X86_FEATURE_LBRV)) {
+ if (!lbrv) {
vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
__func__, data);
break;
@@ -3035,11 +3163,6 @@ static int invpcid_interception(struct kvm_vcpu *vcpu)
type = svm->vmcb->control.exit_info_2;
gva = svm->vmcb->control.exit_info_1;
- if (type > 3) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
-
return kvm_handle_invpcid(vcpu, type, gva);
}
@@ -3278,11 +3401,13 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
return svm_exit_handlers[exit_code](vcpu);
}
-static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
+static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2,
u32 *intr_info, u32 *error_code)
{
struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
+ *reason = control->exit_code;
*info1 = control->exit_info_1;
*info2 = control->exit_info_2;
*intr_info = control->exit_int_info;
@@ -3299,7 +3424,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
struct kvm_run *kvm_run = vcpu->run;
u32 exit_code = svm->vmcb->control.exit_code;
- trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
+ trace_kvm_exit(vcpu, KVM_ISA_SVM);
/* SEV-ES guests must use the CR write traps to track CR registers. */
if (!sev_es_guest(vcpu->kvm)) {
@@ -3312,7 +3437,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
if (is_guest_mode(vcpu)) {
int vmexit;
- trace_kvm_nested_vmexit(exit_code, vcpu, KVM_ISA_SVM);
+ trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM);
vmexit = nested_svm_exit_special(svm);
@@ -3485,14 +3610,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
if (!gif_set(svm))
return true;
- if (sev_es_guest(vcpu->kvm)) {
- /*
- * SEV-ES guests to not expose RFLAGS. Use the VMCB interrupt mask
- * bit to determine the state of the IF flag.
- */
- if (!(vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK))
- return true;
- } else if (is_guest_mode(vcpu)) {
+ if (is_guest_mode(vcpu)) {
/* As long as interrupts are being delivered... */
if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
@@ -3503,7 +3621,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
if (nested_exit_on_intr(svm))
return false;
} else {
- if (!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
+ if (!svm_get_if_flag(vcpu))
return true;
}
@@ -3780,8 +3898,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
pre_svm_run(vcpu);
- WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu));
-
sync_lapic_to_cr8(vcpu);
if (unlikely(svm->asid != svm->vmcb->control.asid)) {
@@ -3848,9 +3964,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
}
+ vcpu->arch.regs_dirty = 0;
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
- kvm_before_interrupt(vcpu);
+ kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
kvm_load_host_xsave_state(vcpu);
stgi();
@@ -3882,8 +3999,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
vcpu->arch.apf.host_apf_flags =
kvm_read_and_reset_apf_flags();
- if (npt_enabled)
- kvm_register_clear_available(vcpu, VCPU_EXREG_PDPTR);
+ vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET;
/*
* We need to handle MC intercepts here before the vcpu has a chance to
@@ -3913,9 +4029,6 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
hv_track_root_tdp(vcpu, root_hpa);
- /* Loading L2's CR3 is handled by enter_svm_guest_mode. */
- if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
- return;
cr3 = vcpu->arch.cr3;
} else if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
@@ -4001,6 +4114,8 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
+ svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR);
+
svm_recalc_instruction_intercepts(vcpu, svm);
/* For sev guests, the memory encryption bit is not reserved in CR3. */
@@ -4027,33 +4142,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
kvm_request_apicv_update(vcpu->kvm, false,
APICV_INHIBIT_REASON_NESTED);
}
-
- if (guest_cpuid_is_intel(vcpu)) {
- /*
- * We must intercept SYSENTER_EIP and SYSENTER_ESP
- * accesses because the processor only stores 32 bits.
- * For the same reason we cannot use virtual VMLOAD/VMSAVE.
- */
- svm_set_intercept(svm, INTERCEPT_VMLOAD);
- svm_set_intercept(svm, INTERCEPT_VMSAVE);
- svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
-
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
- } else {
- /*
- * If hardware supports Virtual VMLOAD VMSAVE then enable it
- * in VMCB and clear intercepts to avoid #VMEXIT.
- */
- if (vls) {
- svm_clr_intercept(svm, INTERCEPT_VMLOAD);
- svm_clr_intercept(svm, INTERCEPT_VMSAVE);
- svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
- }
- /* No need to intercept these MSRs */
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
- }
+ init_vmcb_after_set_cpuid(vcpu);
}
static bool svm_has_wbinvd_exit(void)
@@ -4158,7 +4247,7 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
info->intercept == x86_intercept_clts)
break;
- if (!(vmcb_is_intercept(&svm->nested.ctl,
+ if (!(vmcb12_is_intercept(&svm->nested.ctl,
INTERCEPT_SELECTIVE_CR0)))
break;
@@ -4377,7 +4466,8 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
*/
vmcb12 = map.hva;
- nested_load_control_from_vmcb12(svm, &vmcb12->control);
+ nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
+ nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false);
unmap_save:
@@ -4520,6 +4610,8 @@ static int svm_vm_init(struct kvm *kvm)
}
static struct kvm_x86_ops svm_x86_ops __initdata = {
+ .name = "kvm_amd",
+
.hardware_unsetup = svm_hardware_teardown,
.hardware_enable = svm_hardware_enable,
.hardware_disable = svm_hardware_disable,
@@ -4550,6 +4642,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.get_cpl = svm_get_cpl,
.get_cs_db_l_bits = kvm_get_cs_db_l_bits,
.set_cr0 = svm_set_cr0,
+ .post_set_cr3 = svm_post_set_cr3,
.is_valid_cr4 = svm_is_valid_cr4,
.set_cr4 = svm_set_cr4,
.set_efer = svm_set_efer,
@@ -4562,6 +4655,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.cache_reg = svm_cache_reg,
.get_rflags = svm_get_rflags,
.set_rflags = svm_set_rflags,
+ .get_if_flag = svm_get_if_flag,
.tlb_flush_all = svm_flush_tlb,
.tlb_flush_current = svm_flush_tlb,
@@ -4592,7 +4686,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.load_eoi_exitmap = svm_load_eoi_exitmap,
.hwapic_irr_update = svm_hwapic_irr_update,
.hwapic_isr_update = svm_hwapic_isr_update,
- .sync_pir_to_irr = kvm_lapic_find_highest_irr,
.apicv_post_state_restore = avic_post_state_restore,
.set_tss_addr = svm_set_tss_addr,
@@ -4637,6 +4730,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.mem_enc_unreg_region = svm_unregister_enc_region,
.vm_copy_enc_context_from = svm_vm_copy_asid_from,
+ .vm_move_enc_context_from = svm_vm_migrate_from,
.can_emulate_instruction = svm_can_emulate_instruction,
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 128a54b1fbf1..9f153c59f2c8 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -32,6 +32,7 @@
extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
extern bool npt_enabled;
extern bool intercept_smi;
+extern bool pmu;
/*
* Clean bits in VMCB.
@@ -79,7 +80,9 @@ struct kvm_sev_info {
struct list_head regions_list; /* List of registered regions */
u64 ap_jump_table; /* SEV-ES AP Jump Table address */
struct kvm *enc_context_owner; /* Owner of copied encryption context */
+ unsigned long num_mirrored_vms; /* Number of VMs sharing this ASID */
struct misc_cg *misc_cg; /* For misc cgroup accounting */
+ atomic_t migration_in_progress;
};
struct kvm_svm {
@@ -103,6 +106,40 @@ struct kvm_vmcb_info {
uint64_t asid_generation;
};
+struct vmcb_save_area_cached {
+ u64 efer;
+ u64 cr4;
+ u64 cr3;
+ u64 cr0;
+ u64 dr7;
+ u64 dr6;
+};
+
+struct vmcb_ctrl_area_cached {
+ u32 intercepts[MAX_INTERCEPT];
+ u16 pause_filter_thresh;
+ u16 pause_filter_count;
+ u64 iopm_base_pa;
+ u64 msrpm_base_pa;
+ u64 tsc_offset;
+ u32 asid;
+ u8 tlb_ctl;
+ u32 int_ctl;
+ u32 int_vector;
+ u32 int_state;
+ u32 exit_code;
+ u32 exit_code_hi;
+ u64 exit_info_1;
+ u64 exit_info_2;
+ u32 exit_int_info;
+ u32 exit_int_info_err;
+ u64 nested_ctl;
+ u32 event_inj;
+ u32 event_inj_err;
+ u64 nested_cr3;
+ u64 virt_ext;
+};
+
struct svm_nested_state {
struct kvm_vmcb_info vmcb02;
u64 hsave_msr;
@@ -118,11 +155,31 @@ struct svm_nested_state {
bool nested_run_pending;
/* cache for control fields of the guest */
- struct vmcb_control_area ctl;
+ struct vmcb_ctrl_area_cached ctl;
+
+ /*
+ * Note: this struct is not kept up-to-date while L2 runs; it is only
+ * valid within nested_svm_vmrun.
+ */
+ struct vmcb_save_area_cached save;
bool initialized;
};
+struct vcpu_sev_es_state {
+ /* SEV-ES support */
+ struct vmcb_save_area *vmsa;
+ struct ghcb *ghcb;
+ struct kvm_host_map ghcb_map;
+ bool received_first_sipi;
+
+ /* SEV-ES scratch area support */
+ void *ghcb_sa;
+ u32 ghcb_sa_len;
+ bool ghcb_sa_sync;
+ bool ghcb_sa_free;
+};
+
struct vcpu_svm {
struct kvm_vcpu vcpu;
/* vmcb always points at current_vmcb->ptr, it's purely a shorthand. */
@@ -140,6 +197,8 @@ struct vcpu_svm {
u64 next_rip;
u64 spec_ctrl;
+
+ u64 tsc_ratio_msr;
/*
* Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be
* translated into the appropriate L2_CFG bits on the host to
@@ -160,7 +219,8 @@ struct vcpu_svm {
unsigned long int3_rip;
/* cached guest cpuid flags for faster access */
- bool nrips_enabled : 1;
+ bool nrips_enabled : 1;
+ bool tsc_scaling_enabled : 1;
u32 ldr_reg;
u32 dfr_reg;
@@ -183,17 +243,7 @@ struct vcpu_svm {
DECLARE_BITMAP(write, MAX_DIRECT_ACCESS_MSRS);
} shadow_msr_intercept;
- /* SEV-ES support */
- struct vmcb_save_area *vmsa;
- struct ghcb *ghcb;
- struct kvm_host_map ghcb_map;
- bool received_first_sipi;
-
- /* SEV-ES scratch area support */
- void *ghcb_sa;
- u64 ghcb_sa_len;
- bool ghcb_sa_sync;
- bool ghcb_sa_free;
+ struct vcpu_sev_es_state sev_es;
bool guest_state_loaded;
};
@@ -218,12 +268,12 @@ DECLARE_PER_CPU(struct svm_cpu_data *, svm_data);
void recalc_intercepts(struct vcpu_svm *svm);
-static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm)
+static __always_inline struct kvm_svm *to_kvm_svm(struct kvm *kvm)
{
return container_of(kvm, struct kvm_svm, kvm);
}
-static inline bool sev_guest(struct kvm *kvm)
+static __always_inline bool sev_guest(struct kvm *kvm)
{
#ifdef CONFIG_KVM_AMD_SEV
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
@@ -234,12 +284,12 @@ static inline bool sev_guest(struct kvm *kvm)
#endif
}
-static inline bool sev_es_guest(struct kvm *kvm)
+static __always_inline bool sev_es_guest(struct kvm *kvm)
{
#ifdef CONFIG_KVM_AMD_SEV
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- return sev_guest(kvm) && sev->es_active;
+ return sev->es_active && !WARN_ON_ONCE(!sev->active);
#else
return false;
#endif
@@ -271,11 +321,21 @@ static inline bool vmcb_is_dirty(struct vmcb *vmcb, int bit)
return !test_bit(bit, (unsigned long *)&vmcb->control.clean);
}
-static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
+static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
{
return container_of(vcpu, struct vcpu_svm, vcpu);
}
+/*
+ * Only the PDPTRs are loaded on demand into the shadow MMU. All other
+ * fields are synchronized in handle_exit, because accessing the VMCB is cheap.
+ *
+ * CR3 might be out of date in the VMCB but it is not marked dirty; instead,
+ * KVM_REQ_LOAD_MMU_PGD is always requested when the cached vcpu->arch.cr3
+ * is changed. svm_load_mmu_pgd() then syncs the new CR3 value into the VMCB.
+ */
+#define SVM_REGS_LAZY_LOAD_SET (1 << VCPU_EXREG_PDPTR)
+
static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit)
{
WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
@@ -294,6 +354,12 @@ static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit)
return test_bit(bit, (unsigned long *)&control->intercepts);
}
+static inline bool vmcb12_is_intercept(struct vmcb_ctrl_area_cached *control, u32 bit)
+{
+ WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
+ return test_bit(bit, (unsigned long *)&control->intercepts);
+}
+
static inline void set_dr_intercepts(struct vcpu_svm *svm)
{
struct vmcb *vmcb = svm->vmcb01.ptr;
@@ -446,17 +512,17 @@ static inline bool nested_svm_virtualize_tpr(struct kvm_vcpu *vcpu)
static inline bool nested_exit_on_smi(struct vcpu_svm *svm)
{
- return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SMI);
+ return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SMI);
}
static inline bool nested_exit_on_intr(struct vcpu_svm *svm)
{
- return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INTR);
+ return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_INTR);
}
static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
{
- return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_NMI);
+ return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_NMI);
}
int enter_svm_guest_mode(struct kvm_vcpu *vcpu,
@@ -483,8 +549,12 @@ int nested_svm_check_permissions(struct kvm_vcpu *vcpu);
int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
bool has_error_code, u32 error_code);
int nested_svm_exit_special(struct vcpu_svm *svm);
-void nested_load_control_from_vmcb12(struct vcpu_svm *svm,
- struct vmcb_control_area *control);
+void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu);
+void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier);
+void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm,
+ struct vmcb_control_area *control);
+void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm,
+ struct vmcb_save_area *save);
void nested_sync_control_from_vmcb02(struct vcpu_svm *svm);
void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm);
void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb);
@@ -553,6 +623,7 @@ int svm_register_enc_region(struct kvm *kvm,
int svm_unregister_enc_region(struct kvm *kvm,
struct kvm_enc_region *range);
int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd);
+int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd);
void pre_sev_run(struct vcpu_svm *svm, int cpu);
void __init sev_set_cpu_caps(void);
void __init sev_hardware_setup(void);
@@ -562,7 +633,7 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu);
int sev_handle_vmgexit(struct kvm_vcpu *vcpu);
int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
void sev_es_init_vmcb(struct vcpu_svm *svm);
-void sev_es_create_vcpu(struct vcpu_svm *svm);
+void sev_es_vcpu_reset(struct vcpu_svm *svm);
void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu);
void sev_es_unmap_ghcb(struct vcpu_svm *svm);
diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h
index 22e2b019de37..9430d6437c9f 100644
--- a/arch/x86/kvm/svm/svm_ops.h
+++ b/arch/x86/kvm/svm/svm_ops.h
@@ -56,12 +56,12 @@ static inline void invlpga(unsigned long addr, u32 asid)
* VMSAVE, VMLOAD, etc... is still controlled by the effective address size,
* hence 'unsigned long' instead of 'hpa_t'.
*/
-static inline void vmsave(unsigned long pa)
+static __always_inline void vmsave(unsigned long pa)
{
svm_asm1(vmsave, "a" (pa), "memory");
}
-static inline void vmload(unsigned long pa)
+static __always_inline void vmload(unsigned long pa)
{
svm_asm1(vmload, "a" (pa), "memory");
}
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index 4fa17df123cd..dfaeb47fcf2a 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -148,7 +148,7 @@ SYM_FUNC_START(__svm_vcpu_run)
pop %edi
#endif
pop %_ASM_BP
- ret
+ RET
3: cmpb $0, kvm_rebooting
jne 2b
@@ -202,7 +202,7 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
pop %edi
#endif
pop %_ASM_BP
- ret
+ RET
3: cmpb $0, kvm_rebooting
jne 2b
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 03ebe368333e..92e6f6702f00 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -288,8 +288,8 @@ TRACE_EVENT(kvm_apic,
#define TRACE_EVENT_KVM_EXIT(name) \
TRACE_EVENT(name, \
- TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa), \
- TP_ARGS(exit_reason, vcpu, isa), \
+ TP_PROTO(struct kvm_vcpu *vcpu, u32 isa), \
+ TP_ARGS(vcpu, isa), \
\
TP_STRUCT__entry( \
__field( unsigned int, exit_reason ) \
@@ -303,11 +303,12 @@ TRACE_EVENT(name, \
), \
\
TP_fast_assign( \
- __entry->exit_reason = exit_reason; \
__entry->guest_rip = kvm_rip_read(vcpu); \
__entry->isa = isa; \
__entry->vcpu_id = vcpu->vcpu_id; \
- static_call(kvm_x86_get_exit_info)(vcpu, &__entry->info1, \
+ static_call(kvm_x86_get_exit_info)(vcpu, \
+ &__entry->exit_reason, \
+ &__entry->info1, \
&__entry->info2, \
&__entry->intr_info, \
&__entry->error_code); \
@@ -1355,6 +1356,30 @@ TRACE_EVENT(kvm_apicv_update_request,
__entry->bit)
);
+TRACE_EVENT(kvm_apicv_accept_irq,
+ TP_PROTO(__u32 apicid, __u16 dm, __u16 tm, __u8 vec),
+ TP_ARGS(apicid, dm, tm, vec),
+
+ TP_STRUCT__entry(
+ __field( __u32, apicid )
+ __field( __u16, dm )
+ __field( __u16, tm )
+ __field( __u8, vec )
+ ),
+
+ TP_fast_assign(
+ __entry->apicid = apicid;
+ __entry->dm = dm;
+ __entry->tm = tm;
+ __entry->vec = vec;
+ ),
+
+ TP_printk("apicid %x vec %u (%s|%s)",
+ __entry->apicid, __entry->vec,
+ __print_symbolic((__entry->dm >> 8 & 0x7), kvm_deliver_mode),
+ __entry->tm ? "level" : "edge")
+);
+
/*
* Tracepoint for AMD AVIC
*/
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 4705ad55abb5..c8029b7845b6 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -312,6 +312,15 @@ static inline bool cpu_has_vmx_ept_1g_page(void)
return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
}
+static inline int ept_caps_to_lpage_level(u32 ept_caps)
+{
+ if (ept_caps & VMX_EPT_1GB_PAGE_BIT)
+ return PG_LEVEL_1G;
+ if (ept_caps & VMX_EPT_2MB_PAGE_BIT)
+ return PG_LEVEL_2M;
+ return PG_LEVEL_4K;
+}
+
static inline bool cpu_has_vmx_ept_ad_bits(void)
{
return vmx_capability.ept & VMX_EPT_AD_BIT;
diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h
index 152ab0aa82cf..16731d2cf231 100644
--- a/arch/x86/kvm/vmx/evmcs.h
+++ b/arch/x86/kvm/vmx/evmcs.h
@@ -93,7 +93,7 @@ static __always_inline int get_evmcs_offset(unsigned long field,
return evmcs_field->offset;
}
-static inline void evmcs_write64(unsigned long field, u64 value)
+static __always_inline void evmcs_write64(unsigned long field, u64 value)
{
u16 clean_field;
int offset = get_evmcs_offset(field, &clean_field);
@@ -183,7 +183,7 @@ static inline void evmcs_load(u64 phys_addr)
__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf);
#else /* !IS_ENABLED(CONFIG_HYPERV) */
-static inline void evmcs_write64(unsigned long field, u64 value) {}
+static __always_inline void evmcs_write64(unsigned long field, u64 value) {}
static inline void evmcs_write32(unsigned long field, u32 value) {}
static inline void evmcs_write16(unsigned long field, u16 value) {}
static inline u64 evmcs_read64(unsigned long field) { return 0; }
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index eedcebf58004..f235f77cbc03 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -191,7 +191,7 @@ static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
* failValid writes the error number to the current VMCS, which
* can't be done if there isn't a current VMCS.
*/
- if (vmx->nested.current_vmptr == -1ull &&
+ if (vmx->nested.current_vmptr == INVALID_GPA &&
!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
return nested_vmx_failInvalid(vcpu);
@@ -218,7 +218,7 @@ static inline u64 vmx_control_msr(u32 low, u32 high)
static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
{
secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
- vmcs_write64(VMCS_LINK_POINTER, -1ull);
+ vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
vmx->nested.need_vmcs12_to_shadow_sync = false;
}
@@ -245,7 +245,8 @@ static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
src = &prev->host_state;
dest = &vmx->loaded_vmcs->host_state;
- vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
+ vmx_set_vmcs_host_state(dest, src->cr3, src->fs_sel, src->gs_sel,
+ src->fs_base, src->gs_base);
dest->ldt_sel = src->ldt_sel;
#ifdef CONFIG_X86_64
dest->ds_sel = src->ds_sel;
@@ -269,7 +270,13 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
vmx_sync_vmcs_host_state(vmx, prev);
put_cpu();
- vmx_register_cache_reset(vcpu);
+ vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET;
+
+ /*
+ * All lazily updated registers will be reloaded from VMCS12 on both
+ * vmentry and vmexit.
+ */
+ vcpu->arch.regs_dirty = 0;
}
/*
@@ -290,9 +297,10 @@ static void free_nested(struct kvm_vcpu *vcpu)
vmx->nested.vmxon = false;
vmx->nested.smm.vmxon = false;
+ vmx->nested.vmxon_ptr = INVALID_GPA;
free_vpid(vmx->nested.vpid02);
vmx->nested.posted_intr_nv = -1;
- vmx->nested.current_vmptr = -1ull;
+ vmx->nested.current_vmptr = INVALID_GPA;
if (enable_shadow_vmcs) {
vmx_disable_shadow_vmcs(vmx);
vmcs_clear(vmx->vmcs01.shadow_vmcs);
@@ -390,9 +398,11 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)
{
- kvm_init_shadow_ept_mmu(vcpu,
- to_vmx(vcpu)->nested.msrs.ept_caps &
- VMX_EPT_EXECUTE_ONLY_BIT,
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT;
+ int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps);
+
+ kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level,
nested_ept_ad_enabled(vcpu),
nested_ept_get_eptp(vcpu));
}
@@ -524,67 +534,19 @@ static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
}
/*
- * Check if MSR is intercepted for L01 MSR bitmap.
- */
-static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
-{
- unsigned long *msr_bitmap;
- int f = sizeof(unsigned long);
-
- if (!cpu_has_vmx_msr_bitmap())
- return true;
-
- msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
-
- if (msr <= 0x1fff) {
- return !!test_bit(msr, msr_bitmap + 0x800 / f);
- } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
- msr &= 0x1fff;
- return !!test_bit(msr, msr_bitmap + 0xc00 / f);
- }
-
- return true;
-}
-
-/*
- * If a msr is allowed by L0, we should check whether it is allowed by L1.
- * The corresponding bit will be cleared unless both of L0 and L1 allow it.
+ * For x2APIC MSRs, ignore the vmcs01 bitmap. L1 can enable x2APIC without L1
+ * itself utilizing x2APIC. All MSRs were previously set to be intercepted,
+ * only the "disable intercept" case needs to be handled.
*/
-static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
- unsigned long *msr_bitmap_nested,
- u32 msr, int type)
+static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1,
+ unsigned long *msr_bitmap_l0,
+ u32 msr, int type)
{
- int f = sizeof(unsigned long);
-
- /*
- * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
- * have the write-low and read-high bitmap offsets the wrong way round.
- * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
- */
- if (msr <= 0x1fff) {
- if (type & MSR_TYPE_R &&
- !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
- /* read-low */
- __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
-
- if (type & MSR_TYPE_W &&
- !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
- /* write-low */
- __clear_bit(msr, msr_bitmap_nested + 0x800 / f);
-
- } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
- msr &= 0x1fff;
- if (type & MSR_TYPE_R &&
- !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
- /* read-high */
- __clear_bit(msr, msr_bitmap_nested + 0x400 / f);
+ if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr))
+ vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr);
- if (type & MSR_TYPE_W &&
- !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
- /* write-high */
- __clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
-
- }
+ if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr))
+ vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr);
}
static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
@@ -599,6 +561,34 @@ static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
}
}
+#define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw) \
+static inline \
+void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx, \
+ unsigned long *msr_bitmap_l1, \
+ unsigned long *msr_bitmap_l0, u32 msr) \
+{ \
+ if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) || \
+ vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr)) \
+ vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr); \
+ else \
+ vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr); \
+}
+BUILD_NVMX_MSR_INTERCEPT_HELPER(read)
+BUILD_NVMX_MSR_INTERCEPT_HELPER(write)
+
+static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx,
+ unsigned long *msr_bitmap_l1,
+ unsigned long *msr_bitmap_l0,
+ u32 msr, int types)
+{
+ if (types & MSR_TYPE_R)
+ nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1,
+ msr_bitmap_l0, msr);
+ if (types & MSR_TYPE_W)
+ nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1,
+ msr_bitmap_l0, msr);
+}
+
/*
* Merge L0's and L1's MSR bitmap, return false to indicate that
* we do not use the hardware.
@@ -606,16 +596,31 @@ static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
int msr;
unsigned long *msr_bitmap_l1;
- unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
- struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;
+ unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
+ struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
+ struct kvm_host_map *map = &vmx->nested.msr_bitmap_map;
/* Nothing to do if the MSR bitmap is not in use. */
if (!cpu_has_vmx_msr_bitmap() ||
!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
return false;
+ /*
+ * MSR bitmap update can be skipped when:
+ * - MSR bitmap for L1 hasn't changed.
+ * - Nested hypervisor (L1) is attempting to launch the same L2 as
+ * before.
+ * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature
+ * and tells KVM (L0) there were no changes in MSR bitmap for L2.
+ */
+ if (!vmx->nested.force_msr_bitmap_recalc && evmcs &&
+ evmcs->hv_enlightenments_control.msr_bitmap &&
+ evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP)
+ return true;
+
if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
return false;
@@ -624,7 +629,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
/*
* To keep the control flow simple, pay eight 8-byte writes (sixteen
* 4-byte writes on 32-bit systems) up front to enable intercepts for
- * the x2APIC MSR range and selectively disable them below.
+ * the x2APIC MSR range and selectively toggle those relevant to L2.
*/
enable_x2apic_msr_intercepts(msr_bitmap_l0);
@@ -643,61 +648,46 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
}
}
- nested_vmx_disable_intercept_for_msr(
+ nested_vmx_disable_intercept_for_x2apic_msr(
msr_bitmap_l1, msr_bitmap_l0,
X2APIC_MSR(APIC_TASKPRI),
MSR_TYPE_R | MSR_TYPE_W);
if (nested_cpu_has_vid(vmcs12)) {
- nested_vmx_disable_intercept_for_msr(
+ nested_vmx_disable_intercept_for_x2apic_msr(
msr_bitmap_l1, msr_bitmap_l0,
X2APIC_MSR(APIC_EOI),
MSR_TYPE_W);
- nested_vmx_disable_intercept_for_msr(
+ nested_vmx_disable_intercept_for_x2apic_msr(
msr_bitmap_l1, msr_bitmap_l0,
X2APIC_MSR(APIC_SELF_IPI),
MSR_TYPE_W);
}
}
- /* KVM unconditionally exposes the FS/GS base MSRs to L1. */
+ /*
+ * Always check vmcs01's bitmap to honor userspace MSR filters and any
+ * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through.
+ */
#ifdef CONFIG_X86_64
- nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
- MSR_FS_BASE, MSR_TYPE_RW);
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_FS_BASE, MSR_TYPE_RW);
- nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
- MSR_GS_BASE, MSR_TYPE_RW);
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_GS_BASE, MSR_TYPE_RW);
- nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
- MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
#endif
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);
- /*
- * Checking the L0->L1 bitmap is trying to verify two things:
- *
- * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
- * ensures that we do not accidentally generate an L02 MSR bitmap
- * from the L12 MSR bitmap that is too permissive.
- * 2. That L1 or L2s have actually used the MSR. This avoids
- * unnecessarily merging of the bitmap if the MSR is unused. This
- * works properly because we only update the L01 MSR bitmap lazily.
- * So even if L0 should pass L1 these MSRs, the L01 bitmap is only
- * updated to reflect this when L1 (or its L2s) actually write to
- * the MSR.
- */
- if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL))
- nested_vmx_disable_intercept_for_msr(
- msr_bitmap_l1, msr_bitmap_l0,
- MSR_IA32_SPEC_CTRL,
- MSR_TYPE_R | MSR_TYPE_W);
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_PRED_CMD, MSR_TYPE_W);
- if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD))
- nested_vmx_disable_intercept_for_msr(
- msr_bitmap_l1, msr_bitmap_l0,
- MSR_IA32_PRED_CMD,
- MSR_TYPE_W);
+ kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false);
- kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);
+ vmx->nested.force_msr_bitmap_recalc = false;
return true;
}
@@ -705,33 +695,39 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
- struct kvm_host_map map;
- struct vmcs12 *shadow;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
- vmcs12->vmcs_link_pointer == -1ull)
+ vmcs12->vmcs_link_pointer == INVALID_GPA)
return;
- shadow = get_shadow_vmcs12(vcpu);
-
- if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
+ if (ghc->gpa != vmcs12->vmcs_link_pointer &&
+ kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
+ vmcs12->vmcs_link_pointer, VMCS12_SIZE))
return;
- memcpy(shadow, map.hva, VMCS12_SIZE);
- kvm_vcpu_unmap(vcpu, &map, false);
+ kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
+ VMCS12_SIZE);
}
static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
- vmcs12->vmcs_link_pointer == -1ull)
+ vmcs12->vmcs_link_pointer == INVALID_GPA)
+ return;
+
+ if (ghc->gpa != vmcs12->vmcs_link_pointer &&
+ kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
+ vmcs12->vmcs_link_pointer, VMCS12_SIZE))
return;
- kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
- get_shadow_vmcs12(vcpu), VMCS12_SIZE);
+ kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
+ VMCS12_SIZE);
}
/*
@@ -1124,7 +1120,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
* must not be dereferenced.
*/
if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) &&
- CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) {
+ CC(!load_pdptrs(vcpu, cr3))) {
*entry_failure_code = ENTRY_FAIL_PDPTE;
return -EINVAL;
}
@@ -1133,7 +1129,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
kvm_mmu_new_pgd(vcpu, cr3);
vcpu->arch.cr3 = cr3;
- kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
kvm_init_mmu(vcpu);
@@ -1191,29 +1187,26 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
WARN_ON(!enable_vpid);
/*
- * If VPID is enabled and used by vmc12, but L2 does not have a unique
- * TLB tag (ASID), i.e. EPT is disabled and KVM was unable to allocate
- * a VPID for L2, flush the current context as the effective ASID is
- * common to both L1 and L2.
- *
- * Defer the flush so that it runs after vmcs02.EPTP has been set by
- * KVM_REQ_LOAD_MMU_PGD (if nested EPT is enabled) and to avoid
- * redundant flushes further down the nested pipeline.
- *
- * If a TLB flush isn't required due to any of the above, and vpid12 is
- * changing then the new "virtual" VPID (vpid12) will reuse the same
- * "real" VPID (vpid02), and so needs to be flushed. There's no direct
- * mapping between vpid02 and vpid12, vpid02 is per-vCPU and reused for
- * all nested vCPUs. Remember, a flush on VM-Enter does not invalidate
- * guest-physical mappings, so there is no need to sync the nEPT MMU.
+ * VPID is enabled and in use by vmcs12. If vpid12 is changing, then
+ * emulate a guest TLB flush as KVM does not track vpid12 history nor
+ * is the VPID incorporated into the MMU context. I.e. KVM must assume
+ * that the new vpid12 has never been used and thus represents a new
+ * guest ASID that cannot have entries in the TLB.
*/
- if (!nested_has_guest_tlb_tag(vcpu)) {
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
- } else if (is_vmenter &&
- vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
+ if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
vmx->nested.last_vpid = vmcs12->virtual_processor_id;
- vpid_sync_context(nested_get_vpid02(vcpu));
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ return;
}
+
+ /*
+ * If VPID is enabled, used by vmc12, and vpid12 is not changing but
+ * does not have a unique TLB tag (ASID), i.e. EPT is disabled and
+ * KVM was unable to allocate a VPID for L2, flush the current context
+ * as the effective ASID is common to both L1 and L2.
+ */
+ if (!nested_has_guest_tlb_tag(vcpu))
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
@@ -1994,7 +1987,7 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
}
if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
- vmx->nested.current_vmptr = -1ull;
+ vmx->nested.current_vmptr = INVALID_GPA;
nested_release_evmcs(vcpu);
@@ -2053,10 +2046,13 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
* Clean fields data can't be used on VMLAUNCH and when we switch
* between different L2 guests as KVM keeps a single VMCS12 per L1.
*/
- if (from_launch || evmcs_gpa_changed)
+ if (from_launch || evmcs_gpa_changed) {
vmx->nested.hv_evmcs->hv_clean_fields &=
~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+ vmx->nested.force_msr_bitmap_recalc = true;
+ }
+
return EVMPTRLD_SUCCEEDED;
}
@@ -2178,7 +2174,7 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
}
if (cpu_has_vmx_encls_vmexit())
- vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
+ vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA);
/*
* Set the MSR load/store lists to match L0's settings. Only the
@@ -2197,7 +2193,7 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
{
prepare_vmcs02_constant_state(vmx);
- vmcs_write64(VMCS_LINK_POINTER, -1ull);
+ vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
if (enable_vpid) {
if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
@@ -2623,8 +2619,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
- vmcs12->guest_ia32_perf_global_ctrl)))
+ vmcs12->guest_ia32_perf_global_ctrl))) {
+ *entry_failure_code = ENTRY_FAIL_DEFAULT;
return -EINVAL;
+ }
kvm_rsp_write(vcpu, vmcs12->guest_rsp);
kvm_rip_write(vcpu, vmcs12->guest_rip);
@@ -2865,6 +2863,17 @@ static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
return 0;
}
+static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+#ifdef CONFIG_X86_64
+ if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) !=
+ !!(vcpu->arch.efer & EFER_LMA)))
+ return -EINVAL;
+#endif
+ return 0;
+}
+
static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
@@ -2889,18 +2898,16 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
return -EINVAL;
#ifdef CONFIG_X86_64
- ia32e = !!(vcpu->arch.efer & EFER_LMA);
+ ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE);
#else
ia32e = false;
#endif
if (ia32e) {
- if (CC(!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) ||
- CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
+ if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
return -EINVAL;
} else {
- if (CC(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) ||
- CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ||
+ if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ||
CC(vmcs12->host_cr4 & X86_CR4_PCIDE) ||
CC((vmcs12->host_rip) >> 32))
return -EINVAL;
@@ -2945,27 +2952,31 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
- int r = 0;
- struct vmcs12 *shadow;
- struct kvm_host_map map;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
+ struct vmcs_hdr hdr;
- if (vmcs12->vmcs_link_pointer == -1ull)
+ if (vmcs12->vmcs_link_pointer == INVALID_GPA)
return 0;
if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)))
return -EINVAL;
- if (CC(kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map)))
- return -EINVAL;
+ if (ghc->gpa != vmcs12->vmcs_link_pointer &&
+ CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
+ vmcs12->vmcs_link_pointer, VMCS12_SIZE)))
+ return -EINVAL;
- shadow = map.hva;
+ if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr,
+ offsetof(struct vmcs12, hdr),
+ sizeof(hdr))))
+ return -EINVAL;
- if (CC(shadow->hdr.revision_id != VMCS12_REVISION) ||
- CC(shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)))
- r = -EINVAL;
+ if (CC(hdr.revision_id != VMCS12_REVISION) ||
+ CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)))
+ return -EINVAL;
- kvm_vcpu_unmap(vcpu, &map, false);
- return r;
+ return 0;
}
/*
@@ -3044,7 +3055,7 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long cr3, cr4;
+ unsigned long cr4;
bool vm_fail;
if (!nested_early_check)
@@ -3067,12 +3078,6 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
*/
vmcs_writel(GUEST_RFLAGS, 0);
- cr3 = __get_current_cr3_fast();
- if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
- vmcs_writel(HOST_CR3, cr3);
- vmx->loaded_vmcs->host_state.cr3 = cr3;
- }
-
cr4 = cr4_read_shadow();
if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
vmcs_writel(HOST_CR4, cr4);
@@ -3162,7 +3167,7 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
* the guest CR3 might be restored prior to setting the nested
* state which can lead to a load of wrong PDPTRs.
*/
- if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3)))
+ if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
return false;
}
@@ -3216,7 +3221,7 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
* Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
* force VM-Entry to fail.
*/
- vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA);
}
}
@@ -3360,8 +3365,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
};
u32 failed_index;
- if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
- kvm_vcpu_flush_tlb_current(vcpu);
+ kvm_service_local_tlb_flush_requests(vcpu);
evaluate_pending_interrupts = exec_controls_get(vmx) &
(CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
@@ -3522,12 +3526,15 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
if (evmptrld_status == EVMPTRLD_ERROR) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
- } else if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) {
- return nested_vmx_failInvalid(vcpu);
}
+ kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
+
+ if (CC(evmptrld_status == EVMPTRLD_VMFAIL))
+ return nested_vmx_failInvalid(vcpu);
+
if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) &&
- vmx->nested.current_vmptr == -1ull))
+ vmx->nested.current_vmptr == INVALID_GPA))
return nested_vmx_failInvalid(vcpu);
vmcs12 = get_vmcs12(vcpu);
@@ -3570,6 +3577,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
if (nested_vmx_check_controls(vcpu, vmcs12))
return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+ if (nested_vmx_check_address_space_size(vcpu, vmcs12))
+ return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
+
if (nested_vmx_check_host_state(vcpu, vmcs12))
return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
@@ -3618,7 +3628,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
!(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) &&
(vmcs12->guest_rflags & X86_EFLAGS_IF))) {
vmx->nested.nested_run_pending = 0;
- return kvm_vcpu_halt(vcpu);
+ return kvm_emulate_halt_noskip(vcpu);
}
break;
case GUEST_ACTIVITY_WAIT_SIPI:
@@ -4515,9 +4525,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
(void)nested_get_evmcs_page(vcpu);
}
- /* Service the TLB flush request for L2 before switching to L1. */
- if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
- kvm_vcpu_flush_tlb_current(vcpu);
+ /* Service pending TLB flush requests for L2 before switching to L1. */
+ kvm_service_local_tlb_flush_requests(vcpu);
/*
* VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between
@@ -4870,6 +4879,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
if (!vmx->nested.cached_vmcs12)
goto out_cached_vmcs12;
+ vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA;
vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
if (!vmx->nested.cached_shadow_vmcs12)
goto out_cached_shadow_vmcs12;
@@ -4975,7 +4985,7 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (vmx->nested.current_vmptr == -1ull)
+ if (vmx->nested.current_vmptr == INVALID_GPA)
return;
copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
@@ -4995,7 +5005,7 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
- vmx->nested.current_vmptr = -1ull;
+ vmx->nested.current_vmptr = INVALID_GPA;
}
/* Emulate the VMXOFF instruction */
@@ -5090,12 +5100,12 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
return 1;
/*
- * In VMX non-root operation, when the VMCS-link pointer is -1ull,
+ * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
* any VMREAD sets the ALU flags for VMfailInvalid.
*/
- if (vmx->nested.current_vmptr == -1ull ||
+ if (vmx->nested.current_vmptr == INVALID_GPA ||
(is_guest_mode(vcpu) &&
- get_vmcs12(vcpu)->vmcs_link_pointer == -1ull))
+ get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
return nested_vmx_failInvalid(vcpu);
/* Decode instruction info and find the field to read */
@@ -5182,12 +5192,12 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
return 1;
/*
- * In VMX non-root operation, when the VMCS-link pointer is -1ull,
+ * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
* any VMWRITE sets the ALU flags for VMfailInvalid.
*/
- if (vmx->nested.current_vmptr == -1ull ||
+ if (vmx->nested.current_vmptr == INVALID_GPA ||
(is_guest_mode(vcpu) &&
- get_vmcs12(vcpu)->vmcs_link_pointer == -1ull))
+ get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
return nested_vmx_failInvalid(vcpu);
if (instr_info & BIT(10))
@@ -5273,6 +5283,7 @@ static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
vmx->nested.need_vmcs12_to_shadow_sync = true;
}
vmx->nested.dirty_vmcs12 = true;
+ vmx->nested.force_msr_bitmap_recalc = true;
}
/* Emulate the VMPTRLD instruction */
@@ -5299,10 +5310,10 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
return 1;
if (vmx->nested.current_vmptr != vmptr) {
- struct kvm_host_map map;
- struct vmcs12 *new_vmcs12;
+ struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache;
+ struct vmcs_hdr hdr;
- if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) {
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) {
/*
* Reads from an unbacked page return all 1s,
* which means that the 32 bits located at the
@@ -5313,12 +5324,16 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
}
- new_vmcs12 = map.hva;
+ if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr,
+ offsetof(struct vmcs12, hdr),
+ sizeof(hdr))) {
+ return nested_vmx_fail(vcpu,
+ VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
+ }
- if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
- (new_vmcs12->hdr.shadow_vmcs &&
+ if (hdr.revision_id != VMCS12_REVISION ||
+ (hdr.shadow_vmcs &&
!nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
- kvm_vcpu_unmap(vcpu, &map, false);
return nested_vmx_fail(vcpu,
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
}
@@ -5329,8 +5344,11 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
* Load VMCS12 from guest memory since it is not already
* cached.
*/
- memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
- kvm_vcpu_unmap(vcpu, &map, false);
+ if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12,
+ VMCS12_SIZE)) {
+ return nested_vmx_fail(vcpu,
+ VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
+ }
set_current_vmptr(vmx, vmptr);
}
@@ -5378,7 +5396,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
struct {
u64 eptp, gpa;
} operand;
- int i, r;
+ int i, r, gpr_index;
if (!(vmx->nested.msrs.secondary_ctls_high &
SECONDARY_EXEC_ENABLE_EPT) ||
@@ -5391,7 +5409,8 @@ static int handle_invept(struct kvm_vcpu *vcpu)
return 1;
vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
- type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
+ gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
+ type = kvm_register_read(vcpu, gpr_index);
types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
@@ -5458,7 +5477,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
u64 gla;
} operand;
u16 vpid02;
- int r;
+ int r, gpr_index;
if (!(vmx->nested.msrs.secondary_ctls_high &
SECONDARY_EXEC_ENABLE_VPID) ||
@@ -5471,7 +5490,8 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
return 1;
vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
- type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
+ gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
+ type = kvm_register_read(vcpu, gpr_index);
types = (vmx->nested.msrs.vpid_caps &
VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
@@ -5630,7 +5650,7 @@ bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
gpa_t bitmap, last_bitmap;
u8 b;
- last_bitmap = (gpa_t)-1;
+ last_bitmap = INVALID_GPA;
b = -1;
while (size > 0) {
@@ -6065,7 +6085,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
goto reflect_vmexit;
}
- trace_kvm_nested_vmexit(exit_reason.full, vcpu, KVM_ISA_VMX);
+ trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX);
/* If L0 (KVM) wants the exit, it trumps L1's desires. */
if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
@@ -6106,8 +6126,8 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
.format = KVM_STATE_NESTED_FORMAT_VMX,
.size = sizeof(kvm_state),
.hdr.vmx.flags = 0,
- .hdr.vmx.vmxon_pa = -1ull,
- .hdr.vmx.vmcs12_pa = -1ull,
+ .hdr.vmx.vmxon_pa = INVALID_GPA,
+ .hdr.vmx.vmcs12_pa = INVALID_GPA,
.hdr.vmx.preemption_timer_deadline = 0,
};
struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
@@ -6133,7 +6153,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
if (is_guest_mode(vcpu) &&
nested_cpu_has_shadow_vmcs(vmcs12) &&
- vmcs12->vmcs_link_pointer != -1ull)
+ vmcs12->vmcs_link_pointer != INVALID_GPA)
kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12);
}
@@ -6209,7 +6229,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
return -EFAULT;
if (nested_cpu_has_shadow_vmcs(vmcs12) &&
- vmcs12->vmcs_link_pointer != -1ull) {
+ vmcs12->vmcs_link_pointer != INVALID_GPA) {
if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
get_shadow_vmcs12(vcpu), VMCS12_SIZE))
return -EFAULT;
@@ -6244,11 +6264,11 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
return -EINVAL;
- if (kvm_state->hdr.vmx.vmxon_pa == -1ull) {
+ if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) {
if (kvm_state->hdr.vmx.smm.flags)
return -EINVAL;
- if (kvm_state->hdr.vmx.vmcs12_pa != -1ull)
+ if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)
return -EINVAL;
/*
@@ -6302,7 +6322,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
vmx_leave_nested(vcpu);
- if (kvm_state->hdr.vmx.vmxon_pa == -1ull)
+ if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA)
return 0;
vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
@@ -6315,13 +6335,13 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
/* See vmx_has_valid_vmcs12. */
if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) ||
(kvm_state->flags & KVM_STATE_NESTED_EVMCS) ||
- (kvm_state->hdr.vmx.vmcs12_pa != -1ull))
+ (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA))
return -EINVAL;
else
return 0;
}
- if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) {
+ if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) {
if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
!page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
return -EINVAL;
@@ -6366,7 +6386,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
ret = -EINVAL;
if (nested_cpu_has_shadow_vmcs(vmcs12) &&
- vmcs12->vmcs_link_pointer != -1ull) {
+ vmcs12->vmcs_link_pointer != INVALID_GPA) {
struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
if (kvm_state->size <
@@ -6399,6 +6419,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
goto error_guest_mode;
vmx->nested.dirty_vmcs12 = true;
+ vmx->nested.force_msr_bitmap_recalc = true;
ret = nested_vmx_enter_non_root_mode(vcpu, false);
if (ret)
goto error_guest_mode;
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 10cc4f65c4ef..5e0ac57d6d1b 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -68,16 +68,17 @@ static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data)
reprogram_counter(pmu, bit);
}
-static unsigned intel_find_arch_event(struct kvm_pmu *pmu,
- u8 event_select,
- u8 unit_mask)
+static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc)
{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+ u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
+ u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
int i;
for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++)
- if (intel_arch_events[i].eventsel == event_select
- && intel_arch_events[i].unit_mask == unit_mask
- && (pmu->available_event_types & (1 << i)))
+ if (intel_arch_events[i].eventsel == event_select &&
+ intel_arch_events[i].unit_mask == unit_mask &&
+ (pmc_is_fixed(pmc) || pmu->available_event_types & (1 << i)))
break;
if (i == ARRAY_SIZE(intel_arch_events))
@@ -86,18 +87,6 @@ static unsigned intel_find_arch_event(struct kvm_pmu *pmu,
return intel_arch_events[i].event_type;
}
-static unsigned intel_find_fixed_event(int idx)
-{
- u32 event;
- size_t size = ARRAY_SIZE(fixed_pmc_events);
-
- if (idx >= size)
- return PERF_COUNT_HW_MAX;
-
- event = fixed_pmc_events[array_index_nospec(idx, size)];
- return intel_arch_events[event].event_type;
-}
-
/* check if a PMC is enabled by comparing it with globl_ctrl bits. */
static bool intel_pmc_is_enabled(struct kvm_pmc *pmc)
{
@@ -118,16 +107,15 @@ static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
}
}
-/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */
-static int intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
+static bool intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
bool fixed = idx & (1u << 30);
idx &= ~(3u << 30);
- return (!fixed && idx >= pmu->nr_arch_gp_counters) ||
- (fixed && idx >= pmu->nr_arch_fixed_counters);
+ return fixed ? idx < pmu->nr_arch_fixed_counters
+ : idx < pmu->nr_arch_gp_counters;
}
static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
@@ -365,7 +353,7 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = pmu->global_ctrl;
return 0;
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
- msr_info->data = pmu->global_ovf_ctrl;
+ msr_info->data = 0;
return 0;
default:
if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
@@ -423,7 +411,6 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!(data & pmu->global_ovf_ctrl_mask)) {
if (!msr_info->host_initiated)
pmu->global_status &= ~data;
- pmu->global_ovf_ctrl = data;
return 0;
}
break;
@@ -461,6 +448,21 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
}
+static void setup_fixed_pmc_eventsel(struct kvm_pmu *pmu)
+{
+ size_t size = ARRAY_SIZE(fixed_pmc_events);
+ struct kvm_pmc *pmc;
+ u32 event;
+ int i;
+
+ for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
+ pmc = &pmu->fixed_counters[i];
+ event = fixed_pmc_events[array_index_nospec(i, size)];
+ pmc->eventsel = (intel_arch_events[event].unit_mask << 8) |
+ intel_arch_events[event].eventsel;
+ }
+}
+
static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -502,12 +504,14 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->nr_arch_fixed_counters = 0;
} else {
pmu->nr_arch_fixed_counters =
- min_t(int, edx.split.num_counters_fixed,
- x86_pmu.num_counters_fixed);
+ min3(ARRAY_SIZE(fixed_pmc_events),
+ (size_t) edx.split.num_counters_fixed,
+ (size_t) x86_pmu.num_counters_fixed);
edx.split.bit_width_fixed = min_t(int,
edx.split.bit_width_fixed, x86_pmu.bit_width_fixed);
pmu->counter_bitmask[KVM_PMC_FIXED] =
((u64)1 << edx.split.bit_width_fixed) - 1;
+ setup_fixed_pmc_eventsel(pmu);
}
pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) |
@@ -588,8 +592,7 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
pmc->counter = 0;
}
- pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status =
- pmu->global_ovf_ctrl = 0;
+ pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0;
intel_pmu_release_guest_lbr_event(vcpu);
}
@@ -706,8 +709,7 @@ static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
}
struct kvm_pmu_ops intel_pmu_ops = {
- .find_arch_event = intel_find_arch_event,
- .find_fixed_event = intel_find_fixed_event,
+ .pmc_perf_hw_id = intel_pmc_perf_hw_id,
.pmc_is_enabled = intel_pmc_is_enabled,
.pmc_idx_to_pmc = intel_pmc_idx_to_pmc,
.rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc,
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 5f81ef092bd4..88c53c521094 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -5,15 +5,28 @@
#include <asm/cpu.h>
#include "lapic.h"
+#include "irq.h"
#include "posted_intr.h"
#include "trace.h"
#include "vmx.h"
/*
- * We maintain a per-CPU linked-list of vCPU, so in wakeup_handler() we
- * can find which vCPU should be waken up.
+ * Maintain a per-CPU list of vCPUs that need to be awakened by wakeup_handler()
+ * when a WAKEUP_VECTOR interrupted is posted. vCPUs are added to the list when
+ * the vCPU is scheduled out and is blocking (e.g. in HLT) with IRQs enabled.
+ * The vCPUs posted interrupt descriptor is updated at the same time to set its
+ * notification vector to WAKEUP_VECTOR, so that posted interrupt from devices
+ * wake the target vCPUs. vCPUs are removed from the list and the notification
+ * vector is reset when the vCPU is scheduled in.
*/
static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
+/*
+ * Protect the per-CPU list with a per-CPU spinlock to handle task migration.
+ * When a blocking vCPU is awakened _and_ migrated to a different pCPU, the
+ * ->sched_in() path will need to take the vCPU off the list of the _previous_
+ * CPU. IRQs must be disabled when taking this lock, otherwise deadlock will
+ * occur if a wakeup IRQ arrives and attempts to acquire the lock.
+ */
static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
@@ -21,6 +34,20 @@ static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
return &(to_vmx(vcpu)->pi_desc);
}
+static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new)
+{
+ /*
+ * PID.ON can be set at any time by a different vCPU or by hardware,
+ * e.g. a device. PID.control must be written atomically, and the
+ * update must be retried with a fresh snapshot an ON change causes
+ * the cmpxchg to fail.
+ */
+ if (cmpxchg64(&pi_desc->control, old, new) != old)
+ return -EBUSY;
+
+ return 0;
+}
+
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
@@ -28,11 +55,14 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
unsigned int dest;
/*
- * In case of hot-plug or hot-unplug, we may have to undo
- * vmx_vcpu_pi_put even if there is no assigned device. And we
- * always keep PI.NDST up to date for simplicity: it makes the
- * code easier, and CPU migration is not a fast path.
+ * To simplify hot-plug and dynamic toggling of APICv, keep PI.NDST and
+ * PI.SN up-to-date even if there is no assigned device or if APICv is
+ * deactivated due to a dynamic inhibit bit, e.g. for Hyper-V's SyncIC.
*/
+ if (!enable_apicv || !lapic_in_kernel(vcpu))
+ return;
+
+ /* Nothing to do if PI.SN and PI.NDST both have the desired value. */
if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
return;
@@ -48,20 +78,17 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
goto after_clear_sn;
}
- /* The full case. */
- do {
- old.control = new.control = pi_desc->control;
-
- dest = cpu_physical_id(cpu);
+ /* The full case. Set the new destination and clear SN. */
+ dest = cpu_physical_id(cpu);
+ if (!x2apic_mode)
+ dest = (dest << 8) & 0xFF00;
- if (x2apic_mode)
- new.ndst = dest;
- else
- new.ndst = (dest << 8) & 0xFF00;
+ do {
+ old.control = new.control = READ_ONCE(pi_desc->control);
+ new.ndst = dest;
new.sn = 0;
- } while (cmpxchg64(&pi_desc->control, old.control,
- new.control) != old.control);
+ } while (pi_try_set_control(pi_desc, old.control, new.control));
after_clear_sn:
@@ -77,13 +104,18 @@ after_clear_sn:
pi_set_on(pi_desc);
}
+static bool vmx_can_use_vtd_pi(struct kvm *kvm)
+{
+ return irqchip_in_kernel(kvm) && enable_apicv &&
+ kvm_arch_has_assigned_device(kvm) &&
+ irq_remapping_cap(IRQ_POSTING_CAP);
+}
+
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
- if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(vcpu))
+ if (!vmx_can_use_vtd_pi(vcpu->kvm))
return;
/* Set SN when the vCPU is preempted */
@@ -97,29 +129,31 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
struct pi_desc old, new;
unsigned int dest;
- do {
- old.control = new.control = pi_desc->control;
- WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
- "Wakeup handler not enabled while the VCPU is blocked\n");
+ /*
+ * Remove the vCPU from the wakeup list of the _previous_ pCPU, which
+ * will not be the same as the current pCPU if the task was migrated.
+ */
+ spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+ list_del(&vcpu->blocked_vcpu_list);
+ spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
- dest = cpu_physical_id(vcpu->cpu);
+ dest = cpu_physical_id(vcpu->cpu);
+ if (!x2apic_mode)
+ dest = (dest << 8) & 0xFF00;
- if (x2apic_mode)
- new.ndst = dest;
- else
- new.ndst = (dest << 8) & 0xFF00;
+ WARN(pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR,
+ "Wakeup handler not enabled while the vCPU was blocking");
+
+ do {
+ old.control = new.control = READ_ONCE(pi_desc->control);
+
+ new.ndst = dest;
/* set 'NV' to 'notification vector' */
new.nv = POSTED_INTR_VECTOR;
- } while (cmpxchg64(&pi_desc->control, old.control,
- new.control) != old.control);
-
- if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
- spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
- list_del(&vcpu->blocked_vcpu_list);
- spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
- vcpu->pre_pcpu = -1;
- }
+ } while (pi_try_set_control(pi_desc, old.control, new.control));
+
+ vcpu->pre_pcpu = -1;
}
/*
@@ -128,7 +162,6 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
* - Store the vCPU to the wakeup list, so when interrupts happen
* we can find the right vCPU to wake up.
* - Change the Posted-interrupt descriptor as below:
- * 'NDST' <-- vcpu->pre_pcpu
* 'NV' <-- POSTED_INTR_WAKEUP_VECTOR
* - If 'ON' is set during this process, which means at least one
* interrupt is posted for this vCPU, we cannot block it, in
@@ -137,70 +170,50 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
*/
int pi_pre_block(struct kvm_vcpu *vcpu)
{
- unsigned int dest;
struct pi_desc old, new;
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ unsigned long flags;
- if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(vcpu))
+ if (!vmx_can_use_vtd_pi(vcpu->kvm) ||
+ vmx_interrupt_blocked(vcpu))
return 0;
- WARN_ON(irqs_disabled());
- local_irq_disable();
- if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
- vcpu->pre_pcpu = vcpu->cpu;
- spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
- list_add_tail(&vcpu->blocked_vcpu_list,
- &per_cpu(blocked_vcpu_on_cpu,
- vcpu->pre_pcpu));
- spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
- }
-
- do {
- old.control = new.control = pi_desc->control;
+ local_irq_save(flags);
- WARN((pi_desc->sn == 1),
- "Warning: SN field of posted-interrupts "
- "is set before blocking\n");
+ vcpu->pre_pcpu = vcpu->cpu;
+ spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu));
+ list_add_tail(&vcpu->blocked_vcpu_list,
+ &per_cpu(blocked_vcpu_on_cpu, vcpu->cpu));
+ spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu));
- /*
- * Since vCPU can be preempted during this process,
- * vcpu->cpu could be different with pre_pcpu, we
- * need to set pre_pcpu as the destination of wakeup
- * notification event, then we can find the right vCPU
- * to wakeup in wakeup handler if interrupts happen
- * when the vCPU is in blocked state.
- */
- dest = cpu_physical_id(vcpu->pre_pcpu);
+ WARN(pi_desc->sn == 1,
+ "Posted Interrupt Suppress Notification set before blocking");
- if (x2apic_mode)
- new.ndst = dest;
- else
- new.ndst = (dest << 8) & 0xFF00;
+ do {
+ old.control = new.control = READ_ONCE(pi_desc->control);
/* set 'NV' to 'wakeup vector' */
new.nv = POSTED_INTR_WAKEUP_VECTOR;
- } while (cmpxchg64(&pi_desc->control, old.control,
- new.control) != old.control);
+ } while (pi_try_set_control(pi_desc, old.control, new.control));
/* We should not block the vCPU if an interrupt is posted for it. */
- if (pi_test_on(pi_desc) == 1)
+ if (pi_test_on(pi_desc))
__pi_post_block(vcpu);
- local_irq_enable();
+ local_irq_restore(flags);
return (vcpu->pre_pcpu == -1);
}
void pi_post_block(struct kvm_vcpu *vcpu)
{
+ unsigned long flags;
+
if (vcpu->pre_pcpu == -1)
return;
- WARN_ON(irqs_disabled());
- local_irq_disable();
+ local_irq_save(flags);
__pi_post_block(vcpu);
- local_irq_enable();
+ local_irq_restore(flags);
}
/*
@@ -216,7 +229,7 @@ void pi_wakeup_handler(void)
blocked_vcpu_list) {
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
- if (pi_test_on(pi_desc) == 1)
+ if (pi_test_on(pi_desc))
kvm_vcpu_kick(vcpu);
}
spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
@@ -270,9 +283,7 @@ int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
struct vcpu_data vcpu_info;
int idx, ret = 0;
- if (!kvm_arch_has_assigned_device(kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(kvm->vcpus[0]))
+ if (!vmx_can_use_vtd_pi(kvm))
return 0;
idx = srcu_read_lock(&kvm->irq_srcu);
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
index 7f7b2326caf5..36ae035f14aa 100644
--- a/arch/x86/kvm/vmx/posted_intr.h
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -40,7 +40,7 @@ static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc)
(unsigned long *)&pi_desc->control);
}
-static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
+static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
{
return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
}
@@ -74,13 +74,13 @@ static inline void pi_clear_sn(struct pi_desc *pi_desc)
(unsigned long *)&pi_desc->control);
}
-static inline int pi_test_on(struct pi_desc *pi_desc)
+static inline bool pi_test_on(struct pi_desc *pi_desc)
{
return test_bit(POSTED_INTR_ON,
(unsigned long *)&pi_desc->control);
}
-static inline int pi_test_sn(struct pi_desc *pi_desc)
+static inline bool pi_test_sn(struct pi_desc *pi_desc)
{
return test_bit(POSTED_INTR_SN,
(unsigned long *)&pi_desc->control);
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
index 6693ebdc0770..35e7ec91ae86 100644
--- a/arch/x86/kvm/vmx/sgx.c
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -53,11 +53,9 @@ static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset,
static void sgx_handle_emulation_failure(struct kvm_vcpu *vcpu, u64 addr,
unsigned int size)
{
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 2;
- vcpu->run->internal.data[0] = addr;
- vcpu->run->internal.data[1] = size;
+ uint64_t data[2] = { addr, size };
+
+ __kvm_prepare_emulation_failure_exit(vcpu, data, ARRAY_SIZE(data));
}
static int sgx_read_hva(struct kvm_vcpu *vcpu, unsigned long hva, void *data,
@@ -112,9 +110,7 @@ static int sgx_inject_fault(struct kvm_vcpu *vcpu, gva_t gva, int trapnr)
* but the error code isn't (yet) plumbed through the ENCLS helpers.
*/
if (trapnr == PF_VECTOR && !boot_cpu_has(X86_FEATURE_SGX2)) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
+ kvm_prepare_emulation_failure_exit(vcpu);
return 0;
}
@@ -155,9 +151,7 @@ static int __handle_encls_ecreate(struct kvm_vcpu *vcpu,
sgx_12_0 = kvm_find_cpuid_entry(vcpu, 0x12, 0);
sgx_12_1 = kvm_find_cpuid_entry(vcpu, 0x12, 1);
if (!sgx_12_0 || !sgx_12_1) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
+ kvm_prepare_emulation_failure_exit(vcpu);
return 0;
}
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index 6e5de2e2b0da..e325c290a816 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -129,6 +129,11 @@ static inline bool is_machine_check(u32 intr_info)
return is_exception_n(intr_info, MC_VECTOR);
}
+static inline bool is_nm_fault(u32 intr_info)
+{
+ return is_exception_n(intr_info, NM_VECTOR);
+}
+
/* Undocumented: icebp/int1 */
static inline bool is_icebp(u32 intr_info)
{
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 3a6461694fc2..435c187927c4 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -49,14 +49,14 @@ SYM_FUNC_START_LOCAL(vmx_vmenter)
je 2f
1: vmresume
- ret
+ RET
2: vmlaunch
- ret
+ RET
3: cmpb $0, kvm_rebooting
je 4f
- ret
+ RET
4: ud2
_ASM_EXTABLE(1b, 3b)
@@ -89,7 +89,7 @@ SYM_FUNC_START(vmx_vmexit)
pop %_ASM_AX
.Lvmexit_skip_rsb:
#endif
- ret
+ RET
SYM_FUNC_END(vmx_vmexit)
/**
@@ -228,7 +228,7 @@ SYM_FUNC_START(__vmx_vcpu_run)
pop %edi
#endif
pop %_ASM_BP
- ret
+ RET
/* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. */
2: mov $1, %eax
@@ -293,7 +293,7 @@ SYM_FUNC_START(vmread_error_trampoline)
pop %_ASM_AX
pop %_ASM_BP
- ret
+ RET
SYM_FUNC_END(vmread_error_trampoline)
SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff)
@@ -326,5 +326,5 @@ SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff)
*/
mov %_ASM_BP, %_ASM_SP
pop %_ASM_BP
- ret
+ RET
SYM_FUNC_END(vmx_do_interrupt_nmi_irqoff)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 116b08904ac3..1b2e9d8c5cc9 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -35,7 +35,8 @@
#include <asm/cpu_device_id.h>
#include <asm/debugreg.h>
#include <asm/desc.h>
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
+#include <asm/fpu/xstate.h>
#include <asm/idtentry.h>
#include <asm/io.h>
#include <asm/irq_remapping.h>
@@ -161,6 +162,8 @@ static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
MSR_FS_BASE,
MSR_GS_BASE,
MSR_KERNEL_GS_BASE,
+ MSR_IA32_XFD,
+ MSR_IA32_XFD_ERR,
#endif
MSR_IA32_SYSENTER_CS,
MSR_IA32_SYSENTER_ESP,
@@ -602,15 +605,13 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
unsigned int slot = msr - vmx->guest_uret_msrs;
int ret = 0;
- u64 old_msr_data = msr->data;
- msr->data = data;
if (msr->load_into_hardware) {
preempt_disable();
- ret = kvm_set_user_return_msr(slot, msr->data, msr->mask);
+ ret = kvm_set_user_return_msr(slot, data, msr->mask);
preempt_enable();
- if (ret)
- msr->data = old_msr_data;
}
+ if (!ret)
+ msr->data = data;
return ret;
}
@@ -763,30 +764,27 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match);
}
+ /*
+ * Disabling xfd interception indicates that dynamic xfeatures
+ * might be used in the guest. Always trap #NM in this case
+ * to save guest xfd_err timely.
+ */
+ if (vcpu->arch.xfd_no_write_intercept)
+ eb |= (1u << NM_VECTOR);
+
vmcs_write32(EXCEPTION_BITMAP, eb);
}
/*
* Check if MSR is intercepted for currently loaded MSR bitmap.
*/
-static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
+static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr)
{
- unsigned long *msr_bitmap;
- int f = sizeof(unsigned long);
-
- if (!cpu_has_vmx_msr_bitmap())
+ if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS))
return true;
- msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
-
- if (msr <= 0x1fff) {
- return !!test_bit(msr, msr_bitmap + 0x800 / f);
- } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
- msr &= 0x1fff;
- return !!test_bit(msr, msr_bitmap + 0xc00 / f);
- }
-
- return true;
+ return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap,
+ MSR_IA32_SPEC_CTRL);
}
static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
@@ -1059,8 +1057,8 @@ static void pt_guest_enter(struct vcpu_vmx *vmx)
rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
wrmsrl(MSR_IA32_RTIT_CTL, 0);
- pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
- pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
+ pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
+ pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
}
}
@@ -1070,17 +1068,26 @@ static void pt_guest_exit(struct vcpu_vmx *vmx)
return;
if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
- pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
- pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
+ pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
+ pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
}
- /* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */
- wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
+ /*
+ * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest,
+ * i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary.
+ */
+ if (vmx->pt_desc.host.ctl)
+ wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
}
-void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
- unsigned long fs_base, unsigned long gs_base)
+void vmx_set_vmcs_host_state(struct vmcs_host_state *host, unsigned long cr3,
+ u16 fs_sel, u16 gs_sel,
+ unsigned long fs_base, unsigned long gs_base)
{
+ if (unlikely(cr3 != host->cr3)) {
+ vmcs_writel(HOST_CR3, cr3);
+ host->cr3 = cr3;
+ }
if (unlikely(fs_sel != host->fs_sel)) {
if (!(fs_sel & 7))
vmcs_write16(HOST_FS_SELECTOR, fs_sel);
@@ -1175,7 +1182,9 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
gs_base = segment_base(gs_sel);
#endif
- vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
+ vmx_set_vmcs_host_state(host_state, __get_current_cr3_fast(),
+ fs_sel, gs_sel, fs_base, gs_base);
+
vmx->guest_state_loaded = true;
}
@@ -1278,7 +1287,6 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
if (!already_loaded) {
void *gdt = get_current_gdt_ro();
- unsigned long sysenter_esp;
/*
* Flush all EPTP/VPID contexts, the new pCPU may have stale
@@ -1294,8 +1302,11 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
(unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
- rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
- vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
+ if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) {
+ /* 22.2.3 */
+ vmcs_writel(HOST_IA32_SYSENTER_ESP,
+ (unsigned long)(cpu_entry_stack(cpu) + 1));
+ }
vmx->loaded_vmcs->cpu = cpu;
}
@@ -1370,6 +1381,11 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
vmx->emulation_required = vmx_emulation_required(vcpu);
}
+static bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
+{
+ return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
+}
+
u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
@@ -1456,16 +1472,16 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
* cause a #GP fault.
*/
value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
- if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2))
+ if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2))
return 1;
value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
- if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2))
+ if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2))
return 1;
value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
- if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2))
+ if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2))
return 1;
value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
- if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2))
+ if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2))
return 1;
return 0;
@@ -1755,7 +1771,7 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
}
/*
- * Reads an msr value (of 'msr_index') into 'pdata'.
+ * Reads an msr value (of 'msr_info->index') into 'msr_info->data'.
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
@@ -1886,8 +1902,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
if (!vmx_pt_mode_is_host_guest() ||
- (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
- PT_CAP_num_address_ranges)))
+ (index >= 2 * vmx->pt_desc.num_address_ranges))
return 1;
if (index % 2)
msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
@@ -1963,6 +1978,24 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_KERNEL_GS_BASE:
vmx_write_guest_kernel_gs_base(vmx, data);
break;
+ case MSR_IA32_XFD:
+ ret = kvm_set_msr_common(vcpu, msr_info);
+ /*
+ * Always intercepting WRMSR could incur non-negligible
+ * overhead given xfd might be changed frequently in
+ * guest context switch. Disable write interception
+ * upon the first write with a non-zero value (indicating
+ * potential usage on dynamic xfeatures). Also update
+ * exception bitmap to trap #NM for proper virtualization
+ * of guest xfd_err.
+ */
+ if (!ret && data) {
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD,
+ MSR_TYPE_RW);
+ vcpu->arch.xfd_no_write_intercept = true;
+ vmx_update_exception_bitmap(vcpu);
+ }
+ break;
#endif
case MSR_IA32_SYSENTER_CS:
if (is_guest_mode(vcpu))
@@ -2103,9 +2136,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
ret = kvm_set_msr_common(vcpu, msr_info);
break;
- case MSR_IA32_TSC_ADJUST:
- ret = kvm_set_msr_common(vcpu, msr_info);
- break;
case MSR_IA32_MCG_EXT_CTL:
if ((!msr_info->host_initiated &&
!(to_vmx(vcpu)->msr_ia32_feature_control &
@@ -2202,8 +2232,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!pt_can_write_msr(vmx))
return 1;
index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
- if (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
- PT_CAP_num_address_ranges))
+ if (index >= 2 * vmx->pt_desc.num_address_ranges)
return 1;
if (is_noncanonical_address(data, vcpu))
return 1;
@@ -2655,15 +2684,6 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
if (!loaded_vmcs->msr_bitmap)
goto out_vmcs;
memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
-
- if (IS_ENABLED(CONFIG_HYPERV) &&
- static_branch_unlikely(&enable_evmcs) &&
- (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
- struct hv_enlightened_vmcs *evmcs =
- (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;
-
- evmcs->hv_enlightenments_control.msr_bitmap = 1;
- }
}
memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
@@ -2927,6 +2947,13 @@ static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
}
}
+static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu))
+ return nested_get_vpid02(vcpu);
+ return to_vmx(vcpu)->vpid;
+}
+
static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
@@ -2939,31 +2966,29 @@ static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
if (enable_ept)
ept_sync_context(construct_eptp(vcpu, root_hpa,
mmu->shadow_root_level));
- else if (!is_guest_mode(vcpu))
- vpid_sync_context(to_vmx(vcpu)->vpid);
else
- vpid_sync_context(nested_get_vpid02(vcpu));
+ vpid_sync_context(vmx_get_current_vpid(vcpu));
}
static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
{
/*
- * vpid_sync_vcpu_addr() is a nop if vmx->vpid==0, see the comment in
+ * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in
* vmx_flush_tlb_guest() for an explanation of why this is ok.
*/
- vpid_sync_vcpu_addr(to_vmx(vcpu)->vpid, addr);
+ vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
}
static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
{
/*
- * vpid_sync_context() is a nop if vmx->vpid==0, e.g. if enable_vpid==0
- * or a vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit
- * are required to flush GVA->{G,H}PA mappings from the TLB if vpid is
+ * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a
+ * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are
+ * required to flush GVA->{G,H}PA mappings from the TLB if vpid is
* disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
* i.e. no explicit INVVPID is necessary.
*/
- vpid_sync_context(to_vmx(vcpu)->vpid);
+ vpid_sync_context(vmx_get_current_vpid(vcpu));
}
void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
@@ -2993,7 +3018,7 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu)
mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
- kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+ kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR);
}
#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
@@ -3075,6 +3100,13 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
/* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */
if ((old_cr0_pg ^ cr0) & X86_CR0_PG)
vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
+
+ /*
+ * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but
+ * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG.
+ */
+ if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG))
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
}
/* depends on vcpu->arch.cr0 to be set to a new value */
@@ -3118,9 +3150,9 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
if (!enable_unrestricted_guest && !is_paging(vcpu))
guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
- else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
+ else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3))
guest_cr3 = vcpu->arch.cr3;
- else /* vmcs01.GUEST_CR3 is already up-to-date. */
+ else /* vmcs.GUEST_CR3 is already up-to-date. */
update_guest_cr3 = false;
vmx_ept_load_pdptrs(vcpu);
} else {
@@ -3131,6 +3163,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
vmcs_writel(GUEST_CR3, guest_cr3);
}
+
static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
/*
@@ -3695,44 +3728,17 @@ void free_vpid(int vpid)
spin_unlock(&vmx_vpid_lock);
}
-static void vmx_clear_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
+static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
{
- int f = sizeof(unsigned long);
-
- if (msr <= 0x1fff)
- __clear_bit(msr, msr_bitmap + 0x000 / f);
- else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
- __clear_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
-}
-
-static void vmx_clear_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
-{
- int f = sizeof(unsigned long);
-
- if (msr <= 0x1fff)
- __clear_bit(msr, msr_bitmap + 0x800 / f);
- else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
- __clear_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
-}
-
-static void vmx_set_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
-{
- int f = sizeof(unsigned long);
-
- if (msr <= 0x1fff)
- __set_bit(msr, msr_bitmap + 0x000 / f);
- else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
- __set_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
-}
-
-static void vmx_set_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
-{
- int f = sizeof(unsigned long);
+ /*
+ * When KVM is a nested hypervisor on top of Hyper-V and uses
+ * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR
+ * bitmap has changed.
+ */
+ if (static_branch_unlikely(&enable_evmcs))
+ evmcs_touch_msr_bitmap();
- if (msr <= 0x1fff)
- __set_bit(msr, msr_bitmap + 0x800 / f);
- else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
- __set_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
+ vmx->nested.force_msr_bitmap_recalc = true;
}
void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
@@ -3743,8 +3749,7 @@ void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
if (!cpu_has_vmx_msr_bitmap())
return;
- if (static_branch_unlikely(&enable_evmcs))
- evmcs_touch_msr_bitmap();
+ vmx_msr_bitmap_l01_changed(vmx);
/*
* Mark the desired intercept state in shadow bitmap, this is needed
@@ -3788,8 +3793,7 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
if (!cpu_has_vmx_msr_bitmap())
return;
- if (static_branch_unlikely(&enable_evmcs))
- evmcs_touch_msr_bitmap();
+ vmx_msr_bitmap_l01_changed(vmx);
/*
* Mark the desired intercept state in shadow bitmap, this is needed
@@ -3879,7 +3883,7 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag);
vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag);
vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag);
- for (i = 0; i < vmx->pt_desc.addr_range; i++) {
+ for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) {
vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
}
@@ -3979,6 +3983,19 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
*/
vmx->nested.pi_pending = true;
kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ /*
+ * This pairs with the smp_mb_*() after setting vcpu->mode in
+ * vcpu_enter_guest() to guarantee the vCPU sees the event
+ * request if triggering a posted interrupt "fails" because
+ * vcpu->mode != IN_GUEST_MODE. The extra barrier is needed as
+ * the smb_wmb() in kvm_make_request() only ensures everything
+ * done before making the request is visible when the request
+ * is visible, it doesn't ensure ordering between the store to
+ * vcpu->requests and the load from vcpu->mode.
+ */
+ smp_mb__after_atomic();
+
/* the PIR and ON have been set by L1. */
if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
kvm_vcpu_kick(vcpu);
@@ -4012,8 +4029,13 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
if (pi_test_and_set_on(&vmx->pi_desc))
return 0;
- if (vcpu != kvm_get_running_vcpu() &&
- !kvm_vcpu_trigger_posted_interrupt(vcpu, false))
+ /*
+ * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*()
+ * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is
+ * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a
+ * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE.
+ */
+ if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
kvm_vcpu_kick(vcpu);
return 0;
@@ -4070,6 +4092,12 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
+
+ /*
+ * If 32-bit syscall is enabled, vmx_vcpu_load_vcms rewrites
+ * HOST_IA32_SYSENTER_ESP.
+ */
+ vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);
rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
@@ -4088,8 +4116,10 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS &
~vcpu->arch.cr4_guest_rsvd_bits;
- if (!enable_ept)
- vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PGE;
+ if (!enable_ept) {
+ vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS;
+ vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS;
+ }
if (is_guest_mode(&vmx->vcpu))
vcpu->arch.cr4_guest_owned_bits &=
~get_vmcs12(vcpu)->cr4_guest_host_mask;
@@ -4328,10 +4358,6 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
#define VMX_XSS_EXIT_BITMAP 0
-/*
- * Noting that the initialization of Guest-state Area of VMCS is in
- * vmx_vcpu_reset().
- */
static void init_vmcs(struct vcpu_vmx *vmx)
{
if (nested)
@@ -4340,7 +4366,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
if (cpu_has_vmx_msr_bitmap())
vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
- vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
+ vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */
/* Control */
pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
@@ -4436,10 +4462,40 @@ static void init_vmcs(struct vcpu_vmx *vmx)
vmx_setup_uret_msrs(vmx);
}
+static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ init_vmcs(vmx);
+
+ if (nested)
+ memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));
+
+ vcpu_setup_sgx_lepubkeyhash(vcpu);
+
+ vmx->nested.posted_intr_nv = -1;
+ vmx->nested.vmxon_ptr = INVALID_GPA;
+ vmx->nested.current_vmptr = INVALID_GPA;
+ vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
+
+ vcpu->arch.microcode_version = 0x100000000ULL;
+ vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
+
+ /*
+ * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
+ * or POSTED_INTR_WAKEUP_VECTOR.
+ */
+ vmx->pi_desc.nv = POSTED_INTR_VECTOR;
+ vmx->pi_desc.sn = 1;
+}
+
static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (!init_event)
+ __vmx_vcpu_reset(vcpu);
+
vmx->rmode.vm86_active = 0;
vmx->spec_ctrl = 0;
@@ -4449,6 +4505,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_set_cr8(vcpu, 0);
vmx_segment_cache_clear(vmx);
+ kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS);
seg_setup(VCPU_SREG_CS);
vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
@@ -4714,7 +4771,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
if (kvm_emulate_instruction(vcpu, 0)) {
if (vcpu->arch.halt_request) {
vcpu->arch.halt_request = 0;
- return kvm_vcpu_halt(vcpu);
+ return kvm_emulate_halt_noskip(vcpu);
}
return 1;
}
@@ -4770,6 +4827,17 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
if (is_machine_check(intr_info) || is_nmi(intr_info))
return 1; /* handled by handle_exception_nmi_irqoff() */
+ /*
+ * Queue the exception here instead of in handle_nm_fault_irqoff().
+ * This ensures the nested_vmx check is not skipped so vmexit can
+ * be reflected to L1 (when it intercepts #NM) before reaching this
+ * point.
+ */
+ if (is_nm_fault(intr_info)) {
+ kvm_queue_exception(vcpu, NM_VECTOR);
+ return 1;
+ }
+
if (is_invalid_opcode(intr_info))
return handle_ud(vcpu);
@@ -5379,16 +5447,13 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
if (vmx->emulation_required && !vmx->rmode.vm86_active &&
vcpu->arch.exception.pending) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror =
- KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
+ kvm_prepare_emulation_failure_exit(vcpu);
return 0;
}
if (vcpu->arch.halt_request) {
vcpu->arch.halt_request = 0;
- return kvm_vcpu_halt(vcpu);
+ return kvm_emulate_halt_noskip(vcpu);
}
/*
@@ -5468,6 +5533,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu)
u64 pcid;
u64 gla;
} operand;
+ int gpr_index;
if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
kvm_queue_exception(vcpu, UD_VECTOR);
@@ -5475,12 +5541,8 @@ static int handle_invpcid(struct kvm_vcpu *vcpu)
}
vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
- type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
-
- if (type > 3) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
+ gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
+ type = kvm_register_read(vcpu, gpr_index);
/* According to the Intel instruction reference, the memory operand
* is read even if it isn't needed (e.g., for type==all)
@@ -5562,9 +5624,13 @@ static int handle_encls(struct kvm_vcpu *vcpu)
static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
{
- vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
- vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
- return 0;
+ /*
+ * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK
+ * VM-Exits. Unconditionally set the flag here and leave the handling to
+ * vmx_handle_exit().
+ */
+ to_vmx(vcpu)->exit_reason.bus_lock_detected = true;
+ return 1;
}
/*
@@ -5629,11 +5695,13 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
static const int kvm_vmx_max_exit_handlers =
ARRAY_SIZE(kvm_vmx_exit_handlers);
-static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
+static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2,
u32 *intr_info, u32 *error_code)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ *reason = vmx->exit_reason.full;
*info1 = vmx_get_exit_qual(vcpu);
if (!(vmx->exit_reason.failed_vmentry)) {
*info2 = vmx->idt_vectoring_info;
@@ -5903,18 +5971,14 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
vmx_flush_pml_buffer(vcpu);
/*
- * We should never reach this point with a pending nested VM-Enter, and
- * more specifically emulation of L2 due to invalid guest state (see
- * below) should never happen as that means we incorrectly allowed a
- * nested VM-Enter with an invalid vmcs12.
+ * KVM should never reach this point with a pending nested VM-Enter.
+ * More specifically, short-circuiting VM-Entry to emulate L2 due to
+ * invalid guest state should never happen as that means KVM knowingly
+ * allowed a nested VM-Enter with an invalid vmcs12. More below.
*/
if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
return -EIO;
- /* If guest state is invalid, start emulating */
- if (vmx->emulation_required)
- return handle_invalid_guest_state(vcpu);
-
if (is_guest_mode(vcpu)) {
/*
* PML is never enabled when running L2, bail immediately if a
@@ -5936,10 +6000,30 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
*/
nested_mark_vmcs12_pages_dirty(vcpu);
+ /*
+ * Synthesize a triple fault if L2 state is invalid. In normal
+ * operation, nested VM-Enter rejects any attempt to enter L2
+ * with invalid state. However, those checks are skipped if
+ * state is being stuffed via RSM or KVM_SET_NESTED_STATE. If
+ * L2 state is invalid, it means either L1 modified SMRAM state
+ * or userspace provided bad state. Synthesize TRIPLE_FAULT as
+ * doing so is architecturally allowed in the RSM case, and is
+ * the least awful solution for the userspace case without
+ * risking false positives.
+ */
+ if (vmx->emulation_required) {
+ nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
+ return 1;
+ }
+
if (nested_vmx_reflect_vmexit(vcpu))
return 1;
}
+ /* If guest state is invalid, start emulating. L2 is handled above. */
+ if (vmx->emulation_required)
+ return handle_invalid_guest_state(vcpu);
+
if (exit_reason.failed_vmentry) {
dump_vmcs(vcpu);
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -6051,9 +6135,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
int ret = __vmx_handle_exit(vcpu, exit_fastpath);
/*
- * Even when current exit reason is handled by KVM internally, we
- * still need to exit to user space when bus lock detected to inform
- * that there is a bus lock in guest.
+ * Exit to user space when bus lock detected to inform that there is
+ * a bus lock in guest.
*/
if (to_vmx(vcpu)->exit_reason.bus_lock_detected) {
if (ret > 0)
@@ -6285,9 +6368,9 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int max_irr;
- bool max_irr_updated;
+ bool got_posted_interrupt;
- if (KVM_BUG_ON(!vcpu->arch.apicv_active, vcpu->kvm))
+ if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
return -EIO;
if (pi_test_on(&vmx->pi_desc)) {
@@ -6297,27 +6380,33 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
* But on x86 this is just a compiler barrier anyway.
*/
smp_mb__after_atomic();
- max_irr_updated =
+ got_posted_interrupt =
kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
-
- /*
- * If we are running L2 and L1 has a new pending interrupt
- * which can be injected, we should re-evaluate
- * what should be done with this new L1 interrupt.
- * If L1 intercepts external-interrupts, we should
- * exit from L2 to L1. Otherwise, interrupt should be
- * delivered directly to L2.
- */
- if (is_guest_mode(vcpu) && max_irr_updated) {
- if (nested_exit_on_intr(vcpu))
- kvm_vcpu_exiting_guest_mode(vcpu);
- else
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- }
} else {
max_irr = kvm_lapic_find_highest_irr(vcpu);
+ got_posted_interrupt = false;
}
- vmx_hwapic_irr_update(vcpu, max_irr);
+
+ /*
+ * Newly recognized interrupts are injected via either virtual interrupt
+ * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is
+ * disabled in two cases:
+ *
+ * 1) If L2 is running and the vCPU has a new pending interrupt. If L1
+ * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a
+ * VM-Exit to L1. If L1 doesn't want to exit, the interrupt is injected
+ * into L2, but KVM doesn't use virtual interrupt delivery to inject
+ * interrupts into L2, and so KVM_REQ_EVENT is again needed.
+ *
+ * 2) If APICv is disabled for this vCPU, assigned devices may still
+ * attempt to post interrupts. The posted interrupt vector will cause
+ * a VM-Exit and the subsequent entry will call sync_pir_to_irr.
+ */
+ if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu))
+ vmx_set_rvi(max_irr);
+ else if (got_posted_interrupt)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
return max_irr;
}
@@ -6345,11 +6434,33 @@ void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
unsigned long entry)
{
- kvm_before_interrupt(vcpu);
+ bool is_nmi = entry == (unsigned long)asm_exc_nmi_noist;
+
+ kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI : KVM_HANDLING_IRQ);
vmx_do_interrupt_nmi_irqoff(entry);
kvm_after_interrupt(vcpu);
}
+static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Save xfd_err to guest_fpu before interrupt is enabled, so the
+ * MSR value is not clobbered by the host activity before the guest
+ * has chance to consume it.
+ *
+ * Do not blindly read xfd_err here, since this exception might
+ * be caused by L1 interception on a platform which doesn't
+ * support xfd at all.
+ *
+ * Do it conditionally upon guest_fpu::xfd. xfd_err matters
+ * only when xfd contains a non-zero value.
+ *
+ * Queuing exception is done in vmx_handle_exit. See comment there.
+ */
+ if (vcpu->arch.guest_fpu.fpstate->xfd)
+ rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
+}
+
static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
{
const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
@@ -6358,6 +6469,9 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
/* if exit due to PF check for async PF */
if (is_page_fault(intr_info))
vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
+ /* if exit due to NM, handle before interrupts are enabled */
+ else if (is_nm_fault(intr_info))
+ handle_nm_fault_irqoff(&vmx->vcpu);
/* Handle machine checks before interrupts are enabled */
else if (is_machine_check(intr_info))
kvm_machine_check();
@@ -6408,6 +6522,7 @@ static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
return nested;
case MSR_AMD64_VIRT_SPEC_CTRL:
+ case MSR_AMD64_TSC_RATIO:
/* This is AMD only. */
return false;
default:
@@ -6615,7 +6730,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long cr3, cr4;
+ unsigned long cr4;
/* Record the guest's net vcpu time for enforced NMI injections. */
if (unlikely(!enable_vnmi &&
@@ -6628,9 +6743,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
* consistency check VM-Exit due to invalid guest state and bail.
*/
if (unlikely(vmx->emulation_required)) {
-
- /* We don't emulate invalid state of a nested guest */
- vmx->fail = is_guest_mode(vcpu);
+ vmx->fail = 0;
vmx->exit_reason.full = EXIT_REASON_INVALID_STATE;
vmx->exit_reason.failed_vmentry = 1;
@@ -6658,12 +6771,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
-
- cr3 = __get_current_cr3_fast();
- if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
- vmcs_writel(HOST_CR3, cr3);
- vmx->loaded_vmcs->host_state.cr3 = cr3;
- }
+ vcpu->arch.regs_dirty = 0;
cr4 = cr4_read_shadow();
if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
@@ -6722,7 +6830,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
* If the L02 MSR bitmap does not intercept the MSR, then we need to
* save it.
*/
- if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
+ if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)))
vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
@@ -6752,7 +6860,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
loadsegment(es, __USER_DS);
#endif
- vmx_register_cache_reset(vcpu);
+ vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
pt_guest_exit(vmx);
@@ -6784,7 +6892,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (likely(!vmx->exit_reason.failed_vmentry))
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
- trace_kvm_exit(vmx->exit_reason.full, vcpu, KVM_ISA_VMX);
+ trace_kvm_exit(vcpu, KVM_ISA_VMX);
if (unlikely(vmx->exit_reason.failed_vmentry))
return EXIT_FASTPATH_NONE;
@@ -6815,7 +6923,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
{
struct vmx_uret_msr *tsx_ctrl;
struct vcpu_vmx *vmx;
- int i, cpu, err;
+ int i, err;
BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
vmx = to_vmx(vcpu);
@@ -6836,10 +6944,8 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
goto free_vpid;
}
- for (i = 0; i < kvm_nr_uret_msrs; ++i) {
- vmx->guest_uret_msrs[i].data = 0;
+ for (i = 0; i < kvm_nr_uret_msrs; ++i)
vmx->guest_uret_msrs[i].mask = -1ull;
- }
if (boot_cpu_has(X86_FEATURE_RTM)) {
/*
* TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
@@ -6855,6 +6961,19 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
if (err < 0)
goto free_pml;
+ /*
+ * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
+ * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
+ * feature only for vmcs01, KVM currently isn't equipped to realize any
+ * performance benefits from enabling it for vmcs02.
+ */
+ if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs) &&
+ (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
+ struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
+
+ evmcs->hv_enlightenments_control.msr_bitmap = 1;
+ }
+
/* The MSR bitmap starts with all ones */
bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
@@ -6876,12 +6995,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
}
vmx->loaded_vmcs = &vmx->vmcs01;
- cpu = get_cpu();
- vmx_vcpu_load(vcpu, cpu);
- vcpu->cpu = cpu;
- init_vmcs(vmx);
- vmx_vcpu_put(vcpu);
- put_cpu();
+
if (cpu_need_virtualize_apic_accesses(vcpu)) {
err = alloc_apic_access_page(vcpu->kvm);
if (err)
@@ -6894,27 +7008,6 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
goto free_vmcs;
}
- if (nested)
- memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));
- else
- memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));
-
- vcpu_setup_sgx_lepubkeyhash(vcpu);
-
- vmx->nested.posted_intr_nv = -1;
- vmx->nested.current_vmptr = -1ull;
- vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
-
- vcpu->arch.microcode_version = 0x100000000ULL;
- vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
-
- /*
- * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
- * or POSTED_INTR_WAKEUP_VECTOR.
- */
- vmx->pi_desc.nv = POSTED_INTR_VECTOR;
- vmx->pi_desc.sn = 1;
-
return 0;
free_vmcs:
@@ -6986,7 +7079,6 @@ static int __init vmx_check_processor_compat(void)
static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
u8 cache;
- u64 ipat = 0;
/* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
* memory aliases with conflicting memory types and sometimes MCEs.
@@ -7006,30 +7098,22 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
* EPT memory type is used to emulate guest CD/MTRR.
*/
- if (is_mmio) {
- cache = MTRR_TYPE_UNCACHABLE;
- goto exit;
- }
+ if (is_mmio)
+ return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
- if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
- ipat = VMX_EPT_IPAT_BIT;
- cache = MTRR_TYPE_WRBACK;
- goto exit;
- }
+ if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
+ return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
- ipat = VMX_EPT_IPAT_BIT;
if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
cache = MTRR_TYPE_WRBACK;
else
cache = MTRR_TYPE_UNCACHABLE;
- goto exit;
- }
- cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn);
+ return (cache << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
+ }
-exit:
- return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
+ return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT;
}
static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
@@ -7129,12 +7213,13 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
}
/* Get the number of configurable Address Ranges for filtering */
- vmx->pt_desc.addr_range = intel_pt_validate_cap(vmx->pt_desc.caps,
+ vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_num_address_ranges);
/* Initialize and clear the no dependency bits */
vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
- RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC);
+ RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC |
+ RTIT_CTL_BRANCH_EN);
/*
* If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set otherwise
@@ -7152,12 +7237,11 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);
/*
- * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn BranchEn and
- * MTCFreq can be set
+ * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set
*/
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
- RTIT_CTL_BRANCH_EN | RTIT_CTL_MTC_RANGE);
+ RTIT_CTL_MTC_RANGE);
/* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
@@ -7177,7 +7261,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;
/* unmask address range configure area */
- for (i = 0; i < vmx->pt_desc.addr_range; i++)
+ for (i = 0; i < vmx->pt_desc.num_address_ranges; i++)
vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
}
@@ -7221,6 +7305,11 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
}
}
+ if (kvm_cpu_cap_has(X86_FEATURE_XFD))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
+ !guest_cpuid_has(vcpu, X86_FEATURE_XFD));
+
+
set_cr4_guest_host_mask(vmx);
vmx_write_encls_bitmap(vcpu, NULL);
@@ -7553,6 +7642,8 @@ static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
static void hardware_unsetup(void)
{
+ kvm_set_posted_intr_wakeup_handler(NULL);
+
if (nested)
nested_vmx_hardware_unsetup();
@@ -7562,12 +7653,16 @@ static void hardware_unsetup(void)
static bool vmx_check_apicv_inhibit_reasons(ulong bit)
{
ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
- BIT(APICV_INHIBIT_REASON_HYPERV);
+ BIT(APICV_INHIBIT_REASON_ABSENT) |
+ BIT(APICV_INHIBIT_REASON_HYPERV) |
+ BIT(APICV_INHIBIT_REASON_BLOCKIRQ);
return supported & BIT(bit);
}
static struct kvm_x86_ops vmx_x86_ops __initdata = {
+ .name = "kvm_intel",
+
.hardware_unsetup = hardware_unsetup,
.hardware_enable = hardware_enable,
@@ -7608,6 +7703,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.cache_reg = vmx_cache_reg,
.get_rflags = vmx_get_rflags,
.set_rflags = vmx_set_rflags,
+ .get_if_flag = vmx_get_if_flag,
.tlb_flush_all = vmx_flush_tlb_all,
.tlb_flush_current = vmx_flush_tlb_current,
@@ -7703,6 +7799,20 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
};
+static unsigned int vmx_handle_intel_pt_intr(void)
+{
+ struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
+
+ /* '0' on failure so that the !PT case can use a RET0 static call. */
+ if (!kvm_arch_pmi_in_guest(vcpu))
+ return 0;
+
+ kvm_make_request(KVM_REQ_PMI, vcpu);
+ __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
+ (unsigned long *)&vcpu->arch.pmu.global_status);
+ return 1;
+}
+
static __init void vmx_setup_user_return_msrs(void)
{
@@ -7729,11 +7839,13 @@ static __init void vmx_setup_user_return_msrs(void)
kvm_add_user_return_msr(vmx_uret_msrs_list[i]);
}
+static struct kvm_x86_init_ops vmx_init_ops __initdata;
+
static __init int hardware_setup(void)
{
unsigned long host_bndcfgs;
struct desc_ptr dt;
- int r, ept_lpage_level;
+ int r;
store_idt(&dt);
host_idt_base = dt.address;
@@ -7811,10 +7923,10 @@ static __init int hardware_setup(void)
ple_window_shrink = 0;
}
- if (!cpu_has_vmx_apicv()) {
+ if (!cpu_has_vmx_apicv())
enable_apicv = 0;
+ if (!enable_apicv)
vmx_x86_ops.sync_pir_to_irr = NULL;
- }
if (cpu_has_vmx_tsc_scaling()) {
kvm_has_tsc_control = true;
@@ -7830,16 +7942,8 @@ static __init int hardware_setup(void)
kvm_mmu_set_ept_masks(enable_ept_ad_bits,
cpu_has_vmx_ept_execute_only());
- if (!enable_ept)
- ept_lpage_level = 0;
- else if (cpu_has_vmx_ept_1g_page())
- ept_lpage_level = PG_LEVEL_1G;
- else if (cpu_has_vmx_ept_2m_page())
- ept_lpage_level = PG_LEVEL_2M;
- else
- ept_lpage_level = PG_LEVEL_4K;
kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(),
- ept_lpage_level);
+ ept_caps_to_lpage_level(vmx_capability.ept));
/*
* Only enable PML when hardware supports PML feature, and both EPT
@@ -7881,14 +7985,16 @@ static __init int hardware_setup(void)
vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
}
- kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
-
kvm_mce_cap_supported |= MCG_LMCE_P;
if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
return -EINVAL;
if (!enable_ept || !cpu_has_vmx_intel_pt())
pt_mode = PT_MODE_SYSTEM;
+ if (pt_mode == PT_MODE_HOST_GUEST)
+ vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
+ else
+ vmx_init_ops.handle_intel_pt_intr = NULL;
setup_default_sgx_lepubkeyhash();
@@ -7906,6 +8012,9 @@ static __init int hardware_setup(void)
r = alloc_kvm_area();
if (r)
nested_vmx_hardware_unsetup();
+
+ kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
+
return r;
}
@@ -7914,6 +8023,7 @@ static struct kvm_x86_init_ops vmx_init_ops __initdata = {
.disabled_by_bios = vmx_disabled_by_bios,
.check_processor_compatibility = vmx_check_processor_compat,
.hardware_setup = hardware_setup,
+ .handle_intel_pt_intr = NULL,
.runtime_ops = &vmx_x86_ops,
};
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 592217fd7d92..f8fc7441baea 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -62,7 +62,7 @@ struct pt_ctx {
struct pt_desc {
u64 ctl_bitmask;
- u32 addr_range;
+ u32 num_address_ranges;
u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
struct pt_ctx host;
struct pt_ctx guest;
@@ -142,6 +142,16 @@ struct nested_vmx {
struct vmcs12 *cached_shadow_vmcs12;
/*
+ * GPA to HVA cache for accessing vmcs12->vmcs_link_pointer
+ */
+ struct gfn_to_hva_cache shadow_vmcs12_cache;
+
+ /*
+ * GPA to HVA cache for VMCS12
+ */
+ struct gfn_to_hva_cache vmcs12_cache;
+
+ /*
* Indicates if the shadow vmcs or enlightened vmcs must be updated
* with the data held by struct vmcs12.
*/
@@ -149,6 +159,15 @@ struct nested_vmx {
bool dirty_vmcs12;
/*
+ * Indicates whether MSR bitmap for L2 needs to be rebuilt due to
+ * changes in MSR bitmap for L1 or switching to a different L2. Note,
+ * this flag can only be used reliably in conjunction with a paravirt L1
+ * which informs L0 whether any changes to MSR bitmap for L2 were done
+ * on its side.
+ */
+ bool force_msr_bitmap_recalc;
+
+ /*
* Indicates lazily loaded guest state has not yet been decached from
* vmcs02.
*/
@@ -330,7 +349,7 @@ struct vcpu_vmx {
struct lbr_desc lbr_desc;
/* Save desired MSR intercept (read: pass-through) state */
-#define MAX_POSSIBLE_PASSTHROUGH_MSRS 13
+#define MAX_POSSIBLE_PASSTHROUGH_MSRS 15
struct {
DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
@@ -352,8 +371,9 @@ int allocate_vpid(void);
void free_vpid(int vpid);
void vmx_set_constant_host_state(struct vcpu_vmx *vmx);
void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
-void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
- unsigned long fs_base, unsigned long gs_base);
+void vmx_set_vmcs_host_state(struct vmcs_host_state *host, unsigned long cr3,
+ u16 fs_sel, u16 gs_sel,
+ unsigned long fs_base, unsigned long gs_base);
int vmx_get_cpl(struct kvm_vcpu *vcpu);
bool vmx_emulation_required(struct kvm_vcpu *vcpu);
unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu);
@@ -400,6 +420,34 @@ static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr,
void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
+/*
+ * Note, early Intel manuals have the write-low and read-high bitmap offsets
+ * the wrong way round. The bitmaps control MSRs 0x00000000-0x00001fff and
+ * 0xc0000000-0xc0001fff. The former (low) uses bytes 0-0x3ff for reads and
+ * 0x800-0xbff for writes. The latter (high) uses 0x400-0x7ff for reads and
+ * 0xc00-0xfff for writes. MSRs not covered by either of the ranges always
+ * VM-Exit.
+ */
+#define __BUILD_VMX_MSR_BITMAP_HELPER(rtype, action, bitop, access, base) \
+static inline rtype vmx_##action##_msr_bitmap_##access(unsigned long *bitmap, \
+ u32 msr) \
+{ \
+ int f = sizeof(unsigned long); \
+ \
+ if (msr <= 0x1fff) \
+ return bitop##_bit(msr, bitmap + base / f); \
+ else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) \
+ return bitop##_bit(msr & 0x1fff, bitmap + (base + 0x400) / f); \
+ return (rtype)true; \
+}
+#define BUILD_VMX_MSR_BITMAP_HELPERS(ret_type, action, bitop) \
+ __BUILD_VMX_MSR_BITMAP_HELPER(ret_type, action, bitop, read, 0x0) \
+ __BUILD_VMX_MSR_BITMAP_HELPER(ret_type, action, bitop, write, 0x800)
+
+BUILD_VMX_MSR_BITMAP_HELPERS(bool, test, test)
+BUILD_VMX_MSR_BITMAP_HELPERS(void, clear, __clear)
+BUILD_VMX_MSR_BITMAP_HELPERS(void, set, __set)
+
static inline u8 vmx_get_rvi(void)
{
return vmcs_read16(GUEST_INTR_STATUS) & 0xff;
@@ -435,19 +483,21 @@ BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL)
BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL)
BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL)
-static inline void vmx_register_cache_reset(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
- | (1 << VCPU_EXREG_RFLAGS)
- | (1 << VCPU_EXREG_PDPTR)
- | (1 << VCPU_EXREG_SEGMENTS)
- | (1 << VCPU_EXREG_CR0)
- | (1 << VCPU_EXREG_CR3)
- | (1 << VCPU_EXREG_CR4)
- | (1 << VCPU_EXREG_EXIT_INFO_1)
- | (1 << VCPU_EXREG_EXIT_INFO_2));
- vcpu->arch.regs_dirty = 0;
-}
+/*
+ * VMX_REGS_LAZY_LOAD_SET - The set of registers that will be updated in the
+ * cache on demand. Other registers not listed here are synced to
+ * the cache immediately after VM-Exit.
+ */
+#define VMX_REGS_LAZY_LOAD_SET ((1 << VCPU_REGS_RIP) | \
+ (1 << VCPU_REGS_RSP) | \
+ (1 << VCPU_EXREG_RFLAGS) | \
+ (1 << VCPU_EXREG_PDPTR) | \
+ (1 << VCPU_EXREG_SEGMENTS) | \
+ (1 << VCPU_EXREG_CR0) | \
+ (1 << VCPU_EXREG_CR3) | \
+ (1 << VCPU_EXREG_CR4) | \
+ (1 << VCPU_EXREG_EXIT_INFO_1) | \
+ (1 << VCPU_EXREG_EXIT_INFO_2))
static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm)
{
@@ -522,4 +572,9 @@ static inline bool vmx_guest_state_valid(struct kvm_vcpu *vcpu)
void dump_vmcs(struct kvm_vcpu *vcpu);
+static inline int vmx_get_instr_info_reg2(u32 vmx_instr_info)
+{
+ return (vmx_instr_info >> 28) & 0xf;
+}
+
#endif /* __KVM_X86_VMX_H */
diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h
index 9e9ef47e988c..5e7f41225780 100644
--- a/arch/x86/kvm/vmx/vmx_ops.h
+++ b/arch/x86/kvm/vmx/vmx_ops.h
@@ -71,6 +71,31 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field)
{
unsigned long value;
+#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+
+ asm_volatile_goto("1: vmread %[field], %[output]\n\t"
+ "jna %l[do_fail]\n\t"
+
+ _ASM_EXTABLE(1b, %l[do_exception])
+
+ : [output] "=r" (value)
+ : [field] "r" (field)
+ : "cc"
+ : do_fail, do_exception);
+
+ return value;
+
+do_fail:
+ WARN_ONCE(1, "kvm: vmread failed: field=%lx\n", field);
+ pr_warn_ratelimited("kvm: vmread failed: field=%lx\n", field);
+ return 0;
+
+do_exception:
+ kvm_spurious_fault();
+ return 0;
+
+#else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */
+
asm volatile("1: vmread %2, %1\n\t"
".byte 0x3e\n\t" /* branch taken hint */
"ja 3f\n\t"
@@ -80,9 +105,11 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field)
* @field, and bounce through the trampoline to preserve
* volatile registers.
*/
- "push $0\n\t"
+ "xorl %k1, %k1\n\t"
+ "2:\n\t"
+ "push %1\n\t"
"push %2\n\t"
- "2:call vmread_error_trampoline\n\t"
+ "call vmread_error_trampoline\n\t"
/*
* Unwind the stack. Note, the trampoline zeros out the
@@ -93,14 +120,12 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field)
"3:\n\t"
/* VMREAD faulted. As above, except push '1' for @fault. */
- ".pushsection .fixup, \"ax\"\n\t"
- "4: push $1\n\t"
- "push %2\n\t"
- "jmp 2b\n\t"
- ".popsection\n\t"
- _ASM_EXTABLE(1b, 4b)
- : ASM_CALL_CONSTRAINT, "=r"(value) : "r"(field) : "cc");
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %1)
+
+ : ASM_CALL_CONSTRAINT, "=&r"(value) : "r"(field) : "cc");
return value;
+
+#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */
}
static __always_inline u16 vmcs_read16(unsigned long field)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index aabd3a2ec1bc..76b4803dd3bd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -68,7 +68,9 @@
#include <asm/mce.h>
#include <asm/pkru.h>
#include <linux/kernel_stat.h>
-#include <asm/fpu/internal.h> /* Ugh! */
+#include <asm/fpu/api.h>
+#include <asm/fpu/xcr.h>
+#include <asm/fpu/xstate.h>
#include <asm/pvclock.h>
#include <asm/div64.h>
#include <asm/irq_remapping.h>
@@ -116,6 +118,7 @@ static void enter_smm(struct kvm_vcpu *vcpu);
static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
static void store_regs(struct kvm_vcpu *vcpu);
static int sync_regs(struct kvm_vcpu *vcpu);
+static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu);
static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
@@ -208,7 +211,7 @@ static struct kvm_user_return_msrs __percpu *user_return_msrs;
#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
| XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
- | XFEATURE_MASK_PKRU)
+ | XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE)
u64 __read_mostly host_efer;
EXPORT_SYMBOL_GPL(host_efer);
@@ -293,8 +296,6 @@ u64 __read_mostly host_xcr0;
u64 __read_mostly supported_xcr0;
EXPORT_SYMBOL_GPL(supported_xcr0);
-static struct kmem_cache *x86_fpu_cache;
-
static struct kmem_cache *x86_emulator_cache;
/*
@@ -710,6 +711,17 @@ int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
}
EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
+static int complete_emulated_insn_gp(struct kvm_vcpu *vcpu, int err)
+{
+ if (err) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE | EMULTYPE_SKIP |
+ EMULTYPE_COMPLETE_USER_EXIT);
+}
+
void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
++vcpu->stat.pf_guest;
@@ -790,30 +802,6 @@ bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
}
EXPORT_SYMBOL_GPL(kvm_require_dr);
-/*
- * This function will be used to read from the physical memory of the currently
- * running guest. The difference to kvm_vcpu_read_guest_page is that this function
- * can read from guest physical or from the guest's guest physical memory.
- */
-int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- gfn_t ngfn, void *data, int offset, int len,
- u32 access)
-{
- struct x86_exception exception;
- gfn_t real_gfn;
- gpa_t ngpa;
-
- ngpa = gfn_to_gpa(ngfn);
- real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception);
- if (real_gfn == UNMAPPED_GVA)
- return -EFAULT;
-
- real_gfn = gpa_to_gfn(real_gfn);
-
- return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len);
-}
-EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
-
static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
{
return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2);
@@ -822,37 +810,50 @@ static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
/*
* Load the pae pdptrs. Return 1 if they are all valid, 0 otherwise.
*/
-int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
+int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
- unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
+ gpa_t real_gpa;
int i;
int ret;
u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
- ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
- offset * sizeof(u64), sizeof(pdpte),
- PFERR_USER_MASK|PFERR_WRITE_MASK);
- if (ret < 0) {
- ret = 0;
- goto out;
- }
+ /*
+ * If the MMU is nested, CR3 holds an L2 GPA and needs to be translated
+ * to an L1 GPA.
+ */
+ real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(pdpt_gfn),
+ PFERR_USER_MASK | PFERR_WRITE_MASK, NULL);
+ if (real_gpa == UNMAPPED_GVA)
+ return 0;
+
+ /* Note the offset, PDPTRs are 32 byte aligned when using PAE paging. */
+ ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(real_gpa), pdpte,
+ cr3 & GENMASK(11, 5), sizeof(pdpte));
+ if (ret < 0)
+ return 0;
+
for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
if ((pdpte[i] & PT_PRESENT_MASK) &&
(pdpte[i] & pdptr_rsvd_bits(vcpu))) {
- ret = 0;
- goto out;
+ return 0;
}
}
- ret = 1;
+
+ /*
+ * Marking VCPU_EXREG_PDPTR dirty doesn't work for !tdp_enabled.
+ * Shadow page roots need to be reconstructed instead.
+ */
+ if (!tdp_enabled && memcmp(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)))
+ kvm_mmu_free_roots(vcpu, mmu, KVM_MMU_ROOT_CURRENT);
memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+ kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
vcpu->arch.pdptrs_from_userspace = false;
-out:
-
- return ret;
+ return 1;
}
EXPORT_SYMBOL_GPL(load_pdptrs);
@@ -876,7 +877,6 @@ EXPORT_SYMBOL_GPL(kvm_post_set_cr0);
int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
unsigned long old_cr0 = kvm_read_cr0(vcpu);
- unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG;
cr0 |= X86_CR0_ET;
@@ -906,11 +906,12 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
}
#endif
if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) &&
- is_pae(vcpu) && ((cr0 ^ old_cr0) & pdptr_bits) &&
- !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)))
+ is_pae(vcpu) && ((cr0 ^ old_cr0) & X86_CR0_PDPTR_BITS) &&
+ !load_pdptrs(vcpu, kvm_read_cr3(vcpu)))
return 1;
- if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
+ if (!(cr0 & X86_CR0_PG) &&
+ (is_64_bit_mode(vcpu) || kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)))
return 1;
static_call(kvm_x86_set_cr0)(vcpu, cr0);
@@ -993,7 +994,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
/*
* Do not allow the guest to set bits that we do not support
* saving. However, xcr0 bit 0 is always set, even if the
- * emulated CPU does not support XSAVE (see fx_init).
+ * emulated CPU does not support XSAVE (see kvm_vcpu_reset()).
*/
valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
if (xcr0 & ~valid_bits)
@@ -1009,6 +1010,11 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
return 1;
}
+
+ if ((xcr0 & XFEATURE_MASK_XTILE) &&
+ ((xcr0 & XFEATURE_MASK_XTILE) != XFEATURE_MASK_XTILE))
+ return 1;
+
vcpu->arch.xcr0 = xcr0;
if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
@@ -1042,17 +1048,34 @@ EXPORT_SYMBOL_GPL(kvm_is_valid_cr4);
void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4)
{
- if (((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS) ||
- (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
+ /*
+ * If any role bit is changed, the MMU needs to be reset.
+ *
+ * If CR4.PCIDE is changed 1 -> 0, the guest TLB must be flushed.
+ * If CR4.PCIDE is changed 0 -> 1, there is no need to flush the TLB
+ * according to the SDM; however, stale prev_roots could be reused
+ * incorrectly in the future after a MOV to CR3 with NOFLUSH=1, so we
+ * free them all. KVM_REQ_MMU_RELOAD is fit for the both cases; it
+ * is slow, but changing CR4.PCIDE is a rare case.
+ *
+ * If CR4.PGE is changed, the guest TLB must be flushed.
+ *
+ * Note: resetting MMU is a superset of KVM_REQ_MMU_RELOAD and
+ * KVM_REQ_MMU_RELOAD is a superset of KVM_REQ_TLB_FLUSH_GUEST, hence
+ * the usage of "else if".
+ */
+ if ((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS)
kvm_mmu_reset_context(vcpu);
+ else if ((cr4 ^ old_cr4) & X86_CR4_PCIDE)
+ kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
+ else if ((cr4 ^ old_cr4) & X86_CR4_PGE)
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_post_set_cr4);
int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
unsigned long old_cr4 = kvm_read_cr4(vcpu);
- unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
- X86_CR4_SMEP;
if (!kvm_is_valid_cr4(vcpu, cr4))
return 1;
@@ -1063,9 +1086,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
if ((cr4 ^ old_cr4) & X86_CR4_LA57)
return 1;
} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
- && ((cr4 ^ old_cr4) & pdptr_bits)
- && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
- kvm_read_cr3(vcpu)))
+ && ((cr4 ^ old_cr4) & X86_CR4_PDPTR_BITS)
+ && !load_pdptrs(vcpu, kvm_read_cr3(vcpu)))
return 1;
if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
@@ -1092,6 +1114,18 @@ static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
int i;
/*
+ * MOV CR3 and INVPCID are usually not intercepted when using TDP, but
+ * this is reachable when running EPT=1 and unrestricted_guest=0, and
+ * also via the emulator. KVM's TDP page tables are not in the scope of
+ * the invalidation, but the guest's TLB entries need to be flushed as
+ * the CPU may have cached entries in its TLB for the target PCID.
+ */
+ if (unlikely(tdp_enabled)) {
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ return;
+ }
+
+ /*
* If neither the current CR3 nor any of the prev_roots use the given
* PCID, then nothing needs to be done here because a resync will
* happen anyway before switching to any other CR3.
@@ -1101,6 +1135,14 @@ static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
+ /*
+ * If PCID is disabled, there is no need to free prev_roots even if the
+ * PCIDs for them are also 0, because MOV to CR3 always flushes the TLB
+ * with PCIDE=0.
+ */
+ if (!kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
+ return;
+
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid)
roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
@@ -1134,14 +1176,15 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
if (kvm_vcpu_is_illegal_gpa(vcpu, cr3))
return 1;
- if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
+ if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, cr3))
return 1;
if (cr3 != kvm_read_cr3(vcpu))
kvm_mmu_new_pgd(vcpu, cr3);
vcpu->arch.cr3 = cr3;
- kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
+ /* Do not call post_set_cr3, we do not get here for confidential guests. */
handle_tlb_flush:
/*
@@ -1311,7 +1354,7 @@ static const u32 msrs_to_save_all[] = {
MSR_IA32_UMWAIT_CONTROL,
MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
- MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3,
+ MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
@@ -1339,6 +1382,7 @@ static const u32 msrs_to_save_all[] = {
MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
+ MSR_IA32_XFD, MSR_IA32_XFD_ERR,
};
static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)];
@@ -1381,6 +1425,7 @@ static const u32 emulated_msrs_all[] = {
MSR_PLATFORM_INFO,
MSR_MISC_FEATURES_ENABLES,
MSR_AMD64_VIRT_SPEC_CTRL,
+ MSR_AMD64_TSC_RATIO,
MSR_IA32_POWER_CTL,
MSR_IA32_UCODE_REV,
@@ -1794,22 +1839,36 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
}
EXPORT_SYMBOL_GPL(kvm_set_msr);
-static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
+static void complete_userspace_rdmsr(struct kvm_vcpu *vcpu)
{
- int err = vcpu->run->msr.error;
- if (!err) {
+ if (!vcpu->run->msr.error) {
kvm_rax_write(vcpu, (u32)vcpu->run->msr.data);
kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
}
+}
+
+static int complete_emulated_msr_access(struct kvm_vcpu *vcpu)
+{
+ return complete_emulated_insn_gp(vcpu, vcpu->run->msr.error);
+}
- return static_call(kvm_x86_complete_emulated_msr)(vcpu, err);
+static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
+{
+ complete_userspace_rdmsr(vcpu);
+ return complete_emulated_msr_access(vcpu);
}
-static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu)
+static int complete_fast_msr_access(struct kvm_vcpu *vcpu)
{
return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error);
}
+static int complete_fast_rdmsr(struct kvm_vcpu *vcpu)
+{
+ complete_userspace_rdmsr(vcpu);
+ return complete_fast_msr_access(vcpu);
+}
+
static u64 kvm_msr_reason(int r)
{
switch (r) {
@@ -1844,18 +1903,6 @@ static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index,
return 1;
}
-static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r)
-{
- return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0,
- complete_emulated_rdmsr, r);
-}
-
-static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 data, int r)
-{
- return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data,
- complete_emulated_wrmsr, r);
-}
-
int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
{
u32 ecx = kvm_rcx_read(vcpu);
@@ -1864,18 +1911,16 @@ int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
r = kvm_get_msr(vcpu, ecx, &data);
- /* MSR read failed? See if we should ask user space */
- if (r && kvm_get_msr_user_space(vcpu, ecx, r)) {
- /* Bounce to user space */
- return 0;
- }
-
if (!r) {
trace_kvm_msr_read(ecx, data);
kvm_rax_write(vcpu, data & -1u);
kvm_rdx_write(vcpu, (data >> 32) & -1u);
} else {
+ /* MSR read failed? See if we should ask user space */
+ if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_RDMSR, 0,
+ complete_fast_rdmsr, r))
+ return 0;
trace_kvm_msr_read_ex(ecx);
}
@@ -1891,19 +1936,18 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
r = kvm_set_msr(vcpu, ecx, data);
- /* MSR write failed? See if we should ask user space */
- if (r && kvm_set_msr_user_space(vcpu, ecx, data, r))
- /* Bounce to user space */
- return 0;
-
- /* Signal all other negative errors to userspace */
- if (r < 0)
- return r;
-
- if (!r)
+ if (!r) {
trace_kvm_msr_write(ecx, data);
- else
+ } else {
+ /* MSR write failed? See if we should ask user space */
+ if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_WRMSR, data,
+ complete_fast_msr_access, r))
+ return 0;
+ /* Signal all other negative errors to userspace */
+ if (r < 0)
+ return r;
trace_kvm_msr_write_ex(ecx, data);
+ }
return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
}
@@ -2098,7 +2142,7 @@ static s64 get_kvmclock_base_ns(void)
}
#endif
-void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs)
+static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs)
{
int version;
int r;
@@ -2454,13 +2498,64 @@ static inline bool kvm_check_tsc_unstable(void)
return check_tsc_unstable();
}
+/*
+ * Infers attempts to synchronize the guest's tsc from host writes. Sets the
+ * offset for the vcpu and tracks the TSC matching generation that the vcpu
+ * participates in.
+ */
+static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc,
+ u64 ns, bool matched)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ lockdep_assert_held(&kvm->arch.tsc_write_lock);
+
+ /*
+ * We also track th most recent recorded KHZ, write and time to
+ * allow the matching interval to be extended at each write.
+ */
+ kvm->arch.last_tsc_nsec = ns;
+ kvm->arch.last_tsc_write = tsc;
+ kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
+ kvm->arch.last_tsc_offset = offset;
+
+ vcpu->arch.last_guest_tsc = tsc;
+
+ kvm_vcpu_write_tsc_offset(vcpu, offset);
+
+ if (!matched) {
+ /*
+ * We split periods of matched TSC writes into generations.
+ * For each generation, we track the original measured
+ * nanosecond time, offset, and write, so if TSCs are in
+ * sync, we can match exact offset, and if not, we can match
+ * exact software computation in compute_guest_tsc()
+ *
+ * These values are tracked in kvm->arch.cur_xxx variables.
+ */
+ kvm->arch.cur_tsc_generation++;
+ kvm->arch.cur_tsc_nsec = ns;
+ kvm->arch.cur_tsc_write = tsc;
+ kvm->arch.cur_tsc_offset = offset;
+ kvm->arch.nr_vcpus_matched_tsc = 0;
+ } else if (vcpu->arch.this_tsc_generation != kvm->arch.cur_tsc_generation) {
+ kvm->arch.nr_vcpus_matched_tsc++;
+ }
+
+ /* Keep track of which generation this VCPU has synchronized to */
+ vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
+ vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
+ vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
+
+ kvm_track_tsc_matching(vcpu);
+}
+
static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
{
struct kvm *kvm = vcpu->kvm;
u64 offset, ns, elapsed;
unsigned long flags;
- bool matched;
- bool already_matched;
+ bool matched = false;
bool synchronizing = false;
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
@@ -2506,51 +2601,10 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
offset = kvm_compute_l1_tsc_offset(vcpu, data);
}
matched = true;
- already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
- } else {
- /*
- * We split periods of matched TSC writes into generations.
- * For each generation, we track the original measured
- * nanosecond time, offset, and write, so if TSCs are in
- * sync, we can match exact offset, and if not, we can match
- * exact software computation in compute_guest_tsc()
- *
- * These values are tracked in kvm->arch.cur_xxx variables.
- */
- kvm->arch.cur_tsc_generation++;
- kvm->arch.cur_tsc_nsec = ns;
- kvm->arch.cur_tsc_write = data;
- kvm->arch.cur_tsc_offset = offset;
- matched = false;
}
- /*
- * We also track th most recent recorded KHZ, write and time to
- * allow the matching interval to be extended at each write.
- */
- kvm->arch.last_tsc_nsec = ns;
- kvm->arch.last_tsc_write = data;
- kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
-
- vcpu->arch.last_guest_tsc = data;
-
- /* Keep track of which generation this VCPU has synchronized to */
- vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
- vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
- vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
-
- kvm_vcpu_write_tsc_offset(vcpu, offset);
+ __kvm_synchronize_tsc(vcpu, offset, data, ns, matched);
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
-
- spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags);
- if (!matched) {
- kvm->arch.nr_vcpus_matched_tsc = 0;
- } else if (!already_matched) {
- kvm->arch.nr_vcpus_matched_tsc++;
- }
-
- kvm_track_tsc_matching(vcpu);
- spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags);
}
static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
@@ -2738,6 +2792,7 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
int vclock_mode;
bool host_tsc_clocksource, vcpus_matched;
+ lockdep_assert_held(&kvm->arch.tsc_write_lock);
vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
atomic_read(&kvm->online_vcpus));
@@ -2762,68 +2817,101 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
#endif
}
-void kvm_make_mclock_inprogress_request(struct kvm *kvm)
+static void kvm_make_mclock_inprogress_request(struct kvm *kvm)
{
kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
}
-static void kvm_gen_update_masterclock(struct kvm *kvm)
+static void __kvm_start_pvclock_update(struct kvm *kvm)
{
-#ifdef CONFIG_X86_64
- int i;
- struct kvm_vcpu *vcpu;
- struct kvm_arch *ka = &kvm->arch;
- unsigned long flags;
-
- kvm_hv_invalidate_tsc_page(kvm);
+ raw_spin_lock_irq(&kvm->arch.tsc_write_lock);
+ write_seqcount_begin(&kvm->arch.pvclock_sc);
+}
+static void kvm_start_pvclock_update(struct kvm *kvm)
+{
kvm_make_mclock_inprogress_request(kvm);
/* no guest entries from this point */
- spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
- pvclock_update_vm_gtod_copy(kvm);
- spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+ __kvm_start_pvclock_update(kvm);
+}
+static void kvm_end_pvclock_update(struct kvm *kvm)
+{
+ struct kvm_arch *ka = &kvm->arch;
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+
+ write_seqcount_end(&ka->pvclock_sc);
+ raw_spin_unlock_irq(&ka->tsc_write_lock);
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
/* guest entries allowed */
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
-#endif
}
-u64 get_kvmclock_ns(struct kvm *kvm)
+static void kvm_update_masterclock(struct kvm *kvm)
+{
+ kvm_hv_invalidate_tsc_page(kvm);
+ kvm_start_pvclock_update(kvm);
+ pvclock_update_vm_gtod_copy(kvm);
+ kvm_end_pvclock_update(kvm);
+}
+
+/* Called within read_seqcount_begin/retry for kvm->pvclock_sc. */
+static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
{
struct kvm_arch *ka = &kvm->arch;
struct pvclock_vcpu_time_info hv_clock;
- unsigned long flags;
- u64 ret;
-
- spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
- if (!ka->use_master_clock) {
- spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
- return get_kvmclock_base_ns() + ka->kvmclock_offset;
- }
-
- hv_clock.tsc_timestamp = ka->master_cycle_now;
- hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
- spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
/* both __this_cpu_read() and rdtsc() should be on the same cpu */
get_cpu();
- if (__this_cpu_read(cpu_tsc_khz)) {
+ data->flags = 0;
+ if (ka->use_master_clock && __this_cpu_read(cpu_tsc_khz)) {
+#ifdef CONFIG_X86_64
+ struct timespec64 ts;
+
+ if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) {
+ data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
+ data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
+ } else
+#endif
+ data->host_tsc = rdtsc();
+
+ data->flags |= KVM_CLOCK_TSC_STABLE;
+ hv_clock.tsc_timestamp = ka->master_cycle_now;
+ hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
&hv_clock.tsc_shift,
&hv_clock.tsc_to_system_mul);
- ret = __pvclock_read_cycles(&hv_clock, rdtsc());
- } else
- ret = get_kvmclock_base_ns() + ka->kvmclock_offset;
+ data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
+ } else {
+ data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
+ }
put_cpu();
+}
- return ret;
+static void get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
+{
+ struct kvm_arch *ka = &kvm->arch;
+ unsigned seq;
+
+ do {
+ seq = read_seqcount_begin(&ka->pvclock_sc);
+ __get_kvmclock(kvm, data);
+ } while (read_seqcount_retry(&ka->pvclock_sc, seq));
+}
+
+u64 get_kvmclock_ns(struct kvm *kvm)
+{
+ struct kvm_clock_data data;
+
+ get_kvmclock(kvm, &data);
+ return data.clock;
}
static void kvm_setup_pvclock_page(struct kvm_vcpu *v,
@@ -2888,6 +2976,7 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v,
static int kvm_guest_time_update(struct kvm_vcpu *v)
{
unsigned long flags, tgt_tsc_khz;
+ unsigned seq;
struct kvm_vcpu_arch *vcpu = &v->arch;
struct kvm_arch *ka = &v->kvm->arch;
s64 kernel_ns;
@@ -2902,13 +2991,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
* If the host uses TSC clock, then passthrough TSC as stable
* to the guest.
*/
- spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
- use_master_clock = ka->use_master_clock;
- if (use_master_clock) {
- host_tsc = ka->master_cycle_now;
- kernel_ns = ka->master_kernel_ns;
- }
- spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+ do {
+ seq = read_seqcount_begin(&ka->pvclock_sc);
+ use_master_clock = ka->use_master_clock;
+ if (use_master_clock) {
+ host_tsc = ka->master_cycle_now;
+ kernel_ns = ka->master_kernel_ns;
+ }
+ } while (read_seqcount_retry(&ka->pvclock_sc, seq));
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
@@ -2999,7 +3089,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
static void kvmclock_update_fn(struct work_struct *work)
{
- int i;
+ unsigned long i;
struct delayed_work *dwork = to_delayed_work(work);
struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
kvmclock_update_work);
@@ -3179,24 +3269,49 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
++vcpu->stat.tlb_flush;
if (!tdp_enabled) {
- /*
+ /*
* A TLB flush on behalf of the guest is equivalent to
* INVPCID(all), toggling CR4.PGE, etc., which requires
- * a forced sync of the shadow page tables. Unload the
- * entire MMU here and the subsequent load will sync the
- * shadow page tables, and also flush the TLB.
+ * a forced sync of the shadow page tables. Ensure all the
+ * roots are synced and the guest TLB in hardware is clean.
*/
- kvm_mmu_unload(vcpu);
- return;
+ kvm_mmu_sync_roots(vcpu);
+ kvm_mmu_sync_prev_roots(vcpu);
}
static_call(kvm_x86_tlb_flush_guest)(vcpu);
}
+
+static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.tlb_flush;
+ static_call(kvm_x86_tlb_flush_current)(vcpu);
+}
+
+/*
+ * Service "local" TLB flush requests, which are specific to the current MMU
+ * context. In addition to the generic event handling in vcpu_enter_guest(),
+ * TLB flushes that are targeted at an MMU context also need to be serviced
+ * prior before nested VM-Enter/VM-Exit.
+ */
+void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu)
+{
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
+ kvm_vcpu_flush_tlb_current(vcpu);
+
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
+ kvm_vcpu_flush_tlb_guest(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_service_local_tlb_flush_requests);
+
static void record_steal_time(struct kvm_vcpu *vcpu)
{
- struct kvm_host_map map;
- struct kvm_steal_time *st;
+ struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
+ struct kvm_steal_time __user *st;
+ struct kvm_memslots *slots;
+ u64 steal;
+ u32 version;
if (kvm_xen_msr_enabled(vcpu->kvm)) {
kvm_xen_runstate_set_running(vcpu);
@@ -3206,47 +3321,86 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
- /* -EAGAIN is returned in atomic context so we can just return. */
- if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT,
- &map, &vcpu->arch.st.cache, false))
+ if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm))
return;
- st = map.hva +
- offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+ slots = kvm_memslots(vcpu->kvm);
+
+ if (unlikely(slots->generation != ghc->generation ||
+ kvm_is_error_hva(ghc->hva) || !ghc->memslot)) {
+ gfn_t gfn = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
+
+ /* We rely on the fact that it fits in a single page. */
+ BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS);
+
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gfn, sizeof(*st)) ||
+ kvm_is_error_hva(ghc->hva) || !ghc->memslot)
+ return;
+ }
+ st = (struct kvm_steal_time __user *)ghc->hva;
/*
* Doing a TLB flush here, on the guest's behalf, can avoid
* expensive IPIs.
*/
if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
- u8 st_preempted = xchg(&st->preempted, 0);
+ u8 st_preempted = 0;
+ int err = -EFAULT;
+
+ if (!user_access_begin(st, sizeof(*st)))
+ return;
+
+ asm volatile("1: xchgb %0, %2\n"
+ "xor %1, %1\n"
+ "2:\n"
+ _ASM_EXTABLE_UA(1b, 2b)
+ : "+q" (st_preempted),
+ "+&r" (err),
+ "+m" (st->preempted));
+ if (err)
+ goto out;
+
+ user_access_end();
+
+ vcpu->arch.st.preempted = 0;
trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
st_preempted & KVM_VCPU_FLUSH_TLB);
if (st_preempted & KVM_VCPU_FLUSH_TLB)
kvm_vcpu_flush_tlb_guest(vcpu);
+
+ if (!user_access_begin(st, sizeof(*st)))
+ goto dirty;
} else {
- st->preempted = 0;
- }
+ if (!user_access_begin(st, sizeof(*st)))
+ return;
- vcpu->arch.st.preempted = 0;
+ unsafe_put_user(0, &st->preempted, out);
+ vcpu->arch.st.preempted = 0;
+ }
- if (st->version & 1)
- st->version += 1; /* first time write, random junk */
+ unsafe_get_user(version, &st->version, out);
+ if (version & 1)
+ version += 1; /* first time write, random junk */
- st->version += 1;
+ version += 1;
+ unsafe_put_user(version, &st->version, out);
smp_wmb();
- st->steal += current->sched_info.run_delay -
+ unsafe_get_user(steal, &st->steal, out);
+ steal += current->sched_info.run_delay -
vcpu->arch.st.last_steal;
vcpu->arch.st.last_steal = current->sched_info.run_delay;
+ unsafe_put_user(steal, &st->steal, out);
- smp_wmb();
-
- st->version += 1;
+ version += 1;
+ unsafe_put_user(version, &st->version, out);
- kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false);
+ out:
+ user_access_end();
+ dirty:
+ mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
}
int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
@@ -3282,7 +3436,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!msr_info->host_initiated)
return 1;
- if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) && kvm_get_msr_feature(&msr_ent))
+ if (kvm_get_msr_feature(&msr_ent))
return 1;
if (data & ~msr_ent.data)
return 1;
@@ -3452,7 +3606,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
return 1;
- if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8)))
+ if (kvm_lapic_set_pv_eoi(vcpu, data, sizeof(u8)))
return 1;
break;
@@ -3538,6 +3692,30 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
vcpu->arch.msr_misc_features_enables = data;
break;
+#ifdef CONFIG_X86_64
+ case MSR_IA32_XFD:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
+ return 1;
+
+ if (data & ~(XFEATURE_MASK_USER_DYNAMIC &
+ vcpu->arch.guest_supported_xcr0))
+ return 1;
+
+ fpu_update_guest_xfd(&vcpu->arch.guest_fpu, data);
+ break;
+ case MSR_IA32_XFD_ERR:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
+ return 1;
+
+ if (data & ~(XFEATURE_MASK_USER_DYNAMIC &
+ vcpu->arch.guest_supported_xcr0))
+ return 1;
+
+ vcpu->arch.guest_fpu.xfd_err = data;
+ break;
+#endif
default:
if (kvm_pmu_is_valid_msr(vcpu, msr))
return kvm_pmu_set_msr(vcpu, msr_info);
@@ -3858,6 +4036,22 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_K7_HWCR:
msr_info->data = vcpu->arch.msr_hwcr;
break;
+#ifdef CONFIG_X86_64
+ case MSR_IA32_XFD:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
+ return 1;
+
+ msr_info->data = vcpu->arch.guest_fpu.fpstate->xfd;
+ break;
+ case MSR_IA32_XFD_ERR:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
+ return 1;
+
+ msr_info->data = vcpu->arch.guest_fpu.xfd_err;
+ break;
+#endif
default:
if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
return kvm_pmu_get_msr(vcpu, msr_info);
@@ -4026,8 +4220,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_SGX_ATTRIBUTE:
#endif
case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
+ case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM:
case KVM_CAP_SREGS2:
case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
+ case KVM_CAP_VCPU_ATTRIBUTES:
r = 1;
break;
case KVM_CAP_EXIT_HYPERCALL:
@@ -4039,7 +4235,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_XEN_HVM:
r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
- KVM_XEN_HVM_CONFIG_SHARED_INFO;
+ KVM_XEN_HVM_CONFIG_SHARED_INFO |
+ KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL;
if (sched_info_on())
r |= KVM_XEN_HVM_CONFIG_RUNSTATE;
break;
@@ -4048,7 +4245,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = KVM_SYNC_X86_VALID_FIELDS;
break;
case KVM_CAP_ADJUST_CLOCK:
- r = KVM_CLOCK_TSC_STABLE;
+ r = KVM_CLOCK_VALID_FLAGS;
break;
case KVM_CAP_X86_DISABLE_EXITS:
r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE |
@@ -4071,13 +4268,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = !static_call(kvm_x86_cpu_has_accelerated_tpr)();
break;
case KVM_CAP_NR_VCPUS:
- r = KVM_SOFT_MAX_VCPUS;
+ r = min_t(unsigned int, num_online_cpus(), KVM_MAX_VCPUS);
break;
case KVM_CAP_MAX_VCPUS:
r = KVM_MAX_VCPUS;
break;
case KVM_CAP_MAX_VCPU_ID:
- r = KVM_MAX_VCPU_ID;
+ r = KVM_MAX_VCPU_IDS;
break;
case KVM_CAP_PV_MMU: /* obsolete */
r = 0;
@@ -4117,6 +4314,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
else
r = 0;
break;
+ case KVM_CAP_XSAVE2: {
+ u64 guest_perm = xstate_get_guest_group_perm();
+
+ r = xstate_required_size(supported_xcr0 & guest_perm, false);
+ if (r < sizeof(struct kvm_xsave))
+ r = sizeof(struct kvm_xsave);
+ break;
+ }
default:
break;
}
@@ -4285,8 +4490,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
{
- struct kvm_host_map map;
- struct kvm_steal_time *st;
+ struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
+ struct kvm_steal_time __user *st;
+ struct kvm_memslots *slots;
+ static const u8 preempted = KVM_VCPU_PREEMPTED;
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
@@ -4294,16 +4501,23 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
if (vcpu->arch.st.preempted)
return;
- if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
- &vcpu->arch.st.cache, true))
+ /* This happens on process exit */
+ if (unlikely(current->mm != vcpu->kvm->mm))
+ return;
+
+ slots = kvm_memslots(vcpu->kvm);
+
+ if (unlikely(slots->generation != ghc->generation ||
+ kvm_is_error_hva(ghc->hva) || !ghc->memslot))
return;
- st = map.hva +
- offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+ st = (struct kvm_steal_time __user *)ghc->hva;
+ BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
- st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
+ if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
+ vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
- kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
+ mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -4331,8 +4545,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
- if (vcpu->arch.apicv_active)
- static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+ static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
return kvm_apic_get_state(vcpu, s);
}
@@ -4700,144 +4913,37 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
return 0;
}
-#define XSTATE_COMPACTION_ENABLED (1ULL << 63)
-
-static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
-{
- struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
- u64 xstate_bv = xsave->header.xfeatures;
- u64 valid;
-
- /*
- * Copy legacy XSAVE area, to avoid complications with CPUID
- * leaves 0 and 1 in the loop below.
- */
- memcpy(dest, xsave, XSAVE_HDR_OFFSET);
-
- /* Set XSTATE_BV */
- xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE;
- *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv;
-
- /*
- * Copy each region from the possibly compacted offset to the
- * non-compacted offset.
- */
- valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
- while (valid) {
- u32 size, offset, ecx, edx;
- u64 xfeature_mask = valid & -valid;
- int xfeature_nr = fls64(xfeature_mask) - 1;
- void *src;
-
- cpuid_count(XSTATE_CPUID, xfeature_nr,
- &size, &offset, &ecx, &edx);
-
- if (xfeature_nr == XFEATURE_PKRU) {
- memcpy(dest + offset, &vcpu->arch.pkru,
- sizeof(vcpu->arch.pkru));
- } else {
- src = get_xsave_addr(xsave, xfeature_nr);
- if (src)
- memcpy(dest + offset, src, size);
- }
-
- valid -= xfeature_mask;
- }
-}
-
-static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
+static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
+ struct kvm_xsave *guest_xsave)
{
- struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
- u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET);
- u64 valid;
-
- /*
- * Copy legacy XSAVE area, to avoid complications with CPUID
- * leaves 0 and 1 in the loop below.
- */
- memcpy(xsave, src, XSAVE_HDR_OFFSET);
-
- /* Set XSTATE_BV and possibly XCOMP_BV. */
- xsave->header.xfeatures = xstate_bv;
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED;
-
- /*
- * Copy each region from the non-compacted offset to the
- * possibly compacted offset.
- */
- valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
- while (valid) {
- u32 size, offset, ecx, edx;
- u64 xfeature_mask = valid & -valid;
- int xfeature_nr = fls64(xfeature_mask) - 1;
-
- cpuid_count(XSTATE_CPUID, xfeature_nr,
- &size, &offset, &ecx, &edx);
-
- if (xfeature_nr == XFEATURE_PKRU) {
- memcpy(&vcpu->arch.pkru, src + offset,
- sizeof(vcpu->arch.pkru));
- } else {
- void *dest = get_xsave_addr(xsave, xfeature_nr);
-
- if (dest)
- memcpy(dest, src + offset, size);
- }
+ if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
+ return;
- valid -= xfeature_mask;
- }
+ fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu,
+ guest_xsave->region,
+ sizeof(guest_xsave->region),
+ vcpu->arch.pkru);
}
-static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
- struct kvm_xsave *guest_xsave)
+static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
+ u8 *state, unsigned int size)
{
- if (!vcpu->arch.guest_fpu)
+ if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
return;
- if (boot_cpu_has(X86_FEATURE_XSAVE)) {
- memset(guest_xsave, 0, sizeof(struct kvm_xsave));
- fill_xsave((u8 *) guest_xsave->region, vcpu);
- } else {
- memcpy(guest_xsave->region,
- &vcpu->arch.guest_fpu->state.fxsave,
- sizeof(struct fxregs_state));
- *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
- XFEATURE_MASK_FPSSE;
- }
+ fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu,
+ state, size, vcpu->arch.pkru);
}
-#define XSAVE_MXCSR_OFFSET 24
-
static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
struct kvm_xsave *guest_xsave)
{
- u64 xstate_bv;
- u32 mxcsr;
-
- if (!vcpu->arch.guest_fpu)
+ if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
return 0;
- xstate_bv = *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
- mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)];
-
- if (boot_cpu_has(X86_FEATURE_XSAVE)) {
- /*
- * Here we allow setting states that are not present in
- * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility
- * with old userspace.
- */
- if (xstate_bv & ~supported_xcr0 || mxcsr & ~mxcsr_feature_mask)
- return -EINVAL;
- load_xsave(vcpu, (u8 *)guest_xsave->region);
- } else {
- if (xstate_bv & ~XFEATURE_MASK_FPSSE ||
- mxcsr & ~mxcsr_feature_mask)
- return -EINVAL;
- memcpy(&vcpu->arch.guest_fpu->state.fxsave,
- guest_xsave->region, sizeof(struct fxregs_state));
- }
- return 0;
+ return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu,
+ guest_xsave->region,
+ supported_xcr0, &vcpu->arch.pkru);
}
static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
@@ -4892,6 +4998,115 @@ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
return 0;
}
+static int kvm_arch_tsc_has_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ int r;
+
+ switch (attr->attr) {
+ case KVM_VCPU_TSC_OFFSET:
+ r = 0;
+ break;
+ default:
+ r = -ENXIO;
+ }
+
+ return r;
+}
+
+static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ u64 __user *uaddr = (u64 __user *)(unsigned long)attr->addr;
+ int r;
+
+ if ((u64)(unsigned long)uaddr != attr->addr)
+ return -EFAULT;
+
+ switch (attr->attr) {
+ case KVM_VCPU_TSC_OFFSET:
+ r = -EFAULT;
+ if (put_user(vcpu->arch.l1_tsc_offset, uaddr))
+ break;
+ r = 0;
+ break;
+ default:
+ r = -ENXIO;
+ }
+
+ return r;
+}
+
+static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ u64 __user *uaddr = (u64 __user *)(unsigned long)attr->addr;
+ struct kvm *kvm = vcpu->kvm;
+ int r;
+
+ if ((u64)(unsigned long)uaddr != attr->addr)
+ return -EFAULT;
+
+ switch (attr->attr) {
+ case KVM_VCPU_TSC_OFFSET: {
+ u64 offset, tsc, ns;
+ unsigned long flags;
+ bool matched;
+
+ r = -EFAULT;
+ if (get_user(offset, uaddr))
+ break;
+
+ raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
+
+ matched = (vcpu->arch.virtual_tsc_khz &&
+ kvm->arch.last_tsc_khz == vcpu->arch.virtual_tsc_khz &&
+ kvm->arch.last_tsc_offset == offset);
+
+ tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
+ ns = get_kvmclock_base_ns();
+
+ __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched);
+ raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+
+ r = 0;
+ break;
+ }
+ default:
+ r = -ENXIO;
+ }
+
+ return r;
+}
+
+static int kvm_vcpu_ioctl_device_attr(struct kvm_vcpu *vcpu,
+ unsigned int ioctl,
+ void __user *argp)
+{
+ struct kvm_device_attr attr;
+ int r;
+
+ if (copy_from_user(&attr, argp, sizeof(attr)))
+ return -EFAULT;
+
+ if (attr.group != KVM_VCPU_TSC_CTRL)
+ return -ENXIO;
+
+ switch (ioctl) {
+ case KVM_HAS_DEVICE_ATTR:
+ r = kvm_arch_tsc_has_attr(vcpu, &attr);
+ break;
+ case KVM_GET_DEVICE_ATTR:
+ r = kvm_arch_tsc_get_attr(vcpu, &attr);
+ break;
+ case KVM_SET_DEVICE_ATTR:
+ r = kvm_arch_tsc_set_attr(vcpu, &attr);
+ break;
+ }
+
+ return r;
+}
+
static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
struct kvm_enable_cap *cap)
{
@@ -5015,6 +5230,17 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
struct kvm_cpuid __user *cpuid_arg = argp;
struct kvm_cpuid cpuid;
+ /*
+ * KVM does not correctly handle changing guest CPUID after KVM_RUN, as
+ * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
+ * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
+ * faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with
+ * the core vCPU model on the fly, so fail.
+ */
+ r = -EINVAL;
+ if (vcpu->arch.last_vmentry_cpu != -1)
+ goto out;
+
r = -EFAULT;
if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
goto out;
@@ -5025,6 +5251,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
struct kvm_cpuid2 __user *cpuid_arg = argp;
struct kvm_cpuid2 cpuid;
+ /*
+ * KVM_SET_CPUID{,2} after KVM_RUN is forbidded, see the comment in
+ * KVM_SET_CPUID case above.
+ */
+ r = -EINVAL;
+ if (vcpu->arch.last_vmentry_cpu != -1)
+ goto out;
+
r = -EFAULT;
if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
goto out;
@@ -5154,6 +5388,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
break;
}
case KVM_GET_XSAVE: {
+ r = -EINVAL;
+ if (vcpu->arch.guest_fpu.uabi_size > sizeof(struct kvm_xsave))
+ break;
+
u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
r = -ENOMEM;
if (!u.xsave)
@@ -5168,7 +5406,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
break;
}
case KVM_SET_XSAVE: {
- u.xsave = memdup_user(argp, sizeof(*u.xsave));
+ int size = vcpu->arch.guest_fpu.uabi_size;
+
+ u.xsave = memdup_user(argp, size);
if (IS_ERR(u.xsave)) {
r = PTR_ERR(u.xsave);
goto out_nofree;
@@ -5177,6 +5417,25 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
break;
}
+
+ case KVM_GET_XSAVE2: {
+ int size = vcpu->arch.guest_fpu.uabi_size;
+
+ u.xsave = kzalloc(size, GFP_KERNEL_ACCOUNT);
+ r = -ENOMEM;
+ if (!u.xsave)
+ break;
+
+ kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size);
+
+ r = -EFAULT;
+ if (copy_to_user(argp, u.xsave, size))
+ break;
+
+ r = 0;
+ break;
+ }
+
case KVM_GET_XCRS: {
u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
r = -ENOMEM;
@@ -5346,6 +5605,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = __set_sregs2(vcpu, u.sregs2);
break;
}
+ case KVM_HAS_DEVICE_ATTR:
+ case KVM_GET_DEVICE_ATTR:
+ case KVM_SET_DEVICE_ATTR:
+ r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp);
+ break;
default:
r = -EINVAL;
}
@@ -5536,7 +5800,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
* VM-Exit.
*/
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_vcpu_kick(vcpu);
@@ -5584,6 +5848,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
smp_wmb();
kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
+ kvm_request_apicv_update(kvm, true, APICV_INHIBIT_REASON_ABSENT);
r = 0;
split_irqchip_unlock:
mutex_unlock(&kvm->lock);
@@ -5665,6 +5930,12 @@ split_irqchip_unlock:
if (kvm_x86_ops.vm_copy_enc_context_from)
r = kvm_x86_ops.vm_copy_enc_context_from(kvm, cap->args[0]);
return r;
+ case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM:
+ r = -EINVAL;
+ if (kvm_x86_ops.vm_move_enc_context_from)
+ r = kvm_x86_ops.vm_move_enc_context_from(
+ kvm, cap->args[0]);
+ return r;
case KVM_CAP_EXIT_HYPERCALL:
if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) {
r = -EINVAL;
@@ -5798,7 +6069,8 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
static int kvm_arch_suspend_notifier(struct kvm *kvm)
{
struct kvm_vcpu *vcpu;
- int i, ret = 0;
+ unsigned long i;
+ int ret = 0;
mutex_lock(&kvm->lock);
kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -5829,6 +6101,63 @@ int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
}
#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
+static int kvm_vm_ioctl_get_clock(struct kvm *kvm, void __user *argp)
+{
+ struct kvm_clock_data data = { 0 };
+
+ get_kvmclock(kvm, &data);
+ if (copy_to_user(argp, &data, sizeof(data)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp)
+{
+ struct kvm_arch *ka = &kvm->arch;
+ struct kvm_clock_data data;
+ u64 now_raw_ns;
+
+ if (copy_from_user(&data, argp, sizeof(data)))
+ return -EFAULT;
+
+ /*
+ * Only KVM_CLOCK_REALTIME is used, but allow passing the
+ * result of KVM_GET_CLOCK back to KVM_SET_CLOCK.
+ */
+ if (data.flags & ~KVM_CLOCK_VALID_FLAGS)
+ return -EINVAL;
+
+ kvm_hv_invalidate_tsc_page(kvm);
+ kvm_start_pvclock_update(kvm);
+ pvclock_update_vm_gtod_copy(kvm);
+
+ /*
+ * This pairs with kvm_guest_time_update(): when masterclock is
+ * in use, we use master_kernel_ns + kvmclock_offset to set
+ * unsigned 'system_time' so if we use get_kvmclock_ns() (which
+ * is slightly ahead) here we risk going negative on unsigned
+ * 'system_time' when 'data.clock' is very small.
+ */
+ if (data.flags & KVM_CLOCK_REALTIME) {
+ u64 now_real_ns = ktime_get_real_ns();
+
+ /*
+ * Avoid stepping the kvmclock backwards.
+ */
+ if (now_real_ns > data.realtime)
+ data.clock += now_real_ns - data.realtime;
+ }
+
+ if (ka->use_master_clock)
+ now_raw_ns = ka->master_kernel_ns;
+ else
+ now_raw_ns = get_kvmclock_base_ns();
+ ka->kvmclock_offset = data.clock - now_raw_ns;
+ kvm_end_pvclock_update(kvm);
+ return 0;
+}
+
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -5901,6 +6230,7 @@ set_identity_unlock:
/* Write kvm->irq_routing before enabling irqchip_in_kernel. */
smp_wmb();
kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL;
+ kvm_request_apicv_update(kvm, true, APICV_INHIBIT_REASON_ABSENT);
create_irqchip_unlock:
mutex_unlock(&kvm->lock);
break;
@@ -6072,60 +6402,12 @@ set_pit2_out:
break;
}
#endif
- case KVM_SET_CLOCK: {
- struct kvm_arch *ka = &kvm->arch;
- struct kvm_clock_data user_ns;
- u64 now_ns;
-
- r = -EFAULT;
- if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
- goto out;
-
- r = -EINVAL;
- if (user_ns.flags)
- goto out;
-
- r = 0;
- /*
- * TODO: userspace has to take care of races with VCPU_RUN, so
- * kvm_gen_update_masterclock() can be cut down to locked
- * pvclock_update_vm_gtod_copy().
- */
- kvm_gen_update_masterclock(kvm);
-
- /*
- * This pairs with kvm_guest_time_update(): when masterclock is
- * in use, we use master_kernel_ns + kvmclock_offset to set
- * unsigned 'system_time' so if we use get_kvmclock_ns() (which
- * is slightly ahead) here we risk going negative on unsigned
- * 'system_time' when 'user_ns.clock' is very small.
- */
- spin_lock_irq(&ka->pvclock_gtod_sync_lock);
- if (kvm->arch.use_master_clock)
- now_ns = ka->master_kernel_ns;
- else
- now_ns = get_kvmclock_base_ns();
- ka->kvmclock_offset = user_ns.clock - now_ns;
- spin_unlock_irq(&ka->pvclock_gtod_sync_lock);
-
- kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
+ case KVM_SET_CLOCK:
+ r = kvm_vm_ioctl_set_clock(kvm, argp);
break;
- }
- case KVM_GET_CLOCK: {
- struct kvm_clock_data user_ns;
- u64 now_ns;
-
- now_ns = get_kvmclock_ns(kvm);
- user_ns.clock = now_ns;
- user_ns.flags = kvm->arch.use_master_clock ? KVM_CLOCK_TSC_STABLE : 0;
- memset(&user_ns.pad, 0, sizeof(user_ns.pad));
-
- r = -EFAULT;
- if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
- goto out;
- r = 0;
+ case KVM_GET_CLOCK:
+ r = kvm_vm_ioctl_get_clock(kvm, argp);
break;
- }
case KVM_MEMORY_ENCRYPT_OP: {
r = -ENOTTY;
if (kvm_x86_ops.mem_enc_op)
@@ -6248,6 +6530,11 @@ static void kvm_init_msr_list(void)
min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
continue;
break;
+ case MSR_IA32_XFD:
+ case MSR_IA32_XFD_ERR:
+ if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
+ continue;
+ break;
default:
break;
}
@@ -6331,13 +6618,14 @@ void kvm_get_segment(struct kvm_vcpu *vcpu,
gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
struct x86_exception *exception)
{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
gpa_t t_gpa;
BUG_ON(!mmu_is_nested(vcpu));
/* NPT walks are always user-walks */
access |= PFERR_USER_MASK;
- t_gpa = vcpu->arch.mmu->gva_to_gpa(vcpu, gpa, access, exception);
+ t_gpa = mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception);
return t_gpa;
}
@@ -6345,25 +6633,31 @@ gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+ return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
}
EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read);
gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_FETCH_MASK;
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+ return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
}
gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_WRITE_MASK;
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+ return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
}
EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write);
@@ -6371,19 +6665,21 @@ EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write);
gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
+ return mmu->gva_to_gpa(vcpu, mmu, gva, 0, exception);
}
static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
struct kvm_vcpu *vcpu, u32 access,
struct x86_exception *exception)
{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
void *data = val;
int r = X86EMUL_CONTINUE;
while (bytes) {
- gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
- exception);
+ gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
@@ -6411,13 +6707,14 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
struct x86_exception *exception)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
unsigned offset;
int ret;
/* Inline kvm_read_guest_virt_helper for speed. */
- gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access|PFERR_FETCH_MASK,
- exception);
+ gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access|PFERR_FETCH_MASK,
+ exception);
if (unlikely(gpa == UNMAPPED_GVA))
return X86EMUL_PROPAGATE_FAULT;
@@ -6476,13 +6773,12 @@ static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes
struct kvm_vcpu *vcpu, u32 access,
struct x86_exception *exception)
{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
void *data = val;
int r = X86EMUL_CONTINUE;
while (bytes) {
- gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
- access,
- exception);
+ gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
@@ -6569,6 +6865,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
gpa_t *gpa, struct x86_exception *exception,
bool write)
{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
u32 access = ((static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0)
| (write ? PFERR_WRITE_MASK : 0);
@@ -6586,7 +6883,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
return 1;
}
- *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+ *gpa = mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
if (*gpa == UNMAPPED_GVA)
return -1;
@@ -6906,7 +7203,7 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
}
static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
- unsigned short port, void *val,
+ unsigned short port,
unsigned int count, bool in)
{
vcpu->arch.pio.port = port;
@@ -6914,10 +7211,8 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
vcpu->arch.pio.count = count;
vcpu->arch.pio.size = size;
- if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
- vcpu->arch.pio.count = 0;
+ if (!kernel_pio(vcpu, vcpu->arch.pio_data))
return 1;
- }
vcpu->run->exit_reason = KVM_EXIT_IO;
vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
@@ -6929,26 +7224,44 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
return 0;
}
-static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
- unsigned short port, void *val, unsigned int count)
+static int __emulator_pio_in(struct kvm_vcpu *vcpu, int size,
+ unsigned short port, unsigned int count)
{
- int ret;
+ WARN_ON(vcpu->arch.pio.count);
+ memset(vcpu->arch.pio_data, 0, size * count);
+ return emulator_pio_in_out(vcpu, size, port, count, true);
+}
- if (vcpu->arch.pio.count)
- goto data_avail;
+static void complete_emulator_pio_in(struct kvm_vcpu *vcpu, void *val)
+{
+ int size = vcpu->arch.pio.size;
+ unsigned count = vcpu->arch.pio.count;
+ memcpy(val, vcpu->arch.pio_data, size * count);
+ trace_kvm_pio(KVM_PIO_IN, vcpu->arch.pio.port, size, count, vcpu->arch.pio_data);
+ vcpu->arch.pio.count = 0;
+}
- memset(vcpu->arch.pio_data, 0, size * count);
+static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
+ unsigned short port, void *val, unsigned int count)
+{
+ if (vcpu->arch.pio.count) {
+ /*
+ * Complete a previous iteration that required userspace I/O.
+ * Note, @count isn't guaranteed to match pio.count as userspace
+ * can modify ECX before rerunning the vCPU. Ignore any such
+ * shenanigans as KVM doesn't support modifying the rep count,
+ * and the emulator ensures @count doesn't overflow the buffer.
+ */
+ } else {
+ int r = __emulator_pio_in(vcpu, size, port, count);
+ if (!r)
+ return r;
- ret = emulator_pio_in_out(vcpu, size, port, val, count, true);
- if (ret) {
-data_avail:
- memcpy(val, vcpu->arch.pio_data, size * count);
- trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data);
- vcpu->arch.pio.count = 0;
- return 1;
+ /* Results already available, fall through. */
}
- return 0;
+ complete_emulator_pio_in(vcpu, val);
+ return 1;
}
static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
@@ -6963,9 +7276,15 @@ static int emulator_pio_out(struct kvm_vcpu *vcpu, int size,
unsigned short port, const void *val,
unsigned int count)
{
+ int ret;
+
memcpy(vcpu->arch.pio_data, val, size * count);
trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
- return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
+ ret = emulator_pio_in_out(vcpu, size, port, count, false);
+ if (ret)
+ vcpu->arch.pio.count = 0;
+
+ return ret;
}
static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
@@ -7198,7 +7517,8 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
r = kvm_get_msr(vcpu, msr_index, pdata);
- if (r && kvm_get_msr_user_space(vcpu, msr_index, r)) {
+ if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0,
+ complete_emulated_rdmsr, r)) {
/* Bounce to user space */
return X86EMUL_IO_NEEDED;
}
@@ -7214,7 +7534,8 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
r = kvm_set_msr(vcpu, msr_index, data);
- if (r && kvm_set_msr_user_space(vcpu, msr_index, data, r)) {
+ if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data,
+ complete_emulated_msr_access, r)) {
/* Bounce to user space */
return X86EMUL_IO_NEEDED;
}
@@ -7239,7 +7560,9 @@ static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase)
static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
u32 pmc)
{
- return kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc);
+ if (kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc))
+ return 0;
+ return -EINVAL;
}
static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
@@ -7475,28 +7798,77 @@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
}
EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
-static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
+static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
+ u8 ndata, u8 *insn_bytes, u8 insn_size)
{
- struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
- u32 insn_size = ctxt->fetch.end - ctxt->fetch.data;
struct kvm_run *run = vcpu->run;
+ u64 info[5];
+ u8 info_start;
+
+ /*
+ * Zero the whole array used to retrieve the exit info, as casting to
+ * u32 for select entries will leave some chunks uninitialized.
+ */
+ memset(&info, 0, sizeof(info));
+
+ static_call(kvm_x86_get_exit_info)(vcpu, (u32 *)&info[0], &info[1],
+ &info[2], (u32 *)&info[3],
+ (u32 *)&info[4]);
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION;
- run->emulation_failure.ndata = 0;
+
+ /*
+ * There's currently space for 13 entries, but 5 are used for the exit
+ * reason and info. Restrict to 4 to reduce the maintenance burden
+ * when expanding kvm_run.emulation_failure in the future.
+ */
+ if (WARN_ON_ONCE(ndata > 4))
+ ndata = 4;
+
+ /* Always include the flags as a 'data' entry. */
+ info_start = 1;
run->emulation_failure.flags = 0;
if (insn_size) {
- run->emulation_failure.ndata = 3;
+ BUILD_BUG_ON((sizeof(run->emulation_failure.insn_size) +
+ sizeof(run->emulation_failure.insn_bytes) != 16));
+ info_start += 2;
run->emulation_failure.flags |=
KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES;
run->emulation_failure.insn_size = insn_size;
memset(run->emulation_failure.insn_bytes, 0x90,
sizeof(run->emulation_failure.insn_bytes));
- memcpy(run->emulation_failure.insn_bytes,
- ctxt->fetch.data, insn_size);
+ memcpy(run->emulation_failure.insn_bytes, insn_bytes, insn_size);
}
+
+ memcpy(&run->internal.data[info_start], info, sizeof(info));
+ memcpy(&run->internal.data[info_start + ARRAY_SIZE(info)], data,
+ ndata * sizeof(data[0]));
+
+ run->emulation_failure.ndata = info_start + ARRAY_SIZE(info) + ndata;
+}
+
+static void prepare_emulation_ctxt_failure_exit(struct kvm_vcpu *vcpu)
+{
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+
+ prepare_emulation_failure_exit(vcpu, NULL, 0, ctxt->fetch.data,
+ ctxt->fetch.end - ctxt->fetch.data);
+}
+
+void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
+ u8 ndata)
+{
+ prepare_emulation_failure_exit(vcpu, data, ndata, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(__kvm_prepare_emulation_failure_exit);
+
+void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
+{
+ __kvm_prepare_emulation_failure_exit(vcpu, NULL, 0);
}
+EXPORT_SYMBOL_GPL(kvm_prepare_emulation_failure_exit);
static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
{
@@ -7512,16 +7884,14 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
if (kvm->arch.exit_on_emulation_error ||
(emulation_type & EMULTYPE_SKIP)) {
- prepare_emulation_failure_exit(vcpu);
+ prepare_emulation_ctxt_failure_exit(vcpu);
return 0;
}
kvm_queue_exception(vcpu, UD_VECTOR);
if (!is_guest_mode(vcpu) && static_call(kvm_x86_get_cpl)(vcpu) == 0) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
+ prepare_emulation_ctxt_failure_exit(vcpu);
return 0;
}
@@ -7716,6 +8086,8 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
if (unlikely(!r))
return 0;
+ kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
+
/*
* rflags is the old, "raw" value of the flags. The new value has
* not been saved yet.
@@ -7883,12 +8255,23 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
}
/*
- * Note, EMULTYPE_SKIP is intended for use *only* by vendor callbacks
- * for kvm_skip_emulated_instruction(). The caller is responsible for
- * updating interruptibility state and injecting single-step #DBs.
+ * EMULTYPE_SKIP without EMULTYPE_COMPLETE_USER_EXIT is intended for
+ * use *only* by vendor callbacks for kvm_skip_emulated_instruction().
+ * The caller is responsible for updating interruptibility state and
+ * injecting single-step #DBs.
*/
if (emulation_type & EMULTYPE_SKIP) {
- kvm_rip_write(vcpu, ctxt->_eip);
+ if (ctxt->mode != X86EMUL_MODE_PROT64)
+ ctxt->eip = (u32)ctxt->_eip;
+ else
+ ctxt->eip = ctxt->_eip;
+
+ if (emulation_type & EMULTYPE_COMPLETE_USER_EXIT) {
+ r = 1;
+ goto writeback;
+ }
+
+ kvm_rip_write(vcpu, ctxt->eip);
if (ctxt->eflags & X86_EFLAGS_RF)
kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
return 1;
@@ -7952,17 +8335,24 @@ restart:
writeback = false;
r = 0;
vcpu->arch.complete_userspace_io = complete_emulated_mmio;
+ } else if (vcpu->arch.complete_userspace_io) {
+ writeback = false;
+ r = 0;
} else if (r == EMULATION_RESTART)
goto restart;
else
r = 1;
+writeback:
if (writeback) {
unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
toggle_interruptibility(vcpu, ctxt->interruptibility);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
if (!ctxt->have_exception ||
exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
+ kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
+ if (ctxt->is_branch)
+ kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
kvm_rip_write(vcpu, ctxt->eip);
if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
r = kvm_vcpu_do_singlestep(vcpu);
@@ -8121,14 +8511,13 @@ static void tsc_khz_changed(void *data)
static void kvm_hyperv_tsc_notifier(void)
{
struct kvm *kvm;
- struct kvm_vcpu *vcpu;
int cpu;
- unsigned long flags;
mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
kvm_make_mclock_inprogress_request(kvm);
+ /* no guest entries from this point */
hyperv_stop_tsc_emulation();
/* TSC frequency always matches when on Hyper-V */
@@ -8137,18 +8526,11 @@ static void kvm_hyperv_tsc_notifier(void)
kvm_max_guest_tsc_khz = tsc_khz;
list_for_each_entry(kvm, &vm_list, vm_list) {
- struct kvm_arch *ka = &kvm->arch;
-
- spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
+ __kvm_start_pvclock_update(kvm);
pvclock_update_vm_gtod_copy(kvm);
- spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
-
- kvm_for_each_vcpu(cpu, vcpu, kvm)
- kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
-
- kvm_for_each_vcpu(cpu, vcpu, kvm)
- kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
+ kvm_end_pvclock_update(kvm);
}
+
mutex_unlock(&kvm_lock);
}
#endif
@@ -8157,7 +8539,8 @@ static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
{
struct kvm *kvm;
struct kvm_vcpu *vcpu;
- int i, send_ipi = 0;
+ int send_ipi = 0;
+ unsigned long i;
/*
* We allow guests to temporarily run on slowing clocks,
@@ -8282,57 +8665,12 @@ static void kvm_timer_init(void)
kvmclock_cpu_online, kvmclock_cpu_down_prep);
}
-DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
-EXPORT_PER_CPU_SYMBOL_GPL(current_vcpu);
-
-int kvm_is_in_guest(void)
-{
- return __this_cpu_read(current_vcpu) != NULL;
-}
-
-static int kvm_is_user_mode(void)
-{
- int user_mode = 3;
-
- if (__this_cpu_read(current_vcpu))
- user_mode = static_call(kvm_x86_get_cpl)(__this_cpu_read(current_vcpu));
-
- return user_mode != 0;
-}
-
-static unsigned long kvm_get_guest_ip(void)
-{
- unsigned long ip = 0;
-
- if (__this_cpu_read(current_vcpu))
- ip = kvm_rip_read(__this_cpu_read(current_vcpu));
-
- return ip;
-}
-
-static void kvm_handle_intel_pt_intr(void)
-{
- struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu);
-
- kvm_make_request(KVM_REQ_PMI, vcpu);
- __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
- (unsigned long *)&vcpu->arch.pmu.global_status);
-}
-
-static struct perf_guest_info_callbacks kvm_guest_cbs = {
- .is_in_guest = kvm_is_in_guest,
- .is_user_mode = kvm_is_user_mode,
- .get_guest_ip = kvm_get_guest_ip,
- .handle_intel_pt_intr = kvm_handle_intel_pt_intr,
-};
-
#ifdef CONFIG_X86_64
static void pvclock_gtod_update_fn(struct work_struct *work)
{
struct kvm *kvm;
-
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
@@ -8389,18 +8727,20 @@ int kvm_arch_init(void *opaque)
int r;
if (kvm_x86_ops.hardware_enable) {
- printk(KERN_ERR "kvm: already loaded the other module\n");
+ pr_err("kvm: already loaded vendor module '%s'\n", kvm_x86_ops.name);
r = -EEXIST;
goto out;
}
if (!ops->cpu_has_kvm_support()) {
- pr_err_ratelimited("kvm: no hardware support\n");
+ pr_err_ratelimited("kvm: no hardware support for '%s'\n",
+ ops->runtime_ops->name);
r = -EOPNOTSUPP;
goto out;
}
if (ops->disabled_by_bios()) {
- pr_err_ratelimited("kvm: disabled by bios\n");
+ pr_err_ratelimited("kvm: support for '%s' disabled by bios\n",
+ ops->runtime_ops->name);
r = -EOPNOTSUPP;
goto out;
}
@@ -8417,18 +8757,11 @@ int kvm_arch_init(void *opaque)
}
r = -ENOMEM;
- x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu),
- __alignof__(struct fpu), SLAB_ACCOUNT,
- NULL);
- if (!x86_fpu_cache) {
- printk(KERN_ERR "kvm: failed to allocate cache for x86 fpu\n");
- goto out;
- }
x86_emulator_cache = kvm_alloc_emulator_cache();
if (!x86_emulator_cache) {
pr_err("kvm: failed to allocate cache for x86 emulator\n");
- goto out_free_x86_fpu_cache;
+ goto out;
}
user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
@@ -8444,8 +8777,6 @@ int kvm_arch_init(void *opaque)
kvm_timer_init();
- perf_register_guest_info_callbacks(&kvm_guest_cbs);
-
if (boot_cpu_has(X86_FEATURE_XSAVE)) {
host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
@@ -8466,8 +8797,6 @@ out_free_percpu:
free_percpu(user_return_msrs);
out_free_x86_emulator_cache:
kmem_cache_destroy(x86_emulator_cache);
-out_free_x86_fpu_cache:
- kmem_cache_destroy(x86_fpu_cache);
out:
return r;
}
@@ -8479,7 +8808,6 @@ void kvm_arch_exit(void)
clear_hv_tscchange_cb();
#endif
kvm_lapic_exit();
- perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
@@ -8494,15 +8822,21 @@ void kvm_arch_exit(void)
kvm_mmu_module_exit();
free_percpu(user_return_msrs);
kmem_cache_destroy(x86_emulator_cache);
- kmem_cache_destroy(x86_fpu_cache);
#ifdef CONFIG_KVM_XEN
static_key_deferred_flush(&kvm_xen_enabled);
WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key));
#endif
}
-static int __kvm_vcpu_halt(struct kvm_vcpu *vcpu, int state, int reason)
+static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason)
{
+ /*
+ * The vCPU has halted, e.g. executed HLT. Update the run state if the
+ * local APIC is in-kernel, the run loop will detect the non-runnable
+ * state and halt the vCPU. Exit to userspace if the local APIC is
+ * managed by userspace, in which case userspace is responsible for
+ * handling wake events.
+ */
++vcpu->stat.halt_exits;
if (lapic_in_kernel(vcpu)) {
vcpu->arch.mp_state = state;
@@ -8513,11 +8847,11 @@ static int __kvm_vcpu_halt(struct kvm_vcpu *vcpu, int state, int reason)
}
}
-int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
+int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu)
{
- return __kvm_vcpu_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT);
+ return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT);
}
-EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
+EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip);
int kvm_emulate_halt(struct kvm_vcpu *vcpu)
{
@@ -8526,7 +8860,7 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
* TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
* KVM_EXIT_DEBUG here.
*/
- return kvm_vcpu_halt(vcpu) && ret;
+ return kvm_emulate_halt_noskip(vcpu) && ret;
}
EXPORT_SYMBOL_GPL(kvm_emulate_halt);
@@ -8534,7 +8868,8 @@ int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu)
{
int ret = kvm_skip_emulated_instruction(vcpu);
- return __kvm_vcpu_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD, KVM_EXIT_AP_RESET_HOLD) && ret;
+ return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD,
+ KVM_EXIT_AP_RESET_HOLD) && ret;
}
EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold);
@@ -8595,12 +8930,11 @@ EXPORT_SYMBOL_GPL(kvm_apicv_activated);
static void kvm_apicv_init(struct kvm *kvm)
{
- mutex_init(&kvm->arch.apicv_update_lock);
+ init_rwsem(&kvm->arch.apicv_update_lock);
- if (enable_apicv)
- clear_bit(APICV_INHIBIT_REASON_DISABLE,
- &kvm->arch.apicv_inhibit_reasons);
- else
+ set_bit(APICV_INHIBIT_REASON_ABSENT,
+ &kvm->arch.apicv_inhibit_reasons);
+ if (!enable_apicv)
set_bit(APICV_INHIBIT_REASON_DISABLE,
&kvm->arch.apicv_inhibit_reasons);
}
@@ -8669,7 +9003,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
trace_kvm_hypercall(nr, a0, a1, a2, a3);
- op_64_bit = is_64_bit_mode(vcpu);
+ op_64_bit = is_64_bit_hypercall(vcpu);
if (!op_64_bit) {
nr &= 0xFFFFFFFF;
a0 &= 0xFFFFFFFF;
@@ -8773,19 +9107,20 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
{
struct kvm_run *kvm_run = vcpu->run;
- /*
- * if_flag is obsolete and useless, so do not bother
- * setting it for SEV-ES guests. Userspace can just
- * use kvm_run->ready_for_interrupt_injection.
- */
- kvm_run->if_flag = !vcpu->arch.guest_state_protected
- && (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
-
+ kvm_run->if_flag = static_call(kvm_x86_get_if_flag)(vcpu);
kvm_run->cr8 = kvm_get_cr8(vcpu);
kvm_run->apic_base = kvm_get_apic_base(vcpu);
+
+ /*
+ * The call to kvm_ready_for_interrupt_injection() may end up in
+ * kvm_xen_has_interrupt() which may require the srcu lock to be
+ * held, to protect against changes in the vcpu_info address.
+ */
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_run->ready_for_interrupt_injection =
pic_in_kernel(vcpu->kvm) ||
kvm_vcpu_ready_for_interrupt_injection(vcpu);
+ srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
if (is_smm(vcpu))
kvm_run->flags |= KVM_RUN_X86_SMM;
@@ -9242,14 +9577,7 @@ static void process_smi(struct kvm_vcpu *vcpu)
void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
unsigned long *vcpu_bitmap)
{
- cpumask_var_t cpus;
-
- zalloc_cpumask_var(&cpus, GFP_ATOMIC);
-
- kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC,
- NULL, vcpu_bitmap, cpus);
-
- free_cpumask_var(cpus);
+ kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC, vcpu_bitmap);
}
void kvm_make_scan_ioapic_request(struct kvm *kvm)
@@ -9264,7 +9592,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
if (!lapic_in_kernel(vcpu))
return;
- mutex_lock(&vcpu->kvm->arch.apicv_update_lock);
+ down_read(&vcpu->kvm->arch.apicv_update_lock);
activate = kvm_apicv_activated(vcpu->kvm);
if (vcpu->arch.apicv_active == activate)
@@ -9284,7 +9612,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
kvm_make_request(KVM_REQ_EVENT, vcpu);
out:
- mutex_unlock(&vcpu->kvm->arch.apicv_update_lock);
+ up_read(&vcpu->kvm->arch.apicv_update_lock);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
@@ -9292,6 +9620,8 @@ void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
{
unsigned long old, new;
+ lockdep_assert_held_write(&kvm->arch.apicv_update_lock);
+
if (!kvm_x86_ops.check_apicv_inhibit_reasons ||
!static_call(kvm_x86_check_apicv_inhibit_reasons)(bit))
return;
@@ -9305,6 +9635,18 @@ void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
if (!!old != !!new) {
trace_kvm_apicv_update_request(activate, bit);
+ /*
+ * Kick all vCPUs before setting apicv_inhibit_reasons to avoid
+ * false positives in the sanity check WARN in svm_vcpu_run().
+ * This task will wait for all vCPUs to ack the kick IRQ before
+ * updating apicv_inhibit_reasons, and all other vCPUs will
+ * block on acquiring apicv_update_lock so that vCPUs can't
+ * redo svm_vcpu_run() without seeing the new inhibit state.
+ *
+ * Note, holding apicv_update_lock and taking it in the read
+ * side (handling the request) also prevents other vCPUs from
+ * servicing the request with a stale apicv_inhibit_reasons.
+ */
kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE);
kvm->arch.apicv_inhibit_reasons = new;
if (new) {
@@ -9318,9 +9660,9 @@ EXPORT_SYMBOL_GPL(__kvm_request_apicv_update);
void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
{
- mutex_lock(&kvm->arch.apicv_update_lock);
+ down_write(&kvm->arch.apicv_update_lock);
__kvm_request_apicv_update(kvm, activate, bit);
- mutex_unlock(&kvm->arch.apicv_update_lock);
+ up_write(&kvm->arch.apicv_update_lock);
}
EXPORT_SYMBOL_GPL(kvm_request_apicv_update);
@@ -9334,8 +9676,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
if (irqchip_split(vcpu->kvm))
kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
else {
- if (vcpu->arch.apicv_active)
- static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+ static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
if (ioapic_in_kernel(vcpu->kvm))
kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
}
@@ -9353,12 +9694,16 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
if (!kvm_apic_hw_enabled(vcpu->arch.apic))
return;
- if (to_hv_vcpu(vcpu))
+ if (to_hv_vcpu(vcpu)) {
bitmap_or((ulong *)eoi_exit_bitmap,
vcpu->arch.ioapic_handled_vectors,
to_hv_synic(vcpu)->vec_bitmap, 256);
+ static_call(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
+ return;
+ }
- static_call(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
+ static_call(kvm_x86_load_eoi_exitmap)(
+ vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors);
}
void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
@@ -9417,7 +9762,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
if (kvm_request_pending(vcpu)) {
- if (kvm_check_request(KVM_REQ_VM_BUGGED, vcpu)) {
+ if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
r = -EIO;
goto out;
}
@@ -9432,7 +9777,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
__kvm_migrate_timers(vcpu);
if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
- kvm_gen_update_masterclock(vcpu->kvm);
+ kvm_update_masterclock(vcpu->kvm);
if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
kvm_gen_kvmclock_update(vcpu);
if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
@@ -9450,10 +9795,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
/* Flushing all ASIDs flushes the current ASID... */
kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
- if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
- kvm_vcpu_flush_tlb_current(vcpu);
- if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
- kvm_vcpu_flush_tlb_guest(vcpu);
+ kvm_service_local_tlb_flush_requests(vcpu);
if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
@@ -9604,10 +9946,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
/*
* This handles the case where a posted interrupt was
- * notified with kvm_vcpu_kick.
+ * notified with kvm_vcpu_kick. Assigned devices can
+ * use the POSTED_INTR_VECTOR even if APICv is disabled,
+ * so do it even if APICv is disabled on this vCPU.
*/
- if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
- static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+ if (kvm_lapic_enabled(vcpu))
+ static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
if (kvm_vcpu_exit_request(vcpu)) {
vcpu->mode = OUTSIDE_GUEST_MODE;
@@ -9628,6 +9972,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (test_thread_flag(TIF_NEED_FPU_LOAD))
switch_fpu_return();
+ if (vcpu->arch.guest_fpu.xfd_err)
+ wrmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
+
if (unlikely(vcpu->arch.switch_db_regs)) {
set_debugreg(0, 7);
set_debugreg(vcpu->arch.eff_db[0], 0);
@@ -9639,18 +9986,26 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
for (;;) {
+ /*
+ * Assert that vCPU vs. VM APICv state is consistent. An APICv
+ * update must kick and wait for all vCPUs before toggling the
+ * per-VM state, and responsing vCPUs must wait for the update
+ * to complete before servicing KVM_REQ_APICV_UPDATE.
+ */
+ WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu));
+
exit_fastpath = static_call(kvm_x86_run)(vcpu);
if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
break;
- if (unlikely(kvm_vcpu_exit_request(vcpu))) {
+ if (kvm_lapic_enabled(vcpu))
+ static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
+
+ if (unlikely(kvm_vcpu_exit_request(vcpu))) {
exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
break;
}
-
- if (vcpu->arch.apicv_active)
- static_call(kvm_x86_sync_pir_to_irr)(vcpu);
- }
+ }
/*
* Do this here before restoring debug registers on the host. And
@@ -9681,8 +10036,19 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
+ /*
+ * Sync xfd before calling handle_exit_irqoff() which may
+ * rely on the fact that guest_fpu::xfd is up-to-date (e.g.
+ * in #NM irqoff handler).
+ */
+ if (vcpu->arch.xfd_no_write_intercept)
+ fpu_sync_guest_vmexit_xfd_state();
+
static_call(kvm_x86_handle_exit_irqoff)(vcpu);
+ if (vcpu->arch.guest_fpu.xfd_err)
+ wrmsrl(MSR_IA32_XFD_ERR, 0);
+
/*
* Consume any pending interrupts, including the possible source of
* VM-Exit on SVM and any ticks that occur between VM-Exit and now.
@@ -9690,7 +10056,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
* interrupts on processors that implement an interrupt shadow, the
* stat.exits increment will do nicely.
*/
- kvm_before_interrupt(vcpu);
+ kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
local_irq_enable();
++vcpu->stat.exits;
local_irq_disable();
@@ -9750,7 +10116,10 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
if (!kvm_arch_vcpu_runnable(vcpu) &&
(!kvm_x86_ops.pre_block || static_call(kvm_x86_pre_block)(vcpu) == 0)) {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
- kvm_vcpu_block(vcpu);
+ if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
+ kvm_vcpu_halt(vcpu);
+ else
+ kvm_vcpu_block(vcpu);
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
if (kvm_x86_ops.post_block)
@@ -9913,58 +10282,21 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
return 0;
}
-static void kvm_save_current_fpu(struct fpu *fpu)
-{
- /*
- * If the target FPU state is not resident in the CPU registers, just
- * memcpy() from current, else save CPU state directly to the target.
- */
- if (test_thread_flag(TIF_NEED_FPU_LOAD))
- memcpy(&fpu->state, &current->thread.fpu.state,
- fpu_kernel_xstate_size);
- else
- save_fpregs_to_fpstate(fpu);
-}
-
/* Swap (qemu) user FPU context for the guest FPU context. */
static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
- fpregs_lock();
-
- kvm_save_current_fpu(vcpu->arch.user_fpu);
-
/*
- * Guests with protected state can't have it set by the hypervisor,
- * so skip trying to set it.
+ * Exclude PKRU from restore as restored separately in
+ * kvm_x86_ops.run().
*/
- if (vcpu->arch.guest_fpu)
- /* PKRU is separately restored in kvm_x86_ops.run. */
- __restore_fpregs_from_fpstate(&vcpu->arch.guest_fpu->state,
- ~XFEATURE_MASK_PKRU);
-
- fpregs_mark_activate();
- fpregs_unlock();
-
+ fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true);
trace_kvm_fpu(1);
}
/* When vcpu_run ends, restore user space FPU context. */
static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
- fpregs_lock();
-
- /*
- * Guests with protected state can't have it read by the hypervisor,
- * so skip trying to save it.
- */
- if (vcpu->arch.guest_fpu)
- kvm_save_current_fpu(vcpu->arch.guest_fpu);
-
- restore_fpregs_from_fpstate(&vcpu->arch.user_fpu->state);
-
- fpregs_mark_activate();
- fpregs_unlock();
-
+ fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false);
++vcpu->stat.fpu_reload;
trace_kvm_fpu(0);
}
@@ -10347,7 +10679,8 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
vcpu->arch.cr2 = sregs->cr2;
*mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
vcpu->arch.cr3 = sregs->cr3;
- kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
+ static_call_cond(kvm_x86_post_set_cr3)(vcpu, sregs->cr3);
kvm_set_cr8(vcpu, sregs->cr8);
@@ -10364,7 +10697,7 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
if (update_pdptrs) {
idx = srcu_read_lock(&vcpu->kvm->srcu);
if (is_pae_paging(vcpu)) {
- load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
+ load_pdptrs(vcpu, kvm_read_cr3(vcpu));
*mmu_reset_needed = 1;
}
srcu_read_unlock(&vcpu->kvm->srcu, idx);
@@ -10458,6 +10791,24 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
return ret;
}
+static void kvm_arch_vcpu_guestdbg_update_apicv_inhibit(struct kvm *kvm)
+{
+ bool inhibit = false;
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+
+ down_write(&kvm->arch.apicv_update_lock);
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ) {
+ inhibit = true;
+ break;
+ }
+ }
+ __kvm_request_apicv_update(kvm, !inhibit, APICV_INHIBIT_REASON_BLOCKIRQ);
+ up_write(&kvm->arch.apicv_update_lock);
+}
+
int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
struct kvm_guest_debug *dbg)
{
@@ -10510,6 +10861,8 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
static_call(kvm_x86_update_exception_bitmap)(vcpu);
+ kvm_arch_vcpu_guestdbg_update_apicv_inhibit(vcpu->kvm);
+
r = 0;
out:
@@ -10545,12 +10898,12 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
struct fxregs_state *fxsave;
- if (!vcpu->arch.guest_fpu)
+ if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
return 0;
vcpu_load(vcpu);
- fxsave = &vcpu->arch.guest_fpu->state.fxsave;
+ fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave;
memcpy(fpu->fpr, fxsave->st_space, 128);
fpu->fcw = fxsave->cwd;
fpu->fsw = fxsave->swd;
@@ -10568,12 +10921,12 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
struct fxregs_state *fxsave;
- if (!vcpu->arch.guest_fpu)
+ if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
return 0;
vcpu_load(vcpu);
- fxsave = &vcpu->arch.guest_fpu->state.fxsave;
+ fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave;
memcpy(fxsave->st_space, fpu->fpr, 128);
fxsave->cwd = fpu->fcw;
@@ -10624,33 +10977,6 @@ static int sync_regs(struct kvm_vcpu *vcpu)
return 0;
}
-static void fx_init(struct kvm_vcpu *vcpu)
-{
- if (!vcpu->arch.guest_fpu)
- return;
-
- fpstate_init(&vcpu->arch.guest_fpu->state);
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv =
- host_xcr0 | XSTATE_COMPACTION_ENABLED;
-
- /*
- * Ensure guest xcr0 is valid for loading
- */
- vcpu->arch.xcr0 = XFEATURE_MASK_FP;
-
- vcpu->arch.cr0 |= X86_CR0_ET;
-}
-
-void kvm_free_guest_fpu(struct kvm_vcpu *vcpu)
-{
- if (vcpu->arch.guest_fpu) {
- kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
- vcpu->arch.guest_fpu = NULL;
- }
-}
-EXPORT_SYMBOL_GPL(kvm_free_guest_fpu);
-
int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
{
if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
@@ -10707,20 +11033,10 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
if (!alloc_emulate_ctxt(vcpu))
goto free_wbinvd_dirty_mask;
- vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
- GFP_KERNEL_ACCOUNT);
- if (!vcpu->arch.user_fpu) {
- pr_err("kvm: failed to allocate userspace's fpu\n");
- goto free_emulate_ctxt;
- }
-
- vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
- GFP_KERNEL_ACCOUNT);
- if (!vcpu->arch.guest_fpu) {
+ if (!fpu_alloc_guest_fpstate(&vcpu->arch.guest_fpu)) {
pr_err("kvm: failed to allocate vcpu's fpu\n");
- goto free_user_fpu;
+ goto free_emulate_ctxt;
}
- fx_init(vcpu);
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
@@ -10752,9 +11068,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
return 0;
free_guest_fpu:
- kvm_free_guest_fpu(vcpu);
-free_user_fpu:
- kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
+ fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
free_emulate_ctxt:
kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
free_wbinvd_dirty_mask:
@@ -10792,19 +11106,15 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
- struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache;
int idx;
- kvm_release_pfn(cache->pfn, cache->dirty, cache);
-
kvmclock_reset(vcpu);
static_call(kvm_x86_vcpu_free)(vcpu);
kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
- kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
- kvm_free_guest_fpu(vcpu);
+ fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
kvm_hv_vcpu_uninit(vcpu);
kvm_pmu_destroy(vcpu);
@@ -10821,9 +11131,19 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
+ struct kvm_cpuid_entry2 *cpuid_0x1;
unsigned long old_cr0 = kvm_read_cr0(vcpu);
unsigned long new_cr0;
- u32 eax, dummy;
+
+ /*
+ * Several of the "set" flows, e.g. ->set_cr0(), read other registers
+ * to handle side effects. RESET emulation hits those flows and relies
+ * on emulated/virtualized registers, including those that are loaded
+ * into hardware, to be zeroed at vCPU creation. Use CRs as a sentinel
+ * to detect improper or missing initialization.
+ */
+ WARN_ON_ONCE(!init_event &&
+ (old_cr0 || kvm_read_cr3(vcpu) || kvm_read_cr4(vcpu)));
kvm_lapic_reset(vcpu, init_event);
@@ -10856,8 +11176,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_async_pf_hash_reset(vcpu);
vcpu->arch.apf.halted = false;
- if (vcpu->arch.guest_fpu && kvm_mpx_supported()) {
- void *mpx_state_buffer;
+ if (vcpu->arch.guest_fpu.fpstate && kvm_mpx_supported()) {
+ struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate;
/*
* To avoid have the INIT path from kvm_apic_has_events() that be
@@ -10865,14 +11185,10 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
*/
if (init_event)
kvm_put_guest_fpu(vcpu);
- mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
- XFEATURE_BNDREGS);
- if (mpx_state_buffer)
- memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state));
- mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
- XFEATURE_BNDCSR);
- if (mpx_state_buffer)
- memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr));
+
+ fpstate_clear_xstate_component(fpstate, XFEATURE_BNDREGS);
+ fpstate_clear_xstate_component(fpstate, XFEATURE_BNDCSR);
+
if (init_event)
kvm_load_guest_fpu(vcpu);
}
@@ -10886,21 +11202,19 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vcpu->arch.xcr0 = XFEATURE_MASK_FP;
}
+ /* All GPRs except RDX (handled below) are zeroed on RESET/INIT. */
memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
- vcpu->arch.regs_avail = ~0;
- vcpu->arch.regs_dirty = ~0;
+ kvm_register_mark_dirty(vcpu, VCPU_REGS_RSP);
/*
* Fall back to KVM's default Family/Model/Stepping of 0x600 (P6/Athlon)
* if no CPUID match is found. Note, it's impossible to get a match at
* RESET since KVM emulates RESET before exposing the vCPU to userspace,
- * i.e. it'simpossible for kvm_cpuid() to find a valid entry on RESET.
- * But, go through the motions in case that's ever remedied.
+ * i.e. it's impossible for kvm_find_cpuid_entry() to find a valid entry
+ * on RESET. But, go through the motions in case that's ever remedied.
*/
- eax = 1;
- if (!kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true))
- eax = 0x600;
- kvm_rdx_write(vcpu, eax);
+ cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1, 0);
+ kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600);
vcpu->arch.ia32_xss = 0;
@@ -10969,7 +11283,7 @@ int kvm_arch_hardware_enable(void)
{
struct kvm *kvm;
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
int ret;
u64 local_tsc;
u64 max_tsc = 0;
@@ -11079,6 +11393,8 @@ int kvm_arch_hardware_setup(void *opaque)
memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
kvm_ops_static_call_update();
+ kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
+
if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
supported_xss = 0;
@@ -11106,6 +11422,8 @@ int kvm_arch_hardware_setup(void *opaque)
void kvm_arch_hardware_unsetup(void)
{
+ kvm_unregister_perf_callbacks();
+
static_call(kvm_x86_hardware_unsetup)();
}
@@ -11152,13 +11470,14 @@ void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
void kvm_arch_free_vm(struct kvm *kvm)
{
kfree(to_kvm_hv(kvm)->hv_pa_pg);
- vfree(kvm);
+ __kvm_arch_free_vm(kvm);
}
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
int ret;
+ unsigned long flags;
if (type)
return -EINVAL;
@@ -11182,10 +11501,12 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
raw_spin_lock_init(&kvm->arch.tsc_write_lock);
mutex_init(&kvm->arch.apic_map_lock);
- spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
-
+ seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock);
kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
+
+ raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
pvclock_update_vm_gtod_copy(kvm);
+ raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
kvm->arch.guest_can_read_msr_platform_info = true;
@@ -11219,7 +11540,7 @@ static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
static void kvm_free_vcpus(struct kvm *kvm)
{
- unsigned int i;
+ unsigned long i;
struct kvm_vcpu *vcpu;
/*
@@ -11229,15 +11550,8 @@ static void kvm_free_vcpus(struct kvm *kvm)
kvm_clear_async_pf_completion_queue(vcpu);
kvm_unload_vcpu_mmu(vcpu);
}
- kvm_for_each_vcpu(i, vcpu, kvm)
- kvm_vcpu_destroy(vcpu);
-
- mutex_lock(&kvm->lock);
- for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
- kvm->vcpus[i] = NULL;
- atomic_set(&kvm->online_vcpus, 0);
- mutex_unlock(&kvm->lock);
+ kvm_destroy_vcpus(kvm);
}
void kvm_arch_sync_events(struct kvm *kvm)
@@ -11382,8 +11696,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
kvm_page_track_free_memslot(slot);
}
-static int memslot_rmap_alloc(struct kvm_memory_slot *slot,
- unsigned long npages)
+int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages)
{
const int sz = sizeof(*slot->arch.rmap[0]);
int i;
@@ -11392,7 +11705,8 @@ static int memslot_rmap_alloc(struct kvm_memory_slot *slot,
int level = i + 1;
int lpages = __kvm_mmu_slot_lpages(slot, npages, level);
- WARN_ON(slot->arch.rmap[i]);
+ if (slot->arch.rmap[i])
+ continue;
slot->arch.rmap[i] = kvcalloc(lpages, sz, GFP_KERNEL_ACCOUNT);
if (!slot->arch.rmap[i]) {
@@ -11404,54 +11718,10 @@ static int memslot_rmap_alloc(struct kvm_memory_slot *slot,
return 0;
}
-int alloc_all_memslots_rmaps(struct kvm *kvm)
-{
- struct kvm_memslots *slots;
- struct kvm_memory_slot *slot;
- int r, i;
-
- /*
- * Check if memslots alreday have rmaps early before acquiring
- * the slots_arch_lock below.
- */
- if (kvm_memslots_have_rmaps(kvm))
- return 0;
-
- mutex_lock(&kvm->slots_arch_lock);
-
- /*
- * Read memslots_have_rmaps again, under the slots arch lock,
- * before allocating the rmaps
- */
- if (kvm_memslots_have_rmaps(kvm)) {
- mutex_unlock(&kvm->slots_arch_lock);
- return 0;
- }
-
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
- slots = __kvm_memslots(kvm, i);
- kvm_for_each_memslot(slot, slots) {
- r = memslot_rmap_alloc(slot, slot->npages);
- if (r) {
- mutex_unlock(&kvm->slots_arch_lock);
- return r;
- }
- }
- }
-
- /*
- * Ensure that memslots_have_rmaps becomes true strictly after
- * all the rmap pointers are set.
- */
- smp_store_release(&kvm->arch.memslots_have_rmaps, true);
- mutex_unlock(&kvm->slots_arch_lock);
- return 0;
-}
-
static int kvm_alloc_memslot_metadata(struct kvm *kvm,
- struct kvm_memory_slot *slot,
- unsigned long npages)
+ struct kvm_memory_slot *slot)
{
+ unsigned long npages = slot->npages;
int i, r;
/*
@@ -11498,7 +11768,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
}
}
- if (kvm_page_track_create_memslot(slot, npages))
+ if (kvm_page_track_create_memslot(kvm, slot, npages))
goto out_free;
return 0;
@@ -11516,7 +11786,7 @@ out_free:
void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
/*
* memslots->generation has been incremented.
@@ -11530,13 +11800,18 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
}
int kvm_arch_prepare_memory_region(struct kvm *kvm,
- struct kvm_memory_slot *memslot,
- const struct kvm_userspace_memory_region *mem,
- enum kvm_mr_change change)
+ const struct kvm_memory_slot *old,
+ struct kvm_memory_slot *new,
+ enum kvm_mr_change change)
{
if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
- return kvm_alloc_memslot_metadata(kvm, memslot,
- mem->memory_size >> PAGE_SHIFT);
+ return kvm_alloc_memslot_metadata(kvm, new);
+
+ if (change == KVM_MR_FLAGS_ONLY)
+ memcpy(&new->arch, &old->arch, sizeof(old->arch));
+ else if (WARN_ON_ONCE(change != KVM_MR_DELETE))
+ return -EIO;
+
return 0;
}
@@ -11560,13 +11835,15 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
const struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
- bool log_dirty_pages = new->flags & KVM_MEM_LOG_DIRTY_PAGES;
+ u32 old_flags = old ? old->flags : 0;
+ u32 new_flags = new ? new->flags : 0;
+ bool log_dirty_pages = new_flags & KVM_MEM_LOG_DIRTY_PAGES;
/*
* Update CPU dirty logging if dirty logging is being toggled. This
* applies to all operations.
*/
- if ((old->flags ^ new->flags) & KVM_MEM_LOG_DIRTY_PAGES)
+ if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES)
kvm_mmu_update_cpu_dirty_logging(kvm, log_dirty_pages);
/*
@@ -11584,7 +11861,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
* MOVE/DELETE: The old mappings will already have been cleaned up by
* kvm_arch_flush_shadow_memslot().
*/
- if ((change != KVM_MR_FLAGS_ONLY) || (new->flags & KVM_MEM_READONLY))
+ if ((change != KVM_MR_FLAGS_ONLY) || (new_flags & KVM_MEM_READONLY))
return;
/*
@@ -11592,7 +11869,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
* other flag is LOG_DIRTY_PAGES, i.e. something is wrong if dirty
* logging isn't being toggled on or off.
*/
- if (WARN_ON_ONCE(!((old->flags ^ new->flags) & KVM_MEM_LOG_DIRTY_PAGES)))
+ if (WARN_ON_ONCE(!((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES)))
return;
if (!log_dirty_pages) {
@@ -11628,14 +11905,18 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
}
void kvm_arch_commit_memory_region(struct kvm *kvm,
- const struct kvm_userspace_memory_region *mem,
struct kvm_memory_slot *old,
const struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
- if (!kvm->arch.n_requested_mmu_pages)
- kvm_mmu_change_mmu_pages(kvm,
- kvm_mmu_calculate_default_mmu_pages(kvm));
+ if (!kvm->arch.n_requested_mmu_pages &&
+ (change == KVM_MR_CREATE || change == KVM_MR_DELETE)) {
+ unsigned long nr_mmu_pages;
+
+ nr_mmu_pages = kvm->nr_memslot_pages / KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO;
+ nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
+ kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
+ }
kvm_mmu_slot_apply_flags(kvm, old, new, change);
@@ -11736,6 +12017,11 @@ bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
return vcpu->arch.preempted_in_kernel;
}
+unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu)
+{
+ return kvm_rip_read(vcpu);
+}
+
int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
{
return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
@@ -12096,6 +12382,15 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
return static_call(kvm_x86_update_pi_irte)(kvm, host_irq, guest_irq, set);
}
+bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
+ struct kvm_kernel_irq_routing_entry *new)
+{
+ if (new->type != KVM_IRQ_ROUTING_MSI)
+ return true;
+
+ return !!memcmp(&old->msi, &new->msi, sizeof(new->msi));
+}
+
bool kvm_vector_hashing_enabled(void)
{
return vector_hashing;
@@ -12136,12 +12431,13 @@ EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value);
void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code)
{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
struct x86_exception fault;
u32 access = error_code &
(PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK);
if (!(error_code & PFERR_PRESENT_MASK) ||
- vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, &fault) != UNMAPPED_GVA) {
+ mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != UNMAPPED_GVA) {
/*
* If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
* tables probably do not match the TLB. Just proceed
@@ -12177,9 +12473,7 @@ int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
* doesn't seem to be a real use-case behind such requests, just return
* KVM_EXIT_INTERNAL_ERROR for now.
*/
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
+ kvm_prepare_emulation_failure_exit(vcpu);
return 0;
}
@@ -12239,7 +12533,8 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
return kvm_skip_emulated_instruction(vcpu);
default:
- BUG(); /* We have already checked above that type <= 3 */
+ kvm_inject_gp(vcpu, 0);
+ return 1;
}
}
EXPORT_SYMBOL_GPL(kvm_handle_invpcid);
@@ -12367,44 +12662,81 @@ int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
}
EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_read);
-static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
+static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
+ unsigned int port);
+
+static int complete_sev_es_emulated_outs(struct kvm_vcpu *vcpu)
{
- memcpy(vcpu->arch.guest_ins_data, vcpu->arch.pio_data,
- vcpu->arch.pio.count * vcpu->arch.pio.size);
- vcpu->arch.pio.count = 0;
+ int size = vcpu->arch.pio.size;
+ int port = vcpu->arch.pio.port;
+ vcpu->arch.pio.count = 0;
+ if (vcpu->arch.sev_pio_count)
+ return kvm_sev_es_outs(vcpu, size, port);
return 1;
}
static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
- unsigned int port, void *data, unsigned int count)
+ unsigned int port)
{
- int ret;
-
- ret = emulator_pio_out_emulated(vcpu->arch.emulate_ctxt, size, port,
- data, count);
- if (ret)
- return ret;
+ for (;;) {
+ unsigned int count =
+ min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
+ int ret = emulator_pio_out(vcpu, size, port, vcpu->arch.sev_pio_data, count);
+
+ /* memcpy done already by emulator_pio_out. */
+ vcpu->arch.sev_pio_count -= count;
+ vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size;
+ if (!ret)
+ break;
- vcpu->arch.pio.count = 0;
+ /* Emulation done by the kernel. */
+ if (!vcpu->arch.sev_pio_count)
+ return 1;
+ }
+ vcpu->arch.complete_userspace_io = complete_sev_es_emulated_outs;
return 0;
}
static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
- unsigned int port, void *data, unsigned int count)
+ unsigned int port);
+
+static void advance_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
{
- int ret;
+ unsigned count = vcpu->arch.pio.count;
+ complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data);
+ vcpu->arch.sev_pio_count -= count;
+ vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size;
+}
- ret = emulator_pio_in_emulated(vcpu->arch.emulate_ctxt, size, port,
- data, count);
- if (ret) {
- vcpu->arch.pio.count = 0;
- } else {
- vcpu->arch.guest_ins_data = data;
- vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins;
+static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
+{
+ int size = vcpu->arch.pio.size;
+ int port = vcpu->arch.pio.port;
+
+ advance_sev_es_emulated_ins(vcpu);
+ if (vcpu->arch.sev_pio_count)
+ return kvm_sev_es_ins(vcpu, size, port);
+ return 1;
+}
+
+static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
+ unsigned int port)
+{
+ for (;;) {
+ unsigned int count =
+ min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
+ if (!__emulator_pio_in(vcpu, size, port, count))
+ break;
+
+ /* Emulation done by the kernel. */
+ advance_sev_es_emulated_ins(vcpu);
+ if (!vcpu->arch.sev_pio_count)
+ return 1;
}
+ vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins;
return 0;
}
@@ -12412,8 +12744,10 @@ int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
unsigned int port, void *data, unsigned int count,
int in)
{
- return in ? kvm_sev_es_ins(vcpu, size, port, data, count)
- : kvm_sev_es_outs(vcpu, size, port, data, count);
+ vcpu->arch.sev_pio_data = data;
+ vcpu->arch.sev_pio_count = count;
+ return in ? kvm_sev_es_ins(vcpu, size, port)
+ : kvm_sev_es_outs(vcpu, size, port);
}
EXPORT_SYMBOL_GPL(kvm_sev_es_string_io);
@@ -12440,6 +12774,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_accept_irq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 7d66d63dc55a..bec8ed090abc 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -103,6 +103,7 @@ static inline unsigned int __shrink_ple_window(unsigned int val,
#define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL
+void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu);
int kvm_check_nested_events(struct kvm_vcpu *vcpu);
static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
@@ -153,12 +154,24 @@ static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu)
{
int cs_db, cs_l;
+ WARN_ON_ONCE(vcpu->arch.guest_state_protected);
+
if (!is_long_mode(vcpu))
return false;
static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
return cs_l;
}
+static inline bool is_64_bit_hypercall(struct kvm_vcpu *vcpu)
+{
+ /*
+ * If running with protected guest state, the CS register is not
+ * accessible. The hypercall register values will have had to been
+ * provided in 64-bit mode, so assume the guest is in 64-bit.
+ */
+ return vcpu->arch.guest_state_protected || is_64_bit_mode(vcpu);
+}
+
static inline bool x86_exception_has_error_code(unsigned int vector)
{
static u32 exception_has_error_code = BIT(DF_VECTOR) | BIT(TS_VECTOR) |
@@ -173,12 +186,6 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
}
-static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
-{
- ++vcpu->stat.tlb_flush;
- static_call(kvm_x86_tlb_flush_current)(vcpu);
-}
-
static inline int is_pae(struct kvm_vcpu *vcpu)
{
return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
@@ -294,7 +301,6 @@ static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu)
return is_smm(vcpu) || static_call(kvm_x86_apic_init_signal_blocked)(vcpu);
}
-void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs);
void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
u64 get_kvmclock_ns(struct kvm *kvm);
@@ -343,8 +349,6 @@ extern bool enable_vmware_backdoor;
extern int pi_inject_timer;
-extern struct static_key kvm_no_apic_vcpu;
-
extern bool report_ignored_msrs;
static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
@@ -387,18 +391,27 @@ static inline bool kvm_cstate_in_guest(struct kvm *kvm)
return kvm->arch.cstate_in_guest;
}
-DECLARE_PER_CPU(struct kvm_vcpu *, current_vcpu);
+enum kvm_intr_type {
+ /* Values are arbitrary, but must be non-zero. */
+ KVM_HANDLING_IRQ = 1,
+ KVM_HANDLING_NMI,
+};
-static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu)
+static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu,
+ enum kvm_intr_type intr)
{
- __this_cpu_write(current_vcpu, vcpu);
+ WRITE_ONCE(vcpu->arch.handling_intr_from_guest, (u8)intr);
}
static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
{
- __this_cpu_write(current_vcpu, NULL);
+ WRITE_ONCE(vcpu->arch.handling_intr_from_guest, 0);
}
+static inline bool kvm_handling_nmi_from_guest(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.handling_intr_from_guest == KVM_HANDLING_NMI;
+}
static inline bool kvm_pat_valid(u64 data)
{
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 9ea9c3dabe37..0e3f7d6e9fd7 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -16,6 +16,7 @@
#include <trace/events/kvm.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
+#include <xen/interface/event_channel.h>
#include "trace.h"
@@ -23,38 +24,77 @@ DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ);
static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
{
+ struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
+ struct pvclock_wall_clock *wc;
gpa_t gpa = gfn_to_gpa(gfn);
- int wc_ofs, sec_hi_ofs;
+ u32 *wc_sec_hi;
+ u32 wc_version;
+ u64 wall_nsec;
int ret = 0;
int idx = srcu_read_lock(&kvm->srcu);
- if (kvm_is_error_hva(gfn_to_hva(kvm, gfn))) {
- ret = -EFAULT;
+ if (gfn == GPA_INVALID) {
+ kvm_gfn_to_pfn_cache_destroy(kvm, gpc);
goto out;
}
- kvm->arch.xen.shinfo_gfn = gfn;
+
+ do {
+ ret = kvm_gfn_to_pfn_cache_init(kvm, gpc, NULL, false, true,
+ gpa, PAGE_SIZE, false);
+ if (ret)
+ goto out;
+
+ /*
+ * This code mirrors kvm_write_wall_clock() except that it writes
+ * directly through the pfn cache and doesn't mark the page dirty.
+ */
+ wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
+
+ /* It could be invalid again already, so we need to check */
+ read_lock_irq(&gpc->lock);
+
+ if (gpc->valid)
+ break;
+
+ read_unlock_irq(&gpc->lock);
+ } while (1);
/* Paranoia checks on the 32-bit struct layout */
BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900);
BUILD_BUG_ON(offsetof(struct compat_shared_info, arch.wc_sec_hi) != 0x924);
BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
- /* 32-bit location by default */
- wc_ofs = offsetof(struct compat_shared_info, wc);
- sec_hi_ofs = offsetof(struct compat_shared_info, arch.wc_sec_hi);
-
#ifdef CONFIG_X86_64
/* Paranoia checks on the 64-bit struct layout */
BUILD_BUG_ON(offsetof(struct shared_info, wc) != 0xc00);
BUILD_BUG_ON(offsetof(struct shared_info, wc_sec_hi) != 0xc0c);
- if (kvm->arch.xen.long_mode) {
- wc_ofs = offsetof(struct shared_info, wc);
- sec_hi_ofs = offsetof(struct shared_info, wc_sec_hi);
- }
+ if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
+ struct shared_info *shinfo = gpc->khva;
+
+ wc_sec_hi = &shinfo->wc_sec_hi;
+ wc = &shinfo->wc;
+ } else
#endif
+ {
+ struct compat_shared_info *shinfo = gpc->khva;
+
+ wc_sec_hi = &shinfo->arch.wc_sec_hi;
+ wc = &shinfo->wc;
+ }
+
+ /* Increment and ensure an odd value */
+ wc_version = wc->version = (wc->version + 1) | 1;
+ smp_wmb();
+
+ wc->nsec = do_div(wall_nsec, 1000000000);
+ wc->sec = (u32)wall_nsec;
+ *wc_sec_hi = wall_nsec >> 32;
+ smp_wmb();
+
+ wc->version = wc_version + 1;
+ read_unlock_irq(&gpc->lock);
- kvm_write_wall_clock(kvm, gpa + wc_ofs, sec_hi_ofs - wc_ofs);
kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE);
out:
@@ -127,9 +167,9 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
state_entry_time = vx->runstate_entry_time;
state_entry_time |= XEN_RUNSTATE_UPDATE;
- BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->state_entry_time) !=
+ BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) !=
sizeof(state_entry_time));
- BUILD_BUG_ON(sizeof(((struct compat_vcpu_runstate_info *)0)->state_entry_time) !=
+ BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
sizeof(state_entry_time));
if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
@@ -144,9 +184,9 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
*/
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) !=
offsetof(struct compat_vcpu_runstate_info, state));
- BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->state) !=
+ BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state) !=
sizeof(vx->current_runstate));
- BUILD_BUG_ON(sizeof(((struct compat_vcpu_runstate_info *)0)->state) !=
+ BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
sizeof(vx->current_runstate));
if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
@@ -163,9 +203,9 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
offsetof(struct vcpu_runstate_info, time) - sizeof(u64));
BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) !=
offsetof(struct compat_vcpu_runstate_info, time) - sizeof(u64));
- BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->time) !=
- sizeof(((struct compat_vcpu_runstate_info *)0)->time));
- BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->time) !=
+ BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
+ sizeof_field(struct compat_vcpu_runstate_info, time));
+ BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
sizeof(vx->runstate_times));
if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
@@ -190,6 +230,9 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
{
+ unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel);
+ bool atomic = in_atomic() || !task_is_running(current);
+ int err;
u8 rc = 0;
/*
@@ -198,29 +241,118 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
*/
struct gfn_to_hva_cache *ghc = &v->arch.xen.vcpu_info_cache;
struct kvm_memslots *slots = kvm_memslots(v->kvm);
+ bool ghc_valid = slots->generation == ghc->generation &&
+ !kvm_is_error_hva(ghc->hva) && ghc->memslot;
+
unsigned int offset = offsetof(struct vcpu_info, evtchn_upcall_pending);
/* No need for compat handling here */
BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) !=
offsetof(struct compat_vcpu_info, evtchn_upcall_pending));
BUILD_BUG_ON(sizeof(rc) !=
- sizeof(((struct vcpu_info *)0)->evtchn_upcall_pending));
+ sizeof_field(struct vcpu_info, evtchn_upcall_pending));
BUILD_BUG_ON(sizeof(rc) !=
- sizeof(((struct compat_vcpu_info *)0)->evtchn_upcall_pending));
+ sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending));
/*
* For efficiency, this mirrors the checks for using the valid
* cache in kvm_read_guest_offset_cached(), but just uses
* __get_user() instead. And falls back to the slow path.
*/
- if (likely(slots->generation == ghc->generation &&
- !kvm_is_error_hva(ghc->hva) && ghc->memslot)) {
+ if (!evtchn_pending_sel && ghc_valid) {
/* Fast path */
- __get_user(rc, (u8 __user *)ghc->hva + offset);
+ pagefault_disable();
+ err = __get_user(rc, (u8 __user *)ghc->hva + offset);
+ pagefault_enable();
+ if (!err)
+ return rc;
+ }
+
+ /* Slow path */
+
+ /*
+ * This function gets called from kvm_vcpu_block() after setting the
+ * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately
+ * from a HLT. So we really mustn't sleep. If the page ended up absent
+ * at that point, just return 1 in order to trigger an immediate wake,
+ * and we'll end up getting called again from a context where we *can*
+ * fault in the page and wait for it.
+ */
+ if (atomic)
+ return 1;
+
+ if (!ghc_valid) {
+ err = kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len);
+ if (err || !ghc->memslot) {
+ /*
+ * If this failed, userspace has screwed up the
+ * vcpu_info mapping. No interrupts for you.
+ */
+ return 0;
+ }
+ }
+
+ /*
+ * Now we have a valid (protected by srcu) userspace HVA in
+ * ghc->hva which points to the struct vcpu_info. If there
+ * are any bits in the in-kernel evtchn_pending_sel then
+ * we need to write those to the guest vcpu_info and set
+ * its evtchn_upcall_pending flag. If there aren't any bits
+ * to add, we only want to *check* evtchn_upcall_pending.
+ */
+ if (evtchn_pending_sel) {
+ bool long_mode = v->kvm->arch.xen.long_mode;
+
+ if (!user_access_begin((void __user *)ghc->hva, sizeof(struct vcpu_info)))
+ return 0;
+
+ if (IS_ENABLED(CONFIG_64BIT) && long_mode) {
+ struct vcpu_info __user *vi = (void __user *)ghc->hva;
+
+ /* Attempt to set the evtchn_pending_sel bits in the
+ * guest, and if that succeeds then clear the same
+ * bits in the in-kernel version. */
+ asm volatile("1:\t" LOCK_PREFIX "orq %0, %1\n"
+ "\tnotq %0\n"
+ "\t" LOCK_PREFIX "andq %0, %2\n"
+ "2:\n"
+ "\t.section .fixup,\"ax\"\n"
+ "3:\tjmp\t2b\n"
+ "\t.previous\n"
+ _ASM_EXTABLE_UA(1b, 3b)
+ : "=r" (evtchn_pending_sel),
+ "+m" (vi->evtchn_pending_sel),
+ "+m" (v->arch.xen.evtchn_pending_sel)
+ : "0" (evtchn_pending_sel));
+ } else {
+ struct compat_vcpu_info __user *vi = (void __user *)ghc->hva;
+ u32 evtchn_pending_sel32 = evtchn_pending_sel;
+
+ /* Attempt to set the evtchn_pending_sel bits in the
+ * guest, and if that succeeds then clear the same
+ * bits in the in-kernel version. */
+ asm volatile("1:\t" LOCK_PREFIX "orl %0, %1\n"
+ "\tnotl %0\n"
+ "\t" LOCK_PREFIX "andl %0, %2\n"
+ "2:\n"
+ "\t.section .fixup,\"ax\"\n"
+ "3:\tjmp\t2b\n"
+ "\t.previous\n"
+ _ASM_EXTABLE_UA(1b, 3b)
+ : "=r" (evtchn_pending_sel32),
+ "+m" (vi->evtchn_pending_sel),
+ "+m" (v->arch.xen.evtchn_pending_sel)
+ : "0" (evtchn_pending_sel32));
+ }
+ rc = 1;
+ unsafe_put_user(rc, (u8 __user *)ghc->hva + offset, err);
+
+ err:
+ user_access_end();
+
+ mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
} else {
- /* Slow path */
- kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset,
- sizeof(rc));
+ __get_user(rc, (u8 __user *)ghc->hva + offset);
}
return rc;
@@ -243,15 +375,9 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
break;
case KVM_XEN_ATTR_TYPE_SHARED_INFO:
- if (data->u.shared_info.gfn == GPA_INVALID) {
- kvm->arch.xen.shinfo_gfn = GPA_INVALID;
- r = 0;
- break;
- }
r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn);
break;
-
case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
if (data->u.vector && data->u.vector < 0x10)
r = -EINVAL;
@@ -282,7 +408,10 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
break;
case KVM_XEN_ATTR_TYPE_SHARED_INFO:
- data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_gfn);
+ if (kvm->arch.xen.shinfo_cache.active)
+ data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa);
+ else
+ data->u.shared_info.gfn = GPA_INVALID;
r = 0;
break;
@@ -644,11 +773,12 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
void kvm_xen_init_vm(struct kvm *kvm)
{
- kvm->arch.xen.shinfo_gfn = GPA_INVALID;
}
void kvm_xen_destroy_vm(struct kvm *kvm)
{
+ kvm_gfn_to_pfn_cache_destroy(kvm, &kvm->arch.xen.shinfo_cache);
+
if (kvm->arch.xen_hvm_config.msr)
static_branch_slow_dec_deferred(&kvm_xen_enabled);
}
@@ -681,7 +811,7 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
kvm_hv_hypercall_enabled(vcpu))
return kvm_hv_hypercall(vcpu);
- longmode = is_64_bit_mode(vcpu);
+ longmode = is_64_bit_hypercall(vcpu);
if (!longmode) {
params[0] = (u32)kvm_rbx_read(vcpu);
params[1] = (u32)kvm_rcx_read(vcpu);
@@ -720,3 +850,179 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
return 0;
}
+
+static inline int max_evtchn_port(struct kvm *kvm)
+{
+ if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
+ return EVTCHN_2L_NR_CHANNELS;
+ else
+ return COMPAT_EVTCHN_2L_NR_CHANNELS;
+}
+
+/*
+ * This follows the kvm_set_irq() API, so it returns:
+ * < 0 Interrupt was ignored (masked or not delivered for other reasons)
+ * = 0 Interrupt was coalesced (previous irq is still pending)
+ * > 0 Number of CPUs interrupt was delivered to
+ */
+int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm)
+{
+ struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
+ struct kvm_vcpu *vcpu;
+ unsigned long *pending_bits, *mask_bits;
+ unsigned long flags;
+ int port_word_bit;
+ bool kick_vcpu = false;
+ int idx;
+ int rc;
+
+ vcpu = kvm_get_vcpu_by_id(kvm, e->xen_evtchn.vcpu);
+ if (!vcpu)
+ return -1;
+
+ if (!vcpu->arch.xen.vcpu_info_set)
+ return -1;
+
+ if (e->xen_evtchn.port >= max_evtchn_port(kvm))
+ return -1;
+
+ rc = -EWOULDBLOCK;
+ read_lock_irqsave(&gpc->lock, flags);
+
+ idx = srcu_read_lock(&kvm->srcu);
+ if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE))
+ goto out_rcu;
+
+ if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
+ struct shared_info *shinfo = gpc->khva;
+ pending_bits = (unsigned long *)&shinfo->evtchn_pending;
+ mask_bits = (unsigned long *)&shinfo->evtchn_mask;
+ port_word_bit = e->xen_evtchn.port / 64;
+ } else {
+ struct compat_shared_info *shinfo = gpc->khva;
+ pending_bits = (unsigned long *)&shinfo->evtchn_pending;
+ mask_bits = (unsigned long *)&shinfo->evtchn_mask;
+ port_word_bit = e->xen_evtchn.port / 32;
+ }
+
+ /*
+ * If this port wasn't already set, and if it isn't masked, then
+ * we try to set the corresponding bit in the in-kernel shadow of
+ * evtchn_pending_sel for the target vCPU. And if *that* wasn't
+ * already set, then we kick the vCPU in question to write to the
+ * *real* evtchn_pending_sel in its own guest vcpu_info struct.
+ */
+ if (test_and_set_bit(e->xen_evtchn.port, pending_bits)) {
+ rc = 0; /* It was already raised */
+ } else if (test_bit(e->xen_evtchn.port, mask_bits)) {
+ rc = -1; /* Masked */
+ } else {
+ rc = 1; /* Delivered. But was the vCPU waking already? */
+ if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel))
+ kick_vcpu = true;
+ }
+
+ out_rcu:
+ srcu_read_unlock(&kvm->srcu, idx);
+ read_unlock_irqrestore(&gpc->lock, flags);
+
+ if (kick_vcpu) {
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
+ }
+
+ return rc;
+}
+
+/* This is the version called from kvm_set_irq() as the .set function */
+static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status)
+{
+ bool mm_borrowed = false;
+ int rc;
+
+ if (!level)
+ return -1;
+
+ rc = kvm_xen_set_evtchn_fast(e, kvm);
+ if (rc != -EWOULDBLOCK)
+ return rc;
+
+ if (current->mm != kvm->mm) {
+ /*
+ * If not on a thread which already belongs to this KVM,
+ * we'd better be in the irqfd workqueue.
+ */
+ if (WARN_ON_ONCE(current->mm))
+ return -EINVAL;
+
+ kthread_use_mm(kvm->mm);
+ mm_borrowed = true;
+ }
+
+ /*
+ * For the irqfd workqueue, using the main kvm->lock mutex is
+ * fine since this function is invoked from kvm_set_irq() with
+ * no other lock held, no srcu. In future if it will be called
+ * directly from a vCPU thread (e.g. on hypercall for an IPI)
+ * then it may need to switch to using a leaf-node mutex for
+ * serializing the shared_info mapping.
+ */
+ mutex_lock(&kvm->lock);
+
+ /*
+ * It is theoretically possible for the page to be unmapped
+ * and the MMU notifier to invalidate the shared_info before
+ * we even get to use it. In that case, this looks like an
+ * infinite loop. It was tempting to do it via the userspace
+ * HVA instead... but that just *hides* the fact that it's
+ * an infinite loop, because if a fault occurs and it waits
+ * for the page to come back, it can *still* immediately
+ * fault and have to wait again, repeatedly.
+ *
+ * Conversely, the page could also have been reinstated by
+ * another thread before we even obtain the mutex above, so
+ * check again *first* before remapping it.
+ */
+ do {
+ struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
+ int idx;
+
+ rc = kvm_xen_set_evtchn_fast(e, kvm);
+ if (rc != -EWOULDBLOCK)
+ break;
+
+ idx = srcu_read_lock(&kvm->srcu);
+ rc = kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpc->gpa,
+ PAGE_SIZE, false);
+ srcu_read_unlock(&kvm->srcu, idx);
+ } while(!rc);
+
+ mutex_unlock(&kvm->lock);
+
+ if (mm_borrowed)
+ kthread_unuse_mm(kvm->mm);
+
+ return rc;
+}
+
+int kvm_xen_setup_evtchn(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
+ const struct kvm_irq_routing_entry *ue)
+
+{
+ if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm))
+ return -EINVAL;
+
+ /* We only support 2 level event channels for now */
+ if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
+ return -EINVAL;
+
+ e->xen_evtchn.port = ue->u.xen_evtchn.port;
+ e->xen_evtchn.vcpu = ue->u.xen_evtchn.vcpu;
+ e->xen_evtchn.priority = ue->u.xen_evtchn.priority;
+ e->set = evtchn_set_fn;
+
+ return 0;
+}
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
index cc0cf5f37450..adbcc9ed59db 100644
--- a/arch/x86/kvm/xen.h
+++ b/arch/x86/kvm/xen.h
@@ -24,6 +24,12 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc);
void kvm_xen_init_vm(struct kvm *kvm);
void kvm_xen_destroy_vm(struct kvm *kvm);
+int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm);
+int kvm_xen_setup_evtchn(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
+ const struct kvm_irq_routing_entry *ue);
+
static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
{
return static_branch_unlikely(&kvm_xen_enabled.key) &&
@@ -134,6 +140,9 @@ struct compat_shared_info {
struct compat_arch_shared_info arch;
};
+#define COMPAT_EVTCHN_2L_NR_CHANNELS (8 * \
+ sizeof_field(struct compat_shared_info, \
+ evtchn_pending))
struct compat_vcpu_runstate_info {
int state;
uint64_t state_entry_time;
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index c6506c6a7092..f76747862bd2 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -63,7 +63,6 @@ ifeq ($(CONFIG_X86_32),y)
ifneq ($(CONFIG_X86_CMPXCHG64),y)
lib-y += cmpxchg8b_emu.o atomic64_386_32.o
endif
- lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
else
obj-y += iomap_copy_64.o
lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S
index 16bc9130e7a5..e768815e58ae 100644
--- a/arch/x86/lib/atomic64_386_32.S
+++ b/arch/x86/lib/atomic64_386_32.S
@@ -9,81 +9,83 @@
#include <asm/alternative.h>
/* if you want SMP support, implement these with real spinlocks */
-.macro LOCK reg
+.macro IRQ_SAVE reg
pushfl
cli
.endm
-.macro UNLOCK reg
+.macro IRQ_RESTORE reg
popfl
.endm
-#define BEGIN(op) \
+#define BEGIN_IRQ_SAVE(op) \
.macro endp; \
SYM_FUNC_END(atomic64_##op##_386); \
.purgem endp; \
.endm; \
SYM_FUNC_START(atomic64_##op##_386); \
- LOCK v;
+ IRQ_SAVE v;
#define ENDP endp
-#define RET \
- UNLOCK v; \
- ret
-
-#define RET_ENDP \
- RET; \
- ENDP
+#define RET_IRQ_RESTORE \
+ IRQ_RESTORE v; \
+ RET
#define v %ecx
-BEGIN(read)
+BEGIN_IRQ_SAVE(read)
movl (v), %eax
movl 4(v), %edx
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
#define v %esi
-BEGIN(set)
+BEGIN_IRQ_SAVE(set)
movl %ebx, (v)
movl %ecx, 4(v)
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
#define v %esi
-BEGIN(xchg)
+BEGIN_IRQ_SAVE(xchg)
movl (v), %eax
movl 4(v), %edx
movl %ebx, (v)
movl %ecx, 4(v)
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
#define v %ecx
-BEGIN(add)
+BEGIN_IRQ_SAVE(add)
addl %eax, (v)
adcl %edx, 4(v)
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
#define v %ecx
-BEGIN(add_return)
+BEGIN_IRQ_SAVE(add_return)
addl (v), %eax
adcl 4(v), %edx
movl %eax, (v)
movl %edx, 4(v)
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
#define v %ecx
-BEGIN(sub)
+BEGIN_IRQ_SAVE(sub)
subl %eax, (v)
sbbl %edx, 4(v)
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
#define v %ecx
-BEGIN(sub_return)
+BEGIN_IRQ_SAVE(sub_return)
negl %edx
negl %eax
sbbl $0, %edx
@@ -91,47 +93,52 @@ BEGIN(sub_return)
adcl 4(v), %edx
movl %eax, (v)
movl %edx, 4(v)
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
#define v %esi
-BEGIN(inc)
+BEGIN_IRQ_SAVE(inc)
addl $1, (v)
adcl $0, 4(v)
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
#define v %esi
-BEGIN(inc_return)
+BEGIN_IRQ_SAVE(inc_return)
movl (v), %eax
movl 4(v), %edx
addl $1, %eax
adcl $0, %edx
movl %eax, (v)
movl %edx, 4(v)
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
#define v %esi
-BEGIN(dec)
+BEGIN_IRQ_SAVE(dec)
subl $1, (v)
sbbl $0, 4(v)
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
#define v %esi
-BEGIN(dec_return)
+BEGIN_IRQ_SAVE(dec_return)
movl (v), %eax
movl 4(v), %edx
subl $1, %eax
sbbl $0, %edx
movl %eax, (v)
movl %edx, 4(v)
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
#define v %esi
-BEGIN(add_unless)
+BEGIN_IRQ_SAVE(add_unless)
addl %eax, %ecx
adcl %edx, %edi
addl (v), %eax
@@ -143,7 +150,7 @@ BEGIN(add_unless)
movl %edx, 4(v)
movl $1, %eax
2:
- RET
+ RET_IRQ_RESTORE
3:
cmpl %edx, %edi
jne 1b
@@ -153,7 +160,7 @@ ENDP
#undef v
#define v %esi
-BEGIN(inc_not_zero)
+BEGIN_IRQ_SAVE(inc_not_zero)
movl (v), %eax
movl 4(v), %edx
testl %eax, %eax
@@ -165,7 +172,7 @@ BEGIN(inc_not_zero)
movl %edx, 4(v)
movl $1, %eax
2:
- RET
+ RET_IRQ_RESTORE
3:
testl %edx, %edx
jne 1b
@@ -174,7 +181,7 @@ ENDP
#undef v
#define v %esi
-BEGIN(dec_if_positive)
+BEGIN_IRQ_SAVE(dec_if_positive)
movl (v), %eax
movl 4(v), %edx
subl $1, %eax
@@ -183,5 +190,6 @@ BEGIN(dec_if_positive)
movl %eax, (v)
movl %edx, 4(v)
1:
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
index ce6935690766..90afb488b396 100644
--- a/arch/x86/lib/atomic64_cx8_32.S
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -18,7 +18,7 @@
SYM_FUNC_START(atomic64_read_cx8)
read64 %ecx
- ret
+ RET
SYM_FUNC_END(atomic64_read_cx8)
SYM_FUNC_START(atomic64_set_cx8)
@@ -28,7 +28,7 @@ SYM_FUNC_START(atomic64_set_cx8)
cmpxchg8b (%esi)
jne 1b
- ret
+ RET
SYM_FUNC_END(atomic64_set_cx8)
SYM_FUNC_START(atomic64_xchg_cx8)
@@ -37,7 +37,7 @@ SYM_FUNC_START(atomic64_xchg_cx8)
cmpxchg8b (%esi)
jne 1b
- ret
+ RET
SYM_FUNC_END(atomic64_xchg_cx8)
.macro addsub_return func ins insc
@@ -68,7 +68,7 @@ SYM_FUNC_START(atomic64_\func\()_return_cx8)
popl %esi
popl %ebx
popl %ebp
- ret
+ RET
SYM_FUNC_END(atomic64_\func\()_return_cx8)
.endm
@@ -93,7 +93,7 @@ SYM_FUNC_START(atomic64_\func\()_return_cx8)
movl %ebx, %eax
movl %ecx, %edx
popl %ebx
- ret
+ RET
SYM_FUNC_END(atomic64_\func\()_return_cx8)
.endm
@@ -118,7 +118,7 @@ SYM_FUNC_START(atomic64_dec_if_positive_cx8)
movl %ebx, %eax
movl %ecx, %edx
popl %ebx
- ret
+ RET
SYM_FUNC_END(atomic64_dec_if_positive_cx8)
SYM_FUNC_START(atomic64_add_unless_cx8)
@@ -149,7 +149,7 @@ SYM_FUNC_START(atomic64_add_unless_cx8)
addl $8, %esp
popl %ebx
popl %ebp
- ret
+ RET
4:
cmpl %edx, 4(%esp)
jne 2b
@@ -176,5 +176,5 @@ SYM_FUNC_START(atomic64_inc_not_zero_cx8)
movl $1, %eax
3:
popl %ebx
- ret
+ RET
SYM_FUNC_END(atomic64_inc_not_zero_cx8)
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index 4304320e51f4..23318c338db0 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -127,7 +127,7 @@ SYM_FUNC_START(csum_partial)
8:
popl %ebx
popl %esi
- ret
+ RET
SYM_FUNC_END(csum_partial)
#else
@@ -245,7 +245,7 @@ SYM_FUNC_START(csum_partial)
90:
popl %ebx
popl %esi
- ret
+ RET
SYM_FUNC_END(csum_partial)
#endif
@@ -260,9 +260,9 @@ unsigned int csum_partial_copy_generic (const char *src, char *dst,
* Copy from ds while checksumming, otherwise like csum_partial
*/
-#define EXC(y...) \
- 9999: y; \
- _ASM_EXTABLE_UA(9999b, 6001f)
+#define EXC(y...) \
+ 9999: y; \
+ _ASM_EXTABLE_TYPE(9999b, 7f, EX_TYPE_UACCESS | EX_FLAG_CLEAR_AX)
#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
@@ -358,20 +358,11 @@ EXC( movb %cl, (%edi) )
adcl $0, %eax
7:
-# Exception handler:
-.section .fixup, "ax"
-
-6001:
- xorl %eax, %eax
- jmp 7b
-
-.previous
-
popl %ebx
popl %esi
popl %edi
popl %ecx # equivalent to addl $4,%esp
- ret
+ RET
SYM_FUNC_END(csum_partial_copy_generic)
#else
@@ -439,15 +430,11 @@ EXC( movb %dl, (%edi) )
6: addl %edx, %eax
adcl $0, %eax
7:
-.section .fixup, "ax"
-6001: xorl %eax, %eax
- jmp 7b
-.previous
popl %esi
popl %edi
popl %ebx
- ret
+ RET
SYM_FUNC_END(csum_partial_copy_generic)
#undef ROUND
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index c4c7dd115953..fe59b8ac4fcc 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -17,7 +17,7 @@ SYM_FUNC_START(clear_page_rep)
movl $4096/8,%ecx
xorl %eax,%eax
rep stosq
- ret
+ RET
SYM_FUNC_END(clear_page_rep)
EXPORT_SYMBOL_GPL(clear_page_rep)
@@ -39,7 +39,7 @@ SYM_FUNC_START(clear_page_orig)
leaq 64(%rdi),%rdi
jnz .Lloop
nop
- ret
+ RET
SYM_FUNC_END(clear_page_orig)
EXPORT_SYMBOL_GPL(clear_page_orig)
@@ -47,6 +47,6 @@ SYM_FUNC_START(clear_page_erms)
movl $4096,%ecx
xorl %eax,%eax
rep stosb
- ret
+ RET
SYM_FUNC_END(clear_page_erms)
EXPORT_SYMBOL_GPL(clear_page_erms)
diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S
index 3542502faa3b..33c70c0160ea 100644
--- a/arch/x86/lib/cmpxchg16b_emu.S
+++ b/arch/x86/lib/cmpxchg16b_emu.S
@@ -37,11 +37,11 @@ SYM_FUNC_START(this_cpu_cmpxchg16b_emu)
popfq
mov $1, %al
- ret
+ RET
.Lnot_same:
popfq
xor %al,%al
- ret
+ RET
SYM_FUNC_END(this_cpu_cmpxchg16b_emu)
diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S
index ca01ed6029f4..6a912d58fecc 100644
--- a/arch/x86/lib/cmpxchg8b_emu.S
+++ b/arch/x86/lib/cmpxchg8b_emu.S
@@ -32,7 +32,7 @@ SYM_FUNC_START(cmpxchg8b_emu)
movl %ecx, 4(%esi)
popfl
- ret
+ RET
.Lnot_same:
movl (%esi), %eax
@@ -40,7 +40,7 @@ SYM_FUNC_START(cmpxchg8b_emu)
movl 4(%esi), %edx
popfl
- ret
+ RET
SYM_FUNC_END(cmpxchg8b_emu)
EXPORT_SYMBOL(cmpxchg8b_emu)
diff --git a/arch/x86/lib/copy_mc_64.S b/arch/x86/lib/copy_mc_64.S
index e5f77e293034..c859a8a09860 100644
--- a/arch/x86/lib/copy_mc_64.S
+++ b/arch/x86/lib/copy_mc_64.S
@@ -77,10 +77,8 @@ SYM_FUNC_START(copy_mc_fragile)
.L_done_memcpy_trap:
xorl %eax, %eax
.L_done:
- ret
-SYM_FUNC_END(copy_mc_fragile)
+ RET
- .section .fixup, "ax"
/*
* Return number of bytes not copied for any failure. Note that
* there is no "tail" handling since the source buffer is 8-byte
@@ -105,14 +103,14 @@ SYM_FUNC_END(copy_mc_fragile)
movl %ecx, %edx
jmp copy_mc_fragile_handle_tail
- .previous
-
- _ASM_EXTABLE_FAULT(.L_read_leading_bytes, .E_leading_bytes)
- _ASM_EXTABLE_FAULT(.L_read_words, .E_read_words)
- _ASM_EXTABLE_FAULT(.L_read_trailing_bytes, .E_trailing_bytes)
+ _ASM_EXTABLE_TYPE(.L_read_leading_bytes, .E_leading_bytes, EX_TYPE_DEFAULT_MCE_SAFE)
+ _ASM_EXTABLE_TYPE(.L_read_words, .E_read_words, EX_TYPE_DEFAULT_MCE_SAFE)
+ _ASM_EXTABLE_TYPE(.L_read_trailing_bytes, .E_trailing_bytes, EX_TYPE_DEFAULT_MCE_SAFE)
_ASM_EXTABLE(.L_write_leading_bytes, .E_leading_bytes)
_ASM_EXTABLE(.L_write_words, .E_write_words)
_ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes)
+
+SYM_FUNC_END(copy_mc_fragile)
#endif /* CONFIG_X86_MCE */
/*
@@ -132,10 +130,8 @@ SYM_FUNC_START(copy_mc_enhanced_fast_string)
rep movsb
/* Copy successful. Return zero */
xorl %eax, %eax
- ret
-SYM_FUNC_END(copy_mc_enhanced_fast_string)
+ RET
- .section .fixup, "ax"
.E_copy:
/*
* On fault %rcx is updated such that the copy instruction could
@@ -145,9 +141,9 @@ SYM_FUNC_END(copy_mc_enhanced_fast_string)
* user-copy routines.
*/
movq %rcx, %rax
- ret
+ RET
- .previous
+ _ASM_EXTABLE_TYPE(.L_copy, .E_copy, EX_TYPE_DEFAULT_MCE_SAFE)
- _ASM_EXTABLE_FAULT(.L_copy, .E_copy)
+SYM_FUNC_END(copy_mc_enhanced_fast_string)
#endif /* !CONFIG_UML */
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index db4b4f9197c7..30ea644bf446 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -17,7 +17,7 @@ SYM_FUNC_START(copy_page)
ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD
movl $4096/8, %ecx
rep movsq
- ret
+ RET
SYM_FUNC_END(copy_page)
EXPORT_SYMBOL(copy_page)
@@ -85,5 +85,5 @@ SYM_FUNC_START_LOCAL(copy_page_regs)
movq (%rsp), %rbx
movq 1*8(%rsp), %r12
addq $2*8, %rsp
- ret
+ RET
SYM_FUNC_END(copy_page_regs)
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 57b79c577496..8ca5ecf16dc4 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -32,14 +32,10 @@
decl %ecx
jnz 100b
102:
- .section .fixup,"ax"
-103: addl %ecx,%edx /* ecx is zerorest also */
- jmp .Lcopy_user_handle_tail
- .previous
- _ASM_EXTABLE_CPY(100b, 103b)
- _ASM_EXTABLE_CPY(101b, 103b)
- .endm
+ _ASM_EXTABLE_CPY(100b, .Lcopy_user_handle_align)
+ _ASM_EXTABLE_CPY(101b, .Lcopy_user_handle_align)
+.endm
/*
* copy_user_generic_unrolled - memory copy with exception handling.
@@ -105,9 +101,8 @@ SYM_FUNC_START(copy_user_generic_unrolled)
jnz 21b
23: xor %eax,%eax
ASM_CLAC
- ret
+ RET
- .section .fixup,"ax"
30: shll $6,%ecx
addl %ecx,%edx
jmp 60f
@@ -115,7 +110,6 @@ SYM_FUNC_START(copy_user_generic_unrolled)
jmp 60f
50: movl %ecx,%edx
60: jmp .Lcopy_user_handle_tail /* ecx is zerorest also */
- .previous
_ASM_EXTABLE_CPY(1b, 30b)
_ASM_EXTABLE_CPY(2b, 30b)
@@ -166,20 +160,16 @@ SYM_FUNC_START(copy_user_generic_string)
movl %edx,%ecx
shrl $3,%ecx
andl $7,%edx
-1: rep
- movsq
+1: rep movsq
2: movl %edx,%ecx
-3: rep
- movsb
+3: rep movsb
xorl %eax,%eax
ASM_CLAC
- ret
+ RET
- .section .fixup,"ax"
11: leal (%rdx,%rcx,8),%ecx
12: movl %ecx,%edx /* ecx is zerorest also */
jmp .Lcopy_user_handle_tail
- .previous
_ASM_EXTABLE_CPY(1b, 11b)
_ASM_EXTABLE_CPY(3b, 12b)
@@ -200,19 +190,16 @@ EXPORT_SYMBOL(copy_user_generic_string)
*/
SYM_FUNC_START(copy_user_enhanced_fast_string)
ASM_STAC
- cmpl $64,%edx
- jb .L_copy_short_string /* less then 64 bytes, avoid the costly 'rep' */
+ /* CPUs without FSRM should avoid rep movsb for short copies */
+ ALTERNATIVE "cmpl $64, %edx; jb .L_copy_short_string", "", X86_FEATURE_FSRM
movl %edx,%ecx
-1: rep
- movsb
+1: rep movsb
xorl %eax,%eax
ASM_CLAC
- ret
+ RET
- .section .fixup,"ax"
12: movl %ecx,%edx /* ecx is zerorest also */
jmp .Lcopy_user_handle_tail
- .previous
_ASM_EXTABLE_CPY(1b, 12b)
SYM_FUNC_END(copy_user_enhanced_fast_string)
@@ -225,6 +212,7 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string)
* Don't try to copy the tail if machine check happened
*
* Input:
+ * eax trap number written by ex_handler_copy()
* rdi destination
* rsi source
* rdx count
@@ -233,26 +221,26 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string)
* eax uncopied bytes or 0 if successful.
*/
SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail)
- movl %edx,%ecx
- cmp $X86_TRAP_MC,%eax /* check if X86_TRAP_MC */
+ cmp $X86_TRAP_MC,%eax
je 3f
+
+ movl %edx,%ecx
1: rep movsb
2: mov %ecx,%eax
ASM_CLAC
- ret
-
- /*
- * Return zero to pretend that this copy succeeded. This
- * is counter-intuitive, but needed to prevent the code
- * in lib/iov_iter.c from retrying and running back into
- * the poison cache line again. The machine check handler
- * will ensure that a SIGBUS is sent to the task.
- */
-3: xorl %eax,%eax
+ RET
+
+3:
+ movl %edx,%eax
ASM_CLAC
- ret
+ RET
_ASM_EXTABLE_CPY(1b, 2b)
+
+.Lcopy_user_handle_align:
+ addl %ecx,%edx /* ecx is zerorest also */
+ jmp .Lcopy_user_handle_tail
+
SYM_CODE_END(.Lcopy_user_handle_tail)
/*
@@ -361,9 +349,8 @@ SYM_FUNC_START(__copy_user_nocache)
xorl %eax,%eax
ASM_CLAC
sfence
- ret
+ RET
- .section .fixup,"ax"
.L_fixup_4x8b_copy:
shll $6,%ecx
addl %ecx,%edx
@@ -379,7 +366,6 @@ SYM_FUNC_START(__copy_user_nocache)
.L_fixup_handle_tail:
sfence
jmp .Lcopy_user_handle_tail
- .previous
_ASM_EXTABLE_CPY(1b, .L_fixup_4x8b_copy)
_ASM_EXTABLE_CPY(2b, .L_fixup_4x8b_copy)
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index 1fbd8ee9642d..d9e16a2cf285 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -201,7 +201,7 @@ SYM_FUNC_START(csum_partial_copy_generic)
movq 3*8(%rsp), %r13
movq 4*8(%rsp), %r15
addq $5*8, %rsp
- ret
+ RET
.Lshort:
movl %ecx, %r10d
jmp .L1
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index e7925d668b68..1f8a8f895173 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -9,6 +9,7 @@
#include <linux/compiler.h>
#include <linux/export.h>
#include <asm/checksum.h>
+#include <asm/word-at-a-time.h>
static inline unsigned short from32to16(unsigned a)
{
@@ -21,120 +22,119 @@ static inline unsigned short from32to16(unsigned a)
}
/*
- * Do a 64-bit checksum on an arbitrary memory area.
+ * Do a checksum on an arbitrary memory area.
* Returns a 32bit checksum.
*
* This isn't as time critical as it used to be because many NICs
* do hardware checksumming these days.
- *
- * Things tried and found to not make it faster:
- * Manual Prefetching
- * Unrolling to an 128 bytes inner loop.
- * Using interleaving with more registers to break the carry chains.
+ *
+ * Still, with CHECKSUM_COMPLETE this is called to compute
+ * checksums on IPv6 headers (40 bytes) and other small parts.
+ * it's best to have buff aligned on a 64-bit boundary
*/
-static unsigned do_csum(const unsigned char *buff, unsigned len)
+__wsum csum_partial(const void *buff, int len, __wsum sum)
{
- unsigned odd, count;
- unsigned long result = 0;
+ u64 temp64 = (__force u64)sum;
+ unsigned odd, result;
- if (unlikely(len == 0))
- return result;
odd = 1 & (unsigned long) buff;
if (unlikely(odd)) {
- result = *buff << 8;
+ if (unlikely(len == 0))
+ return sum;
+ temp64 = ror32((__force u32)sum, 8);
+ temp64 += (*(unsigned char *)buff << 8);
len--;
buff++;
}
- count = len >> 1; /* nr of 16-bit words.. */
- if (count) {
- if (2 & (unsigned long) buff) {
- result += *(unsigned short *)buff;
- count--;
- len -= 2;
- buff += 2;
- }
- count >>= 1; /* nr of 32-bit words.. */
- if (count) {
- unsigned long zero;
- unsigned count64;
- if (4 & (unsigned long) buff) {
- result += *(unsigned int *) buff;
- count--;
- len -= 4;
- buff += 4;
- }
- count >>= 1; /* nr of 64-bit words.. */
- /* main loop using 64byte blocks */
- zero = 0;
- count64 = count >> 3;
- while (count64) {
- asm("addq 0*8(%[src]),%[res]\n\t"
- "adcq 1*8(%[src]),%[res]\n\t"
- "adcq 2*8(%[src]),%[res]\n\t"
- "adcq 3*8(%[src]),%[res]\n\t"
- "adcq 4*8(%[src]),%[res]\n\t"
- "adcq 5*8(%[src]),%[res]\n\t"
- "adcq 6*8(%[src]),%[res]\n\t"
- "adcq 7*8(%[src]),%[res]\n\t"
- "adcq %[zero],%[res]"
- : [res] "=r" (result)
- : [src] "r" (buff), [zero] "r" (zero),
- "[res]" (result));
- buff += 64;
- count64--;
- }
+ while (unlikely(len >= 64)) {
+ asm("addq 0*8(%[src]),%[res]\n\t"
+ "adcq 1*8(%[src]),%[res]\n\t"
+ "adcq 2*8(%[src]),%[res]\n\t"
+ "adcq 3*8(%[src]),%[res]\n\t"
+ "adcq 4*8(%[src]),%[res]\n\t"
+ "adcq 5*8(%[src]),%[res]\n\t"
+ "adcq 6*8(%[src]),%[res]\n\t"
+ "adcq 7*8(%[src]),%[res]\n\t"
+ "adcq $0,%[res]"
+ : [res] "+r" (temp64)
+ : [src] "r" (buff)
+ : "memory");
+ buff += 64;
+ len -= 64;
+ }
+
+ if (len & 32) {
+ asm("addq 0*8(%[src]),%[res]\n\t"
+ "adcq 1*8(%[src]),%[res]\n\t"
+ "adcq 2*8(%[src]),%[res]\n\t"
+ "adcq 3*8(%[src]),%[res]\n\t"
+ "adcq $0,%[res]"
+ : [res] "+r" (temp64)
+ : [src] "r" (buff)
+ : "memory");
+ buff += 32;
+ }
+ if (len & 16) {
+ asm("addq 0*8(%[src]),%[res]\n\t"
+ "adcq 1*8(%[src]),%[res]\n\t"
+ "adcq $0,%[res]"
+ : [res] "+r" (temp64)
+ : [src] "r" (buff)
+ : "memory");
+ buff += 16;
+ }
+ if (len & 8) {
+ asm("addq 0*8(%[src]),%[res]\n\t"
+ "adcq $0,%[res]"
+ : [res] "+r" (temp64)
+ : [src] "r" (buff)
+ : "memory");
+ buff += 8;
+ }
+ if (len & 7) {
+#ifdef CONFIG_DCACHE_WORD_ACCESS
+ unsigned int shift = (8 - (len & 7)) * 8;
+ unsigned long trail;
- /* last up to 7 8byte blocks */
- count %= 8;
- while (count) {
- asm("addq %1,%0\n\t"
- "adcq %2,%0\n"
- : "=r" (result)
- : "m" (*(unsigned long *)buff),
- "r" (zero), "0" (result));
- --count;
- buff += 8;
- }
- result = add32_with_carry(result>>32,
- result&0xffffffff);
+ trail = (load_unaligned_zeropad(buff) << shift) >> shift;
- if (len & 4) {
- result += *(unsigned int *) buff;
- buff += 4;
- }
+ asm("addq %[trail],%[res]\n\t"
+ "adcq $0,%[res]"
+ : [res] "+r" (temp64)
+ : [trail] "r" (trail));
+#else
+ if (len & 4) {
+ asm("addq %[val],%[res]\n\t"
+ "adcq $0,%[res]"
+ : [res] "+r" (temp64)
+ : [val] "r" ((u64)*(u32 *)buff)
+ : "memory");
+ buff += 4;
}
if (len & 2) {
- result += *(unsigned short *) buff;
+ asm("addq %[val],%[res]\n\t"
+ "adcq $0,%[res]"
+ : [res] "+r" (temp64)
+ : [val] "r" ((u64)*(u16 *)buff)
+ : "memory");
buff += 2;
}
+ if (len & 1) {
+ asm("addq %[val],%[res]\n\t"
+ "adcq $0,%[res]"
+ : [res] "+r" (temp64)
+ : [val] "r" ((u64)*(u8 *)buff)
+ : "memory");
+ }
+#endif
}
- if (len & 1)
- result += *buff;
- result = add32_with_carry(result>>32, result & 0xffffffff);
- if (unlikely(odd)) {
+ result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
+ if (unlikely(odd)) {
result = from32to16(result);
result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
}
- return result;
-}
-
-/*
- * computes the checksum of a memory block at buff, length len,
- * and adds in "sum" (32-bit)
- *
- * returns a 32-bit number suitable for feeding into itself
- * or csum_tcpudp_magic
- *
- * this function must be called with even lengths, except
- * for the last fragment, which may be odd
- *
- * it's best to have buff aligned on a 64-bit boundary
- */
-__wsum csum_partial(const void *buff, int len, __wsum sum)
-{
- return (__force __wsum)add32_with_carry(do_csum(buff, len),
- (__force u32)sum);
+ return (__force __wsum)result;
}
EXPORT_SYMBOL(csum_partial);
@@ -147,4 +147,3 @@ __sum16 ip_compute_csum(const void *buff, int len)
return csum_fold(csum_partial(buff,len,0));
}
EXPORT_SYMBOL(ip_compute_csum);
-
diff --git a/arch/x86/lib/error-inject.c b/arch/x86/lib/error-inject.c
index be5b5fb1598b..520897061ee0 100644
--- a/arch/x86/lib/error-inject.c
+++ b/arch/x86/lib/error-inject.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/linkage.h>
#include <linux/error-injection.h>
#include <linux/kprobes.h>
@@ -10,7 +11,7 @@ asm(
".type just_return_func, @function\n"
".globl just_return_func\n"
"just_return_func:\n"
- " ret\n"
+ ASM_RET
".size just_return_func, .-just_return_func\n"
);
diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
index fa1bc2104b32..b70d98d79a9d 100644
--- a/arch/x86/lib/getuser.S
+++ b/arch/x86/lib/getuser.S
@@ -57,7 +57,7 @@ SYM_FUNC_START(__get_user_1)
1: movzbl (%_ASM_AX),%edx
xor %eax,%eax
ASM_CLAC
- ret
+ RET
SYM_FUNC_END(__get_user_1)
EXPORT_SYMBOL(__get_user_1)
@@ -71,7 +71,7 @@ SYM_FUNC_START(__get_user_2)
2: movzwl (%_ASM_AX),%edx
xor %eax,%eax
ASM_CLAC
- ret
+ RET
SYM_FUNC_END(__get_user_2)
EXPORT_SYMBOL(__get_user_2)
@@ -85,7 +85,7 @@ SYM_FUNC_START(__get_user_4)
3: movl (%_ASM_AX),%edx
xor %eax,%eax
ASM_CLAC
- ret
+ RET
SYM_FUNC_END(__get_user_4)
EXPORT_SYMBOL(__get_user_4)
@@ -100,7 +100,7 @@ SYM_FUNC_START(__get_user_8)
4: movq (%_ASM_AX),%rdx
xor %eax,%eax
ASM_CLAC
- ret
+ RET
#else
LOAD_TASK_SIZE_MINUS_N(7)
cmp %_ASM_DX,%_ASM_AX
@@ -112,7 +112,7 @@ SYM_FUNC_START(__get_user_8)
5: movl 4(%_ASM_AX),%ecx
xor %eax,%eax
ASM_CLAC
- ret
+ RET
#endif
SYM_FUNC_END(__get_user_8)
EXPORT_SYMBOL(__get_user_8)
@@ -124,7 +124,7 @@ SYM_FUNC_START(__get_user_nocheck_1)
6: movzbl (%_ASM_AX),%edx
xor %eax,%eax
ASM_CLAC
- ret
+ RET
SYM_FUNC_END(__get_user_nocheck_1)
EXPORT_SYMBOL(__get_user_nocheck_1)
@@ -134,7 +134,7 @@ SYM_FUNC_START(__get_user_nocheck_2)
7: movzwl (%_ASM_AX),%edx
xor %eax,%eax
ASM_CLAC
- ret
+ RET
SYM_FUNC_END(__get_user_nocheck_2)
EXPORT_SYMBOL(__get_user_nocheck_2)
@@ -144,7 +144,7 @@ SYM_FUNC_START(__get_user_nocheck_4)
8: movl (%_ASM_AX),%edx
xor %eax,%eax
ASM_CLAC
- ret
+ RET
SYM_FUNC_END(__get_user_nocheck_4)
EXPORT_SYMBOL(__get_user_nocheck_4)
@@ -159,7 +159,7 @@ SYM_FUNC_START(__get_user_nocheck_8)
#endif
xor %eax,%eax
ASM_CLAC
- ret
+ RET
SYM_FUNC_END(__get_user_nocheck_8)
EXPORT_SYMBOL(__get_user_nocheck_8)
@@ -169,7 +169,7 @@ SYM_CODE_START_LOCAL(.Lbad_get_user_clac)
bad_get_user:
xor %edx,%edx
mov $(-EFAULT),%_ASM_AX
- ret
+ RET
SYM_CODE_END(.Lbad_get_user_clac)
#ifdef CONFIG_X86_32
@@ -179,7 +179,7 @@ bad_get_user_8:
xor %edx,%edx
xor %ecx,%ecx
mov $(-EFAULT),%_ASM_AX
- ret
+ RET
SYM_CODE_END(.Lbad_get_user_8_clac)
#endif
diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S
index dbf8cc97b7f5..12c16c6aa44a 100644
--- a/arch/x86/lib/hweight.S
+++ b/arch/x86/lib/hweight.S
@@ -32,7 +32,7 @@ SYM_FUNC_START(__sw_hweight32)
imull $0x01010101, %eax, %eax # w_tmp *= 0x01010101
shrl $24, %eax # w = w_tmp >> 24
__ASM_SIZE(pop,) %__ASM_REG(dx)
- ret
+ RET
SYM_FUNC_END(__sw_hweight32)
EXPORT_SYMBOL(__sw_hweight32)
@@ -65,7 +65,7 @@ SYM_FUNC_START(__sw_hweight64)
popq %rdx
popq %rdi
- ret
+ RET
#else /* CONFIG_X86_32 */
/* We're getting an u64 arg in (%eax,%edx): unsigned long hweight64(__u64 w) */
pushl %ecx
@@ -77,7 +77,7 @@ SYM_FUNC_START(__sw_hweight64)
addl %ecx, %eax # result
popl %ecx
- ret
+ RET
#endif
SYM_FUNC_END(__sw_hweight64)
EXPORT_SYMBOL(__sw_hweight64)
diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c
index a1d24fdc07cf..b781d324211b 100644
--- a/arch/x86/lib/insn-eval.c
+++ b/arch/x86/lib/insn-eval.c
@@ -37,8 +37,6 @@ enum reg_type {
*/
static bool is_string_insn(struct insn *insn)
{
- insn_get_opcode(insn);
-
/* All string instructions have a 1-byte opcode. */
if (insn->opcode.nbytes != 1)
return false;
@@ -412,32 +410,44 @@ static short get_segment_selector(struct pt_regs *regs, int seg_reg_idx)
#endif /* CONFIG_X86_64 */
}
-static int get_reg_offset(struct insn *insn, struct pt_regs *regs,
- enum reg_type type)
+static const int pt_regoff[] = {
+ offsetof(struct pt_regs, ax),
+ offsetof(struct pt_regs, cx),
+ offsetof(struct pt_regs, dx),
+ offsetof(struct pt_regs, bx),
+ offsetof(struct pt_regs, sp),
+ offsetof(struct pt_regs, bp),
+ offsetof(struct pt_regs, si),
+ offsetof(struct pt_regs, di),
+#ifdef CONFIG_X86_64
+ offsetof(struct pt_regs, r8),
+ offsetof(struct pt_regs, r9),
+ offsetof(struct pt_regs, r10),
+ offsetof(struct pt_regs, r11),
+ offsetof(struct pt_regs, r12),
+ offsetof(struct pt_regs, r13),
+ offsetof(struct pt_regs, r14),
+ offsetof(struct pt_regs, r15),
+#else
+ offsetof(struct pt_regs, ds),
+ offsetof(struct pt_regs, es),
+ offsetof(struct pt_regs, fs),
+ offsetof(struct pt_regs, gs),
+#endif
+};
+
+int pt_regs_offset(struct pt_regs *regs, int regno)
+{
+ if ((unsigned)regno < ARRAY_SIZE(pt_regoff))
+ return pt_regoff[regno];
+ return -EDOM;
+}
+
+static int get_regno(struct insn *insn, enum reg_type type)
{
+ int nr_registers = ARRAY_SIZE(pt_regoff);
int regno = 0;
- static const int regoff[] = {
- offsetof(struct pt_regs, ax),
- offsetof(struct pt_regs, cx),
- offsetof(struct pt_regs, dx),
- offsetof(struct pt_regs, bx),
- offsetof(struct pt_regs, sp),
- offsetof(struct pt_regs, bp),
- offsetof(struct pt_regs, si),
- offsetof(struct pt_regs, di),
-#ifdef CONFIG_X86_64
- offsetof(struct pt_regs, r8),
- offsetof(struct pt_regs, r9),
- offsetof(struct pt_regs, r10),
- offsetof(struct pt_regs, r11),
- offsetof(struct pt_regs, r12),
- offsetof(struct pt_regs, r13),
- offsetof(struct pt_regs, r14),
- offsetof(struct pt_regs, r15),
-#endif
- };
- int nr_registers = ARRAY_SIZE(regoff);
/*
* Don't possibly decode a 32-bit instructions as
* reading a 64-bit-only register.
@@ -505,7 +515,18 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs,
WARN_ONCE(1, "decoded an instruction with an invalid register");
return -EINVAL;
}
- return regoff[regno];
+ return regno;
+}
+
+static int get_reg_offset(struct insn *insn, struct pt_regs *regs,
+ enum reg_type type)
+{
+ int regno = get_regno(insn, type);
+
+ if (regno < 0)
+ return regno;
+
+ return pt_regs_offset(regs, regno);
}
/**
@@ -851,6 +872,26 @@ int insn_get_modrm_reg_off(struct insn *insn, struct pt_regs *regs)
}
/**
+ * insn_get_modrm_reg_ptr() - Obtain register pointer based on ModRM byte
+ * @insn: Instruction containing the ModRM byte
+ * @regs: Register values as seen when entering kernel mode
+ *
+ * Returns:
+ *
+ * The register indicated by the reg part of the ModRM byte.
+ * The register is obtained as a pointer within pt_regs.
+ */
+unsigned long *insn_get_modrm_reg_ptr(struct insn *insn, struct pt_regs *regs)
+{
+ int offset;
+
+ offset = insn_get_modrm_reg_off(insn, regs);
+ if (offset < 0)
+ return NULL;
+ return (void *)regs + offset;
+}
+
+/**
* get_seg_base_limit() - obtain base address and limit of a segment
* @insn: Instruction. Must be valid.
* @regs: Register values as seen when entering kernel mode
@@ -1405,6 +1446,9 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs)
if (!insn || !regs)
return (void __user *)-1L;
+ if (insn_get_opcode(insn))
+ return (void __user *)-1L;
+
switch (insn->addr_bytes) {
case 2:
return get_addr_ref_16(insn, regs);
@@ -1417,7 +1461,7 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs)
}
}
-static int insn_get_effective_ip(struct pt_regs *regs, unsigned long *ip)
+int insn_get_effective_ip(struct pt_regs *regs, unsigned long *ip)
{
unsigned long seg_base = 0;
@@ -1539,3 +1583,87 @@ bool insn_decode_from_regs(struct insn *insn, struct pt_regs *regs,
return true;
}
+
+/**
+ * insn_decode_mmio() - Decode a MMIO instruction
+ * @insn: Structure to store decoded instruction
+ * @bytes: Returns size of memory operand
+ *
+ * Decodes instruction that used for Memory-mapped I/O.
+ *
+ * Returns:
+ *
+ * Type of the instruction. Size of the memory operand is stored in
+ * @bytes. If decode failed, MMIO_DECODE_FAILED returned.
+ */
+enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes)
+{
+ enum mmio_type type = MMIO_DECODE_FAILED;
+
+ *bytes = 0;
+
+ if (insn_get_opcode(insn))
+ return MMIO_DECODE_FAILED;
+
+ switch (insn->opcode.bytes[0]) {
+ case 0x88: /* MOV m8,r8 */
+ *bytes = 1;
+ fallthrough;
+ case 0x89: /* MOV m16/m32/m64, r16/m32/m64 */
+ if (!*bytes)
+ *bytes = insn->opnd_bytes;
+ type = MMIO_WRITE;
+ break;
+
+ case 0xc6: /* MOV m8, imm8 */
+ *bytes = 1;
+ fallthrough;
+ case 0xc7: /* MOV m16/m32/m64, imm16/imm32/imm64 */
+ if (!*bytes)
+ *bytes = insn->opnd_bytes;
+ type = MMIO_WRITE_IMM;
+ break;
+
+ case 0x8a: /* MOV r8, m8 */
+ *bytes = 1;
+ fallthrough;
+ case 0x8b: /* MOV r16/r32/r64, m16/m32/m64 */
+ if (!*bytes)
+ *bytes = insn->opnd_bytes;
+ type = MMIO_READ;
+ break;
+
+ case 0xa4: /* MOVS m8, m8 */
+ *bytes = 1;
+ fallthrough;
+ case 0xa5: /* MOVS m16/m32/m64, m16/m32/m64 */
+ if (!*bytes)
+ *bytes = insn->opnd_bytes;
+ type = MMIO_MOVS;
+ break;
+
+ case 0x0f: /* Two-byte instruction */
+ switch (insn->opcode.bytes[1]) {
+ case 0xb6: /* MOVZX r16/r32/r64, m8 */
+ *bytes = 1;
+ fallthrough;
+ case 0xb7: /* MOVZX r32/r64, m16 */
+ if (!*bytes)
+ *bytes = 2;
+ type = MMIO_READ_ZERO_EXTEND;
+ break;
+
+ case 0xbe: /* MOVSX r16/r32/r64, m8 */
+ *bytes = 1;
+ fallthrough;
+ case 0xbf: /* MOVSX r32/r64, m16 */
+ if (!*bytes)
+ *bytes = 2;
+ type = MMIO_READ_SIGN_EXTEND;
+ break;
+ }
+ break;
+ }
+
+ return type;
+}
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
index c565def611e2..55e371cc69fd 100644
--- a/arch/x86/lib/insn.c
+++ b/arch/x86/lib/insn.c
@@ -13,6 +13,7 @@
#endif
#include <asm/inat.h> /*__ignore_sync_check__ */
#include <asm/insn.h> /* __ignore_sync_check__ */
+#include <asm/unaligned.h> /* __ignore_sync_check__ */
#include <linux/errno.h>
#include <linux/kconfig.h>
@@ -37,10 +38,10 @@
((insn)->next_byte + sizeof(t) + n <= (insn)->end_kaddr)
#define __get_next(t, insn) \
- ({ t r; memcpy(&r, insn->next_byte, sizeof(t)); insn->next_byte += sizeof(t); leXX_to_cpu(t, r); })
+ ({ t r = get_unaligned((t *)(insn)->next_byte); (insn)->next_byte += sizeof(t); leXX_to_cpu(t, r); })
#define __peek_nbyte_next(t, insn, n) \
- ({ t r; memcpy(&r, (insn)->next_byte + n, sizeof(t)); leXX_to_cpu(t, r); })
+ ({ t r = get_unaligned((t *)(insn)->next_byte + n); leXX_to_cpu(t, r); })
#define get_next(t, insn) \
({ if (unlikely(!validate_next(t, insn, 0))) goto err_out; __get_next(t, insn); })
diff --git a/arch/x86/lib/iomap_copy_64.S b/arch/x86/lib/iomap_copy_64.S
index cb5a1964506b..a1f9416bf67a 100644
--- a/arch/x86/lib/iomap_copy_64.S
+++ b/arch/x86/lib/iomap_copy_64.S
@@ -11,5 +11,5 @@
SYM_FUNC_START(__iowrite32_copy)
movl %edx,%ecx
rep movsd
- ret
+ RET
SYM_FUNC_END(__iowrite32_copy)
diff --git a/arch/x86/lib/kaslr.c b/arch/x86/lib/kaslr.c
index a53665116458..2b3eb8c948a3 100644
--- a/arch/x86/lib/kaslr.c
+++ b/arch/x86/lib/kaslr.c
@@ -56,11 +56,14 @@ unsigned long kaslr_get_random_long(const char *purpose)
unsigned long raw, random = get_boot_seed();
bool use_i8254 = true;
- debug_putstr(purpose);
- debug_putstr(" KASLR using");
+ if (purpose) {
+ debug_putstr(purpose);
+ debug_putstr(" KASLR using");
+ }
if (has_cpuflag(X86_FEATURE_RDRAND)) {
- debug_putstr(" RDRAND");
+ if (purpose)
+ debug_putstr(" RDRAND");
if (rdrand_long(&raw)) {
random ^= raw;
use_i8254 = false;
@@ -68,7 +71,8 @@ unsigned long kaslr_get_random_long(const char *purpose)
}
if (has_cpuflag(X86_FEATURE_TSC)) {
- debug_putstr(" RDTSC");
+ if (purpose)
+ debug_putstr(" RDTSC");
raw = rdtsc();
random ^= raw;
@@ -76,7 +80,8 @@ unsigned long kaslr_get_random_long(const char *purpose)
}
if (use_i8254) {
- debug_putstr(" i8254");
+ if (purpose)
+ debug_putstr(" i8254");
random ^= i8254();
}
@@ -86,7 +91,8 @@ unsigned long kaslr_get_random_long(const char *purpose)
: "a" (random), "rm" (mix_const));
random += raw;
- debug_putstr("...\n");
+ if (purpose)
+ debug_putstr("...\n");
return random;
}
diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c
index e565d1c9019e..3a6e6cfe8c35 100644
--- a/arch/x86/lib/memcpy_32.c
+++ b/arch/x86/lib/memcpy_32.c
@@ -7,11 +7,7 @@
__visible void *memcpy(void *to, const void *from, size_t n)
{
-#if defined(CONFIG_X86_USE_3DNOW) && !defined(CONFIG_FORTIFY_SOURCE)
- return __memcpy3d(to, from, n);
-#else
return __memcpy(to, from, n);
-#endif
}
EXPORT_SYMBOL(memcpy);
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 1cc9da6e29c7..59cf2343f3d9 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -39,7 +39,7 @@ SYM_FUNC_START_WEAK(memcpy)
rep movsq
movl %edx, %ecx
rep movsb
- ret
+ RET
SYM_FUNC_END(memcpy)
SYM_FUNC_END_ALIAS(__memcpy)
EXPORT_SYMBOL(memcpy)
@@ -53,7 +53,7 @@ SYM_FUNC_START_LOCAL(memcpy_erms)
movq %rdi, %rax
movq %rdx, %rcx
rep movsb
- ret
+ RET
SYM_FUNC_END(memcpy_erms)
SYM_FUNC_START_LOCAL(memcpy_orig)
@@ -137,7 +137,7 @@ SYM_FUNC_START_LOCAL(memcpy_orig)
movq %r9, 1*8(%rdi)
movq %r10, -2*8(%rdi, %rdx)
movq %r11, -1*8(%rdi, %rdx)
- retq
+ RET
.p2align 4
.Lless_16bytes:
cmpl $8, %edx
@@ -149,7 +149,7 @@ SYM_FUNC_START_LOCAL(memcpy_orig)
movq -1*8(%rsi, %rdx), %r9
movq %r8, 0*8(%rdi)
movq %r9, -1*8(%rdi, %rdx)
- retq
+ RET
.p2align 4
.Lless_8bytes:
cmpl $4, %edx
@@ -162,7 +162,7 @@ SYM_FUNC_START_LOCAL(memcpy_orig)
movl -4(%rsi, %rdx), %r8d
movl %ecx, (%rdi)
movl %r8d, -4(%rdi, %rdx)
- retq
+ RET
.p2align 4
.Lless_3bytes:
subl $1, %edx
@@ -180,7 +180,7 @@ SYM_FUNC_START_LOCAL(memcpy_orig)
movb %cl, (%rdi)
.Lend:
- retq
+ RET
SYM_FUNC_END(memcpy_orig)
.popsection
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 64801010d312..50ea390df712 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -40,7 +40,7 @@ SYM_FUNC_START(__memmove)
/* FSRM implies ERMS => no length checks, do the copy directly */
.Lmemmove_begin_forward:
ALTERNATIVE "cmp $0x20, %rdx; jb 1f", "", X86_FEATURE_FSRM
- ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS
+ ALTERNATIVE "", __stringify(movq %rdx, %rcx; rep movsb; RET), X86_FEATURE_ERMS
/*
* movsq instruction have many startup latency
@@ -205,7 +205,7 @@ SYM_FUNC_START(__memmove)
movb (%rsi), %r11b
movb %r11b, (%rdi)
13:
- retq
+ RET
SYM_FUNC_END(__memmove)
SYM_FUNC_END_ALIAS(memmove)
EXPORT_SYMBOL(__memmove)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 9827ae267f96..d624f2bc42f1 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -40,7 +40,7 @@ SYM_FUNC_START(__memset)
movl %edx,%ecx
rep stosb
movq %r9,%rax
- ret
+ RET
SYM_FUNC_END(__memset)
SYM_FUNC_END_ALIAS(memset)
EXPORT_SYMBOL(memset)
@@ -63,7 +63,7 @@ SYM_FUNC_START_LOCAL(memset_erms)
movq %rdx,%rcx
rep stosb
movq %r9,%rax
- ret
+ RET
SYM_FUNC_END(memset_erms)
SYM_FUNC_START_LOCAL(memset_orig)
@@ -125,7 +125,7 @@ SYM_FUNC_START_LOCAL(memset_orig)
.Lende:
movq %r10,%rax
- ret
+ RET
.Lbad_alignment:
cmpq $7,%rdx
diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c
index cc5f4ea943d3..e69de29bb2d1 100644
--- a/arch/x86/lib/mmx_32.c
+++ b/arch/x86/lib/mmx_32.c
@@ -1,388 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * MMX 3DNow! library helper functions
- *
- * To do:
- * We can use MMX just for prefetch in IRQ's. This may be a win.
- * (reported so on K6-III)
- * We should use a better code neutral filler for the short jump
- * leal ebx. [ebx] is apparently best for K6-2, but Cyrix ??
- * We also want to clobber the filler register so we don't get any
- * register forwarding stalls on the filler.
- *
- * Add *user handling. Checksums are not a win with MMX on any CPU
- * tested so far for any MMX solution figured.
- *
- * 22/09/2000 - Arjan van de Ven
- * Improved for non-engineering-sample Athlons
- *
- */
-#include <linux/hardirq.h>
-#include <linux/string.h>
-#include <linux/export.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-
-#include <asm/fpu/api.h>
-#include <asm/asm.h>
-
-/*
- * Use KFPU_387. MMX instructions are not affected by MXCSR,
- * but both AMD and Intel documentation states that even integer MMX
- * operations will result in #MF if an exception is pending in FCW.
- *
- * EMMS is not needed afterwards because, after calling kernel_fpu_end(),
- * any subsequent user of the 387 stack will reinitialize it using
- * KFPU_387.
- */
-
-void *_mmx_memcpy(void *to, const void *from, size_t len)
-{
- void *p;
- int i;
-
- if (unlikely(in_interrupt()))
- return __memcpy(to, from, len);
-
- p = to;
- i = len >> 6; /* len/64 */
-
- kernel_fpu_begin_mask(KFPU_387);
-
- __asm__ __volatile__ (
- "1: prefetch (%0)\n" /* This set is 28 bytes */
- " prefetch 64(%0)\n"
- " prefetch 128(%0)\n"
- " prefetch 192(%0)\n"
- " prefetch 256(%0)\n"
- "2: \n"
- ".section .fixup, \"ax\"\n"
- "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b)
- : : "r" (from));
-
- for ( ; i > 5; i--) {
- __asm__ __volatile__ (
- "1: prefetch 320(%0)\n"
- "2: movq (%0), %%mm0\n"
- " movq 8(%0), %%mm1\n"
- " movq 16(%0), %%mm2\n"
- " movq 24(%0), %%mm3\n"
- " movq %%mm0, (%1)\n"
- " movq %%mm1, 8(%1)\n"
- " movq %%mm2, 16(%1)\n"
- " movq %%mm3, 24(%1)\n"
- " movq 32(%0), %%mm0\n"
- " movq 40(%0), %%mm1\n"
- " movq 48(%0), %%mm2\n"
- " movq 56(%0), %%mm3\n"
- " movq %%mm0, 32(%1)\n"
- " movq %%mm1, 40(%1)\n"
- " movq %%mm2, 48(%1)\n"
- " movq %%mm3, 56(%1)\n"
- ".section .fixup, \"ax\"\n"
- "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b)
- : : "r" (from), "r" (to) : "memory");
-
- from += 64;
- to += 64;
- }
-
- for ( ; i > 0; i--) {
- __asm__ __volatile__ (
- " movq (%0), %%mm0\n"
- " movq 8(%0), %%mm1\n"
- " movq 16(%0), %%mm2\n"
- " movq 24(%0), %%mm3\n"
- " movq %%mm0, (%1)\n"
- " movq %%mm1, 8(%1)\n"
- " movq %%mm2, 16(%1)\n"
- " movq %%mm3, 24(%1)\n"
- " movq 32(%0), %%mm0\n"
- " movq 40(%0), %%mm1\n"
- " movq 48(%0), %%mm2\n"
- " movq 56(%0), %%mm3\n"
- " movq %%mm0, 32(%1)\n"
- " movq %%mm1, 40(%1)\n"
- " movq %%mm2, 48(%1)\n"
- " movq %%mm3, 56(%1)\n"
- : : "r" (from), "r" (to) : "memory");
-
- from += 64;
- to += 64;
- }
- /*
- * Now do the tail of the block:
- */
- __memcpy(to, from, len & 63);
- kernel_fpu_end();
-
- return p;
-}
-EXPORT_SYMBOL(_mmx_memcpy);
-
-#ifdef CONFIG_MK7
-
-/*
- * The K7 has streaming cache bypass load/store. The Cyrix III, K6 and
- * other MMX using processors do not.
- */
-
-static void fast_clear_page(void *page)
-{
- int i;
-
- kernel_fpu_begin_mask(KFPU_387);
-
- __asm__ __volatile__ (
- " pxor %%mm0, %%mm0\n" : :
- );
-
- for (i = 0; i < 4096/64; i++) {
- __asm__ __volatile__ (
- " movntq %%mm0, (%0)\n"
- " movntq %%mm0, 8(%0)\n"
- " movntq %%mm0, 16(%0)\n"
- " movntq %%mm0, 24(%0)\n"
- " movntq %%mm0, 32(%0)\n"
- " movntq %%mm0, 40(%0)\n"
- " movntq %%mm0, 48(%0)\n"
- " movntq %%mm0, 56(%0)\n"
- : : "r" (page) : "memory");
- page += 64;
- }
-
- /*
- * Since movntq is weakly-ordered, a "sfence" is needed to become
- * ordered again:
- */
- __asm__ __volatile__("sfence\n"::);
-
- kernel_fpu_end();
-}
-
-static void fast_copy_page(void *to, void *from)
-{
- int i;
-
- kernel_fpu_begin_mask(KFPU_387);
-
- /*
- * maybe the prefetch stuff can go before the expensive fnsave...
- * but that is for later. -AV
- */
- __asm__ __volatile__(
- "1: prefetch (%0)\n"
- " prefetch 64(%0)\n"
- " prefetch 128(%0)\n"
- " prefetch 192(%0)\n"
- " prefetch 256(%0)\n"
- "2: \n"
- ".section .fixup, \"ax\"\n"
- "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b) : : "r" (from));
-
- for (i = 0; i < (4096-320)/64; i++) {
- __asm__ __volatile__ (
- "1: prefetch 320(%0)\n"
- "2: movq (%0), %%mm0\n"
- " movntq %%mm0, (%1)\n"
- " movq 8(%0), %%mm1\n"
- " movntq %%mm1, 8(%1)\n"
- " movq 16(%0), %%mm2\n"
- " movntq %%mm2, 16(%1)\n"
- " movq 24(%0), %%mm3\n"
- " movntq %%mm3, 24(%1)\n"
- " movq 32(%0), %%mm4\n"
- " movntq %%mm4, 32(%1)\n"
- " movq 40(%0), %%mm5\n"
- " movntq %%mm5, 40(%1)\n"
- " movq 48(%0), %%mm6\n"
- " movntq %%mm6, 48(%1)\n"
- " movq 56(%0), %%mm7\n"
- " movntq %%mm7, 56(%1)\n"
- ".section .fixup, \"ax\"\n"
- "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b) : : "r" (from), "r" (to) : "memory");
-
- from += 64;
- to += 64;
- }
-
- for (i = (4096-320)/64; i < 4096/64; i++) {
- __asm__ __volatile__ (
- "2: movq (%0), %%mm0\n"
- " movntq %%mm0, (%1)\n"
- " movq 8(%0), %%mm1\n"
- " movntq %%mm1, 8(%1)\n"
- " movq 16(%0), %%mm2\n"
- " movntq %%mm2, 16(%1)\n"
- " movq 24(%0), %%mm3\n"
- " movntq %%mm3, 24(%1)\n"
- " movq 32(%0), %%mm4\n"
- " movntq %%mm4, 32(%1)\n"
- " movq 40(%0), %%mm5\n"
- " movntq %%mm5, 40(%1)\n"
- " movq 48(%0), %%mm6\n"
- " movntq %%mm6, 48(%1)\n"
- " movq 56(%0), %%mm7\n"
- " movntq %%mm7, 56(%1)\n"
- : : "r" (from), "r" (to) : "memory");
- from += 64;
- to += 64;
- }
- /*
- * Since movntq is weakly-ordered, a "sfence" is needed to become
- * ordered again:
- */
- __asm__ __volatile__("sfence \n"::);
- kernel_fpu_end();
-}
-
-#else /* CONFIG_MK7 */
-
-/*
- * Generic MMX implementation without K7 specific streaming
- */
-static void fast_clear_page(void *page)
-{
- int i;
-
- kernel_fpu_begin_mask(KFPU_387);
-
- __asm__ __volatile__ (
- " pxor %%mm0, %%mm0\n" : :
- );
-
- for (i = 0; i < 4096/128; i++) {
- __asm__ __volatile__ (
- " movq %%mm0, (%0)\n"
- " movq %%mm0, 8(%0)\n"
- " movq %%mm0, 16(%0)\n"
- " movq %%mm0, 24(%0)\n"
- " movq %%mm0, 32(%0)\n"
- " movq %%mm0, 40(%0)\n"
- " movq %%mm0, 48(%0)\n"
- " movq %%mm0, 56(%0)\n"
- " movq %%mm0, 64(%0)\n"
- " movq %%mm0, 72(%0)\n"
- " movq %%mm0, 80(%0)\n"
- " movq %%mm0, 88(%0)\n"
- " movq %%mm0, 96(%0)\n"
- " movq %%mm0, 104(%0)\n"
- " movq %%mm0, 112(%0)\n"
- " movq %%mm0, 120(%0)\n"
- : : "r" (page) : "memory");
- page += 128;
- }
-
- kernel_fpu_end();
-}
-
-static void fast_copy_page(void *to, void *from)
-{
- int i;
-
- kernel_fpu_begin_mask(KFPU_387);
-
- __asm__ __volatile__ (
- "1: prefetch (%0)\n"
- " prefetch 64(%0)\n"
- " prefetch 128(%0)\n"
- " prefetch 192(%0)\n"
- " prefetch 256(%0)\n"
- "2: \n"
- ".section .fixup, \"ax\"\n"
- "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b) : : "r" (from));
-
- for (i = 0; i < 4096/64; i++) {
- __asm__ __volatile__ (
- "1: prefetch 320(%0)\n"
- "2: movq (%0), %%mm0\n"
- " movq 8(%0), %%mm1\n"
- " movq 16(%0), %%mm2\n"
- " movq 24(%0), %%mm3\n"
- " movq %%mm0, (%1)\n"
- " movq %%mm1, 8(%1)\n"
- " movq %%mm2, 16(%1)\n"
- " movq %%mm3, 24(%1)\n"
- " movq 32(%0), %%mm0\n"
- " movq 40(%0), %%mm1\n"
- " movq 48(%0), %%mm2\n"
- " movq 56(%0), %%mm3\n"
- " movq %%mm0, 32(%1)\n"
- " movq %%mm1, 40(%1)\n"
- " movq %%mm2, 48(%1)\n"
- " movq %%mm3, 56(%1)\n"
- ".section .fixup, \"ax\"\n"
- "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b)
- : : "r" (from), "r" (to) : "memory");
-
- from += 64;
- to += 64;
- }
- kernel_fpu_end();
-}
-
-#endif /* !CONFIG_MK7 */
-
-/*
- * Favour MMX for page clear and copy:
- */
-static void slow_zero_page(void *page)
-{
- int d0, d1;
-
- __asm__ __volatile__(
- "cld\n\t"
- "rep ; stosl"
-
- : "=&c" (d0), "=&D" (d1)
- :"a" (0), "1" (page), "0" (1024)
- :"memory");
-}
-
-void mmx_clear_page(void *page)
-{
- if (unlikely(in_interrupt()))
- slow_zero_page(page);
- else
- fast_clear_page(page);
-}
-EXPORT_SYMBOL(mmx_clear_page);
-
-static void slow_copy_page(void *to, void *from)
-{
- int d0, d1, d2;
-
- __asm__ __volatile__(
- "cld\n\t"
- "rep ; movsl"
- : "=&c" (d0), "=&D" (d1), "=&S" (d2)
- : "0" (1024), "1" ((long) to), "2" ((long) from)
- : "memory");
-}
-
-void mmx_copy_page(void *to, void *from)
-{
- if (unlikely(in_interrupt()))
- slow_copy_page(to, from);
- else
- fast_copy_page(to, from);
-}
-EXPORT_SYMBOL(mmx_copy_page);
diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S
index a2b9caa5274c..ebd259f31496 100644
--- a/arch/x86/lib/msr-reg.S
+++ b/arch/x86/lib/msr-reg.S
@@ -35,7 +35,7 @@ SYM_FUNC_START(\op\()_safe_regs)
movl %edi, 28(%r10)
popq %r12
popq %rbx
- ret
+ RET
3:
movl $-EIO, %r11d
jmp 2b
@@ -77,7 +77,7 @@ SYM_FUNC_START(\op\()_safe_regs)
popl %esi
popl %ebp
popl %ebx
- ret
+ RET
3:
movl $-EIO, 4(%esp)
jmp 2b
diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S
index 0ea344c5ea43..ecb2049c1273 100644
--- a/arch/x86/lib/putuser.S
+++ b/arch/x86/lib/putuser.S
@@ -52,7 +52,7 @@ SYM_INNER_LABEL(__put_user_nocheck_1, SYM_L_GLOBAL)
1: movb %al,(%_ASM_CX)
xor %ecx,%ecx
ASM_CLAC
- ret
+ RET
SYM_FUNC_END(__put_user_1)
EXPORT_SYMBOL(__put_user_1)
EXPORT_SYMBOL(__put_user_nocheck_1)
@@ -66,7 +66,7 @@ SYM_INNER_LABEL(__put_user_nocheck_2, SYM_L_GLOBAL)
2: movw %ax,(%_ASM_CX)
xor %ecx,%ecx
ASM_CLAC
- ret
+ RET
SYM_FUNC_END(__put_user_2)
EXPORT_SYMBOL(__put_user_2)
EXPORT_SYMBOL(__put_user_nocheck_2)
@@ -80,7 +80,7 @@ SYM_INNER_LABEL(__put_user_nocheck_4, SYM_L_GLOBAL)
3: movl %eax,(%_ASM_CX)
xor %ecx,%ecx
ASM_CLAC
- ret
+ RET
SYM_FUNC_END(__put_user_4)
EXPORT_SYMBOL(__put_user_4)
EXPORT_SYMBOL(__put_user_nocheck_4)
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index ec9922cba30a..89b3fb244e15 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -23,50 +23,18 @@
.Ldo_rop_\@:
mov %\reg, (%_ASM_SP)
UNWIND_HINT_FUNC
- ret
+ RET
.endm
.macro THUNK reg
- .align 32
-
-SYM_FUNC_START(__x86_indirect_thunk_\reg)
+ .align RETPOLINE_THUNK_SIZE
+SYM_INNER_LABEL(__x86_indirect_thunk_\reg, SYM_L_GLOBAL)
+ UNWIND_HINT_EMPTY
ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \
__stringify(RETPOLINE \reg), X86_FEATURE_RETPOLINE, \
- __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_AMD
-
-SYM_FUNC_END(__x86_indirect_thunk_\reg)
-
-.endm
-
-/*
- * This generates .altinstr_replacement symbols for use by objtool. They,
- * however, must not actually live in .altinstr_replacement since that will be
- * discarded after init, but module alternatives will also reference these
- * symbols.
- *
- * Their names matches the "__x86_indirect_" prefix to mark them as retpolines.
- */
-.macro ALT_THUNK reg
-
- .align 1
-
-SYM_FUNC_START_NOALIGN(__x86_indirect_alt_call_\reg)
- ANNOTATE_RETPOLINE_SAFE
-1: call *%\reg
-2: .skip 5-(2b-1b), 0x90
-SYM_FUNC_END(__x86_indirect_alt_call_\reg)
-
-STACK_FRAME_NON_STANDARD(__x86_indirect_alt_call_\reg)
-
-SYM_FUNC_START_NOALIGN(__x86_indirect_alt_jmp_\reg)
- ANNOTATE_RETPOLINE_SAFE
-1: jmp *%\reg
-2: .skip 5-(2b-1b), 0x90
-SYM_FUNC_END(__x86_indirect_alt_jmp_\reg)
-
-STACK_FRAME_NON_STANDARD(__x86_indirect_alt_jmp_\reg)
+ __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg; int3), X86_FEATURE_RETPOLINE_AMD
.endm
@@ -85,22 +53,16 @@ STACK_FRAME_NON_STANDARD(__x86_indirect_alt_jmp_\reg)
#define __EXPORT_THUNK(sym) _ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym)
#define EXPORT_THUNK(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg)
-#undef GEN
+ .align RETPOLINE_THUNK_SIZE
+SYM_CODE_START(__x86_indirect_thunk_array)
+
#define GEN(reg) THUNK reg
#include <asm/GEN-for-each-reg.h>
-
#undef GEN
-#define GEN(reg) EXPORT_THUNK(reg)
-#include <asm/GEN-for-each-reg.h>
-#undef GEN
-#define GEN(reg) ALT_THUNK reg
-#include <asm/GEN-for-each-reg.h>
+ .align RETPOLINE_THUNK_SIZE
+SYM_CODE_END(__x86_indirect_thunk_array)
-#undef GEN
-#define GEN(reg) __EXPORT_THUNK(__x86_indirect_alt_call_ ## reg)
+#define GEN(reg) EXPORT_THUNK(reg)
#include <asm/GEN-for-each-reg.h>
-
#undef GEN
-#define GEN(reg) __EXPORT_THUNK(__x86_indirect_alt_jmp_ ## reg)
-#include <asm/GEN-for-each-reg.h>
diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c
index d15fdae9656e..53b3f202267c 100644
--- a/arch/x86/lib/string_32.c
+++ b/arch/x86/lib/string_32.c
@@ -11,6 +11,7 @@
* strings.
*/
+#define __NO_FORTIFY
#include <linux/string.h>
#include <linux/export.h>
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 7d290777246d..422257c350c6 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -8,7 +8,6 @@
*/
#include <linux/export.h>
#include <linux/uaccess.h>
-#include <asm/mmx.h>
#include <asm/asm.h>
#ifdef CONFIG_X86_INTEL_USERCOPY
@@ -43,11 +42,7 @@ do { \
" movl %2,%0\n" \
"1: rep; stosb\n" \
"2: " ASM_CLAC "\n" \
- ".section .fixup,\"ax\"\n" \
- "3: lea 0(%2,%0,4),%0\n" \
- " jmp 2b\n" \
- ".previous\n" \
- _ASM_EXTABLE_UA(0b, 3b) \
+ _ASM_EXTABLE_TYPE_REG(0b, 2b, EX_TYPE_UCOPY_LEN4, %2) \
_ASM_EXTABLE_UA(1b, 2b) \
: "=&c"(size), "=&D" (__d0) \
: "r"(size & 3), "0"(size / 4), "1"(addr), "a"(0)); \
@@ -149,10 +144,6 @@ __copy_user_intel(void __user *to, const void *from, unsigned long size)
"36: movl %%eax, %0\n"
"37: rep; movsb\n"
"100:\n"
- ".section .fixup,\"ax\"\n"
- "101: lea 0(%%eax,%0,4),%0\n"
- " jmp 100b\n"
- ".previous\n"
_ASM_EXTABLE_UA(1b, 100b)
_ASM_EXTABLE_UA(2b, 100b)
_ASM_EXTABLE_UA(3b, 100b)
@@ -190,7 +181,7 @@ __copy_user_intel(void __user *to, const void *from, unsigned long size)
_ASM_EXTABLE_UA(35b, 100b)
_ASM_EXTABLE_UA(36b, 100b)
_ASM_EXTABLE_UA(37b, 100b)
- _ASM_EXTABLE_UA(99b, 101b)
+ _ASM_EXTABLE_TYPE_REG(99b, 100b, EX_TYPE_UCOPY_LEN4, %%eax)
: "=&c"(size), "=&D" (d0), "=&S" (d1)
: "1"(to), "2"(from), "0"(size)
: "eax", "edx", "memory");
@@ -255,30 +246,26 @@ static unsigned long __copy_user_intel_nocache(void *to,
" movl %%eax,%0\n"
"7: rep; movsb\n"
"8:\n"
- ".section .fixup,\"ax\"\n"
- "9: lea 0(%%eax,%0,4),%0\n"
- "16: jmp 8b\n"
- ".previous\n"
- _ASM_EXTABLE_UA(0b, 16b)
- _ASM_EXTABLE_UA(1b, 16b)
- _ASM_EXTABLE_UA(2b, 16b)
- _ASM_EXTABLE_UA(21b, 16b)
- _ASM_EXTABLE_UA(3b, 16b)
- _ASM_EXTABLE_UA(31b, 16b)
- _ASM_EXTABLE_UA(4b, 16b)
- _ASM_EXTABLE_UA(41b, 16b)
- _ASM_EXTABLE_UA(10b, 16b)
- _ASM_EXTABLE_UA(51b, 16b)
- _ASM_EXTABLE_UA(11b, 16b)
- _ASM_EXTABLE_UA(61b, 16b)
- _ASM_EXTABLE_UA(12b, 16b)
- _ASM_EXTABLE_UA(71b, 16b)
- _ASM_EXTABLE_UA(13b, 16b)
- _ASM_EXTABLE_UA(81b, 16b)
- _ASM_EXTABLE_UA(14b, 16b)
- _ASM_EXTABLE_UA(91b, 16b)
- _ASM_EXTABLE_UA(6b, 9b)
- _ASM_EXTABLE_UA(7b, 16b)
+ _ASM_EXTABLE_UA(0b, 8b)
+ _ASM_EXTABLE_UA(1b, 8b)
+ _ASM_EXTABLE_UA(2b, 8b)
+ _ASM_EXTABLE_UA(21b, 8b)
+ _ASM_EXTABLE_UA(3b, 8b)
+ _ASM_EXTABLE_UA(31b, 8b)
+ _ASM_EXTABLE_UA(4b, 8b)
+ _ASM_EXTABLE_UA(41b, 8b)
+ _ASM_EXTABLE_UA(10b, 8b)
+ _ASM_EXTABLE_UA(51b, 8b)
+ _ASM_EXTABLE_UA(11b, 8b)
+ _ASM_EXTABLE_UA(61b, 8b)
+ _ASM_EXTABLE_UA(12b, 8b)
+ _ASM_EXTABLE_UA(71b, 8b)
+ _ASM_EXTABLE_UA(13b, 8b)
+ _ASM_EXTABLE_UA(81b, 8b)
+ _ASM_EXTABLE_UA(14b, 8b)
+ _ASM_EXTABLE_UA(91b, 8b)
+ _ASM_EXTABLE_TYPE_REG(6b, 8b, EX_TYPE_UCOPY_LEN4, %%eax)
+ _ASM_EXTABLE_UA(7b, 8b)
: "=&c"(size), "=&D" (d0), "=&S" (d1)
: "1"(to), "2"(from), "0"(size)
: "eax", "edx", "memory");
@@ -315,14 +302,8 @@ do { \
" movl %3,%0\n" \
"1: rep; movsb\n" \
"2:\n" \
- ".section .fixup,\"ax\"\n" \
- "5: addl %3,%0\n" \
- " jmp 2b\n" \
- "3: lea 0(%3,%0,4),%0\n" \
- " jmp 2b\n" \
- ".previous\n" \
- _ASM_EXTABLE_UA(4b, 5b) \
- _ASM_EXTABLE_UA(0b, 3b) \
+ _ASM_EXTABLE_TYPE_REG(4b, 2b, EX_TYPE_UCOPY_LEN1, %3) \
+ _ASM_EXTABLE_TYPE_REG(0b, 2b, EX_TYPE_UCOPY_LEN4, %3) \
_ASM_EXTABLE_UA(1b, 2b) \
: "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \
: "3"(size), "0"(size), "1"(to), "2"(from) \
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 508c81e97ab1..0402a749f3a0 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -35,12 +35,10 @@ unsigned long __clear_user(void __user *addr, unsigned long size)
" incq %[dst]\n"
" decl %%ecx ; jnz 1b\n"
"2:\n"
- ".section .fixup,\"ax\"\n"
- "3: lea 0(%[size1],%[size8],8),%[size8]\n"
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE_UA(0b, 3b)
+
+ _ASM_EXTABLE_TYPE_REG(0b, 2b, EX_TYPE_UCOPY_LEN8, %[size1])
_ASM_EXTABLE_UA(1b, 2b)
+
: [size8] "=&c"(size), [dst] "=&D" (__d0)
: [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr));
clac();
diff --git a/arch/x86/math-emu/div_Xsig.S b/arch/x86/math-emu/div_Xsig.S
index 951da2ad54bb..8c270ab415be 100644
--- a/arch/x86/math-emu/div_Xsig.S
+++ b/arch/x86/math-emu/div_Xsig.S
@@ -341,7 +341,7 @@ L_exit:
popl %esi
leave
- ret
+ RET
#ifdef PARANOID
diff --git a/arch/x86/math-emu/div_small.S b/arch/x86/math-emu/div_small.S
index d047d1816abe..637439bfefa4 100644
--- a/arch/x86/math-emu/div_small.S
+++ b/arch/x86/math-emu/div_small.S
@@ -44,5 +44,5 @@ SYM_FUNC_START(FPU_div_small)
popl %esi
leave
- ret
+ RET
SYM_FUNC_END(FPU_div_small)
diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c
index 034748459482..d62662bdd460 100644
--- a/arch/x86/math-emu/fpu_aux.c
+++ b/arch/x86/math-emu/fpu_aux.c
@@ -53,7 +53,7 @@ void fpstate_init_soft(struct swregs_state *soft)
void finit(void)
{
- fpstate_init_soft(&current->thread.fpu.state.soft);
+ fpstate_init_soft(&current->thread.fpu.fpstate->regs.soft);
}
/*
diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c
index 8679a9d6c47f..7fe56c594aa6 100644
--- a/arch/x86/math-emu/fpu_entry.c
+++ b/arch/x86/math-emu/fpu_entry.c
@@ -31,7 +31,7 @@
#include <linux/uaccess.h>
#include <asm/traps.h>
#include <asm/user.h>
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
#include "fpu_system.h"
#include "fpu_emu.h"
@@ -640,7 +640,7 @@ int fpregs_soft_set(struct task_struct *target,
unsigned int pos, unsigned int count,
const void *kbuf, const void __user *ubuf)
{
- struct swregs_state *s387 = &target->thread.fpu.state.soft;
+ struct swregs_state *s387 = &target->thread.fpu.fpstate->regs.soft;
void *space = s387->st_space;
int ret;
int offset, other, i, tags, regnr, tag, newtop;
@@ -691,7 +691,7 @@ int fpregs_soft_get(struct task_struct *target,
const struct user_regset *regset,
struct membuf to)
{
- struct swregs_state *s387 = &target->thread.fpu.state.soft;
+ struct swregs_state *s387 = &target->thread.fpu.fpstate->regs.soft;
const void *space = s387->st_space;
int offset = (S387->ftop & 7) * 10, other = 80 - offset;
diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h
index 9b41391867dc..eec3e4805c75 100644
--- a/arch/x86/math-emu/fpu_system.h
+++ b/arch/x86/math-emu/fpu_system.h
@@ -73,7 +73,7 @@ static inline bool seg_writable(struct desc_struct *d)
return (d->type & SEG_TYPE_EXECUTE_MASK) == SEG_TYPE_WRITABLE;
}
-#define I387 (&current->thread.fpu.state)
+#define I387 (&current->thread.fpu.fpstate->regs)
#define FPU_info (I387->soft.info)
#define FPU_CS (*(unsigned short *) &(FPU_info->regs->cs))
diff --git a/arch/x86/math-emu/mul_Xsig.S b/arch/x86/math-emu/mul_Xsig.S
index 4afc7b1fa6e9..54a031b66142 100644
--- a/arch/x86/math-emu/mul_Xsig.S
+++ b/arch/x86/math-emu/mul_Xsig.S
@@ -62,7 +62,7 @@ SYM_FUNC_START(mul32_Xsig)
popl %esi
leave
- ret
+ RET
SYM_FUNC_END(mul32_Xsig)
@@ -115,7 +115,7 @@ SYM_FUNC_START(mul64_Xsig)
popl %esi
leave
- ret
+ RET
SYM_FUNC_END(mul64_Xsig)
@@ -175,5 +175,5 @@ SYM_FUNC_START(mul_Xsig_Xsig)
popl %esi
leave
- ret
+ RET
SYM_FUNC_END(mul_Xsig_Xsig)
diff --git a/arch/x86/math-emu/polynom_Xsig.S b/arch/x86/math-emu/polynom_Xsig.S
index 702315eecb86..35fd723fc0df 100644
--- a/arch/x86/math-emu/polynom_Xsig.S
+++ b/arch/x86/math-emu/polynom_Xsig.S
@@ -133,5 +133,5 @@ L_accum_done:
popl %edi
popl %esi
leave
- ret
+ RET
SYM_FUNC_END(polynomial_Xsig)
diff --git a/arch/x86/math-emu/reg_norm.S b/arch/x86/math-emu/reg_norm.S
index cad1d60b1e84..594936eeed67 100644
--- a/arch/x86/math-emu/reg_norm.S
+++ b/arch/x86/math-emu/reg_norm.S
@@ -72,7 +72,7 @@ L_exit_valid:
L_exit:
popl %ebx
leave
- ret
+ RET
L_zero:
@@ -138,7 +138,7 @@ L_exit_nuo_valid:
popl %ebx
leave
- ret
+ RET
L_exit_nuo_zero:
movl TAG_Zero,%eax
@@ -146,5 +146,5 @@ L_exit_nuo_zero:
popl %ebx
leave
- ret
+ RET
SYM_FUNC_END(FPU_normalize_nuo)
diff --git a/arch/x86/math-emu/reg_round.S b/arch/x86/math-emu/reg_round.S
index 4a9fc3cc5a4d..0bb2a092161a 100644
--- a/arch/x86/math-emu/reg_round.S
+++ b/arch/x86/math-emu/reg_round.S
@@ -437,7 +437,7 @@ fpu_Arith_exit:
popl %edi
popl %esi
leave
- ret
+ RET
/*
diff --git a/arch/x86/math-emu/reg_u_add.S b/arch/x86/math-emu/reg_u_add.S
index 9c9e2c810afe..07247287a3af 100644
--- a/arch/x86/math-emu/reg_u_add.S
+++ b/arch/x86/math-emu/reg_u_add.S
@@ -164,6 +164,6 @@ L_exit:
popl %edi
popl %esi
leave
- ret
+ RET
#endif /* PARANOID */
SYM_FUNC_END(FPU_u_add)
diff --git a/arch/x86/math-emu/reg_u_div.S b/arch/x86/math-emu/reg_u_div.S
index e2fb5c2644c5..b5a41e2fc484 100644
--- a/arch/x86/math-emu/reg_u_div.S
+++ b/arch/x86/math-emu/reg_u_div.S
@@ -468,7 +468,7 @@ L_exit:
popl %esi
leave
- ret
+ RET
#endif /* PARANOID */
SYM_FUNC_END(FPU_u_div)
diff --git a/arch/x86/math-emu/reg_u_mul.S b/arch/x86/math-emu/reg_u_mul.S
index 0c779c87ac5b..e2588b24b8c2 100644
--- a/arch/x86/math-emu/reg_u_mul.S
+++ b/arch/x86/math-emu/reg_u_mul.S
@@ -144,7 +144,7 @@ L_exit:
popl %edi
popl %esi
leave
- ret
+ RET
#endif /* PARANOID */
SYM_FUNC_END(FPU_u_mul)
diff --git a/arch/x86/math-emu/reg_u_sub.S b/arch/x86/math-emu/reg_u_sub.S
index e9bb7c248649..4c900c29e4ff 100644
--- a/arch/x86/math-emu/reg_u_sub.S
+++ b/arch/x86/math-emu/reg_u_sub.S
@@ -270,5 +270,5 @@ L_exit:
popl %edi
popl %esi
leave
- ret
+ RET
SYM_FUNC_END(FPU_u_sub)
diff --git a/arch/x86/math-emu/round_Xsig.S b/arch/x86/math-emu/round_Xsig.S
index d9d7de8dbd7b..126c40473bad 100644
--- a/arch/x86/math-emu/round_Xsig.S
+++ b/arch/x86/math-emu/round_Xsig.S
@@ -78,7 +78,7 @@ L_exit:
popl %esi
popl %ebx
leave
- ret
+ RET
SYM_FUNC_END(round_Xsig)
@@ -138,5 +138,5 @@ L_n_exit:
popl %esi
popl %ebx
leave
- ret
+ RET
SYM_FUNC_END(norm_Xsig)
diff --git a/arch/x86/math-emu/shr_Xsig.S b/arch/x86/math-emu/shr_Xsig.S
index 726af985f758..f726bf6f6396 100644
--- a/arch/x86/math-emu/shr_Xsig.S
+++ b/arch/x86/math-emu/shr_Xsig.S
@@ -45,7 +45,7 @@ SYM_FUNC_START(shr_Xsig)
popl %ebx
popl %esi
leave
- ret
+ RET
L_more_than_31:
cmpl $64,%ecx
@@ -61,7 +61,7 @@ L_more_than_31:
movl $0,8(%esi)
popl %esi
leave
- ret
+ RET
L_more_than_63:
cmpl $96,%ecx
@@ -76,7 +76,7 @@ L_more_than_63:
movl %edx,8(%esi)
popl %esi
leave
- ret
+ RET
L_more_than_95:
xorl %eax,%eax
@@ -85,5 +85,5 @@ L_more_than_95:
movl %eax,8(%esi)
popl %esi
leave
- ret
+ RET
SYM_FUNC_END(shr_Xsig)
diff --git a/arch/x86/math-emu/wm_shrx.S b/arch/x86/math-emu/wm_shrx.S
index 4fc89174caf0..f608a28a4c43 100644
--- a/arch/x86/math-emu/wm_shrx.S
+++ b/arch/x86/math-emu/wm_shrx.S
@@ -55,7 +55,7 @@ SYM_FUNC_START(FPU_shrx)
popl %ebx
popl %esi
leave
- ret
+ RET
L_more_than_31:
cmpl $64,%ecx
@@ -70,7 +70,7 @@ L_more_than_31:
movl $0,4(%esi)
popl %esi
leave
- ret
+ RET
L_more_than_63:
cmpl $96,%ecx
@@ -84,7 +84,7 @@ L_more_than_63:
movl %edx,4(%esi)
popl %esi
leave
- ret
+ RET
L_more_than_95:
xorl %eax,%eax
@@ -92,7 +92,7 @@ L_more_than_95:
movl %eax,4(%esi)
popl %esi
leave
- ret
+ RET
SYM_FUNC_END(FPU_shrx)
@@ -146,7 +146,7 @@ SYM_FUNC_START(FPU_shrxs)
popl %ebx
popl %esi
leave
- ret
+ RET
/* Shift by [0..31] bits */
Ls_less_than_32:
@@ -163,7 +163,7 @@ Ls_less_than_32:
popl %ebx
popl %esi
leave
- ret
+ RET
/* Shift by [64..95] bits */
Ls_more_than_63:
@@ -189,7 +189,7 @@ Ls_more_than_63:
popl %ebx
popl %esi
leave
- ret
+ RET
Ls_more_than_95:
/* Shift by [96..inf) bits */
@@ -203,5 +203,5 @@ Ls_more_than_95:
popl %ebx
popl %esi
leave
- ret
+ RET
SYM_FUNC_END(FPU_shrxs)
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 5864219221ca..fe3d3061fc11 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -2,9 +2,11 @@
# Kernel does not boot with instrumentation of tlb.c and mem_encrypt*.c
KCOV_INSTRUMENT_tlb.o := n
KCOV_INSTRUMENT_mem_encrypt.o := n
+KCOV_INSTRUMENT_mem_encrypt_amd.o := n
KCOV_INSTRUMENT_mem_encrypt_identity.o := n
KASAN_SANITIZE_mem_encrypt.o := n
+KASAN_SANITIZE_mem_encrypt_amd.o := n
KASAN_SANITIZE_mem_encrypt_identity.o := n
# Disable KCSAN entirely, because otherwise we get warnings that some functions
@@ -13,6 +15,7 @@ KCSAN_SANITIZE := n
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_mem_encrypt.o = -pg
+CFLAGS_REMOVE_mem_encrypt_amd.o = -pg
CFLAGS_REMOVE_mem_encrypt_identity.o = -pg
endif
@@ -52,6 +55,8 @@ obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o
-obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o
+obj-$(CONFIG_X86_MEM_ENCRYPT) += mem_encrypt.o
+obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_amd.o
+
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_identity.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
index f5e1e60c9095..6c2f1b76a0b6 100644
--- a/arch/x86/mm/cpu_entry_area.c
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -110,6 +110,13 @@ static void __init percpu_setup_exception_stacks(unsigned int cpu)
cea_map_stack(NMI);
cea_map_stack(DB);
cea_map_stack(MCE);
+
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) {
+ if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) {
+ cea_map_stack(VC);
+ cea_map_stack(VC2);
+ }
+ }
}
#else
static inline void percpu_setup_exception_stacks(unsigned int cpu)
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index e1664e9f969c..dba2197c05c3 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -2,48 +2,58 @@
#include <linux/extable.h>
#include <linux/uaccess.h>
#include <linux/sched/debug.h>
+#include <linux/bitfield.h>
#include <xen/xen.h>
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
#include <asm/sev.h>
#include <asm/traps.h>
#include <asm/kdebug.h>
+#include <asm/insn-eval.h>
+#include <asm/sgx.h>
-typedef bool (*ex_handler_t)(const struct exception_table_entry *,
- struct pt_regs *, int, unsigned long,
- unsigned long);
+static inline unsigned long *pt_regs_nr(struct pt_regs *regs, int nr)
+{
+ int reg_offset = pt_regs_offset(regs, nr);
+ static unsigned long __dummy;
+
+ if (WARN_ON_ONCE(reg_offset < 0))
+ return &__dummy;
+
+ return (unsigned long *)((unsigned long)regs + reg_offset);
+}
static inline unsigned long
ex_fixup_addr(const struct exception_table_entry *x)
{
return (unsigned long)&x->fixup + x->fixup;
}
-static inline ex_handler_t
-ex_fixup_handler(const struct exception_table_entry *x)
-{
- return (ex_handler_t)((unsigned long)&x->handler + x->handler);
-}
-__visible bool ex_handler_default(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr,
- unsigned long error_code,
- unsigned long fault_addr)
+static bool ex_handler_default(const struct exception_table_entry *e,
+ struct pt_regs *regs)
{
- regs->ip = ex_fixup_addr(fixup);
+ if (e->data & EX_FLAG_CLEAR_AX)
+ regs->ax = 0;
+ if (e->data & EX_FLAG_CLEAR_DX)
+ regs->dx = 0;
+
+ regs->ip = ex_fixup_addr(e);
return true;
}
-EXPORT_SYMBOL(ex_handler_default);
-__visible bool ex_handler_fault(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr,
- unsigned long error_code,
- unsigned long fault_addr)
+static bool ex_handler_fault(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int trapnr)
{
- regs->ip = ex_fixup_addr(fixup);
regs->ax = trapnr;
- return true;
+ return ex_handler_default(fixup, regs);
+}
+
+static bool ex_handler_sgx(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int trapnr)
+{
+ regs->ax = trapnr | SGX_ENCLS_FAULT_FLAG;
+ return ex_handler_default(fixup, regs);
}
-EXPORT_SYMBOL_GPL(ex_handler_fault);
/*
* Handler for when we fail to restore a task's FPU state. We should never get
@@ -55,111 +65,93 @@ EXPORT_SYMBOL_GPL(ex_handler_fault);
* of vulnerability by restoring from the initial state (essentially, zeroing
* out all the FPU registers) if we can't restore from the task's FPU state.
*/
-__visible bool ex_handler_fprestore(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr,
- unsigned long error_code,
- unsigned long fault_addr)
+static bool ex_handler_fprestore(const struct exception_table_entry *fixup,
+ struct pt_regs *regs)
{
regs->ip = ex_fixup_addr(fixup);
WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.",
(void *)instruction_pointer(regs));
- __restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate());
+ fpu_reset_from_exception_fixup();
return true;
}
-EXPORT_SYMBOL_GPL(ex_handler_fprestore);
-__visible bool ex_handler_uaccess(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr,
- unsigned long error_code,
- unsigned long fault_addr)
+static bool ex_handler_uaccess(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int trapnr)
{
WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?");
- regs->ip = ex_fixup_addr(fixup);
- return true;
+ return ex_handler_default(fixup, regs);
}
-EXPORT_SYMBOL(ex_handler_uaccess);
-__visible bool ex_handler_copy(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr,
- unsigned long error_code,
- unsigned long fault_addr)
+static bool ex_handler_copy(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int trapnr)
{
WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?");
- regs->ip = ex_fixup_addr(fixup);
- regs->ax = trapnr;
- return true;
+ return ex_handler_fault(fixup, regs, trapnr);
}
-EXPORT_SYMBOL(ex_handler_copy);
-__visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr,
- unsigned long error_code,
- unsigned long fault_addr)
+static bool ex_handler_msr(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, bool wrmsr, bool safe, int reg)
{
- if (pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
+ if (!safe && wrmsr &&
+ pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
+ (unsigned int)regs->cx, (unsigned int)regs->dx,
+ (unsigned int)regs->ax, regs->ip, (void *)regs->ip))
+ show_stack_regs(regs);
+
+ if (!safe && !wrmsr &&
+ pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
(unsigned int)regs->cx, regs->ip, (void *)regs->ip))
show_stack_regs(regs);
- /* Pretend that the read succeeded and returned 0. */
- regs->ip = ex_fixup_addr(fixup);
- regs->ax = 0;
- regs->dx = 0;
- return true;
-}
-EXPORT_SYMBOL(ex_handler_rdmsr_unsafe);
+ if (!wrmsr) {
+ /* Pretend that the read succeeded and returned 0. */
+ regs->ax = 0;
+ regs->dx = 0;
+ }
-__visible bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr,
- unsigned long error_code,
- unsigned long fault_addr)
-{
- if (pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
- (unsigned int)regs->cx, (unsigned int)regs->dx,
- (unsigned int)regs->ax, regs->ip, (void *)regs->ip))
- show_stack_regs(regs);
+ if (safe)
+ *pt_regs_nr(regs, reg) = -EIO;
- /* Pretend that the write succeeded. */
- regs->ip = ex_fixup_addr(fixup);
- return true;
+ return ex_handler_default(fixup, regs);
}
-EXPORT_SYMBOL(ex_handler_wrmsr_unsafe);
-__visible bool ex_handler_clear_fs(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr,
- unsigned long error_code,
- unsigned long fault_addr)
+static bool ex_handler_clear_fs(const struct exception_table_entry *fixup,
+ struct pt_regs *regs)
{
if (static_cpu_has(X86_BUG_NULL_SEG))
asm volatile ("mov %0, %%fs" : : "rm" (__USER_DS));
asm volatile ("mov %0, %%fs" : : "rm" (0));
- return ex_handler_default(fixup, regs, trapnr, error_code, fault_addr);
+ return ex_handler_default(fixup, regs);
}
-EXPORT_SYMBOL(ex_handler_clear_fs);
-enum handler_type ex_get_fault_handler_type(unsigned long ip)
+static bool ex_handler_imm_reg(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int reg, int imm)
{
- const struct exception_table_entry *e;
- ex_handler_t handler;
+ *pt_regs_nr(regs, reg) = (long)imm;
+ return ex_handler_default(fixup, regs);
+}
- e = search_exception_tables(ip);
- if (!e)
- return EX_HANDLER_NONE;
- handler = ex_fixup_handler(e);
- if (handler == ex_handler_fault)
- return EX_HANDLER_FAULT;
- else if (handler == ex_handler_uaccess || handler == ex_handler_copy)
- return EX_HANDLER_UACCESS;
- else
- return EX_HANDLER_OTHER;
+static bool ex_handler_ucopy_len(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int trapnr, int reg, int imm)
+{
+ regs->cx = imm * regs->cx + *pt_regs_nr(regs, reg);
+ return ex_handler_uaccess(fixup, regs, trapnr);
+}
+
+int ex_get_fixup_type(unsigned long ip)
+{
+ const struct exception_table_entry *e = search_exception_tables(ip);
+
+ return e ? FIELD_GET(EX_DATA_TYPE_MASK, e->data) : EX_TYPE_NONE;
}
int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
unsigned long fault_addr)
{
const struct exception_table_entry *e;
- ex_handler_t handler;
+ int type, reg, imm;
#ifdef CONFIG_PNPBIOS
if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
@@ -179,8 +171,52 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
if (!e)
return 0;
- handler = ex_fixup_handler(e);
- return handler(e, regs, trapnr, error_code, fault_addr);
+ type = FIELD_GET(EX_DATA_TYPE_MASK, e->data);
+ reg = FIELD_GET(EX_DATA_REG_MASK, e->data);
+ imm = FIELD_GET(EX_DATA_IMM_MASK, e->data);
+
+ switch (type) {
+ case EX_TYPE_DEFAULT:
+ case EX_TYPE_DEFAULT_MCE_SAFE:
+ return ex_handler_default(e, regs);
+ case EX_TYPE_FAULT:
+ case EX_TYPE_FAULT_MCE_SAFE:
+ return ex_handler_fault(e, regs, trapnr);
+ case EX_TYPE_UACCESS:
+ return ex_handler_uaccess(e, regs, trapnr);
+ case EX_TYPE_COPY:
+ return ex_handler_copy(e, regs, trapnr);
+ case EX_TYPE_CLEAR_FS:
+ return ex_handler_clear_fs(e, regs);
+ case EX_TYPE_FPU_RESTORE:
+ return ex_handler_fprestore(e, regs);
+ case EX_TYPE_BPF:
+ return ex_handler_bpf(e, regs);
+ case EX_TYPE_WRMSR:
+ return ex_handler_msr(e, regs, true, false, reg);
+ case EX_TYPE_RDMSR:
+ return ex_handler_msr(e, regs, false, false, reg);
+ case EX_TYPE_WRMSR_SAFE:
+ return ex_handler_msr(e, regs, true, true, reg);
+ case EX_TYPE_RDMSR_SAFE:
+ return ex_handler_msr(e, regs, false, true, reg);
+ case EX_TYPE_WRMSR_IN_MCE:
+ ex_handler_msr_mce(regs, true);
+ break;
+ case EX_TYPE_RDMSR_IN_MCE:
+ ex_handler_msr_mce(regs, false);
+ break;
+ case EX_TYPE_POP_REG:
+ regs->sp += sizeof(long);
+ fallthrough;
+ case EX_TYPE_IMM_REG:
+ return ex_handler_imm_reg(e, regs, reg, imm);
+ case EX_TYPE_FAULT_SGX:
+ return ex_handler_sgx(e, regs, trapnr);
+ case EX_TYPE_UCOPY_LEN:
+ return ex_handler_ucopy_len(e, regs, trapnr, reg, imm);
+ }
+ BUG();
}
extern unsigned int early_recursion_flag;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 84a2c8c4af73..d0074c6ed31a 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -32,6 +32,7 @@
#include <asm/pgtable_areas.h> /* VMALLOC_START, ... */
#include <asm/kvm_para.h> /* kvm_handle_async_pf */
#include <asm/vdso.h> /* fixup_vdso_exception() */
+#include <asm/irq_stack.h>
#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>
@@ -631,6 +632,9 @@ static noinline void
page_fault_oops(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
{
+#ifdef CONFIG_VMAP_STACK
+ struct stack_info info;
+#endif
unsigned long flags;
int sig;
@@ -649,9 +653,7 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code,
* that we're in vmalloc space to avoid this.
*/
if (is_vmalloc_addr((void *)address) &&
- (((unsigned long)current->stack - 1 - address < PAGE_SIZE) ||
- address - ((unsigned long)current->stack + THREAD_SIZE) < PAGE_SIZE)) {
- unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *);
+ get_stack_guard_info((void *)address, &info)) {
/*
* We're likely to be running with very little stack space
* left. It's plausible that we'd hit this condition but
@@ -662,13 +664,11 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code,
* and then double-fault, though, because we're likely to
* break the console driver and lose most of the stack dump.
*/
- asm volatile ("movq %[stack], %%rsp\n\t"
- "call handle_stack_overflow\n\t"
- "1: jmp 1b"
- : ASM_CALL_CONSTRAINT
- : "D" ("kernel stack overflow (page fault)"),
- "S" (regs), "d" (address),
- [stack] "rm" (stack));
+ call_on_stack(__this_cpu_ist_top_va(DF) - sizeof(void*),
+ handle_stack_overflow,
+ ASM_CALL_ARG3,
+ , [arg1] "r" (regs), [arg2] "r" (address), [arg3] "r" (&info));
+
unreachable();
}
#endif
@@ -1413,8 +1413,7 @@ good_area:
* and if there is a fatal signal pending there is no guarantee
* that we made any progress. Handle this case first.
*/
- if (unlikely((fault & VM_FAULT_RETRY) &&
- (flags & FAULT_FLAG_ALLOW_RETRY))) {
+ if (unlikely(fault & VM_FAULT_RETRY)) {
flags |= FAULT_FLAG_TRIED;
goto retry;
}
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 23a14d82e783..4ba024d5b63a 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -618,7 +618,7 @@ static void __init memory_map_top_down(unsigned long map_start,
*/
addr = memblock_phys_alloc_range(PMD_SIZE, PMD_SIZE, map_start,
map_end);
- memblock_free(addr, PMD_SIZE);
+ memblock_phys_free(addr, PMD_SIZE);
real_end = addr + PMD_SIZE;
/* step_size need to be small so pgt_buf from BRK could cover it */
@@ -714,6 +714,11 @@ static void __init memory_map_bottom_up(unsigned long map_start,
static void __init init_trampoline(void)
{
#ifdef CONFIG_X86_64
+ /*
+ * The code below will alias kernel page-tables in the user-range of the
+ * address space, including the Global bit. So global TLB entries will
+ * be created when using the trampoline page-table.
+ */
if (!kaslr_memory_enabled())
trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
else
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index bd90b8fe81e4..d4e2648a1dfb 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -238,11 +238,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
}
}
-/*
- * The <linux/kallsyms.h> already defines is_kernel_text,
- * using '__' prefix not to get in conflict.
- */
-static inline int __is_kernel_text(unsigned long addr)
+static inline int is_x86_32_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end)
return 1;
@@ -333,8 +329,8 @@ repeat:
addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
PAGE_OFFSET + PAGE_SIZE-1;
- if (__is_kernel_text(addr) ||
- __is_kernel_text(addr2))
+ if (is_x86_32_kernel_text(addr) ||
+ is_x86_32_kernel_text(addr2))
prot = PAGE_KERNEL_LARGE_EXEC;
pages_2m++;
@@ -359,7 +355,7 @@ repeat:
*/
pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR);
- if (__is_kernel_text(addr))
+ if (is_x86_32_kernel_text(addr))
prot = PAGE_KERNEL_EXEC;
pages_4k++;
@@ -779,37 +775,6 @@ void __init mem_init(void)
test_wp_bit();
}
-#ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size,
- struct mhp_params *params)
-{
- unsigned long start_pfn = start >> PAGE_SHIFT;
- unsigned long nr_pages = size >> PAGE_SHIFT;
- int ret;
-
- /*
- * The page tables were already mapped at boot so if the caller
- * requests a different mapping type then we must change all the
- * pages with __set_memory_prot().
- */
- if (params->pgprot.pgprot != PAGE_KERNEL.pgprot) {
- ret = __set_memory_prot(start, nr_pages, params->pgprot);
- if (ret)
- return ret;
- }
-
- return __add_pages(nid, start_pfn, nr_pages, params);
-}
-
-void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
-{
- unsigned long start_pfn = start >> PAGE_SHIFT;
- unsigned long nr_pages = size >> PAGE_SHIFT;
-
- __remove_pages(start_pfn, nr_pages, altmap);
-}
-#endif
-
int kernel_set_to_readonly __read_mostly;
static void mark_nxdata_nx(void)
@@ -820,7 +785,7 @@ static void mark_nxdata_nx(void)
*/
unsigned long start = PFN_ALIGN(_etext);
/*
- * This comes from __is_kernel_text upper limit. Also HPAGE where used:
+ * This comes from is_x86_32_kernel_text upper limit. Also HPAGE where used:
*/
unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 36098226a957..96d34ebb20a9 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -981,7 +981,7 @@ static void __meminit free_pagetable(struct page *page, int order)
if (PageReserved(page)) {
__ClearPageReserved(page);
- magic = (unsigned long)page->freelist;
+ magic = page->index;
if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
while (nr_pages--)
put_page_bootmem(page++);
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 60ade7dd71bd..026031b3b782 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -14,7 +14,7 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mmiotrace.h>
-#include <linux/mem_encrypt.h>
+#include <linux/cc_platform.h>
#include <linux/efi.h>
#include <linux/pgtable.h>
@@ -92,7 +92,7 @@ static unsigned int __ioremap_check_ram(struct resource *res)
*/
static unsigned int __ioremap_check_encrypted(struct resource *res)
{
- if (!sev_active())
+ if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
return 0;
switch (res->desc) {
@@ -112,7 +112,7 @@ static unsigned int __ioremap_check_encrypted(struct resource *res)
*/
static void __ioremap_check_other(resource_size_t addr, struct ioremap_desc *desc)
{
- if (!sev_active())
+ if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
return;
if (!IS_ENABLED(CONFIG_EFI))
@@ -508,6 +508,7 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
memunmap((void *)((unsigned long)addr & PAGE_MASK));
}
+#ifdef CONFIG_AMD_MEM_ENCRYPT
/*
* Examine the physical address to determine if it is an area of memory
* that should be mapped decrypted. If the memory is not part of the
@@ -555,7 +556,7 @@ static bool memremap_should_map_decrypted(resource_size_t phys_addr,
case E820_TYPE_NVS:
case E820_TYPE_UNUSABLE:
/* For SEV, these areas are encrypted */
- if (sev_active())
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
break;
fallthrough;
@@ -693,7 +694,7 @@ static bool __init early_memremap_is_setup_data(resource_size_t phys_addr,
bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size,
unsigned long flags)
{
- if (!mem_encrypt_active())
+ if (!cc_platform_has(CC_ATTR_MEM_ENCRYPT))
return true;
if (flags & MEMREMAP_ENC)
@@ -702,7 +703,7 @@ bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size,
if (flags & MEMREMAP_DEC)
return false;
- if (sme_active()) {
+ if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) {
if (memremap_is_setup_data(phys_addr, size) ||
memremap_is_efi_data(phys_addr, size))
return false;
@@ -723,12 +724,12 @@ pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr,
{
bool encrypted_prot;
- if (!mem_encrypt_active())
+ if (!cc_platform_has(CC_ATTR_MEM_ENCRYPT))
return prot;
encrypted_prot = true;
- if (sme_active()) {
+ if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) {
if (early_memremap_is_setup_data(phys_addr, size) ||
memremap_is_efi_data(phys_addr, size))
encrypted_prot = false;
@@ -746,7 +747,6 @@ bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size)
return arch_memremap_can_ram_remap(phys_addr, size, 0);
}
-#ifdef CONFIG_AMD_MEM_ENCRYPT
/* Remap memory with encryption */
void __init *early_memremap_encrypted(resource_size_t phys_addr,
unsigned long size)
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index ef885370719a..e7b9b464a82f 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -49,7 +49,7 @@ static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr,
p = early_alloc(PMD_SIZE, nid, false);
if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL))
return;
- memblock_free_ptr(p, PMD_SIZE);
+ memblock_free(p, PMD_SIZE);
}
p = early_alloc(PAGE_SIZE, nid, true);
@@ -85,7 +85,7 @@ static void __init kasan_populate_pud(pud_t *pud, unsigned long addr,
p = early_alloc(PUD_SIZE, nid, false);
if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL))
return;
- memblock_free_ptr(p, PUD_SIZE);
+ memblock_free(p, PUD_SIZE);
}
p = early_alloc(PAGE_SIZE, nid, true);
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index ff08dc463634..50d209939c66 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -1,401 +1,26 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * AMD Memory Encryption Support
+ * Memory Encryption Support Common Code
*
* Copyright (C) 2016 Advanced Micro Devices, Inc.
*
* Author: Tom Lendacky <thomas.lendacky@amd.com>
*/
-#define DISABLE_BRANCH_PROFILING
-
-#include <linux/linkage.h>
-#include <linux/init.h>
-#include <linux/mm.h>
#include <linux/dma-direct.h>
+#include <linux/dma-mapping.h>
#include <linux/swiotlb.h>
+#include <linux/cc_platform.h>
#include <linux/mem_encrypt.h>
-#include <linux/device.h>
-#include <linux/kernel.h>
-#include <linux/bitops.h>
-#include <linux/dma-mapping.h>
#include <linux/virtio_config.h>
-#include <asm/tlbflush.h>
-#include <asm/fixmap.h>
-#include <asm/setup.h>
-#include <asm/bootparam.h>
-#include <asm/set_memory.h>
-#include <asm/cacheflush.h>
-#include <asm/processor-flags.h>
-#include <asm/msr.h>
-#include <asm/cmdline.h>
-
-#include "mm_internal.h"
-
-/*
- * Since SME related variables are set early in the boot process they must
- * reside in the .data section so as not to be zeroed out when the .bss
- * section is later cleared.
- */
-u64 sme_me_mask __section(".data") = 0;
-u64 sev_status __section(".data") = 0;
-u64 sev_check_data __section(".data") = 0;
-EXPORT_SYMBOL(sme_me_mask);
-DEFINE_STATIC_KEY_FALSE(sev_enable_key);
-EXPORT_SYMBOL_GPL(sev_enable_key);
-
-/* Buffer used for early in-place encryption by BSP, no locking needed */
-static char sme_early_buffer[PAGE_SIZE] __initdata __aligned(PAGE_SIZE);
-
-/*
- * This routine does not change the underlying encryption setting of the
- * page(s) that map this memory. It assumes that eventually the memory is
- * meant to be accessed as either encrypted or decrypted but the contents
- * are currently not in the desired state.
- *
- * This routine follows the steps outlined in the AMD64 Architecture
- * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place.
- */
-static void __init __sme_early_enc_dec(resource_size_t paddr,
- unsigned long size, bool enc)
-{
- void *src, *dst;
- size_t len;
-
- if (!sme_me_mask)
- return;
-
- wbinvd();
-
- /*
- * There are limited number of early mapping slots, so map (at most)
- * one page at time.
- */
- while (size) {
- len = min_t(size_t, sizeof(sme_early_buffer), size);
-
- /*
- * Create mappings for the current and desired format of
- * the memory. Use a write-protected mapping for the source.
- */
- src = enc ? early_memremap_decrypted_wp(paddr, len) :
- early_memremap_encrypted_wp(paddr, len);
-
- dst = enc ? early_memremap_encrypted(paddr, len) :
- early_memremap_decrypted(paddr, len);
-
- /*
- * If a mapping can't be obtained to perform the operation,
- * then eventual access of that area in the desired mode
- * will cause a crash.
- */
- BUG_ON(!src || !dst);
-
- /*
- * Use a temporary buffer, of cache-line multiple size, to
- * avoid data corruption as documented in the APM.
- */
- memcpy(sme_early_buffer, src, len);
- memcpy(dst, sme_early_buffer, len);
-
- early_memunmap(dst, len);
- early_memunmap(src, len);
-
- paddr += len;
- size -= len;
- }
-}
-
-void __init sme_early_encrypt(resource_size_t paddr, unsigned long size)
-{
- __sme_early_enc_dec(paddr, size, true);
-}
-
-void __init sme_early_decrypt(resource_size_t paddr, unsigned long size)
-{
- __sme_early_enc_dec(paddr, size, false);
-}
-
-static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size,
- bool map)
-{
- unsigned long paddr = (unsigned long)vaddr - __PAGE_OFFSET;
- pmdval_t pmd_flags, pmd;
-
- /* Use early_pmd_flags but remove the encryption mask */
- pmd_flags = __sme_clr(early_pmd_flags);
-
- do {
- pmd = map ? (paddr & PMD_MASK) + pmd_flags : 0;
- __early_make_pgtable((unsigned long)vaddr, pmd);
-
- vaddr += PMD_SIZE;
- paddr += PMD_SIZE;
- size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE;
- } while (size);
-
- flush_tlb_local();
-}
-
-void __init sme_unmap_bootdata(char *real_mode_data)
-{
- struct boot_params *boot_data;
- unsigned long cmdline_paddr;
-
- if (!sme_active())
- return;
-
- /* Get the command line address before unmapping the real_mode_data */
- boot_data = (struct boot_params *)real_mode_data;
- cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
-
- __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), false);
-
- if (!cmdline_paddr)
- return;
-
- __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, false);
-}
-
-void __init sme_map_bootdata(char *real_mode_data)
-{
- struct boot_params *boot_data;
- unsigned long cmdline_paddr;
-
- if (!sme_active())
- return;
-
- __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), true);
-
- /* Get the command line address after mapping the real_mode_data */
- boot_data = (struct boot_params *)real_mode_data;
- cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
-
- if (!cmdline_paddr)
- return;
-
- __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true);
-}
-
-void __init sme_early_init(void)
-{
- unsigned int i;
-
- if (!sme_me_mask)
- return;
-
- early_pmd_flags = __sme_set(early_pmd_flags);
-
- __supported_pte_mask = __sme_set(__supported_pte_mask);
-
- /* Update the protection map with memory encryption mask */
- for (i = 0; i < ARRAY_SIZE(protection_map); i++)
- protection_map[i] = pgprot_encrypted(protection_map[i]);
-
- if (sev_active())
- swiotlb_force = SWIOTLB_FORCE;
-}
-
-void __init sev_setup_arch(void)
-{
- phys_addr_t total_mem = memblock_phys_mem_size();
- unsigned long size;
-
- if (!sev_active())
- return;
-
- /*
- * For SEV, all DMA has to occur via shared/unencrypted pages.
- * SEV uses SWIOTLB to make this happen without changing device
- * drivers. However, depending on the workload being run, the
- * default 64MB of SWIOTLB may not be enough and SWIOTLB may
- * run out of buffers for DMA, resulting in I/O errors and/or
- * performance degradation especially with high I/O workloads.
- *
- * Adjust the default size of SWIOTLB for SEV guests using
- * a percentage of guest memory for SWIOTLB buffers.
- * Also, as the SWIOTLB bounce buffer memory is allocated
- * from low memory, ensure that the adjusted size is within
- * the limits of low available memory.
- *
- * The percentage of guest memory used here for SWIOTLB buffers
- * is more of an approximation of the static adjustment which
- * 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., the 6%
- */
- size = total_mem * 6 / 100;
- size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G);
- swiotlb_adjust_size(size);
-}
-
-static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc)
-{
- pgprot_t old_prot, new_prot;
- unsigned long pfn, pa, size;
- pte_t new_pte;
-
- switch (level) {
- case PG_LEVEL_4K:
- pfn = pte_pfn(*kpte);
- old_prot = pte_pgprot(*kpte);
- break;
- case PG_LEVEL_2M:
- pfn = pmd_pfn(*(pmd_t *)kpte);
- old_prot = pmd_pgprot(*(pmd_t *)kpte);
- break;
- case PG_LEVEL_1G:
- pfn = pud_pfn(*(pud_t *)kpte);
- old_prot = pud_pgprot(*(pud_t *)kpte);
- break;
- default:
- return;
- }
-
- new_prot = old_prot;
- if (enc)
- pgprot_val(new_prot) |= _PAGE_ENC;
- else
- pgprot_val(new_prot) &= ~_PAGE_ENC;
-
- /* If prot is same then do nothing. */
- if (pgprot_val(old_prot) == pgprot_val(new_prot))
- return;
-
- pa = pfn << PAGE_SHIFT;
- size = page_level_size(level);
-
- /*
- * We are going to perform in-place en-/decryption and change the
- * physical page attribute from C=1 to C=0 or vice versa. Flush the
- * caches to ensure that data gets accessed with the correct C-bit.
- */
- clflush_cache_range(__va(pa), size);
-
- /* Encrypt/decrypt the contents in-place */
- if (enc)
- sme_early_encrypt(pa, size);
- else
- sme_early_decrypt(pa, size);
-
- /* Change the page encryption mask. */
- new_pte = pfn_pte(pfn, new_prot);
- set_pte_atomic(kpte, new_pte);
-}
-
-static int __init early_set_memory_enc_dec(unsigned long vaddr,
- unsigned long size, bool enc)
-{
- unsigned long vaddr_end, vaddr_next;
- unsigned long psize, pmask;
- int split_page_size_mask;
- int level, ret;
- pte_t *kpte;
-
- vaddr_next = vaddr;
- vaddr_end = vaddr + size;
-
- for (; vaddr < vaddr_end; vaddr = vaddr_next) {
- kpte = lookup_address(vaddr, &level);
- if (!kpte || pte_none(*kpte)) {
- ret = 1;
- goto out;
- }
-
- if (level == PG_LEVEL_4K) {
- __set_clr_pte_enc(kpte, level, enc);
- vaddr_next = (vaddr & PAGE_MASK) + PAGE_SIZE;
- continue;
- }
-
- psize = page_level_size(level);
- pmask = page_level_mask(level);
-
- /*
- * Check whether we can change the large page in one go.
- * We request a split when the address is not aligned and
- * the number of pages to set/clear encryption bit is smaller
- * than the number of pages in the large page.
- */
- if (vaddr == (vaddr & pmask) &&
- ((vaddr_end - vaddr) >= psize)) {
- __set_clr_pte_enc(kpte, level, enc);
- vaddr_next = (vaddr & pmask) + psize;
- continue;
- }
-
- /*
- * The virtual address is part of a larger page, create the next
- * level page table mapping (4K or 2M). If it is part of a 2M
- * page then we request a split of the large page into 4K
- * chunks. A 1GB large page is split into 2M pages, resp.
- */
- if (level == PG_LEVEL_2M)
- split_page_size_mask = 0;
- else
- split_page_size_mask = 1 << PG_LEVEL_2M;
-
- /*
- * kernel_physical_mapping_change() does not flush the TLBs, so
- * a TLB flush is required after we exit from the for loop.
- */
- kernel_physical_mapping_change(__pa(vaddr & pmask),
- __pa((vaddr_end & pmask) + psize),
- split_page_size_mask);
- }
-
- ret = 0;
-
-out:
- __flush_tlb_all();
- return ret;
-}
-
-int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size)
-{
- return early_set_memory_enc_dec(vaddr, size, false);
-}
-
-int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size)
-{
- return early_set_memory_enc_dec(vaddr, size, true);
-}
-
-/*
- * SME and SEV are very similar but they are not the same, so there are
- * times that the kernel will need to distinguish between SME and SEV. The
- * sme_active() and sev_active() functions are used for this. When a
- * distinction isn't needed, the mem_encrypt_active() function can be used.
- *
- * The trampoline code is a good example for this requirement. Before
- * paging is activated, SME will access all memory as decrypted, but SEV
- * will access all memory as encrypted. So, when APs are being brought
- * up under SME the trampoline area cannot be encrypted, whereas under SEV
- * the trampoline area must be encrypted.
- */
-bool sev_active(void)
-{
- return sev_status & MSR_AMD64_SEV_ENABLED;
-}
-
-bool sme_active(void)
-{
- return sme_me_mask && !sev_active();
-}
-EXPORT_SYMBOL_GPL(sev_active);
-
-/* Needs to be called from non-instrumentable code */
-bool noinstr sev_es_active(void)
-{
- return sev_status & MSR_AMD64_SEV_ES_ENABLED;
-}
-
/* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */
bool force_dma_unencrypted(struct device *dev)
{
/*
* For SEV, all DMA must be to unencrypted addresses.
*/
- if (sev_active())
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
return true;
/*
@@ -403,7 +28,7 @@ bool force_dma_unencrypted(struct device *dev)
* device does not support DMA to addresses that include the
* encryption mask.
*/
- if (sme_active()) {
+ if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) {
u64 dma_enc_mask = DMA_BIT_MASK(__ffs64(sme_me_mask));
u64 dma_dev_mask = min_not_zero(dev->coherent_dma_mask,
dev->bus_dma_limit);
@@ -415,36 +40,12 @@ bool force_dma_unencrypted(struct device *dev)
return false;
}
-void __init mem_encrypt_free_decrypted_mem(void)
-{
- unsigned long vaddr, vaddr_end, npages;
- int r;
-
- vaddr = (unsigned long)__start_bss_decrypted_unused;
- vaddr_end = (unsigned long)__end_bss_decrypted;
- npages = (vaddr_end - vaddr) >> PAGE_SHIFT;
-
- /*
- * The unused memory range was mapped decrypted, change the encryption
- * attribute from decrypted to encrypted before freeing it.
- */
- if (mem_encrypt_active()) {
- r = set_memory_encrypted(vaddr, npages);
- if (r) {
- pr_warn("failed to free unused decrypted pages\n");
- return;
- }
- }
-
- free_init_pages("unused decrypted", vaddr, vaddr_end);
-}
-
static void print_mem_encrypt_feature_info(void)
{
pr_info("AMD Memory Encryption Features active:");
/* Secure Memory Encryption */
- if (sme_active()) {
+ if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) {
/*
* SME is mutually exclusive with any of the SEV
* features below.
@@ -454,11 +55,11 @@ static void print_mem_encrypt_feature_info(void)
}
/* Secure Encrypted Virtualization */
- if (sev_active())
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
pr_cont(" SEV");
/* Encrypted Register State */
- if (sev_es_active())
+ if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
pr_cont(" SEV-ES");
pr_cont("\n");
@@ -467,24 +68,17 @@ static void print_mem_encrypt_feature_info(void)
/* Architecture __weak replacement functions */
void __init mem_encrypt_init(void)
{
- if (!sme_me_mask)
+ if (!cc_platform_has(CC_ATTR_MEM_ENCRYPT))
return;
/* Call into SWIOTLB to update the SWIOTLB DMA buffers */
swiotlb_update_mem_attributes();
- /*
- * With SEV, we need to unroll the rep string I/O instructions,
- * but SEV-ES supports them through the #VC handler.
- */
- if (sev_active() && !sev_es_active())
- static_branch_enable(&sev_enable_key);
-
print_mem_encrypt_feature_info();
}
int arch_has_restricted_virtio_memory_access(void)
{
- return sev_active();
+ return cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT);
}
EXPORT_SYMBOL_GPL(arch_has_restricted_virtio_memory_access);
diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c
new file mode 100644
index 000000000000..2b2d018ea345
--- /dev/null
+++ b/arch/x86/mm/mem_encrypt_amd.c
@@ -0,0 +1,438 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ */
+
+#define DISABLE_BRANCH_PROFILING
+
+#include <linux/linkage.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/dma-direct.h>
+#include <linux/swiotlb.h>
+#include <linux/mem_encrypt.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/dma-mapping.h>
+#include <linux/virtio_config.h>
+#include <linux/cc_platform.h>
+
+#include <asm/tlbflush.h>
+#include <asm/fixmap.h>
+#include <asm/setup.h>
+#include <asm/bootparam.h>
+#include <asm/set_memory.h>
+#include <asm/cacheflush.h>
+#include <asm/processor-flags.h>
+#include <asm/msr.h>
+#include <asm/cmdline.h>
+
+#include "mm_internal.h"
+
+/*
+ * Since SME related variables are set early in the boot process they must
+ * reside in the .data section so as not to be zeroed out when the .bss
+ * section is later cleared.
+ */
+u64 sme_me_mask __section(".data") = 0;
+u64 sev_status __section(".data") = 0;
+u64 sev_check_data __section(".data") = 0;
+EXPORT_SYMBOL(sme_me_mask);
+
+/* Buffer used for early in-place encryption by BSP, no locking needed */
+static char sme_early_buffer[PAGE_SIZE] __initdata __aligned(PAGE_SIZE);
+
+/*
+ * This routine does not change the underlying encryption setting of the
+ * page(s) that map this memory. It assumes that eventually the memory is
+ * meant to be accessed as either encrypted or decrypted but the contents
+ * are currently not in the desired state.
+ *
+ * This routine follows the steps outlined in the AMD64 Architecture
+ * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place.
+ */
+static void __init __sme_early_enc_dec(resource_size_t paddr,
+ unsigned long size, bool enc)
+{
+ void *src, *dst;
+ size_t len;
+
+ if (!sme_me_mask)
+ return;
+
+ wbinvd();
+
+ /*
+ * There are limited number of early mapping slots, so map (at most)
+ * one page at time.
+ */
+ while (size) {
+ len = min_t(size_t, sizeof(sme_early_buffer), size);
+
+ /*
+ * Create mappings for the current and desired format of
+ * the memory. Use a write-protected mapping for the source.
+ */
+ src = enc ? early_memremap_decrypted_wp(paddr, len) :
+ early_memremap_encrypted_wp(paddr, len);
+
+ dst = enc ? early_memremap_encrypted(paddr, len) :
+ early_memremap_decrypted(paddr, len);
+
+ /*
+ * If a mapping can't be obtained to perform the operation,
+ * then eventual access of that area in the desired mode
+ * will cause a crash.
+ */
+ BUG_ON(!src || !dst);
+
+ /*
+ * Use a temporary buffer, of cache-line multiple size, to
+ * avoid data corruption as documented in the APM.
+ */
+ memcpy(sme_early_buffer, src, len);
+ memcpy(dst, sme_early_buffer, len);
+
+ early_memunmap(dst, len);
+ early_memunmap(src, len);
+
+ paddr += len;
+ size -= len;
+ }
+}
+
+void __init sme_early_encrypt(resource_size_t paddr, unsigned long size)
+{
+ __sme_early_enc_dec(paddr, size, true);
+}
+
+void __init sme_early_decrypt(resource_size_t paddr, unsigned long size)
+{
+ __sme_early_enc_dec(paddr, size, false);
+}
+
+static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size,
+ bool map)
+{
+ unsigned long paddr = (unsigned long)vaddr - __PAGE_OFFSET;
+ pmdval_t pmd_flags, pmd;
+
+ /* Use early_pmd_flags but remove the encryption mask */
+ pmd_flags = __sme_clr(early_pmd_flags);
+
+ do {
+ pmd = map ? (paddr & PMD_MASK) + pmd_flags : 0;
+ __early_make_pgtable((unsigned long)vaddr, pmd);
+
+ vaddr += PMD_SIZE;
+ paddr += PMD_SIZE;
+ size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE;
+ } while (size);
+
+ flush_tlb_local();
+}
+
+void __init sme_unmap_bootdata(char *real_mode_data)
+{
+ struct boot_params *boot_data;
+ unsigned long cmdline_paddr;
+
+ if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
+ return;
+
+ /* Get the command line address before unmapping the real_mode_data */
+ boot_data = (struct boot_params *)real_mode_data;
+ cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
+
+ __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), false);
+
+ if (!cmdline_paddr)
+ return;
+
+ __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, false);
+}
+
+void __init sme_map_bootdata(char *real_mode_data)
+{
+ struct boot_params *boot_data;
+ unsigned long cmdline_paddr;
+
+ if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
+ return;
+
+ __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), true);
+
+ /* Get the command line address after mapping the real_mode_data */
+ boot_data = (struct boot_params *)real_mode_data;
+ cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
+
+ if (!cmdline_paddr)
+ return;
+
+ __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true);
+}
+
+void __init sme_early_init(void)
+{
+ unsigned int i;
+
+ if (!sme_me_mask)
+ return;
+
+ early_pmd_flags = __sme_set(early_pmd_flags);
+
+ __supported_pte_mask = __sme_set(__supported_pte_mask);
+
+ /* Update the protection map with memory encryption mask */
+ for (i = 0; i < ARRAY_SIZE(protection_map); i++)
+ protection_map[i] = pgprot_encrypted(protection_map[i]);
+
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
+ swiotlb_force = SWIOTLB_FORCE;
+}
+
+void __init sev_setup_arch(void)
+{
+ phys_addr_t total_mem = memblock_phys_mem_size();
+ unsigned long size;
+
+ if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
+ return;
+
+ /*
+ * For SEV, all DMA has to occur via shared/unencrypted pages.
+ * SEV uses SWIOTLB to make this happen without changing device
+ * drivers. However, depending on the workload being run, the
+ * default 64MB of SWIOTLB may not be enough and SWIOTLB may
+ * run out of buffers for DMA, resulting in I/O errors and/or
+ * performance degradation especially with high I/O workloads.
+ *
+ * Adjust the default size of SWIOTLB for SEV guests using
+ * a percentage of guest memory for SWIOTLB buffers.
+ * Also, as the SWIOTLB bounce buffer memory is allocated
+ * from low memory, ensure that the adjusted size is within
+ * the limits of low available memory.
+ *
+ * The percentage of guest memory used here for SWIOTLB buffers
+ * is more of an approximation of the static adjustment which
+ * 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., the 6%
+ */
+ size = total_mem * 6 / 100;
+ size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G);
+ swiotlb_adjust_size(size);
+}
+
+static unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot)
+{
+ unsigned long pfn = 0;
+ pgprot_t prot;
+
+ switch (level) {
+ case PG_LEVEL_4K:
+ pfn = pte_pfn(*kpte);
+ prot = pte_pgprot(*kpte);
+ break;
+ case PG_LEVEL_2M:
+ pfn = pmd_pfn(*(pmd_t *)kpte);
+ prot = pmd_pgprot(*(pmd_t *)kpte);
+ break;
+ case PG_LEVEL_1G:
+ pfn = pud_pfn(*(pud_t *)kpte);
+ prot = pud_pgprot(*(pud_t *)kpte);
+ break;
+ default:
+ WARN_ONCE(1, "Invalid level for kpte\n");
+ return 0;
+ }
+
+ if (ret_prot)
+ *ret_prot = prot;
+
+ return pfn;
+}
+
+void notify_range_enc_status_changed(unsigned long vaddr, int npages, bool enc)
+{
+#ifdef CONFIG_PARAVIRT
+ unsigned long sz = npages << PAGE_SHIFT;
+ unsigned long vaddr_end = vaddr + sz;
+
+ while (vaddr < vaddr_end) {
+ int psize, pmask, level;
+ unsigned long pfn;
+ pte_t *kpte;
+
+ kpte = lookup_address(vaddr, &level);
+ if (!kpte || pte_none(*kpte)) {
+ WARN_ONCE(1, "kpte lookup for vaddr\n");
+ return;
+ }
+
+ pfn = pg_level_to_pfn(level, kpte, NULL);
+ if (!pfn)
+ continue;
+
+ psize = page_level_size(level);
+ pmask = page_level_mask(level);
+
+ notify_page_enc_status_changed(pfn, psize >> PAGE_SHIFT, enc);
+
+ vaddr = (vaddr & pmask) + psize;
+ }
+#endif
+}
+
+static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc)
+{
+ pgprot_t old_prot, new_prot;
+ unsigned long pfn, pa, size;
+ pte_t new_pte;
+
+ pfn = pg_level_to_pfn(level, kpte, &old_prot);
+ if (!pfn)
+ return;
+
+ new_prot = old_prot;
+ if (enc)
+ pgprot_val(new_prot) |= _PAGE_ENC;
+ else
+ pgprot_val(new_prot) &= ~_PAGE_ENC;
+
+ /* If prot is same then do nothing. */
+ if (pgprot_val(old_prot) == pgprot_val(new_prot))
+ return;
+
+ pa = pfn << PAGE_SHIFT;
+ size = page_level_size(level);
+
+ /*
+ * We are going to perform in-place en-/decryption and change the
+ * physical page attribute from C=1 to C=0 or vice versa. Flush the
+ * caches to ensure that data gets accessed with the correct C-bit.
+ */
+ clflush_cache_range(__va(pa), size);
+
+ /* Encrypt/decrypt the contents in-place */
+ if (enc)
+ sme_early_encrypt(pa, size);
+ else
+ sme_early_decrypt(pa, size);
+
+ /* Change the page encryption mask. */
+ new_pte = pfn_pte(pfn, new_prot);
+ set_pte_atomic(kpte, new_pte);
+}
+
+static int __init early_set_memory_enc_dec(unsigned long vaddr,
+ unsigned long size, bool enc)
+{
+ unsigned long vaddr_end, vaddr_next, start;
+ unsigned long psize, pmask;
+ int split_page_size_mask;
+ int level, ret;
+ pte_t *kpte;
+
+ start = vaddr;
+ vaddr_next = vaddr;
+ vaddr_end = vaddr + size;
+
+ for (; vaddr < vaddr_end; vaddr = vaddr_next) {
+ kpte = lookup_address(vaddr, &level);
+ if (!kpte || pte_none(*kpte)) {
+ ret = 1;
+ goto out;
+ }
+
+ if (level == PG_LEVEL_4K) {
+ __set_clr_pte_enc(kpte, level, enc);
+ vaddr_next = (vaddr & PAGE_MASK) + PAGE_SIZE;
+ continue;
+ }
+
+ psize = page_level_size(level);
+ pmask = page_level_mask(level);
+
+ /*
+ * Check whether we can change the large page in one go.
+ * We request a split when the address is not aligned and
+ * the number of pages to set/clear encryption bit is smaller
+ * than the number of pages in the large page.
+ */
+ if (vaddr == (vaddr & pmask) &&
+ ((vaddr_end - vaddr) >= psize)) {
+ __set_clr_pte_enc(kpte, level, enc);
+ vaddr_next = (vaddr & pmask) + psize;
+ continue;
+ }
+
+ /*
+ * The virtual address is part of a larger page, create the next
+ * level page table mapping (4K or 2M). If it is part of a 2M
+ * page then we request a split of the large page into 4K
+ * chunks. A 1GB large page is split into 2M pages, resp.
+ */
+ if (level == PG_LEVEL_2M)
+ split_page_size_mask = 0;
+ else
+ split_page_size_mask = 1 << PG_LEVEL_2M;
+
+ /*
+ * kernel_physical_mapping_change() does not flush the TLBs, so
+ * a TLB flush is required after we exit from the for loop.
+ */
+ kernel_physical_mapping_change(__pa(vaddr & pmask),
+ __pa((vaddr_end & pmask) + psize),
+ split_page_size_mask);
+ }
+
+ ret = 0;
+
+ notify_range_enc_status_changed(start, PAGE_ALIGN(size) >> PAGE_SHIFT, enc);
+out:
+ __flush_tlb_all();
+ return ret;
+}
+
+int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size)
+{
+ return early_set_memory_enc_dec(vaddr, size, false);
+}
+
+int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size)
+{
+ return early_set_memory_enc_dec(vaddr, size, true);
+}
+
+void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, bool enc)
+{
+ notify_range_enc_status_changed(vaddr, npages, enc);
+}
+
+void __init mem_encrypt_free_decrypted_mem(void)
+{
+ unsigned long vaddr, vaddr_end, npages;
+ int r;
+
+ vaddr = (unsigned long)__start_bss_decrypted_unused;
+ vaddr_end = (unsigned long)__end_bss_decrypted;
+ npages = (vaddr_end - vaddr) >> PAGE_SHIFT;
+
+ /*
+ * The unused memory range was mapped decrypted, change the encryption
+ * attribute from decrypted to encrypted before freeing it.
+ */
+ if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) {
+ r = set_memory_encrypted(vaddr, npages);
+ if (r) {
+ pr_warn("failed to free unused decrypted pages\n");
+ return;
+ }
+ }
+
+ free_init_pages("unused decrypted", vaddr, vaddr_end);
+}
diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S
index 17d292b7072f..3d1dba05fce4 100644
--- a/arch/x86/mm/mem_encrypt_boot.S
+++ b/arch/x86/mm/mem_encrypt_boot.S
@@ -65,7 +65,7 @@ SYM_FUNC_START(sme_encrypt_execute)
movq %rbp, %rsp /* Restore original stack pointer */
pop %rbp
- ret
+ RET
SYM_FUNC_END(sme_encrypt_execute)
SYM_FUNC_START(__enc_copy)
@@ -151,6 +151,6 @@ SYM_FUNC_START(__enc_copy)
pop %r12
pop %r15
- ret
+ RET
.L__enc_copy_end:
SYM_FUNC_END(__enc_copy)
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index 470b20208430..3f0abb403340 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -27,9 +27,19 @@
#undef CONFIG_PARAVIRT_XXL
#undef CONFIG_PARAVIRT_SPINLOCKS
+/*
+ * This code runs before CPU feature bits are set. By default, the
+ * pgtable_l5_enabled() function uses bit X86_FEATURE_LA57 to determine if
+ * 5-level paging is active, so that won't work here. USE_EARLY_PGTABLE_L5
+ * is provided to handle this situation and, instead, use a variable that
+ * has been set by the early boot code.
+ */
+#define USE_EARLY_PGTABLE_L5
+
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/mem_encrypt.h>
+#include <linux/cc_platform.h>
#include <asm/setup.h>
#include <asm/sections.h>
@@ -287,7 +297,13 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
unsigned long pgtable_area_len;
unsigned long decrypted_base;
- if (!sme_active())
+ /*
+ * This is early code, use an open coded check for SME instead of
+ * using cc_platform_has(). This eliminates worries about removing
+ * instrumentation or checking boot_cpu_data in the cc_platform_has()
+ * function.
+ */
+ if (!sme_get_me_mask() || sev_status & MSR_AMD64_SEV_ENABLED)
return;
/*
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 1e9b93b088db..c6b1213086d6 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -355,7 +355,7 @@ void __init numa_reset_distance(void)
/* numa_distance could be 1LU marking allocation failure, test cnt */
if (numa_distance_cnt)
- memblock_free_ptr(numa_distance, size);
+ memblock_free(numa_distance, size);
numa_distance_cnt = 0;
numa_distance = NULL; /* enable table creation */
}
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index e801e30089c4..1a02b791d273 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -517,7 +517,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
}
/* free the copied physical distance table */
- memblock_free_ptr(phys_dist, phys_size);
+ memblock_free(phys_dist, phys_size);
return;
no_emu:
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index ad8a5c586a35..b4072115c8ef 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -18,6 +18,7 @@
#include <linux/libnvdimm.h>
#include <linux/vmstat.h>
#include <linux/kernel.h>
+#include <linux/cc_platform.h>
#include <asm/e820/api.h>
#include <asm/processor.h>
@@ -29,6 +30,8 @@
#include <asm/proto.h>
#include <asm/memtype.h>
#include <asm/set_memory.h>
+#include <asm/hyperv-tlfs.h>
+#include <asm/mshyperv.h>
#include "../mm_internal.h"
@@ -1980,15 +1983,15 @@ int set_memory_global(unsigned long addr, int numpages)
__pgprot(_PAGE_GLOBAL), 0);
}
-static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
+/*
+ * __set_memory_enc_pgtable() is used for the hypervisors that get
+ * informed about "encryption" status via page tables.
+ */
+static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc)
{
struct cpa_data cpa;
int ret;
- /* Nothing to do if memory encryption is not active */
- if (!mem_encrypt_active())
- return 0;
-
/* Should not be working on unaligned addresses */
if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr))
addr &= PAGE_MASK;
@@ -2020,9 +2023,26 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
*/
cpa_flush(&cpa, 0);
+ /*
+ * Notify hypervisor that a given memory range is mapped encrypted
+ * or decrypted.
+ */
+ notify_range_enc_status_changed(addr, numpages, enc);
+
return ret;
}
+static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
+{
+ if (hv_is_isolation_supported())
+ return hv_set_mem_host_visibility(addr, numpages, !enc);
+
+ if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
+ return __set_memory_enc_pgtable(addr, numpages, enc);
+
+ return 0;
+}
+
int set_memory_encrypted(unsigned long addr, int numpages)
{
return __set_memory_enc_dec(addr, numpages, true);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 59ba2968af1b..a6cf56a14939 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -361,7 +361,7 @@ static void l1d_flush_evaluate(unsigned long prev_mm, unsigned long next_mm,
static unsigned long mm_mangle_tif_spec_bits(struct task_struct *next)
{
- unsigned long next_tif = task_thread_info(next)->flags;
+ unsigned long next_tif = read_task_thread_flags(next);
unsigned long spec_bits = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_SPEC_MASK;
/*
@@ -1148,7 +1148,7 @@ void flush_tlb_one_user(unsigned long addr)
*/
STATIC_NOPV void native_flush_tlb_global(void)
{
- unsigned long cr4, flags;
+ unsigned long flags;
if (static_cpu_has(X86_FEATURE_INVPCID)) {
/*
@@ -1168,11 +1168,7 @@ STATIC_NOPV void native_flush_tlb_global(void)
*/
raw_local_irq_save(flags);
- cr4 = this_cpu_read(cpu_tlbstate.cr4);
- /* toggle PGE */
- native_write_cr4(cr4 ^ X86_CR4_PGE);
- /* write old PGE again and flush TLBs */
- native_write_cr4(cr4);
+ __native_tlb_flush_global(this_cpu_read(cpu_tlbstate.cr4));
raw_local_irq_restore(flags);
}
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 9ea57389c554..2b1e266ff95c 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1,9 +1,9 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * bpf_jit_comp.c: BPF JIT compiler
+ * BPF JIT compiler
*
* Copyright (C) 2011-2013 Eric Dumazet (eric.dumazet@gmail.com)
- * Internal BPF Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
*/
#include <linux/netdevice.h>
#include <linux/filter.h>
@@ -15,7 +15,6 @@
#include <asm/set_memory.h>
#include <asm/nospec-branch.h>
#include <asm/text-patching.h>
-#include <asm/asm-prototypes.h>
static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
{
@@ -225,6 +224,14 @@ static void jit_fill_hole(void *area, unsigned int size)
struct jit_context {
int cleanup_addr; /* Epilogue code offset */
+
+ /*
+ * Program specific offsets of labels in the code; these rely on the
+ * JIT doing at least 2 passes, recording the position on the first
+ * pass, only to generate the correct offset on the second pass.
+ */
+ int tail_call_direct_label;
+ int tail_call_indirect_label;
};
/* Maximum number of bytes emitted while JITing one eBPF insn */
@@ -380,20 +387,23 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
return __bpf_arch_text_poke(ip, t, old_addr, new_addr, true);
}
-static int get_pop_bytes(bool *callee_regs_used)
+#define EMIT_LFENCE() EMIT3(0x0F, 0xAE, 0xE8)
+
+static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip)
{
- int bytes = 0;
+ u8 *prog = *pprog;
- if (callee_regs_used[3])
- bytes += 2;
- if (callee_regs_used[2])
- bytes += 2;
- if (callee_regs_used[1])
- bytes += 2;
- if (callee_regs_used[0])
- bytes += 1;
+#ifdef CONFIG_RETPOLINE
+ if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_AMD)) {
+ EMIT_LFENCE();
+ EMIT2(0xFF, 0xE0 + reg);
+ } else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) {
+ emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip);
+ } else
+#endif
+ EMIT2(0xFF, 0xE0 + reg);
- return bytes;
+ *pprog = prog;
}
/*
@@ -402,7 +412,7 @@ static int get_pop_bytes(bool *callee_regs_used)
* ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
* if (index >= array->map.max_entries)
* goto out;
- * if (++tail_call_cnt > MAX_TAIL_CALL_CNT)
+ * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
* goto out;
* prog = array->ptrs[index];
* if (prog == NULL)
@@ -411,29 +421,12 @@ static int get_pop_bytes(bool *callee_regs_used)
* out:
*/
static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used,
- u32 stack_depth)
+ u32 stack_depth, u8 *ip,
+ struct jit_context *ctx)
{
int tcc_off = -4 - round_up(stack_depth, 8);
- u8 *prog = *pprog;
- int pop_bytes = 0;
- int off1 = 42;
- int off2 = 31;
- int off3 = 9;
-
- /* count the additional bytes used for popping callee regs from stack
- * that need to be taken into account for each of the offsets that
- * are used for bailing out of the tail call
- */
- pop_bytes = get_pop_bytes(callee_regs_used);
- off1 += pop_bytes;
- off2 += pop_bytes;
- off3 += pop_bytes;
-
- if (stack_depth) {
- off1 += 7;
- off2 += 7;
- off3 += 7;
- }
+ u8 *prog = *pprog, *start = *pprog;
+ int offset;
/*
* rdi - pointer to ctx
@@ -448,17 +441,19 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used,
EMIT2(0x89, 0xD2); /* mov edx, edx */
EMIT3(0x39, 0x56, /* cmp dword ptr [rsi + 16], edx */
offsetof(struct bpf_array, map.max_entries));
-#define OFFSET1 (off1 + RETPOLINE_RCX_BPF_JIT_SIZE) /* Number of bytes to jump */
- EMIT2(X86_JBE, OFFSET1); /* jbe out */
+
+ offset = ctx->tail_call_indirect_label - (prog + 2 - start);
+ EMIT2(X86_JBE, offset); /* jbe out */
/*
- * if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+ * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
* goto out;
*/
EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */
EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */
-#define OFFSET2 (off2 + RETPOLINE_RCX_BPF_JIT_SIZE)
- EMIT2(X86_JA, OFFSET2); /* ja out */
+
+ offset = ctx->tail_call_indirect_label - (prog + 2 - start);
+ EMIT2(X86_JAE, offset); /* jae out */
EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */
EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */
@@ -471,12 +466,11 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used,
* goto out;
*/
EMIT3(0x48, 0x85, 0xC9); /* test rcx,rcx */
-#define OFFSET3 (off3 + RETPOLINE_RCX_BPF_JIT_SIZE)
- EMIT2(X86_JE, OFFSET3); /* je out */
- *pprog = prog;
- pop_callee_regs(pprog, callee_regs_used);
- prog = *pprog;
+ offset = ctx->tail_call_indirect_label - (prog + 2 - start);
+ EMIT2(X86_JE, offset); /* je out */
+
+ pop_callee_regs(&prog, callee_regs_used);
EMIT1(0x58); /* pop rax */
if (stack_depth)
@@ -493,70 +487,52 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used,
* rdi == ctx (1st arg)
* rcx == prog->bpf_func + X86_TAIL_CALL_OFFSET
*/
- RETPOLINE_RCX_BPF_JIT();
+ emit_indirect_jump(&prog, 1 /* rcx */, ip + (prog - start));
/* out: */
+ ctx->tail_call_indirect_label = prog - start;
*pprog = prog;
}
static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke,
- u8 **pprog, int addr, u8 *image,
- bool *callee_regs_used, u32 stack_depth)
+ u8 **pprog, u8 *ip,
+ bool *callee_regs_used, u32 stack_depth,
+ struct jit_context *ctx)
{
int tcc_off = -4 - round_up(stack_depth, 8);
- u8 *prog = *pprog;
- int pop_bytes = 0;
- int off1 = 20;
- int poke_off;
-
- /* count the additional bytes used for popping callee regs to stack
- * that need to be taken into account for jump offset that is used for
- * bailing out from of the tail call when limit is reached
- */
- pop_bytes = get_pop_bytes(callee_regs_used);
- off1 += pop_bytes;
+ u8 *prog = *pprog, *start = *pprog;
+ int offset;
/*
- * total bytes for:
- * - nop5/ jmpq $off
- * - pop callee regs
- * - sub rsp, $val if depth > 0
- * - pop rax
- */
- poke_off = X86_PATCH_SIZE + pop_bytes + 1;
- if (stack_depth) {
- poke_off += 7;
- off1 += 7;
- }
-
- /*
- * if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+ * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
* goto out;
*/
EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */
EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */
- EMIT2(X86_JA, off1); /* ja out */
+
+ offset = ctx->tail_call_direct_label - (prog + 2 - start);
+ EMIT2(X86_JAE, offset); /* jae out */
EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */
EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */
- poke->tailcall_bypass = image + (addr - poke_off - X86_PATCH_SIZE);
+ poke->tailcall_bypass = ip + (prog - start);
poke->adj_off = X86_TAIL_CALL_OFFSET;
- poke->tailcall_target = image + (addr - X86_PATCH_SIZE);
+ poke->tailcall_target = ip + ctx->tail_call_direct_label - X86_PATCH_SIZE;
poke->bypass_addr = (u8 *)poke->tailcall_target + X86_PATCH_SIZE;
emit_jump(&prog, (u8 *)poke->tailcall_target + X86_PATCH_SIZE,
poke->tailcall_bypass);
- *pprog = prog;
- pop_callee_regs(pprog, callee_regs_used);
- prog = *pprog;
+ pop_callee_regs(&prog, callee_regs_used);
EMIT1(0x58); /* pop rax */
if (stack_depth)
EMIT3_off32(0x48, 0x81, 0xC4, round_up(stack_depth, 8));
memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
prog += X86_PATCH_SIZE;
+
/* out: */
+ ctx->tail_call_direct_label = prog - start;
*pprog = prog;
}
@@ -721,6 +697,20 @@ static void maybe_emit_mod(u8 **pprog, u32 dst_reg, u32 src_reg, bool is64)
*pprog = prog;
}
+/*
+ * Similar version of maybe_emit_mod() for a single register
+ */
+static void maybe_emit_1mod(u8 **pprog, u32 reg, bool is64)
+{
+ u8 *prog = *pprog;
+
+ if (is64)
+ EMIT1(add_1mod(0x48, reg));
+ else if (is_ereg(reg))
+ EMIT1(add_1mod(0x40, reg));
+ *pprog = prog;
+}
+
/* LDX: dst_reg = *(u8*)(src_reg + off) */
static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
{
@@ -827,9 +817,7 @@ static int emit_atomic(u8 **pprog, u8 atomic_op,
return 0;
}
-static bool ex_handler_bpf(const struct exception_table_entry *x,
- struct pt_regs *regs, int trapnr,
- unsigned long error_code, unsigned long fault_addr)
+bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs)
{
u32 reg = x->fixup >> 8;
@@ -951,10 +939,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
/* neg dst */
case BPF_ALU | BPF_NEG:
case BPF_ALU64 | BPF_NEG:
- if (BPF_CLASS(insn->code) == BPF_ALU64)
- EMIT1(add_1mod(0x48, dst_reg));
- else if (is_ereg(dst_reg))
- EMIT1(add_1mod(0x40, dst_reg));
+ maybe_emit_1mod(&prog, dst_reg,
+ BPF_CLASS(insn->code) == BPF_ALU64);
EMIT2(0xF7, add_1reg(0xD8, dst_reg));
break;
@@ -968,10 +954,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
case BPF_ALU64 | BPF_AND | BPF_K:
case BPF_ALU64 | BPF_OR | BPF_K:
case BPF_ALU64 | BPF_XOR | BPF_K:
- if (BPF_CLASS(insn->code) == BPF_ALU64)
- EMIT1(add_1mod(0x48, dst_reg));
- else if (is_ereg(dst_reg))
- EMIT1(add_1mod(0x40, dst_reg));
+ maybe_emit_1mod(&prog, dst_reg,
+ BPF_CLASS(insn->code) == BPF_ALU64);
/*
* b3 holds 'normal' opcode, b2 short form only valid
@@ -1028,19 +1012,30 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
case BPF_ALU64 | BPF_MOD | BPF_X:
case BPF_ALU64 | BPF_DIV | BPF_X:
case BPF_ALU64 | BPF_MOD | BPF_K:
- case BPF_ALU64 | BPF_DIV | BPF_K:
- EMIT1(0x50); /* push rax */
- EMIT1(0x52); /* push rdx */
+ case BPF_ALU64 | BPF_DIV | BPF_K: {
+ bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
- if (BPF_SRC(insn->code) == BPF_X)
- /* mov r11, src_reg */
- EMIT_mov(AUX_REG, src_reg);
- else
+ if (dst_reg != BPF_REG_0)
+ EMIT1(0x50); /* push rax */
+ if (dst_reg != BPF_REG_3)
+ EMIT1(0x52); /* push rdx */
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (src_reg == BPF_REG_0 ||
+ src_reg == BPF_REG_3) {
+ /* mov r11, src_reg */
+ EMIT_mov(AUX_REG, src_reg);
+ src_reg = AUX_REG;
+ }
+ } else {
/* mov r11, imm32 */
EMIT3_off32(0x49, 0xC7, 0xC3, imm32);
+ src_reg = AUX_REG;
+ }
- /* mov rax, dst_reg */
- EMIT_mov(BPF_REG_0, dst_reg);
+ if (dst_reg != BPF_REG_0)
+ /* mov rax, dst_reg */
+ emit_mov_reg(&prog, is64, BPF_REG_0, dst_reg);
/*
* xor edx, edx
@@ -1048,63 +1043,51 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
*/
EMIT2(0x31, 0xd2);
- if (BPF_CLASS(insn->code) == BPF_ALU64)
- /* div r11 */
- EMIT3(0x49, 0xF7, 0xF3);
- else
- /* div r11d */
- EMIT3(0x41, 0xF7, 0xF3);
-
- if (BPF_OP(insn->code) == BPF_MOD)
- /* mov r11, rdx */
- EMIT3(0x49, 0x89, 0xD3);
- else
- /* mov r11, rax */
- EMIT3(0x49, 0x89, 0xC3);
+ /* div src_reg */
+ maybe_emit_1mod(&prog, src_reg, is64);
+ EMIT2(0xF7, add_1reg(0xF0, src_reg));
- EMIT1(0x5A); /* pop rdx */
- EMIT1(0x58); /* pop rax */
+ if (BPF_OP(insn->code) == BPF_MOD &&
+ dst_reg != BPF_REG_3)
+ /* mov dst_reg, rdx */
+ emit_mov_reg(&prog, is64, dst_reg, BPF_REG_3);
+ else if (BPF_OP(insn->code) == BPF_DIV &&
+ dst_reg != BPF_REG_0)
+ /* mov dst_reg, rax */
+ emit_mov_reg(&prog, is64, dst_reg, BPF_REG_0);
- /* mov dst_reg, r11 */
- EMIT_mov(dst_reg, AUX_REG);
+ if (dst_reg != BPF_REG_3)
+ EMIT1(0x5A); /* pop rdx */
+ if (dst_reg != BPF_REG_0)
+ EMIT1(0x58); /* pop rax */
break;
+ }
case BPF_ALU | BPF_MUL | BPF_K:
- case BPF_ALU | BPF_MUL | BPF_X:
case BPF_ALU64 | BPF_MUL | BPF_K:
- case BPF_ALU64 | BPF_MUL | BPF_X:
- {
- bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
-
- if (dst_reg != BPF_REG_0)
- EMIT1(0x50); /* push rax */
- if (dst_reg != BPF_REG_3)
- EMIT1(0x52); /* push rdx */
-
- /* mov r11, dst_reg */
- EMIT_mov(AUX_REG, dst_reg);
+ maybe_emit_mod(&prog, dst_reg, dst_reg,
+ BPF_CLASS(insn->code) == BPF_ALU64);
- if (BPF_SRC(insn->code) == BPF_X)
- emit_mov_reg(&prog, is64, BPF_REG_0, src_reg);
+ if (is_imm8(imm32))
+ /* imul dst_reg, dst_reg, imm8 */
+ EMIT3(0x6B, add_2reg(0xC0, dst_reg, dst_reg),
+ imm32);
else
- emit_mov_imm32(&prog, is64, BPF_REG_0, imm32);
+ /* imul dst_reg, dst_reg, imm32 */
+ EMIT2_off32(0x69,
+ add_2reg(0xC0, dst_reg, dst_reg),
+ imm32);
+ break;
- if (is64)
- EMIT1(add_1mod(0x48, AUX_REG));
- else if (is_ereg(AUX_REG))
- EMIT1(add_1mod(0x40, AUX_REG));
- /* mul(q) r11 */
- EMIT2(0xF7, add_1reg(0xE0, AUX_REG));
+ case BPF_ALU | BPF_MUL | BPF_X:
+ case BPF_ALU64 | BPF_MUL | BPF_X:
+ maybe_emit_mod(&prog, src_reg, dst_reg,
+ BPF_CLASS(insn->code) == BPF_ALU64);
- if (dst_reg != BPF_REG_3)
- EMIT1(0x5A); /* pop rdx */
- if (dst_reg != BPF_REG_0) {
- /* mov dst_reg, rax */
- EMIT_mov(dst_reg, BPF_REG_0);
- EMIT1(0x58); /* pop rax */
- }
+ /* imul dst_reg, src_reg */
+ EMIT3(0x0F, 0xAF, add_2reg(0xC0, src_reg, dst_reg));
break;
- }
+
/* Shifts */
case BPF_ALU | BPF_LSH | BPF_K:
case BPF_ALU | BPF_RSH | BPF_K:
@@ -1112,10 +1095,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
case BPF_ALU64 | BPF_LSH | BPF_K:
case BPF_ALU64 | BPF_RSH | BPF_K:
case BPF_ALU64 | BPF_ARSH | BPF_K:
- if (BPF_CLASS(insn->code) == BPF_ALU64)
- EMIT1(add_1mod(0x48, dst_reg));
- else if (is_ereg(dst_reg))
- EMIT1(add_1mod(0x40, dst_reg));
+ maybe_emit_1mod(&prog, dst_reg,
+ BPF_CLASS(insn->code) == BPF_ALU64);
b3 = simple_alu_opcodes[BPF_OP(insn->code)];
if (imm32 == 1)
@@ -1146,10 +1127,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
}
/* shl %rax, %cl | shr %rax, %cl | sar %rax, %cl */
- if (BPF_CLASS(insn->code) == BPF_ALU64)
- EMIT1(add_1mod(0x48, dst_reg));
- else if (is_ereg(dst_reg))
- EMIT1(add_1mod(0x40, dst_reg));
+ maybe_emit_1mod(&prog, dst_reg,
+ BPF_CLASS(insn->code) == BPF_ALU64);
b3 = simple_alu_opcodes[BPF_OP(insn->code)];
EMIT2(0xD3, add_1reg(b3, dst_reg));
@@ -1222,8 +1201,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
/* speculation barrier */
case BPF_ST | BPF_NOSPEC:
if (boot_cpu_has(X86_FEATURE_XMM2))
- /* Emit 'lfence' */
- EMIT3(0x0F, 0xAE, 0xE8);
+ EMIT_LFENCE();
break;
/* ST: *(u8*)(dst_reg + off) = imm */
@@ -1274,19 +1252,54 @@ st: if (is_imm8(insn->off))
case BPF_LDX | BPF_MEM | BPF_DW:
case BPF_LDX | BPF_PROBE_MEM | BPF_DW:
if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
- /* test src_reg, src_reg */
- maybe_emit_mod(&prog, src_reg, src_reg, true); /* always 1 byte */
- EMIT2(0x85, add_2reg(0xC0, src_reg, src_reg));
- /* jne start_of_ldx */
- EMIT2(X86_JNE, 0);
+ /* Though the verifier prevents negative insn->off in BPF_PROBE_MEM
+ * add abs(insn->off) to the limit to make sure that negative
+ * offset won't be an issue.
+ * insn->off is s16, so it won't affect valid pointers.
+ */
+ u64 limit = TASK_SIZE_MAX + PAGE_SIZE + abs(insn->off);
+ u8 *end_of_jmp1, *end_of_jmp2;
+
+ /* Conservatively check that src_reg + insn->off is a kernel address:
+ * 1. src_reg + insn->off >= limit
+ * 2. src_reg + insn->off doesn't become small positive.
+ * Cannot do src_reg + insn->off >= limit in one branch,
+ * since it needs two spare registers, but JIT has only one.
+ */
+
+ /* movabsq r11, limit */
+ EMIT2(add_1mod(0x48, AUX_REG), add_1reg(0xB8, AUX_REG));
+ EMIT((u32)limit, 4);
+ EMIT(limit >> 32, 4);
+ /* cmp src_reg, r11 */
+ maybe_emit_mod(&prog, src_reg, AUX_REG, true);
+ EMIT2(0x39, add_2reg(0xC0, src_reg, AUX_REG));
+ /* if unsigned '<' goto end_of_jmp2 */
+ EMIT2(X86_JB, 0);
+ end_of_jmp1 = prog;
+
+ /* mov r11, src_reg */
+ emit_mov_reg(&prog, true, AUX_REG, src_reg);
+ /* add r11, insn->off */
+ maybe_emit_1mod(&prog, AUX_REG, true);
+ EMIT2_off32(0x81, add_1reg(0xC0, AUX_REG), insn->off);
+ /* jmp if not carry to start_of_ldx
+ * Otherwise ERR_PTR(-EINVAL) + 128 will be the user addr
+ * that has to be rejected.
+ */
+ EMIT2(0x73 /* JNC */, 0);
+ end_of_jmp2 = prog;
+
/* xor dst_reg, dst_reg */
emit_mov_imm32(&prog, false, dst_reg, 0);
/* jmp byte_after_ldx */
EMIT2(0xEB, 0);
- /* populate jmp_offset for JNE above */
- temp[4] = prog - temp - 5 /* sizeof(test + jne) */;
+ /* populate jmp_offset for JB above to jump to xor dst_reg */
+ end_of_jmp1[-1] = end_of_jmp2 - end_of_jmp1;
+ /* populate jmp_offset for JNC above to jump to start_of_ldx */
start_of_ldx = prog;
+ end_of_jmp2[-1] = start_of_ldx - end_of_jmp2;
}
emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
@@ -1313,12 +1326,7 @@ st: if (is_imm8(insn->off))
}
ex->insn = delta;
- delta = (u8 *)ex_handler_bpf - (u8 *)&ex->handler;
- if (!is_simm32(delta)) {
- pr_err("extable->handler doesn't fit into 32-bit\n");
- return -EFAULT;
- }
- ex->handler = delta;
+ ex->data = EX_TYPE_BPF;
if (dst_reg > BPF_REG_9) {
pr_err("verifier error\n");
@@ -1332,7 +1340,7 @@ st: if (is_imm8(insn->off))
* End result: x86 insn "mov rbx, qword ptr [rax+0x14]"
* of 4 bytes will be ignored and rbx will be zero inited.
*/
- ex->fixup = (prog - temp) | (reg2pt_regs[dst_reg] << 8);
+ ex->fixup = (prog - start_of_ldx) | (reg2pt_regs[dst_reg] << 8);
}
break;
@@ -1412,13 +1420,16 @@ st: if (is_imm8(insn->off))
case BPF_JMP | BPF_TAIL_CALL:
if (imm32)
emit_bpf_tail_call_direct(&bpf_prog->aux->poke_tab[imm32 - 1],
- &prog, addrs[i], image,
+ &prog, image + addrs[i - 1],
callee_regs_used,
- bpf_prog->aux->stack_depth);
+ bpf_prog->aux->stack_depth,
+ ctx);
else
emit_bpf_tail_call_indirect(&prog,
callee_regs_used,
- bpf_prog->aux->stack_depth);
+ bpf_prog->aux->stack_depth,
+ image + addrs[i - 1],
+ ctx);
break;
/* cond jump */
@@ -1459,10 +1470,8 @@ st: if (is_imm8(insn->off))
case BPF_JMP | BPF_JSET | BPF_K:
case BPF_JMP32 | BPF_JSET | BPF_K:
/* test dst_reg, imm32 */
- if (BPF_CLASS(insn->code) == BPF_JMP)
- EMIT1(add_1mod(0x48, dst_reg));
- else if (is_ereg(dst_reg))
- EMIT1(add_1mod(0x40, dst_reg));
+ maybe_emit_1mod(&prog, dst_reg,
+ BPF_CLASS(insn->code) == BPF_JMP);
EMIT2_off32(0xF7, add_1reg(0xC0, dst_reg), imm32);
goto emit_cond_jmp;
@@ -1495,10 +1504,8 @@ st: if (is_imm8(insn->off))
}
/* cmp dst_reg, imm8/32 */
- if (BPF_CLASS(insn->code) == BPF_JMP)
- EMIT1(add_1mod(0x48, dst_reg));
- else if (is_ereg(dst_reg))
- EMIT1(add_1mod(0x40, dst_reg));
+ maybe_emit_1mod(&prog, dst_reg,
+ BPF_CLASS(insn->code) == BPF_JMP);
if (is_imm8(imm32))
EMIT3(0x83, add_1reg(0xF8, dst_reg), imm32);
@@ -1969,7 +1976,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
void *orig_call)
{
int ret, i, nr_args = m->nr_args;
- int stack_size = nr_args * 8;
+ int regs_off, ip_off, args_off, stack_size = nr_args * 8;
struct bpf_tramp_progs *fentry = &tprogs[BPF_TRAMP_FENTRY];
struct bpf_tramp_progs *fexit = &tprogs[BPF_TRAMP_FEXIT];
struct bpf_tramp_progs *fmod_ret = &tprogs[BPF_TRAMP_MODIFY_RETURN];
@@ -1984,14 +1991,39 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
if (!is_valid_bpf_tramp_flags(flags))
return -EINVAL;
+ /* Generated trampoline stack layout:
+ *
+ * RBP + 8 [ return address ]
+ * RBP + 0 [ RBP ]
+ *
+ * RBP - 8 [ return value ] BPF_TRAMP_F_CALL_ORIG or
+ * BPF_TRAMP_F_RET_FENTRY_RET flags
+ *
+ * [ reg_argN ] always
+ * [ ... ]
+ * RBP - regs_off [ reg_arg1 ] program's ctx pointer
+ *
+ * RBP - args_off [ args count ] always
+ *
+ * RBP - ip_off [ traced function ] BPF_TRAMP_F_IP_ARG flag
+ */
+
/* room for return value of orig_call or fentry prog */
save_ret = flags & (BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_RET_FENTRY_RET);
if (save_ret)
stack_size += 8;
+ regs_off = stack_size;
+
+ /* args count */
+ stack_size += 8;
+ args_off = stack_size;
+
if (flags & BPF_TRAMP_F_IP_ARG)
stack_size += 8; /* room for IP address argument */
+ ip_off = stack_size;
+
if (flags & BPF_TRAMP_F_SKIP_FRAME)
/* skip patched call instruction and point orig_call to actual
* body of the kernel function.
@@ -2005,23 +2037,25 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */
EMIT1(0x53); /* push rbx */
+ /* Store number of arguments of the traced function:
+ * mov rax, nr_args
+ * mov QWORD PTR [rbp - args_off], rax
+ */
+ emit_mov_imm64(&prog, BPF_REG_0, 0, (u32) nr_args);
+ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -args_off);
+
if (flags & BPF_TRAMP_F_IP_ARG) {
/* Store IP address of the traced function:
* mov rax, QWORD PTR [rbp + 8]
* sub rax, X86_PATCH_SIZE
- * mov QWORD PTR [rbp - stack_size], rax
+ * mov QWORD PTR [rbp - ip_off], rax
*/
emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, 8);
EMIT4(0x48, 0x83, 0xe8, X86_PATCH_SIZE);
- emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -stack_size);
-
- /* Continue with stack_size for regs storage, stack will
- * be correctly restored with 'leave' instruction.
- */
- stack_size -= 8;
+ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -ip_off);
}
- save_regs(m, &prog, nr_args, stack_size);
+ save_regs(m, &prog, nr_args, regs_off);
if (flags & BPF_TRAMP_F_CALL_ORIG) {
/* arg1: mov rdi, im */
@@ -2033,7 +2067,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
}
if (fentry->nr_progs)
- if (invoke_bpf(m, &prog, fentry, stack_size,
+ if (invoke_bpf(m, &prog, fentry, regs_off,
flags & BPF_TRAMP_F_RET_FENTRY_RET))
return -EINVAL;
@@ -2043,7 +2077,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
if (!branches)
return -ENOMEM;
- if (invoke_bpf_mod_ret(m, &prog, fmod_ret, stack_size,
+ if (invoke_bpf_mod_ret(m, &prog, fmod_ret, regs_off,
branches)) {
ret = -EINVAL;
goto cleanup;
@@ -2051,7 +2085,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
}
if (flags & BPF_TRAMP_F_CALL_ORIG) {
- restore_regs(m, &prog, nr_args, stack_size);
+ restore_regs(m, &prog, nr_args, regs_off);
/* call original function */
if (emit_call(&prog, orig_call, prog)) {
@@ -2081,13 +2115,13 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
}
if (fexit->nr_progs)
- if (invoke_bpf(m, &prog, fexit, stack_size, false)) {
+ if (invoke_bpf(m, &prog, fexit, regs_off, false)) {
ret = -EINVAL;
goto cleanup;
}
if (flags & BPF_TRAMP_F_RESTORE_REGS)
- restore_regs(m, &prog, nr_args, stack_size);
+ restore_regs(m, &prog, nr_args, regs_off);
/* This needs to be done regardless. If there were fmod_ret programs,
* the return value is only updated on the stack and still needs to be
@@ -2124,24 +2158,6 @@ cleanup:
return ret;
}
-static int emit_fallback_jump(u8 **pprog)
-{
- u8 *prog = *pprog;
- int err = 0;
-
-#ifdef CONFIG_RETPOLINE
- /* Note that this assumes the the compiler uses external
- * thunks for indirect calls. Both clang and GCC use the same
- * naming convention for external thunks.
- */
- err = emit_jump(&prog, __x86_indirect_thunk_rdx, prog);
-#else
- EMIT2(0xFF, 0xE2); /* jmp rdx */
-#endif
- *pprog = prog;
- return err;
-}
-
static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs)
{
u8 *jg_reloc, *prog = *pprog;
@@ -2163,9 +2179,7 @@ static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs)
if (err)
return err;
- err = emit_fallback_jump(&prog); /* jmp thunk/indirect */
- if (err)
- return err;
+ emit_indirect_jump(&prog, 2 /* rdx */, prog);
*pprog = prog;
return 0;
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
index 3bfda5f502cb..429a89c5468b 100644
--- a/arch/x86/net/bpf_jit_comp32.c
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -15,6 +15,7 @@
#include <asm/cacheflush.h>
#include <asm/set_memory.h>
#include <asm/nospec-branch.h>
+#include <asm/asm-prototypes.h>
#include <linux/bpf.h>
/*
@@ -1267,6 +1268,21 @@ static void emit_epilogue(u8 **pprog, u32 stack_depth)
*pprog = prog;
}
+static int emit_jmp_edx(u8 **pprog, u8 *ip)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+#ifdef CONFIG_RETPOLINE
+ EMIT1_off32(0xE9, (u8 *)__x86_indirect_thunk_edx - (ip + 5));
+#else
+ EMIT2(0xFF, 0xE2);
+#endif
+ *pprog = prog;
+
+ return cnt;
+}
+
/*
* Generate the following code:
* ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
@@ -1280,7 +1296,7 @@ static void emit_epilogue(u8 **pprog, u32 stack_depth)
* goto *(prog->bpf_func + prologue_size);
* out:
*/
-static void emit_bpf_tail_call(u8 **pprog)
+static void emit_bpf_tail_call(u8 **pprog, u8 *ip)
{
u8 *prog = *pprog;
int cnt = 0;
@@ -1307,7 +1323,7 @@ static void emit_bpf_tail_call(u8 **pprog)
EMIT2(IA32_JBE, jmp_label(jmp_label1, 2));
/*
- * if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+ * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
* goto out;
*/
lo = (u32)MAX_TAIL_CALL_CNT;
@@ -1321,7 +1337,7 @@ static void emit_bpf_tail_call(u8 **pprog)
/* cmp ecx,lo */
EMIT3(0x83, add_1reg(0xF8, IA32_ECX), lo);
- /* ja out */
+ /* jae out */
EMIT2(IA32_JAE, jmp_label(jmp_label1, 2));
/* add eax,0x1 */
@@ -1362,7 +1378,7 @@ static void emit_bpf_tail_call(u8 **pprog)
* eax == ctx (1st arg)
* edx == prog->bpf_func + prologue_size
*/
- RETPOLINE_EDX_BPF_JIT();
+ cnt += emit_jmp_edx(&prog, ip + cnt);
if (jmp_label1 == -1)
jmp_label1 = cnt;
@@ -2122,7 +2138,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
break;
}
case BPF_JMP | BPF_TAIL_CALL:
- emit_bpf_tail_call(&prog);
+ emit_bpf_tail_call(&prog, image + addrs[i - 1]);
break;
/* cond jump */
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 948656069cdd..052f1d78a562 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -20,7 +20,7 @@ struct pci_root_info {
};
static bool pci_use_crs = true;
-static bool pci_ignore_seg = false;
+static bool pci_ignore_seg;
static int __init set_use_crs(const struct dmi_system_id *id)
{
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 3507f456fcd0..9e1e6b8d8876 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -632,7 +632,7 @@ static void set_dev_domain_options(struct pci_dev *pdev)
pdev->hotplug_user_indicators = 1;
}
-int pcibios_add_device(struct pci_dev *dev)
+int pcibios_device_add(struct pci_dev *dev)
{
struct pci_setup_rom *rom;
struct irq_domain *msidom;
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 3d41a09c2c14..9bb1e2941179 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -23,6 +23,7 @@
#include <xen/features.h>
#include <xen/events.h>
+#include <xen/pci.h>
#include <asm/xen/pci.h>
#include <asm/xen/cpuid.h>
#include <asm/apic.h>
@@ -113,7 +114,7 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
false /* no mapping of GSI to PIRQ */);
}
-#ifdef CONFIG_XEN_DOM0
+#ifdef CONFIG_XEN_PV_DOM0
static int xen_register_gsi(u32 gsi, int triggering, int polarity)
{
int rc, irq;
@@ -183,7 +184,7 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
if (ret)
goto error;
i = 0;
- for_each_pci_msi_entry(msidesc, dev) {
+ msi_for_each_desc(msidesc, &dev->dev, MSI_DESC_NOTASSOCIATED) {
irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i],
(type == PCI_CAP_ID_MSI) ? nvec : 1,
(type == PCI_CAP_ID_MSIX) ?
@@ -234,7 +235,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
if (type == PCI_CAP_ID_MSI && nvec > 1)
return 1;
- for_each_pci_msi_entry(msidesc, dev) {
+ msi_for_each_desc(msidesc, &dev->dev, MSI_DESC_NOTASSOCIATED) {
pirq = xen_allocate_pirq_msi(dev, msidesc);
if (pirq < 0) {
irq = -ENODEV;
@@ -261,7 +262,7 @@ error:
return irq;
}
-#ifdef CONFIG_XEN_DOM0
+#ifdef CONFIG_XEN_PV_DOM0
static bool __read_mostly pci_seg_supported = true;
static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
@@ -269,7 +270,7 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
int ret = 0;
struct msi_desc *msidesc;
- for_each_pci_msi_entry(msidesc, dev) {
+ msi_for_each_desc(msidesc, &dev->dev, MSI_DESC_NOTASSOCIATED) {
struct physdev_map_pirq map_irq;
domid_t domid;
@@ -305,7 +306,7 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
return -EINVAL;
map_irq.table_base = pci_resource_start(dev, bir);
- map_irq.entry_nr = msidesc->msi_attrib.entry_nr;
+ map_irq.entry_nr = msidesc->msi_index;
}
ret = -EINVAL;
@@ -350,10 +351,13 @@ out:
return ret;
}
-static void xen_initdom_restore_msi_irqs(struct pci_dev *dev)
+bool xen_initdom_restore_msi(struct pci_dev *dev)
{
int ret = 0;
+ if (!xen_initial_domain())
+ return true;
+
if (pci_seg_supported) {
struct physdev_pci_device restore_ext;
@@ -374,30 +378,26 @@ static void xen_initdom_restore_msi_irqs(struct pci_dev *dev)
ret = HYPERVISOR_physdev_op(PHYSDEVOP_restore_msi, &restore);
WARN(ret && ret != -ENOSYS, "restore_msi -> %d\n", ret);
}
+ return false;
}
-#else /* CONFIG_XEN_DOM0 */
+#else /* CONFIG_XEN_PV_DOM0 */
#define xen_initdom_setup_msi_irqs NULL
-#define xen_initdom_restore_msi_irqs NULL
-#endif /* !CONFIG_XEN_DOM0 */
+#endif /* !CONFIG_XEN_PV_DOM0 */
static void xen_teardown_msi_irqs(struct pci_dev *dev)
{
struct msi_desc *msidesc;
int i;
- for_each_pci_msi_entry(msidesc, dev) {
- if (msidesc->irq) {
- for (i = 0; i < msidesc->nvec_used; i++)
- xen_destroy_irq(msidesc->irq + i);
- }
+ msi_for_each_desc(msidesc, &dev->dev, MSI_DESC_ASSOCIATED) {
+ for (i = 0; i < msidesc->nvec_used; i++)
+ xen_destroy_irq(msidesc->irq + i);
}
}
static void xen_pv_teardown_msi_irqs(struct pci_dev *dev)
{
- struct msi_desc *msidesc = first_pci_msi_entry(dev);
-
- if (msidesc->msi_attrib.is_msix)
+ if (dev->msix_enabled)
xen_pci_frontend_disable_msix(dev);
else
xen_pci_frontend_disable_msi(dev);
@@ -413,10 +413,7 @@ static int xen_msi_domain_alloc_irqs(struct irq_domain *domain,
if (WARN_ON_ONCE(!dev_is_pci(dev)))
return -EINVAL;
- if (first_msi_entry(dev)->msi_attrib.is_msix)
- type = PCI_CAP_ID_MSIX;
- else
- type = PCI_CAP_ID_MSI;
+ type = to_pci_dev(dev)->msix_enabled ? PCI_CAP_ID_MSIX : PCI_CAP_ID_MSI;
return xen_msi_ops.setup_msi_irqs(to_pci_dev(dev), nvec, type);
}
@@ -465,12 +462,10 @@ static __init struct irq_domain *xen_create_pci_msi_domain(void)
static __init void xen_setup_pci_msi(void)
{
if (xen_pv_domain()) {
- if (xen_initial_domain()) {
+ if (xen_initial_domain())
xen_msi_ops.setup_msi_irqs = xen_initdom_setup_msi_irqs;
- x86_msi.restore_msi_irqs = xen_initdom_restore_msi_irqs;
- } else {
+ else
xen_msi_ops.setup_msi_irqs = xen_setup_msi_irqs;
- }
xen_msi_ops.teardown_msi_irqs = xen_pv_teardown_msi_irqs;
pci_msi_ignore_mask = 1;
} else if (xen_hvm_domain()) {
@@ -555,7 +550,7 @@ int __init pci_xen_hvm_init(void)
return 0;
}
-#ifdef CONFIG_XEN_DOM0
+#ifdef CONFIG_XEN_PV_DOM0
int __init pci_xen_initial_domain(void)
{
int irq;
@@ -583,77 +578,5 @@ int __init pci_xen_initial_domain(void)
}
return 0;
}
-
-struct xen_device_domain_owner {
- domid_t domain;
- struct pci_dev *dev;
- struct list_head list;
-};
-
-static DEFINE_SPINLOCK(dev_domain_list_spinlock);
-static struct list_head dev_domain_list = LIST_HEAD_INIT(dev_domain_list);
-
-static struct xen_device_domain_owner *find_device(struct pci_dev *dev)
-{
- struct xen_device_domain_owner *owner;
-
- list_for_each_entry(owner, &dev_domain_list, list) {
- if (owner->dev == dev)
- return owner;
- }
- return NULL;
-}
-
-int xen_find_device_domain_owner(struct pci_dev *dev)
-{
- struct xen_device_domain_owner *owner;
- int domain = -ENODEV;
-
- spin_lock(&dev_domain_list_spinlock);
- owner = find_device(dev);
- if (owner)
- domain = owner->domain;
- spin_unlock(&dev_domain_list_spinlock);
- return domain;
-}
-EXPORT_SYMBOL_GPL(xen_find_device_domain_owner);
-
-int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain)
-{
- struct xen_device_domain_owner *owner;
-
- owner = kzalloc(sizeof(struct xen_device_domain_owner), GFP_KERNEL);
- if (!owner)
- return -ENODEV;
-
- spin_lock(&dev_domain_list_spinlock);
- if (find_device(dev)) {
- spin_unlock(&dev_domain_list_spinlock);
- kfree(owner);
- return -EEXIST;
- }
- owner->domain = domain;
- owner->dev = dev;
- list_add_tail(&owner->list, &dev_domain_list);
- spin_unlock(&dev_domain_list_spinlock);
- return 0;
-}
-EXPORT_SYMBOL_GPL(xen_register_device_domain_owner);
-
-int xen_unregister_device_domain_owner(struct pci_dev *dev)
-{
- struct xen_device_domain_owner *owner;
-
- spin_lock(&dev_domain_list_spinlock);
- owner = find_device(dev);
- if (!owner) {
- spin_unlock(&dev_domain_list_spinlock);
- return -ENODEV;
- }
- list_del(&owner->list);
- spin_unlock(&dev_domain_list_spinlock);
- kfree(owner);
- return 0;
-}
-EXPORT_SYMBOL_GPL(xen_unregister_device_domain_owner);
#endif
+
diff --git a/arch/x86/platform/ce4100/falconfalls.dts b/arch/x86/platform/ce4100/falconfalls.dts
index 0ac3d4357136..65fa3d866226 100644
--- a/arch/x86/platform/ce4100/falconfalls.dts
+++ b/arch/x86/platform/ce4100/falconfalls.dts
@@ -249,7 +249,7 @@
gpio@26 {
#gpio-cells = <2>;
- compatible = "ti,pcf8575";
+ compatible = "nxp,pcf8575";
reg = <0x26>;
gpio-controller;
};
@@ -263,7 +263,7 @@
gpio@26 {
#gpio-cells = <2>;
- compatible = "ti,pcf8575";
+ compatible = "nxp,pcf8575";
reg = <0x26>;
gpio-controller;
};
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 7515e78ef898..1f3675453a57 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -33,7 +33,7 @@
#include <linux/reboot.h>
#include <linux/slab.h>
#include <linux/ucs2_string.h>
-#include <linux/mem_encrypt.h>
+#include <linux/cc_platform.h>
#include <linux/sched/task.h>
#include <asm/setup.h>
@@ -284,7 +284,8 @@ static void __init __map_region(efi_memory_desc_t *md, u64 va)
if (!(md->attribute & EFI_MEMORY_WB))
flags |= _PAGE_PCD;
- if (sev_active() && md->type != EFI_MEMORY_MAPPED_IO)
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) &&
+ md->type != EFI_MEMORY_MAPPED_IO)
flags |= _PAGE_ENC;
pfn = md->phys_addr >> PAGE_SHIFT;
@@ -390,7 +391,7 @@ static int __init efi_update_mem_attr(struct mm_struct *mm, efi_memory_desc_t *m
if (!(md->attribute & EFI_MEMORY_RO))
pf |= _PAGE_RW;
- if (sev_active())
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
pf |= _PAGE_ENC;
return efi_update_mappings(md, pf);
@@ -438,7 +439,7 @@ void __init efi_runtime_update_mappings(void)
(md->type != EFI_RUNTIME_SERVICES_CODE))
pf |= _PAGE_RW;
- if (sev_active())
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
pf |= _PAGE_ENC;
efi_update_mappings(md, pf);
diff --git a/arch/x86/platform/efi/efi_stub_32.S b/arch/x86/platform/efi/efi_stub_32.S
index 09ec84f6ef51..f3cfdb1c9a35 100644
--- a/arch/x86/platform/efi/efi_stub_32.S
+++ b/arch/x86/platform/efi/efi_stub_32.S
@@ -56,5 +56,5 @@ SYM_FUNC_START(efi_call_svam)
movl 16(%esp), %ebx
leave
- ret
+ RET
SYM_FUNC_END(efi_call_svam)
diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S
index 90380a17ab23..2206b8bc47b8 100644
--- a/arch/x86/platform/efi/efi_stub_64.S
+++ b/arch/x86/platform/efi/efi_stub_64.S
@@ -23,5 +23,5 @@ SYM_FUNC_START(__efi_call)
mov %rsi, %rcx
CALL_NOSPEC rdi
leave
- ret
+ RET
SYM_FUNC_END(__efi_call)
diff --git a/arch/x86/platform/efi/efi_thunk_64.S b/arch/x86/platform/efi/efi_thunk_64.S
index fd3dd1708eba..25799d768624 100644
--- a/arch/x86/platform/efi/efi_thunk_64.S
+++ b/arch/x86/platform/efi/efi_thunk_64.S
@@ -37,6 +37,17 @@ SYM_CODE_START(__efi64_thunk)
push %rax
/*
+ * Copy args passed via the stack
+ */
+ subq $0x24, %rsp
+ movq 0x18(%rax), %rbp
+ movq 0x20(%rax), %rbx
+ movq 0x28(%rax), %rax
+ movl %ebp, 0x18(%rsp)
+ movl %ebx, 0x1c(%rsp)
+ movl %eax, 0x20(%rsp)
+
+ /*
* Calculate the physical address of the kernel text.
*/
movq $__START_KERNEL_map, %rax
@@ -47,7 +58,6 @@ SYM_CODE_START(__efi64_thunk)
subq %rax, %rbp
subq %rax, %rbx
- subq $28, %rsp
movl %ebx, 0x0(%rsp) /* return address */
movl %esi, 0x4(%rsp)
movl %edx, 0x8(%rsp)
@@ -60,10 +70,10 @@ SYM_CODE_START(__efi64_thunk)
pushq %rdi /* EFI runtime service address */
lretq
-1: movq 24(%rsp), %rsp
+1: movq 0x20(%rsp), %rsp
pop %rbx
pop %rbp
- retq
+ RET
.code32
2: pushl $__KERNEL_CS
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index b15ebfe40a73..b0b848d6933a 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -277,7 +277,8 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size)
return;
}
- new = early_memremap(data.phys_map, data.size);
+ new = early_memremap_prot(data.phys_map, data.size,
+ pgprot_val(pgprot_encrypted(FIXMAP_PAGE_NORMAL)));
if (!new) {
pr_err("Failed to map new boot services memmap\n");
return;
diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c
index ee2beda590d0..1d4a00e767ec 100644
--- a/arch/x86/platform/olpc/olpc.c
+++ b/arch/x86/platform/olpc/olpc.c
@@ -274,7 +274,7 @@ static struct olpc_ec_driver ec_xo1_driver = {
static struct olpc_ec_driver ec_xo1_5_driver = {
.ec_cmd = olpc_xo1_ec_cmd,
-#ifdef CONFIG_OLPC_XO1_5_SCI
+#ifdef CONFIG_OLPC_XO15_SCI
/*
* XO-1.5 EC wakeups are available when olpc-xo15-sci driver is
* compiled in
diff --git a/arch/x86/platform/olpc/xo1-wakeup.S b/arch/x86/platform/olpc/xo1-wakeup.S
index 75f4faff8468..3a5abffe5660 100644
--- a/arch/x86/platform/olpc/xo1-wakeup.S
+++ b/arch/x86/platform/olpc/xo1-wakeup.S
@@ -77,7 +77,7 @@ save_registers:
pushfl
popl saved_context_eflags
- ret
+ RET
restore_registers:
movl saved_context_ebp, %ebp
@@ -88,7 +88,7 @@ restore_registers:
pushl saved_context_eflags
popfl
- ret
+ RET
SYM_CODE_START(do_olpc_suspend_lowlevel)
call save_processor_state
@@ -109,7 +109,7 @@ ret_point:
call restore_registers
call restore_processor_state
- ret
+ RET
SYM_CODE_END(do_olpc_suspend_lowlevel)
.data
diff --git a/arch/x86/platform/pvh/enlighten.c b/arch/x86/platform/pvh/enlighten.c
index 9ac7457f52a3..ed0442e35434 100644
--- a/arch/x86/platform/pvh/enlighten.c
+++ b/arch/x86/platform/pvh/enlighten.c
@@ -16,15 +16,15 @@
/*
* PVH variables.
*
- * pvh_bootparams and pvh_start_info need to live in the data segment since
+ * pvh_bootparams and pvh_start_info need to live in a data segment since
* they are used after startup_{32|64}, which clear .bss, are invoked.
*/
-struct boot_params pvh_bootparams __section(".data");
-struct hvm_start_info pvh_start_info __section(".data");
+struct boot_params __initdata pvh_bootparams;
+struct hvm_start_info __initdata pvh_start_info;
-unsigned int pvh_start_info_sz = sizeof(pvh_start_info);
+const unsigned int __initconst pvh_start_info_sz = sizeof(pvh_start_info);
-static u64 pvh_get_root_pointer(void)
+static u64 __init pvh_get_root_pointer(void)
{
return pvh_start_info.rsdp_paddr;
}
@@ -107,7 +107,7 @@ void __init __weak xen_pvh_init(struct boot_params *boot_params)
BUG();
}
-static void hypervisor_specific_init(bool xen_guest)
+static void __init hypervisor_specific_init(bool xen_guest)
{
if (xen_guest)
xen_pvh_init(&pvh_bootparams);
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 6665f8802098..9f2b251e83c5 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -20,7 +20,7 @@
#include <asm/page.h>
#include <asm/mce.h>
#include <asm/suspend.h>
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
#include <asm/debugreg.h>
#include <asm/cpu.h>
#include <asm/mmu_context.h>
diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S
index 8786653ad3c0..5606a15cf9a1 100644
--- a/arch/x86/power/hibernate_asm_32.S
+++ b/arch/x86/power/hibernate_asm_32.S
@@ -32,7 +32,7 @@ SYM_FUNC_START(swsusp_arch_suspend)
FRAME_BEGIN
call swsusp_save
FRAME_END
- ret
+ RET
SYM_FUNC_END(swsusp_arch_suspend)
SYM_CODE_START(restore_image)
@@ -108,5 +108,5 @@ SYM_FUNC_START(restore_registers)
/* tell the hibernation core that we've just restored the memory */
movl %eax, in_suspend
- ret
+ RET
SYM_FUNC_END(restore_registers)
diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S
index d9bed596d849..0a0539e1cc81 100644
--- a/arch/x86/power/hibernate_asm_64.S
+++ b/arch/x86/power/hibernate_asm_64.S
@@ -66,7 +66,7 @@ SYM_FUNC_START(restore_registers)
/* tell the hibernation core that we've just restored the memory */
movq %rax, in_suspend(%rip)
- ret
+ RET
SYM_FUNC_END(restore_registers)
SYM_FUNC_START(swsusp_arch_suspend)
@@ -96,7 +96,7 @@ SYM_FUNC_START(swsusp_arch_suspend)
FRAME_BEGIN
call swsusp_save
FRAME_END
- ret
+ RET
SYM_FUNC_END(swsusp_arch_suspend)
SYM_FUNC_START(restore_image)
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index 95ea17a9d20c..ae53d54d7959 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -16,7 +16,7 @@ CFLAGS_sha256.o := -D__DISABLE_EXPORTS
# When linking purgatory.ro with -r unresolved symbols are not checked,
# also link a purgatory.chk binary without -r to check for unresolved symbols.
-PURGATORY_LDFLAGS := -e purgatory_start -nostdlib -z nodefaultlib
+PURGATORY_LDFLAGS := -e purgatory_start -z nodefaultlib
LDFLAGS_purgatory.ro := -r $(PURGATORY_LDFLAGS)
LDFLAGS_purgatory.chk := $(PURGATORY_LDFLAGS)
targets += purgatory.ro purgatory.chk
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index 31b5856010cb..c5e29db02a46 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -2,7 +2,7 @@
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/memblock.h>
-#include <linux/mem_encrypt.h>
+#include <linux/cc_platform.h>
#include <linux/pgtable.h>
#include <asm/set_memory.h>
@@ -17,6 +17,32 @@ u32 *trampoline_cr4_features;
/* Hold the pgd entry used on booting additional CPUs */
pgd_t trampoline_pgd_entry;
+void load_trampoline_pgtable(void)
+{
+#ifdef CONFIG_X86_32
+ load_cr3(initial_page_table);
+#else
+ /*
+ * This function is called before exiting to real-mode and that will
+ * fail with CR4.PCIDE still set.
+ */
+ if (boot_cpu_has(X86_FEATURE_PCID))
+ cr4_clear_bits(X86_CR4_PCIDE);
+
+ write_cr3(real_mode_header->trampoline_pgd);
+#endif
+
+ /*
+ * The CR3 write above will not flush global TLB entries.
+ * Stale, global entries from previous page tables may still be
+ * present. Flush those stale entries.
+ *
+ * This ensures that memory accessed while running with
+ * trampoline_pgd is *actually* mapped into trampoline_pgd.
+ */
+ __flush_tlb_all();
+}
+
void __init reserve_real_mode(void)
{
phys_addr_t mem;
@@ -44,10 +70,10 @@ void __init reserve_real_mode(void)
static void sme_sev_setup_real_mode(struct trampoline_header *th)
{
#ifdef CONFIG_AMD_MEM_ENCRYPT
- if (sme_active())
+ if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
th->flags |= TH_FLAGS_SME_ACTIVE;
- if (sev_es_active()) {
+ if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) {
/*
* Skip the call to verify_cpu() in secondary_startup_64 as it
* will cause #VC exceptions when the AP can't handle them yet.
@@ -72,6 +98,7 @@ static void __init setup_real_mode(void)
#ifdef CONFIG_X86_64
u64 *trampoline_pgd;
u64 efer;
+ int i;
#endif
base = (unsigned char *)real_mode_header;
@@ -81,7 +108,7 @@ static void __init setup_real_mode(void)
* decrypted memory in order to bring up other processors
* successfully. This is not needed for SEV.
*/
- if (sme_active())
+ if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
set_memory_decrypted((unsigned long)base, size >> PAGE_SHIFT);
memcpy(base, real_mode_blob, size);
@@ -128,8 +155,17 @@ static void __init setup_real_mode(void)
trampoline_header->flags = 0;
trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
+
+ /* Map the real mode stub as virtual == physical */
trampoline_pgd[0] = trampoline_pgd_entry.pgd;
- trampoline_pgd[511] = init_top_pgt[511].pgd;
+
+ /*
+ * Include the entirety of the kernel mapping into the trampoline
+ * PGD. This way, all mappings present in the normal kernel page
+ * tables are usable while running on trampoline_pgd.
+ */
+ for (i = pgd_index(__PAGE_OFFSET); i < PTRS_PER_PGD; i++)
+ trampoline_pgd[i] = init_top_pgt[i].pgd;
#endif
sme_sev_setup_real_mode(trampoline_header);
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index 27c82207d387..e2c5b296120d 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -14,6 +14,10 @@
static Elf_Ehdr ehdr;
static unsigned long shnum;
static unsigned int shstrndx;
+static unsigned int shsymtabndx;
+static unsigned int shxsymtabndx;
+
+static int sym_index(Elf_Sym *sym);
struct relocs {
uint32_t *offset;
@@ -35,6 +39,7 @@ struct section {
Elf_Shdr shdr;
struct section *link;
Elf_Sym *symtab;
+ Elf32_Word *xsymtab;
Elf_Rel *reltab;
char *strtab;
};
@@ -63,7 +68,9 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = {
"(__parainstructions|__alt_instructions)(_end)?|"
"(__iommu_table|__apicdrivers|__smp_locks)(_end)?|"
"__(start|end)_pci_.*|"
+#if CONFIG_FW_LOADER
"__(start|end)_builtin_fw|"
+#endif
"__(start|stop)___ksymtab(_gpl)?|"
"__(start|stop)___kcrctab(_gpl)?|"
"__(start|stop)___param|"
@@ -268,7 +275,7 @@ static const char *sym_name(const char *sym_strtab, Elf_Sym *sym)
name = sym_strtab + sym->st_name;
}
else {
- name = sec_name(sym->st_shndx);
+ name = sec_name(sym_index(sym));
}
return name;
}
@@ -338,6 +345,23 @@ static uint64_t elf64_to_cpu(uint64_t val)
#define elf_xword_to_cpu(x) elf32_to_cpu(x)
#endif
+static int sym_index(Elf_Sym *sym)
+{
+ Elf_Sym *symtab = secs[shsymtabndx].symtab;
+ Elf32_Word *xsymtab = secs[shxsymtabndx].xsymtab;
+ unsigned long offset;
+ int index;
+
+ if (sym->st_shndx != SHN_XINDEX)
+ return sym->st_shndx;
+
+ /* calculate offset of sym from head of table. */
+ offset = (unsigned long)sym - (unsigned long)symtab;
+ index = offset / sizeof(*sym);
+
+ return elf32_to_cpu(xsymtab[index]);
+}
+
static void read_ehdr(FILE *fp)
{
if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1) {
@@ -471,31 +495,60 @@ static void read_strtabs(FILE *fp)
static void read_symtabs(FILE *fp)
{
int i,j;
+
for (i = 0; i < shnum; i++) {
struct section *sec = &secs[i];
- if (sec->shdr.sh_type != SHT_SYMTAB) {
+ int num_syms;
+
+ switch (sec->shdr.sh_type) {
+ case SHT_SYMTAB_SHNDX:
+ sec->xsymtab = malloc(sec->shdr.sh_size);
+ if (!sec->xsymtab) {
+ die("malloc of %" FMT " bytes for xsymtab failed\n",
+ sec->shdr.sh_size);
+ }
+ if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
+ die("Seek to %" FMT " failed: %s\n",
+ sec->shdr.sh_offset, strerror(errno));
+ }
+ if (fread(sec->xsymtab, 1, sec->shdr.sh_size, fp)
+ != sec->shdr.sh_size) {
+ die("Cannot read extended symbol table: %s\n",
+ strerror(errno));
+ }
+ shxsymtabndx = i;
+ continue;
+
+ case SHT_SYMTAB:
+ num_syms = sec->shdr.sh_size / sizeof(Elf_Sym);
+
+ sec->symtab = malloc(sec->shdr.sh_size);
+ if (!sec->symtab) {
+ die("malloc of %" FMT " bytes for symtab failed\n",
+ sec->shdr.sh_size);
+ }
+ if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
+ die("Seek to %" FMT " failed: %s\n",
+ sec->shdr.sh_offset, strerror(errno));
+ }
+ if (fread(sec->symtab, 1, sec->shdr.sh_size, fp)
+ != sec->shdr.sh_size) {
+ die("Cannot read symbol table: %s\n",
+ strerror(errno));
+ }
+ for (j = 0; j < num_syms; j++) {
+ Elf_Sym *sym = &sec->symtab[j];
+
+ sym->st_name = elf_word_to_cpu(sym->st_name);
+ sym->st_value = elf_addr_to_cpu(sym->st_value);
+ sym->st_size = elf_xword_to_cpu(sym->st_size);
+ sym->st_shndx = elf_half_to_cpu(sym->st_shndx);
+ }
+ shsymtabndx = i;
+ continue;
+
+ default:
continue;
- }
- sec->symtab = malloc(sec->shdr.sh_size);
- if (!sec->symtab) {
- die("malloc of %" FMT " bytes for symtab failed\n",
- sec->shdr.sh_size);
- }
- if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
- die("Seek to %" FMT " failed: %s\n",
- sec->shdr.sh_offset, strerror(errno));
- }
- if (fread(sec->symtab, 1, sec->shdr.sh_size, fp)
- != sec->shdr.sh_size) {
- die("Cannot read symbol table: %s\n",
- strerror(errno));
- }
- for (j = 0; j < sec->shdr.sh_size/sizeof(Elf_Sym); j++) {
- Elf_Sym *sym = &sec->symtab[j];
- sym->st_name = elf_word_to_cpu(sym->st_name);
- sym->st_value = elf_addr_to_cpu(sym->st_value);
- sym->st_size = elf_xword_to_cpu(sym->st_size);
- sym->st_shndx = elf_half_to_cpu(sym->st_shndx);
}
}
}
@@ -762,7 +815,9 @@ static void percpu_init(void)
*/
static int is_percpu_sym(ElfW(Sym) *sym, const char *symname)
{
- return (sym->st_shndx == per_cpu_shndx) &&
+ int shndx = sym_index(sym);
+
+ return (shndx == per_cpu_shndx) &&
strcmp(symname, "__init_begin") &&
strcmp(symname, "__per_cpu_load") &&
strncmp(symname, "init_per_cpu_", 13);
@@ -1095,7 +1150,7 @@ static int do_reloc_info(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym,
sec_name(sec->shdr.sh_info),
rel_type(ELF_R_TYPE(rel->r_info)),
symname,
- sec_name(sym->st_shndx));
+ sec_name(sym_index(sym)));
return 0;
}
diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
index 5ccb18290d71..ba5789c35809 100644
--- a/arch/x86/um/Makefile
+++ b/arch/x86/um/Makefile
@@ -40,7 +40,7 @@ $(obj)/user-offsets.s: c_flags = -Wp,-MD,$(depfile) $(USER_CFLAGS) \
-Iarch/x86/include/generated
targets += user-offsets.s
-include/generated/user_constants.h: $(obj)/user-offsets.s
+include/generated/user_constants.h: $(obj)/user-offsets.s FORCE
$(call filechk,offsets,__USER_CONSTANT_H__)
UNPROFILE_OBJS := stub_segv.o
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h
index 165be7f9a964..4da336965698 100644
--- a/arch/x86/um/asm/barrier.h
+++ b/arch/x86/um/asm/barrier.h
@@ -2,6 +2,7 @@
#ifndef _ASM_UM_BARRIER_H_
#define _ASM_UM_BARRIER_H_
+#include <asm/cpufeatures.h>
#include <asm/alternative.h>
/*
diff --git a/arch/x86/um/asm/segment.h b/arch/x86/um/asm/segment.h
index 453db377150d..2ef507bc6989 100644
--- a/arch/x86/um/asm/segment.h
+++ b/arch/x86/um/asm/segment.h
@@ -8,12 +8,4 @@ extern int host_gdt_entry_tls_min;
#define GDT_ENTRY_TLS_MIN host_gdt_entry_tls_min
#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
-typedef struct {
- unsigned long seg;
-} mm_segment_t;
-
-#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) })
-#define KERNEL_DS MAKE_MM_SEG(~0UL)
-#define USER_DS MAKE_MM_SEG(TASK_SIZE)
-
#endif
diff --git a/arch/x86/um/checksum_32.S b/arch/x86/um/checksum_32.S
index 13f118dec74f..aed782ab7721 100644
--- a/arch/x86/um/checksum_32.S
+++ b/arch/x86/um/checksum_32.S
@@ -110,7 +110,7 @@ csum_partial:
7:
popl %ebx
popl %esi
- ret
+ RET
#else
@@ -208,7 +208,7 @@ csum_partial:
80:
popl %ebx
popl %esi
- ret
+ RET
#endif
EXPORT_SYMBOL(csum_partial)
diff --git a/arch/x86/um/os-Linux/registers.c b/arch/x86/um/os-Linux/registers.c
index 3c423dfcd78b..df8f4b4bf98b 100644
--- a/arch/x86/um/os-Linux/registers.c
+++ b/arch/x86/um/os-Linux/registers.c
@@ -15,6 +15,7 @@
#include <sys/uio.h>
#include <asm/sigcontext.h>
#include <linux/elf.h>
+#include <registers.h>
int have_xstate_support;
diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c
index 2497bac56066..0bc4b73a9cde 100644
--- a/arch/x86/um/ptrace_32.c
+++ b/arch/x86/um/ptrace_32.c
@@ -7,6 +7,7 @@
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <asm/ptrace-abi.h>
+#include <registers.h>
#include <skas.h>
extern int arch_switch_tls(struct task_struct *to);
diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c
index 1401899dee9b..289d0159b041 100644
--- a/arch/x86/um/ptrace_64.c
+++ b/arch/x86/um/ptrace_64.c
@@ -11,6 +11,7 @@
#define __FRAME_OFFSETS
#include <asm/ptrace.h>
#include <linux/uaccess.h>
+#include <registers.h>
#include <asm/ptrace-abi.h>
/*
diff --git a/arch/x86/um/setjmp_32.S b/arch/x86/um/setjmp_32.S
index 62eaf8c80e04..2d991ddbcca5 100644
--- a/arch/x86/um/setjmp_32.S
+++ b/arch/x86/um/setjmp_32.S
@@ -34,7 +34,7 @@ kernel_setjmp:
movl %esi,12(%edx)
movl %edi,16(%edx)
movl %ecx,20(%edx) # Return address
- ret
+ RET
.size kernel_setjmp,.-kernel_setjmp
diff --git a/arch/x86/um/setjmp_64.S b/arch/x86/um/setjmp_64.S
index 1b5d40d4ff46..b46acb6a8ebd 100644
--- a/arch/x86/um/setjmp_64.S
+++ b/arch/x86/um/setjmp_64.S
@@ -33,7 +33,7 @@ kernel_setjmp:
movq %r14,40(%rdi)
movq %r15,48(%rdi)
movq %rsi,56(%rdi) # Return address
- ret
+ RET
.size kernel_setjmp,.-kernel_setjmp
diff --git a/arch/x86/um/shared/sysdep/syscalls_64.h b/arch/x86/um/shared/sysdep/syscalls_64.h
index 8a7d5e1da98e..48d6cd12f8a5 100644
--- a/arch/x86/um/shared/sysdep/syscalls_64.h
+++ b/arch/x86/um/shared/sysdep/syscalls_64.h
@@ -23,9 +23,6 @@ extern syscall_handler_t *sys_call_table[];
UPT_SYSCALL_ARG5(&regs->regs), \
UPT_SYSCALL_ARG6(&regs->regs)))
-extern long old_mmap(unsigned long addr, unsigned long len,
- unsigned long prot, unsigned long flags,
- unsigned long fd, unsigned long pgoff);
extern syscall_handler_t sys_modify_ldt;
extern syscall_handler_t sys_arch_prctl;
diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
index 7c11c9e5d7ea..263e1d08f216 100644
--- a/arch/x86/um/signal.c
+++ b/arch/x86/um/signal.c
@@ -12,6 +12,7 @@
#include <linux/uaccess.h>
#include <asm/ucontext.h>
#include <frame_kern.h>
+#include <registers.h>
#include <skas.h>
#ifdef CONFIG_X86_32
diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c
index 0575decb5e54..89df5d89d664 100644
--- a/arch/x86/um/sys_call_table_32.c
+++ b/arch/x86/um/sys_call_table_32.c
@@ -9,8 +9,6 @@
#include <linux/cache.h>
#include <asm/syscall.h>
-#define __NO_STUBS
-
/*
* Below you can see, in terms of #define's, the differences between the x86-64
* and the UML syscall table.
@@ -23,8 +21,6 @@
#define sys_vm86old sys_ni_syscall
#define sys_vm86 sys_ni_syscall
-#define old_mmap sys_old_mmap
-
#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native)
#define __SYSCALL(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index 95725b5a41ac..b0b4cfd2308c 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -9,8 +9,6 @@
#include <linux/cache.h>
#include <asm/syscall.h>
-#define __NO_STUBS
-
/*
* Below you can see, in terms of #define's, the differences between the x86-64
* and the UML syscall table.
@@ -20,21 +18,6 @@
#define sys_iopl sys_ni_syscall
#define sys_ioperm sys_ni_syscall
-/*
- * The UML TLS problem. Note that x86_64 does not implement this, so the below
- * is needed only for the ia32 compatibility.
- */
-
-/* On UML we call it this way ("old" means it's not mmap2) */
-#define sys_mmap old_mmap
-
-#define stub_clone sys_clone
-#define stub_fork sys_fork
-#define stub_vfork sys_vfork
-#define stub_execve sys_execve
-#define stub_execveat sys_execveat
-#define stub_rt_sigreturn sys_rt_sigreturn
-
#define __SYSCALL(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
#include <asm/syscalls_64.h>
diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c
index 58f51667e2e4..fe5323f0c42d 100644
--- a/arch/x86/um/syscalls_64.c
+++ b/arch/x86/um/syscalls_64.c
@@ -10,7 +10,9 @@
#include <linux/syscalls.h>
#include <linux/uaccess.h>
#include <asm/prctl.h> /* XXX This should get the constants from libc */
+#include <registers.h>
#include <os.h>
+#include <registers.h>
long arch_prctl(struct task_struct *task, int option,
unsigned long __user *arg2)
@@ -35,7 +37,7 @@ long arch_prctl(struct task_struct *task, int option,
switch (option) {
case ARCH_SET_FS:
case ARCH_SET_GS:
- ret = restore_registers(pid, &current->thread.regs.regs);
+ ret = restore_pid_registers(pid, &current->thread.regs.regs);
if (ret)
return ret;
break;
@@ -87,3 +89,13 @@ void arch_switch_to(struct task_struct *to)
arch_prctl(to, ARCH_SET_FS, (void __user *) to->thread.arch.fs);
}
+
+SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
+ unsigned long, prot, unsigned long, flags,
+ unsigned long, fd, unsigned long, off)
+{
+ if (off & ~PAGE_MASK)
+ return -EINVAL;
+
+ return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
+}
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index afc1da68b06d..85246dd9faa1 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -23,6 +23,7 @@ config XEN_PV
select PARAVIRT_XXL
select XEN_HAVE_PVMMU
select XEN_HAVE_VPMU
+ select GUEST_PERF_EVENTS
help
Support running as a Xen PV guest.
@@ -43,13 +44,9 @@ config XEN_PV_SMP
def_bool y
depends on XEN_PV && SMP
-config XEN_DOM0
- bool "Xen PV Dom0 support"
- default y
- depends on XEN_PV && PCI_XEN && SWIOTLB_XEN
- depends on X86_IO_APIC && ACPI && PCI
- help
- Support running as a Xen PV Dom0 guest.
+config XEN_PV_DOM0
+ def_bool y
+ depends on XEN_PV && XEN_DOM0
config XEN_PVHVM
def_bool y
@@ -86,3 +83,12 @@ config XEN_PVH
def_bool n
help
Support for running as a Xen PVH guest.
+
+config XEN_DOM0
+ bool "Xen Dom0 support"
+ default XEN_PV
+ depends on (XEN_PV && SWIOTLB_XEN) || (XEN_PVH && X86_64)
+ depends on X86_IO_APIC && ACPI && PCI
+ select X86_X2APIC if XEN_PVH && X86_64
+ help
+ Support running as a Xen Dom0 guest.
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 40b5779fce21..4953260e281c 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -45,7 +45,7 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
-obj-$(CONFIG_XEN_DOM0) += vga.o
+obj-$(CONFIG_XEN_PV_DOM0) += vga.o
obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index c79bd0af2e8c..30c6e986a6cd 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -3,6 +3,7 @@
#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
#include <linux/memblock.h>
#endif
+#include <linux/console.h>
#include <linux/cpu.h>
#include <linux/kexec.h>
#include <linux/slab.h>
@@ -10,12 +11,15 @@
#include <xen/xen.h>
#include <xen/features.h>
+#include <xen/interface/sched.h>
+#include <xen/interface/version.h>
#include <xen/page.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
#include <asm/cpu.h>
#include <asm/e820/api.h>
+#include <asm/setup.h>
#include "xen-ops.h"
#include "smp.h"
@@ -27,34 +31,16 @@ EXPORT_SYMBOL_GPL(hypercall_page);
* Pointer to the xen_vcpu_info structure or
* &HYPERVISOR_shared_info->vcpu_info[cpu]. See xen_hvm_init_shared_info
* and xen_vcpu_setup for details. By default it points to share_info->vcpu_info
- * but if the hypervisor supports VCPUOP_register_vcpu_info then it can point
- * to xen_vcpu_info. The pointer is used in __xen_evtchn_do_upcall to
- * acknowledge pending events.
- * Also more subtly it is used by the patched version of irq enable/disable
- * e.g. xen_irq_enable_direct and xen_iret in PV mode.
- *
- * The desire to be able to do those mask/unmask operations as a single
- * instruction by using the per-cpu offset held in %gs is the real reason
- * vcpu info is in a per-cpu pointer and the original reason for this
- * hypercall.
- *
+ * but during boot it is switched to point to xen_vcpu_info.
+ * The pointer is used in __xen_evtchn_do_upcall to acknowledge pending events.
*/
DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
-
-/*
- * Per CPU pages used if hypervisor supports VCPUOP_register_vcpu_info
- * hypercall. This can be used both in PV and PVHVM mode. The structure
- * overrides the default per_cpu(xen_vcpu, cpu) value.
- */
DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
/* Linux <-> Xen vCPU id mapping */
DEFINE_PER_CPU(uint32_t, xen_vcpu_id);
EXPORT_PER_CPU_SYMBOL(xen_vcpu_id);
-enum xen_domain_type xen_domain_type = XEN_NATIVE;
-EXPORT_SYMBOL_GPL(xen_domain_type);
-
unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
EXPORT_SYMBOL(machine_to_phys_mapping);
unsigned long machine_to_phys_nr;
@@ -69,10 +55,12 @@ __read_mostly int xen_have_vector_callback;
EXPORT_SYMBOL_GPL(xen_have_vector_callback);
/*
- * NB: needs to live in .data because it's used by xen_prepare_pvh which runs
- * before clearing the bss.
+ * NB: These need to live in .data or alike because they're used by
+ * xen_prepare_pvh() which runs before clearing the bss.
*/
-uint32_t xen_start_flags __section(".data") = 0;
+enum xen_domain_type __ro_after_init xen_domain_type = XEN_NATIVE;
+EXPORT_SYMBOL_GPL(xen_domain_type);
+uint32_t __ro_after_init xen_start_flags;
EXPORT_SYMBOL(xen_start_flags);
/*
@@ -81,21 +69,6 @@ EXPORT_SYMBOL(xen_start_flags);
*/
struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info;
-/*
- * Flag to determine whether vcpu info placement is available on all
- * VCPUs. We assume it is to start with, and then set it to zero on
- * the first failure. This is because it can succeed on some VCPUs
- * and not others, since it can involve hypervisor memory allocation,
- * or because the guest failed to guarantee all the appropriate
- * constraints on all VCPUs (ie buffer can't cross a page boundary).
- *
- * Note that any particular CPU may be using a placed vcpu structure,
- * but we can only optimise if the all are.
- *
- * 0: not available, 1: available
- */
-int xen_have_vcpu_info_placement = 1;
-
static int xen_cpu_up_online(unsigned int cpu)
{
xen_init_lock_cpu(cpu);
@@ -121,10 +94,8 @@ int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int),
return rc >= 0 ? 0 : rc;
}
-static int xen_vcpu_setup_restore(int cpu)
+static void xen_vcpu_setup_restore(int cpu)
{
- int rc = 0;
-
/* Any per_cpu(xen_vcpu) is stale, so reset it */
xen_vcpu_info_reset(cpu);
@@ -133,11 +104,8 @@ static int xen_vcpu_setup_restore(int cpu)
* be handled by hotplug.
*/
if (xen_pv_domain() ||
- (xen_hvm_domain() && cpu_online(cpu))) {
- rc = xen_vcpu_setup(cpu);
- }
-
- return rc;
+ (xen_hvm_domain() && cpu_online(cpu)))
+ xen_vcpu_setup(cpu);
}
/*
@@ -147,7 +115,7 @@ static int xen_vcpu_setup_restore(int cpu)
*/
void xen_vcpu_restore(void)
{
- int cpu, rc;
+ int cpu;
for_each_possible_cpu(cpu) {
bool other_cpu = (cpu != smp_processor_id());
@@ -167,20 +135,9 @@ void xen_vcpu_restore(void)
if (xen_pv_domain() || xen_feature(XENFEAT_hvm_safe_pvclock))
xen_setup_runstate_info(cpu);
- rc = xen_vcpu_setup_restore(cpu);
- if (rc)
- pr_emerg_once("vcpu restore failed for cpu=%d err=%d. "
- "System will hang.\n", cpu, rc);
- /*
- * In case xen_vcpu_setup_restore() fails, do not bring up the
- * VCPU. This helps us avoid the resulting OOPS when the VCPU
- * accesses pvclock_vcpu_time via xen_vcpu (which is NULL.)
- * Note that this does not improve the situation much -- now the
- * VM hangs instead of OOPSing -- with the VCPUs that did not
- * fail, spinning in stop_machine(), waiting for the failed
- * VCPUs to come up.
- */
- if (other_cpu && is_up && (rc == 0) &&
+ xen_vcpu_setup_restore(cpu);
+
+ if (other_cpu && is_up &&
HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL))
BUG();
}
@@ -197,7 +154,7 @@ void xen_vcpu_info_reset(int cpu)
}
}
-int xen_vcpu_setup(int cpu)
+void xen_vcpu_setup(int cpu)
{
struct vcpu_register_vcpu_info info;
int err;
@@ -218,44 +175,65 @@ int xen_vcpu_setup(int cpu)
*/
if (xen_hvm_domain()) {
if (per_cpu(xen_vcpu, cpu) == &per_cpu(xen_vcpu_info, cpu))
- return 0;
+ return;
}
- if (xen_have_vcpu_info_placement) {
- vcpup = &per_cpu(xen_vcpu_info, cpu);
- info.mfn = arbitrary_virt_to_mfn(vcpup);
- info.offset = offset_in_page(vcpup);
+ vcpup = &per_cpu(xen_vcpu_info, cpu);
+ info.mfn = arbitrary_virt_to_mfn(vcpup);
+ info.offset = offset_in_page(vcpup);
- /*
- * Check to see if the hypervisor will put the vcpu_info
- * structure where we want it, which allows direct access via
- * a percpu-variable.
- * N.B. This hypercall can _only_ be called once per CPU.
- * Subsequent calls will error out with -EINVAL. This is due to
- * the fact that hypervisor has no unregister variant and this
- * hypercall does not allow to over-write info.mfn and
- * info.offset.
- */
- err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info,
- xen_vcpu_nr(cpu), &info);
-
- if (err) {
- pr_warn_once("register_vcpu_info failed: cpu=%d err=%d\n",
- cpu, err);
- xen_have_vcpu_info_placement = 0;
- } else {
- /*
- * This cpu is using the registered vcpu info, even if
- * later ones fail to.
- */
- per_cpu(xen_vcpu, cpu) = vcpup;
- }
- }
+ /*
+ * N.B. This hypercall can _only_ be called once per CPU.
+ * Subsequent calls will error out with -EINVAL. This is due to
+ * the fact that hypervisor has no unregister variant and this
+ * hypercall does not allow to over-write info.mfn and
+ * info.offset.
+ */
+ err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, xen_vcpu_nr(cpu),
+ &info);
+ if (err)
+ panic("register_vcpu_info failed: cpu=%d err=%d\n", cpu, err);
+
+ per_cpu(xen_vcpu, cpu) = vcpup;
+}
+
+void __init xen_banner(void)
+{
+ unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
+ struct xen_extraversion extra;
- if (!xen_have_vcpu_info_placement)
- xen_vcpu_info_reset(cpu);
+ HYPERVISOR_xen_version(XENVER_extraversion, &extra);
- return ((per_cpu(xen_vcpu, cpu) == NULL) ? -ENODEV : 0);
+ pr_info("Booting kernel on %s\n", pv_info.name);
+ pr_info("Xen version: %u.%u%s%s\n",
+ version >> 16, version & 0xffff, extra.extraversion,
+ xen_feature(XENFEAT_mmu_pt_update_preserve_ad)
+ ? " (preserve-AD)" : "");
+}
+
+/* Check if running on Xen version (major, minor) or later */
+bool xen_running_on_version_or_later(unsigned int major, unsigned int minor)
+{
+ unsigned int version;
+
+ if (!xen_domain())
+ return false;
+
+ version = HYPERVISOR_xen_version(XENVER_version, NULL);
+ if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) ||
+ ((version >> 16) > major))
+ return true;
+ return false;
+}
+
+void __init xen_add_preferred_consoles(void)
+{
+ add_preferred_console("xenboot", 0, NULL);
+ if (!boot_params.screen_info.orig_video_isVGA)
+ add_preferred_console("tty", 0, NULL);
+ add_preferred_console("hvc", 0, NULL);
+ if (boot_params.screen_info.orig_video_isVGA)
+ add_preferred_console("tty", 0, NULL);
}
void xen_reboot(int reason)
diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
index e68ea5f4ad1c..42300941ec29 100644
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -163,9 +163,9 @@ static int xen_cpu_up_prepare_hvm(unsigned int cpu)
per_cpu(xen_vcpu_id, cpu) = cpu_acpi_id(cpu);
else
per_cpu(xen_vcpu_id, cpu) = cpu;
- rc = xen_vcpu_setup(cpu);
- if (rc || !xen_have_vector_callback)
- return rc;
+ xen_vcpu_setup(cpu);
+ if (!xen_have_vector_callback)
+ return 0;
if (xen_feature(XENFEAT_hvm_safe_pvclock))
xen_setup_timer(cpu);
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 6e0d0754f94f..5004feb16783 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -27,8 +27,6 @@
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
-#include <linux/highmem.h>
-#include <linux/console.h>
#include <linux/pci.h>
#include <linux/gfp.h>
#include <linux/edd.h>
@@ -109,17 +107,6 @@ struct tls_descs {
*/
static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
-static void __init xen_banner(void)
-{
- unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
- struct xen_extraversion extra;
- HYPERVISOR_xen_version(XENVER_extraversion, &extra);
-
- pr_info("Booting paravirtualized kernel on %s\n", pv_info.name);
- pr_info("Xen version: %d.%d%s (preserve-AD)\n",
- version >> 16, version & 0xffff, extra.extraversion);
-}
-
static void __init xen_pv_init_platform(void)
{
populate_extra_pte(fix_to_virt(FIX_PARAVIRT_BOOTMAP));
@@ -142,22 +129,6 @@ static void __init xen_pv_guest_late_init(void)
#endif
}
-/* Check if running on Xen version (major, minor) or later */
-bool
-xen_running_on_version_or_later(unsigned int major, unsigned int minor)
-{
- unsigned int version;
-
- if (!xen_domain())
- return false;
-
- version = HYPERVISOR_xen_version(XENVER_version, NULL);
- if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) ||
- ((version >> 16) > major))
- return true;
- return false;
-}
-
static __read_mostly unsigned int cpuid_leaf5_ecx_val;
static __read_mostly unsigned int cpuid_leaf5_edx_val;
@@ -311,12 +282,12 @@ static void __init xen_init_capabilities(void)
}
}
-static void xen_set_debugreg(int reg, unsigned long val)
+static noinstr void xen_set_debugreg(int reg, unsigned long val)
{
HYPERVISOR_set_debugreg(reg, val);
}
-static unsigned long xen_get_debugreg(int reg)
+static noinstr unsigned long xen_get_debugreg(int reg)
{
return HYPERVISOR_get_debugreg(reg);
}
@@ -1021,31 +992,13 @@ void __init xen_setup_vcpu_info_placement(void)
for_each_possible_cpu(cpu) {
/* Set up direct vCPU id mapping for PV guests. */
per_cpu(xen_vcpu_id, cpu) = cpu;
-
- /*
- * xen_vcpu_setup(cpu) can fail -- in which case it
- * falls back to the shared_info version for cpus
- * where xen_vcpu_nr(cpu) < MAX_VIRT_CPUS.
- *
- * xen_cpu_up_prepare_pv() handles the rest by failing
- * them in hotplug.
- */
- (void) xen_vcpu_setup(cpu);
+ xen_vcpu_setup(cpu);
}
- /*
- * xen_vcpu_setup managed to place the vcpu_info within the
- * percpu area for all cpus, so make use of it.
- */
- if (xen_have_vcpu_info_placement) {
- pv_ops.irq.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
- pv_ops.irq.irq_disable =
- __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
- pv_ops.irq.irq_enable =
- __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
- pv_ops.mmu.read_cr2 =
- __PV_IS_CALLEE_SAVE(xen_read_cr2_direct);
- }
+ pv_ops.irq.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
+ pv_ops.irq.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
+ pv_ops.irq.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
+ pv_ops.mmu.read_cr2 = __PV_IS_CALLEE_SAVE(xen_read_cr2_direct);
}
static const struct pv_info xen_info __initconst = {
@@ -1053,52 +1006,54 @@ static const struct pv_info xen_info __initconst = {
.name = "Xen",
};
-static const struct pv_cpu_ops xen_cpu_ops __initconst = {
- .cpuid = xen_cpuid,
+static const typeof(pv_ops) xen_cpu_ops __initconst = {
+ .cpu = {
+ .cpuid = xen_cpuid,
- .set_debugreg = xen_set_debugreg,
- .get_debugreg = xen_get_debugreg,
+ .set_debugreg = xen_set_debugreg,
+ .get_debugreg = xen_get_debugreg,
- .read_cr0 = xen_read_cr0,
- .write_cr0 = xen_write_cr0,
+ .read_cr0 = xen_read_cr0,
+ .write_cr0 = xen_write_cr0,
- .write_cr4 = xen_write_cr4,
+ .write_cr4 = xen_write_cr4,
- .wbinvd = native_wbinvd,
+ .wbinvd = native_wbinvd,
- .read_msr = xen_read_msr,
- .write_msr = xen_write_msr,
+ .read_msr = xen_read_msr,
+ .write_msr = xen_write_msr,
- .read_msr_safe = xen_read_msr_safe,
- .write_msr_safe = xen_write_msr_safe,
+ .read_msr_safe = xen_read_msr_safe,
+ .write_msr_safe = xen_write_msr_safe,
- .read_pmc = xen_read_pmc,
+ .read_pmc = xen_read_pmc,
- .load_tr_desc = paravirt_nop,
- .set_ldt = xen_set_ldt,
- .load_gdt = xen_load_gdt,
- .load_idt = xen_load_idt,
- .load_tls = xen_load_tls,
- .load_gs_index = xen_load_gs_index,
+ .load_tr_desc = paravirt_nop,
+ .set_ldt = xen_set_ldt,
+ .load_gdt = xen_load_gdt,
+ .load_idt = xen_load_idt,
+ .load_tls = xen_load_tls,
+ .load_gs_index = xen_load_gs_index,
- .alloc_ldt = xen_alloc_ldt,
- .free_ldt = xen_free_ldt,
+ .alloc_ldt = xen_alloc_ldt,
+ .free_ldt = xen_free_ldt,
- .store_tr = xen_store_tr,
+ .store_tr = xen_store_tr,
- .write_ldt_entry = xen_write_ldt_entry,
- .write_gdt_entry = xen_write_gdt_entry,
- .write_idt_entry = xen_write_idt_entry,
- .load_sp0 = xen_load_sp0,
+ .write_ldt_entry = xen_write_ldt_entry,
+ .write_gdt_entry = xen_write_gdt_entry,
+ .write_idt_entry = xen_write_idt_entry,
+ .load_sp0 = xen_load_sp0,
#ifdef CONFIG_X86_IOPL_IOPERM
- .invalidate_io_bitmap = xen_invalidate_io_bitmap,
- .update_io_bitmap = xen_update_io_bitmap,
+ .invalidate_io_bitmap = xen_invalidate_io_bitmap,
+ .update_io_bitmap = xen_update_io_bitmap,
#endif
- .io_delay = xen_io_delay,
+ .io_delay = xen_io_delay,
- .start_context_switch = paravirt_start_context_switch,
- .end_context_switch = xen_end_context_switch,
+ .start_context_switch = paravirt_start_context_switch,
+ .end_context_switch = xen_end_context_switch,
+ },
};
static void xen_restart(char *msg)
@@ -1239,7 +1194,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
/* Install Xen paravirt ops */
pv_info = xen_info;
- pv_ops.cpu = xen_cpu_ops;
+ pv_ops.cpu = xen_cpu_ops.cpu;
paravirt_iret = xen_iret;
xen_init_irq_ops();
@@ -1273,12 +1228,6 @@ asmlinkage __visible void __init xen_start_kernel(void)
__supported_pte_mask &= ~_PAGE_GLOBAL;
__default_kernel_pte_mask &= ~_PAGE_GLOBAL;
- /*
- * Prevent page tables from being allocated in highmem, even
- * if CONFIG_HIGHPTE is enabled.
- */
- __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
-
/* Get mfn list */
xen_build_dynamic_phys_to_machine();
@@ -1364,7 +1313,6 @@ asmlinkage __visible void __init xen_start_kernel(void)
boot_params.hdr.hardware_subarch = X86_SUBARCH_XEN;
if (!xen_initial_domain()) {
- add_preferred_console("xenboot", 0, NULL);
if (pci_xen)
x86_init.pci.arch_init = pci_xen_init;
x86_platform.set_legacy_features =
@@ -1409,11 +1357,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
#endif
}
- if (!boot_params.screen_info.orig_video_isVGA)
- add_preferred_console("tty", 0, NULL);
- add_preferred_console("hvc", 0, NULL);
- if (boot_params.screen_info.orig_video_isVGA)
- add_preferred_console("tty", 0, NULL);
+ xen_add_preferred_consoles();
#ifdef CONFIG_PCI
/* PCI BIOS service won't work from a PV guest. */
diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c
index 0d5e34b9e6f9..bcae606bbc5c 100644
--- a/arch/x86/xen/enlighten_pvh.c
+++ b/arch/x86/xen/enlighten_pvh.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/acpi.h>
+#include <linux/export.h>
#include <xen/hvc-console.h>
@@ -18,10 +19,11 @@
/*
* PVH variables.
*
- * The variable xen_pvh needs to live in the data segment since it is used
+ * The variable xen_pvh needs to live in a data segment since it is used
* after startup_{32|64} is invoked, which will clear the .bss segment.
*/
-bool xen_pvh __section(".data") = 0;
+bool __ro_after_init xen_pvh;
+EXPORT_SYMBOL_GPL(xen_pvh);
void __init xen_pvh_init(struct boot_params *boot_params)
{
@@ -36,6 +38,10 @@ void __init xen_pvh_init(struct boot_params *boot_params)
pfn = __pa(hypercall_page);
wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+ if (xen_initial_domain())
+ x86_init.oem.arch_setup = xen_add_preferred_consoles;
+ x86_init.oem.banner = xen_banner;
+
xen_efi_init(boot_params);
}
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index dfa091d79c2e..06c3c2fb4b06 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -19,65 +19,11 @@
* callback mask. We do this in a very simple manner, by making a call
* down into Xen. The pending flag will be checked by Xen on return.
*/
-void xen_force_evtchn_callback(void)
+noinstr void xen_force_evtchn_callback(void)
{
(void)HYPERVISOR_xen_version(0, NULL);
}
-asmlinkage __visible unsigned long xen_save_fl(void)
-{
- struct vcpu_info *vcpu;
- unsigned long flags;
-
- vcpu = this_cpu_read(xen_vcpu);
-
- /* flag has opposite sense of mask */
- flags = !vcpu->evtchn_upcall_mask;
-
- /* convert to IF type flag
- -0 -> 0x00000000
- -1 -> 0xffffffff
- */
- return (-flags) & X86_EFLAGS_IF;
-}
-PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl);
-
-asmlinkage __visible void xen_irq_disable(void)
-{
- /* There's a one instruction preempt window here. We need to
- make sure we're don't switch CPUs between getting the vcpu
- pointer and updating the mask. */
- preempt_disable();
- this_cpu_read(xen_vcpu)->evtchn_upcall_mask = 1;
- preempt_enable_no_resched();
-}
-PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable);
-
-asmlinkage __visible void xen_irq_enable(void)
-{
- struct vcpu_info *vcpu;
-
- /*
- * We may be preempted as soon as vcpu->evtchn_upcall_mask is
- * cleared, so disable preemption to ensure we check for
- * events on the VCPU we are still running on.
- */
- preempt_disable();
-
- vcpu = this_cpu_read(xen_vcpu);
- vcpu->evtchn_upcall_mask = 0;
-
- /* Doesn't matter if we get preempted here, because any
- pending event will get dealt with anyway. */
-
- barrier(); /* unmask then check (avoid races) */
- if (unlikely(vcpu->evtchn_upcall_pending))
- xen_force_evtchn_callback();
-
- preempt_enable();
-}
-PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable);
-
static void xen_safe_halt(void)
{
/* Blocking includes an implicit local_irq_enable(). */
@@ -94,17 +40,20 @@ static void xen_halt(void)
xen_safe_halt();
}
-static const struct pv_irq_ops xen_irq_ops __initconst = {
- .save_fl = PV_CALLEE_SAVE(xen_save_fl),
- .irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
- .irq_enable = PV_CALLEE_SAVE(xen_irq_enable),
+static const typeof(pv_ops) xen_irq_ops __initconst = {
+ .irq = {
+ /* Initial interrupt flag handling only called while interrupts off. */
+ .save_fl = __PV_IS_CALLEE_SAVE(paravirt_ret0),
+ .irq_disable = __PV_IS_CALLEE_SAVE(paravirt_nop),
+ .irq_enable = __PV_IS_CALLEE_SAVE(paravirt_BUG),
- .safe_halt = xen_safe_halt,
- .halt = xen_halt,
+ .safe_halt = xen_safe_halt,
+ .halt = xen_halt,
+ },
};
void __init xen_init_irq_ops(void)
{
- pv_ops.irq = xen_irq_ops;
+ pv_ops.irq = xen_irq_ops.irq;
x86_init.irqs.intr_init = xen_init_IRQ;
}
diff --git a/arch/x86/xen/mmu_hvm.c b/arch/x86/xen/mmu_hvm.c
index 57409373750f..509bdee3ab90 100644
--- a/arch/x86/xen/mmu_hvm.c
+++ b/arch/x86/xen/mmu_hvm.c
@@ -9,39 +9,28 @@
#ifdef CONFIG_PROC_VMCORE
/*
- * This function is used in two contexts:
- * - the kdump kernel has to check whether a pfn of the crashed kernel
- * was a ballooned page. vmcore is using this function to decide
- * whether to access a pfn of the crashed kernel.
- * - the kexec kernel has to check whether a pfn was ballooned by the
- * previous kernel. If the pfn is ballooned, handle it properly.
- * Returns 0 if the pfn is not backed by a RAM page, the caller may
+ * The kdump kernel has to check whether a pfn of the crashed kernel
+ * was a ballooned page. vmcore is using this function to decide
+ * whether to access a pfn of the crashed kernel.
+ * Returns "false" if the pfn is not backed by a RAM page, the caller may
* handle the pfn special in this case.
*/
-static int xen_oldmem_pfn_is_ram(unsigned long pfn)
+static bool xen_vmcore_pfn_is_ram(struct vmcore_cb *cb, unsigned long pfn)
{
struct xen_hvm_get_mem_type a = {
.domid = DOMID_SELF,
.pfn = pfn,
};
- int ram;
- if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a))
- return -ENXIO;
-
- switch (a.mem_type) {
- case HVMMEM_mmio_dm:
- ram = 0;
- break;
- case HVMMEM_ram_rw:
- case HVMMEM_ram_ro:
- default:
- ram = 1;
- break;
+ if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a)) {
+ pr_warn_once("Unexpected HVMOP_get_mem_type failure\n");
+ return true;
}
-
- return ram;
+ return a.mem_type != HVMMEM_mmio_dm;
}
+static struct vmcore_cb xen_vmcore_cb = {
+ .pfn_is_ram = xen_vmcore_pfn_is_ram,
+};
#endif
static void xen_hvm_exit_mmap(struct mm_struct *mm)
@@ -75,6 +64,6 @@ void __init xen_hvm_init_mmu_ops(void)
if (is_pagetable_dying_supported())
pv_ops.mmu.exit_mmap = xen_hvm_exit_mmap;
#ifdef CONFIG_PROC_VMCORE
- WARN_ON(register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram));
+ register_vmcore_cb(&xen_vmcore_cb);
#endif
}
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 8d751939c6f3..00354866921b 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -41,7 +41,6 @@
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
#include <linux/sched/mm.h>
-#include <linux/highmem.h>
#include <linux/debugfs.h>
#include <linux/bug.h>
#include <linux/vmalloc.h>
@@ -86,8 +85,10 @@
#include "mmu.h"
#include "debugfs.h"
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
/* l3 pud for userspace vsyscall mapping */
static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
+#endif
/*
* Protects atomic reservation decrease/increase against concurrent increases.
@@ -241,9 +242,11 @@ static void xen_set_pmd(pmd_t *ptr, pmd_t val)
* Associate a virtual page frame with a given physical page frame
* and protection flags for that frame.
*/
-void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
+void __init set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
{
- set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
+ if (HYPERVISOR_update_va_mapping(vaddr, mfn_pte(mfn, flags),
+ UVMF_INVLPG))
+ BUG();
}
static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
@@ -789,7 +792,9 @@ static void __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
static void __init xen_after_bootmem(void)
{
static_branch_enable(&xen_struct_pages_ready);
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
SetPagePinned(virt_to_page(level3_user_vsyscall));
+#endif
xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
}
@@ -1025,7 +1030,7 @@ static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size)
for (; vaddr < vaddr_end; vaddr += PAGE_SIZE)
make_lowmem_page_readwrite(vaddr);
- memblock_free(paddr, size);
+ memblock_phys_free(paddr, size);
}
static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin)
@@ -1151,7 +1156,7 @@ static void __init xen_pagetable_p2m_free(void)
xen_cleanhighmap(addr, addr + size);
size = PAGE_ALIGN(xen_start_info->nr_pages *
sizeof(unsigned long));
- memblock_free(__pa(addr), size);
+ memblock_free((void *)addr, size);
} else {
xen_cleanmfnmap(addr);
}
@@ -1192,6 +1197,13 @@ static void __init xen_pagetable_p2m_setup(void)
static void __init xen_pagetable_init(void)
{
+ /*
+ * The majority of further PTE writes is to pagetables already
+ * announced as such to Xen. Hence it is more efficient to use
+ * hypercalls for these updates.
+ */
+ pv_ops.mmu.set_pte = __xen_set_pte;
+
paging_init();
xen_post_allocator_init();
@@ -1204,7 +1216,8 @@ static void __init xen_pagetable_init(void)
xen_remap_memory();
xen_setup_mfn_list_list();
}
-static void xen_write_cr2(unsigned long cr2)
+
+static noinstr void xen_write_cr2(unsigned long cr2)
{
this_cpu_read(xen_vcpu)->arch.cr2 = cr2;
}
@@ -1420,10 +1433,18 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
*
* Many of these PTE updates are done on unpinned and writable pages
* and doing a hypercall for these is unnecessary and expensive. At
- * this point it is not possible to tell if a page is pinned or not,
- * so always write the PTE directly and rely on Xen trapping and
+ * this point it is rarely possible to tell if a page is pinned, so
+ * mostly write the PTE directly and rely on Xen trapping and
* emulating any updates as necessary.
*/
+static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
+{
+ if (unlikely(is_early_ioremap_ptep(ptep)))
+ __xen_set_pte(ptep, pte);
+ else
+ native_set_pte(ptep, pte);
+}
+
__visible pte_t xen_make_pte_init(pteval_t pte)
{
unsigned long pfn;
@@ -1445,11 +1466,6 @@ __visible pte_t xen_make_pte_init(pteval_t pte)
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_init);
-static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
-{
- __xen_set_pte(ptep, pte);
-}
-
/* Early in boot, while setting up the initial pagetable, assume
everything is pinned. */
static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
@@ -1749,7 +1765,6 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
set_page_prot(init_top_pgt, PAGE_KERNEL_RO);
set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
- set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
@@ -1766,6 +1781,13 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
/* Unpin Xen-provided one */
pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
+ /* Pin user vsyscall L3 */
+ set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
+ pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
+ PFN_DOWN(__pa_symbol(level3_user_vsyscall)));
+#endif
+
/*
* At this stage there can be no user pgd, and no page structure to
* attach it to, so make sure we just set kernel pgd.
@@ -1955,7 +1977,7 @@ void __init xen_relocate_p2m(void)
pfn_end = p2m_pfn_end;
}
- memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn));
+ memblock_phys_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn));
while (pfn < pfn_end) {
if (pfn == p2m_pfn) {
pfn = p2m_pfn_end;
@@ -1998,6 +2020,7 @@ static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
{
pte_t pte;
+ unsigned long vaddr;
phys >>= PAGE_SHIFT;
@@ -2038,15 +2061,15 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
break;
}
- __native_set_fixmap(idx, pte);
+ vaddr = __fix_to_virt(idx);
+ if (HYPERVISOR_update_va_mapping(vaddr, pte, UVMF_INVLPG))
+ BUG();
#ifdef CONFIG_X86_VSYSCALL_EMULATION
/* Replicate changes to map the vsyscall page into the user
pagetable vsyscall mapping. */
- if (idx == VSYSCALL_PAGE) {
- unsigned long vaddr = __fix_to_virt(idx);
+ if (idx == VSYSCALL_PAGE)
set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
- }
#endif
}
@@ -2078,67 +2101,69 @@ static void xen_leave_lazy_mmu(void)
preempt_enable();
}
-static const struct pv_mmu_ops xen_mmu_ops __initconst = {
- .read_cr2 = __PV_IS_CALLEE_SAVE(xen_read_cr2),
- .write_cr2 = xen_write_cr2,
+static const typeof(pv_ops) xen_mmu_ops __initconst = {
+ .mmu = {
+ .read_cr2 = __PV_IS_CALLEE_SAVE(xen_read_cr2),
+ .write_cr2 = xen_write_cr2,
- .read_cr3 = xen_read_cr3,
- .write_cr3 = xen_write_cr3_init,
+ .read_cr3 = xen_read_cr3,
+ .write_cr3 = xen_write_cr3_init,
- .flush_tlb_user = xen_flush_tlb,
- .flush_tlb_kernel = xen_flush_tlb,
- .flush_tlb_one_user = xen_flush_tlb_one_user,
- .flush_tlb_multi = xen_flush_tlb_multi,
- .tlb_remove_table = tlb_remove_table,
+ .flush_tlb_user = xen_flush_tlb,
+ .flush_tlb_kernel = xen_flush_tlb,
+ .flush_tlb_one_user = xen_flush_tlb_one_user,
+ .flush_tlb_multi = xen_flush_tlb_multi,
+ .tlb_remove_table = tlb_remove_table,
- .pgd_alloc = xen_pgd_alloc,
- .pgd_free = xen_pgd_free,
+ .pgd_alloc = xen_pgd_alloc,
+ .pgd_free = xen_pgd_free,
- .alloc_pte = xen_alloc_pte_init,
- .release_pte = xen_release_pte_init,
- .alloc_pmd = xen_alloc_pmd_init,
- .release_pmd = xen_release_pmd_init,
+ .alloc_pte = xen_alloc_pte_init,
+ .release_pte = xen_release_pte_init,
+ .alloc_pmd = xen_alloc_pmd_init,
+ .release_pmd = xen_release_pmd_init,
- .set_pte = xen_set_pte_init,
- .set_pmd = xen_set_pmd_hyper,
+ .set_pte = xen_set_pte_init,
+ .set_pmd = xen_set_pmd_hyper,
- .ptep_modify_prot_start = xen_ptep_modify_prot_start,
- .ptep_modify_prot_commit = xen_ptep_modify_prot_commit,
+ .ptep_modify_prot_start = xen_ptep_modify_prot_start,
+ .ptep_modify_prot_commit = xen_ptep_modify_prot_commit,
- .pte_val = PV_CALLEE_SAVE(xen_pte_val),
- .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
+ .pte_val = PV_CALLEE_SAVE(xen_pte_val),
+ .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
- .make_pte = PV_CALLEE_SAVE(xen_make_pte_init),
- .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
+ .make_pte = PV_CALLEE_SAVE(xen_make_pte_init),
+ .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
- .set_pud = xen_set_pud_hyper,
+ .set_pud = xen_set_pud_hyper,
- .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
- .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
+ .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
+ .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
- .pud_val = PV_CALLEE_SAVE(xen_pud_val),
- .make_pud = PV_CALLEE_SAVE(xen_make_pud),
- .set_p4d = xen_set_p4d_hyper,
+ .pud_val = PV_CALLEE_SAVE(xen_pud_val),
+ .make_pud = PV_CALLEE_SAVE(xen_make_pud),
+ .set_p4d = xen_set_p4d_hyper,
- .alloc_pud = xen_alloc_pmd_init,
- .release_pud = xen_release_pmd_init,
+ .alloc_pud = xen_alloc_pmd_init,
+ .release_pud = xen_release_pmd_init,
#if CONFIG_PGTABLE_LEVELS >= 5
- .p4d_val = PV_CALLEE_SAVE(xen_p4d_val),
- .make_p4d = PV_CALLEE_SAVE(xen_make_p4d),
+ .p4d_val = PV_CALLEE_SAVE(xen_p4d_val),
+ .make_p4d = PV_CALLEE_SAVE(xen_make_p4d),
#endif
- .activate_mm = xen_activate_mm,
- .dup_mmap = xen_dup_mmap,
- .exit_mmap = xen_exit_mmap,
+ .activate_mm = xen_activate_mm,
+ .dup_mmap = xen_dup_mmap,
+ .exit_mmap = xen_exit_mmap,
- .lazy_mode = {
- .enter = paravirt_enter_lazy_mmu,
- .leave = xen_leave_lazy_mmu,
- .flush = paravirt_flush_lazy_mmu,
- },
+ .lazy_mode = {
+ .enter = paravirt_enter_lazy_mmu,
+ .leave = xen_leave_lazy_mmu,
+ .flush = paravirt_flush_lazy_mmu,
+ },
- .set_fixmap = xen_set_fixmap,
+ .set_fixmap = xen_set_fixmap,
+ },
};
void __init xen_init_mmu_ops(void)
@@ -2146,7 +2171,7 @@ void __init xen_init_mmu_ops(void)
x86_init.paging.pagetable_init = xen_pagetable_init;
x86_init.hyper.init_after_bootmem = xen_after_bootmem;
- pv_ops.mmu = xen_mmu_ops;
+ pv_ops.mmu = xen_mmu_ops.mmu;
memset(dummy_mapping, 0xff, PAGE_SIZE);
}
@@ -2398,7 +2423,7 @@ static int remap_area_pfn_pte_fn(pte_t *ptep, unsigned long addr, void *data)
int xen_remap_pfn(struct vm_area_struct *vma, unsigned long addr,
xen_pfn_t *pfn, int nr, int *err_ptr, pgprot_t prot,
- unsigned int domid, bool no_translate, struct page **pages)
+ unsigned int domid, bool no_translate)
{
int err = 0;
struct remap_data rmd;
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 5e6e236977c7..58db86f7b384 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -197,7 +197,7 @@ static void * __ref alloc_p2m_page(void)
static void __ref free_p2m_page(void *p)
{
if (unlikely(!slab_is_available())) {
- memblock_free((unsigned long)p, PAGE_SIZE);
+ memblock_free(p, PAGE_SIZE);
return;
}
diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c
index e13b0b49fcdf..89dd6b1708b0 100644
--- a/arch/x86/xen/pmu.c
+++ b/arch/x86/xen/pmu.c
@@ -413,34 +413,29 @@ int pmu_apic_update(uint32_t val)
}
/* perf callbacks */
-static int xen_is_in_guest(void)
+static unsigned int xen_guest_state(void)
{
const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+ unsigned int state = 0;
if (!xenpmu_data) {
pr_warn_once("%s: pmudata not initialized\n", __func__);
- return 0;
+ return state;
}
if (!xen_initial_domain() || (xenpmu_data->domain_id >= DOMID_SELF))
- return 0;
+ return state;
- return 1;
-}
-
-static int xen_is_user_mode(void)
-{
- const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+ state |= PERF_GUEST_ACTIVE;
- if (!xenpmu_data) {
- pr_warn_once("%s: pmudata not initialized\n", __func__);
- return 0;
+ if (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_PV) {
+ if (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_USER)
+ state |= PERF_GUEST_USER;
+ } else if (xenpmu_data->pmu.r.regs.cpl & 3) {
+ state |= PERF_GUEST_USER;
}
- if (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_PV)
- return (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_USER);
- else
- return !!(xenpmu_data->pmu.r.regs.cpl & 3);
+ return state;
}
static unsigned long xen_get_guest_ip(void)
@@ -456,9 +451,8 @@ static unsigned long xen_get_guest_ip(void)
}
static struct perf_guest_info_callbacks xen_guest_cbs = {
- .is_in_guest = xen_is_in_guest,
- .is_user_mode = xen_is_user_mode,
- .get_guest_ip = xen_get_guest_ip,
+ .state = xen_guest_state,
+ .get_ip = xen_get_guest_ip,
};
/* Convert registers from Xen's format to Linux' */
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 8bfc10330107..af216feb63d9 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -153,7 +153,7 @@ static void __init xen_del_extra_mem(unsigned long start_pfn,
break;
}
}
- memblock_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
+ memblock_phys_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
}
/*
@@ -306,10 +306,6 @@ static void __init xen_update_mem_tables(unsigned long pfn, unsigned long mfn)
BUG();
}
- /* Update kernel mapping, but not for highmem. */
- if (pfn >= PFN_UP(__pa(high_memory - 1)))
- return;
-
if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT),
mfn_pte(mfn, PAGE_KERNEL), 0)) {
WARN(1, "Failed to update kernel mapping for mfn=%ld pfn=%ld\n",
@@ -429,13 +425,13 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
}
/*
- * If the PFNs are currently mapped, the VA mapping also needs
- * to be updated to be 1:1.
+ * If the PFNs are currently mapped, their VA mappings need to be
+ * zapped.
*/
for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++)
(void)HYPERVISOR_update_va_mapping(
(unsigned long)__va(pfn << PAGE_SHIFT),
- mfn_pte(pfn, PAGE_KERNEL_IO), 0);
+ native_make_pte(0), 0);
return remap_pfn;
}
@@ -719,7 +715,7 @@ static void __init xen_reserve_xen_mfnlist(void)
return;
xen_relocate_p2m();
- memblock_free(start, size);
+ memblock_phys_free(start, size);
}
/**
@@ -885,7 +881,7 @@ char * __init xen_memory_setup(void)
xen_phys_memcpy(new_area, start, size);
pr_info("initrd moved from [mem %#010llx-%#010llx] to [mem %#010llx-%#010llx]\n",
start, start + size, new_area, new_area + size);
- memblock_free(start, size);
+ memblock_phys_free(start, size);
boot_params.hdr.ramdisk_image = new_area;
boot_params.ext_ramdisk_image = new_area >> 32;
}
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index c1b2f764b29a..c3e1f9a7d43a 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -121,34 +121,10 @@ int xen_smp_intr_init(unsigned int cpu)
void __init xen_smp_cpus_done(unsigned int max_cpus)
{
- int cpu, rc, count = 0;
-
if (xen_hvm_domain())
native_smp_cpus_done(max_cpus);
else
calculate_max_logical_packages();
-
- if (xen_have_vcpu_info_placement)
- return;
-
- for_each_online_cpu(cpu) {
- if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS)
- continue;
-
- rc = remove_cpu(cpu);
-
- if (rc == 0) {
- /*
- * Reset vcpu_info so this cpu cannot be onlined again.
- */
- xen_vcpu_info_reset(cpu);
- count++;
- } else {
- pr_warn("%s: failed to bring CPU %d down, error %d\n",
- __func__, cpu, rc);
- }
- }
- WARN(count, "%s: brought %d CPUs offline\n", __func__, count);
}
void xen_smp_send_reschedule(int cpu)
@@ -268,20 +244,16 @@ void xen_send_IPI_allbutself(int vector)
static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
{
- irq_enter();
generic_smp_call_function_interrupt();
inc_irq_stat(irq_call_count);
- irq_exit();
return IRQ_HANDLED;
}
static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
{
- irq_enter();
generic_smp_call_function_single_interrupt();
inc_irq_stat(irq_call_count);
- irq_exit();
return IRQ_HANDLED;
}
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index 7ed56c6075b0..6a8f3b53ab83 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -225,7 +225,6 @@ static void __init xen_pv_smp_prepare_boot_cpu(void)
static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus)
{
unsigned cpu;
- unsigned int i;
if (skip_ioapic_setup) {
char *m = (max_cpus == 0) ?
@@ -238,16 +237,9 @@ static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus)
}
xen_init_lock_cpu(0);
- smp_store_boot_cpu_info();
- cpu_data(0).x86_max_cores = 1;
+ smp_prepare_cpus_common();
- for_each_possible_cpu(i) {
- zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
- zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
- zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL);
- zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
- }
- set_cpu_sibling_map(0);
+ cpu_data(0).x86_max_cores = 1;
speculative_store_bypass_ht_init();
@@ -458,10 +450,8 @@ static void xen_pv_stop_other_cpus(int wait)
static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id)
{
- irq_enter();
irq_work_run();
inc_irq_stat(apic_irq_work_irqs);
- irq_exit();
return IRQ_HANDLED;
}
diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c
index e336f223f7f4..31b1e3477cb6 100644
--- a/arch/x86/xen/vga.c
+++ b/arch/x86/xen/vga.c
@@ -63,13 +63,17 @@ void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size)
}
if (size >= offsetof(struct dom0_vga_console_info,
- u.vesa_lfb.gbl_caps)
- + sizeof(info->u.vesa_lfb.gbl_caps))
- screen_info->capabilities = info->u.vesa_lfb.gbl_caps;
- if (size >= offsetof(struct dom0_vga_console_info,
u.vesa_lfb.mode_attrs)
+ sizeof(info->u.vesa_lfb.mode_attrs))
screen_info->vesa_attributes = info->u.vesa_lfb.mode_attrs;
+
+ if (size >= offsetof(struct dom0_vga_console_info,
+ u.vesa_lfb.ext_lfb_base)
+ + sizeof(info->u.vesa_lfb.ext_lfb_base)
+ && info->u.vesa_lfb.ext_lfb_base) {
+ screen_info->ext_lfb_base = info->u.vesa_lfb.ext_lfb_base;
+ screen_info->capabilities |= VIDEO_CAPABILITY_64BIT_BASE;
+ }
break;
}
}
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index 1e626444712b..e730e6200e64 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -20,6 +20,46 @@
#include <linux/init.h>
#include <linux/linkage.h>
+#include <../entry/calling.h>
+
+.pushsection .noinstr.text, "ax"
+/*
+ * Disabling events is simply a matter of making the event mask
+ * non-zero.
+ */
+SYM_FUNC_START(xen_irq_disable_direct)
+ movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+ RET
+SYM_FUNC_END(xen_irq_disable_direct)
+
+/*
+ * Force an event check by making a hypercall, but preserve regs
+ * before making the call.
+ */
+SYM_FUNC_START(check_events)
+ FRAME_BEGIN
+ push %rax
+ push %rcx
+ push %rdx
+ push %rsi
+ push %rdi
+ push %r8
+ push %r9
+ push %r10
+ push %r11
+ call xen_force_evtchn_callback
+ pop %r11
+ pop %r10
+ pop %r9
+ pop %r8
+ pop %rdi
+ pop %rsi
+ pop %rdx
+ pop %rcx
+ pop %rax
+ FRAME_END
+ RET
+SYM_FUNC_END(check_events)
/*
* Enable events. This clears the event mask and tests the pending
@@ -44,19 +84,9 @@ SYM_FUNC_START(xen_irq_enable_direct)
call check_events
1:
FRAME_END
- ret
+ RET
SYM_FUNC_END(xen_irq_enable_direct)
-
-/*
- * Disabling events is simply a matter of making the event mask
- * non-zero.
- */
-SYM_FUNC_START(xen_irq_disable_direct)
- movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
- ret
-SYM_FUNC_END(xen_irq_disable_direct)
-
/*
* (xen_)save_fl is used to get the current interrupt enable status.
* Callers expect the status to be in X86_EFLAGS_IF, and other bits
@@ -70,52 +100,24 @@ SYM_FUNC_START(xen_save_fl_direct)
testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
setz %ah
addb %ah, %ah
- ret
+ RET
SYM_FUNC_END(xen_save_fl_direct)
-/*
- * Force an event check by making a hypercall, but preserve regs
- * before making the call.
- */
-SYM_FUNC_START(check_events)
- FRAME_BEGIN
- push %rax
- push %rcx
- push %rdx
- push %rsi
- push %rdi
- push %r8
- push %r9
- push %r10
- push %r11
- call xen_force_evtchn_callback
- pop %r11
- pop %r10
- pop %r9
- pop %r8
- pop %rdi
- pop %rsi
- pop %rdx
- pop %rcx
- pop %rax
- FRAME_END
- ret
-SYM_FUNC_END(check_events)
-
SYM_FUNC_START(xen_read_cr2)
FRAME_BEGIN
_ASM_MOV PER_CPU_VAR(xen_vcpu), %_ASM_AX
_ASM_MOV XEN_vcpu_info_arch_cr2(%_ASM_AX), %_ASM_AX
FRAME_END
- ret
+ RET
SYM_FUNC_END(xen_read_cr2);
SYM_FUNC_START(xen_read_cr2_direct)
FRAME_BEGIN
_ASM_MOV PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_arch_cr2, %_ASM_AX
FRAME_END
- ret
+ RET
SYM_FUNC_END(xen_read_cr2_direct);
+.popsection
.macro xen_pv_trap name
SYM_CODE_START(xen_\name)
@@ -192,6 +194,25 @@ SYM_CODE_START(xen_iret)
SYM_CODE_END(xen_iret)
/*
+ * XEN pv doesn't use trampoline stack, PER_CPU_VAR(cpu_tss_rw + TSS_sp0) is
+ * also the kernel stack. Reusing swapgs_restore_regs_and_return_to_usermode()
+ * in XEN pv would cause %rsp to move up to the top of the kernel stack and
+ * leave the IRET frame below %rsp, which is dangerous to be corrupted if #NMI
+ * interrupts. And swapgs_restore_regs_and_return_to_usermode() pushing the IRET
+ * frame at the same address is useless.
+ */
+SYM_CODE_START(xenpv_restore_regs_and_return_to_usermode)
+ UNWIND_HINT_REGS
+ POP_REGS
+
+ /* stackleak_erase() can work safely on the kernel stack. */
+ STACKLEAK_ERASE_NOCLOBBER
+
+ addq $8, %rsp /* skip regs->orig_ax */
+ jmp xen_iret
+SYM_CODE_END(xenpv_restore_regs_and_return_to_usermode)
+
+/*
* Xen handles syscall callbacks much like ordinary exceptions, which
* means we have:
* - kernel gs
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index cb6538ae2fe0..11d286529fe5 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -20,6 +20,23 @@
#include <xen/interface/xen-mca.h>
#include <asm/xen/interface.h>
+.pushsection .noinstr.text, "ax"
+ .balign PAGE_SIZE
+SYM_CODE_START(hypercall_page)
+ .rept (PAGE_SIZE / 32)
+ UNWIND_HINT_FUNC
+ .skip 31, 0x90
+ RET
+ .endr
+
+#define HYPERCALL(n) \
+ .equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \
+ .type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32
+#include <asm/xen-hypercalls.h>
+#undef HYPERCALL
+SYM_CODE_END(hypercall_page)
+.popsection
+
#ifdef CONFIG_XEN_PV
__INIT
SYM_CODE_START(startup_xen)
@@ -28,13 +45,13 @@ SYM_CODE_START(startup_xen)
/* Clear .bss */
xor %eax,%eax
- mov $__bss_start, %_ASM_DI
- mov $__bss_stop, %_ASM_CX
- sub %_ASM_DI, %_ASM_CX
- shr $__ASM_SEL(2, 3), %_ASM_CX
- rep __ASM_SIZE(stos)
+ mov $__bss_start, %rdi
+ mov $__bss_stop, %rcx
+ sub %rdi, %rcx
+ shr $3, %rcx
+ rep stosq
- mov %_ASM_SI, xen_start_info
+ mov %rsi, xen_start_info
mov initial_stack(%rip), %rsp
/* Set up %gs.
@@ -64,23 +81,6 @@ SYM_CODE_END(asm_cpu_bringup_and_idle)
#endif
#endif
-.pushsection .text
- .balign PAGE_SIZE
-SYM_CODE_START(hypercall_page)
- .rept (PAGE_SIZE / 32)
- UNWIND_HINT_FUNC
- .skip 31, 0x90
- ret
- .endr
-
-#define HYPERCALL(n) \
- .equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \
- .type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32
-#include <asm/xen-hypercalls.h>
-#undef HYPERCALL
-SYM_CODE_END(hypercall_page)
-.popsection
-
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 8d7ec49a35fb..fd0fec6e92f4 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -51,6 +51,7 @@ void __init xen_remap_memory(void);
phys_addr_t __init xen_find_free_area(phys_addr_t size);
char * __init xen_memory_setup(void);
void __init xen_arch_setup(void);
+void xen_banner(void);
void xen_enable_sysenter(void);
void xen_enable_syscall(void);
void xen_vcpu_restore(void);
@@ -75,9 +76,7 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
bool xen_vcpu_stolen(int vcpu);
-extern int xen_have_vcpu_info_placement;
-
-int xen_vcpu_setup(int cpu);
+void xen_vcpu_setup(int cpu);
void xen_vcpu_info_reset(int cpu);
void xen_setup_vcpu_info_placement(void);
@@ -109,7 +108,7 @@ static inline void xen_uninit_lock_cpu(int cpu)
struct dom0_vga_console_info;
-#ifdef CONFIG_XEN_DOM0
+#ifdef CONFIG_XEN_PV_DOM0
void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size);
#else
static inline void __init xen_init_vga(const struct dom0_vga_console_info *info,
@@ -118,6 +117,8 @@ static inline void __init xen_init_vga(const struct dom0_vga_console_info *info,
}
#endif
+void xen_add_preferred_consoles(void);
+
void __init xen_init_apic(void);
#ifdef CONFIG_XEN_EFI