summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig38
-rw-r--r--arch/x86/boot/Makefile2
-rw-r--r--arch/x86/boot/compressed/Makefile5
-rw-r--r--arch/x86/boot/compressed/kaslr.c50
-rw-r--r--arch/x86/boot/compressed/sbat.S7
-rw-r--r--arch/x86/boot/compressed/vmlinux.lds.S8
-rw-r--r--arch/x86/boot/header.S41
-rw-r--r--arch/x86/coco/sev/Makefile3
-rw-r--r--arch/x86/coco/sev/core.c193
-rw-r--r--arch/x86/coco/sev/vc-handle.c9
-rw-r--r--arch/x86/coco/tdx/tdx.c50
-rw-r--r--arch/x86/configs/i386_defconfig19
-rw-r--r--arch/x86/configs/x86_64_defconfig9
-rw-r--r--arch/x86/crypto/Kconfig27
-rw-r--r--arch/x86/crypto/Makefile6
-rw-r--r--arch/x86/crypto/aegis128-aesni-glue.c40
-rw-r--r--arch/x86/crypto/aria_aesni_avx2_glue.c1
-rw-r--r--arch/x86/crypto/aria_aesni_avx_glue.c1
-rw-r--r--arch/x86/crypto/camellia_aesni_avx_glue.c1
-rw-r--r--arch/x86/crypto/camellia_glue.c1
-rw-r--r--arch/x86/crypto/curve25519-x86_64.c1
-rw-r--r--arch/x86/crypto/serpent_avx_glue.c1
-rw-r--r--arch/x86/crypto/sha1_avx2_x86_64_asm.S700
-rw-r--r--arch/x86/crypto/sha1_ni_asm.S304
-rw-r--r--arch/x86/crypto/sha1_ssse3_asm.S554
-rw-r--r--arch/x86/crypto/sha1_ssse3_glue.c324
-rw-r--r--arch/x86/crypto/sha512-avx-asm.S423
-rw-r--r--arch/x86/crypto/sha512-avx2-asm.S750
-rw-r--r--arch/x86/crypto/sha512-ssse3-asm.S425
-rw-r--r--arch/x86/crypto/sha512_ssse3_glue.c322
-rw-r--r--arch/x86/crypto/sm4_aesni_avx_glue.c1
-rw-r--r--arch/x86/crypto/twofish_glue.c1
-rw-r--r--arch/x86/crypto/twofish_glue_3way.c1
-rw-r--r--arch/x86/entry/calling.h4
-rw-r--r--arch/x86/entry/entry.S8
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl2
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl2
-rw-r--r--arch/x86/entry/vdso/Makefile3
-rw-r--r--arch/x86/events/intel/core.c10
-rw-r--r--arch/x86/events/intel/uncore.c7
-rw-r--r--arch/x86/events/intel/uncore.h2
-rw-r--r--arch/x86/events/intel/uncore_discovery.c89
-rw-r--r--arch/x86/events/intel/uncore_discovery.h7
-rw-r--r--arch/x86/events/intel/uncore_snb.c79
-rw-r--r--arch/x86/events/intel/uncore_snbep.c4
-rw-r--r--arch/x86/hyperv/hv_init.c68
-rw-r--r--arch/x86/hyperv/hv_vtl.c61
-rw-r--r--arch/x86/hyperv/irqdomain.c69
-rw-r--r--arch/x86/hyperv/ivm.c12
-rw-r--r--arch/x86/hyperv/nested.c1
-rw-r--r--arch/x86/include/asm/acpi.h4
-rw-r--r--arch/x86/include/asm/amd/fch.h13
-rw-r--r--arch/x86/include/asm/apic.h74
-rw-r--r--arch/x86/include/asm/ce4100.h6
-rw-r--r--arch/x86/include/asm/cpufeatures.h13
-rw-r--r--arch/x86/include/asm/debugreg.h19
-rw-r--r--arch/x86/include/asm/fpu/types.h49
-rw-r--r--arch/x86/include/asm/fpu/xstate.h9
-rw-r--r--arch/x86/include/asm/init.h2
-rw-r--r--arch/x86/include/asm/intel_telemetry.h37
-rw-r--r--arch/x86/include/asm/irq_remapping.h17
-rw-r--r--arch/x86/include/asm/irqflags.h4
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h8
-rw-r--r--arch/x86/include/asm/kvm_host.h129
-rw-r--r--arch/x86/include/asm/module.h8
-rw-r--r--arch/x86/include/asm/mshyperv.h29
-rw-r--r--arch/x86/include/asm/msr-index.h8
-rw-r--r--arch/x86/include/asm/msr.h4
-rw-r--r--arch/x86/include/asm/mwait.h27
-rw-r--r--arch/x86/include/asm/nospec-branch.h37
-rw-r--r--arch/x86/include/asm/pgtable.h23
-rw-r--r--arch/x86/include/asm/pgtable_types.h3
-rw-r--r--arch/x86/include/asm/posted_intr.h83
-rw-r--r--arch/x86/include/asm/realmode.h2
-rw-r--r--arch/x86/include/asm/resctrl.h19
-rw-r--r--arch/x86/include/asm/set_memory.h2
-rw-r--r--arch/x86/include/asm/setup.h2
-rw-r--r--arch/x86/include/asm/sev.h38
-rw-r--r--arch/x86/include/asm/shared/tdx.h12
-rw-r--r--arch/x86/include/asm/sighandling.h22
-rw-r--r--arch/x86/include/asm/smp.h23
-rw-r--r--arch/x86/include/asm/special_insns.h29
-rw-r--r--arch/x86/include/asm/svm.h23
-rw-r--r--arch/x86/include/asm/syscall.h43
-rw-r--r--arch/x86/include/asm/tdx.h79
-rw-r--r--arch/x86/include/asm/tdx_global_metadata.h (renamed from arch/x86/virt/vmx/tdx/tdx_global_metadata.h)19
-rw-r--r--arch/x86/include/asm/trace/fpu.h15
-rw-r--r--arch/x86/include/asm/vmx.h2
-rw-r--r--arch/x86/include/uapi/asm/debugreg.h21
-rw-r--r--arch/x86/include/uapi/asm/kvm.h78
-rw-r--r--arch/x86/include/uapi/asm/setup_data.h13
-rw-r--r--arch/x86/include/uapi/asm/svm.h2
-rw-r--r--arch/x86/include/uapi/asm/vmx.h5
-rw-r--r--arch/x86/kernel/Makefile2
-rw-r--r--arch/x86/kernel/acpi/madt_wakeup.c2
-rw-r--r--arch/x86/kernel/alternative.c81
-rw-r--r--arch/x86/kernel/apic/apic_noop.c8
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c2
-rw-r--r--arch/x86/kernel/apic/io_apic.c2
-rw-r--r--arch/x86/kernel/apic/vector.c6
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c2
-rw-r--r--arch/x86/kernel/cpu/amd.c60
-rw-r--r--arch/x86/kernel/cpu/bugs.c581
-rw-r--r--arch/x86/kernel/cpu/common.c45
-rw-r--r--arch/x86/kernel/cpu/mce/amd.c28
-rw-r--r--arch/x86/kernel/cpu/mce/core.c24
-rw-r--r--arch/x86/kernel/cpu/mce/intel.c1
-rw-r--r--arch/x86/kernel/cpu/microcode/amd_shas.c112
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c14
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c2
-rw-r--r--arch/x86/kernel/cpu/resctrl/Makefile2
-rw-r--r--arch/x86/kernel/cpu/resctrl/core.c37
-rw-r--r--arch/x86/kernel/cpu/resctrl/ctrlmondata.c635
-rw-r--r--arch/x86/kernel/cpu/resctrl/internal.h397
-rw-r--r--arch/x86/kernel/cpu/resctrl/monitor.c918
-rw-r--r--arch/x86/kernel/cpu/resctrl/pseudo_lock.c1092
-rw-r--r--arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h (renamed from arch/x86/kernel/cpu/resctrl/trace.h)26
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c4164
-rw-r--r--arch/x86/kernel/cpu/scattered.c2
-rw-r--r--arch/x86/kernel/cpu/sgx/driver.h1
-rw-r--r--arch/x86/kernel/cpu/sgx/ioctl.c30
-rw-r--r--arch/x86/kernel/cpu/sgx/main.c2
-rw-r--r--arch/x86/kernel/crash.c26
-rw-r--r--arch/x86/kernel/e820.c18
-rw-r--r--arch/x86/kernel/fpu/core.c53
-rw-r--r--arch/x86/kernel/fpu/init.c1
-rw-r--r--arch/x86/kernel/fpu/xstate.c40
-rw-r--r--arch/x86/kernel/ioport.c13
-rw-r--r--arch/x86/kernel/irq.c63
-rw-r--r--arch/x86/kernel/itmt.c23
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c58
-rw-r--r--arch/x86/kernel/kgdb.c2
-rw-r--r--arch/x86/kernel/ksysfs.c8
-rw-r--r--arch/x86/kernel/kvm.c2
-rw-r--r--arch/x86/kernel/machine_kexec_64.c22
-rw-r--r--arch/x86/kernel/process.c42
-rw-r--r--arch/x86/kernel/process_32.c4
-rw-r--r--arch/x86/kernel/process_64.c8
-rw-r--r--arch/x86/kernel/ptrace.c22
-rw-r--r--arch/x86/kernel/setup.c46
-rw-r--r--arch/x86/kernel/signal_32.c4
-rw-r--r--arch/x86/kernel/signal_64.c4
-rw-r--r--arch/x86/kernel/smp.c24
-rw-r--r--arch/x86/kernel/smpboot.c61
-rw-r--r--arch/x86/kernel/traps.c36
-rw-r--r--arch/x86/kvm/Kconfig22
-rw-r--r--arch/x86/kvm/Makefile8
-rw-r--r--arch/x86/kvm/cpuid.c71
-rw-r--r--arch/x86/kvm/cpuid.h33
-rw-r--r--arch/x86/kvm/hyperv.c15
-rw-r--r--arch/x86/kvm/hyperv.h3
-rw-r--r--arch/x86/kvm/i8254.c90
-rw-r--r--arch/x86/kvm/i8254.h17
-rw-r--r--arch/x86/kvm/i8259.c17
-rw-r--r--arch/x86/kvm/ioapic.c62
-rw-r--r--arch/x86/kvm/ioapic.h26
-rw-r--r--arch/x86/kvm/irq.c563
-rw-r--r--arch/x86/kvm/irq.h35
-rw-r--r--arch/x86/kvm/irq_comm.c442
-rw-r--r--arch/x86/kvm/lapic.c147
-rw-r--r--arch/x86/kvm/lapic.h32
-rw-r--r--arch/x86/kvm/mmu.h6
-rw-r--r--arch/x86/kvm/mmu/mmu.c128
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h8
-rw-r--r--arch/x86/kvm/mmu/page_track.c3
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h8
-rw-r--r--arch/x86/kvm/mmu/spte.c68
-rw-r--r--arch/x86/kvm/mmu/spte.h11
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c68
-rw-r--r--arch/x86/kvm/reverse_cpuid.h7
-rw-r--r--arch/x86/kvm/smm.h3
-rw-r--r--arch/x86/kvm/svm/avic.c688
-rw-r--r--arch/x86/kvm/svm/nested.c164
-rw-r--r--arch/x86/kvm/svm/sev.c388
-rw-r--r--arch/x86/kvm/svm/svm.c667
-rw-r--r--arch/x86/kvm/svm/svm.h151
-rw-r--r--arch/x86/kvm/svm/vmenter.S6
-rw-r--r--arch/x86/kvm/trace.h99
-rw-r--r--arch/x86/kvm/vmx/capabilities.h1
-rw-r--r--arch/x86/kvm/vmx/common.h180
-rw-r--r--arch/x86/kvm/vmx/main.c1086
-rw-r--r--arch/x86/kvm/vmx/nested.c87
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c60
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.h28
-rw-r--r--arch/x86/kvm/vmx/posted_intr.c175
-rw-r--r--arch/x86/kvm/vmx/posted_intr.h13
-rw-r--r--arch/x86/kvm/vmx/run_flags.h10
-rw-r--r--arch/x86/kvm/vmx/tdx.c3643
-rw-r--r--arch/x86/kvm/vmx/tdx.h205
-rw-r--r--arch/x86/kvm/vmx/tdx_arch.h167
-rw-r--r--arch/x86/kvm/vmx/tdx_errno.h40
-rw-r--r--arch/x86/kvm/vmx/vmenter.S3
-rw-r--r--arch/x86/kvm/vmx/vmx.c604
-rw-r--r--arch/x86/kvm/vmx/vmx.h200
-rw-r--r--arch/x86/kvm/vmx/x86_ops.h41
-rw-r--r--arch/x86/kvm/x86.c539
-rw-r--r--arch/x86/kvm/x86.h89
-rw-r--r--arch/x86/kvm/xen.c20
-rw-r--r--arch/x86/lib/.gitignore4
-rw-r--r--arch/x86/lib/Makefile12
-rw-r--r--arch/x86/lib/cache-smp.c26
-rw-r--r--arch/x86/lib/crc-pclmul-consts.h195
-rw-r--r--arch/x86/lib/crc-pclmul-template.S582
-rw-r--r--arch/x86/lib/crc-pclmul-template.h76
-rw-r--r--arch/x86/lib/crc-t10dif.c40
-rw-r--r--arch/x86/lib/crc16-msb-pclmul.S6
-rw-r--r--arch/x86/lib/crc32-pclmul.S6
-rw-r--r--arch/x86/lib/crc32.c111
-rw-r--r--arch/x86/lib/crc32c-3way.S360
-rw-r--r--arch/x86/lib/crc64-pclmul.S7
-rw-r--r--arch/x86/lib/crc64.c50
-rw-r--r--arch/x86/lib/crypto/.gitignore2
-rw-r--r--arch/x86/lib/crypto/Kconfig34
-rw-r--r--arch/x86/lib/crypto/Makefile20
-rw-r--r--arch/x86/lib/crypto/blake2s-core.S252
-rw-r--r--arch/x86/lib/crypto/blake2s-glue.c70
-rw-r--r--arch/x86/lib/crypto/chacha-avx2-x86_64.S1021
-rw-r--r--arch/x86/lib/crypto/chacha-avx512vl-x86_64.S836
-rw-r--r--arch/x86/lib/crypto/chacha-ssse3-x86_64.S791
-rw-r--r--arch/x86/lib/crypto/chacha_glue.c196
-rw-r--r--arch/x86/lib/crypto/poly1305-x86_64-cryptogams.pl4253
-rw-r--r--arch/x86/lib/crypto/poly1305_glue.c129
-rw-r--r--arch/x86/lib/crypto/sha256-avx-asm.S499
-rw-r--r--arch/x86/lib/crypto/sha256-avx2-asm.S774
-rw-r--r--arch/x86/lib/crypto/sha256-ni-asm.S196
-rw-r--r--arch/x86/lib/crypto/sha256-ssse3-asm.S511
-rw-r--r--arch/x86/lib/crypto/sha256.c80
-rw-r--r--arch/x86/mm/dump_pagetables.c71
-rw-r--r--arch/x86/mm/extable.c5
-rw-r--r--arch/x86/mm/init_32.c3
-rw-r--r--arch/x86/mm/init_64.c20
-rw-r--r--arch/x86/mm/ioremap.c7
-rw-r--r--arch/x86/mm/pat/memtype.c194
-rw-r--r--arch/x86/mm/pat/memtype_interval.c63
-rw-r--r--arch/x86/mm/pat/set_memory.c16
-rw-r--r--arch/x86/mm/pgtable.c9
-rw-r--r--arch/x86/mm/pti.c9
-rw-r--r--arch/x86/net/bpf_jit_comp.c10
-rw-r--r--arch/x86/pci/Makefile6
-rw-r--r--arch/x86/pci/fixup.c4
-rw-r--r--arch/x86/pci/intel_mid.c (renamed from arch/x86/pci/intel_mid_pci.c)0
-rw-r--r--arch/x86/platform/ce4100/ce4100.c95
-rw-r--r--arch/x86/platform/efi/efi_64.c4
-rw-r--r--arch/x86/power/hibernate.c19
-rw-r--r--arch/x86/purgatory/Makefile2
-rw-r--r--arch/x86/purgatory/purgatory.c2
-rw-r--r--arch/x86/realmode/init.c2
-rw-r--r--arch/x86/tools/insn_decoder_test.c2
-rw-r--r--arch/x86/tools/insn_sanity.c4
-rw-r--r--arch/x86/um/asm/checksum.h3
-rw-r--r--arch/x86/um/asm/processor.h8
-rw-r--r--arch/x86/um/asm/syscall.h2
-rw-r--r--arch/x86/um/os-Linux/mcontext.c218
-rw-r--r--arch/x86/um/ptrace.c86
-rw-r--r--arch/x86/um/shared/sysdep/kernel-offsets.h2
-rw-r--r--arch/x86/um/shared/sysdep/mcontext.h9
-rw-r--r--arch/x86/um/shared/sysdep/ptrace.h12
-rw-r--r--arch/x86/um/shared/sysdep/stub-data.h23
-rw-r--r--arch/x86/um/shared/sysdep/stub.h2
-rw-r--r--arch/x86/um/shared/sysdep/stub_32.h13
-rw-r--r--arch/x86/um/shared/sysdep/stub_64.h17
-rw-r--r--arch/x86/um/shared/sysdep/syscalls.h6
-rw-r--r--arch/x86/um/shared/sysdep/syscalls_32.h14
-rw-r--r--arch/x86/um/shared/sysdep/syscalls_64.h28
-rw-r--r--arch/x86/um/tls_32.c28
-rw-r--r--arch/x86/virt/vmx/tdx/seamcall.S3
-rw-r--r--arch/x86/virt/vmx/tdx/tdx.c428
-rw-r--r--arch/x86/virt/vmx/tdx/tdx.h48
-rw-r--r--arch/x86/virt/vmx/tdx/tdx_global_metadata.c50
269 files changed, 12141 insertions, 26451 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 121f9f03bd5c..08e511657f05 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -38,6 +38,7 @@ config X86_64
select ARCH_HAS_ELFCORE_COMPAT
select ZONE_DMA32
select EXECMEM if DYNAMIC_FTRACE
+ select ACPI_MRRM if ACPI
config FORCE_DYNAMIC_FTRACE
def_bool y
@@ -74,13 +75,11 @@ config X86
select ARCH_ENABLE_SPLIT_PMD_PTLOCK if (PGTABLE_LEVELS > 2) && (X86_64 || X86_PAE)
select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE
select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
+ select ARCH_HAS_CPU_ATTACK_VECTORS if CPU_MITIGATIONS
select ARCH_HAS_CACHE_LINE_SIZE
select ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION
select ARCH_HAS_CPU_FINALIZE_INIT
select ARCH_HAS_CPU_PASID if IOMMU_SVA
- select ARCH_HAS_CRC32
- select ARCH_HAS_CRC64 if X86_64
- select ARCH_HAS_CRC_T10DIF
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VIRTUAL
select ARCH_HAS_DEBUG_VM_PGTABLE if !X86_PAE
@@ -88,7 +87,7 @@ config X86
select ARCH_HAS_DMA_OPS if GART_IOMMU || XEN
select ARCH_HAS_EARLY_DEBUG if KGDB
select ARCH_HAS_ELF_RANDOMIZE
- select ARCH_HAS_EXECMEM_ROX if X86_64
+ select ARCH_HAS_EXECMEM_ROX if X86_64 && STRICT_MODULE_RWX
select ARCH_HAS_FAST_MULTIPLIER
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
@@ -146,7 +145,7 @@ config X86
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
select ARCH_WANTS_NO_INSTR
select ARCH_WANT_GENERAL_HUGETLB
- select ARCH_WANT_HUGE_PMD_SHARE
+ select ARCH_WANT_HUGE_PMD_SHARE if X86_64
select ARCH_WANT_LD_ORPHAN_WARN
select ARCH_WANT_OPTIMIZE_DAX_VMEMMAP if X86_64
select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP if X86_64
@@ -203,13 +202,13 @@ config X86
select HAVE_ARCH_KFENCE
select HAVE_ARCH_KMSAN if X86_64
select HAVE_ARCH_KGDB
+ select HAVE_ARCH_KSTACK_ERASE
select HAVE_ARCH_MMAP_RND_BITS if MMU
select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT
select HAVE_ARCH_COMPAT_MMAP_BASES if MMU && COMPAT
select HAVE_ARCH_PREL32_RELOCATIONS
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_THREAD_STRUCT_WHITELIST
- select HAVE_ARCH_STACKLEAK
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64
@@ -243,7 +242,6 @@ config X86
select HAVE_GUP_FAST
select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE
select HAVE_FTRACE_GRAPH_FUNC if HAVE_FUNCTION_GRAPH_TRACER
- select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_GRAPH_FREGS if HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_GRAPH_TRACER if X86_32 || (X86_64 && DYNAMIC_FTRACE)
select HAVE_FUNCTION_TRACER
@@ -507,8 +505,9 @@ config X86_MPPARSE
config X86_CPU_RESCTRL
bool "x86 CPU resource control support"
depends on X86 && (CPU_SUP_INTEL || CPU_SUP_AMD)
- select KERNFS
- select PROC_CPU_RESCTRL if PROC_FS
+ depends on MISC_FILESYSTEMS
+ select ARCH_HAS_CPU_RESCTRL
+ select RESCTRL_FS
select RESCTRL_FS_PSEUDO_LOCK
help
Enable x86 CPU resource control support.
@@ -526,12 +525,6 @@ config X86_CPU_RESCTRL
Say N if unsure.
-config RESCTRL_FS_PSEUDO_LOCK
- bool
- help
- Software mechanism to pin data in a cache portion using
- micro-architecture specific knowledge.
-
config X86_FRED
bool "Flexible Return and Event Delivery"
depends on X86_64
@@ -1862,8 +1855,7 @@ endchoice
config X86_SGX
bool "Software Guard eXtensions (SGX)"
depends on X86_64 && CPU_SUP_INTEL && X86_X2APIC
- depends on CRYPTO=y
- depends on CRYPTO_SHA256=y
+ select CRYPTO_LIB_SHA256
select MMU_NOTIFIER
select NUMA_KEEP_MEMINFO if NUMA
select XARRAY_MULTI
@@ -2010,6 +2002,9 @@ config ARCH_SUPPORTS_KEXEC_BZIMAGE_VERIFY_SIG
config ARCH_SUPPORTS_KEXEC_JUMP
def_bool y
+config ARCH_SUPPORTS_KEXEC_HANDOVER
+ def_bool X86_64
+
config ARCH_SUPPORTS_CRASH_DUMP
def_bool X86_64 || (X86_32 && HIGHMEM)
@@ -2697,6 +2692,15 @@ config MITIGATION_ITS
disabled, mitigation cannot be enabled via cmdline.
See <file:Documentation/admin-guide/hw-vuln/indirect-target-selection.rst>
+config MITIGATION_TSA
+ bool "Mitigate Transient Scheduler Attacks"
+ depends on CPU_SUP_AMD
+ default y
+ help
+ Enable mitigation for Transient Scheduler Attacks. TSA is a hardware
+ security vulnerability on AMD CPUs which can lead to forwarding of
+ invalid info to subsequent instructions and thus can affect their
+ timing and thereby cause a leakage.
endif
config ARCH_HAS_ADD_PAGES
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 640fcac3af74..3f9fb3698d66 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -71,7 +71,7 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
SETUP_OBJS = $(addprefix $(obj)/,$(setup-y))
-sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [a-zA-Z] \(startup_32\|efi.._stub_entry\|efi\(32\)\?_pe_entry\|input_data\|kernel_info\|_end\|_ehead\|_text\|_e\?data\|z_.*\)$$/\#define ZO_\2 0x\1/p'
+sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [a-zA-Z] \(startup_32\|efi.._stub_entry\|efi\(32\)\?_pe_entry\|input_data\|kernel_info\|_end\|_ehead\|_text\|_e\?data\|_e\?sbat\|z_.*\)$$/\#define ZO_\2 0x\1/p'
quiet_cmd_zoffset = ZOFFSET $@
cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index f4f7b22d8113..3a38fdcdb9bd 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -106,6 +106,11 @@ vmlinux-objs-$(CONFIG_UNACCEPTED_MEMORY) += $(obj)/mem.o
vmlinux-objs-$(CONFIG_EFI) += $(obj)/efi.o
vmlinux-libs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a
vmlinux-libs-$(CONFIG_X86_64) += $(objtree)/arch/x86/boot/startup/lib.a
+vmlinux-objs-$(CONFIG_EFI_SBAT) += $(obj)/sbat.o
+
+ifdef CONFIG_EFI_SBAT
+$(obj)/sbat.o: $(CONFIG_EFI_SBAT_FILE)
+endif
$(obj)/vmlinux: $(vmlinux-objs-y) $(vmlinux-libs-y) FORCE
$(call if_changed,ld)
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index f03d59ea6e40..3b0948ad449f 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -760,6 +760,49 @@ static void process_e820_entries(unsigned long minimum,
}
}
+/*
+ * If KHO is active, only process its scratch areas to ensure we are not
+ * stepping onto preserved memory.
+ */
+static bool process_kho_entries(unsigned long minimum, unsigned long image_size)
+{
+ struct kho_scratch *kho_scratch;
+ struct setup_data *ptr;
+ struct kho_data *kho;
+ int i, nr_areas = 0;
+
+ if (!IS_ENABLED(CONFIG_KEXEC_HANDOVER))
+ return false;
+
+ ptr = (struct setup_data *)(unsigned long)boot_params_ptr->hdr.setup_data;
+ while (ptr) {
+ if (ptr->type == SETUP_KEXEC_KHO) {
+ kho = (struct kho_data *)(unsigned long)ptr->data;
+ kho_scratch = (void *)(unsigned long)kho->scratch_addr;
+ nr_areas = kho->scratch_size / sizeof(*kho_scratch);
+ break;
+ }
+
+ ptr = (struct setup_data *)(unsigned long)ptr->next;
+ }
+
+ if (!nr_areas)
+ return false;
+
+ for (i = 0; i < nr_areas; i++) {
+ struct kho_scratch *area = &kho_scratch[i];
+ struct mem_vector region = {
+ .start = area->addr,
+ .size = area->size,
+ };
+
+ if (process_mem_region(&region, minimum, image_size))
+ break;
+ }
+
+ return true;
+}
+
static unsigned long find_random_phys_addr(unsigned long minimum,
unsigned long image_size)
{
@@ -775,7 +818,12 @@ static unsigned long find_random_phys_addr(unsigned long minimum,
return 0;
}
- if (!process_efi_entries(minimum, image_size))
+ /*
+ * During kexec handover only process KHO scratch areas that are known
+ * not to contain any data that must be preserved.
+ */
+ if (!process_kho_entries(minimum, image_size) &&
+ !process_efi_entries(minimum, image_size))
process_e820_entries(minimum, image_size);
phys_addr = slots_fetch_random();
diff --git a/arch/x86/boot/compressed/sbat.S b/arch/x86/boot/compressed/sbat.S
new file mode 100644
index 000000000000..838f70a997dd
--- /dev/null
+++ b/arch/x86/boot/compressed/sbat.S
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Embed SBAT data in the kernel.
+ */
+ .pushsection ".sbat", "a", @progbits
+ .incbin CONFIG_EFI_SBAT_FILE
+ .popsection
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index 3b2bc61c9408..587ce3e7c504 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -43,6 +43,14 @@ SECTIONS
*(.rodata.*)
_erodata = . ;
}
+#ifdef CONFIG_EFI_SBAT
+ .sbat : ALIGN(0x1000) {
+ _sbat = . ;
+ *(.sbat)
+ _esbat = ALIGN(0x1000);
+ . = _esbat;
+ }
+#endif
.data : ALIGN(0x1000) {
_data = . ;
*(.data)
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index e30649e44d8f..9bea5a1e2c52 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -43,7 +43,7 @@ SYSSEG = 0x1000 /* historical load address >> 4 */
.section ".bstext", "ax"
#ifdef CONFIG_EFI_STUB
# "MZ", MS-DOS header
- .word MZ_MAGIC
+ .word IMAGE_DOS_SIGNATURE
.org 0x38
#
# Offset to the PE header.
@@ -51,16 +51,16 @@ SYSSEG = 0x1000 /* historical load address >> 4 */
.long LINUX_PE_MAGIC
.long pe_header
pe_header:
- .long PE_MAGIC
+ .long IMAGE_NT_SIGNATURE
coff_header:
#ifdef CONFIG_X86_32
.set image_file_add_flags, IMAGE_FILE_32BIT_MACHINE
- .set pe_opt_magic, PE_OPT_MAGIC_PE32
+ .set pe_opt_magic, IMAGE_NT_OPTIONAL_HDR32_MAGIC
.word IMAGE_FILE_MACHINE_I386
#else
.set image_file_add_flags, 0
- .set pe_opt_magic, PE_OPT_MAGIC_PE32PLUS
+ .set pe_opt_magic, IMAGE_NT_OPTIONAL_HDR64_MAGIC
.word IMAGE_FILE_MACHINE_AMD64
#endif
.word section_count # nr_sections
@@ -111,7 +111,7 @@ extra_header_fields:
.long salign # SizeOfHeaders
.long 0 # CheckSum
.word IMAGE_SUBSYSTEM_EFI_APPLICATION # Subsystem (EFI application)
- .word IMAGE_DLL_CHARACTERISTICS_NX_COMPAT # DllCharacteristics
+ .word IMAGE_DLLCHARACTERISTICS_NX_COMPAT # DllCharacteristics
#ifdef CONFIG_X86_32
.long 0 # SizeOfStackReserve
.long 0 # SizeOfStackCommit
@@ -179,15 +179,11 @@ pecompat_fstart:
#else
.set pecompat_fstart, setup_size
#endif
- .ascii ".text"
- .byte 0
- .byte 0
- .byte 0
- .long ZO__data
- .long setup_size
- .long ZO__data # Size of initialized data
- # on disk
- .long setup_size
+ .ascii ".text\0\0\0"
+ .long textsize # VirtualSize
+ .long setup_size # VirtualAddress
+ .long textsize # SizeOfRawData
+ .long setup_size # PointerToRawData
.long 0 # PointerToRelocations
.long 0 # PointerToLineNumbers
.word 0 # NumberOfRelocations
@@ -196,6 +192,23 @@ pecompat_fstart:
IMAGE_SCN_MEM_READ | \
IMAGE_SCN_MEM_EXECUTE # Characteristics
+#ifdef CONFIG_EFI_SBAT
+ .ascii ".sbat\0\0\0"
+ .long ZO__esbat - ZO__sbat # VirtualSize
+ .long setup_size + ZO__sbat # VirtualAddress
+ .long ZO__esbat - ZO__sbat # SizeOfRawData
+ .long setup_size + ZO__sbat # PointerToRawData
+
+ .long 0, 0, 0
+ .long IMAGE_SCN_CNT_INITIALIZED_DATA | \
+ IMAGE_SCN_MEM_READ | \
+ IMAGE_SCN_MEM_DISCARDABLE # Characteristics
+
+ .set textsize, ZO__sbat
+#else
+ .set textsize, ZO__data
+#endif
+
.ascii ".data\0\0\0"
.long ZO__end - ZO__data # VirtualSize
.long setup_size + ZO__data # VirtualAddress
diff --git a/arch/x86/coco/sev/Makefile b/arch/x86/coco/sev/Makefile
index db3255b979bd..342d79f0ab6a 100644
--- a/arch/x86/coco/sev/Makefile
+++ b/arch/x86/coco/sev/Makefile
@@ -5,5 +5,6 @@ obj-y += core.o sev-nmi.o vc-handle.o
# Clang 14 and older may fail to respect __no_sanitize_undefined when inlining
UBSAN_SANITIZE_sev-nmi.o := n
-# GCC may fail to respect __no_sanitize_address when inlining
+# GCC may fail to respect __no_sanitize_address or __no_kcsan when inlining
KASAN_SANITIZE_sev-nmi.o := n
+KCSAN_SANITIZE_sev-nmi.o := n
diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
index b2569257acd3..fc59ce78c477 100644
--- a/arch/x86/coco/sev/core.c
+++ b/arch/x86/coco/sev/core.c
@@ -88,7 +88,7 @@ static const char * const sev_status_feat_names[] = {
*/
static u64 snp_tsc_scale __ro_after_init;
static u64 snp_tsc_offset __ro_after_init;
-static u64 snp_tsc_freq_khz __ro_after_init;
+static unsigned long snp_tsc_freq_khz __ro_after_init;
DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data);
DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa);
@@ -869,12 +869,12 @@ static void *snp_alloc_vmsa_page(int cpu)
return page_address(p + 1);
}
-static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip)
+static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip, unsigned int cpu)
{
struct sev_es_save_area *cur_vmsa, *vmsa;
struct svsm_ca *caa;
u8 sipi_vector;
- int cpu, ret;
+ int ret;
u64 cr4;
/*
@@ -895,15 +895,6 @@ static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip)
/* Override start_ip with known protected guest start IP */
start_ip = real_mode_header->sev_es_trampoline_start;
-
- /* Find the logical CPU for the APIC ID */
- for_each_present_cpu(cpu) {
- if (arch_match_cpu_phys_id(cpu, apic_id))
- break;
- }
- if (cpu >= nr_cpu_ids)
- return -EINVAL;
-
cur_vmsa = per_cpu(sev_vmsa, cpu);
/*
@@ -1054,11 +1045,13 @@ int __init sev_es_setup_ap_jump_table(struct real_mode_header *rmh)
* This is needed by the OVMF UEFI firmware which will use whatever it finds in
* the GHCB MSR as its GHCB to talk to the hypervisor. So make sure the per-cpu
* runtime GHCBs used by the kernel are also mapped in the EFI page-table.
+ *
+ * When running under SVSM the CA page is needed too, so map it as well.
*/
-int __init sev_es_efi_map_ghcbs(pgd_t *pgd)
+int __init sev_es_efi_map_ghcbs_cas(pgd_t *pgd)
{
+ unsigned long address, pflags, pflags_enc;
struct sev_es_runtime_data *data;
- unsigned long address, pflags;
int cpu;
u64 pfn;
@@ -1066,6 +1059,7 @@ int __init sev_es_efi_map_ghcbs(pgd_t *pgd)
return 0;
pflags = _PAGE_NX | _PAGE_RW;
+ pflags_enc = cc_mkenc(pflags);
for_each_possible_cpu(cpu) {
data = per_cpu(runtime_data, cpu);
@@ -1075,6 +1069,16 @@ int __init sev_es_efi_map_ghcbs(pgd_t *pgd)
if (kernel_map_pages_in_pgd(pgd, pfn, address, 1, pflags))
return 1;
+
+ if (snp_vmpl) {
+ address = per_cpu(svsm_caa_pa, cpu);
+ if (!address)
+ return 1;
+
+ pfn = address >> PAGE_SHIFT;
+ if (kernel_map_pages_in_pgd(pgd, pfn, address, 1, pflags_enc))
+ return 1;
+ }
}
return 0;
@@ -1398,16 +1402,16 @@ int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call,
}
EXPORT_SYMBOL_GPL(snp_issue_svsm_attest_req);
-static int snp_issue_guest_request(struct snp_guest_req *req, struct snp_req_data *input,
- struct snp_guest_request_ioctl *rio)
+static int snp_issue_guest_request(struct snp_guest_req *req)
{
+ struct snp_req_data *input = &req->input;
struct ghcb_state state;
struct es_em_ctxt ctxt;
unsigned long flags;
struct ghcb *ghcb;
int ret;
- rio->exitinfo2 = SEV_RET_NO_FW_CALL;
+ req->exitinfo2 = SEV_RET_NO_FW_CALL;
/*
* __sev_get_ghcb() needs to run with IRQs disabled because it is using
@@ -1432,8 +1436,8 @@ static int snp_issue_guest_request(struct snp_guest_req *req, struct snp_req_dat
if (ret)
goto e_put;
- rio->exitinfo2 = ghcb->save.sw_exit_info_2;
- switch (rio->exitinfo2) {
+ req->exitinfo2 = ghcb->save.sw_exit_info_2;
+ switch (req->exitinfo2) {
case 0:
break;
@@ -1462,11 +1466,74 @@ e_restore_irq:
return ret;
}
+/**
+ * snp_svsm_vtpm_probe() - Probe if SVSM provides a vTPM device
+ *
+ * Check that there is SVSM and that it supports at least TPM_SEND_COMMAND
+ * which is the only request used so far.
+ *
+ * Return: true if the platform provides a vTPM SVSM device, false otherwise.
+ */
+static bool snp_svsm_vtpm_probe(void)
+{
+ struct svsm_call call = {};
+
+ /* The vTPM device is available only if a SVSM is present */
+ if (!snp_vmpl)
+ return false;
+
+ call.caa = svsm_get_caa();
+ call.rax = SVSM_VTPM_CALL(SVSM_VTPM_QUERY);
+
+ if (svsm_perform_call_protocol(&call))
+ return false;
+
+ /* Check platform commands contains TPM_SEND_COMMAND - platform command 8 */
+ return call.rcx_out & BIT_ULL(8);
+}
+
+/**
+ * snp_svsm_vtpm_send_command() - Execute a vTPM operation on SVSM
+ * @buffer: A buffer used to both send the command and receive the response.
+ *
+ * Execute a SVSM_VTPM_CMD call as defined by
+ * "Secure VM Service Module for SEV-SNP Guests" Publication # 58019 Revision: 1.00
+ *
+ * All command request/response buffers have a common structure as specified by
+ * the following table:
+ * Byte Size     In/Out    Description
+ * Offset    (Bytes)
+ * 0x000     4          In        Platform command
+ *                         Out       Platform command response size
+ *
+ * Each command can build upon this common request/response structure to create
+ * a structure specific to the command. See include/linux/tpm_svsm.h for more
+ * details.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int snp_svsm_vtpm_send_command(u8 *buffer)
+{
+ struct svsm_call call = {};
+
+ call.caa = svsm_get_caa();
+ call.rax = SVSM_VTPM_CALL(SVSM_VTPM_CMD);
+ call.rcx = __pa(buffer);
+
+ return svsm_perform_call_protocol(&call);
+}
+EXPORT_SYMBOL_GPL(snp_svsm_vtpm_send_command);
+
static struct platform_device sev_guest_device = {
.name = "sev-guest",
.id = -1,
};
+static struct platform_device tpm_svsm_device = {
+ .name = "tpm-svsm",
+ .id = -1,
+};
+
static int __init snp_init_platform_device(void)
{
if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
@@ -1475,7 +1542,11 @@ static int __init snp_init_platform_device(void)
if (platform_device_register(&sev_guest_device))
return -ENODEV;
- pr_info("SNP guest platform device initialized.\n");
+ if (snp_svsm_vtpm_probe() &&
+ platform_device_register(&tpm_svsm_device))
+ return -ENODEV;
+
+ pr_info("SNP guest platform devices initialized.\n");
return 0;
}
device_initcall(snp_init_platform_device);
@@ -1861,8 +1932,7 @@ static int enc_payload(struct snp_msg_desc *mdesc, u64 seqno, struct snp_guest_r
return 0;
}
-static int __handle_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req,
- struct snp_guest_request_ioctl *rio)
+static int __handle_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req)
{
unsigned long req_start = jiffies;
unsigned int override_npages = 0;
@@ -1876,7 +1946,7 @@ retry_request:
* sequence number must be incremented or the VMPCK must be deleted to
* prevent reuse of the IV.
*/
- rc = snp_issue_guest_request(req, &req->input, rio);
+ rc = snp_issue_guest_request(req);
switch (rc) {
case -ENOSPC:
/*
@@ -1929,7 +1999,7 @@ retry_request:
snp_inc_msg_seqno(mdesc);
if (override_err) {
- rio->exitinfo2 = override_err;
+ req->exitinfo2 = override_err;
/*
* If an extended guest request was issued and the supplied certificate
@@ -1947,12 +2017,20 @@ retry_request:
return rc;
}
-int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req,
- struct snp_guest_request_ioctl *rio)
+int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req)
{
u64 seqno;
int rc;
+ /*
+ * enc_payload() calls aesgcm_encrypt(), which can potentially offload to HW.
+ * The offload's DMA SG list of data to encrypt has to be in linear mapping.
+ */
+ if (!virt_addr_valid(req->req_buf) || !virt_addr_valid(req->resp_buf)) {
+ pr_warn("AES-GSM buffers must be in linear mapping");
+ return -EINVAL;
+ }
+
guard(mutex)(&snp_cmd_mutex);
/* Check if the VMPCK is not empty */
@@ -1985,14 +2063,14 @@ int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req
req->input.resp_gpa = __pa(mdesc->response);
req->input.data_gpa = req->certs_data ? __pa(req->certs_data) : 0;
- rc = __handle_guest_request(mdesc, req, rio);
+ rc = __handle_guest_request(mdesc, req);
if (rc) {
if (rc == -EIO &&
- rio->exitinfo2 == SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN))
+ req->exitinfo2 == SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN))
return rc;
pr_alert("Detected error from ASP request. rc: %d, exitinfo2: 0x%llx\n",
- rc, rio->exitinfo2);
+ rc, req->exitinfo2);
snp_disable_vmpck(mdesc);
return rc;
@@ -2011,11 +2089,10 @@ EXPORT_SYMBOL_GPL(snp_send_guest_request);
static int __init snp_get_tsc_info(void)
{
- struct snp_guest_request_ioctl *rio;
struct snp_tsc_info_resp *tsc_resp;
struct snp_tsc_info_req *tsc_req;
struct snp_msg_desc *mdesc;
- struct snp_guest_req *req;
+ struct snp_guest_req req = {};
int rc = -ENOMEM;
tsc_req = kzalloc(sizeof(*tsc_req), GFP_KERNEL);
@@ -2031,32 +2108,24 @@ static int __init snp_get_tsc_info(void)
if (!tsc_resp)
goto e_free_tsc_req;
- req = kzalloc(sizeof(*req), GFP_KERNEL);
- if (!req)
- goto e_free_tsc_resp;
-
- rio = kzalloc(sizeof(*rio), GFP_KERNEL);
- if (!rio)
- goto e_free_req;
-
mdesc = snp_msg_alloc();
if (IS_ERR_OR_NULL(mdesc))
- goto e_free_rio;
+ goto e_free_tsc_resp;
rc = snp_msg_init(mdesc, snp_vmpl);
if (rc)
goto e_free_mdesc;
- req->msg_version = MSG_HDR_VER;
- req->msg_type = SNP_MSG_TSC_INFO_REQ;
- req->vmpck_id = snp_vmpl;
- req->req_buf = tsc_req;
- req->req_sz = sizeof(*tsc_req);
- req->resp_buf = (void *)tsc_resp;
- req->resp_sz = sizeof(*tsc_resp) + AUTHTAG_LEN;
- req->exit_code = SVM_VMGEXIT_GUEST_REQUEST;
+ req.msg_version = MSG_HDR_VER;
+ req.msg_type = SNP_MSG_TSC_INFO_REQ;
+ req.vmpck_id = snp_vmpl;
+ req.req_buf = tsc_req;
+ req.req_sz = sizeof(*tsc_req);
+ req.resp_buf = (void *)tsc_resp;
+ req.resp_sz = sizeof(*tsc_resp) + AUTHTAG_LEN;
+ req.exit_code = SVM_VMGEXIT_GUEST_REQUEST;
- rc = snp_send_guest_request(mdesc, req, rio);
+ rc = snp_send_guest_request(mdesc, &req);
if (rc)
goto e_request;
@@ -2077,11 +2146,7 @@ e_request:
memzero_explicit(tsc_resp, sizeof(*tsc_resp) + AUTHTAG_LEN);
e_free_mdesc:
snp_msg_free(mdesc);
-e_free_rio:
- kfree(rio);
-e_free_req:
- kfree(req);
- e_free_tsc_resp:
+e_free_tsc_resp:
kfree(tsc_resp);
e_free_tsc_req:
kfree(tsc_req);
@@ -2109,15 +2174,31 @@ static unsigned long securetsc_get_tsc_khz(void)
void __init snp_secure_tsc_init(void)
{
- unsigned long long tsc_freq_mhz;
+ struct snp_secrets_page *secrets;
+ unsigned long tsc_freq_mhz;
+ void *mem;
if (!cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC))
return;
+ mem = early_memremap_encrypted(sev_secrets_pa, PAGE_SIZE);
+ if (!mem) {
+ pr_err("Unable to get TSC_FACTOR: failed to map the SNP secrets page.\n");
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SECURE_TSC);
+ }
+
+ secrets = (__force struct snp_secrets_page *)mem;
+
setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
rdmsrq(MSR_AMD64_GUEST_TSC_FREQ, tsc_freq_mhz);
- snp_tsc_freq_khz = (unsigned long)(tsc_freq_mhz * 1000);
+
+ /* Extract the GUEST TSC MHZ from BIT[17:0], rest is reserved space */
+ tsc_freq_mhz &= GENMASK_ULL(17, 0);
+
+ snp_tsc_freq_khz = SNP_SCALE_TSC_FREQ(tsc_freq_mhz * 1000, secrets->tsc_factor);
x86_platform.calibrate_cpu = securetsc_get_tsc_khz;
x86_platform.calibrate_tsc = securetsc_get_tsc_khz;
+
+ early_memunmap(mem, PAGE_SIZE);
}
diff --git a/arch/x86/coco/sev/vc-handle.c b/arch/x86/coco/sev/vc-handle.c
index 0989d98da130..faf1fce89ed4 100644
--- a/arch/x86/coco/sev/vc-handle.c
+++ b/arch/x86/coco/sev/vc-handle.c
@@ -17,6 +17,7 @@
#include <linux/mm.h>
#include <linux/io.h>
#include <linux/psp-sev.h>
+#include <linux/efi.h>
#include <uapi/linux/sev-guest.h>
#include <asm/init.h>
@@ -178,9 +179,15 @@ static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt)
return ES_OK;
}
+/*
+ * User instruction decoding is also required for the EFI runtime. Even though
+ * the EFI runtime is running in kernel mode, it uses special EFI virtual
+ * address mappings that require the use of efi_mm to properly address and
+ * decode.
+ */
static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
{
- if (user_mode(ctxt->regs))
+ if (user_mode(ctxt->regs) || mm_is_efi(current->active_mm))
return __vc_decode_user_insn(ctxt);
else
return __vc_decode_kern_insn(ctxt);
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index edab6d6049be..7b2833705d47 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -36,6 +36,7 @@
/* TDX Module call error codes */
#define TDCALL_RETURN_CODE(a) ((a) >> 32)
#define TDCALL_INVALID_OPERAND 0xc0000100
+#define TDCALL_OPERAND_BUSY 0x80000200
#define TDREPORT_SUBTYPE_0 0
@@ -109,12 +110,13 @@ static inline u64 tdg_vm_wr(u64 field, u64 value, u64 mask)
* REPORTDATA to be included into TDREPORT.
* @tdreport: Address of the output buffer to store TDREPORT.
*
- * Refer to section titled "TDG.MR.REPORT leaf" in the TDX Module
- * v1.0 specification for more information on TDG.MR.REPORT TDCALL.
+ * Refer to section titled "TDG.MR.REPORT leaf" in the TDX Module v1.0
+ * specification for more information on TDG.MR.REPORT TDCALL.
+ *
* It is used in the TDX guest driver module to get the TDREPORT0.
*
- * Return 0 on success, -EINVAL for invalid operands, or -EIO on
- * other TDCALL failures.
+ * Return 0 on success, -ENXIO for invalid operands, -EBUSY for busy operation,
+ * or -EIO on other TDCALL failures.
*/
int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport)
{
@@ -128,7 +130,9 @@ int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport)
ret = __tdcall(TDG_MR_REPORT, &args);
if (ret) {
if (TDCALL_RETURN_CODE(ret) == TDCALL_INVALID_OPERAND)
- return -EINVAL;
+ return -ENXIO;
+ else if (TDCALL_RETURN_CODE(ret) == TDCALL_OPERAND_BUSY)
+ return -EBUSY;
return -EIO;
}
@@ -137,6 +141,42 @@ int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport)
EXPORT_SYMBOL_GPL(tdx_mcall_get_report0);
/**
+ * tdx_mcall_extend_rtmr() - Wrapper to extend RTMR registers using
+ * TDG.MR.RTMR.EXTEND TDCALL.
+ * @index: Index of RTMR register to be extended.
+ * @data: Address of the input buffer with RTMR register extend data.
+ *
+ * Refer to section titled "TDG.MR.RTMR.EXTEND leaf" in the TDX Module v1.0
+ * specification for more information on TDG.MR.RTMR.EXTEND TDCALL.
+ *
+ * It is used in the TDX guest driver module to allow user to extend the RTMR
+ * registers.
+ *
+ * Return 0 on success, -ENXIO for invalid operands, -EBUSY for busy operation,
+ * or -EIO on other TDCALL failures.
+ */
+int tdx_mcall_extend_rtmr(u8 index, u8 *data)
+{
+ struct tdx_module_args args = {
+ .rcx = virt_to_phys(data),
+ .rdx = index,
+ };
+ u64 ret;
+
+ ret = __tdcall(TDG_MR_RTMR_EXTEND, &args);
+ if (ret) {
+ if (TDCALL_RETURN_CODE(ret) == TDCALL_INVALID_OPERAND)
+ return -ENXIO;
+ if (TDCALL_RETURN_CODE(ret) == TDCALL_OPERAND_BUSY)
+ return -EBUSY;
+ return -EIO;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(tdx_mcall_extend_rtmr);
+
+/**
* tdx_hcall_get_quote() - Wrapper to request TD Quote using GetQuote
* hypercall.
* @buf: Address of the directly mapped shared kernel buffer which
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 7cd2f395f301..79fa38ca954d 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -27,10 +27,12 @@ CONFIG_CGROUP_DEBUG=y
CONFIG_BLK_DEV_INITRD=y
CONFIG_KALLSYMS_ALL=y
CONFIG_PROFILING=y
+CONFIG_KEXEC=y
+# Do not remove this as it results in non-bootable kernels
+# CONFIG_64BIT is not set
CONFIG_SMP=y
CONFIG_HYPERVISOR_GUEST=y
CONFIG_PARAVIRT=y
-CONFIG_NR_CPUS=8
CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y
CONFIG_X86_MSR=y
CONFIG_X86_CPUID=y
@@ -39,9 +41,6 @@ CONFIG_X86_CHECK_BIOS_CORRUPTION=y
CONFIG_EFI=y
CONFIG_EFI_STUB=y
CONFIG_HZ_1000=y
-CONFIG_KEXEC=y
-CONFIG_CRASH_DUMP=y
-# CONFIG_MITIGATION_RETHUNK is not set
CONFIG_HIBERNATION=y
CONFIG_PM_DEBUG=y
CONFIG_PM_TRACE_RTC=y
@@ -52,7 +51,6 @@ CONFIG_CPU_FREQ_GOV_ONDEMAND=y
CONFIG_X86_ACPI_CPUFREQ=y
CONFIG_KPROBES=y
CONFIG_JUMP_LABEL=y
-CONFIG_COMPAT_32BIT_TIME=y
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
CONFIG_MODULE_FORCE_UNLOAD=y
@@ -63,9 +61,7 @@ CONFIG_BINFMT_MISC=y
# CONFIG_COMPAT_BRK is not set
CONFIG_NET=y
CONFIG_PACKET=y
-CONFIG_UNIX=y
CONFIG_XFRM_USER=y
-CONFIG_INET=y
CONFIG_IP_MULTICAST=y
CONFIG_IP_ADVANCED_ROUTER=y
CONFIG_IP_MULTIPLE_TABLES=y
@@ -134,7 +130,6 @@ CONFIG_DEVTMPFS=y
CONFIG_DEVTMPFS_MOUNT=y
CONFIG_DEBUG_DEVRES=y
CONFIG_CONNECTOR=y
-CONFIG_EFI_CAPSULE_LOADER=y
CONFIG_BLK_DEV_LOOP=y
CONFIG_VIRTIO_BLK=y
CONFIG_BLK_DEV_SD=y
@@ -210,7 +205,6 @@ CONFIG_SND_HDA_INTEL=y
CONFIG_SND_HDA_HWDEP=y
CONFIG_HIDRAW=y
CONFIG_HID_GYRATION=y
-CONFIG_LOGITECH_FF=y
CONFIG_HID_NTRIG=y
CONFIG_HID_PANTHERLORD=y
CONFIG_PANTHERLORD_FF=y
@@ -241,7 +235,6 @@ CONFIG_EXT4_FS_POSIX_ACL=y
CONFIG_EXT4_FS_SECURITY=y
CONFIG_QUOTA=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
-# CONFIG_PRINT_QUOTA_WARNING is not set
CONFIG_QFMT_V2=y
CONFIG_AUTOFS_FS=y
CONFIG_ISO9660_FS=y
@@ -266,19 +259,13 @@ CONFIG_SECURITY=y
CONFIG_SECURITY_NETWORK=y
CONFIG_SECURITY_SELINUX=y
CONFIG_SECURITY_SELINUX_BOOTPARAM=y
-CONFIG_SECURITY_SELINUX_DISABLE=y
CONFIG_PRINTK_TIME=y
CONFIG_DEBUG_KERNEL=y
-CONFIG_FRAME_WARN=1024
CONFIG_MAGIC_SYSRQ=y
-CONFIG_DEBUG_WX=y
CONFIG_DEBUG_STACK_USAGE=y
-# CONFIG_SCHED_DEBUG is not set
CONFIG_SCHEDSTATS=y
CONFIG_BLK_DEV_IO_TRACE=y
CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
CONFIG_EARLY_PRINTK_DBGP=y
CONFIG_DEBUG_BOOT_PARAMS=y
-CONFIG_UNWINDER_FRAME_POINTER=y
CONFIG_DEBUG_ENTRY=y
-# CONFIG_64BIT is not set
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 61e25f6209ed..7d7310cdf8b0 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -27,6 +27,7 @@ CONFIG_CGROUP_DEBUG=y
CONFIG_BLK_DEV_INITRD=y
CONFIG_KALLSYMS_ALL=y
CONFIG_PROFILING=y
+CONFIG_KEXEC=y
CONFIG_SMP=y
CONFIG_HYPERVISOR_GUEST=y
CONFIG_PARAVIRT=y
@@ -40,8 +41,6 @@ CONFIG_EFI=y
CONFIG_EFI_STUB=y
CONFIG_EFI_MIXED=y
CONFIG_HZ_1000=y
-CONFIG_KEXEC=y
-CONFIG_CRASH_DUMP=y
CONFIG_HIBERNATION=y
CONFIG_PM_DEBUG=y
CONFIG_PM_TRACE_RTC=y
@@ -63,9 +62,7 @@ CONFIG_BINFMT_MISC=y
# CONFIG_COMPAT_BRK is not set
CONFIG_NET=y
CONFIG_PACKET=y
-CONFIG_UNIX=y
CONFIG_XFRM_USER=y
-CONFIG_INET=y
CONFIG_IP_MULTICAST=y
CONFIG_IP_ADVANCED_ROUTER=y
CONFIG_IP_MULTIPLE_TABLES=y
@@ -205,7 +202,6 @@ CONFIG_SND_HDA_INTEL=y
CONFIG_SND_HDA_HWDEP=y
CONFIG_HIDRAW=y
CONFIG_HID_GYRATION=y
-CONFIG_LOGITECH_FF=y
CONFIG_HID_NTRIG=y
CONFIG_HID_PANTHERLORD=y
CONFIG_PANTHERLORD_FF=y
@@ -239,7 +235,6 @@ CONFIG_EXT4_FS_POSIX_ACL=y
CONFIG_EXT4_FS_SECURITY=y
CONFIG_QUOTA=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
-# CONFIG_PRINT_QUOTA_WARNING is not set
CONFIG_QFMT_V2=y
CONFIG_AUTOFS_FS=y
CONFIG_ISO9660_FS=y
@@ -264,13 +259,11 @@ CONFIG_SECURITY=y
CONFIG_SECURITY_NETWORK=y
CONFIG_SECURITY_SELINUX=y
CONFIG_SECURITY_SELINUX_BOOTPARAM=y
-CONFIG_SECURITY_SELINUX_DISABLE=y
CONFIG_PRINTK_TIME=y
CONFIG_DEBUG_KERNEL=y
CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_WX=y
CONFIG_DEBUG_STACK_USAGE=y
-# CONFIG_SCHED_DEBUG is not set
CONFIG_SCHEDSTATS=y
CONFIG_BLK_DEV_IO_TRACE=y
CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig
index 56cfdc79e2c6..94016c60561e 100644
--- a/arch/x86/crypto/Kconfig
+++ b/arch/x86/crypto/Kconfig
@@ -376,33 +376,6 @@ config CRYPTO_POLYVAL_CLMUL_NI
Architecture: x86_64 using:
- CLMUL-NI (carry-less multiplication new instructions)
-config CRYPTO_SHA1_SSSE3
- tristate "Hash functions: SHA-1 (SSSE3/AVX/AVX2/SHA-NI)"
- depends on 64BIT
- select CRYPTO_SHA1
- select CRYPTO_HASH
- help
- SHA-1 secure hash algorithm (FIPS 180)
-
- Architecture: x86_64 using:
- - SSSE3 (Supplemental SSE3)
- - AVX (Advanced Vector Extensions)
- - AVX2 (Advanced Vector Extensions 2)
- - SHA-NI (SHA Extensions New Instructions)
-
-config CRYPTO_SHA512_SSSE3
- tristate "Hash functions: SHA-384 and SHA-512 (SSSE3/AVX/AVX2)"
- depends on 64BIT
- select CRYPTO_SHA512
- select CRYPTO_HASH
- help
- SHA-384 and SHA-512 secure hash algorithms (FIPS 180)
-
- Architecture: x86_64 using:
- - SSSE3 (Supplemental SSE3)
- - AVX (Advanced Vector Extensions)
- - AVX2 (Advanced Vector Extensions 2)
-
config CRYPTO_SM3_AVX_X86_64
tristate "Hash functions: SM3 (AVX)"
depends on 64BIT
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index aa289a9e0153..d402963d6b57 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -51,12 +51,6 @@ ifeq ($(CONFIG_AS_VAES)$(CONFIG_AS_VPCLMULQDQ),yy)
aesni-intel-$(CONFIG_64BIT) += aes-gcm-avx10-x86_64.o
endif
-obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
-sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ni_asm.o sha1_ssse3_glue.o
-
-obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
-sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
-
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c
index f1b6d40154e3..f1adfba1a76e 100644
--- a/arch/x86/crypto/aegis128-aesni-glue.c
+++ b/arch/x86/crypto/aegis128-aesni-glue.c
@@ -104,10 +104,12 @@ static void crypto_aegis128_aesni_process_ad(
}
}
-static __always_inline void
+static __always_inline int
crypto_aegis128_aesni_process_crypt(struct aegis_state *state,
struct skcipher_walk *walk, bool enc)
{
+ int err = 0;
+
while (walk->nbytes >= AEGIS128_BLOCK_SIZE) {
if (enc)
aegis128_aesni_enc(state, walk->src.virt.addr,
@@ -119,7 +121,10 @@ crypto_aegis128_aesni_process_crypt(struct aegis_state *state,
walk->dst.virt.addr,
round_down(walk->nbytes,
AEGIS128_BLOCK_SIZE));
- skcipher_walk_done(walk, walk->nbytes % AEGIS128_BLOCK_SIZE);
+ kernel_fpu_end();
+ err = skcipher_walk_done(walk,
+ walk->nbytes % AEGIS128_BLOCK_SIZE);
+ kernel_fpu_begin();
}
if (walk->nbytes) {
@@ -131,8 +136,11 @@ crypto_aegis128_aesni_process_crypt(struct aegis_state *state,
aegis128_aesni_dec_tail(state, walk->src.virt.addr,
walk->dst.virt.addr,
walk->nbytes);
- skcipher_walk_done(walk, 0);
+ kernel_fpu_end();
+ err = skcipher_walk_done(walk, 0);
+ kernel_fpu_begin();
}
+ return err;
}
static struct aegis_ctx *crypto_aegis128_aesni_ctx(struct crypto_aead *aead)
@@ -165,7 +173,7 @@ static int crypto_aegis128_aesni_setauthsize(struct crypto_aead *tfm,
return 0;
}
-static __always_inline void
+static __always_inline int
crypto_aegis128_aesni_crypt(struct aead_request *req,
struct aegis_block *tag_xor,
unsigned int cryptlen, bool enc)
@@ -174,20 +182,24 @@ crypto_aegis128_aesni_crypt(struct aead_request *req,
struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(tfm);
struct skcipher_walk walk;
struct aegis_state state;
+ int err;
if (enc)
- skcipher_walk_aead_encrypt(&walk, req, true);
+ err = skcipher_walk_aead_encrypt(&walk, req, false);
else
- skcipher_walk_aead_decrypt(&walk, req, true);
+ err = skcipher_walk_aead_decrypt(&walk, req, false);
+ if (err)
+ return err;
kernel_fpu_begin();
aegis128_aesni_init(&state, &ctx->key, req->iv);
crypto_aegis128_aesni_process_ad(&state, req->src, req->assoclen);
- crypto_aegis128_aesni_process_crypt(&state, &walk, enc);
- aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
-
+ err = crypto_aegis128_aesni_process_crypt(&state, &walk, enc);
+ if (err == 0)
+ aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
kernel_fpu_end();
+ return err;
}
static int crypto_aegis128_aesni_encrypt(struct aead_request *req)
@@ -196,8 +208,11 @@ static int crypto_aegis128_aesni_encrypt(struct aead_request *req)
struct aegis_block tag = {};
unsigned int authsize = crypto_aead_authsize(tfm);
unsigned int cryptlen = req->cryptlen;
+ int err;
- crypto_aegis128_aesni_crypt(req, &tag, cryptlen, true);
+ err = crypto_aegis128_aesni_crypt(req, &tag, cryptlen, true);
+ if (err)
+ return err;
scatterwalk_map_and_copy(tag.bytes, req->dst,
req->assoclen + cryptlen, authsize, 1);
@@ -212,11 +227,14 @@ static int crypto_aegis128_aesni_decrypt(struct aead_request *req)
struct aegis_block tag;
unsigned int authsize = crypto_aead_authsize(tfm);
unsigned int cryptlen = req->cryptlen - authsize;
+ int err;
scatterwalk_map_and_copy(tag.bytes, req->src,
req->assoclen + cryptlen, authsize, 0);
- crypto_aegis128_aesni_crypt(req, &tag, cryptlen, false);
+ err = crypto_aegis128_aesni_crypt(req, &tag, cryptlen, false);
+ if (err)
+ return err;
return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? -EBADMSG : 0;
}
diff --git a/arch/x86/crypto/aria_aesni_avx2_glue.c b/arch/x86/crypto/aria_aesni_avx2_glue.c
index b4bddcd58457..007b250f774c 100644
--- a/arch/x86/crypto/aria_aesni_avx2_glue.c
+++ b/arch/x86/crypto/aria_aesni_avx2_glue.c
@@ -9,6 +9,7 @@
#include <crypto/aria.h>
#include <linux/crypto.h>
#include <linux/err.h>
+#include <linux/export.h>
#include <linux/module.h>
#include <linux/types.h>
diff --git a/arch/x86/crypto/aria_aesni_avx_glue.c b/arch/x86/crypto/aria_aesni_avx_glue.c
index ab9b38d05332..4c88ef4eba82 100644
--- a/arch/x86/crypto/aria_aesni_avx_glue.c
+++ b/arch/x86/crypto/aria_aesni_avx_glue.c
@@ -9,6 +9,7 @@
#include <crypto/aria.h>
#include <linux/crypto.h>
#include <linux/err.h>
+#include <linux/export.h>
#include <linux/module.h>
#include <linux/types.h>
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
index a7d162388142..5c321f255eb7 100644
--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -8,6 +8,7 @@
#include <crypto/algapi.h>
#include <linux/crypto.h>
#include <linux/err.h>
+#include <linux/export.h>
#include <linux/module.h>
#include <linux/types.h>
diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
index 3bd37d664121..cbede120e5f2 100644
--- a/arch/x86/crypto/camellia_glue.c
+++ b/arch/x86/crypto/camellia_glue.c
@@ -10,6 +10,7 @@
#include <linux/unaligned.h>
#include <linux/crypto.h>
+#include <linux/export.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/types.h>
diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c
index dcfc0de333de..d587f05c3c8c 100644
--- a/arch/x86/crypto/curve25519-x86_64.c
+++ b/arch/x86/crypto/curve25519-x86_64.c
@@ -7,6 +7,7 @@
#include <crypto/curve25519.h>
#include <crypto/internal/kpp.h>
+#include <linux/export.h>
#include <linux/types.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index e640abc1cb8a..9c8b3a335d5c 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -12,6 +12,7 @@
#include <linux/types.h>
#include <linux/crypto.h>
#include <linux/err.h>
+#include <linux/export.h>
#include <crypto/algapi.h>
#include <crypto/serpent.h>
diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
deleted file mode 100644
index 4b49bdc95265..000000000000
--- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S
+++ /dev/null
@@ -1,700 +0,0 @@
-/*
- * Implement fast SHA-1 with AVX2 instructions. (x86_64)
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * Ilya Albrekht <ilya.albrekht@intel.com>
- * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
- * Ronen Zohar <ronen.zohar@intel.com>
- * Chandramouli Narayanan <mouli@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2014 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-/*
- * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
- *
- *This implementation is based on the previous SSSE3 release:
- *Visit http://software.intel.com/en-us/articles/
- *and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
- *
- *Updates 20-byte SHA-1 record at start of 'state', from 'input', for
- *even number of 'blocks' consecutive 64-byte blocks.
- *
- *extern "C" void sha1_transform_avx2(
- * struct sha1_state *state, const u8* input, int blocks );
- */
-
-#include <linux/linkage.h>
-
-#define CTX %rdi /* arg1 */
-#define BUF %rsi /* arg2 */
-#define CNT %rdx /* arg3 */
-
-#define REG_A %ecx
-#define REG_B %esi
-#define REG_C %edi
-#define REG_D %eax
-#define REG_E %edx
-#define REG_TB %ebx
-#define REG_TA %r12d
-#define REG_RA %rcx
-#define REG_RB %rsi
-#define REG_RC %rdi
-#define REG_RD %rax
-#define REG_RE %rdx
-#define REG_RTA %r12
-#define REG_RTB %rbx
-#define REG_T1 %r11d
-#define xmm_mov vmovups
-#define avx2_zeroupper vzeroupper
-#define RND_F1 1
-#define RND_F2 2
-#define RND_F3 3
-
-.macro REGALLOC
- .set A, REG_A
- .set B, REG_B
- .set C, REG_C
- .set D, REG_D
- .set E, REG_E
- .set TB, REG_TB
- .set TA, REG_TA
-
- .set RA, REG_RA
- .set RB, REG_RB
- .set RC, REG_RC
- .set RD, REG_RD
- .set RE, REG_RE
-
- .set RTA, REG_RTA
- .set RTB, REG_RTB
-
- .set T1, REG_T1
-.endm
-
-#define HASH_PTR %r9
-#define BLOCKS_CTR %r8
-#define BUFFER_PTR %r10
-#define BUFFER_PTR2 %r13
-
-#define PRECALC_BUF %r14
-#define WK_BUF %r15
-
-#define W_TMP %xmm0
-#define WY_TMP %ymm0
-#define WY_TMP2 %ymm9
-
-# AVX2 variables
-#define WY0 %ymm3
-#define WY4 %ymm5
-#define WY08 %ymm7
-#define WY12 %ymm8
-#define WY16 %ymm12
-#define WY20 %ymm13
-#define WY24 %ymm14
-#define WY28 %ymm15
-
-#define YMM_SHUFB_BSWAP %ymm10
-
-/*
- * Keep 2 iterations precalculated at a time:
- * - 80 DWORDs per iteration * 2
- */
-#define W_SIZE (80*2*2 +16)
-
-#define WK(t) ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
-#define PRECALC_WK(t) ((t)*2*2)(PRECALC_BUF)
-
-
-.macro UPDATE_HASH hash, val
- add \hash, \val
- mov \val, \hash
-.endm
-
-.macro PRECALC_RESET_WY
- .set WY_00, WY0
- .set WY_04, WY4
- .set WY_08, WY08
- .set WY_12, WY12
- .set WY_16, WY16
- .set WY_20, WY20
- .set WY_24, WY24
- .set WY_28, WY28
- .set WY_32, WY_00
-.endm
-
-.macro PRECALC_ROTATE_WY
- /* Rotate macros */
- .set WY_32, WY_28
- .set WY_28, WY_24
- .set WY_24, WY_20
- .set WY_20, WY_16
- .set WY_16, WY_12
- .set WY_12, WY_08
- .set WY_08, WY_04
- .set WY_04, WY_00
- .set WY_00, WY_32
-
- /* Define register aliases */
- .set WY, WY_00
- .set WY_minus_04, WY_04
- .set WY_minus_08, WY_08
- .set WY_minus_12, WY_12
- .set WY_minus_16, WY_16
- .set WY_minus_20, WY_20
- .set WY_minus_24, WY_24
- .set WY_minus_28, WY_28
- .set WY_minus_32, WY
-.endm
-
-.macro PRECALC_00_15
- .if (i == 0) # Initialize and rotate registers
- PRECALC_RESET_WY
- PRECALC_ROTATE_WY
- .endif
-
- /* message scheduling pre-compute for rounds 0-15 */
- .if ((i & 7) == 0)
- /*
- * blended AVX2 and ALU instruction scheduling
- * 1 vector iteration per 8 rounds
- */
- vmovdqu (i * 2)(BUFFER_PTR), W_TMP
- .elseif ((i & 7) == 1)
- vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\
- WY_TMP, WY_TMP
- .elseif ((i & 7) == 2)
- vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
- .elseif ((i & 7) == 4)
- vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
- .elseif ((i & 7) == 7)
- vmovdqu WY_TMP, PRECALC_WK(i&~7)
-
- PRECALC_ROTATE_WY
- .endif
-.endm
-
-.macro PRECALC_16_31
- /*
- * message scheduling pre-compute for rounds 16-31
- * calculating last 32 w[i] values in 8 XMM registers
- * pre-calculate K+w[i] values and store to mem
- * for later load by ALU add instruction
- *
- * "brute force" vectorization for rounds 16-31 only
- * due to w[i]->w[i-3] dependency
- */
- .if ((i & 7) == 0)
- /*
- * blended AVX2 and ALU instruction scheduling
- * 1 vector iteration per 8 rounds
- */
- /* w[i-14] */
- vpalignr $8, WY_minus_16, WY_minus_12, WY
- vpsrldq $4, WY_minus_04, WY_TMP /* w[i-3] */
- .elseif ((i & 7) == 1)
- vpxor WY_minus_08, WY, WY
- vpxor WY_minus_16, WY_TMP, WY_TMP
- .elseif ((i & 7) == 2)
- vpxor WY_TMP, WY, WY
- vpslldq $12, WY, WY_TMP2
- .elseif ((i & 7) == 3)
- vpslld $1, WY, WY_TMP
- vpsrld $31, WY, WY
- .elseif ((i & 7) == 4)
- vpor WY, WY_TMP, WY_TMP
- vpslld $2, WY_TMP2, WY
- .elseif ((i & 7) == 5)
- vpsrld $30, WY_TMP2, WY_TMP2
- vpxor WY, WY_TMP, WY_TMP
- .elseif ((i & 7) == 7)
- vpxor WY_TMP2, WY_TMP, WY
- vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
- vmovdqu WY_TMP, PRECALC_WK(i&~7)
-
- PRECALC_ROTATE_WY
- .endif
-.endm
-
-.macro PRECALC_32_79
- /*
- * in SHA-1 specification:
- * w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
- * instead we do equal:
- * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
- * allows more efficient vectorization
- * since w[i]=>w[i-3] dependency is broken
- */
-
- .if ((i & 7) == 0)
- /*
- * blended AVX2 and ALU instruction scheduling
- * 1 vector iteration per 8 rounds
- */
- vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP
- .elseif ((i & 7) == 1)
- /* W is W_minus_32 before xor */
- vpxor WY_minus_28, WY, WY
- .elseif ((i & 7) == 2)
- vpxor WY_minus_16, WY_TMP, WY_TMP
- .elseif ((i & 7) == 3)
- vpxor WY_TMP, WY, WY
- .elseif ((i & 7) == 4)
- vpslld $2, WY, WY_TMP
- .elseif ((i & 7) == 5)
- vpsrld $30, WY, WY
- vpor WY, WY_TMP, WY
- .elseif ((i & 7) == 7)
- vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
- vmovdqu WY_TMP, PRECALC_WK(i&~7)
-
- PRECALC_ROTATE_WY
- .endif
-.endm
-
-.macro PRECALC r, s
- .set i, \r
-
- .if (i < 40)
- .set K_XMM, 32*0
- .elseif (i < 80)
- .set K_XMM, 32*1
- .elseif (i < 120)
- .set K_XMM, 32*2
- .else
- .set K_XMM, 32*3
- .endif
-
- .if (i<32)
- PRECALC_00_15 \s
- .elseif (i<64)
- PRECALC_16_31 \s
- .elseif (i < 160)
- PRECALC_32_79 \s
- .endif
-.endm
-
-.macro ROTATE_STATE
- .set T_REG, E
- .set E, D
- .set D, C
- .set C, B
- .set B, TB
- .set TB, A
- .set A, T_REG
-
- .set T_REG, RE
- .set RE, RD
- .set RD, RC
- .set RC, RB
- .set RB, RTB
- .set RTB, RA
- .set RA, T_REG
-.endm
-
-/* Macro relies on saved ROUND_Fx */
-
-.macro RND_FUN f, r
- .if (\f == RND_F1)
- ROUND_F1 \r
- .elseif (\f == RND_F2)
- ROUND_F2 \r
- .elseif (\f == RND_F3)
- ROUND_F3 \r
- .endif
-.endm
-
-.macro RR r
- .set round_id, (\r % 80)
-
- .if (round_id == 0) /* Precalculate F for first round */
- .set ROUND_FUNC, RND_F1
- mov B, TB
-
- rorx $(32-30), B, B /* b>>>2 */
- andn D, TB, T1
- and C, TB
- xor T1, TB
- .endif
-
- RND_FUN ROUND_FUNC, \r
- ROTATE_STATE
-
- .if (round_id == 18)
- .set ROUND_FUNC, RND_F2
- .elseif (round_id == 38)
- .set ROUND_FUNC, RND_F3
- .elseif (round_id == 58)
- .set ROUND_FUNC, RND_F2
- .endif
-
- .set round_id, ( (\r+1) % 80)
-
- RND_FUN ROUND_FUNC, (\r+1)
- ROTATE_STATE
-.endm
-
-.macro ROUND_F1 r
- add WK(\r), E
-
- andn C, A, T1 /* ~b&d */
- lea (RE,RTB), E /* Add F from the previous round */
-
- rorx $(32-5), A, TA /* T2 = A >>> 5 */
- rorx $(32-30),A, TB /* b>>>2 for next round */
-
- PRECALC (\r) /* msg scheduling for next 2 blocks */
-
- /*
- * Calculate F for the next round
- * (b & c) ^ andn[b, d]
- */
- and B, A /* b&c */
- xor T1, A /* F1 = (b&c) ^ (~b&d) */
-
- lea (RE,RTA), E /* E += A >>> 5 */
-.endm
-
-.macro ROUND_F2 r
- add WK(\r), E
- lea (RE,RTB), E /* Add F from the previous round */
-
- /* Calculate F for the next round */
- rorx $(32-5), A, TA /* T2 = A >>> 5 */
- .if ((round_id) < 79)
- rorx $(32-30), A, TB /* b>>>2 for next round */
- .endif
- PRECALC (\r) /* msg scheduling for next 2 blocks */
-
- .if ((round_id) < 79)
- xor B, A
- .endif
-
- add TA, E /* E += A >>> 5 */
-
- .if ((round_id) < 79)
- xor C, A
- .endif
-.endm
-
-.macro ROUND_F3 r
- add WK(\r), E
- PRECALC (\r) /* msg scheduling for next 2 blocks */
-
- lea (RE,RTB), E /* Add F from the previous round */
-
- mov B, T1
- or A, T1
-
- rorx $(32-5), A, TA /* T2 = A >>> 5 */
- rorx $(32-30), A, TB /* b>>>2 for next round */
-
- /* Calculate F for the next round
- * (b and c) or (d and (b or c))
- */
- and C, T1
- and B, A
- or T1, A
-
- add TA, E /* E += A >>> 5 */
-
-.endm
-
-/* Add constant only if (%2 > %3) condition met (uses RTA as temp)
- * %1 + %2 >= %3 ? %4 : 0
- */
-.macro ADD_IF_GE a, b, c, d
- mov \a, RTA
- add $\d, RTA
- cmp $\c, \b
- cmovge RTA, \a
-.endm
-
-/*
- * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
- */
-.macro SHA1_PIPELINED_MAIN_BODY
-
- REGALLOC
-
- mov (HASH_PTR), A
- mov 4(HASH_PTR), B
- mov 8(HASH_PTR), C
- mov 12(HASH_PTR), D
- mov 16(HASH_PTR), E
-
- mov %rsp, PRECALC_BUF
- lea (2*4*80+32)(%rsp), WK_BUF
-
- # Precalc WK for first 2 blocks
- ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
- .set i, 0
- .rept 160
- PRECALC i
- .set i, i + 1
- .endr
-
- /* Go to next block if needed */
- ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
- ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
- xchg WK_BUF, PRECALC_BUF
-
- .align 32
-.L_loop:
- /*
- * code loops through more than one block
- * we use K_BASE value as a signal of a last block,
- * it is set below by: cmovae BUFFER_PTR, K_BASE
- */
- test BLOCKS_CTR, BLOCKS_CTR
- jnz .L_begin
- .align 32
- jmp .L_end
- .align 32
-.L_begin:
-
- /*
- * Do first block
- * rounds: 0,2,4,6,8
- */
- .set j, 0
- .rept 5
- RR j
- .set j, j+2
- .endr
-
- /*
- * rounds:
- * 10,12,14,16,18
- * 20,22,24,26,28
- * 30,32,34,36,38
- * 40,42,44,46,48
- * 50,52,54,56,58
- */
- .rept 25
- RR j
- .set j, j+2
- .endr
-
- /* Update Counter */
- sub $1, BLOCKS_CTR
- /* Move to the next block only if needed*/
- ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
- /*
- * rounds
- * 60,62,64,66,68
- * 70,72,74,76,78
- */
- .rept 10
- RR j
- .set j, j+2
- .endr
-
- UPDATE_HASH (HASH_PTR), A
- UPDATE_HASH 4(HASH_PTR), TB
- UPDATE_HASH 8(HASH_PTR), C
- UPDATE_HASH 12(HASH_PTR), D
- UPDATE_HASH 16(HASH_PTR), E
-
- test BLOCKS_CTR, BLOCKS_CTR
- jz .L_loop
-
- mov TB, B
-
- /* Process second block */
- /*
- * rounds
- * 0+80, 2+80, 4+80, 6+80, 8+80
- * 10+80,12+80,14+80,16+80,18+80
- */
-
- .set j, 0
- .rept 10
- RR j+80
- .set j, j+2
- .endr
-
- /*
- * rounds
- * 20+80,22+80,24+80,26+80,28+80
- * 30+80,32+80,34+80,36+80,38+80
- */
- .rept 10
- RR j+80
- .set j, j+2
- .endr
-
- /*
- * rounds
- * 40+80,42+80,44+80,46+80,48+80
- * 50+80,52+80,54+80,56+80,58+80
- */
- .rept 10
- RR j+80
- .set j, j+2
- .endr
-
- /* update counter */
- sub $1, BLOCKS_CTR
- /* Move to the next block only if needed*/
- ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
-
- /*
- * rounds
- * 60+80,62+80,64+80,66+80,68+80
- * 70+80,72+80,74+80,76+80,78+80
- */
- .rept 10
- RR j+80
- .set j, j+2
- .endr
-
- UPDATE_HASH (HASH_PTR), A
- UPDATE_HASH 4(HASH_PTR), TB
- UPDATE_HASH 8(HASH_PTR), C
- UPDATE_HASH 12(HASH_PTR), D
- UPDATE_HASH 16(HASH_PTR), E
-
- /* Reset state for AVX2 reg permutation */
- mov A, TA
- mov TB, A
- mov C, TB
- mov E, C
- mov D, B
- mov TA, D
-
- REGALLOC
-
- xchg WK_BUF, PRECALC_BUF
-
- jmp .L_loop
-
- .align 32
-.L_end:
-
-.endm
-/*
- * macro implements SHA-1 function's body for several 64-byte blocks
- * param: function's name
- */
-.macro SHA1_VECTOR_ASM name
- SYM_FUNC_START(\name)
-
- push %rbx
- push %r12
- push %r13
- push %r14
- push %r15
-
- RESERVE_STACK = (W_SIZE*4 + 8+24)
-
- /* Align stack */
- push %rbp
- mov %rsp, %rbp
- and $~(0x20-1), %rsp
- sub $RESERVE_STACK, %rsp
-
- avx2_zeroupper
-
- /* Setup initial values */
- mov CTX, HASH_PTR
- mov BUF, BUFFER_PTR
-
- mov BUF, BUFFER_PTR2
- mov CNT, BLOCKS_CTR
-
- xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP
-
- SHA1_PIPELINED_MAIN_BODY
-
- avx2_zeroupper
-
- mov %rbp, %rsp
- pop %rbp
-
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbx
-
- RET
-
- SYM_FUNC_END(\name)
-.endm
-
-.section .rodata
-
-#define K1 0x5a827999
-#define K2 0x6ed9eba1
-#define K3 0x8f1bbcdc
-#define K4 0xca62c1d6
-
-.align 128
-K_XMM_AR:
- .long K1, K1, K1, K1
- .long K1, K1, K1, K1
- .long K2, K2, K2, K2
- .long K2, K2, K2, K2
- .long K3, K3, K3, K3
- .long K3, K3, K3, K3
- .long K4, K4, K4, K4
- .long K4, K4, K4, K4
-
-BSWAP_SHUFB_CTL:
- .long 0x00010203
- .long 0x04050607
- .long 0x08090a0b
- .long 0x0c0d0e0f
- .long 0x00010203
- .long 0x04050607
- .long 0x08090a0b
- .long 0x0c0d0e0f
-.text
-
-SHA1_VECTOR_ASM sha1_transform_avx2
diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S
deleted file mode 100644
index cade913d4882..000000000000
--- a/arch/x86/crypto/sha1_ni_asm.S
+++ /dev/null
@@ -1,304 +0,0 @@
-/*
- * Intel SHA Extensions optimized implementation of a SHA-1 update function
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * Sean Gulley <sean.m.gulley@intel.com>
- * Tim Chen <tim.c.chen@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2015 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/linkage.h>
-#include <linux/cfi_types.h>
-
-#define DIGEST_PTR %rdi /* 1st arg */
-#define DATA_PTR %rsi /* 2nd arg */
-#define NUM_BLKS %rdx /* 3rd arg */
-
-/* gcc conversion */
-#define FRAME_SIZE 32 /* space for 2x16 bytes */
-
-#define ABCD %xmm0
-#define E0 %xmm1 /* Need two E's b/c they ping pong */
-#define E1 %xmm2
-#define MSG0 %xmm3
-#define MSG1 %xmm4
-#define MSG2 %xmm5
-#define MSG3 %xmm6
-#define SHUF_MASK %xmm7
-
-
-/*
- * Intel SHA Extensions optimized implementation of a SHA-1 update function
- *
- * The function takes a pointer to the current hash values, a pointer to the
- * input data, and a number of 64 byte blocks to process. Once all blocks have
- * been processed, the digest pointer is updated with the resulting hash value.
- * The function only processes complete blocks, there is no functionality to
- * store partial blocks. All message padding and hash value initialization must
- * be done outside the update function.
- *
- * The indented lines in the loop are instructions related to rounds processing.
- * The non-indented lines are instructions related to the message schedule.
- *
- * void sha1_ni_transform(uint32_t *digest, const void *data,
- uint32_t numBlocks)
- * digest : pointer to digest
- * data: pointer to input data
- * numBlocks: Number of blocks to process
- */
-.text
-SYM_TYPED_FUNC_START(sha1_ni_transform)
- push %rbp
- mov %rsp, %rbp
- sub $FRAME_SIZE, %rsp
- and $~0xF, %rsp
-
- shl $6, NUM_BLKS /* convert to bytes */
- jz .Ldone_hash
- add DATA_PTR, NUM_BLKS /* pointer to end of data */
-
- /* load initial hash values */
- pinsrd $3, 1*16(DIGEST_PTR), E0
- movdqu 0*16(DIGEST_PTR), ABCD
- pand UPPER_WORD_MASK(%rip), E0
- pshufd $0x1B, ABCD, ABCD
-
- movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
-
-.Lloop0:
- /* Save hash values for addition after rounds */
- movdqa E0, (0*16)(%rsp)
- movdqa ABCD, (1*16)(%rsp)
-
- /* Rounds 0-3 */
- movdqu 0*16(DATA_PTR), MSG0
- pshufb SHUF_MASK, MSG0
- paddd MSG0, E0
- movdqa ABCD, E1
- sha1rnds4 $0, E0, ABCD
-
- /* Rounds 4-7 */
- movdqu 1*16(DATA_PTR), MSG1
- pshufb SHUF_MASK, MSG1
- sha1nexte MSG1, E1
- movdqa ABCD, E0
- sha1rnds4 $0, E1, ABCD
- sha1msg1 MSG1, MSG0
-
- /* Rounds 8-11 */
- movdqu 2*16(DATA_PTR), MSG2
- pshufb SHUF_MASK, MSG2
- sha1nexte MSG2, E0
- movdqa ABCD, E1
- sha1rnds4 $0, E0, ABCD
- sha1msg1 MSG2, MSG1
- pxor MSG2, MSG0
-
- /* Rounds 12-15 */
- movdqu 3*16(DATA_PTR), MSG3
- pshufb SHUF_MASK, MSG3
- sha1nexte MSG3, E1
- movdqa ABCD, E0
- sha1msg2 MSG3, MSG0
- sha1rnds4 $0, E1, ABCD
- sha1msg1 MSG3, MSG2
- pxor MSG3, MSG1
-
- /* Rounds 16-19 */
- sha1nexte MSG0, E0
- movdqa ABCD, E1
- sha1msg2 MSG0, MSG1
- sha1rnds4 $0, E0, ABCD
- sha1msg1 MSG0, MSG3
- pxor MSG0, MSG2
-
- /* Rounds 20-23 */
- sha1nexte MSG1, E1
- movdqa ABCD, E0
- sha1msg2 MSG1, MSG2
- sha1rnds4 $1, E1, ABCD
- sha1msg1 MSG1, MSG0
- pxor MSG1, MSG3
-
- /* Rounds 24-27 */
- sha1nexte MSG2, E0
- movdqa ABCD, E1
- sha1msg2 MSG2, MSG3
- sha1rnds4 $1, E0, ABCD
- sha1msg1 MSG2, MSG1
- pxor MSG2, MSG0
-
- /* Rounds 28-31 */
- sha1nexte MSG3, E1
- movdqa ABCD, E0
- sha1msg2 MSG3, MSG0
- sha1rnds4 $1, E1, ABCD
- sha1msg1 MSG3, MSG2
- pxor MSG3, MSG1
-
- /* Rounds 32-35 */
- sha1nexte MSG0, E0
- movdqa ABCD, E1
- sha1msg2 MSG0, MSG1
- sha1rnds4 $1, E0, ABCD
- sha1msg1 MSG0, MSG3
- pxor MSG0, MSG2
-
- /* Rounds 36-39 */
- sha1nexte MSG1, E1
- movdqa ABCD, E0
- sha1msg2 MSG1, MSG2
- sha1rnds4 $1, E1, ABCD
- sha1msg1 MSG1, MSG0
- pxor MSG1, MSG3
-
- /* Rounds 40-43 */
- sha1nexte MSG2, E0
- movdqa ABCD, E1
- sha1msg2 MSG2, MSG3
- sha1rnds4 $2, E0, ABCD
- sha1msg1 MSG2, MSG1
- pxor MSG2, MSG0
-
- /* Rounds 44-47 */
- sha1nexte MSG3, E1
- movdqa ABCD, E0
- sha1msg2 MSG3, MSG0
- sha1rnds4 $2, E1, ABCD
- sha1msg1 MSG3, MSG2
- pxor MSG3, MSG1
-
- /* Rounds 48-51 */
- sha1nexte MSG0, E0
- movdqa ABCD, E1
- sha1msg2 MSG0, MSG1
- sha1rnds4 $2, E0, ABCD
- sha1msg1 MSG0, MSG3
- pxor MSG0, MSG2
-
- /* Rounds 52-55 */
- sha1nexte MSG1, E1
- movdqa ABCD, E0
- sha1msg2 MSG1, MSG2
- sha1rnds4 $2, E1, ABCD
- sha1msg1 MSG1, MSG0
- pxor MSG1, MSG3
-
- /* Rounds 56-59 */
- sha1nexte MSG2, E0
- movdqa ABCD, E1
- sha1msg2 MSG2, MSG3
- sha1rnds4 $2, E0, ABCD
- sha1msg1 MSG2, MSG1
- pxor MSG2, MSG0
-
- /* Rounds 60-63 */
- sha1nexte MSG3, E1
- movdqa ABCD, E0
- sha1msg2 MSG3, MSG0
- sha1rnds4 $3, E1, ABCD
- sha1msg1 MSG3, MSG2
- pxor MSG3, MSG1
-
- /* Rounds 64-67 */
- sha1nexte MSG0, E0
- movdqa ABCD, E1
- sha1msg2 MSG0, MSG1
- sha1rnds4 $3, E0, ABCD
- sha1msg1 MSG0, MSG3
- pxor MSG0, MSG2
-
- /* Rounds 68-71 */
- sha1nexte MSG1, E1
- movdqa ABCD, E0
- sha1msg2 MSG1, MSG2
- sha1rnds4 $3, E1, ABCD
- pxor MSG1, MSG3
-
- /* Rounds 72-75 */
- sha1nexte MSG2, E0
- movdqa ABCD, E1
- sha1msg2 MSG2, MSG3
- sha1rnds4 $3, E0, ABCD
-
- /* Rounds 76-79 */
- sha1nexte MSG3, E1
- movdqa ABCD, E0
- sha1rnds4 $3, E1, ABCD
-
- /* Add current hash values with previously saved */
- sha1nexte (0*16)(%rsp), E0
- paddd (1*16)(%rsp), ABCD
-
- /* Increment data pointer and loop if more to process */
- add $64, DATA_PTR
- cmp NUM_BLKS, DATA_PTR
- jne .Lloop0
-
- /* Write hash values back in the correct order */
- pshufd $0x1B, ABCD, ABCD
- movdqu ABCD, 0*16(DIGEST_PTR)
- pextrd $3, E0, 1*16(DIGEST_PTR)
-
-.Ldone_hash:
- mov %rbp, %rsp
- pop %rbp
-
- RET
-SYM_FUNC_END(sha1_ni_transform)
-
-.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
-.align 16
-PSHUFFLE_BYTE_FLIP_MASK:
- .octa 0x000102030405060708090a0b0c0d0e0f
-
-.section .rodata.cst16.UPPER_WORD_MASK, "aM", @progbits, 16
-.align 16
-UPPER_WORD_MASK:
- .octa 0xFFFFFFFF000000000000000000000000
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
deleted file mode 100644
index f54988c80eb4..000000000000
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ /dev/null
@@ -1,554 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
- * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
- * processors. CPUs supporting Intel(R) AVX extensions will get an additional
- * boost.
- *
- * This work was inspired by the vectorized implementation of Dean Gaudet.
- * Additional information on it can be found at:
- * http://www.arctic.org/~dean/crypto/sha1.html
- *
- * It was improved upon with more efficient vectorization of the message
- * scheduling. This implementation has also been optimized for all current and
- * several future generations of Intel CPUs.
- *
- * See this article for more information about the implementation details:
- * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
- *
- * Copyright (C) 2010, Intel Corp.
- * Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
- * Ronen Zohar <ronen.zohar@intel.com>
- *
- * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
- * Author: Mathias Krause <minipli@googlemail.com>
- */
-
-#include <linux/linkage.h>
-#include <linux/cfi_types.h>
-
-#define CTX %rdi // arg1
-#define BUF %rsi // arg2
-#define CNT %rdx // arg3
-
-#define REG_A %ecx
-#define REG_B %esi
-#define REG_C %edi
-#define REG_D %r12d
-#define REG_E %edx
-
-#define REG_T1 %eax
-#define REG_T2 %ebx
-
-#define K_BASE %r8
-#define HASH_PTR %r9
-#define BUFFER_PTR %r10
-#define BUFFER_END %r11
-
-#define W_TMP1 %xmm0
-#define W_TMP2 %xmm9
-
-#define W0 %xmm1
-#define W4 %xmm2
-#define W8 %xmm3
-#define W12 %xmm4
-#define W16 %xmm5
-#define W20 %xmm6
-#define W24 %xmm7
-#define W28 %xmm8
-
-#define XMM_SHUFB_BSWAP %xmm10
-
-/* we keep window of 64 w[i]+K pre-calculated values in a circular buffer */
-#define WK(t) (((t) & 15) * 4)(%rsp)
-#define W_PRECALC_AHEAD 16
-
-/*
- * This macro implements the SHA-1 function's body for single 64-byte block
- * param: function's name
- */
-.macro SHA1_VECTOR_ASM name
- SYM_TYPED_FUNC_START(\name)
-
- push %rbx
- push %r12
- push %rbp
- mov %rsp, %rbp
-
- sub $64, %rsp # allocate workspace
- and $~15, %rsp # align stack
-
- mov CTX, HASH_PTR
- mov BUF, BUFFER_PTR
-
- shl $6, CNT # multiply by 64
- add BUF, CNT
- mov CNT, BUFFER_END
-
- lea K_XMM_AR(%rip), K_BASE
- xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP
-
- SHA1_PIPELINED_MAIN_BODY
-
- # cleanup workspace
- mov $8, %ecx
- mov %rsp, %rdi
- xor %eax, %eax
- rep stosq
-
- mov %rbp, %rsp # deallocate workspace
- pop %rbp
- pop %r12
- pop %rbx
- RET
-
- SYM_FUNC_END(\name)
-.endm
-
-/*
- * This macro implements 80 rounds of SHA-1 for one 64-byte block
- */
-.macro SHA1_PIPELINED_MAIN_BODY
- INIT_REGALLOC
-
- mov (HASH_PTR), A
- mov 4(HASH_PTR), B
- mov 8(HASH_PTR), C
- mov 12(HASH_PTR), D
- mov 16(HASH_PTR), E
-
- .set i, 0
- .rept W_PRECALC_AHEAD
- W_PRECALC i
- .set i, (i+1)
- .endr
-
-.align 4
-1:
- RR F1,A,B,C,D,E,0
- RR F1,D,E,A,B,C,2
- RR F1,B,C,D,E,A,4
- RR F1,E,A,B,C,D,6
- RR F1,C,D,E,A,B,8
-
- RR F1,A,B,C,D,E,10
- RR F1,D,E,A,B,C,12
- RR F1,B,C,D,E,A,14
- RR F1,E,A,B,C,D,16
- RR F1,C,D,E,A,B,18
-
- RR F2,A,B,C,D,E,20
- RR F2,D,E,A,B,C,22
- RR F2,B,C,D,E,A,24
- RR F2,E,A,B,C,D,26
- RR F2,C,D,E,A,B,28
-
- RR F2,A,B,C,D,E,30
- RR F2,D,E,A,B,C,32
- RR F2,B,C,D,E,A,34
- RR F2,E,A,B,C,D,36
- RR F2,C,D,E,A,B,38
-
- RR F3,A,B,C,D,E,40
- RR F3,D,E,A,B,C,42
- RR F3,B,C,D,E,A,44
- RR F3,E,A,B,C,D,46
- RR F3,C,D,E,A,B,48
-
- RR F3,A,B,C,D,E,50
- RR F3,D,E,A,B,C,52
- RR F3,B,C,D,E,A,54
- RR F3,E,A,B,C,D,56
- RR F3,C,D,E,A,B,58
-
- add $64, BUFFER_PTR # move to the next 64-byte block
- cmp BUFFER_END, BUFFER_PTR # if the current is the last one use
- cmovae K_BASE, BUFFER_PTR # dummy source to avoid buffer overrun
-
- RR F4,A,B,C,D,E,60
- RR F4,D,E,A,B,C,62
- RR F4,B,C,D,E,A,64
- RR F4,E,A,B,C,D,66
- RR F4,C,D,E,A,B,68
-
- RR F4,A,B,C,D,E,70
- RR F4,D,E,A,B,C,72
- RR F4,B,C,D,E,A,74
- RR F4,E,A,B,C,D,76
- RR F4,C,D,E,A,B,78
-
- UPDATE_HASH (HASH_PTR), A
- UPDATE_HASH 4(HASH_PTR), B
- UPDATE_HASH 8(HASH_PTR), C
- UPDATE_HASH 12(HASH_PTR), D
- UPDATE_HASH 16(HASH_PTR), E
-
- RESTORE_RENAMED_REGS
- cmp K_BASE, BUFFER_PTR # K_BASE means, we reached the end
- jne 1b
-.endm
-
-.macro INIT_REGALLOC
- .set A, REG_A
- .set B, REG_B
- .set C, REG_C
- .set D, REG_D
- .set E, REG_E
- .set T1, REG_T1
- .set T2, REG_T2
-.endm
-
-.macro RESTORE_RENAMED_REGS
- # order is important (REG_C is where it should be)
- mov B, REG_B
- mov D, REG_D
- mov A, REG_A
- mov E, REG_E
-.endm
-
-.macro SWAP_REG_NAMES a, b
- .set _T, \a
- .set \a, \b
- .set \b, _T
-.endm
-
-.macro F1 b, c, d
- mov \c, T1
- SWAP_REG_NAMES \c, T1
- xor \d, T1
- and \b, T1
- xor \d, T1
-.endm
-
-.macro F2 b, c, d
- mov \d, T1
- SWAP_REG_NAMES \d, T1
- xor \c, T1
- xor \b, T1
-.endm
-
-.macro F3 b, c ,d
- mov \c, T1
- SWAP_REG_NAMES \c, T1
- mov \b, T2
- or \b, T1
- and \c, T2
- and \d, T1
- or T2, T1
-.endm
-
-.macro F4 b, c, d
- F2 \b, \c, \d
-.endm
-
-.macro UPDATE_HASH hash, val
- add \hash, \val
- mov \val, \hash
-.endm
-
-/*
- * RR does two rounds of SHA-1 back to back with W[] pre-calc
- * t1 = F(b, c, d); e += w(i)
- * e += t1; b <<= 30; d += w(i+1);
- * t1 = F(a, b, c);
- * d += t1; a <<= 5;
- * e += a;
- * t1 = e; a >>= 7;
- * t1 <<= 5;
- * d += t1;
- */
-.macro RR F, a, b, c, d, e, round
- add WK(\round), \e
- \F \b, \c, \d # t1 = F(b, c, d);
- W_PRECALC (\round + W_PRECALC_AHEAD)
- rol $30, \b
- add T1, \e
- add WK(\round + 1), \d
-
- \F \a, \b, \c
- W_PRECALC (\round + W_PRECALC_AHEAD + 1)
- rol $5, \a
- add \a, \e
- add T1, \d
- ror $7, \a # (a <<r 5) >>r 7) => a <<r 30)
-
- mov \e, T1
- SWAP_REG_NAMES \e, T1
-
- rol $5, T1
- add T1, \d
-
- # write: \a, \b
- # rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
-.endm
-
-.macro W_PRECALC r
- .set i, \r
-
- .if (i < 20)
- .set K_XMM, 0
- .elseif (i < 40)
- .set K_XMM, 16
- .elseif (i < 60)
- .set K_XMM, 32
- .elseif (i < 80)
- .set K_XMM, 48
- .endif
-
- .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
- .set i, ((\r) % 80) # pre-compute for the next iteration
- .if (i == 0)
- W_PRECALC_RESET
- .endif
- W_PRECALC_00_15
- .elseif (i<32)
- W_PRECALC_16_31
- .elseif (i < 80) // rounds 32-79
- W_PRECALC_32_79
- .endif
-.endm
-
-.macro W_PRECALC_RESET
- .set W, W0
- .set W_minus_04, W4
- .set W_minus_08, W8
- .set W_minus_12, W12
- .set W_minus_16, W16
- .set W_minus_20, W20
- .set W_minus_24, W24
- .set W_minus_28, W28
- .set W_minus_32, W
-.endm
-
-.macro W_PRECALC_ROTATE
- .set W_minus_32, W_minus_28
- .set W_minus_28, W_minus_24
- .set W_minus_24, W_minus_20
- .set W_minus_20, W_minus_16
- .set W_minus_16, W_minus_12
- .set W_minus_12, W_minus_08
- .set W_minus_08, W_minus_04
- .set W_minus_04, W
- .set W, W_minus_32
-.endm
-
-.macro W_PRECALC_SSSE3
-
-.macro W_PRECALC_00_15
- W_PRECALC_00_15_SSSE3
-.endm
-.macro W_PRECALC_16_31
- W_PRECALC_16_31_SSSE3
-.endm
-.macro W_PRECALC_32_79
- W_PRECALC_32_79_SSSE3
-.endm
-
-/* message scheduling pre-compute for rounds 0-15 */
-.macro W_PRECALC_00_15_SSSE3
- .if ((i & 3) == 0)
- movdqu (i*4)(BUFFER_PTR), W_TMP1
- .elseif ((i & 3) == 1)
- pshufb XMM_SHUFB_BSWAP, W_TMP1
- movdqa W_TMP1, W
- .elseif ((i & 3) == 2)
- paddd (K_BASE), W_TMP1
- .elseif ((i & 3) == 3)
- movdqa W_TMP1, WK(i&~3)
- W_PRECALC_ROTATE
- .endif
-.endm
-
-/* message scheduling pre-compute for rounds 16-31
- *
- * - calculating last 32 w[i] values in 8 XMM registers
- * - pre-calculate K+w[i] values and store to mem, for later load by ALU add
- * instruction
- *
- * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3]
- * dependency, but improves for 32-79
- */
-.macro W_PRECALC_16_31_SSSE3
- # blended scheduling of vector and scalar instruction streams, one 4-wide
- # vector iteration / 4 scalar rounds
- .if ((i & 3) == 0)
- movdqa W_minus_12, W
- palignr $8, W_minus_16, W # w[i-14]
- movdqa W_minus_04, W_TMP1
- psrldq $4, W_TMP1 # w[i-3]
- pxor W_minus_08, W
- .elseif ((i & 3) == 1)
- pxor W_minus_16, W_TMP1
- pxor W_TMP1, W
- movdqa W, W_TMP2
- movdqa W, W_TMP1
- pslldq $12, W_TMP2
- .elseif ((i & 3) == 2)
- psrld $31, W
- pslld $1, W_TMP1
- por W, W_TMP1
- movdqa W_TMP2, W
- psrld $30, W_TMP2
- pslld $2, W
- .elseif ((i & 3) == 3)
- pxor W, W_TMP1
- pxor W_TMP2, W_TMP1
- movdqa W_TMP1, W
- paddd K_XMM(K_BASE), W_TMP1
- movdqa W_TMP1, WK(i&~3)
- W_PRECALC_ROTATE
- .endif
-.endm
-
-/* message scheduling pre-compute for rounds 32-79
- *
- * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
- * instead we do equal: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
- * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken
- */
-.macro W_PRECALC_32_79_SSSE3
- .if ((i & 3) == 0)
- movdqa W_minus_04, W_TMP1
- pxor W_minus_28, W # W is W_minus_32 before xor
- palignr $8, W_minus_08, W_TMP1
- .elseif ((i & 3) == 1)
- pxor W_minus_16, W
- pxor W_TMP1, W
- movdqa W, W_TMP1
- .elseif ((i & 3) == 2)
- psrld $30, W
- pslld $2, W_TMP1
- por W, W_TMP1
- .elseif ((i & 3) == 3)
- movdqa W_TMP1, W
- paddd K_XMM(K_BASE), W_TMP1
- movdqa W_TMP1, WK(i&~3)
- W_PRECALC_ROTATE
- .endif
-.endm
-
-.endm // W_PRECALC_SSSE3
-
-
-#define K1 0x5a827999
-#define K2 0x6ed9eba1
-#define K3 0x8f1bbcdc
-#define K4 0xca62c1d6
-
-.section .rodata
-.align 16
-
-K_XMM_AR:
- .long K1, K1, K1, K1
- .long K2, K2, K2, K2
- .long K3, K3, K3, K3
- .long K4, K4, K4, K4
-
-BSWAP_SHUFB_CTL:
- .long 0x00010203
- .long 0x04050607
- .long 0x08090a0b
- .long 0x0c0d0e0f
-
-
-.section .text
-
-W_PRECALC_SSSE3
-.macro xmm_mov a, b
- movdqu \a,\b
-.endm
-
-/*
- * SSSE3 optimized implementation:
- *
- * extern "C" void sha1_transform_ssse3(struct sha1_state *state,
- * const u8 *data, int blocks);
- *
- * Note that struct sha1_state is assumed to begin with u32 state[5].
- */
-SHA1_VECTOR_ASM sha1_transform_ssse3
-
-.macro W_PRECALC_AVX
-
-.purgem W_PRECALC_00_15
-.macro W_PRECALC_00_15
- W_PRECALC_00_15_AVX
-.endm
-.purgem W_PRECALC_16_31
-.macro W_PRECALC_16_31
- W_PRECALC_16_31_AVX
-.endm
-.purgem W_PRECALC_32_79
-.macro W_PRECALC_32_79
- W_PRECALC_32_79_AVX
-.endm
-
-.macro W_PRECALC_00_15_AVX
- .if ((i & 3) == 0)
- vmovdqu (i*4)(BUFFER_PTR), W_TMP1
- .elseif ((i & 3) == 1)
- vpshufb XMM_SHUFB_BSWAP, W_TMP1, W
- .elseif ((i & 3) == 2)
- vpaddd (K_BASE), W, W_TMP1
- .elseif ((i & 3) == 3)
- vmovdqa W_TMP1, WK(i&~3)
- W_PRECALC_ROTATE
- .endif
-.endm
-
-.macro W_PRECALC_16_31_AVX
- .if ((i & 3) == 0)
- vpalignr $8, W_minus_16, W_minus_12, W # w[i-14]
- vpsrldq $4, W_minus_04, W_TMP1 # w[i-3]
- vpxor W_minus_08, W, W
- vpxor W_minus_16, W_TMP1, W_TMP1
- .elseif ((i & 3) == 1)
- vpxor W_TMP1, W, W
- vpslldq $12, W, W_TMP2
- vpslld $1, W, W_TMP1
- .elseif ((i & 3) == 2)
- vpsrld $31, W, W
- vpor W, W_TMP1, W_TMP1
- vpslld $2, W_TMP2, W
- vpsrld $30, W_TMP2, W_TMP2
- .elseif ((i & 3) == 3)
- vpxor W, W_TMP1, W_TMP1
- vpxor W_TMP2, W_TMP1, W
- vpaddd K_XMM(K_BASE), W, W_TMP1
- vmovdqu W_TMP1, WK(i&~3)
- W_PRECALC_ROTATE
- .endif
-.endm
-
-.macro W_PRECALC_32_79_AVX
- .if ((i & 3) == 0)
- vpalignr $8, W_minus_08, W_minus_04, W_TMP1
- vpxor W_minus_28, W, W # W is W_minus_32 before xor
- .elseif ((i & 3) == 1)
- vpxor W_minus_16, W_TMP1, W_TMP1
- vpxor W_TMP1, W, W
- .elseif ((i & 3) == 2)
- vpslld $2, W, W_TMP1
- vpsrld $30, W, W
- vpor W, W_TMP1, W
- .elseif ((i & 3) == 3)
- vpaddd K_XMM(K_BASE), W, W_TMP1
- vmovdqu W_TMP1, WK(i&~3)
- W_PRECALC_ROTATE
- .endif
-.endm
-
-.endm // W_PRECALC_AVX
-
-W_PRECALC_AVX
-.purgem xmm_mov
-.macro xmm_mov a, b
- vmovdqu \a,\b
-.endm
-
-
-/* AVX optimized implementation:
- * extern "C" void sha1_transform_avx(struct sha1_state *state,
- * const u8 *data, int blocks);
- */
-SHA1_VECTOR_ASM sha1_transform_avx
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
deleted file mode 100644
index 0a912bfc86c5..000000000000
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ /dev/null
@@ -1,324 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Cryptographic API.
- *
- * Glue code for the SHA1 Secure Hash Algorithm assembler implementations
- * using SSSE3, AVX, AVX2, and SHA-NI instructions.
- *
- * This file is based on sha1_generic.c
- *
- * Copyright (c) Alan Smithee.
- * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
- * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
- * Copyright (c) Mathias Krause <minipli@googlemail.com>
- * Copyright (c) Chandramouli Narayanan <mouli@linux.intel.com>
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <asm/cpu_device_id.h>
-#include <asm/simd.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha1.h>
-#include <crypto/sha1_base.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-static const struct x86_cpu_id module_cpu_ids[] = {
- X86_MATCH_FEATURE(X86_FEATURE_SHA_NI, NULL),
- X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL),
- X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL),
- X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL),
- {}
-};
-MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids);
-
-static inline int sha1_update(struct shash_desc *desc, const u8 *data,
- unsigned int len, sha1_block_fn *sha1_xform)
-{
- int remain;
-
- /*
- * Make sure struct sha1_state begins directly with the SHA1
- * 160-bit internal state, as this is what the asm functions expect.
- */
- BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0);
-
- kernel_fpu_begin();
- remain = sha1_base_do_update_blocks(desc, data, len, sha1_xform);
- kernel_fpu_end();
-
- return remain;
-}
-
-static inline int sha1_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out,
- sha1_block_fn *sha1_xform)
-{
- kernel_fpu_begin();
- sha1_base_do_finup(desc, data, len, sha1_xform);
- kernel_fpu_end();
-
- return sha1_base_finish(desc, out);
-}
-
-asmlinkage void sha1_transform_ssse3(struct sha1_state *state,
- const u8 *data, int blocks);
-
-static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha1_update(desc, data, len, sha1_transform_ssse3);
-}
-
-static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha1_finup(desc, data, len, out, sha1_transform_ssse3);
-}
-
-static struct shash_alg sha1_ssse3_alg = {
- .digestsize = SHA1_DIGEST_SIZE,
- .init = sha1_base_init,
- .update = sha1_ssse3_update,
- .finup = sha1_ssse3_finup,
- .descsize = SHA1_STATE_SIZE,
- .base = {
- .cra_name = "sha1",
- .cra_driver_name = "sha1-ssse3",
- .cra_priority = 150,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
- .cra_blocksize = SHA1_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-static int register_sha1_ssse3(void)
-{
- if (boot_cpu_has(X86_FEATURE_SSSE3))
- return crypto_register_shash(&sha1_ssse3_alg);
- return 0;
-}
-
-static void unregister_sha1_ssse3(void)
-{
- if (boot_cpu_has(X86_FEATURE_SSSE3))
- crypto_unregister_shash(&sha1_ssse3_alg);
-}
-
-asmlinkage void sha1_transform_avx(struct sha1_state *state,
- const u8 *data, int blocks);
-
-static int sha1_avx_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha1_update(desc, data, len, sha1_transform_avx);
-}
-
-static int sha1_avx_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha1_finup(desc, data, len, out, sha1_transform_avx);
-}
-
-static struct shash_alg sha1_avx_alg = {
- .digestsize = SHA1_DIGEST_SIZE,
- .init = sha1_base_init,
- .update = sha1_avx_update,
- .finup = sha1_avx_finup,
- .descsize = SHA1_STATE_SIZE,
- .base = {
- .cra_name = "sha1",
- .cra_driver_name = "sha1-avx",
- .cra_priority = 160,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
- .cra_blocksize = SHA1_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-static bool avx_usable(void)
-{
- if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
- if (boot_cpu_has(X86_FEATURE_AVX))
- pr_info("AVX detected but unusable.\n");
- return false;
- }
-
- return true;
-}
-
-static int register_sha1_avx(void)
-{
- if (avx_usable())
- return crypto_register_shash(&sha1_avx_alg);
- return 0;
-}
-
-static void unregister_sha1_avx(void)
-{
- if (avx_usable())
- crypto_unregister_shash(&sha1_avx_alg);
-}
-
-#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */
-
-asmlinkage void sha1_transform_avx2(struct sha1_state *state,
- const u8 *data, int blocks);
-
-static bool avx2_usable(void)
-{
- if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2)
- && boot_cpu_has(X86_FEATURE_BMI1)
- && boot_cpu_has(X86_FEATURE_BMI2))
- return true;
-
- return false;
-}
-
-static inline void sha1_apply_transform_avx2(struct sha1_state *state,
- const u8 *data, int blocks)
-{
- /* Select the optimal transform based on data block size */
- if (blocks >= SHA1_AVX2_BLOCK_OPTSIZE)
- sha1_transform_avx2(state, data, blocks);
- else
- sha1_transform_avx(state, data, blocks);
-}
-
-static int sha1_avx2_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha1_update(desc, data, len, sha1_apply_transform_avx2);
-}
-
-static int sha1_avx2_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha1_finup(desc, data, len, out, sha1_apply_transform_avx2);
-}
-
-static struct shash_alg sha1_avx2_alg = {
- .digestsize = SHA1_DIGEST_SIZE,
- .init = sha1_base_init,
- .update = sha1_avx2_update,
- .finup = sha1_avx2_finup,
- .descsize = SHA1_STATE_SIZE,
- .base = {
- .cra_name = "sha1",
- .cra_driver_name = "sha1-avx2",
- .cra_priority = 170,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
- .cra_blocksize = SHA1_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-static int register_sha1_avx2(void)
-{
- if (avx2_usable())
- return crypto_register_shash(&sha1_avx2_alg);
- return 0;
-}
-
-static void unregister_sha1_avx2(void)
-{
- if (avx2_usable())
- crypto_unregister_shash(&sha1_avx2_alg);
-}
-
-asmlinkage void sha1_ni_transform(struct sha1_state *digest, const u8 *data,
- int rounds);
-
-static int sha1_ni_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha1_update(desc, data, len, sha1_ni_transform);
-}
-
-static int sha1_ni_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha1_finup(desc, data, len, out, sha1_ni_transform);
-}
-
-static struct shash_alg sha1_ni_alg = {
- .digestsize = SHA1_DIGEST_SIZE,
- .init = sha1_base_init,
- .update = sha1_ni_update,
- .finup = sha1_ni_finup,
- .descsize = SHA1_STATE_SIZE,
- .base = {
- .cra_name = "sha1",
- .cra_driver_name = "sha1-ni",
- .cra_priority = 250,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
- .cra_blocksize = SHA1_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-static int register_sha1_ni(void)
-{
- if (boot_cpu_has(X86_FEATURE_SHA_NI))
- return crypto_register_shash(&sha1_ni_alg);
- return 0;
-}
-
-static void unregister_sha1_ni(void)
-{
- if (boot_cpu_has(X86_FEATURE_SHA_NI))
- crypto_unregister_shash(&sha1_ni_alg);
-}
-
-static int __init sha1_ssse3_mod_init(void)
-{
- if (!x86_match_cpu(module_cpu_ids))
- return -ENODEV;
-
- if (register_sha1_ssse3())
- goto fail;
-
- if (register_sha1_avx()) {
- unregister_sha1_ssse3();
- goto fail;
- }
-
- if (register_sha1_avx2()) {
- unregister_sha1_avx();
- unregister_sha1_ssse3();
- goto fail;
- }
-
- if (register_sha1_ni()) {
- unregister_sha1_avx2();
- unregister_sha1_avx();
- unregister_sha1_ssse3();
- goto fail;
- }
-
- return 0;
-fail:
- return -ENODEV;
-}
-
-static void __exit sha1_ssse3_mod_fini(void)
-{
- unregister_sha1_ni();
- unregister_sha1_avx2();
- unregister_sha1_avx();
- unregister_sha1_ssse3();
-}
-
-module_init(sha1_ssse3_mod_init);
-module_exit(sha1_ssse3_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, Supplemental SSE3 accelerated");
-
-MODULE_ALIAS_CRYPTO("sha1");
-MODULE_ALIAS_CRYPTO("sha1-ssse3");
-MODULE_ALIAS_CRYPTO("sha1-avx");
-MODULE_ALIAS_CRYPTO("sha1-avx2");
-MODULE_ALIAS_CRYPTO("sha1-ni");
diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S
deleted file mode 100644
index 5bfce4b045fd..000000000000
--- a/arch/x86/crypto/sha512-avx-asm.S
+++ /dev/null
@@ -1,423 +0,0 @@
-########################################################################
-# Implement fast SHA-512 with AVX instructions. (x86_64)
-#
-# Copyright (C) 2013 Intel Corporation.
-#
-# Authors:
-# James Guilford <james.guilford@intel.com>
-# Kirk Yap <kirk.s.yap@intel.com>
-# David Cote <david.m.cote@intel.com>
-# Tim Chen <tim.c.chen@linux.intel.com>
-#
-# This software is available to you under a choice of one of two
-# licenses. You may choose to be licensed under the terms of the GNU
-# General Public License (GPL) Version 2, available from the file
-# COPYING in the main directory of this source tree, or the
-# OpenIB.org BSD license below:
-#
-# Redistribution and use in source and binary forms, with or
-# without modification, are permitted provided that the following
-# conditions are met:
-#
-# - Redistributions of source code must retain the above
-# copyright notice, this list of conditions and the following
-# disclaimer.
-#
-# - Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials
-# provided with the distribution.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-########################################################################
-#
-# This code is described in an Intel White-Paper:
-# "Fast SHA-512 Implementations on Intel Architecture Processors"
-#
-# To find it, surf to http://www.intel.com/p/en_US/embedded
-# and search for that title.
-#
-########################################################################
-
-#include <linux/linkage.h>
-#include <linux/cfi_types.h>
-
-.text
-
-# Virtual Registers
-# ARG1
-digest = %rdi
-# ARG2
-msg = %rsi
-# ARG3
-msglen = %rdx
-T1 = %rcx
-T2 = %r8
-a_64 = %r9
-b_64 = %r10
-c_64 = %r11
-d_64 = %r12
-e_64 = %r13
-f_64 = %r14
-g_64 = %r15
-h_64 = %rbx
-tmp0 = %rax
-
-# Local variables (stack frame)
-
-# Message Schedule
-W_SIZE = 80*8
-# W[t] + K[t] | W[t+1] + K[t+1]
-WK_SIZE = 2*8
-
-frame_W = 0
-frame_WK = frame_W + W_SIZE
-frame_size = frame_WK + WK_SIZE
-
-# Useful QWORD "arrays" for simpler memory references
-# MSG, DIGEST, K_t, W_t are arrays
-# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
-
-# Input message (arg1)
-#define MSG(i) 8*i(msg)
-
-# Output Digest (arg2)
-#define DIGEST(i) 8*i(digest)
-
-# SHA Constants (static mem)
-#define K_t(i) 8*i+K512(%rip)
-
-# Message Schedule (stack frame)
-#define W_t(i) 8*i+frame_W(%rsp)
-
-# W[t]+K[t] (stack frame)
-#define WK_2(i) 8*((i%2))+frame_WK(%rsp)
-
-.macro RotateState
- # Rotate symbols a..h right
- TMP = h_64
- h_64 = g_64
- g_64 = f_64
- f_64 = e_64
- e_64 = d_64
- d_64 = c_64
- c_64 = b_64
- b_64 = a_64
- a_64 = TMP
-.endm
-
-.macro RORQ p1 p2
- # shld is faster than ror on Sandybridge
- shld $(64-\p2), \p1, \p1
-.endm
-
-.macro SHA512_Round rnd
- # Compute Round %%t
- mov f_64, T1 # T1 = f
- mov e_64, tmp0 # tmp = e
- xor g_64, T1 # T1 = f ^ g
- RORQ tmp0, 23 # 41 # tmp = e ror 23
- and e_64, T1 # T1 = (f ^ g) & e
- xor e_64, tmp0 # tmp = (e ror 23) ^ e
- xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
- idx = \rnd
- add WK_2(idx), T1 # W[t] + K[t] from message scheduler
- RORQ tmp0, 4 # 18 # tmp = ((e ror 23) ^ e) ror 4
- xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e
- mov a_64, T2 # T2 = a
- add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h
- RORQ tmp0, 14 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
- add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
- mov a_64, tmp0 # tmp = a
- xor c_64, T2 # T2 = a ^ c
- and c_64, tmp0 # tmp = a & c
- and b_64, T2 # T2 = (a ^ c) & b
- xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
- mov a_64, tmp0 # tmp = a
- RORQ tmp0, 5 # 39 # tmp = a ror 5
- xor a_64, tmp0 # tmp = (a ror 5) ^ a
- add T1, d_64 # e(next_state) = d + T1
- RORQ tmp0, 6 # 34 # tmp = ((a ror 5) ^ a) ror 6
- xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a
- lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c)
- RORQ tmp0, 28 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
- add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) S0(a)
- RotateState
-.endm
-
-.macro SHA512_2Sched_2Round_avx rnd
- # Compute rounds t-2 and t-1
- # Compute message schedule QWORDS t and t+1
-
- # Two rounds are computed based on the values for K[t-2]+W[t-2] and
- # K[t-1]+W[t-1] which were previously stored at WK_2 by the message
- # scheduler.
- # The two new schedule QWORDS are stored at [W_t(t)] and [W_t(t+1)].
- # They are then added to their respective SHA512 constants at
- # [K_t(t)] and [K_t(t+1)] and stored at dqword [WK_2(t)]
- # For brievity, the comments following vectored instructions only refer to
- # the first of a pair of QWORDS.
- # Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]}
- # The computation of the message schedule and the rounds are tightly
- # stitched to take advantage of instruction-level parallelism.
-
- idx = \rnd - 2
- vmovdqa W_t(idx), %xmm4 # XMM4 = W[t-2]
- idx = \rnd - 15
- vmovdqu W_t(idx), %xmm5 # XMM5 = W[t-15]
- mov f_64, T1
- vpsrlq $61, %xmm4, %xmm0 # XMM0 = W[t-2]>>61
- mov e_64, tmp0
- vpsrlq $1, %xmm5, %xmm6 # XMM6 = W[t-15]>>1
- xor g_64, T1
- RORQ tmp0, 23 # 41
- vpsrlq $19, %xmm4, %xmm1 # XMM1 = W[t-2]>>19
- and e_64, T1
- xor e_64, tmp0
- vpxor %xmm1, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19
- xor g_64, T1
- idx = \rnd
- add WK_2(idx), T1#
- vpsrlq $8, %xmm5, %xmm7 # XMM7 = W[t-15]>>8
- RORQ tmp0, 4 # 18
- vpsrlq $6, %xmm4, %xmm2 # XMM2 = W[t-2]>>6
- xor e_64, tmp0
- mov a_64, T2
- add h_64, T1
- vpxor %xmm7, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8
- RORQ tmp0, 14 # 14
- add tmp0, T1
- vpsrlq $7, %xmm5, %xmm8 # XMM8 = W[t-15]>>7
- mov a_64, tmp0
- xor c_64, T2
- vpsllq $(64-61), %xmm4, %xmm3 # XMM3 = W[t-2]<<3
- and c_64, tmp0
- and b_64, T2
- vpxor %xmm3, %xmm2, %xmm2 # XMM2 = W[t-2]>>6 ^ W[t-2]<<3
- xor tmp0, T2
- mov a_64, tmp0
- vpsllq $(64-1), %xmm5, %xmm9 # XMM9 = W[t-15]<<63
- RORQ tmp0, 5 # 39
- vpxor %xmm9, %xmm8, %xmm8 # XMM8 = W[t-15]>>7 ^ W[t-15]<<63
- xor a_64, tmp0
- add T1, d_64
- RORQ tmp0, 6 # 34
- xor a_64, tmp0
- vpxor %xmm8, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^
- # W[t-15]>>7 ^ W[t-15]<<63
- lea (T1, T2), h_64
- RORQ tmp0, 28 # 28
- vpsllq $(64-19), %xmm4, %xmm4 # XMM4 = W[t-2]<<25
- add tmp0, h_64
- RotateState
- vpxor %xmm4, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^
- # W[t-2]<<25
- mov f_64, T1
- vpxor %xmm2, %xmm0, %xmm0 # XMM0 = s1(W[t-2])
- mov e_64, tmp0
- xor g_64, T1
- idx = \rnd - 16
- vpaddq W_t(idx), %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16]
- idx = \rnd - 7
- vmovdqu W_t(idx), %xmm1 # XMM1 = W[t-7]
- RORQ tmp0, 23 # 41
- and e_64, T1
- xor e_64, tmp0
- xor g_64, T1
- vpsllq $(64-8), %xmm5, %xmm5 # XMM5 = W[t-15]<<56
- idx = \rnd + 1
- add WK_2(idx), T1
- vpxor %xmm5, %xmm6, %xmm6 # XMM6 = s0(W[t-15])
- RORQ tmp0, 4 # 18
- vpaddq %xmm6, %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15])
- xor e_64, tmp0
- vpaddq %xmm1, %xmm0, %xmm0 # XMM0 = W[t] = s1(W[t-2]) + W[t-7] +
- # s0(W[t-15]) + W[t-16]
- mov a_64, T2
- add h_64, T1
- RORQ tmp0, 14 # 14
- add tmp0, T1
- idx = \rnd
- vmovdqa %xmm0, W_t(idx) # Store W[t]
- vpaddq K_t(idx), %xmm0, %xmm0 # Compute W[t]+K[t]
- vmovdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds
- mov a_64, tmp0
- xor c_64, T2
- and c_64, tmp0
- and b_64, T2
- xor tmp0, T2
- mov a_64, tmp0
- RORQ tmp0, 5 # 39
- xor a_64, tmp0
- add T1, d_64
- RORQ tmp0, 6 # 34
- xor a_64, tmp0
- lea (T1, T2), h_64
- RORQ tmp0, 28 # 28
- add tmp0, h_64
- RotateState
-.endm
-
-########################################################################
-# void sha512_transform_avx(sha512_state *state, const u8 *data, int blocks)
-# Purpose: Updates the SHA512 digest stored at "state" with the message
-# stored in "data".
-# The size of the message pointed to by "data" must be an integer multiple
-# of SHA512 message blocks.
-# "blocks" is the message length in SHA512 blocks
-########################################################################
-SYM_TYPED_FUNC_START(sha512_transform_avx)
- test msglen, msglen
- je .Lnowork
-
- # Save GPRs
- push %rbx
- push %r12
- push %r13
- push %r14
- push %r15
-
- # Allocate Stack Space
- push %rbp
- mov %rsp, %rbp
- sub $frame_size, %rsp
- and $~(0x20 - 1), %rsp
-
-.Lupdateblock:
-
- # Load state variables
- mov DIGEST(0), a_64
- mov DIGEST(1), b_64
- mov DIGEST(2), c_64
- mov DIGEST(3), d_64
- mov DIGEST(4), e_64
- mov DIGEST(5), f_64
- mov DIGEST(6), g_64
- mov DIGEST(7), h_64
-
- t = 0
- .rept 80/2 + 1
- # (80 rounds) / (2 rounds/iteration) + (1 iteration)
- # +1 iteration because the scheduler leads hashing by 1 iteration
- .if t < 2
- # BSWAP 2 QWORDS
- vmovdqa XMM_QWORD_BSWAP(%rip), %xmm1
- vmovdqu MSG(t), %xmm0
- vpshufb %xmm1, %xmm0, %xmm0 # BSWAP
- vmovdqa %xmm0, W_t(t) # Store Scheduled Pair
- vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t]
- vmovdqa %xmm0, WK_2(t) # Store into WK for rounds
- .elseif t < 16
- # BSWAP 2 QWORDS# Compute 2 Rounds
- vmovdqu MSG(t), %xmm0
- vpshufb %xmm1, %xmm0, %xmm0 # BSWAP
- SHA512_Round t-2 # Round t-2
- vmovdqa %xmm0, W_t(t) # Store Scheduled Pair
- vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t]
- SHA512_Round t-1 # Round t-1
- vmovdqa %xmm0, WK_2(t)# Store W[t]+K[t] into WK
- .elseif t < 79
- # Schedule 2 QWORDS# Compute 2 Rounds
- SHA512_2Sched_2Round_avx t
- .else
- # Compute 2 Rounds
- SHA512_Round t-2
- SHA512_Round t-1
- .endif
- t = t+2
- .endr
-
- # Update digest
- add a_64, DIGEST(0)
- add b_64, DIGEST(1)
- add c_64, DIGEST(2)
- add d_64, DIGEST(3)
- add e_64, DIGEST(4)
- add f_64, DIGEST(5)
- add g_64, DIGEST(6)
- add h_64, DIGEST(7)
-
- # Advance to next message block
- add $16*8, msg
- dec msglen
- jnz .Lupdateblock
-
- # Restore Stack Pointer
- mov %rbp, %rsp
- pop %rbp
-
- # Restore GPRs
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbx
-
-.Lnowork:
- RET
-SYM_FUNC_END(sha512_transform_avx)
-
-########################################################################
-### Binary Data
-
-.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16
-.align 16
-# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
-XMM_QWORD_BSWAP:
- .octa 0x08090a0b0c0d0e0f0001020304050607
-
-# Mergeable 640-byte rodata section. This allows linker to merge the table
-# with other, exactly the same 640-byte fragment of another rodata section
-# (if such section exists).
-.section .rodata.cst640.K512, "aM", @progbits, 640
-.align 64
-# K[t] used in SHA512 hashing
-K512:
- .quad 0x428a2f98d728ae22,0x7137449123ef65cd
- .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
- .quad 0x3956c25bf348b538,0x59f111f1b605d019
- .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
- .quad 0xd807aa98a3030242,0x12835b0145706fbe
- .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
- .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
- .quad 0x9bdc06a725c71235,0xc19bf174cf692694
- .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
- .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
- .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
- .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
- .quad 0x983e5152ee66dfab,0xa831c66d2db43210
- .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
- .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
- .quad 0x06ca6351e003826f,0x142929670a0e6e70
- .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
- .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
- .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
- .quad 0x81c2c92e47edaee6,0x92722c851482353b
- .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
- .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
- .quad 0xd192e819d6ef5218,0xd69906245565a910
- .quad 0xf40e35855771202a,0x106aa07032bbd1b8
- .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
- .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
- .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
- .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
- .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
- .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
- .quad 0x90befffa23631e28,0xa4506cebde82bde9
- .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
- .quad 0xca273eceea26619c,0xd186b8c721c0c207
- .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
- .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
- .quad 0x113f9804bef90dae,0x1b710b35131c471b
- .quad 0x28db77f523047d84,0x32caab7b40c72493
- .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
- .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
- .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S
deleted file mode 100644
index 24973f42c43f..000000000000
--- a/arch/x86/crypto/sha512-avx2-asm.S
+++ /dev/null
@@ -1,750 +0,0 @@
-########################################################################
-# Implement fast SHA-512 with AVX2 instructions. (x86_64)
-#
-# Copyright (C) 2013 Intel Corporation.
-#
-# Authors:
-# James Guilford <james.guilford@intel.com>
-# Kirk Yap <kirk.s.yap@intel.com>
-# David Cote <david.m.cote@intel.com>
-# Tim Chen <tim.c.chen@linux.intel.com>
-#
-# This software is available to you under a choice of one of two
-# licenses. You may choose to be licensed under the terms of the GNU
-# General Public License (GPL) Version 2, available from the file
-# COPYING in the main directory of this source tree, or the
-# OpenIB.org BSD license below:
-#
-# Redistribution and use in source and binary forms, with or
-# without modification, are permitted provided that the following
-# conditions are met:
-#
-# - Redistributions of source code must retain the above
-# copyright notice, this list of conditions and the following
-# disclaimer.
-#
-# - Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials
-# provided with the distribution.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-########################################################################
-#
-# This code is described in an Intel White-Paper:
-# "Fast SHA-512 Implementations on Intel Architecture Processors"
-#
-# To find it, surf to http://www.intel.com/p/en_US/embedded
-# and search for that title.
-#
-########################################################################
-# This code schedules 1 blocks at a time, with 4 lanes per block
-########################################################################
-
-#include <linux/linkage.h>
-#include <linux/cfi_types.h>
-
-.text
-
-# Virtual Registers
-Y_0 = %ymm4
-Y_1 = %ymm5
-Y_2 = %ymm6
-Y_3 = %ymm7
-
-YTMP0 = %ymm0
-YTMP1 = %ymm1
-YTMP2 = %ymm2
-YTMP3 = %ymm3
-YTMP4 = %ymm8
-XFER = YTMP0
-
-BYTE_FLIP_MASK = %ymm9
-
-# 1st arg is %rdi, which is saved to the stack and accessed later via %r12
-CTX1 = %rdi
-CTX2 = %r12
-# 2nd arg
-INP = %rsi
-# 3rd arg
-NUM_BLKS = %rdx
-
-c = %rcx
-d = %r8
-e = %rdx
-y3 = %rsi
-
-TBL = %rdi # clobbers CTX1
-
-a = %rax
-b = %rbx
-
-f = %r9
-g = %r10
-h = %r11
-old_h = %r11
-
-T1 = %r12 # clobbers CTX2
-y0 = %r13
-y1 = %r14
-y2 = %r15
-
-# Local variables (stack frame)
-XFER_SIZE = 4*8
-SRND_SIZE = 1*8
-INP_SIZE = 1*8
-INPEND_SIZE = 1*8
-CTX_SIZE = 1*8
-
-frame_XFER = 0
-frame_SRND = frame_XFER + XFER_SIZE
-frame_INP = frame_SRND + SRND_SIZE
-frame_INPEND = frame_INP + INP_SIZE
-frame_CTX = frame_INPEND + INPEND_SIZE
-frame_size = frame_CTX + CTX_SIZE
-
-## assume buffers not aligned
-#define VMOVDQ vmovdqu
-
-# addm [mem], reg
-# Add reg to mem using reg-mem add and store
-.macro addm p1 p2
- add \p1, \p2
- mov \p2, \p1
-.endm
-
-
-# COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask
-# Load ymm with mem and byte swap each dword
-.macro COPY_YMM_AND_BSWAP p1 p2 p3
- VMOVDQ \p2, \p1
- vpshufb \p3, \p1, \p1
-.endm
-# rotate_Ys
-# Rotate values of symbols Y0...Y3
-.macro rotate_Ys
- Y_ = Y_0
- Y_0 = Y_1
- Y_1 = Y_2
- Y_2 = Y_3
- Y_3 = Y_
-.endm
-
-# RotateState
-.macro RotateState
- # Rotate symbols a..h right
- old_h = h
- TMP_ = h
- h = g
- g = f
- f = e
- e = d
- d = c
- c = b
- b = a
- a = TMP_
-.endm
-
-# macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL
-# YDST = {YSRC1, YSRC2} >> RVAL*8
-.macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL
- vperm2f128 $0x3, \YSRC2, \YSRC1, \YDST # YDST = {YS1_LO, YS2_HI}
- vpalignr $\RVAL, \YSRC2, \YDST, \YDST # YDST = {YDS1, YS2} >> RVAL*8
-.endm
-
-.macro FOUR_ROUNDS_AND_SCHED
-################################### RND N + 0 #########################################
-
- # Extract w[t-7]
- MY_VPALIGNR YTMP0, Y_3, Y_2, 8 # YTMP0 = W[-7]
- # Calculate w[t-16] + w[t-7]
- vpaddq Y_0, YTMP0, YTMP0 # YTMP0 = W[-7] + W[-16]
- # Extract w[t-15]
- MY_VPALIGNR YTMP1, Y_1, Y_0, 8 # YTMP1 = W[-15]
-
- # Calculate sigma0
-
- # Calculate w[t-15] ror 1
- vpsrlq $1, YTMP1, YTMP2
- vpsllq $(64-1), YTMP1, YTMP3
- vpor YTMP2, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1
- # Calculate w[t-15] shr 7
- vpsrlq $7, YTMP1, YTMP4 # YTMP4 = W[-15] >> 7
-
- mov a, y3 # y3 = a # MAJA
- rorx $41, e, y0 # y0 = e >> 41 # S1A
- rorx $18, e, y1 # y1 = e >> 18 # S1B
- add frame_XFER(%rsp),h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
- mov f, y2 # y2 = f # CH
- rorx $34, a, T1 # T1 = a >> 34 # S0B
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
- xor g, y2 # y2 = f^g # CH
- rorx $14, e, y1 # y1 = (e >> 14) # S1
-
- and e, y2 # y2 = (f^g)&e # CH
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
- rorx $39, a, y1 # y1 = a >> 39 # S0A
- add h, d # d = k + w + h + d # --
-
- and b, y3 # y3 = (a|c)&b # MAJA
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
- rorx $28, a, T1 # T1 = (a >> 28) # S0
-
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
-
- add y0, y2 # y2 = S1 + CH # --
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- add y3, h # h = t1 + S0 + MAJ # --
-
- RotateState
-
-################################### RND N + 1 #########################################
-
- # Calculate w[t-15] ror 8
- vpsrlq $8, YTMP1, YTMP2
- vpsllq $(64-8), YTMP1, YTMP1
- vpor YTMP2, YTMP1, YTMP1 # YTMP1 = W[-15] ror 8
- # XOR the three components
- vpxor YTMP4, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7
- vpxor YTMP1, YTMP3, YTMP1 # YTMP1 = s0
-
-
- # Add three components, w[t-16], w[t-7] and sigma0
- vpaddq YTMP1, YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0
- # Move to appropriate lanes for calculating w[16] and w[17]
- vperm2f128 $0x0, YTMP0, YTMP0, Y_0 # Y_0 = W[-16] + W[-7] + s0 {BABA}
- # Move to appropriate lanes for calculating w[18] and w[19]
- vpand MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00}
-
- # Calculate w[16] and w[17] in both 128 bit lanes
-
- # Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
- vperm2f128 $0x11, Y_3, Y_3, YTMP2 # YTMP2 = W[-2] {BABA}
- vpsrlq $6, YTMP2, YTMP4 # YTMP4 = W[-2] >> 6 {BABA}
-
-
- mov a, y3 # y3 = a # MAJA
- rorx $41, e, y0 # y0 = e >> 41 # S1A
- rorx $18, e, y1 # y1 = e >> 18 # S1B
- add 1*8+frame_XFER(%rsp), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
-
- mov f, y2 # y2 = f # CH
- rorx $34, a, T1 # T1 = a >> 34 # S0B
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
- xor g, y2 # y2 = f^g # CH
-
-
- rorx $14, e, y1 # y1 = (e >> 14) # S1
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
- rorx $39, a, y1 # y1 = a >> 39 # S0A
- and e, y2 # y2 = (f^g)&e # CH
- add h, d # d = k + w + h + d # --
-
- and b, y3 # y3 = (a|c)&b # MAJA
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
-
- rorx $28, a, T1 # T1 = (a >> 28) # S0
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- add y3, h # h = t1 + S0 + MAJ # --
-
- RotateState
-
-
-################################### RND N + 2 #########################################
-
- vpsrlq $19, YTMP2, YTMP3 # YTMP3 = W[-2] >> 19 {BABA}
- vpsllq $(64-19), YTMP2, YTMP1 # YTMP1 = W[-2] << 19 {BABA}
- vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {BABA}
- vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
- vpsrlq $61, YTMP2, YTMP3 # YTMP3 = W[-2] >> 61 {BABA}
- vpsllq $(64-61), YTMP2, YTMP1 # YTMP1 = W[-2] << 61 {BABA}
- vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {BABA}
- vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
- # (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}
-
- # Add sigma1 to the other compunents to get w[16] and w[17]
- vpaddq YTMP4, Y_0, Y_0 # Y_0 = {W[1], W[0], W[1], W[0]}
-
- # Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
- vpsrlq $6, Y_0, YTMP4 # YTMP4 = W[-2] >> 6 {DC--}
-
- mov a, y3 # y3 = a # MAJA
- rorx $41, e, y0 # y0 = e >> 41 # S1A
- add 2*8+frame_XFER(%rsp), h # h = k + w + h # --
-
- rorx $18, e, y1 # y1 = e >> 18 # S1B
- or c, y3 # y3 = a|c # MAJA
- mov f, y2 # y2 = f # CH
- xor g, y2 # y2 = f^g # CH
-
- rorx $34, a, T1 # T1 = a >> 34 # S0B
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
- and e, y2 # y2 = (f^g)&e # CH
-
- rorx $14, e, y1 # y1 = (e >> 14) # S1
- add h, d # d = k + w + h + d # --
- and b, y3 # y3 = (a|c)&b # MAJA
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
- rorx $39, a, y1 # y1 = a >> 39 # S0A
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
- rorx $28, a, T1 # T1 = (a >> 28) # S0
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
-
- add y3, h # h = t1 + S0 + MAJ # --
-
- RotateState
-
-################################### RND N + 3 #########################################
-
- vpsrlq $19, Y_0, YTMP3 # YTMP3 = W[-2] >> 19 {DC--}
- vpsllq $(64-19), Y_0, YTMP1 # YTMP1 = W[-2] << 19 {DC--}
- vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {DC--}
- vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
- vpsrlq $61, Y_0, YTMP3 # YTMP3 = W[-2] >> 61 {DC--}
- vpsllq $(64-61), Y_0, YTMP1 # YTMP1 = W[-2] << 61 {DC--}
- vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {DC--}
- vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
- # (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}
-
- # Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19]
- # to newly calculated sigma1 to get w[18] and w[19]
- vpaddq YTMP4, YTMP0, YTMP2 # YTMP2 = {W[3], W[2], --, --}
-
- # Form w[19, w[18], w17], w[16]
- vpblendd $0xF0, YTMP2, Y_0, Y_0 # Y_0 = {W[3], W[2], W[1], W[0]}
-
- mov a, y3 # y3 = a # MAJA
- rorx $41, e, y0 # y0 = e >> 41 # S1A
- rorx $18, e, y1 # y1 = e >> 18 # S1B
- add 3*8+frame_XFER(%rsp), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
-
- mov f, y2 # y2 = f # CH
- rorx $34, a, T1 # T1 = a >> 34 # S0B
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
- xor g, y2 # y2 = f^g # CH
-
-
- rorx $14, e, y1 # y1 = (e >> 14) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add h, d # d = k + w + h + d # --
- and b, y3 # y3 = (a|c)&b # MAJA
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
-
- rorx $39, a, y1 # y1 = a >> 39 # S0A
- add y0, y2 # y2 = S1 + CH # --
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- rorx $28, a, T1 # T1 = (a >> 28) # S0
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
-
- add y1, h # h = k + w + h + S0 # --
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- add y3, h # h = t1 + S0 + MAJ # --
-
- RotateState
-
- rotate_Ys
-.endm
-
-.macro DO_4ROUNDS
-
-################################### RND N + 0 #########################################
-
- mov f, y2 # y2 = f # CH
- rorx $41, e, y0 # y0 = e >> 41 # S1A
- rorx $18, e, y1 # y1 = e >> 18 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
- rorx $14, e, y1 # y1 = (e >> 14) # S1
- and e, y2 # y2 = (f^g)&e # CH
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
- rorx $34, a, T1 # T1 = a >> 34 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $39, a, y1 # y1 = a >> 39 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
- rorx $28, a, T1 # T1 = (a >> 28) # S0
- add frame_XFER(%rsp), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- RotateState
-
-################################### RND N + 1 #########################################
-
- add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- mov f, y2 # y2 = f # CH
- rorx $41, e, y0 # y0 = e >> 41 # S1A
- rorx $18, e, y1 # y1 = e >> 18 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
- rorx $14, e, y1 # y1 = (e >> 14) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add y3, old_h # h = t1 + S0 + MAJ # --
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
- rorx $34, a, T1 # T1 = a >> 34 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $39, a, y1 # y1 = a >> 39 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
- rorx $28, a, T1 # T1 = (a >> 28) # S0
- add 8*1+frame_XFER(%rsp), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- RotateState
-
-################################### RND N + 2 #########################################
-
- add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- mov f, y2 # y2 = f # CH
- rorx $41, e, y0 # y0 = e >> 41 # S1A
- rorx $18, e, y1 # y1 = e >> 18 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
- rorx $14, e, y1 # y1 = (e >> 14) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add y3, old_h # h = t1 + S0 + MAJ # --
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
- rorx $34, a, T1 # T1 = a >> 34 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $39, a, y1 # y1 = a >> 39 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
- rorx $28, a, T1 # T1 = (a >> 28) # S0
- add 8*2+frame_XFER(%rsp), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- RotateState
-
-################################### RND N + 3 #########################################
-
- add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- mov f, y2 # y2 = f # CH
- rorx $41, e, y0 # y0 = e >> 41 # S1A
- rorx $18, e, y1 # y1 = e >> 18 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
- rorx $14, e, y1 # y1 = (e >> 14) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add y3, old_h # h = t1 + S0 + MAJ # --
-
- xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
- rorx $34, a, T1 # T1 = a >> 34 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $39, a, y1 # y1 = a >> 39 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
- rorx $28, a, T1 # T1 = (a >> 28) # S0
- add 8*3+frame_XFER(%rsp), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
-
- add y3, h # h = t1 + S0 + MAJ # --
-
- RotateState
-
-.endm
-
-########################################################################
-# void sha512_transform_rorx(sha512_state *state, const u8 *data, int blocks)
-# Purpose: Updates the SHA512 digest stored at "state" with the message
-# stored in "data".
-# The size of the message pointed to by "data" must be an integer multiple
-# of SHA512 message blocks.
-# "blocks" is the message length in SHA512 blocks
-########################################################################
-SYM_TYPED_FUNC_START(sha512_transform_rorx)
- # Save GPRs
- push %rbx
- push %r12
- push %r13
- push %r14
- push %r15
-
- # Allocate Stack Space
- push %rbp
- mov %rsp, %rbp
- sub $frame_size, %rsp
- and $~(0x20 - 1), %rsp
-
- shl $7, NUM_BLKS # convert to bytes
- jz .Ldone_hash
- add INP, NUM_BLKS # pointer to end of data
- mov NUM_BLKS, frame_INPEND(%rsp)
-
- ## load initial digest
- mov 8*0(CTX1), a
- mov 8*1(CTX1), b
- mov 8*2(CTX1), c
- mov 8*3(CTX1), d
- mov 8*4(CTX1), e
- mov 8*5(CTX1), f
- mov 8*6(CTX1), g
- mov 8*7(CTX1), h
-
- # save %rdi (CTX) before it gets clobbered
- mov %rdi, frame_CTX(%rsp)
-
- vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
-
-.Lloop0:
- lea K512(%rip), TBL
-
- ## byte swap first 16 dwords
- COPY_YMM_AND_BSWAP Y_0, (INP), BYTE_FLIP_MASK
- COPY_YMM_AND_BSWAP Y_1, 1*32(INP), BYTE_FLIP_MASK
- COPY_YMM_AND_BSWAP Y_2, 2*32(INP), BYTE_FLIP_MASK
- COPY_YMM_AND_BSWAP Y_3, 3*32(INP), BYTE_FLIP_MASK
-
- mov INP, frame_INP(%rsp)
-
- ## schedule 64 input dwords, by doing 12 rounds of 4 each
- movq $4, frame_SRND(%rsp)
-
-.align 16
-.Lloop1:
- vpaddq (TBL), Y_0, XFER
- vmovdqa XFER, frame_XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- vpaddq 1*32(TBL), Y_0, XFER
- vmovdqa XFER, frame_XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- vpaddq 2*32(TBL), Y_0, XFER
- vmovdqa XFER, frame_XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- vpaddq 3*32(TBL), Y_0, XFER
- vmovdqa XFER, frame_XFER(%rsp)
- add $(4*32), TBL
- FOUR_ROUNDS_AND_SCHED
-
- subq $1, frame_SRND(%rsp)
- jne .Lloop1
-
- movq $2, frame_SRND(%rsp)
-.Lloop2:
- vpaddq (TBL), Y_0, XFER
- vmovdqa XFER, frame_XFER(%rsp)
- DO_4ROUNDS
- vpaddq 1*32(TBL), Y_1, XFER
- vmovdqa XFER, frame_XFER(%rsp)
- add $(2*32), TBL
- DO_4ROUNDS
-
- vmovdqa Y_2, Y_0
- vmovdqa Y_3, Y_1
-
- subq $1, frame_SRND(%rsp)
- jne .Lloop2
-
- mov frame_CTX(%rsp), CTX2
- addm 8*0(CTX2), a
- addm 8*1(CTX2), b
- addm 8*2(CTX2), c
- addm 8*3(CTX2), d
- addm 8*4(CTX2), e
- addm 8*5(CTX2), f
- addm 8*6(CTX2), g
- addm 8*7(CTX2), h
-
- mov frame_INP(%rsp), INP
- add $128, INP
- cmp frame_INPEND(%rsp), INP
- jne .Lloop0
-
-.Ldone_hash:
-
- # Restore Stack Pointer
- mov %rbp, %rsp
- pop %rbp
-
- # Restore GPRs
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbx
-
- vzeroupper
- RET
-SYM_FUNC_END(sha512_transform_rorx)
-
-########################################################################
-### Binary Data
-
-
-# Mergeable 640-byte rodata section. This allows linker to merge the table
-# with other, exactly the same 640-byte fragment of another rodata section
-# (if such section exists).
-.section .rodata.cst640.K512, "aM", @progbits, 640
-.align 64
-# K[t] used in SHA512 hashing
-K512:
- .quad 0x428a2f98d728ae22,0x7137449123ef65cd
- .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
- .quad 0x3956c25bf348b538,0x59f111f1b605d019
- .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
- .quad 0xd807aa98a3030242,0x12835b0145706fbe
- .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
- .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
- .quad 0x9bdc06a725c71235,0xc19bf174cf692694
- .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
- .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
- .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
- .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
- .quad 0x983e5152ee66dfab,0xa831c66d2db43210
- .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
- .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
- .quad 0x06ca6351e003826f,0x142929670a0e6e70
- .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
- .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
- .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
- .quad 0x81c2c92e47edaee6,0x92722c851482353b
- .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
- .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
- .quad 0xd192e819d6ef5218,0xd69906245565a910
- .quad 0xf40e35855771202a,0x106aa07032bbd1b8
- .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
- .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
- .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
- .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
- .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
- .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
- .quad 0x90befffa23631e28,0xa4506cebde82bde9
- .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
- .quad 0xca273eceea26619c,0xd186b8c721c0c207
- .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
- .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
- .quad 0x113f9804bef90dae,0x1b710b35131c471b
- .quad 0x28db77f523047d84,0x32caab7b40c72493
- .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
- .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
- .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
-
-.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
-.align 32
-# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
-PSHUFFLE_BYTE_FLIP_MASK:
- .octa 0x08090a0b0c0d0e0f0001020304050607
- .octa 0x18191a1b1c1d1e1f1011121314151617
-
-.section .rodata.cst32.MASK_YMM_LO, "aM", @progbits, 32
-.align 32
-MASK_YMM_LO:
- .octa 0x00000000000000000000000000000000
- .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S
deleted file mode 100644
index 30a2c4777f9d..000000000000
--- a/arch/x86/crypto/sha512-ssse3-asm.S
+++ /dev/null
@@ -1,425 +0,0 @@
-########################################################################
-# Implement fast SHA-512 with SSSE3 instructions. (x86_64)
-#
-# Copyright (C) 2013 Intel Corporation.
-#
-# Authors:
-# James Guilford <james.guilford@intel.com>
-# Kirk Yap <kirk.s.yap@intel.com>
-# David Cote <david.m.cote@intel.com>
-# Tim Chen <tim.c.chen@linux.intel.com>
-#
-# This software is available to you under a choice of one of two
-# licenses. You may choose to be licensed under the terms of the GNU
-# General Public License (GPL) Version 2, available from the file
-# COPYING in the main directory of this source tree, or the
-# OpenIB.org BSD license below:
-#
-# Redistribution and use in source and binary forms, with or
-# without modification, are permitted provided that the following
-# conditions are met:
-#
-# - Redistributions of source code must retain the above
-# copyright notice, this list of conditions and the following
-# disclaimer.
-#
-# - Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials
-# provided with the distribution.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-########################################################################
-#
-# This code is described in an Intel White-Paper:
-# "Fast SHA-512 Implementations on Intel Architecture Processors"
-#
-# To find it, surf to http://www.intel.com/p/en_US/embedded
-# and search for that title.
-#
-########################################################################
-
-#include <linux/linkage.h>
-#include <linux/cfi_types.h>
-
-.text
-
-# Virtual Registers
-# ARG1
-digest = %rdi
-# ARG2
-msg = %rsi
-# ARG3
-msglen = %rdx
-T1 = %rcx
-T2 = %r8
-a_64 = %r9
-b_64 = %r10
-c_64 = %r11
-d_64 = %r12
-e_64 = %r13
-f_64 = %r14
-g_64 = %r15
-h_64 = %rbx
-tmp0 = %rax
-
-# Local variables (stack frame)
-
-W_SIZE = 80*8
-WK_SIZE = 2*8
-
-frame_W = 0
-frame_WK = frame_W + W_SIZE
-frame_size = frame_WK + WK_SIZE
-
-# Useful QWORD "arrays" for simpler memory references
-# MSG, DIGEST, K_t, W_t are arrays
-# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
-
-# Input message (arg1)
-#define MSG(i) 8*i(msg)
-
-# Output Digest (arg2)
-#define DIGEST(i) 8*i(digest)
-
-# SHA Constants (static mem)
-#define K_t(i) 8*i+K512(%rip)
-
-# Message Schedule (stack frame)
-#define W_t(i) 8*i+frame_W(%rsp)
-
-# W[t]+K[t] (stack frame)
-#define WK_2(i) 8*((i%2))+frame_WK(%rsp)
-
-.macro RotateState
- # Rotate symbols a..h right
- TMP = h_64
- h_64 = g_64
- g_64 = f_64
- f_64 = e_64
- e_64 = d_64
- d_64 = c_64
- c_64 = b_64
- b_64 = a_64
- a_64 = TMP
-.endm
-
-.macro SHA512_Round rnd
-
- # Compute Round %%t
- mov f_64, T1 # T1 = f
- mov e_64, tmp0 # tmp = e
- xor g_64, T1 # T1 = f ^ g
- ror $23, tmp0 # 41 # tmp = e ror 23
- and e_64, T1 # T1 = (f ^ g) & e
- xor e_64, tmp0 # tmp = (e ror 23) ^ e
- xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
- idx = \rnd
- add WK_2(idx), T1 # W[t] + K[t] from message scheduler
- ror $4, tmp0 # 18 # tmp = ((e ror 23) ^ e) ror 4
- xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e
- mov a_64, T2 # T2 = a
- add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h
- ror $14, tmp0 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
- add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
- mov a_64, tmp0 # tmp = a
- xor c_64, T2 # T2 = a ^ c
- and c_64, tmp0 # tmp = a & c
- and b_64, T2 # T2 = (a ^ c) & b
- xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
- mov a_64, tmp0 # tmp = a
- ror $5, tmp0 # 39 # tmp = a ror 5
- xor a_64, tmp0 # tmp = (a ror 5) ^ a
- add T1, d_64 # e(next_state) = d + T1
- ror $6, tmp0 # 34 # tmp = ((a ror 5) ^ a) ror 6
- xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a
- lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c)
- ror $28, tmp0 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
- add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) S0(a)
- RotateState
-.endm
-
-.macro SHA512_2Sched_2Round_sse rnd
-
- # Compute rounds t-2 and t-1
- # Compute message schedule QWORDS t and t+1
-
- # Two rounds are computed based on the values for K[t-2]+W[t-2] and
- # K[t-1]+W[t-1] which were previously stored at WK_2 by the message
- # scheduler.
- # The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
- # They are then added to their respective SHA512 constants at
- # [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
- # For brievity, the comments following vectored instructions only refer to
- # the first of a pair of QWORDS.
- # Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]}
- # The computation of the message schedule and the rounds are tightly
- # stitched to take advantage of instruction-level parallelism.
- # For clarity, integer instructions (for the rounds calculation) are indented
- # by one tab. Vectored instructions (for the message scheduler) are indented
- # by two tabs.
-
- mov f_64, T1
- idx = \rnd -2
- movdqa W_t(idx), %xmm2 # XMM2 = W[t-2]
- xor g_64, T1
- and e_64, T1
- movdqa %xmm2, %xmm0 # XMM0 = W[t-2]
- xor g_64, T1
- idx = \rnd
- add WK_2(idx), T1
- idx = \rnd - 15
- movdqu W_t(idx), %xmm5 # XMM5 = W[t-15]
- mov e_64, tmp0
- ror $23, tmp0 # 41
- movdqa %xmm5, %xmm3 # XMM3 = W[t-15]
- xor e_64, tmp0
- ror $4, tmp0 # 18
- psrlq $61-19, %xmm0 # XMM0 = W[t-2] >> 42
- xor e_64, tmp0
- ror $14, tmp0 # 14
- psrlq $(8-7), %xmm3 # XMM3 = W[t-15] >> 1
- add tmp0, T1
- add h_64, T1
- pxor %xmm2, %xmm0 # XMM0 = (W[t-2] >> 42) ^ W[t-2]
- mov a_64, T2
- xor c_64, T2
- pxor %xmm5, %xmm3 # XMM3 = (W[t-15] >> 1) ^ W[t-15]
- and b_64, T2
- mov a_64, tmp0
- psrlq $(19-6), %xmm0 # XMM0 = ((W[t-2]>>42)^W[t-2])>>13
- and c_64, tmp0
- xor tmp0, T2
- psrlq $(7-1), %xmm3 # XMM3 = ((W[t-15]>>1)^W[t-15])>>6
- mov a_64, tmp0
- ror $5, tmp0 # 39
- pxor %xmm2, %xmm0 # XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2]
- xor a_64, tmp0
- ror $6, tmp0 # 34
- pxor %xmm5, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]
- xor a_64, tmp0
- ror $28, tmp0 # 28
- psrlq $6, %xmm0 # XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6
- add tmp0, T2
- add T1, d_64
- psrlq $1, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1
- lea (T1, T2), h_64
- RotateState
- movdqa %xmm2, %xmm1 # XMM1 = W[t-2]
- mov f_64, T1
- xor g_64, T1
- movdqa %xmm5, %xmm4 # XMM4 = W[t-15]
- and e_64, T1
- xor g_64, T1
- psllq $(64-19)-(64-61) , %xmm1 # XMM1 = W[t-2] << 42
- idx = \rnd + 1
- add WK_2(idx), T1
- mov e_64, tmp0
- psllq $(64-1)-(64-8), %xmm4 # XMM4 = W[t-15] << 7
- ror $23, tmp0 # 41
- xor e_64, tmp0
- pxor %xmm2, %xmm1 # XMM1 = (W[t-2] << 42)^W[t-2]
- ror $4, tmp0 # 18
- xor e_64, tmp0
- pxor %xmm5, %xmm4 # XMM4 = (W[t-15]<<7)^W[t-15]
- ror $14, tmp0 # 14
- add tmp0, T1
- psllq $(64-61), %xmm1 # XMM1 = ((W[t-2] << 42)^W[t-2])<<3
- add h_64, T1
- mov a_64, T2
- psllq $(64-8), %xmm4 # XMM4 = ((W[t-15]<<7)^W[t-15])<<56
- xor c_64, T2
- and b_64, T2
- pxor %xmm1, %xmm0 # XMM0 = s1(W[t-2])
- mov a_64, tmp0
- and c_64, tmp0
- idx = \rnd - 7
- movdqu W_t(idx), %xmm1 # XMM1 = W[t-7]
- xor tmp0, T2
- pxor %xmm4, %xmm3 # XMM3 = s0(W[t-15])
- mov a_64, tmp0
- paddq %xmm3, %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15])
- ror $5, tmp0 # 39
- idx =\rnd-16
- paddq W_t(idx), %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16]
- xor a_64, tmp0
- paddq %xmm1, %xmm0 # XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
- ror $6, tmp0 # 34
- movdqa %xmm0, W_t(\rnd) # Store scheduled qwords
- xor a_64, tmp0
- paddq K_t(\rnd), %xmm0 # Compute W[t]+K[t]
- ror $28, tmp0 # 28
- idx = \rnd
- movdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds
- add tmp0, T2
- add T1, d_64
- lea (T1, T2), h_64
- RotateState
-.endm
-
-########################################################################
-## void sha512_transform_ssse3(struct sha512_state *state, const u8 *data,
-## int blocks);
-# (struct sha512_state is assumed to begin with u64 state[8])
-# Purpose: Updates the SHA512 digest stored at "state" with the message
-# stored in "data".
-# The size of the message pointed to by "data" must be an integer multiple
-# of SHA512 message blocks.
-# "blocks" is the message length in SHA512 blocks.
-########################################################################
-SYM_TYPED_FUNC_START(sha512_transform_ssse3)
-
- test msglen, msglen
- je .Lnowork
-
- # Save GPRs
- push %rbx
- push %r12
- push %r13
- push %r14
- push %r15
-
- # Allocate Stack Space
- push %rbp
- mov %rsp, %rbp
- sub $frame_size, %rsp
- and $~(0x20 - 1), %rsp
-
-.Lupdateblock:
-
-# Load state variables
- mov DIGEST(0), a_64
- mov DIGEST(1), b_64
- mov DIGEST(2), c_64
- mov DIGEST(3), d_64
- mov DIGEST(4), e_64
- mov DIGEST(5), f_64
- mov DIGEST(6), g_64
- mov DIGEST(7), h_64
-
- t = 0
- .rept 80/2 + 1
- # (80 rounds) / (2 rounds/iteration) + (1 iteration)
- # +1 iteration because the scheduler leads hashing by 1 iteration
- .if t < 2
- # BSWAP 2 QWORDS
- movdqa XMM_QWORD_BSWAP(%rip), %xmm1
- movdqu MSG(t), %xmm0
- pshufb %xmm1, %xmm0 # BSWAP
- movdqa %xmm0, W_t(t) # Store Scheduled Pair
- paddq K_t(t), %xmm0 # Compute W[t]+K[t]
- movdqa %xmm0, WK_2(t) # Store into WK for rounds
- .elseif t < 16
- # BSWAP 2 QWORDS# Compute 2 Rounds
- movdqu MSG(t), %xmm0
- pshufb %xmm1, %xmm0 # BSWAP
- SHA512_Round t-2 # Round t-2
- movdqa %xmm0, W_t(t) # Store Scheduled Pair
- paddq K_t(t), %xmm0 # Compute W[t]+K[t]
- SHA512_Round t-1 # Round t-1
- movdqa %xmm0, WK_2(t) # Store W[t]+K[t] into WK
- .elseif t < 79
- # Schedule 2 QWORDS# Compute 2 Rounds
- SHA512_2Sched_2Round_sse t
- .else
- # Compute 2 Rounds
- SHA512_Round t-2
- SHA512_Round t-1
- .endif
- t = t+2
- .endr
-
- # Update digest
- add a_64, DIGEST(0)
- add b_64, DIGEST(1)
- add c_64, DIGEST(2)
- add d_64, DIGEST(3)
- add e_64, DIGEST(4)
- add f_64, DIGEST(5)
- add g_64, DIGEST(6)
- add h_64, DIGEST(7)
-
- # Advance to next message block
- add $16*8, msg
- dec msglen
- jnz .Lupdateblock
-
- # Restore Stack Pointer
- mov %rbp, %rsp
- pop %rbp
-
- # Restore GPRs
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbx
-
-.Lnowork:
- RET
-SYM_FUNC_END(sha512_transform_ssse3)
-
-########################################################################
-### Binary Data
-
-.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16
-.align 16
-# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
-XMM_QWORD_BSWAP:
- .octa 0x08090a0b0c0d0e0f0001020304050607
-
-# Mergeable 640-byte rodata section. This allows linker to merge the table
-# with other, exactly the same 640-byte fragment of another rodata section
-# (if such section exists).
-.section .rodata.cst640.K512, "aM", @progbits, 640
-.align 64
-# K[t] used in SHA512 hashing
-K512:
- .quad 0x428a2f98d728ae22,0x7137449123ef65cd
- .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
- .quad 0x3956c25bf348b538,0x59f111f1b605d019
- .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
- .quad 0xd807aa98a3030242,0x12835b0145706fbe
- .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
- .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
- .quad 0x9bdc06a725c71235,0xc19bf174cf692694
- .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
- .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
- .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
- .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
- .quad 0x983e5152ee66dfab,0xa831c66d2db43210
- .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
- .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
- .quad 0x06ca6351e003826f,0x142929670a0e6e70
- .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
- .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
- .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
- .quad 0x81c2c92e47edaee6,0x92722c851482353b
- .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
- .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
- .quad 0xd192e819d6ef5218,0xd69906245565a910
- .quad 0xf40e35855771202a,0x106aa07032bbd1b8
- .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
- .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
- .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
- .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
- .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
- .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
- .quad 0x90befffa23631e28,0xa4506cebde82bde9
- .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
- .quad 0xca273eceea26619c,0xd186b8c721c0c207
- .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
- .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
- .quad 0x113f9804bef90dae,0x1b710b35131c471b
- .quad 0x28db77f523047d84,0x32caab7b40c72493
- .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
- .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
- .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
deleted file mode 100644
index 067684c54395..000000000000
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ /dev/null
@@ -1,322 +0,0 @@
-/*
- * Cryptographic API.
- *
- * Glue code for the SHA512 Secure Hash Algorithm assembler
- * implementation using supplemental SSE3 / AVX / AVX2 instructions.
- *
- * This file is based on sha512_generic.c
- *
- * Copyright (C) 2013 Intel Corporation
- * Author: Tim Chen <tim.c.chen@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <asm/cpu_device_id.h>
-#include <asm/simd.h>
-#include <crypto/internal/hash.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <crypto/sha2.h>
-#include <crypto/sha512_base.h>
-
-asmlinkage void sha512_transform_ssse3(struct sha512_state *state,
- const u8 *data, int blocks);
-
-static int sha512_update(struct shash_desc *desc, const u8 *data,
- unsigned int len, sha512_block_fn *sha512_xform)
-{
- int remain;
-
- /*
- * Make sure struct sha512_state begins directly with the SHA512
- * 512-bit internal state, as this is what the asm functions expect.
- */
- BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0);
-
- kernel_fpu_begin();
- remain = sha512_base_do_update_blocks(desc, data, len, sha512_xform);
- kernel_fpu_end();
-
- return remain;
-}
-
-static int sha512_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out, sha512_block_fn *sha512_xform)
-{
- kernel_fpu_begin();
- sha512_base_do_finup(desc, data, len, sha512_xform);
- kernel_fpu_end();
-
- return sha512_base_finish(desc, out);
-}
-
-static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha512_update(desc, data, len, sha512_transform_ssse3);
-}
-
-static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha512_finup(desc, data, len, out, sha512_transform_ssse3);
-}
-
-static struct shash_alg sha512_ssse3_algs[] = { {
- .digestsize = SHA512_DIGEST_SIZE,
- .init = sha512_base_init,
- .update = sha512_ssse3_update,
- .finup = sha512_ssse3_finup,
- .descsize = SHA512_STATE_SIZE,
- .base = {
- .cra_name = "sha512",
- .cra_driver_name = "sha512-ssse3",
- .cra_priority = 150,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA512_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-}, {
- .digestsize = SHA384_DIGEST_SIZE,
- .init = sha384_base_init,
- .update = sha512_ssse3_update,
- .finup = sha512_ssse3_finup,
- .descsize = SHA512_STATE_SIZE,
- .base = {
- .cra_name = "sha384",
- .cra_driver_name = "sha384-ssse3",
- .cra_priority = 150,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA384_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-} };
-
-static int register_sha512_ssse3(void)
-{
- if (boot_cpu_has(X86_FEATURE_SSSE3))
- return crypto_register_shashes(sha512_ssse3_algs,
- ARRAY_SIZE(sha512_ssse3_algs));
- return 0;
-}
-
-static void unregister_sha512_ssse3(void)
-{
- if (boot_cpu_has(X86_FEATURE_SSSE3))
- crypto_unregister_shashes(sha512_ssse3_algs,
- ARRAY_SIZE(sha512_ssse3_algs));
-}
-
-asmlinkage void sha512_transform_avx(struct sha512_state *state,
- const u8 *data, int blocks);
-static bool avx_usable(void)
-{
- if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
- if (boot_cpu_has(X86_FEATURE_AVX))
- pr_info("AVX detected but unusable.\n");
- return false;
- }
-
- return true;
-}
-
-static int sha512_avx_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha512_update(desc, data, len, sha512_transform_avx);
-}
-
-static int sha512_avx_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha512_finup(desc, data, len, out, sha512_transform_avx);
-}
-
-static struct shash_alg sha512_avx_algs[] = { {
- .digestsize = SHA512_DIGEST_SIZE,
- .init = sha512_base_init,
- .update = sha512_avx_update,
- .finup = sha512_avx_finup,
- .descsize = SHA512_STATE_SIZE,
- .base = {
- .cra_name = "sha512",
- .cra_driver_name = "sha512-avx",
- .cra_priority = 160,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA512_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-}, {
- .digestsize = SHA384_DIGEST_SIZE,
- .init = sha384_base_init,
- .update = sha512_avx_update,
- .finup = sha512_avx_finup,
- .descsize = SHA512_STATE_SIZE,
- .base = {
- .cra_name = "sha384",
- .cra_driver_name = "sha384-avx",
- .cra_priority = 160,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA384_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-} };
-
-static int register_sha512_avx(void)
-{
- if (avx_usable())
- return crypto_register_shashes(sha512_avx_algs,
- ARRAY_SIZE(sha512_avx_algs));
- return 0;
-}
-
-static void unregister_sha512_avx(void)
-{
- if (avx_usable())
- crypto_unregister_shashes(sha512_avx_algs,
- ARRAY_SIZE(sha512_avx_algs));
-}
-
-asmlinkage void sha512_transform_rorx(struct sha512_state *state,
- const u8 *data, int blocks);
-
-static int sha512_avx2_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha512_update(desc, data, len, sha512_transform_rorx);
-}
-
-static int sha512_avx2_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha512_finup(desc, data, len, out, sha512_transform_rorx);
-}
-
-static struct shash_alg sha512_avx2_algs[] = { {
- .digestsize = SHA512_DIGEST_SIZE,
- .init = sha512_base_init,
- .update = sha512_avx2_update,
- .finup = sha512_avx2_finup,
- .descsize = SHA512_STATE_SIZE,
- .base = {
- .cra_name = "sha512",
- .cra_driver_name = "sha512-avx2",
- .cra_priority = 170,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA512_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-}, {
- .digestsize = SHA384_DIGEST_SIZE,
- .init = sha384_base_init,
- .update = sha512_avx2_update,
- .finup = sha512_avx2_finup,
- .descsize = SHA512_STATE_SIZE,
- .base = {
- .cra_name = "sha384",
- .cra_driver_name = "sha384-avx2",
- .cra_priority = 170,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA384_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-} };
-
-static bool avx2_usable(void)
-{
- if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) &&
- boot_cpu_has(X86_FEATURE_BMI2))
- return true;
-
- return false;
-}
-
-static int register_sha512_avx2(void)
-{
- if (avx2_usable())
- return crypto_register_shashes(sha512_avx2_algs,
- ARRAY_SIZE(sha512_avx2_algs));
- return 0;
-}
-static const struct x86_cpu_id module_cpu_ids[] = {
- X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL),
- X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL),
- X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL),
- {}
-};
-MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids);
-
-static void unregister_sha512_avx2(void)
-{
- if (avx2_usable())
- crypto_unregister_shashes(sha512_avx2_algs,
- ARRAY_SIZE(sha512_avx2_algs));
-}
-
-static int __init sha512_ssse3_mod_init(void)
-{
- if (!x86_match_cpu(module_cpu_ids))
- return -ENODEV;
-
- if (register_sha512_ssse3())
- goto fail;
-
- if (register_sha512_avx()) {
- unregister_sha512_ssse3();
- goto fail;
- }
-
- if (register_sha512_avx2()) {
- unregister_sha512_avx();
- unregister_sha512_ssse3();
- goto fail;
- }
-
- return 0;
-fail:
- return -ENODEV;
-}
-
-static void __exit sha512_ssse3_mod_fini(void)
-{
- unregister_sha512_avx2();
- unregister_sha512_avx();
- unregister_sha512_ssse3();
-}
-
-module_init(sha512_ssse3_mod_init);
-module_exit(sha512_ssse3_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated");
-
-MODULE_ALIAS_CRYPTO("sha512");
-MODULE_ALIAS_CRYPTO("sha512-ssse3");
-MODULE_ALIAS_CRYPTO("sha512-avx");
-MODULE_ALIAS_CRYPTO("sha512-avx2");
-MODULE_ALIAS_CRYPTO("sha384");
-MODULE_ALIAS_CRYPTO("sha384-ssse3");
-MODULE_ALIAS_CRYPTO("sha384-avx");
-MODULE_ALIAS_CRYPTO("sha384-avx2");
diff --git a/arch/x86/crypto/sm4_aesni_avx_glue.c b/arch/x86/crypto/sm4_aesni_avx_glue.c
index 72867fc49ce8..88caf418a06f 100644
--- a/arch/x86/crypto/sm4_aesni_avx_glue.c
+++ b/arch/x86/crypto/sm4_aesni_avx_glue.c
@@ -11,6 +11,7 @@
#include <asm/fpu/api.h>
#include <linux/module.h>
#include <linux/crypto.h>
+#include <linux/export.h>
#include <linux/kernel.h>
#include <crypto/internal/skcipher.h>
#include <crypto/sm4.h>
diff --git a/arch/x86/crypto/twofish_glue.c b/arch/x86/crypto/twofish_glue.c
index 4c67184dc573..8e9906d36902 100644
--- a/arch/x86/crypto/twofish_glue.c
+++ b/arch/x86/crypto/twofish_glue.c
@@ -40,6 +40,7 @@
#include <crypto/algapi.h>
#include <crypto/twofish.h>
+#include <linux/export.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/types.h>
diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index 1a1ecfa7f72a..8ad77725bf60 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -9,6 +9,7 @@
#include <crypto/algapi.h>
#include <crypto/twofish.h>
#include <linux/crypto.h>
+#include <linux/export.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/types.h>
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index d83236b96f22..94519688b007 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -369,7 +369,7 @@ For 32-bit we have the following conventions - kernel is built with
.endm
.macro STACKLEAK_ERASE_NOCLOBBER
-#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+#ifdef CONFIG_KSTACK_ERASE
PUSH_AND_CLEAR_REGS
call stackleak_erase
POP_REGS
@@ -388,7 +388,7 @@ For 32-bit we have the following conventions - kernel is built with
#endif /* !CONFIG_X86_64 */
.macro STACKLEAK_ERASE
-#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+#ifdef CONFIG_KSTACK_ERASE
call stackleak_erase
#endif
.endm
diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S
index 175958b02f2b..8e9a0cc20a4a 100644
--- a/arch/x86/entry/entry.S
+++ b/arch/x86/entry/entry.S
@@ -36,20 +36,20 @@ EXPORT_SYMBOL_GPL(write_ibpb);
/*
* Define the VERW operand that is disguised as entry code so that
- * it can be referenced with KPTI enabled. This ensure VERW can be
+ * it can be referenced with KPTI enabled. This ensures VERW can be
* used late in exit-to-user path after page tables are switched.
*/
.pushsection .entry.text, "ax"
.align L1_CACHE_BYTES, 0xcc
-SYM_CODE_START_NOALIGN(mds_verw_sel)
+SYM_CODE_START_NOALIGN(x86_verw_sel)
UNWIND_HINT_UNDEFINED
ANNOTATE_NOENDBR
.word __KERNEL_DS
.align L1_CACHE_BYTES, 0xcc
-SYM_CODE_END(mds_verw_sel);
+SYM_CODE_END(x86_verw_sel);
/* For KVM */
-EXPORT_SYMBOL_GPL(mds_verw_sel);
+EXPORT_SYMBOL_GPL(x86_verw_sel);
.popsection
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index ac007ea00979..4877e16da69a 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -473,3 +473,5 @@
465 i386 listxattrat sys_listxattrat
466 i386 removexattrat sys_removexattrat
467 i386 open_tree_attr sys_open_tree_attr
+468 i386 file_getattr sys_file_getattr
+469 i386 file_setattr sys_file_setattr
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index cfb5ca41e30d..92cf0fe2291e 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -391,6 +391,8 @@
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
467 common open_tree_attr sys_open_tree_attr
+468 common file_getattr sys_file_getattr
+469 common file_setattr sys_file_setattr
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 54d3e9774d62..f247f5f5cb44 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -62,7 +62,7 @@ ifneq ($(RETPOLINE_VDSO_CFLAGS),)
endif
endif
-$(vobjs): KBUILD_CFLAGS := $(filter-out $(PADDING_CFLAGS) $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
+$(vobjs): KBUILD_CFLAGS := $(filter-out $(PADDING_CFLAGS) $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) $(RANDSTRUCT_CFLAGS) $(KSTACK_ERASE_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
$(vobjs): KBUILD_AFLAGS += -DBUILD_VDSO
#
@@ -123,6 +123,7 @@ KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(RANDSTRUCT_CFLAGS),$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(KSTACK_ERASE_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(CC_FLAGS_LTO),$(KBUILD_CFLAGS_32))
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 466283326630..c2fb729c270e 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2826,7 +2826,7 @@ static void intel_pmu_read_event(struct perf_event *event)
* If the PEBS counters snapshotting is enabled,
* the topdown event is available in PEBS records.
*/
- if (is_topdown_event(event) && !is_pebs_counter_event_group(event))
+ if (is_topdown_count(event) && !is_pebs_counter_event_group(event))
static_call(intel_pmu_update_topdown_event)(event, NULL);
else
intel_pmu_drain_pebs_buffer();
@@ -2900,6 +2900,7 @@ static void intel_pmu_config_acr(int idx, u64 mask, u32 reload)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
int msr_b, msr_c;
+ int msr_offset;
if (!mask && !cpuc->acr_cfg_b[idx])
return;
@@ -2907,19 +2908,20 @@ static void intel_pmu_config_acr(int idx, u64 mask, u32 reload)
if (idx < INTEL_PMC_IDX_FIXED) {
msr_b = MSR_IA32_PMC_V6_GP0_CFG_B;
msr_c = MSR_IA32_PMC_V6_GP0_CFG_C;
+ msr_offset = x86_pmu.addr_offset(idx, false);
} else {
msr_b = MSR_IA32_PMC_V6_FX0_CFG_B;
msr_c = MSR_IA32_PMC_V6_FX0_CFG_C;
- idx -= INTEL_PMC_IDX_FIXED;
+ msr_offset = x86_pmu.addr_offset(idx - INTEL_PMC_IDX_FIXED, false);
}
if (cpuc->acr_cfg_b[idx] != mask) {
- wrmsrl(msr_b + x86_pmu.addr_offset(idx, false), mask);
+ wrmsrl(msr_b + msr_offset, mask);
cpuc->acr_cfg_b[idx] = mask;
}
/* Only need to update the reload value when there is a valid config value. */
if (mask && cpuc->acr_cfg_c[idx] != reload) {
- wrmsrl(msr_c + x86_pmu.addr_offset(idx, false), reload);
+ wrmsrl(msr_c + msr_offset, reload);
cpuc->acr_cfg_c[idx] = reload;
}
}
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index e0815a12db90..a762f7f5b161 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1807,6 +1807,12 @@ static const struct intel_uncore_init_fun lnl_uncore_init __initconst = {
.mmio_init = lnl_uncore_mmio_init,
};
+static const struct intel_uncore_init_fun ptl_uncore_init __initconst = {
+ .cpu_init = ptl_uncore_cpu_init,
+ .mmio_init = ptl_uncore_mmio_init,
+ .use_discovery = true,
+};
+
static const struct intel_uncore_init_fun icx_uncore_init __initconst = {
.cpu_init = icx_uncore_cpu_init,
.pci_init = icx_uncore_pci_init,
@@ -1888,6 +1894,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = {
X86_MATCH_VFM(INTEL_ARROWLAKE_U, &mtl_uncore_init),
X86_MATCH_VFM(INTEL_ARROWLAKE_H, &mtl_uncore_init),
X86_MATCH_VFM(INTEL_LUNARLAKE_M, &lnl_uncore_init),
+ X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &ptl_uncore_init),
X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &spr_uncore_init),
X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &spr_uncore_init),
X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, &gnr_uncore_init),
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 3dcb88c0ecfa..d8815fff7588 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -612,10 +612,12 @@ void tgl_uncore_cpu_init(void);
void adl_uncore_cpu_init(void);
void lnl_uncore_cpu_init(void);
void mtl_uncore_cpu_init(void);
+void ptl_uncore_cpu_init(void);
void tgl_uncore_mmio_init(void);
void tgl_l_uncore_mmio_init(void);
void adl_uncore_mmio_init(void);
void lnl_uncore_mmio_init(void);
+void ptl_uncore_mmio_init(void);
int snb_pci2phy_map_init(int devid);
/* uncore_snbep.c */
diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c
index 18a3022f26a0..7d57ce706feb 100644
--- a/arch/x86/events/intel/uncore_discovery.c
+++ b/arch/x86/events/intel/uncore_discovery.c
@@ -274,32 +274,15 @@ uncore_ignore_unit(struct uncore_unit_discovery *unit, int *ignore)
return false;
}
-static int parse_discovery_table(struct pci_dev *dev, int die,
- u32 bar_offset, bool *parsed,
- int *ignore)
+static int __parse_discovery_table(resource_size_t addr, int die,
+ bool *parsed, int *ignore)
{
struct uncore_global_discovery global;
struct uncore_unit_discovery unit;
void __iomem *io_addr;
- resource_size_t addr;
unsigned long size;
- u32 val;
int i;
- pci_read_config_dword(dev, bar_offset, &val);
-
- if (val & ~PCI_BASE_ADDRESS_MEM_MASK & ~PCI_BASE_ADDRESS_MEM_TYPE_64)
- return -EINVAL;
-
- addr = (resource_size_t)(val & PCI_BASE_ADDRESS_MEM_MASK);
-#ifdef CONFIG_PHYS_ADDR_T_64BIT
- if ((val & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == PCI_BASE_ADDRESS_MEM_TYPE_64) {
- u32 val2;
-
- pci_read_config_dword(dev, bar_offset + 4, &val2);
- addr |= ((resource_size_t)val2) << 32;
- }
-#endif
size = UNCORE_DISCOVERY_GLOBAL_MAP_SIZE;
io_addr = ioremap(addr, size);
if (!io_addr)
@@ -342,7 +325,32 @@ static int parse_discovery_table(struct pci_dev *dev, int die,
return 0;
}
-bool intel_uncore_has_discovery_tables(int *ignore)
+static int parse_discovery_table(struct pci_dev *dev, int die,
+ u32 bar_offset, bool *parsed,
+ int *ignore)
+{
+ resource_size_t addr;
+ u32 val;
+
+ pci_read_config_dword(dev, bar_offset, &val);
+
+ if (val & ~PCI_BASE_ADDRESS_MEM_MASK & ~PCI_BASE_ADDRESS_MEM_TYPE_64)
+ return -EINVAL;
+
+ addr = (resource_size_t)(val & PCI_BASE_ADDRESS_MEM_MASK);
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+ if ((val & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == PCI_BASE_ADDRESS_MEM_TYPE_64) {
+ u32 val2;
+
+ pci_read_config_dword(dev, bar_offset + 4, &val2);
+ addr |= ((resource_size_t)val2) << 32;
+ }
+#endif
+
+ return __parse_discovery_table(addr, die, parsed, ignore);
+}
+
+static bool intel_uncore_has_discovery_tables_pci(int *ignore)
{
u32 device, val, entry_id, bar_offset;
int die, dvsec = 0, ret = true;
@@ -391,6 +399,45 @@ err:
return ret;
}
+static bool intel_uncore_has_discovery_tables_msr(int *ignore)
+{
+ unsigned long *die_mask;
+ bool parsed = false;
+ int cpu, die;
+ u64 base;
+
+ die_mask = kcalloc(BITS_TO_LONGS(uncore_max_dies()),
+ sizeof(unsigned long), GFP_KERNEL);
+ if (!die_mask)
+ return false;
+
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ die = topology_logical_die_id(cpu);
+ if (__test_and_set_bit(die, die_mask))
+ continue;
+
+ if (rdmsrq_safe_on_cpu(cpu, UNCORE_DISCOVERY_MSR, &base))
+ continue;
+
+ if (!base)
+ continue;
+
+ __parse_discovery_table(base, die, &parsed, ignore);
+ }
+
+ cpus_read_unlock();
+
+ kfree(die_mask);
+ return parsed;
+}
+
+bool intel_uncore_has_discovery_tables(int *ignore)
+{
+ return intel_uncore_has_discovery_tables_msr(ignore) ||
+ intel_uncore_has_discovery_tables_pci(ignore);
+}
+
void intel_uncore_clear_discovery_tables(void)
{
struct intel_uncore_discovery_type *type, *next;
@@ -604,7 +651,7 @@ void intel_generic_uncore_mmio_init_box(struct intel_uncore_box *box)
}
addr = unit->addr;
- box->io_addr = ioremap(addr, UNCORE_GENERIC_MMIO_SIZE);
+ box->io_addr = ioremap(addr, type->mmio_map_size);
if (!box->io_addr) {
pr_warn("Uncore type %d box %d: ioremap error for 0x%llx.\n",
type->type_id, unit->id, (unsigned long long)addr);
diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h
index 0e94aa7db8e7..dff75c98e22f 100644
--- a/arch/x86/events/intel/uncore_discovery.h
+++ b/arch/x86/events/intel/uncore_discovery.h
@@ -1,5 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0-only */
+/* Store the full address of the global discovery table */
+#define UNCORE_DISCOVERY_MSR 0x201e
+
/* Generic device ID of a discovery table device */
#define UNCORE_DISCOVERY_TABLE_DEVICE 0x09a7
/* Capability ID for a discovery table device */
@@ -168,3 +171,7 @@ bool intel_generic_uncore_assign_hw_event(struct perf_event *event,
struct intel_uncore_box *box);
void uncore_find_add_unit(struct intel_uncore_discovery_unit *node,
struct rb_root *root, u16 *num_units);
+struct intel_uncore_type **
+uncore_get_uncores(enum uncore_access_type type_id, int num_extra,
+ struct intel_uncore_type **extra, int max_num_types,
+ struct intel_uncore_type **uncores);
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index a1a96833e30e..807e582b8f17 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -1855,3 +1855,82 @@ void lnl_uncore_mmio_init(void)
}
/* end of Lunar Lake MMIO uncore support */
+
+/* Panther Lake uncore support */
+
+#define UNCORE_PTL_MAX_NUM_UNCORE_TYPES 42
+#define UNCORE_PTL_TYPE_IMC 6
+#define UNCORE_PTL_TYPE_SNCU 34
+#define UNCORE_PTL_TYPE_HBO 41
+
+#define PTL_UNCORE_GLOBAL_CTL_OFFSET 0x380
+
+static struct intel_uncore_type ptl_uncore_imc = {
+ .name = "imc",
+ .mmio_map_size = 0xf00,
+};
+
+static void ptl_uncore_sncu_init_box(struct intel_uncore_box *box)
+{
+ intel_generic_uncore_mmio_init_box(box);
+
+ /* Clear the global freeze bit */
+ if (box->io_addr)
+ writel(0, box->io_addr + PTL_UNCORE_GLOBAL_CTL_OFFSET);
+}
+
+static struct intel_uncore_ops ptl_uncore_sncu_ops = {
+ .init_box = ptl_uncore_sncu_init_box,
+ .exit_box = uncore_mmio_exit_box,
+ .disable_box = intel_generic_uncore_mmio_disable_box,
+ .enable_box = intel_generic_uncore_mmio_enable_box,
+ .disable_event = intel_generic_uncore_mmio_disable_event,
+ .enable_event = intel_generic_uncore_mmio_enable_event,
+ .read_counter = uncore_mmio_read_counter,
+};
+
+static struct intel_uncore_type ptl_uncore_sncu = {
+ .name = "sncu",
+ .ops = &ptl_uncore_sncu_ops,
+ .mmio_map_size = 0xf00,
+};
+
+static struct intel_uncore_type ptl_uncore_hbo = {
+ .name = "hbo",
+ .mmio_map_size = 0xf00,
+};
+
+static struct intel_uncore_type *ptl_uncores[UNCORE_PTL_MAX_NUM_UNCORE_TYPES] = {
+ [UNCORE_PTL_TYPE_IMC] = &ptl_uncore_imc,
+ [UNCORE_PTL_TYPE_SNCU] = &ptl_uncore_sncu,
+ [UNCORE_PTL_TYPE_HBO] = &ptl_uncore_hbo,
+};
+
+#define UNCORE_PTL_MMIO_EXTRA_UNCORES 1
+
+static struct intel_uncore_type *ptl_mmio_extra_uncores[UNCORE_PTL_MMIO_EXTRA_UNCORES] = {
+ &adl_uncore_imc_free_running,
+};
+
+void ptl_uncore_mmio_init(void)
+{
+ uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO,
+ UNCORE_PTL_MMIO_EXTRA_UNCORES,
+ ptl_mmio_extra_uncores,
+ UNCORE_PTL_MAX_NUM_UNCORE_TYPES,
+ ptl_uncores);
+}
+
+static struct intel_uncore_type *ptl_msr_uncores[] = {
+ &mtl_uncore_cbox,
+ NULL
+};
+
+void ptl_uncore_cpu_init(void)
+{
+ mtl_uncore_cbox.num_boxes = 6;
+ mtl_uncore_cbox.ops = &lnl_uncore_msr_ops;
+ uncore_msr_uncores = ptl_msr_uncores;
+}
+
+/* end of Panther Lake uncore support */
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 2824dc9950be..e1f370b8d065 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -6409,9 +6409,11 @@ static void uncore_type_customized_copy(struct intel_uncore_type *to_type,
to_type->get_topology = from_type->get_topology;
if (from_type->cleanup_mapping)
to_type->cleanup_mapping = from_type->cleanup_mapping;
+ if (from_type->mmio_map_size)
+ to_type->mmio_map_size = from_type->mmio_map_size;
}
-static struct intel_uncore_type **
+struct intel_uncore_type **
uncore_get_uncores(enum uncore_access_type type_id, int num_extra,
struct intel_uncore_type **extra, int max_num_types,
struct intel_uncore_type **uncores)
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 5d27194a2efa..afdbda2dd7b7 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -34,6 +34,7 @@
#include <linux/syscore_ops.h>
#include <clocksource/hyperv_timer.h>
#include <linux/highmem.h>
+#include <linux/export.h>
void *hv_hypercall_pg;
EXPORT_SYMBOL_GPL(hv_hypercall_pg);
@@ -391,40 +392,6 @@ static void __init hv_stimer_setup_percpu_clockev(void)
old_setup_percpu_clockev();
}
-#if IS_ENABLED(CONFIG_HYPERV_VTL_MODE)
-static u8 __init get_vtl(void)
-{
- u64 control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_REGISTERS;
- struct hv_input_get_vp_registers *input;
- struct hv_output_get_vp_registers *output;
- unsigned long flags;
- u64 ret;
-
- local_irq_save(flags);
- input = *this_cpu_ptr(hyperv_pcpu_input_arg);
- output = *this_cpu_ptr(hyperv_pcpu_output_arg);
-
- memset(input, 0, struct_size(input, names, 1));
- input->partition_id = HV_PARTITION_ID_SELF;
- input->vp_index = HV_VP_INDEX_SELF;
- input->input_vtl.as_uint8 = 0;
- input->names[0] = HV_REGISTER_VSM_VP_STATUS;
-
- ret = hv_do_hypercall(control, input, output);
- if (hv_result_success(ret)) {
- ret = output->values[0].reg8 & HV_X64_VTL_MASK;
- } else {
- pr_err("Failed to get VTL(error: %lld) exiting...\n", ret);
- BUG();
- }
-
- local_irq_restore(flags);
- return ret;
-}
-#else
-static inline u8 get_vtl(void) { return 0; }
-#endif
-
/*
* This function is to be invoked early in the boot sequence after the
* hypervisor has been detected.
@@ -707,3 +674,36 @@ bool hv_is_hyperv_initialized(void)
return hypercall_msr.enable;
}
EXPORT_SYMBOL_GPL(hv_is_hyperv_initialized);
+
+int hv_apicid_to_vp_index(u32 apic_id)
+{
+ u64 control;
+ u64 status;
+ unsigned long irq_flags;
+ struct hv_get_vp_from_apic_id_in *input;
+ u32 *output, ret;
+
+ local_irq_save(irq_flags);
+
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ memset(input, 0, sizeof(*input));
+ input->partition_id = HV_PARTITION_ID_SELF;
+ input->apic_ids[0] = apic_id;
+
+ output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_INDEX_FROM_APIC_ID;
+ status = hv_do_hypercall(control, input, output);
+ ret = output[0];
+
+ local_irq_restore(irq_flags);
+
+ if (!hv_result_success(status)) {
+ pr_err("failed to get vp index from apic id %d, status %#llx\n",
+ apic_id, status);
+ return -EINVAL;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(hv_apicid_to_vp_index);
diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c
index 4580936dcb03..042e8712d8de 100644
--- a/arch/x86/hyperv/hv_vtl.c
+++ b/arch/x86/hyperv/hv_vtl.c
@@ -56,7 +56,12 @@ static void __noreturn hv_vtl_restart(char __maybe_unused *cmd)
void __init hv_vtl_init_platform(void)
{
- pr_info("Linux runs in Hyper-V Virtual Trust Level\n");
+ /*
+ * This function is a no-op if the VTL mode is not enabled.
+ * If it is, this function runs if and only the kernel boots in
+ * VTL2 which the x86 hv initialization path makes sure of.
+ */
+ pr_info("Linux runs in Hyper-V Virtual Trust Level %d\n", ms_hyperv.vtl);
x86_platform.realmode_reserve = x86_init_noop;
x86_platform.realmode_init = x86_init_noop;
@@ -207,63 +212,23 @@ free_lock:
return ret;
}
-static int hv_vtl_apicid_to_vp_id(u32 apic_id)
-{
- u64 control;
- u64 status;
- unsigned long irq_flags;
- struct hv_get_vp_from_apic_id_in *input;
- u32 *output, ret;
-
- local_irq_save(irq_flags);
-
- input = *this_cpu_ptr(hyperv_pcpu_input_arg);
- memset(input, 0, sizeof(*input));
- input->partition_id = HV_PARTITION_ID_SELF;
- input->apic_ids[0] = apic_id;
-
- output = *this_cpu_ptr(hyperv_pcpu_output_arg);
-
- control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_ID_FROM_APIC_ID;
- status = hv_do_hypercall(control, input, output);
- ret = output[0];
-
- local_irq_restore(irq_flags);
-
- if (!hv_result_success(status)) {
- pr_err("failed to get vp id from apic id %d, status %#llx\n",
- apic_id, status);
- return -EINVAL;
- }
-
- return ret;
-}
-
-static int hv_vtl_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip)
+static int hv_vtl_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip, unsigned int cpu)
{
- int vp_id, cpu;
-
- /* Find the logical CPU for the APIC ID */
- for_each_present_cpu(cpu) {
- if (arch_match_cpu_phys_id(cpu, apicid))
- break;
- }
- if (cpu >= nr_cpu_ids)
- return -EINVAL;
+ int vp_index;
pr_debug("Bringing up CPU with APIC ID %d in VTL2...\n", apicid);
- vp_id = hv_vtl_apicid_to_vp_id(apicid);
+ vp_index = hv_apicid_to_vp_index(apicid);
- if (vp_id < 0) {
+ if (vp_index < 0) {
pr_err("Couldn't find CPU with APIC ID %d\n", apicid);
return -EINVAL;
}
- if (vp_id > ms_hyperv.max_vp_index) {
- pr_err("Invalid CPU id %d for APIC ID %d\n", vp_id, apicid);
+ if (vp_index > ms_hyperv.max_vp_index) {
+ pr_err("Invalid CPU id %d for APIC ID %d\n", vp_index, apicid);
return -EINVAL;
}
- return hv_vtl_bringup_vcpu(vp_id, cpu, start_eip);
+ return hv_vtl_bringup_vcpu(vp_index, cpu, start_eip);
}
int __init hv_vtl_early_init(void)
diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c
index 31f0d29cbc5e..090f5ac9f492 100644
--- a/arch/x86/hyperv/irqdomain.c
+++ b/arch/x86/hyperv/irqdomain.c
@@ -10,6 +10,7 @@
#include <linux/pci.h>
#include <linux/irq.h>
+#include <linux/export.h>
#include <asm/mshyperv.h>
static int hv_map_interrupt(union hv_device_id device_id, bool level,
@@ -46,7 +47,7 @@ static int hv_map_interrupt(union hv_device_id device_id, bool level,
if (nr_bank < 0) {
local_irq_restore(flags);
pr_err("%s: unable to generate VP set\n", __func__);
- return EINVAL;
+ return -EINVAL;
}
intr_desc->target.flags = HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET;
@@ -66,7 +67,7 @@ static int hv_map_interrupt(union hv_device_id device_id, bool level,
if (!hv_result_success(status))
hv_status_err(status, "\n");
- return hv_result(status);
+ return hv_result_to_errno(status);
}
static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *old_entry)
@@ -88,7 +89,10 @@ static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *old_entry)
status = hv_do_hypercall(HVCALL_UNMAP_DEVICE_INTERRUPT, input, NULL);
local_irq_restore(flags);
- return hv_result(status);
+ if (!hv_result_success(status))
+ hv_status_err(status, "\n");
+
+ return hv_result_to_errno(status);
}
#ifdef CONFIG_PCI_MSI
@@ -169,13 +173,34 @@ static union hv_device_id hv_build_pci_dev_id(struct pci_dev *dev)
return dev_id;
}
-static int hv_map_msi_interrupt(struct pci_dev *dev, int cpu, int vector,
- struct hv_interrupt_entry *entry)
+/**
+ * hv_map_msi_interrupt() - "Map" the MSI IRQ in the hypervisor.
+ * @data: Describes the IRQ
+ * @out_entry: Hypervisor (MSI) interrupt entry (can be NULL)
+ *
+ * Map the IRQ in the hypervisor by issuing a MAP_DEVICE_INTERRUPT hypercall.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int hv_map_msi_interrupt(struct irq_data *data,
+ struct hv_interrupt_entry *out_entry)
{
- union hv_device_id device_id = hv_build_pci_dev_id(dev);
+ struct irq_cfg *cfg = irqd_cfg(data);
+ struct hv_interrupt_entry dummy;
+ union hv_device_id device_id;
+ struct msi_desc *msidesc;
+ struct pci_dev *dev;
+ int cpu;
- return hv_map_interrupt(device_id, false, cpu, vector, entry);
+ msidesc = irq_data_get_msi_desc(data);
+ dev = msi_desc_to_pci_dev(msidesc);
+ device_id = hv_build_pci_dev_id(dev);
+ cpu = cpumask_first(irq_data_get_effective_affinity_mask(data));
+
+ return hv_map_interrupt(device_id, false, cpu, cfg->vector,
+ out_entry ? out_entry : &dummy);
}
+EXPORT_SYMBOL_GPL(hv_map_msi_interrupt);
static inline void entry_to_msi_msg(struct hv_interrupt_entry *entry, struct msi_msg *msg)
{
@@ -188,13 +213,11 @@ static inline void entry_to_msi_msg(struct hv_interrupt_entry *entry, struct msi
static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry *old_entry);
static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
{
+ struct hv_interrupt_entry *stored_entry;
+ struct irq_cfg *cfg = irqd_cfg(data);
struct msi_desc *msidesc;
struct pci_dev *dev;
- struct hv_interrupt_entry out_entry, *stored_entry;
- struct irq_cfg *cfg = irqd_cfg(data);
- const cpumask_t *affinity;
- int cpu;
- u64 status;
+ int ret;
msidesc = irq_data_get_msi_desc(data);
dev = msi_desc_to_pci_dev(msidesc);
@@ -204,9 +227,6 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
return;
}
- affinity = irq_data_get_effective_affinity_mask(data);
- cpu = cpumask_first_and(affinity, cpu_online_mask);
-
if (data->chip_data) {
/*
* This interrupt is already mapped. Let's unmap first.
@@ -219,14 +239,12 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
stored_entry = data->chip_data;
data->chip_data = NULL;
- status = hv_unmap_msi_interrupt(dev, stored_entry);
+ ret = hv_unmap_msi_interrupt(dev, stored_entry);
kfree(stored_entry);
- if (status != HV_STATUS_SUCCESS) {
- hv_status_debug(status, "failed to unmap\n");
+ if (ret)
return;
- }
}
stored_entry = kzalloc(sizeof(*stored_entry), GFP_ATOMIC);
@@ -235,15 +253,14 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
return;
}
- status = hv_map_msi_interrupt(dev, cpu, cfg->vector, &out_entry);
- if (status != HV_STATUS_SUCCESS) {
+ ret = hv_map_msi_interrupt(data, stored_entry);
+ if (ret) {
kfree(stored_entry);
return;
}
- *stored_entry = out_entry;
data->chip_data = stored_entry;
- entry_to_msi_msg(&out_entry, msg);
+ entry_to_msi_msg(data->chip_data, msg);
return;
}
@@ -257,7 +274,6 @@ static void hv_teardown_msi_irq(struct pci_dev *dev, struct irq_data *irqd)
{
struct hv_interrupt_entry old_entry;
struct msi_msg msg;
- u64 status;
if (!irqd->chip_data) {
pr_debug("%s: no chip data\n!", __func__);
@@ -270,10 +286,7 @@ static void hv_teardown_msi_irq(struct pci_dev *dev, struct irq_data *irqd)
kfree(irqd->chip_data);
irqd->chip_data = NULL;
- status = hv_unmap_msi_interrupt(dev, &old_entry);
-
- if (status != HV_STATUS_SUCCESS)
- hv_status_err(status, "\n");
+ (void)hv_unmap_msi_interrupt(dev, &old_entry);
}
static void hv_msi_free_irq(struct irq_domain *domain,
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
index 09a165a3c41e..ade6c665c97e 100644
--- a/arch/x86/hyperv/ivm.c
+++ b/arch/x86/hyperv/ivm.c
@@ -9,6 +9,8 @@
#include <linux/bitfield.h>
#include <linux/types.h>
#include <linux/slab.h>
+#include <linux/cpu.h>
+#include <linux/export.h>
#include <asm/svm.h>
#include <asm/sev.h>
#include <asm/io.h>
@@ -289,7 +291,7 @@ static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa)
free_page((unsigned long)vmsa);
}
-int hv_snp_boot_ap(u32 cpu, unsigned long start_ip)
+int hv_snp_boot_ap(u32 apic_id, unsigned long start_ip, unsigned int cpu)
{
struct sev_es_save_area *vmsa = (struct sev_es_save_area *)
__get_free_page(GFP_KERNEL | __GFP_ZERO);
@@ -298,10 +300,16 @@ int hv_snp_boot_ap(u32 cpu, unsigned long start_ip)
u64 ret, retry = 5;
struct hv_enable_vp_vtl *start_vp_input;
unsigned long flags;
+ int vp_index;
if (!vmsa)
return -ENOMEM;
+ /* Find the Hyper-V VP index which might be not the same as APIC ID */
+ vp_index = hv_apicid_to_vp_index(apic_id);
+ if (vp_index < 0 || vp_index > ms_hyperv.max_vp_index)
+ return -EINVAL;
+
native_store_gdt(&gdtr);
vmsa->gdtr.base = gdtr.address;
@@ -349,7 +357,7 @@ int hv_snp_boot_ap(u32 cpu, unsigned long start_ip)
start_vp_input = (struct hv_enable_vp_vtl *)ap_start_input_arg;
memset(start_vp_input, 0, sizeof(*start_vp_input));
start_vp_input->partition_id = -1;
- start_vp_input->vp_index = cpu;
+ start_vp_input->vp_index = vp_index;
start_vp_input->target_vtl.target_vtl = ms_hyperv.vtl;
*(u64 *)&start_vp_input->vp_context = __pa(vmsa) | 1;
diff --git a/arch/x86/hyperv/nested.c b/arch/x86/hyperv/nested.c
index 1083dc8646f9..8ccbb7c4fc27 100644
--- a/arch/x86/hyperv/nested.c
+++ b/arch/x86/hyperv/nested.c
@@ -11,6 +11,7 @@
#include <linux/types.h>
+#include <linux/export.h>
#include <hyperv/hvhdk.h>
#include <asm/mshyperv.h>
#include <asm/tlbflush.h>
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 5ab1a4598d00..a03aa6f999d1 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -158,13 +158,13 @@ static inline bool acpi_has_cpu_in_madt(void)
}
#define ACPI_HAVE_ARCH_SET_ROOT_POINTER
-static inline void acpi_arch_set_root_pointer(u64 addr)
+static __always_inline void acpi_arch_set_root_pointer(u64 addr)
{
x86_init.acpi.set_root_pointer(addr);
}
#define ACPI_HAVE_ARCH_GET_ROOT_POINTER
-static inline u64 acpi_arch_get_root_pointer(void)
+static __always_inline u64 acpi_arch_get_root_pointer(void)
{
return x86_init.acpi.get_root_pointer();
}
diff --git a/arch/x86/include/asm/amd/fch.h b/arch/x86/include/asm/amd/fch.h
deleted file mode 100644
index 2cf5153edbc2..000000000000
--- a/arch/x86/include/asm/amd/fch.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_AMD_FCH_H_
-#define _ASM_X86_AMD_FCH_H_
-
-#define FCH_PM_BASE 0xFED80300
-
-/* Register offsets from PM base: */
-#define FCH_PM_DECODEEN 0x00
-#define FCH_PM_DECODEEN_SMBUS0SEL GENMASK(20, 19)
-#define FCH_PM_SCRATCH 0x80
-#define FCH_PM_S5_RESET_STATUS 0xC0
-
-#endif /* _ASM_X86_AMD_FCH_H_ */
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 68e10e30fe9b..07ba4935e873 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -313,9 +313,9 @@ struct apic {
u32 (*get_apic_id)(u32 id);
/* wakeup_secondary_cpu */
- int (*wakeup_secondary_cpu)(u32 apicid, unsigned long start_eip);
+ int (*wakeup_secondary_cpu)(u32 apicid, unsigned long start_eip, unsigned int cpu);
/* wakeup secondary CPU using 64-bit wakeup point */
- int (*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip);
+ int (*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip, unsigned int cpu);
char *name;
};
@@ -333,8 +333,8 @@ struct apic_override {
void (*send_IPI_self)(int vector);
u64 (*icr_read)(void);
void (*icr_write)(u32 low, u32 high);
- int (*wakeup_secondary_cpu)(u32 apicid, unsigned long start_eip);
- int (*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip);
+ int (*wakeup_secondary_cpu)(u32 apicid, unsigned long start_eip, unsigned int cpu);
+ int (*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip, unsigned int cpu);
};
/*
@@ -488,11 +488,14 @@ static inline void apic_setup_apic_calls(void) { }
extern void apic_ack_irq(struct irq_data *data);
+#define APIC_VECTOR_TO_BIT_NUMBER(v) ((unsigned int)(v) % 32)
+#define APIC_VECTOR_TO_REG_OFFSET(v) ((unsigned int)(v) / 32 * 0x10)
+
static inline bool lapic_vector_set_in_irr(unsigned int vector)
{
- u32 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
+ u32 irr = apic_read(APIC_IRR + APIC_VECTOR_TO_REG_OFFSET(vector));
- return !!(irr & (1U << (vector % 32)));
+ return !!(irr & (1U << APIC_VECTOR_TO_BIT_NUMBER(vector)));
}
static inline bool is_vector_pending(unsigned int vector)
@@ -500,6 +503,65 @@ static inline bool is_vector_pending(unsigned int vector)
return lapic_vector_set_in_irr(vector) || pi_pending_this_cpu(vector);
}
+#define MAX_APIC_VECTOR 256
+#define APIC_VECTORS_PER_REG 32
+
+/*
+ * Vector states are maintained by APIC in 32-bit registers that are
+ * 16 bytes aligned. The status of each vector is kept in a single
+ * bit.
+ */
+static inline int apic_find_highest_vector(void *bitmap)
+{
+ int vec;
+ u32 *reg;
+
+ for (vec = MAX_APIC_VECTOR - APIC_VECTORS_PER_REG; vec >= 0; vec -= APIC_VECTORS_PER_REG) {
+ reg = bitmap + APIC_VECTOR_TO_REG_OFFSET(vec);
+ if (*reg)
+ return __fls(*reg) + vec;
+ }
+
+ return -1;
+}
+
+static inline u32 apic_get_reg(void *regs, int reg)
+{
+ return *((u32 *) (regs + reg));
+}
+
+static inline void apic_set_reg(void *regs, int reg, u32 val)
+{
+ *((u32 *) (regs + reg)) = val;
+}
+
+static __always_inline u64 apic_get_reg64(void *regs, int reg)
+{
+ BUILD_BUG_ON(reg != APIC_ICR);
+ return *((u64 *) (regs + reg));
+}
+
+static __always_inline void apic_set_reg64(void *regs, int reg, u64 val)
+{
+ BUILD_BUG_ON(reg != APIC_ICR);
+ *((u64 *) (regs + reg)) = val;
+}
+
+static inline void apic_clear_vector(int vec, void *bitmap)
+{
+ clear_bit(APIC_VECTOR_TO_BIT_NUMBER(vec), bitmap + APIC_VECTOR_TO_REG_OFFSET(vec));
+}
+
+static inline void apic_set_vector(int vec, void *bitmap)
+{
+ set_bit(APIC_VECTOR_TO_BIT_NUMBER(vec), bitmap + APIC_VECTOR_TO_REG_OFFSET(vec));
+}
+
+static inline int apic_test_vector(int vec, void *bitmap)
+{
+ return test_bit(APIC_VECTOR_TO_BIT_NUMBER(vec), bitmap + APIC_VECTOR_TO_REG_OFFSET(vec));
+}
+
/*
* Warm reset vector position:
*/
diff --git a/arch/x86/include/asm/ce4100.h b/arch/x86/include/asm/ce4100.h
index 2930f560d7f3..e1f965bb1e31 100644
--- a/arch/x86/include/asm/ce4100.h
+++ b/arch/x86/include/asm/ce4100.h
@@ -4,4 +4,10 @@
int ce4100_pci_init(void);
+#ifdef CONFIG_SERIAL_8250
+void __init sdv_serial_fixup(void);
+#else
+static inline void sdv_serial_fixup(void) {};
+#endif
+
#endif
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 5b50e0e35129..602957dd2609 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -336,6 +336,7 @@
#define X86_FEATURE_AMD_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */
#define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */
#define X86_FEATURE_AMD_STIBP_ALWAYS_ON (13*32+17) /* Single Thread Indirect Branch Predictors always-on preferred */
+#define X86_FEATURE_AMD_IBRS_SAME_MODE (13*32+19) /* Indirect Branch Restricted Speculation same mode protection*/
#define X86_FEATURE_AMD_PPIN (13*32+23) /* "amd_ppin" Protected Processor Inventory Number */
#define X86_FEATURE_AMD_SSBD (13*32+24) /* Speculative Store Bypass Disable */
#define X86_FEATURE_VIRT_SSBD (13*32+25) /* "virt_ssbd" Virtualized Speculative Store Bypass Disable */
@@ -378,6 +379,7 @@
#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* "v_spec_ctrl" Virtual SPEC_CTRL */
#define X86_FEATURE_VNMI (15*32+25) /* "vnmi" Virtual NMI */
#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* SVME addr check */
+#define X86_FEATURE_BUS_LOCK_THRESHOLD (15*32+29) /* Bus lock threshold */
#define X86_FEATURE_IDLE_HLT (15*32+30) /* IDLE HLT intercept */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
@@ -446,6 +448,7 @@
#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" SEV-ES full debug state swap support */
#define X86_FEATURE_RMPREAD (19*32+21) /* RMPREAD instruction */
#define X86_FEATURE_SEGMENTED_RMP (19*32+23) /* Segmented RMP support */
+#define X86_FEATURE_ALLOWED_SEV_FEATURES (19*32+27) /* Allowed SEV Features */
#define X86_FEATURE_SVSM (19*32+28) /* "svsm" SVSM present */
#define X86_FEATURE_HV_INUSE_WR_ALLOWED (19*32+30) /* Allow Write to in-use hypervisor-owned pages */
@@ -453,10 +456,15 @@
#define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* No Nested Data Breakpoints */
#define X86_FEATURE_WRMSR_XX_BASE_NS (20*32+ 1) /* WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing */
#define X86_FEATURE_LFENCE_RDTSC (20*32+ 2) /* LFENCE always serializing / synchronizes RDTSC */
+#define X86_FEATURE_VERW_CLEAR (20*32+ 5) /* The memory form of VERW mitigates TSA */
#define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* Null Selector Clears Base */
+
#define X86_FEATURE_AUTOIBRS (20*32+ 8) /* Automatic IBRS */
#define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* SMM_CTL MSR is not present */
+#define X86_FEATURE_GP_ON_USER_CPUID (20*32+17) /* User CPUID faulting */
+
+#define X86_FEATURE_PREFETCHI (20*32+20) /* Prefetch Data/Instruction to Cache Level */
#define X86_FEATURE_SBPB (20*32+27) /* Selective Branch Prediction Barrier */
#define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* MSR_PRED_CMD[IBPB] flushes all branch type predictions */
#define X86_FEATURE_SRSO_NO (20*32+29) /* CPU is not affected by SRSO */
@@ -483,6 +491,9 @@
#define X86_FEATURE_PREFER_YMM (21*32+ 8) /* Avoid ZMM registers due to downclocking */
#define X86_FEATURE_APX (21*32+ 9) /* Advanced Performance Extensions */
#define X86_FEATURE_INDIRECT_THUNK_ITS (21*32+10) /* Use thunk for indirect branches in lower half of cacheline */
+#define X86_FEATURE_TSA_SQ_NO (21*32+11) /* AMD CPU not vulnerable to TSA-SQ */
+#define X86_FEATURE_TSA_L1_NO (21*32+12) /* AMD CPU not vulnerable to TSA-L1 */
+#define X86_FEATURE_CLEAR_CPU_BUF_VM (21*32+13) /* Clear CPU buffers using VERW before VMRUN */
/*
* BUG word(s)
@@ -538,5 +549,5 @@
#define X86_BUG_OLD_MICROCODE X86_BUG( 1*32+ 6) /* "old_microcode" CPU has old microcode, it is surely vulnerable to something */
#define X86_BUG_ITS X86_BUG( 1*32+ 7) /* "its" CPU is affected by Indirect Target Selection */
#define X86_BUG_ITS_NATIVE_ONLY X86_BUG( 1*32+ 8) /* "its_native_only" CPU is affected by ITS, VMX is not affected */
-
+#define X86_BUG_TSA X86_BUG( 1*32+ 9) /* "tsa" CPU is affected by Transient Scheduler Attacks */
#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index 363110e6b2e3..a2c1f2d24b64 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -9,6 +9,14 @@
#include <asm/cpufeature.h>
#include <asm/msr.h>
+/*
+ * Define bits that are always set to 1 in DR7, only bit 10 is
+ * architecturally reserved to '1'.
+ *
+ * This is also the init/reset value for DR7.
+ */
+#define DR7_FIXED_1 0x00000400
+
DECLARE_PER_CPU(unsigned long, cpu_dr7);
#ifndef CONFIG_PARAVIRT_XXL
@@ -100,8 +108,8 @@ static __always_inline void native_set_debugreg(int regno, unsigned long value)
static inline void hw_breakpoint_disable(void)
{
- /* Zero the control register for HW Breakpoint */
- set_debugreg(0UL, 7);
+ /* Reset the control register for HW Breakpoint */
+ set_debugreg(DR7_FIXED_1, 7);
/* Zero-out the individual HW breakpoint address registers */
set_debugreg(0UL, 0);
@@ -125,9 +133,12 @@ static __always_inline unsigned long local_db_save(void)
return 0;
get_debugreg(dr7, 7);
- dr7 &= ~0x400; /* architecturally set bit */
+
+ /* Architecturally set bit */
+ dr7 &= ~DR7_FIXED_1;
if (dr7)
- set_debugreg(0, 7);
+ set_debugreg(DR7_FIXED_1, 7);
+
/*
* Ensure the compiler doesn't lower the above statements into
* the critical section; disabling breakpoints late would not
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 1c94121acd3d..93e99d2583d6 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -118,7 +118,7 @@ enum xfeature {
XFEATURE_PKRU,
XFEATURE_PASID,
XFEATURE_CET_USER,
- XFEATURE_CET_KERNEL_UNUSED,
+ XFEATURE_CET_KERNEL,
XFEATURE_RSRVD_COMP_13,
XFEATURE_RSRVD_COMP_14,
XFEATURE_LBR,
@@ -142,7 +142,7 @@ enum xfeature {
#define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU)
#define XFEATURE_MASK_PASID (1 << XFEATURE_PASID)
#define XFEATURE_MASK_CET_USER (1 << XFEATURE_CET_USER)
-#define XFEATURE_MASK_CET_KERNEL (1 << XFEATURE_CET_KERNEL_UNUSED)
+#define XFEATURE_MASK_CET_KERNEL (1 << XFEATURE_CET_KERNEL)
#define XFEATURE_MASK_LBR (1 << XFEATURE_LBR)
#define XFEATURE_MASK_XTILE_CFG (1 << XFEATURE_XTILE_CFG)
#define XFEATURE_MASK_XTILE_DATA (1 << XFEATURE_XTILE_DATA)
@@ -269,6 +269,16 @@ struct cet_user_state {
};
/*
+ * State component 12 is Control-flow Enforcement supervisor states.
+ * This state includes SSP pointers for privilege levels 0 through 2.
+ */
+struct cet_supervisor_state {
+ u64 pl0_ssp;
+ u64 pl1_ssp;
+ u64 pl2_ssp;
+} __packed;
+
+/*
* State component 15: Architectural LBR configuration state.
* The size of Arch LBR state depends on the number of LBRs (lbr_depth).
*/
@@ -552,6 +562,31 @@ struct fpu_guest {
};
/*
+ * FPU state configuration data for fpu_guest.
+ * Initialized at boot time. Read only after init.
+ */
+struct vcpu_fpu_config {
+ /*
+ * @size:
+ *
+ * The default size of the register state buffer in guest FPUs.
+ * Includes all supported features except independent managed
+ * features and features which have to be requested by user space
+ * before usage.
+ */
+ unsigned int size;
+
+ /*
+ * @features:
+ *
+ * The default supported features bitmap in guest FPUs. Does not
+ * include independent managed features and features which have to
+ * be requested by user space before usage.
+ */
+ u64 features;
+};
+
+/*
* FPU state configuration data. Initialized at boot time. Read only after init.
*/
struct fpu_state_config {
@@ -567,8 +602,9 @@ struct fpu_state_config {
* @default_size:
*
* The default size of the register state buffer. Includes all
- * supported features except independent managed features and
- * features which have to be requested by user space before usage.
+ * supported features except independent managed features,
+ * guest-only features and features which have to be requested by
+ * user space before usage.
*/
unsigned int default_size;
@@ -584,8 +620,8 @@ struct fpu_state_config {
* @default_features:
*
* The default supported features bitmap. Does not include
- * independent managed features and features which have to
- * be requested by user space before usage.
+ * independent managed features, guest-only features and features
+ * which have to be requested by user space before usage.
*/
u64 default_features;
/*
@@ -606,5 +642,6 @@ struct fpu_state_config {
/* FPU state configuration information */
extern struct fpu_state_config fpu_kernel_cfg, fpu_user_cfg;
+extern struct vcpu_fpu_config guest_default_cfg;
#endif /* _ASM_X86_FPU_TYPES_H */
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index b308a76afbb7..7a7dc9d56027 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -46,9 +46,13 @@
/* Features which are dynamically enabled for a process on request */
#define XFEATURE_MASK_USER_DYNAMIC XFEATURE_MASK_XTILE_DATA
+/* Supervisor features which are enabled only in guest FPUs */
+#define XFEATURE_MASK_GUEST_SUPERVISOR XFEATURE_MASK_CET_KERNEL
+
/* All currently supported supervisor features */
#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID | \
- XFEATURE_MASK_CET_USER)
+ XFEATURE_MASK_CET_USER | \
+ XFEATURE_MASK_GUEST_SUPERVISOR)
/*
* A supervisor state component may not always contain valuable information,
@@ -75,8 +79,7 @@
* Unsupported supervisor features. When a supervisor feature in this mask is
* supported in the future, move it to the supported supervisor feature mask.
*/
-#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT | \
- XFEATURE_MASK_CET_KERNEL)
+#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT)
/* All supervisor states including supported and unsupported states. */
#define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 8b1b1abcef15..5a68e9db6518 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -5,7 +5,7 @@
#if defined(CONFIG_CC_IS_CLANG) && CONFIG_CLANG_VERSION < 170000
#define __head __section(".head.text") __no_sanitize_undefined __no_stack_protector
#else
-#define __head __section(".head.text") __no_sanitize_undefined
+#define __head __section(".head.text") __no_sanitize_undefined __no_kstack_erase
#endif
struct x86_mapping_info {
diff --git a/arch/x86/include/asm/intel_telemetry.h b/arch/x86/include/asm/intel_telemetry.h
index 43b7657febca..944637a4e6de 100644
--- a/arch/x86/include/asm/intel_telemetry.h
+++ b/arch/x86/include/asm/intel_telemetry.h
@@ -59,18 +59,6 @@ struct telemetry_plt_config {
};
struct telemetry_core_ops {
- int (*get_sampling_period)(u8 *pss_min_period, u8 *pss_max_period,
- u8 *ioss_min_period, u8 *ioss_max_period);
-
- int (*get_eventconfig)(struct telemetry_evtconfig *pss_evtconfig,
- struct telemetry_evtconfig *ioss_evtconfig,
- int pss_len, int ioss_len);
-
- int (*update_events)(struct telemetry_evtconfig pss_evtconfig,
- struct telemetry_evtconfig ioss_evtconfig);
-
- int (*set_sampling_period)(u8 pss_period, u8 ioss_period);
-
int (*get_trace_verbosity)(enum telemetry_unit telem_unit,
u32 *verbosity);
@@ -84,11 +72,6 @@ struct telemetry_core_ops {
int (*read_eventlog)(enum telemetry_unit telem_unit,
struct telemetry_evtlog *evtlog,
int len, int log_all_evts);
-
- int (*add_events)(u8 num_pss_evts, u8 num_ioss_evts,
- u32 *pss_evtmap, u32 *ioss_evtmap);
-
- int (*reset_events)(void);
};
int telemetry_set_pltdata(const struct telemetry_core_ops *ops,
@@ -101,35 +84,15 @@ struct telemetry_plt_config *telemetry_get_pltdata(void);
int telemetry_get_evtname(enum telemetry_unit telem_unit,
const char **name, int len);
-int telemetry_update_events(struct telemetry_evtconfig pss_evtconfig,
- struct telemetry_evtconfig ioss_evtconfig);
-
-int telemetry_add_events(u8 num_pss_evts, u8 num_ioss_evts,
- u32 *pss_evtmap, u32 *ioss_evtmap);
-
-int telemetry_reset_events(void);
-
-int telemetry_get_eventconfig(struct telemetry_evtconfig *pss_config,
- struct telemetry_evtconfig *ioss_config,
- int pss_len, int ioss_len);
-
int telemetry_read_events(enum telemetry_unit telem_unit,
struct telemetry_evtlog *evtlog, int len);
-int telemetry_raw_read_events(enum telemetry_unit telem_unit,
- struct telemetry_evtlog *evtlog, int len);
-
int telemetry_read_eventlog(enum telemetry_unit telem_unit,
struct telemetry_evtlog *evtlog, int len);
int telemetry_raw_read_eventlog(enum telemetry_unit telem_unit,
struct telemetry_evtlog *evtlog, int len);
-int telemetry_get_sampling_period(u8 *pss_min_period, u8 *pss_max_period,
- u8 *ioss_min_period, u8 *ioss_max_period);
-
-int telemetry_set_sampling_period(u8 pss_period, u8 ioss_period);
-
int telemetry_set_trace_verbosity(enum telemetry_unit telem_unit,
u32 verbosity);
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 5036f13ab69f..5a0d42464d44 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -26,7 +26,22 @@ enum {
IRQ_REMAP_X2APIC_MODE,
};
-struct vcpu_data {
+/*
+ * This is mainly used to communicate information back-and-forth
+ * between SVM and IOMMU for setting up and tearing down posted
+ * interrupt
+ */
+struct amd_iommu_pi_data {
+ u64 vapic_addr; /* Physical address of the vCPU's vAPIC. */
+ u32 ga_tag;
+ u32 vector; /* Guest vector of the interrupt */
+ int cpu;
+ bool ga_log_intr;
+ bool is_guest_mode;
+ void *ir_data;
+};
+
+struct intel_iommu_pi_data {
u64 pi_desc_addr; /* Physical address of PI Descriptor */
u32 vector; /* Guest vector of the interrupt */
};
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 9a9b21b78905..b30e5474c18e 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -44,13 +44,13 @@ static __always_inline void native_irq_enable(void)
static __always_inline void native_safe_halt(void)
{
- mds_idle_clear_cpu_buffers();
+ x86_idle_clear_cpu_buffers();
asm volatile("sti; hlt": : :"memory");
}
static __always_inline void native_halt(void)
{
- mds_idle_clear_cpu_buffers();
+ x86_idle_clear_cpu_buffers();
asm volatile("hlt": : :"memory");
}
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 823c0434bbad..18a5c3119e1a 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -21,6 +21,7 @@ KVM_X86_OP(has_emulated_msr)
KVM_X86_OP(vcpu_after_set_cpuid)
KVM_X86_OP(vm_init)
KVM_X86_OP_OPTIONAL(vm_destroy)
+KVM_X86_OP_OPTIONAL(vm_pre_destroy)
KVM_X86_OP_OPTIONAL_RET0(vcpu_precreate)
KVM_X86_OP(vcpu_create)
KVM_X86_OP(vcpu_free)
@@ -48,7 +49,6 @@ KVM_X86_OP(set_idt)
KVM_X86_OP(get_gdt)
KVM_X86_OP(set_gdt)
KVM_X86_OP(sync_dirty_debug_regs)
-KVM_X86_OP(set_dr6)
KVM_X86_OP(set_dr7)
KVM_X86_OP(cache_reg)
KVM_X86_OP(get_rflags)
@@ -111,10 +111,11 @@ KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging)
KVM_X86_OP_OPTIONAL(vcpu_blocking)
KVM_X86_OP_OPTIONAL(vcpu_unblocking)
KVM_X86_OP_OPTIONAL(pi_update_irte)
-KVM_X86_OP_OPTIONAL(pi_start_assignment)
+KVM_X86_OP_OPTIONAL(pi_start_bypass)
KVM_X86_OP_OPTIONAL(apicv_pre_state_restore)
KVM_X86_OP_OPTIONAL(apicv_post_state_restore)
KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt)
+KVM_X86_OP_OPTIONAL(protected_apic_has_interrupt)
KVM_X86_OP_OPTIONAL(set_hv_timer)
KVM_X86_OP_OPTIONAL(cancel_hv_timer)
KVM_X86_OP(setup_mce)
@@ -126,6 +127,7 @@ KVM_X86_OP(enable_smi_window)
#endif
KVM_X86_OP_OPTIONAL(dev_get_attr)
KVM_X86_OP_OPTIONAL(mem_enc_ioctl)
+KVM_X86_OP_OPTIONAL(vcpu_mem_enc_ioctl)
KVM_X86_OP_OPTIONAL(mem_enc_register_region)
KVM_X86_OP_OPTIONAL(mem_enc_unregister_region)
KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from)
@@ -136,7 +138,7 @@ KVM_X86_OP(check_emulate_instruction)
KVM_X86_OP(apic_init_signal_blocked)
KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush)
KVM_X86_OP_OPTIONAL(migrate_timers)
-KVM_X86_OP(msr_filter_changed)
+KVM_X86_OP(recalc_msr_intercepts)
KVM_X86_OP(complete_emulated_msr)
KVM_X86_OP(vcpu_deliver_sipi_vector)
KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9c971f846108..f19a76d3ca0e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -31,6 +31,7 @@
#include <asm/apic.h>
#include <asm/pvclock-abi.h>
+#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/mtrr.h>
#include <asm/msr-index.h>
@@ -126,7 +127,8 @@
KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_HV_TLB_FLUSH \
KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
-#define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE KVM_ARCH_REQ(34)
+#define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE \
+ KVM_ARCH_REQ_FLAGS(34, KVM_REQUEST_WAIT)
#define CR0_RESERVED_BITS \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
@@ -248,7 +250,6 @@ enum x86_intercept_stage;
#define DR7_BP_EN_MASK 0x000000ff
#define DR7_GE (1 << 9)
#define DR7_GD (1 << 13)
-#define DR7_FIXED_1 0x00000400
#define DR7_VOLATILE 0xffff2bff
#define KVM_GUESTDBG_VALID_MASK \
@@ -296,6 +297,7 @@ enum x86_intercept_stage;
*/
#define KVM_APIC_PV_EOI_PENDING 1
+struct kvm_kernel_irqfd;
struct kvm_kernel_irq_routing_entry;
/*
@@ -412,7 +414,6 @@ struct kvm_rmap_head {
};
struct kvm_pio_request {
- unsigned long linear_rip;
unsigned long count;
int in;
int port;
@@ -609,8 +610,15 @@ struct kvm_pmu {
struct kvm_pmu_ops;
enum {
- KVM_DEBUGREG_BP_ENABLED = 1,
- KVM_DEBUGREG_WONT_EXIT = 2,
+ KVM_DEBUGREG_BP_ENABLED = BIT(0),
+ KVM_DEBUGREG_WONT_EXIT = BIT(1),
+ /*
+ * Guest debug registers (DR0-3, DR6 and DR7) are saved/restored by
+ * hardware on exit from or enter to guest. KVM needn't switch them.
+ * DR0-3, DR6 and DR7 are set to their architectural INIT value on VM
+ * exit, host values need to be restored.
+ */
+ KVM_DEBUGREG_AUTO_SWITCH = BIT(2),
};
struct kvm_mtrr {
@@ -693,8 +701,13 @@ struct kvm_vcpu_hv {
struct kvm_vcpu_hv_tlb_flush_fifo tlb_flush_fifo[HV_NR_TLB_FLUSH_FIFOS];
- /* Preallocated buffer for handling hypercalls passing sparse vCPU set */
+ /*
+ * Preallocated buffers for handling hypercalls that pass sparse vCPU
+ * sets (for high vCPU counts, they're too large to comfortably fit on
+ * the stack).
+ */
u64 sparse_banks[HV_MAX_SPARSE_VCPU_BANKS];
+ DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
struct hv_vp_assist_page vp_assist_page;
@@ -757,6 +770,7 @@ enum kvm_only_cpuid_leafs {
CPUID_8000_0022_EAX,
CPUID_7_2_EDX,
CPUID_24_0_EBX,
+ CPUID_8000_0021_ECX,
NR_KVM_CPU_CAPS,
NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
@@ -911,6 +925,7 @@ struct kvm_vcpu_arch {
bool emulate_regs_need_sync_to_vcpu;
bool emulate_regs_need_sync_from_vcpu;
int (*complete_userspace_io)(struct kvm_vcpu *vcpu);
+ unsigned long cui_linear_rip;
gpa_t time;
s8 pvclock_tsc_shift;
@@ -1028,6 +1043,7 @@ struct kvm_vcpu_arch {
int pending_ioapic_eoi;
int pending_external_vector;
+ int highest_stale_pending_ioapic_eoi;
/* be preempted when it's in kernel-mode(cpl=0) */
bool preempted_in_kernel;
@@ -1305,6 +1321,12 @@ enum kvm_apicv_inhibit {
*/
APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED,
+ /*
+ * AVIC is disabled because the vCPU's APIC ID is beyond the max
+ * supported by AVIC/x2AVIC, i.e. the vCPU is unaddressable.
+ */
+ APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG,
+
NR_APICV_INHIBIT_REASONS,
};
@@ -1323,7 +1345,8 @@ enum kvm_apicv_inhibit {
__APICV_INHIBIT_REASON(IRQWIN), \
__APICV_INHIBIT_REASON(PIT_REINJ), \
__APICV_INHIBIT_REASON(SEV), \
- __APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED)
+ __APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED), \
+ __APICV_INHIBIT_REASON(PHYSICAL_ID_TOO_BIG)
struct kvm_arch {
unsigned long n_used_mmu_pages;
@@ -1335,7 +1358,7 @@ struct kvm_arch {
bool has_private_mem;
bool has_protected_state;
bool pre_fault_allowed;
- struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
+ struct hlist_head *mmu_page_hash;
struct list_head active_mmu_pages;
/*
* A list of kvm_mmu_page structs that, if zapped, could possibly be
@@ -1364,11 +1387,13 @@ struct kvm_arch {
#define __KVM_HAVE_ARCH_NONCOHERENT_DMA
atomic_t noncoherent_dma_count;
-#define __KVM_HAVE_ARCH_ASSIGNED_DEVICE
- atomic_t assigned_device_count;
+ unsigned long nr_possible_bypass_irqs;
+
+#ifdef CONFIG_KVM_IOAPIC
struct kvm_pic *vpic;
struct kvm_ioapic *vioapic;
struct kvm_pit *vpit;
+#endif
atomic_t vapics_in_nmi_mode;
struct mutex apic_map_lock;
struct kvm_apic_map __rcu *apic_map;
@@ -1383,12 +1408,8 @@ struct kvm_arch {
gpa_t wall_clock;
- bool mwait_in_guest;
- bool hlt_in_guest;
- bool pause_in_guest;
- bool cstate_in_guest;
+ u64 disabled_exits;
- unsigned long irq_sources_bitmap;
s64 kvmclock_offset;
/*
@@ -1417,9 +1438,6 @@ struct kvm_arch {
struct delayed_work kvmclock_update_work;
struct delayed_work kvmclock_sync_work;
- /* reads protected by irq_srcu, writes by irq_lock */
- struct hlist_head mask_notifier_list;
-
#ifdef CONFIG_KVM_HYPERV
struct kvm_hv hyperv;
#endif
@@ -1442,6 +1460,7 @@ struct kvm_arch {
bool x2apic_format;
bool x2apic_broadcast_quirk_disabled;
+ bool has_mapped_host_mmio;
bool guest_can_read_msr_platform_info;
bool exception_payload_enabled;
@@ -1571,6 +1590,13 @@ struct kvm_arch {
struct kvm_mmu_memory_cache split_desc_cache;
gfn_t gfn_direct_bits;
+
+ /*
+ * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A Zero
+ * value indicates CPU dirty logging is unsupported or disabled in
+ * current VM.
+ */
+ int cpu_dirty_log_size;
};
struct kvm_vm_stat {
@@ -1658,6 +1684,12 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
return dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL;
}
+enum kvm_x86_run_flags {
+ KVM_RUN_FORCE_IMMEDIATE_EXIT = BIT(0),
+ KVM_RUN_LOAD_GUEST_DR6 = BIT(1),
+ KVM_RUN_LOAD_DEBUGCTL = BIT(2),
+};
+
struct kvm_x86_ops {
const char *name;
@@ -1674,6 +1706,7 @@ struct kvm_x86_ops {
unsigned int vm_size;
int (*vm_init)(struct kvm *kvm);
void (*vm_destroy)(struct kvm *kvm);
+ void (*vm_pre_destroy)(struct kvm *kvm);
/* Create, but do not attach this VCPU */
int (*vcpu_precreate)(struct kvm *kvm);
@@ -1685,6 +1718,12 @@ struct kvm_x86_ops {
void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
void (*vcpu_put)(struct kvm_vcpu *vcpu);
+ /*
+ * Mask of DEBUGCTL bits that are owned by the host, i.e. that need to
+ * match the host's value even while the guest is active.
+ */
+ const u64 HOST_OWNED_DEBUGCTL;
+
void (*update_exception_bitmap)(struct kvm_vcpu *vcpu);
int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
@@ -1707,7 +1746,6 @@ struct kvm_x86_ops {
void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu);
- void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value);
void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
@@ -1738,7 +1776,7 @@ struct kvm_x86_ops {
int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu,
- bool force_immediate_exit);
+ u64 run_flags);
int (*handle_exit)(struct kvm_vcpu *vcpu,
enum exit_fastpath_completion exit_fastpath);
int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
@@ -1823,11 +1861,6 @@ struct kvm_x86_ops {
struct x86_exception *exception);
void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
- /*
- * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A zero
- * value indicates CPU dirty logging is unsupported or disabled.
- */
- int cpu_dirty_log_size;
void (*update_cpu_dirty_logging)(struct kvm_vcpu *vcpu);
const struct kvm_x86_nested_ops *nested_ops;
@@ -1835,12 +1868,14 @@ struct kvm_x86_ops {
void (*vcpu_blocking)(struct kvm_vcpu *vcpu);
void (*vcpu_unblocking)(struct kvm_vcpu *vcpu);
- int (*pi_update_irte)(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set);
- void (*pi_start_assignment)(struct kvm *kvm);
+ int (*pi_update_irte)(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+ unsigned int host_irq, uint32_t guest_irq,
+ struct kvm_vcpu *vcpu, u32 vector);
+ void (*pi_start_bypass)(struct kvm *kvm);
void (*apicv_pre_state_restore)(struct kvm_vcpu *vcpu);
void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu);
+ bool (*protected_apic_has_interrupt)(struct kvm_vcpu *vcpu);
int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
bool *expired);
@@ -1857,6 +1892,7 @@ struct kvm_x86_ops {
int (*dev_get_attr)(u32 group, u64 attr, u64 *val);
int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp);
+ int (*vcpu_mem_enc_ioctl)(struct kvm_vcpu *vcpu, void __user *argp);
int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp);
int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp);
int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
@@ -1872,7 +1908,7 @@ struct kvm_x86_ops {
int (*enable_l2_tlb_flush)(struct kvm_vcpu *vcpu);
void (*migrate_timers)(struct kvm_vcpu *vcpu);
- void (*msr_filter_changed)(struct kvm_vcpu *vcpu);
+ void (*recalc_msr_intercepts)(struct kvm_vcpu *vcpu);
int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err);
void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector);
@@ -1930,6 +1966,8 @@ struct kvm_arch_async_pf {
extern u32 __read_mostly kvm_nr_uret_msrs;
extern bool __read_mostly allow_smaller_maxphyaddr;
extern bool __read_mostly enable_apicv;
+extern bool __read_mostly enable_ipiv;
+extern bool __read_mostly enable_device_posted_irqs;
extern struct kvm_x86_ops kvm_x86_ops;
#define kvm_x86_call(func) static_call(kvm_x86_##func)
@@ -1947,7 +1985,7 @@ void kvm_x86_vendor_exit(void);
#define __KVM_HAVE_ARCH_VM_ALLOC
static inline struct kvm *kvm_arch_alloc_vm(void)
{
- return __vmalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ return kvzalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT);
}
#define __KVM_HAVE_ARCH_VM_FREE
@@ -1992,7 +2030,7 @@ void kvm_mmu_vendor_module_exit(void);
void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
int kvm_mmu_create(struct kvm_vcpu *vcpu);
-void kvm_mmu_init_vm(struct kvm *kvm);
+int kvm_mmu_init_vm(struct kvm *kvm);
void kvm_mmu_uninit_vm(struct kvm *kvm);
void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm,
@@ -2023,19 +2061,6 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
const void *val, int bytes);
-struct kvm_irq_mask_notifier {
- void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked);
- int irq;
- struct hlist_node link;
-};
-
-void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
- struct kvm_irq_mask_notifier *kimn);
-void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
- struct kvm_irq_mask_notifier *kimn);
-void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
- bool mask);
-
extern bool tdp_enabled;
u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
@@ -2194,9 +2219,6 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state,
return !!(*irq_state);
}
-int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level);
-void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
-
void kvm_inject_nmi(struct kvm_vcpu *vcpu);
int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu);
@@ -2333,6 +2355,7 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
int kvm_add_user_return_msr(u32 msr);
int kvm_find_user_return_msr(u32 msr);
int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask);
+void kvm_user_return_msr_update_cache(unsigned int index, u64 val);
static inline bool kvm_is_supported_user_return_msr(u32 msr)
{
@@ -2372,9 +2395,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_vcpu **dest_vcpu);
-void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
- struct kvm_lapic_irq *irq);
-
static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq)
{
/* We can only post Fixed and LowPrio IRQs */
@@ -2416,7 +2436,12 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \
KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS | \
KVM_X86_QUIRK_SLOT_ZAP_ALL | \
- KVM_X86_QUIRK_STUFF_FEATURE_MSRS)
+ KVM_X86_QUIRK_STUFF_FEATURE_MSRS | \
+ KVM_X86_QUIRK_IGNORE_GUEST_PAT)
+
+#define KVM_X86_CONDITIONAL_QUIRKS \
+ (KVM_X86_QUIRK_CD_NW_CLEARED | \
+ KVM_X86_QUIRK_IGNORE_GUEST_PAT)
/*
* KVM previously used a u32 field in kvm_run to indicate the hypercall was
@@ -2427,7 +2452,7 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
static inline bool kvm_arch_has_irq_bypass(void)
{
- return enable_apicv && irq_remapping_cap(IRQ_POSTING_CAP);
+ return enable_device_posted_irqs;
}
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h
index e988bac0a4a1..3c2de4ce3b10 100644
--- a/arch/x86/include/asm/module.h
+++ b/arch/x86/include/asm/module.h
@@ -5,12 +5,20 @@
#include <asm-generic/module.h>
#include <asm/orc_types.h>
+struct its_array {
+#ifdef CONFIG_MITIGATION_ITS
+ void **pages;
+ int num;
+#endif
+};
+
struct mod_arch_specific {
#ifdef CONFIG_UNWINDER_ORC
unsigned int num_orcs;
int *orc_unwind_ip;
struct orc_entry *orc_unwind;
#endif
+ struct its_array its_pages;
};
#endif /* _ASM_X86_MODULE_H */
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 778444310cfb..abc4659f5809 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -112,12 +112,6 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
return hv_status;
}
-/* Hypercall to the L0 hypervisor */
-static inline u64 hv_do_nested_hypercall(u64 control, void *input, void *output)
-{
- return hv_do_hypercall(control | HV_HYPERCALL_NESTED, input, output);
-}
-
/* Fast hypercall with 8 bytes of input and no output */
static inline u64 _hv_do_fast_hypercall8(u64 control, u64 input1)
{
@@ -165,13 +159,6 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
return _hv_do_fast_hypercall8(control, input1);
}
-static inline u64 hv_do_fast_nested_hypercall8(u16 code, u64 input1)
-{
- u64 control = (u64)code | HV_HYPERCALL_FAST_BIT | HV_HYPERCALL_NESTED;
-
- return _hv_do_fast_hypercall8(control, input1);
-}
-
/* Fast hypercall with 16 bytes of input */
static inline u64 _hv_do_fast_hypercall16(u64 control, u64 input1, u64 input2)
{
@@ -223,13 +210,6 @@ static inline u64 hv_do_fast_hypercall16(u16 code, u64 input1, u64 input2)
return _hv_do_fast_hypercall16(control, input1, input2);
}
-static inline u64 hv_do_fast_nested_hypercall16(u16 code, u64 input1, u64 input2)
-{
- u64 control = (u64)code | HV_HYPERCALL_FAST_BIT | HV_HYPERCALL_NESTED;
-
- return _hv_do_fast_hypercall16(control, input1, input2);
-}
-
extern struct hv_vp_assist_page **hv_vp_assist_page;
static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu)
@@ -262,6 +242,8 @@ static inline void hv_apic_init(void) {}
struct irq_domain *hv_create_pci_msi_domain(void);
+int hv_map_msi_interrupt(struct irq_data *data,
+ struct hv_interrupt_entry *out_entry);
int hv_map_ioapic_interrupt(int ioapic_id, bool level, int vcpu, int vector,
struct hv_interrupt_entry *entry);
int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry);
@@ -269,11 +251,12 @@ int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry);
#ifdef CONFIG_AMD_MEM_ENCRYPT
bool hv_ghcb_negotiate_protocol(void);
void __noreturn hv_ghcb_terminate(unsigned int set, unsigned int reason);
-int hv_snp_boot_ap(u32 cpu, unsigned long start_ip);
+int hv_snp_boot_ap(u32 apic_id, unsigned long start_ip, unsigned int cpu);
#else
static inline bool hv_ghcb_negotiate_protocol(void) { return false; }
static inline void hv_ghcb_terminate(unsigned int set, unsigned int reason) {}
-static inline int hv_snp_boot_ap(u32 cpu, unsigned long start_ip) { return 0; }
+static inline int hv_snp_boot_ap(u32 apic_id, unsigned long start_ip,
+ unsigned int cpu) { return 0; }
#endif
#if defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST)
@@ -307,6 +290,7 @@ static __always_inline u64 hv_raw_get_msr(unsigned int reg)
{
return native_rdmsrq(reg);
}
+int hv_apicid_to_vp_index(u32 apic_id);
#else /* CONFIG_HYPERV */
static inline void hyperv_init(void) {}
@@ -328,6 +312,7 @@ static inline void hv_set_msr(unsigned int reg, u64 value) { }
static inline u64 hv_get_msr(unsigned int reg) { return 0; }
static inline void hv_set_non_nested_msr(unsigned int reg, u64 value) { }
static inline u64 hv_get_non_nested_msr(unsigned int reg) { return 0; }
+static inline int hv_apicid_to_vp_index(u32 apic_id) { return -EINVAL; }
#endif /* CONFIG_HYPERV */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index b7dded3c8113..b65c3ba5fa14 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -419,6 +419,7 @@
#define DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI (1UL << 12)
#define DEBUGCTLMSR_FREEZE_IN_SMM_BIT 14
#define DEBUGCTLMSR_FREEZE_IN_SMM (1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT)
+#define DEBUGCTLMSR_RTM_DEBUG BIT(15)
#define MSR_PEBS_FRONTEND 0x000003f7
@@ -628,6 +629,7 @@
#define MSR_AMD64_OSVW_STATUS 0xc0010141
#define MSR_AMD_PPIN_CTL 0xc00102f0
#define MSR_AMD_PPIN 0xc00102f1
+#define MSR_AMD64_CPUID_FN_7 0xc0011002
#define MSR_AMD64_CPUID_FN_1 0xc0011004
#define MSR_AMD64_LS_CFG 0xc0011020
#define MSR_AMD64_DC_CFG 0xc0011022
@@ -732,6 +734,11 @@
#define MSR_AMD64_PERF_CNTR_GLOBAL_CTL 0xc0000301
#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR 0xc0000302
+/* AMD Hardware Feedback Support MSRs */
+#define MSR_AMD_WORKLOAD_CLASS_CONFIG 0xc0000500
+#define MSR_AMD_WORKLOAD_CLASS_ID 0xc0000501
+#define MSR_AMD_WORKLOAD_HRST 0xc0000502
+
/* AMD Last Branch Record MSRs */
#define MSR_AMD64_LBR_SELECT 0xc000010e
@@ -830,6 +837,7 @@
#define MSR_K7_HWCR_SMMLOCK BIT_ULL(MSR_K7_HWCR_SMMLOCK_BIT)
#define MSR_K7_HWCR_IRPERF_EN_BIT 30
#define MSR_K7_HWCR_IRPERF_EN BIT_ULL(MSR_K7_HWCR_IRPERF_EN_BIT)
+#define MSR_K7_HWCR_CPUID_USER_DIS_BIT 35
#define MSR_K7_FID_VID_CTL 0xc0010041
#define MSR_K7_FID_VID_STATUS 0xc0010042
#define MSR_K7_HWCR_CPB_DIS_BIT 25
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 4096b8af4ba7..9c2ea29e12a9 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -228,7 +228,7 @@ static __always_inline u64 rdpmc(int counter)
#endif /* !CONFIG_PARAVIRT_XXL */
/* Instruction opcode for WRMSRNS supported in binutils >= 2.40 */
-#define WRMSRNS _ASM_BYTES(0x0f,0x01,0xc6)
+#define ASM_WRMSRNS _ASM_BYTES(0x0f,0x01,0xc6)
/* Non-serializing WRMSR, when available. Falls back to a serializing WRMSR. */
static __always_inline void wrmsrns(u32 msr, u64 val)
@@ -237,7 +237,7 @@ static __always_inline void wrmsrns(u32 msr, u64 val)
* WRMSR is 2 bytes. WRMSRNS is 3 bytes. Pad WRMSR with a redundant
* DS prefix to avoid a trailing NOP.
*/
- asm volatile("1: " ALTERNATIVE("ds wrmsr", WRMSRNS, X86_FEATURE_WRMSRNS)
+ asm volatile("1: " ALTERNATIVE("ds wrmsr", ASM_WRMSRNS, X86_FEATURE_WRMSRNS)
"2: " _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR)
: : "c" (msr), "a" ((u32)val), "d" ((u32)(val >> 32)));
}
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index dd2b129b0418..6ca6516c7492 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -43,8 +43,6 @@ static __always_inline void __monitorx(const void *eax, u32 ecx, u32 edx)
static __always_inline void __mwait(u32 eax, u32 ecx)
{
- mds_idle_clear_cpu_buffers();
-
/*
* Use the instruction mnemonic with implicit operands, as the LLVM
* assembler fails to assemble the mnemonic with explicit operands:
@@ -80,7 +78,7 @@ static __always_inline void __mwait(u32 eax, u32 ecx)
*/
static __always_inline void __mwaitx(u32 eax, u32 ebx, u32 ecx)
{
- /* No MDS buffer clear as this is AMD/HYGON only */
+ /* No need for TSA buffer clearing on AMD */
/* "mwaitx %eax, %ebx, %ecx" */
asm volatile(".byte 0x0f, 0x01, 0xfb"
@@ -98,7 +96,6 @@ static __always_inline void __mwaitx(u32 eax, u32 ebx, u32 ecx)
*/
static __always_inline void __sti_mwait(u32 eax, u32 ecx)
{
- mds_idle_clear_cpu_buffers();
asm volatile("sti; mwait" :: "a" (eax), "c" (ecx));
}
@@ -115,21 +112,29 @@ static __always_inline void __sti_mwait(u32 eax, u32 ecx)
*/
static __always_inline void mwait_idle_with_hints(u32 eax, u32 ecx)
{
+ if (need_resched())
+ return;
+
+ x86_idle_clear_cpu_buffers();
+
if (static_cpu_has_bug(X86_BUG_MONITOR) || !current_set_polling_and_test()) {
const void *addr = &current_thread_info()->flags;
alternative_input("", "clflush (%[addr])", X86_BUG_CLFLUSH_MONITOR, [addr] "a" (addr));
__monitor(addr, 0, 0);
- if (!need_resched()) {
- if (ecx & 1) {
- __mwait(eax, ecx);
- } else {
- __sti_mwait(eax, ecx);
- raw_local_irq_disable();
- }
+ if (need_resched())
+ goto out;
+
+ if (ecx & 1) {
+ __mwait(eax, ecx);
+ } else {
+ __sti_mwait(eax, ecx);
+ raw_local_irq_disable();
}
}
+
+out:
current_clr_polling();
}
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 20d754b98f3f..10f261678749 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -302,25 +302,31 @@
.endm
/*
- * Macro to execute VERW instruction that mitigate transient data sampling
- * attacks such as MDS. On affected systems a microcode update overloaded VERW
- * instruction to also clear the CPU buffers. VERW clobbers CFLAGS.ZF.
- *
+ * Macro to execute VERW insns that mitigate transient data sampling
+ * attacks such as MDS or TSA. On affected systems a microcode update
+ * overloaded VERW insns to also clear the CPU buffers. VERW clobbers
+ * CFLAGS.ZF.
* Note: Only the memory operand variant of VERW clears the CPU buffers.
*/
-.macro CLEAR_CPU_BUFFERS
+.macro __CLEAR_CPU_BUFFERS feature
#ifdef CONFIG_X86_64
- ALTERNATIVE "", "verw mds_verw_sel(%rip)", X86_FEATURE_CLEAR_CPU_BUF
+ ALTERNATIVE "", "verw x86_verw_sel(%rip)", \feature
#else
/*
* In 32bit mode, the memory operand must be a %cs reference. The data
* segments may not be usable (vm86 mode), and the stack segment may not
* be flat (ESPFIX32).
*/
- ALTERNATIVE "", "verw %cs:mds_verw_sel", X86_FEATURE_CLEAR_CPU_BUF
+ ALTERNATIVE "", "verw %cs:x86_verw_sel", \feature
#endif
.endm
+#define CLEAR_CPU_BUFFERS \
+ __CLEAR_CPU_BUFFERS X86_FEATURE_CLEAR_CPU_BUF
+
+#define VM_CLEAR_CPU_BUFFERS \
+ __CLEAR_CPU_BUFFERS X86_FEATURE_CLEAR_CPU_BUF_VM
+
#ifdef CONFIG_X86_64
.macro CLEAR_BRANCH_HISTORY
ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP
@@ -567,24 +573,24 @@ DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
DECLARE_STATIC_KEY_FALSE(switch_vcpu_ibpb);
-DECLARE_STATIC_KEY_FALSE(mds_idle_clear);
+DECLARE_STATIC_KEY_FALSE(cpu_buf_idle_clear);
DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush);
DECLARE_STATIC_KEY_FALSE(cpu_buf_vm_clear);
-extern u16 mds_verw_sel;
+extern u16 x86_verw_sel;
#include <asm/segment.h>
/**
- * mds_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability
+ * x86_clear_cpu_buffers - Buffer clearing support for different x86 CPU vulns
*
* This uses the otherwise unused and obsolete VERW instruction in
* combination with microcode which triggers a CPU buffer flush when the
* instruction is executed.
*/
-static __always_inline void mds_clear_cpu_buffers(void)
+static __always_inline void x86_clear_cpu_buffers(void)
{
static const u16 ds = __KERNEL_DS;
@@ -601,14 +607,15 @@ static __always_inline void mds_clear_cpu_buffers(void)
}
/**
- * mds_idle_clear_cpu_buffers - Mitigation for MDS vulnerability
+ * x86_idle_clear_cpu_buffers - Buffer clearing support in idle for the MDS
+ * and TSA vulnerabilities.
*
* Clear CPU buffers if the corresponding static key is enabled
*/
-static __always_inline void mds_idle_clear_cpu_buffers(void)
+static __always_inline void x86_idle_clear_cpu_buffers(void)
{
- if (static_branch_likely(&mds_idle_clear))
- mds_clear_cpu_buffers();
+ if (static_branch_likely(&cpu_buf_idle_clear))
+ x86_clear_cpu_buffers();
}
#endif /* __ASSEMBLER__ */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 5ddba366d3b4..97954c936c54 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -777,6 +777,9 @@ static inline pgprotval_t check_pgprot(pgprot_t pgprot)
static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
{
phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT;
+ /* This bit combination is used to mark shadow stacks */
+ WARN_ON_ONCE((pgprot_val(pgprot) & (_PAGE_DIRTY | _PAGE_RW)) ==
+ _PAGE_DIRTY);
pfn ^= protnone_mask(pgprot_val(pgprot));
pfn &= PTE_PFN_MASK;
return __pte(pfn | check_pgprot(pgprot));
@@ -1073,22 +1076,6 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
*/
#define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd))
-/*
- * Conversion functions: convert a page and protection to a page entry,
- * and a page entry and page directory to the page they refer to.
- *
- * (Currently stuck as a macro because of indirect forward reference
- * to linux/mm.h:page_to_nid())
- */
-#define mk_pte(page, pgprot) \
-({ \
- pgprot_t __pgprot = pgprot; \
- \
- WARN_ON_ONCE((pgprot_val(__pgprot) & (_PAGE_DIRTY | _PAGE_RW)) == \
- _PAGE_DIRTY); \
- pfn_pte(page_to_pfn(page), __pgprot); \
-})
-
static inline int pmd_bad(pmd_t pmd)
{
return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) !=
@@ -1353,8 +1340,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
#define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0)
-#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
-
#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
extern int pmdp_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp,
@@ -1576,7 +1561,7 @@ static inline pte_t pte_swp_mkexclusive(pte_t pte)
return pte_set_flags(pte, _PAGE_SWP_EXCLUSIVE);
}
-static inline int pte_swp_exclusive(pte_t pte)
+static inline bool pte_swp_exclusive(pte_t pte)
{
return pte_flags(pte) & _PAGE_SWP_EXCLUSIVE;
}
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index b74ec5c3643b..a5731fb1e9dd 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -214,9 +214,6 @@ enum page_cache_mode {
#define PAGE_READONLY __pg(__PP| 0|_USR|___A|__NX| 0| 0| 0)
#define PAGE_READONLY_EXEC __pg(__PP| 0|_USR|___A| 0| 0| 0| 0)
-#define __PAGE_KERNEL (__PP|__RW| 0|___A|__NX|___D| 0|___G)
-#define __PAGE_KERNEL_EXEC (__PP|__RW| 0|___A| 0|___D| 0|___G)
-
/*
* Page tables needs to have Write=1 in order for any lower PTEs to be
* writable. This includes shadow stack memory (Write=0, Dirty=1)
diff --git a/arch/x86/include/asm/posted_intr.h b/arch/x86/include/asm/posted_intr.h
index de788b400fba..a5f761fbf45b 100644
--- a/arch/x86/include/asm/posted_intr.h
+++ b/arch/x86/include/asm/posted_intr.h
@@ -1,19 +1,24 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _X86_POSTED_INTR_H
#define _X86_POSTED_INTR_H
+
+#include <asm/cmpxchg.h>
+#include <asm/rwonce.h>
#include <asm/irq_vectors.h>
+#include <linux/bitmap.h>
+
#define POSTED_INTR_ON 0
#define POSTED_INTR_SN 1
#define PID_TABLE_ENTRY_VALID 1
+#define NR_PIR_VECTORS 256
+#define NR_PIR_WORDS (NR_PIR_VECTORS / BITS_PER_LONG)
+
/* Posted-Interrupt Descriptor */
struct pi_desc {
- union {
- u32 pir[8]; /* Posted interrupt requested */
- u64 pir64[4];
- };
+ unsigned long pir[NR_PIR_WORDS]; /* Posted interrupt requested */
union {
struct {
u16 notifications; /* Suppress and outstanding bits */
@@ -26,6 +31,65 @@ struct pi_desc {
u32 rsvd[6];
} __aligned(64);
+/*
+ * De-multiplexing posted interrupts is on the performance path, the code
+ * below is written to optimize the cache performance based on the following
+ * considerations:
+ * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently
+ * accessed by both CPU and IOMMU.
+ * 2.During software processing of posted interrupts, the CPU needs to do
+ * natural width read and xchg for checking and clearing posted interrupt
+ * request (PIR), a 256 bit field within the PID.
+ * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache
+ * line when posting interrupts and setting control bits.
+ * 4.The CPU can access the cache line a magnitude faster than the IOMMU.
+ * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID
+ * cache line. The cache line states after each operation are as follows,
+ * assuming a 64-bit kernel:
+ * CPU IOMMU PID Cache line state
+ * ---------------------------------------------------------------
+ *...read64 exclusive
+ *...lock xchg64 modified
+ *... post/atomic swap invalid
+ *...-------------------------------------------------------------
+ *
+ * To reduce L1 data cache miss, it is important to avoid contention with
+ * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used
+ * when processing posted interrupts in software, e.g. to dispatch interrupt
+ * handlers for posted MSIs, or to move interrupts from the PIR to the vIRR
+ * in KVM.
+ *
+ * In addition, the code is trying to keep the cache line state consistent
+ * as much as possible. e.g. when making a copy and clearing the PIR
+ * (assuming non-zero PIR bits are present in the entire PIR), it does:
+ * read, read, read, read, xchg, xchg, xchg, xchg
+ * instead of:
+ * read, xchg, read, xchg, read, xchg, read, xchg
+ */
+static __always_inline bool pi_harvest_pir(unsigned long *pir,
+ unsigned long *pir_vals)
+{
+ unsigned long pending = 0;
+ int i;
+
+ for (i = 0; i < NR_PIR_WORDS; i++) {
+ pir_vals[i] = READ_ONCE(pir[i]);
+ pending |= pir_vals[i];
+ }
+
+ if (!pending)
+ return false;
+
+ for (i = 0; i < NR_PIR_WORDS; i++) {
+ if (!pir_vals[i])
+ continue;
+
+ pir_vals[i] = arch_xchg(&pir[i], 0);
+ }
+
+ return true;
+}
+
static inline bool pi_test_and_set_on(struct pi_desc *pi_desc)
{
return test_and_set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control);
@@ -43,12 +107,12 @@ static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc)
static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
{
- return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
+ return test_and_set_bit(vector, pi_desc->pir);
}
static inline bool pi_is_pir_empty(struct pi_desc *pi_desc)
{
- return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS);
+ return bitmap_empty(pi_desc->pir, NR_VECTORS);
}
static inline void pi_set_sn(struct pi_desc *pi_desc)
@@ -81,6 +145,11 @@ static inline bool pi_test_sn(struct pi_desc *pi_desc)
return test_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control);
}
+static inline bool pi_test_pir(int vector, struct pi_desc *pi_desc)
+{
+ return test_bit(vector, (unsigned long *)pi_desc->pir);
+}
+
/* Non-atomic helpers */
static inline void __pi_set_sn(struct pi_desc *pi_desc)
{
@@ -105,7 +174,7 @@ static inline bool pi_pending_this_cpu(unsigned int vector)
if (WARN_ON_ONCE(vector > NR_VECTORS || vector < FIRST_EXTERNAL_VECTOR))
return false;
- return test_bit(vector, (unsigned long *)pid->pir);
+ return test_bit(vector, pid->pir);
}
extern void intel_posted_msi_init(void);
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index f607081a022a..e406a1e92c63 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -78,7 +78,7 @@ extern unsigned char secondary_startup_64[];
extern unsigned char secondary_startup_64_no_verify[];
#endif
-static inline size_t real_mode_size_needed(void)
+static __always_inline size_t real_mode_size_needed(void)
{
if (real_mode_header)
return 0; /* already allocated. */
diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h
index bd6afe805cf6..feb93b50e990 100644
--- a/arch/x86/include/asm/resctrl.h
+++ b/arch/x86/include/asm/resctrl.h
@@ -177,7 +177,7 @@ static inline bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 ignored,
return READ_ONCE(tsk->rmid) == rmid;
}
-static inline void resctrl_sched_in(struct task_struct *tsk)
+static inline void resctrl_arch_sched_in(struct task_struct *tsk)
{
if (static_branch_likely(&rdt_enable_key))
__resctrl_sched_in(tsk);
@@ -196,25 +196,22 @@ static inline u32 resctrl_arch_rmid_idx_encode(u32 ignored, u32 rmid)
/* x86 can always read an rmid, nothing needs allocating */
struct rdt_resource;
-static inline void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, int evtid)
+static inline void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r,
+ enum resctrl_event_id evtid)
{
might_sleep();
return NULL;
-};
+}
-static inline void resctrl_arch_mon_ctx_free(struct rdt_resource *r, int evtid,
- void *ctx) { };
+static inline void resctrl_arch_mon_ctx_free(struct rdt_resource *r,
+ enum resctrl_event_id evtid,
+ void *ctx) { }
-u64 resctrl_arch_get_prefetch_disable_bits(void);
-int resctrl_arch_pseudo_lock_fn(void *_plr);
-int resctrl_arch_measure_cycles_lat_fn(void *_plr);
-int resctrl_arch_measure_l2_residency(void *_plr);
-int resctrl_arch_measure_l3_residency(void *_plr);
void resctrl_cpu_detect(struct cpuinfo_x86 *c);
#else
-static inline void resctrl_sched_in(struct task_struct *tsk) {}
+static inline void resctrl_arch_sched_in(struct task_struct *tsk) {}
static inline void resctrl_cpu_detect(struct cpuinfo_x86 *c) {}
#endif /* CONFIG_X86_CPU_RESCTRL */
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index 8d9f1c9aaa4c..61f56cdaccb5 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -4,6 +4,7 @@
#include <asm/page.h>
#include <asm-generic/set_memory.h>
+#include <asm/pgtable.h>
#define set_memory_rox set_memory_rox
int set_memory_rox(unsigned long addr, int numpages);
@@ -37,6 +38,7 @@ int set_memory_rox(unsigned long addr, int numpages);
* The caller is required to take care of these.
*/
+int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot);
int _set_memory_uc(unsigned long addr, int numpages);
int _set_memory_wc(unsigned long addr, int numpages);
int _set_memory_wt(unsigned long addr, int numpages);
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 6324f4c6c545..692af46603a1 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -68,6 +68,8 @@ extern void x86_ce4100_early_setup(void);
static inline void x86_ce4100_early_setup(void) { }
#endif
+#include <linux/kexec_handover.h>
+
#ifndef _SETUP
#include <asm/espfix.h>
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index 6158893786d6..89075ff19afa 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -223,6 +223,18 @@ struct snp_tsc_info_resp {
u8 rsvd2[100];
} __packed;
+/*
+ * Obtain the mean TSC frequency by decreasing the nominal TSC frequency with
+ * TSC_FACTOR as documented in the SNP Firmware ABI specification:
+ *
+ * GUEST_TSC_FREQ * (1 - (TSC_FACTOR * 0.00001))
+ *
+ * which is equivalent to:
+ *
+ * GUEST_TSC_FREQ -= (GUEST_TSC_FREQ * TSC_FACTOR) / 100000;
+ */
+#define SNP_SCALE_TSC_FREQ(freq, factor) ((freq) - (freq) * (factor) / 100000)
+
struct snp_guest_req {
void *req_buf;
size_t req_sz;
@@ -231,6 +243,7 @@ struct snp_guest_req {
size_t resp_sz;
u64 exit_code;
+ u64 exitinfo2;
unsigned int vmpck_id;
u8 msg_version;
u8 msg_type;
@@ -282,8 +295,11 @@ struct snp_secrets_page {
u8 svsm_guest_vmpl;
u8 rsvd3[3];
+ /* The percentage decrease from nominal to mean TSC frequency. */
+ u32 tsc_factor;
+
/* Remainder of page */
- u8 rsvd4[3744];
+ u8 rsvd4[3740];
} __packed;
struct snp_msg_desc {
@@ -415,6 +431,10 @@ struct svsm_call {
#define SVSM_ATTEST_SERVICES 0
#define SVSM_ATTEST_SINGLE_SERVICE 1
+#define SVSM_VTPM_CALL(x) ((2ULL << 32) | (x))
+#define SVSM_VTPM_QUERY 0
+#define SVSM_VTPM_CMD 1
+
#ifdef CONFIG_AMD_MEM_ENCRYPT
extern u8 snp_vmpl;
@@ -441,7 +461,7 @@ static __always_inline void sev_es_nmi_complete(void)
cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
__sev_es_nmi_complete();
}
-extern int __init sev_es_efi_map_ghcbs(pgd_t *pgd);
+extern int __init sev_es_efi_map_ghcbs_cas(pgd_t *pgd);
extern void sev_enable(struct boot_params *bp);
/*
@@ -482,8 +502,6 @@ static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate)
return rc;
}
-struct snp_guest_request_ioctl;
-
void setup_ghcb(void);
void early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
unsigned long npages);
@@ -509,8 +527,9 @@ void snp_kexec_begin(void);
int snp_msg_init(struct snp_msg_desc *mdesc, int vmpck_id);
struct snp_msg_desc *snp_msg_alloc(void);
void snp_msg_free(struct snp_msg_desc *mdesc);
-int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req,
- struct snp_guest_request_ioctl *rio);
+int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req);
+
+int snp_svsm_vtpm_send_command(u8 *buffer);
void __init snp_secure_tsc_prepare(void);
void __init snp_secure_tsc_init(void);
@@ -550,7 +569,7 @@ static inline void sev_es_ist_enter(struct pt_regs *regs) { }
static inline void sev_es_ist_exit(void) { }
static inline int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { return 0; }
static inline void sev_es_nmi_complete(void) { }
-static inline int sev_es_efi_map_ghcbs(pgd_t *pgd) { return 0; }
+static inline int sev_es_efi_map_ghcbs_cas(pgd_t *pgd) { return 0; }
static inline void sev_enable(struct boot_params *bp) { }
static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) { return 0; }
static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) { return 0; }
@@ -581,8 +600,9 @@ static inline void snp_kexec_begin(void) { }
static inline int snp_msg_init(struct snp_msg_desc *mdesc, int vmpck_id) { return -1; }
static inline struct snp_msg_desc *snp_msg_alloc(void) { return NULL; }
static inline void snp_msg_free(struct snp_msg_desc *mdesc) { }
-static inline int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req,
- struct snp_guest_request_ioctl *rio) { return -ENODEV; }
+static inline int snp_send_guest_request(struct snp_msg_desc *mdesc,
+ struct snp_guest_req *req) { return -ENODEV; }
+static inline int snp_svsm_vtpm_send_command(u8 *buffer) { return -ENODEV; }
static inline void __init snp_secure_tsc_prepare(void) { }
static inline void __init snp_secure_tsc_init(void) { }
diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h
index a28ff6b14145..8bc074c8d7c6 100644
--- a/arch/x86/include/asm/shared/tdx.h
+++ b/arch/x86/include/asm/shared/tdx.h
@@ -13,6 +13,7 @@
/* TDX module Call Leaf IDs */
#define TDG_VP_VMCALL 0
#define TDG_VP_INFO 1
+#define TDG_MR_RTMR_EXTEND 2
#define TDG_VP_VEINFO_GET 3
#define TDG_MR_REPORT 4
#define TDG_MEM_PAGE_ACCEPT 6
@@ -67,11 +68,20 @@
#define TD_CTLS_LOCK BIT_ULL(TD_CTLS_LOCK_BIT)
/* TDX hypercall Leaf IDs */
+#define TDVMCALL_GET_TD_VM_CALL_INFO 0x10000
#define TDVMCALL_MAP_GPA 0x10001
#define TDVMCALL_GET_QUOTE 0x10002
#define TDVMCALL_REPORT_FATAL_ERROR 0x10003
+#define TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT 0x10004ULL
-#define TDVMCALL_STATUS_RETRY 1
+/*
+ * TDG.VP.VMCALL Status Codes (returned in R10)
+ */
+#define TDVMCALL_STATUS_SUCCESS 0x0000000000000000ULL
+#define TDVMCALL_STATUS_RETRY 0x0000000000000001ULL
+#define TDVMCALL_STATUS_INVALID_OPERAND 0x8000000000000000ULL
+#define TDVMCALL_STATUS_ALIGN_ERROR 0x8000000000000002ULL
+#define TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED 0x8000000000000003ULL
/*
* Bitmasks of exposed registers (with VMM).
diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h
index e770c4fc47f4..8727c7e21dd1 100644
--- a/arch/x86/include/asm/sighandling.h
+++ b/arch/x86/include/asm/sighandling.h
@@ -24,4 +24,26 @@ int ia32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs);
int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs);
int x32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs);
+/*
+ * To prevent immediate repeat of single step trap on return from SIGTRAP
+ * handler if the trap flag (TF) is set without an external debugger attached,
+ * clear the software event flag in the augmented SS, ensuring no single-step
+ * trap is pending upon ERETU completion.
+ *
+ * Note, this function should be called in sigreturn() before the original
+ * state is restored to make sure the TF is read from the entry frame.
+ */
+static __always_inline void prevent_single_step_upon_eretu(struct pt_regs *regs)
+{
+ /*
+ * If the trap flag (TF) is set, i.e., the sigreturn() SYSCALL instruction
+ * is being single-stepped, do not clear the software event flag in the
+ * augmented SS, thus a debugger won't skip over the following instruction.
+ */
+#ifdef CONFIG_X86_FRED
+ if (!(regs->flags & X86_EFLAGS_TF))
+ regs->fred_ss.swevent = 0;
+#endif
+}
+
#endif /* _ASM_X86_SIGHANDLING_H */
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 0c1c68039d6f..22bfebe6776d 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -112,7 +112,10 @@ void __noreturn hlt_play_dead(void);
void native_play_dead(void);
void play_dead_common(void);
void wbinvd_on_cpu(int cpu);
-int wbinvd_on_all_cpus(void);
+void wbinvd_on_all_cpus(void);
+void wbinvd_on_cpus_mask(struct cpumask *cpus);
+void wbnoinvd_on_all_cpus(void);
+void wbnoinvd_on_cpus_mask(struct cpumask *cpus);
void smp_kick_mwait_play_dead(void);
void __noreturn mwait_play_dead(unsigned int eax_hint);
@@ -148,10 +151,24 @@ static inline struct cpumask *cpu_l2c_shared_mask(int cpu)
#else /* !CONFIG_SMP */
#define wbinvd_on_cpu(cpu) wbinvd()
-static inline int wbinvd_on_all_cpus(void)
+static inline void wbinvd_on_all_cpus(void)
{
wbinvd();
- return 0;
+}
+
+static inline void wbinvd_on_cpus_mask(struct cpumask *cpus)
+{
+ wbinvd();
+}
+
+static inline void wbnoinvd_on_all_cpus(void)
+{
+ wbnoinvd();
+}
+
+static inline void wbnoinvd_on_cpus_mask(struct cpumask *cpus)
+{
+ wbnoinvd();
}
static inline struct cpumask *cpu_llc_shared_mask(int cpu)
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index ecda17efa042..fde2bd7af19e 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -104,9 +104,36 @@ static inline void wrpkru(u32 pkru)
}
#endif
+/*
+ * Write back all modified lines in all levels of cache associated with this
+ * logical processor to main memory, and then invalidate all caches. Depending
+ * on the micro-architecture, WBINVD (and WBNOINVD below) may or may not affect
+ * lower level caches associated with another logical processor that shares any
+ * level of this processor's cache hierarchy.
+ */
static __always_inline void wbinvd(void)
{
- asm volatile("wbinvd": : :"memory");
+ asm volatile("wbinvd" : : : "memory");
+}
+
+/* Instruction encoding provided for binutils backwards compatibility. */
+#define ASM_WBNOINVD _ASM_BYTES(0xf3,0x0f,0x09)
+
+/*
+ * Write back all modified lines in all levels of cache associated with this
+ * logical processor to main memory, but do NOT explicitly invalidate caches,
+ * i.e. leave all/most cache lines in the hierarchy in non-modified state.
+ */
+static __always_inline void wbnoinvd(void)
+{
+ /*
+ * Explicitly encode WBINVD if X86_FEATURE_WBNOINVD is unavailable even
+ * though WBNOINVD is backwards compatible (it's simply WBINVD with an
+ * ignored REP prefix), to guarantee that WBNOINVD isn't used if it
+ * needs to be avoided for any reason. For all supported usage in the
+ * kernel, WBINVD is functionally a superset of WBNOINVD.
+ */
+ alternative("wbinvd", ASM_WBNOINVD, X86_FEATURE_WBNOINVD);
}
static inline unsigned long __read_cr4(void)
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 9b7fa99ae951..ffc27f676243 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -116,6 +116,7 @@ enum {
INTERCEPT_INVPCID,
INTERCEPT_MCOMMIT,
INTERCEPT_TLBSYNC,
+ INTERCEPT_BUSLOCK,
INTERCEPT_IDLE_HLT = 166,
};
@@ -159,7 +160,12 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u64 avic_physical_id; /* Offset 0xf8 */
u8 reserved_7[8];
u64 vmsa_pa; /* Used for an SEV-ES guest */
- u8 reserved_8[720];
+ u8 reserved_8[16];
+ u16 bus_lock_counter; /* Offset 0x120 */
+ u8 reserved_9[22];
+ u64 allowed_sev_features; /* Offset 0x138 */
+ u64 guest_sev_features; /* Offset 0x140 */
+ u8 reserved_10[664];
/*
* Offset 0x3e0, 32 bytes reserved
* for use by hypervisor/software.
@@ -246,16 +252,21 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31
#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
+/*
+ * GA_LOG_INTR is a synthetic flag that's never propagated to hardware-visible
+ * tables. GA_LOG_INTR is set if the vCPU needs device posted IRQs to generate
+ * GA log interrupts to wake the vCPU (because it's blocking or about to block).
+ */
+#define AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR BIT_ULL(61)
+
#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK GENMASK_ULL(11, 0)
-#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12)
+#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK GENMASK_ULL(51, 12)
#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)
#define AVIC_PHYSICAL_ID_TABLE_SIZE_MASK (0xFFULL)
#define AVIC_DOORBELL_PHYSICAL_ID_MASK GENMASK_ULL(11, 0)
-#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
-
#define AVIC_UNACCEL_ACCESS_WRITE_MASK 1
#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0
#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF
@@ -284,13 +295,13 @@ enum avic_ipi_failure_cause {
static_assert((AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == AVIC_MAX_PHYSICAL_ID);
static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_MAX_PHYSICAL_ID);
-#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF)
-
#define SVM_SEV_FEAT_SNP_ACTIVE BIT(0)
#define SVM_SEV_FEAT_RESTRICTED_INJECTION BIT(3)
#define SVM_SEV_FEAT_ALTERNATE_INJECTION BIT(4)
#define SVM_SEV_FEAT_DEBUG_SWAP BIT(5)
+#define VMCB_ALLOWED_SEV_FEATURES_VALID BIT_ULL(63)
+
struct vmcb_seg {
u16 selector;
u16 attrib;
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index 7c488ff0c764..c10dbb74cd00 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -38,6 +38,13 @@ static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs)
return regs->orig_ax;
}
+static inline void syscall_set_nr(struct task_struct *task,
+ struct pt_regs *regs,
+ int nr)
+{
+ regs->orig_ax = nr;
+}
+
static inline void syscall_rollback(struct task_struct *task,
struct pt_regs *regs)
{
@@ -90,6 +97,18 @@ static inline void syscall_get_arguments(struct task_struct *task,
args[5] = regs->bp;
}
+static inline void syscall_set_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ const unsigned long *args)
+{
+ regs->bx = args[0];
+ regs->cx = args[1];
+ regs->dx = args[2];
+ regs->si = args[3];
+ regs->di = args[4];
+ regs->bp = args[5];
+}
+
static inline int syscall_get_arch(struct task_struct *task)
{
return AUDIT_ARCH_I386;
@@ -121,6 +140,30 @@ static inline void syscall_get_arguments(struct task_struct *task,
}
}
+static inline void syscall_set_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ const unsigned long *args)
+{
+# ifdef CONFIG_IA32_EMULATION
+ if (task->thread_info.status & TS_COMPAT) {
+ regs->bx = *args++;
+ regs->cx = *args++;
+ regs->dx = *args++;
+ regs->si = *args++;
+ regs->di = *args++;
+ regs->bp = *args;
+ } else
+# endif
+ {
+ regs->di = *args++;
+ regs->si = *args++;
+ regs->dx = *args++;
+ regs->r10 = *args++;
+ regs->r8 = *args++;
+ regs->r9 = *args;
+ }
+}
+
static inline int syscall_get_arch(struct task_struct *task)
{
/* x32 tasks should be considered AUDIT_ARCH_X86_64. */
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 4a1922ec80cf..7ddef3a69866 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -5,6 +5,7 @@
#include <linux/init.h>
#include <linux/bits.h>
+#include <linux/mmzone.h>
#include <asm/errno.h>
#include <asm/ptrace.h>
@@ -18,6 +19,7 @@
* TDX module.
*/
#define TDX_ERROR _BITUL(63)
+#define TDX_NON_RECOVERABLE _BITUL(62)
#define TDX_SW_ERROR (TDX_ERROR | GENMASK_ULL(47, 40))
#define TDX_SEAMCALL_VMFAILINVALID (TDX_SW_ERROR | _UL(0xFFFF0000))
@@ -33,6 +35,8 @@
#ifndef __ASSEMBLER__
#include <uapi/asm/mce.h>
+#include <asm/tdx_global_metadata.h>
+#include <linux/pgtable.h>
/*
* Used by the #VE exception handler to gather the #VE exception
@@ -64,6 +68,8 @@ bool tdx_early_handle_ve(struct pt_regs *regs);
int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport);
+int tdx_mcall_extend_rtmr(u8 index, u8 *data);
+
u64 tdx_hcall_get_quote(u8 *buf, size_t size);
void __init tdx_dump_attributes(u64 td_attr);
@@ -100,7 +106,7 @@ void tdx_init(void);
typedef u64 (*sc_func_t)(u64 fn, struct tdx_module_args *args);
-static inline u64 sc_retry(sc_func_t func, u64 fn,
+static __always_inline u64 sc_retry(sc_func_t func, u64 fn,
struct tdx_module_args *args)
{
int retry = RDRAND_RETRY_LOOPS;
@@ -119,11 +125,82 @@ static inline u64 sc_retry(sc_func_t func, u64 fn,
int tdx_cpu_enable(void);
int tdx_enable(void);
const char *tdx_dump_mce_info(struct mce *m);
+const struct tdx_sys_info *tdx_get_sysinfo(void);
+
+int tdx_guest_keyid_alloc(void);
+u32 tdx_get_nr_guest_keyids(void);
+void tdx_guest_keyid_free(unsigned int keyid);
+
+struct tdx_td {
+ /* TD root structure: */
+ struct page *tdr_page;
+
+ int tdcs_nr_pages;
+ /* TD control structure: */
+ struct page **tdcs_pages;
+
+ /* Size of `tdcx_pages` in struct tdx_vp */
+ int tdcx_nr_pages;
+};
+
+struct tdx_vp {
+ /* TDVP root page */
+ struct page *tdvpr_page;
+
+ /* TD vCPU control structure: */
+ struct page **tdcx_pages;
+};
+
+static inline u64 mk_keyed_paddr(u16 hkid, struct page *page)
+{
+ u64 ret;
+
+ ret = page_to_phys(page);
+ /* KeyID bits are just above the physical address bits: */
+ ret |= (u64)hkid << boot_cpu_data.x86_phys_bits;
+
+ return ret;
+}
+
+static inline int pg_level_to_tdx_sept_level(enum pg_level level)
+{
+ WARN_ON_ONCE(level == PG_LEVEL_NONE);
+ return level - 1;
+}
+
+u64 tdh_vp_enter(struct tdx_vp *vp, struct tdx_module_args *args);
+u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page);
+u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2);
+u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2);
+u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page);
+u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2);
+u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u64 *ext_err2);
+u64 tdh_mng_key_config(struct tdx_td *td);
+u64 tdh_mng_create(struct tdx_td *td, u16 hkid);
+u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp);
+u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data);
+u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2);
+u64 tdh_mr_finalize(struct tdx_td *td);
+u64 tdh_vp_flush(struct tdx_vp *vp);
+u64 tdh_mng_vpflushdone(struct tdx_td *td);
+u64 tdh_mng_key_freeid(struct tdx_td *td);
+u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err);
+u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid);
+u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data);
+u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask);
+u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner, u64 *tdx_size);
+u64 tdh_mem_track(struct tdx_td *tdr);
+u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u64 *ext_err2);
+u64 tdh_phymem_cache_wb(bool resume);
+u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td);
+u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page);
#else
static inline void tdx_init(void) { }
static inline int tdx_cpu_enable(void) { return -ENODEV; }
static inline int tdx_enable(void) { return -ENODEV; }
+static inline u32 tdx_get_nr_guest_keyids(void) { return 0; }
static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; }
+static inline const struct tdx_sys_info *tdx_get_sysinfo(void) { return NULL; }
#endif /* CONFIG_INTEL_TDX_HOST */
#endif /* !__ASSEMBLER__ */
diff --git a/arch/x86/virt/vmx/tdx/tdx_global_metadata.h b/arch/x86/include/asm/tdx_global_metadata.h
index 6dd3c9695f59..060a2ad744bf 100644
--- a/arch/x86/virt/vmx/tdx/tdx_global_metadata.h
+++ b/arch/x86/include/asm/tdx_global_metadata.h
@@ -17,9 +17,28 @@ struct tdx_sys_info_tdmr {
u16 pamt_1g_entry_size;
};
+struct tdx_sys_info_td_ctrl {
+ u16 tdr_base_size;
+ u16 tdcs_base_size;
+ u16 tdvps_base_size;
+};
+
+struct tdx_sys_info_td_conf {
+ u64 attributes_fixed0;
+ u64 attributes_fixed1;
+ u64 xfam_fixed0;
+ u64 xfam_fixed1;
+ u16 num_cpuid_config;
+ u16 max_vcpus_per_td;
+ u64 cpuid_config_leaves[128];
+ u64 cpuid_config_values[128][2];
+};
+
struct tdx_sys_info {
struct tdx_sys_info_features features;
struct tdx_sys_info_tdmr tdmr;
+ struct tdx_sys_info_td_ctrl td_ctrl;
+ struct tdx_sys_info_td_conf td_conf;
};
#endif
diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h
index 0454d5e60e5d..721b408d9a67 100644
--- a/arch/x86/include/asm/trace/fpu.h
+++ b/arch/x86/include/asm/trace/fpu.h
@@ -44,16 +44,6 @@ DEFINE_EVENT(x86_fpu, x86_fpu_after_save,
TP_ARGS(fpu)
);
-DEFINE_EVENT(x86_fpu, x86_fpu_before_restore,
- TP_PROTO(struct fpu *fpu),
- TP_ARGS(fpu)
-);
-
-DEFINE_EVENT(x86_fpu, x86_fpu_after_restore,
- TP_PROTO(struct fpu *fpu),
- TP_ARGS(fpu)
-);
-
DEFINE_EVENT(x86_fpu, x86_fpu_regs_activated,
TP_PROTO(struct fpu *fpu),
TP_ARGS(fpu)
@@ -64,11 +54,6 @@ DEFINE_EVENT(x86_fpu, x86_fpu_regs_deactivated,
TP_ARGS(fpu)
);
-DEFINE_EVENT(x86_fpu, x86_fpu_init_state,
- TP_PROTO(struct fpu *fpu),
- TP_ARGS(fpu)
-);
-
DEFINE_EVENT(x86_fpu, x86_fpu_dropped,
TP_PROTO(struct fpu *fpu),
TP_ARGS(fpu)
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 8707361b24da..cca7d6641287 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -256,6 +256,7 @@ enum vmcs_field {
TSC_MULTIPLIER_HIGH = 0x00002033,
TERTIARY_VM_EXEC_CONTROL = 0x00002034,
TERTIARY_VM_EXEC_CONTROL_HIGH = 0x00002035,
+ SHARED_EPT_POINTER = 0x0000203C,
PID_POINTER_TABLE = 0x00002042,
PID_POINTER_TABLE_HIGH = 0x00002043,
GUEST_PHYSICAL_ADDRESS = 0x00002400,
@@ -586,6 +587,7 @@ enum vm_entry_failure_code {
#define EPT_VIOLATION_PROT_READ BIT(3)
#define EPT_VIOLATION_PROT_WRITE BIT(4)
#define EPT_VIOLATION_PROT_EXEC BIT(5)
+#define EPT_VIOLATION_EXEC_FOR_RING3_LIN BIT(6)
#define EPT_VIOLATION_PROT_MASK (EPT_VIOLATION_PROT_READ | \
EPT_VIOLATION_PROT_WRITE | \
EPT_VIOLATION_PROT_EXEC)
diff --git a/arch/x86/include/uapi/asm/debugreg.h b/arch/x86/include/uapi/asm/debugreg.h
index 0007ba077c0c..41da492dfb01 100644
--- a/arch/x86/include/uapi/asm/debugreg.h
+++ b/arch/x86/include/uapi/asm/debugreg.h
@@ -15,7 +15,26 @@
which debugging register was responsible for the trap. The other bits
are either reserved or not of interest to us. */
-/* Define reserved bits in DR6 which are always set to 1 */
+/*
+ * Define bits in DR6 which are set to 1 by default.
+ *
+ * This is also the DR6 architectural value following Power-up, Reset or INIT.
+ *
+ * Note, with the introduction of Bus Lock Detection (BLD) and Restricted
+ * Transactional Memory (RTM), the DR6 register has been modified:
+ *
+ * 1) BLD flag (bit 11) is no longer reserved to 1 if the CPU supports
+ * Bus Lock Detection. The assertion of a bus lock could clear it.
+ *
+ * 2) RTM flag (bit 16) is no longer reserved to 1 if the CPU supports
+ * restricted transactional memory. #DB occurred inside an RTM region
+ * could clear it.
+ *
+ * Apparently, DR6.BLD and DR6.RTM are active low bits.
+ *
+ * As a result, DR6_RESERVED is an incorrect name now, but it is kept for
+ * compatibility.
+ */
#define DR6_RESERVED (0xFFFF0FF0)
#define DR_TRAP0 (0x1) /* db0 */
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 460306b35a4b..0f15d683817d 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -441,6 +441,7 @@ struct kvm_sync_regs {
#define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6)
#define KVM_X86_QUIRK_SLOT_ZAP_ALL (1 << 7)
#define KVM_X86_QUIRK_STUFF_FEATURE_MSRS (1 << 8)
+#define KVM_X86_QUIRK_IGNORE_GUEST_PAT (1 << 9)
#define KVM_STATE_NESTED_FORMAT_VMX 0
#define KVM_STATE_NESTED_FORMAT_SVM 1
@@ -844,6 +845,7 @@ struct kvm_sev_snp_launch_start {
};
/* Kept in sync with firmware values for simplicity. */
+#define KVM_SEV_PAGE_TYPE_INVALID 0x0
#define KVM_SEV_SNP_PAGE_TYPE_NORMAL 0x1
#define KVM_SEV_SNP_PAGE_TYPE_ZERO 0x3
#define KVM_SEV_SNP_PAGE_TYPE_UNMEASURED 0x4
@@ -930,4 +932,80 @@ struct kvm_hyperv_eventfd {
#define KVM_X86_SNP_VM 4
#define KVM_X86_TDX_VM 5
+/* Trust Domain eXtension sub-ioctl() commands. */
+enum kvm_tdx_cmd_id {
+ KVM_TDX_CAPABILITIES = 0,
+ KVM_TDX_INIT_VM,
+ KVM_TDX_INIT_VCPU,
+ KVM_TDX_INIT_MEM_REGION,
+ KVM_TDX_FINALIZE_VM,
+ KVM_TDX_GET_CPUID,
+
+ KVM_TDX_CMD_NR_MAX,
+};
+
+struct kvm_tdx_cmd {
+ /* enum kvm_tdx_cmd_id */
+ __u32 id;
+ /* flags for sub-commend. If sub-command doesn't use this, set zero. */
+ __u32 flags;
+ /*
+ * data for each sub-command. An immediate or a pointer to the actual
+ * data in process virtual address. If sub-command doesn't use it,
+ * set zero.
+ */
+ __u64 data;
+ /*
+ * Auxiliary error code. The sub-command may return TDX SEAMCALL
+ * status code in addition to -Exxx.
+ */
+ __u64 hw_error;
+};
+
+struct kvm_tdx_capabilities {
+ __u64 supported_attrs;
+ __u64 supported_xfam;
+
+ __u64 kernel_tdvmcallinfo_1_r11;
+ __u64 user_tdvmcallinfo_1_r11;
+ __u64 kernel_tdvmcallinfo_1_r12;
+ __u64 user_tdvmcallinfo_1_r12;
+
+ __u64 reserved[250];
+
+ /* Configurable CPUID bits for userspace */
+ struct kvm_cpuid2 cpuid;
+};
+
+struct kvm_tdx_init_vm {
+ __u64 attributes;
+ __u64 xfam;
+ __u64 mrconfigid[6]; /* sha384 digest */
+ __u64 mrowner[6]; /* sha384 digest */
+ __u64 mrownerconfig[6]; /* sha384 digest */
+
+ /* The total space for TD_PARAMS before the CPUIDs is 256 bytes */
+ __u64 reserved[12];
+
+ /*
+ * Call KVM_TDX_INIT_VM before vcpu creation, thus before
+ * KVM_SET_CPUID2.
+ * This configuration supersedes KVM_SET_CPUID2s for VCPUs because the
+ * TDX module directly virtualizes those CPUIDs without VMM. The user
+ * space VMM, e.g. qemu, should make KVM_SET_CPUID2 consistent with
+ * those values. If it doesn't, KVM may have wrong idea of vCPUIDs of
+ * the guest, and KVM may wrongly emulate CPUIDs or MSRs that the TDX
+ * module doesn't virtualize.
+ */
+ struct kvm_cpuid2 cpuid;
+};
+
+#define KVM_TDX_MEASURE_MEMORY_REGION _BITULL(0)
+
+struct kvm_tdx_init_mem_region {
+ __u64 source_addr;
+ __u64 gpa;
+ __u64 nr_pages;
+};
+
#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/uapi/asm/setup_data.h b/arch/x86/include/uapi/asm/setup_data.h
index 50c45ead4e7c..2671c4e1b3a0 100644
--- a/arch/x86/include/uapi/asm/setup_data.h
+++ b/arch/x86/include/uapi/asm/setup_data.h
@@ -13,7 +13,8 @@
#define SETUP_CC_BLOB 7
#define SETUP_IMA 8
#define SETUP_RNG_SEED 9
-#define SETUP_ENUM_MAX SETUP_RNG_SEED
+#define SETUP_KEXEC_KHO 10
+#define SETUP_ENUM_MAX SETUP_KEXEC_KHO
#define SETUP_INDIRECT (1<<31)
#define SETUP_TYPE_MAX (SETUP_ENUM_MAX | SETUP_INDIRECT)
@@ -78,6 +79,16 @@ struct ima_setup_data {
__u64 size;
} __attribute__((packed));
+/*
+ * Locations of kexec handover metadata
+ */
+struct kho_data {
+ __u64 fdt_addr;
+ __u64 fdt_size;
+ __u64 scratch_addr;
+ __u64 scratch_size;
+} __attribute__((packed));
+
#endif /* __ASSEMBLER__ */
#endif /* _UAPI_ASM_X86_SETUP_DATA_H */
diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
index ec1321248dac..9c640a521a67 100644
--- a/arch/x86/include/uapi/asm/svm.h
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -95,6 +95,7 @@
#define SVM_EXIT_CR14_WRITE_TRAP 0x09e
#define SVM_EXIT_CR15_WRITE_TRAP 0x09f
#define SVM_EXIT_INVPCID 0x0a2
+#define SVM_EXIT_BUS_LOCK 0x0a5
#define SVM_EXIT_IDLE_HLT 0x0a6
#define SVM_EXIT_NPF 0x400
#define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401
@@ -225,6 +226,7 @@
{ SVM_EXIT_CR4_WRITE_TRAP, "write_cr4_trap" }, \
{ SVM_EXIT_CR8_WRITE_TRAP, "write_cr8_trap" }, \
{ SVM_EXIT_INVPCID, "invpcid" }, \
+ { SVM_EXIT_BUS_LOCK, "buslock" }, \
{ SVM_EXIT_IDLE_HLT, "idle-halt" }, \
{ SVM_EXIT_NPF, "npf" }, \
{ SVM_EXIT_AVIC_INCOMPLETE_IPI, "avic_incomplete_ipi" }, \
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index a5faf6d88f1b..f0f4a4cf84a7 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -34,6 +34,7 @@
#define EXIT_REASON_TRIPLE_FAULT 2
#define EXIT_REASON_INIT_SIGNAL 3
#define EXIT_REASON_SIPI_SIGNAL 4
+#define EXIT_REASON_OTHER_SMI 6
#define EXIT_REASON_INTERRUPT_WINDOW 7
#define EXIT_REASON_NMI_WINDOW 8
@@ -92,6 +93,7 @@
#define EXIT_REASON_TPAUSE 68
#define EXIT_REASON_BUS_LOCK 74
#define EXIT_REASON_NOTIFY 75
+#define EXIT_REASON_TDCALL 77
#define VMX_EXIT_REASONS \
{ EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
@@ -155,7 +157,8 @@
{ EXIT_REASON_UMWAIT, "UMWAIT" }, \
{ EXIT_REASON_TPAUSE, "TPAUSE" }, \
{ EXIT_REASON_BUS_LOCK, "BUS_LOCK" }, \
- { EXIT_REASON_NOTIFY, "NOTIFY" }
+ { EXIT_REASON_NOTIFY, "NOTIFY" }, \
+ { EXIT_REASON_TDCALL, "TDCALL" }
#define VMX_EXIT_REASON_FLAGS \
{ VMX_EXIT_REASONS_FAILED_VMENTRY, "FAILED_VMENTRY" }
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 99a783fd4691..0d2a6d953be9 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -3,7 +3,7 @@
# Makefile for the linux kernel.
#
-extra-y += vmlinux.lds
+always-$(KBUILD_BUILTIN) += vmlinux.lds
CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
diff --git a/arch/x86/kernel/acpi/madt_wakeup.c b/arch/x86/kernel/acpi/madt_wakeup.c
index f36f28405dcc..6d7603511f52 100644
--- a/arch/x86/kernel/acpi/madt_wakeup.c
+++ b/arch/x86/kernel/acpi/madt_wakeup.c
@@ -126,7 +126,7 @@ static int __init acpi_mp_setup_reset(u64 reset_vector)
return 0;
}
-static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip)
+static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip, unsigned int cpu)
{
if (!acpi_mp_wake_mailbox_paddr) {
pr_warn_once("No MADT mailbox: cannot bringup secondary CPUs. Booting with kexec?\n");
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index ecfe7b497cad..ea1d984166cd 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -116,6 +116,24 @@ static struct module *its_mod;
#endif
static void *its_page;
static unsigned int its_offset;
+struct its_array its_pages;
+
+static void *__its_alloc(struct its_array *pages)
+{
+ void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE);
+ if (!page)
+ return NULL;
+
+ void *tmp = krealloc(pages->pages, (pages->num+1) * sizeof(void *),
+ GFP_KERNEL);
+ if (!tmp)
+ return NULL;
+
+ pages->pages = tmp;
+ pages->pages[pages->num++] = page;
+
+ return no_free_ptr(page);
+}
/* Initialize a thunk with the "jmp *reg; int3" instructions. */
static void *its_init_thunk(void *thunk, int reg)
@@ -151,6 +169,21 @@ static void *its_init_thunk(void *thunk, int reg)
return thunk + offset;
}
+static void its_pages_protect(struct its_array *pages)
+{
+ for (int i = 0; i < pages->num; i++) {
+ void *page = pages->pages[i];
+ execmem_restore_rox(page, PAGE_SIZE);
+ }
+}
+
+static void its_fini_core(void)
+{
+ if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX))
+ its_pages_protect(&its_pages);
+ kfree(its_pages.pages);
+}
+
#ifdef CONFIG_MODULES
void its_init_mod(struct module *mod)
{
@@ -173,10 +206,8 @@ void its_fini_mod(struct module *mod)
its_page = NULL;
mutex_unlock(&text_mutex);
- for (int i = 0; i < mod->its_num_pages; i++) {
- void *page = mod->its_page_array[i];
- execmem_restore_rox(page, PAGE_SIZE);
- }
+ if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+ its_pages_protect(&mod->arch.its_pages);
}
void its_free_mod(struct module *mod)
@@ -184,37 +215,33 @@ void its_free_mod(struct module *mod)
if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
return;
- for (int i = 0; i < mod->its_num_pages; i++) {
- void *page = mod->its_page_array[i];
+ for (int i = 0; i < mod->arch.its_pages.num; i++) {
+ void *page = mod->arch.its_pages.pages[i];
execmem_free(page);
}
- kfree(mod->its_page_array);
+ kfree(mod->arch.its_pages.pages);
}
#endif /* CONFIG_MODULES */
static void *its_alloc(void)
{
- void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE);
-
- if (!page)
- return NULL;
+ struct its_array *pages = &its_pages;
+ void *page;
#ifdef CONFIG_MODULES
- if (its_mod) {
- void *tmp = krealloc(its_mod->its_page_array,
- (its_mod->its_num_pages+1) * sizeof(void *),
- GFP_KERNEL);
- if (!tmp)
- return NULL;
+ if (its_mod)
+ pages = &its_mod->arch.its_pages;
+#endif
- its_mod->its_page_array = tmp;
- its_mod->its_page_array[its_mod->its_num_pages++] = page;
+ page = __its_alloc(pages);
+ if (!page)
+ return NULL;
- execmem_make_temp_rw(page, PAGE_SIZE);
- }
-#endif /* CONFIG_MODULES */
+ execmem_make_temp_rw(page, PAGE_SIZE);
+ if (pages == &its_pages)
+ set_memory_x((unsigned long)page, 1);
- return no_free_ptr(page);
+ return page;
}
static void *its_allocate_thunk(int reg)
@@ -268,7 +295,9 @@ u8 *its_static_thunk(int reg)
return thunk;
}
-#endif
+#else
+static inline void its_fini_core(void) {}
+#endif /* CONFIG_MITIGATION_ITS */
/*
* Nomenclature for variable names to simplify and clarify this code and ease
@@ -2338,6 +2367,8 @@ void __init alternative_instructions(void)
apply_retpolines(__retpoline_sites, __retpoline_sites_end);
apply_returns(__return_sites, __return_sites_end);
+ its_fini_core();
+
/*
* Adjust all CALL instructions to point to func()-10, including
* those in .altinstr_replacement.
@@ -3107,6 +3138,6 @@ void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, c
*/
void __ref smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate)
{
- __smp_text_poke_batch_add(addr, opcode, len, emulate);
+ smp_text_poke_batch_add(addr, opcode, len, emulate);
smp_text_poke_batch_finish();
}
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index b5bb7a2e8340..58abb941c45b 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -27,7 +27,13 @@ static void noop_send_IPI_allbutself(int vector) { }
static void noop_send_IPI_all(int vector) { }
static void noop_send_IPI_self(int vector) { }
static void noop_apic_icr_write(u32 low, u32 id) { }
-static int noop_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip) { return -1; }
+
+static int noop_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip,
+ unsigned int cpu)
+{
+ return -1;
+}
+
static u64 noop_apic_icr_read(void) { return 0; }
static u32 noop_get_apic_id(u32 apicid) { return 0; }
static void noop_apic_eoi(void) { }
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index e272bc7fdc8e..5c5be2d58242 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -57,7 +57,7 @@ static void numachip2_apic_icr_write(int apicid, unsigned int val)
numachip2_write32_lcsr(NUMACHIP2_APIC_ICR, (apicid << 12) | val);
}
-static int numachip_wakeup_secondary(u32 phys_apicid, unsigned long start_rip)
+static int numachip_wakeup_secondary(u32 phys_apicid, unsigned long start_rip, unsigned int cpu)
{
numachip_apic_icr_write(phys_apicid, APIC_DM_INIT);
numachip_apic_icr_write(phys_apicid, APIC_DM_STARTUP |
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index ba5a4ccda37a..5ba2feb2c04c 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2225,7 +2225,7 @@ static int mp_irqdomain_create(int ioapic)
/* Handle device tree enumerated APICs proper */
if (cfg->dev) {
- fn = of_node_to_fwnode(cfg->dev);
+ fn = of_fwnode_handle(cfg->dev);
} else {
fn = irq_domain_alloc_named_id_fwnode("IO-APIC", mpc_ioapic_id(ioapic));
if (!fn)
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index fee42a73d64a..a947b46a8b64 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -183,6 +183,7 @@ setnew:
apicd->cpu = newcpu;
BUG_ON(!IS_ERR_OR_NULL(per_cpu(vector_irq, newcpu)[newvec]));
per_cpu(vector_irq, newcpu)[newvec] = desc;
+ apic_update_irq_cfg(irqd, newvec, newcpu);
}
static void vector_assign_managed_shutdown(struct irq_data *irqd)
@@ -261,7 +262,6 @@ assign_vector_locked(struct irq_data *irqd, const struct cpumask *dest)
if (vector < 0)
return vector;
apic_update_vector(irqd, vector, cpu);
- apic_update_irq_cfg(irqd, vector, cpu);
return 0;
}
@@ -338,7 +338,7 @@ assign_managed_vector(struct irq_data *irqd, const struct cpumask *dest)
if (vector < 0)
return vector;
apic_update_vector(irqd, vector, cpu);
- apic_update_irq_cfg(irqd, vector, cpu);
+
return 0;
}
@@ -864,7 +864,7 @@ void lapic_offline(void)
__vector_cleanup(cl, false);
irq_matrix_offline(vector_matrix);
- WARN_ON_ONCE(try_to_del_timer_sync(&cl->timer) < 0);
+ WARN_ON_ONCE(timer_delete_sync_try(&cl->timer) < 0);
WARN_ON_ONCE(!hlist_empty(&cl->head));
unlock_vector_lock();
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 7fef504ca508..15209f220e1f 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -667,7 +667,7 @@ static __init void build_uv_gr_table(void)
}
}
-static int uv_wakeup_secondary(u32 phys_apicid, unsigned long start_rip)
+static int uv_wakeup_secondary(u32 phys_apicid, unsigned long start_rip, unsigned int cpu)
{
unsigned long val;
int pnode;
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 93da466dfe2c..a5ece6ebe8a7 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -9,7 +9,7 @@
#include <linux/sched/clock.h>
#include <linux/random.h>
#include <linux/topology.h>
-#include <asm/amd/fch.h>
+#include <linux/platform_data/x86/amd-fch.h>
#include <asm/processor.h>
#include <asm/apic.h>
#include <asm/cacheinfo.h>
@@ -31,7 +31,7 @@
#include "cpu.h"
-u16 invlpgb_count_max __ro_after_init;
+u16 invlpgb_count_max __ro_after_init = 1;
static inline int rdmsrq_amd_safe(unsigned msr, u64 *p)
{
@@ -377,6 +377,47 @@ static void bsp_determine_snp(struct cpuinfo_x86 *c)
#endif
}
+#define ZEN_MODEL_STEP_UCODE(fam, model, step, ucode) \
+ X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, fam, model), \
+ step, step, ucode)
+
+static const struct x86_cpu_id amd_tsa_microcode[] = {
+ ZEN_MODEL_STEP_UCODE(0x19, 0x01, 0x1, 0x0a0011d7),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x01, 0x2, 0x0a00123b),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x08, 0x2, 0x0a00820d),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x11, 0x1, 0x0a10114c),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x11, 0x2, 0x0a10124c),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x18, 0x1, 0x0a108109),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x21, 0x0, 0x0a20102e),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x21, 0x2, 0x0a201211),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x44, 0x1, 0x0a404108),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x50, 0x0, 0x0a500012),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x61, 0x2, 0x0a60120a),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x74, 0x1, 0x0a704108),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x75, 0x2, 0x0a705208),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x78, 0x0, 0x0a708008),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x7c, 0x0, 0x0a70c008),
+ ZEN_MODEL_STEP_UCODE(0x19, 0xa0, 0x2, 0x0aa00216),
+ {},
+};
+
+static void tsa_init(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_HYPERVISOR))
+ return;
+
+ if (cpu_has(c, X86_FEATURE_ZEN3) ||
+ cpu_has(c, X86_FEATURE_ZEN4)) {
+ if (x86_match_min_microcode_rev(amd_tsa_microcode))
+ setup_force_cpu_cap(X86_FEATURE_VERW_CLEAR);
+ else
+ pr_debug("%s: current revision: 0x%x\n", __func__, c->microcode);
+ } else {
+ setup_force_cpu_cap(X86_FEATURE_TSA_SQ_NO);
+ setup_force_cpu_cap(X86_FEATURE_TSA_L1_NO);
+ }
+}
+
static void bsp_init_amd(struct cpuinfo_x86 *c)
{
if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
@@ -489,6 +530,11 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
}
bsp_determine_snp(c);
+ tsa_init(c);
+
+ if (cpu_has(c, X86_FEATURE_GP_ON_USER_CPUID))
+ setup_force_cpu_cap(X86_FEATURE_CPUID_FAULT);
+
return;
warn:
@@ -930,6 +976,16 @@ static void init_amd_zen2(struct cpuinfo_x86 *c)
init_spectral_chicken(c);
fix_erratum_1386(c);
zen2_zenbleed_check(c);
+
+ /* Disable RDSEED on AMD Cyan Skillfish because of an error. */
+ if (c->x86_model == 0x47 && c->x86_stepping == 0x0) {
+ clear_cpu_cap(c, X86_FEATURE_RDSEED);
+ msr_clear_bit(MSR_AMD64_CPUID_FN_7, 18);
+ pr_emerg("RDSEED is not reliable on this platform; disabling.\n");
+ }
+
+ /* Correct misconfigured CPUID on some clients. */
+ clear_cpu_cap(c, X86_FEATURE_INVLPGB);
}
static void init_amd_zen3(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 7f94e6a5497d..b74bf937cd9f 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -94,6 +94,8 @@ static void __init bhi_apply_mitigation(void);
static void __init its_select_mitigation(void);
static void __init its_update_mitigation(void);
static void __init its_apply_mitigation(void);
+static void __init tsa_select_mitigation(void);
+static void __init tsa_apply_mitigation(void);
/* The base value of the SPEC_CTRL MSR without task-specific bits set */
u64 x86_spec_ctrl_base;
@@ -113,10 +115,9 @@ void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk;
static void __init set_return_thunk(void *thunk)
{
- if (x86_return_thunk != __x86_return_thunk)
- pr_warn("x86/bugs: return thunk changed\n");
-
x86_return_thunk = thunk;
+
+ pr_info("active return thunk: %ps\n", thunk);
}
/* Update SPEC_CTRL MSR and its cached copy unconditionally */
@@ -169,9 +170,9 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
DEFINE_STATIC_KEY_FALSE(switch_vcpu_ibpb);
EXPORT_SYMBOL_GPL(switch_vcpu_ibpb);
-/* Control MDS CPU buffer clear before idling (halt, mwait) */
-DEFINE_STATIC_KEY_FALSE(mds_idle_clear);
-EXPORT_SYMBOL_GPL(mds_idle_clear);
+/* Control CPU buffer clear before idling (halt, mwait) */
+DEFINE_STATIC_KEY_FALSE(cpu_buf_idle_clear);
+EXPORT_SYMBOL_GPL(cpu_buf_idle_clear);
/*
* Controls whether l1d flush based mitigations are enabled,
@@ -188,6 +189,39 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush);
DEFINE_STATIC_KEY_FALSE(cpu_buf_vm_clear);
EXPORT_SYMBOL_GPL(cpu_buf_vm_clear);
+#undef pr_fmt
+#define pr_fmt(fmt) "mitigations: " fmt
+
+static void __init cpu_print_attack_vectors(void)
+{
+ pr_info("Enabled attack vectors: ");
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL))
+ pr_cont("user_kernel, ");
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER))
+ pr_cont("user_user, ");
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST))
+ pr_cont("guest_host, ");
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST))
+ pr_cont("guest_guest, ");
+
+ pr_cont("SMT mitigations: ");
+
+ switch (smt_mitigations) {
+ case SMT_MITIGATIONS_OFF:
+ pr_cont("off\n");
+ break;
+ case SMT_MITIGATIONS_AUTO:
+ pr_cont("auto\n");
+ break;
+ case SMT_MITIGATIONS_ON:
+ pr_cont("on\n");
+ }
+}
+
void __init cpu_select_mitigations(void)
{
/*
@@ -208,6 +242,8 @@ void __init cpu_select_mitigations(void)
x86_arch_cap_msr = x86_read_arch_cap_msr();
+ cpu_print_attack_vectors();
+
/* Select the proper CPU mitigations before patching alternatives: */
spectre_v1_select_mitigation();
spectre_v2_select_mitigation();
@@ -225,6 +261,7 @@ void __init cpu_select_mitigations(void)
gds_select_mitigation();
its_select_mitigation();
bhi_select_mitigation();
+ tsa_select_mitigation();
/*
* After mitigations are selected, some may need to update their
@@ -272,6 +309,7 @@ void __init cpu_select_mitigations(void)
gds_apply_mitigation();
its_apply_mitigation();
bhi_apply_mitigation();
+ tsa_apply_mitigation();
}
/*
@@ -329,6 +367,62 @@ static void x86_amd_ssb_disable(void)
#undef pr_fmt
#define pr_fmt(fmt) "MDS: " fmt
+/*
+ * Returns true if vulnerability should be mitigated based on the
+ * selected attack vector controls.
+ *
+ * See Documentation/admin-guide/hw-vuln/attack_vector_controls.rst
+ */
+static bool __init should_mitigate_vuln(unsigned int bug)
+{
+ switch (bug) {
+ /*
+ * The only runtime-selected spectre_v1 mitigations in the kernel are
+ * related to SWAPGS protection on kernel entry. Therefore, protection
+ * is only required for the user->kernel attack vector.
+ */
+ case X86_BUG_SPECTRE_V1:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL);
+
+ case X86_BUG_SPECTRE_V2:
+ case X86_BUG_RETBLEED:
+ case X86_BUG_SRSO:
+ case X86_BUG_L1TF:
+ case X86_BUG_ITS:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST);
+
+ case X86_BUG_SPECTRE_V2_USER:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST);
+
+ /*
+ * All the vulnerabilities below allow potentially leaking data
+ * across address spaces. Therefore, mitigation is required for
+ * any of these 4 attack vectors.
+ */
+ case X86_BUG_MDS:
+ case X86_BUG_TAA:
+ case X86_BUG_MMIO_STALE_DATA:
+ case X86_BUG_RFDS:
+ case X86_BUG_SRBDS:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST);
+
+ case X86_BUG_GDS:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST) ||
+ (smt_mitigations != SMT_MITIGATIONS_OFF);
+ default:
+ WARN(1, "Unknown bug %x\n", bug);
+ return false;
+ }
+}
+
/* Default mitigation for MDS-affected CPUs */
static enum mds_mitigations mds_mitigation __ro_after_init =
IS_ENABLED(CONFIG_MITIGATION_MDS) ? MDS_MITIGATION_AUTO : MDS_MITIGATION_OFF;
@@ -382,13 +476,17 @@ static bool verw_clear_cpu_buf_mitigation_selected __ro_after_init;
static void __init mds_select_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) {
+ if (!boot_cpu_has_bug(X86_BUG_MDS)) {
mds_mitigation = MDS_MITIGATION_OFF;
return;
}
- if (mds_mitigation == MDS_MITIGATION_AUTO)
- mds_mitigation = MDS_MITIGATION_FULL;
+ if (mds_mitigation == MDS_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_MDS))
+ mds_mitigation = MDS_MITIGATION_FULL;
+ else
+ mds_mitigation = MDS_MITIGATION_OFF;
+ }
if (mds_mitigation == MDS_MITIGATION_OFF)
return;
@@ -398,7 +496,7 @@ static void __init mds_select_mitigation(void)
static void __init mds_update_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off())
+ if (!boot_cpu_has_bug(X86_BUG_MDS))
return;
/* If TAA, MMIO, or RFDS are being mitigated, MDS gets mitigated too. */
@@ -419,7 +517,7 @@ static void __init mds_apply_mitigation(void)
mds_mitigation == MDS_MITIGATION_VMWERV) {
setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) &&
- (mds_nosmt || cpu_mitigations_auto_nosmt()))
+ (mds_nosmt || smt_mitigations == SMT_MITIGATIONS_ON))
cpu_smt_disable(false);
}
}
@@ -475,12 +573,13 @@ static void __init taa_select_mitigation(void)
return;
}
- if (cpu_mitigations_off())
- taa_mitigation = TAA_MITIGATION_OFF;
-
/* Microcode will be checked in taa_update_mitigation(). */
- if (taa_mitigation == TAA_MITIGATION_AUTO)
- taa_mitigation = TAA_MITIGATION_VERW;
+ if (taa_mitigation == TAA_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_TAA))
+ taa_mitigation = TAA_MITIGATION_VERW;
+ else
+ taa_mitigation = TAA_MITIGATION_OFF;
+ }
if (taa_mitigation != TAA_MITIGATION_OFF)
verw_clear_cpu_buf_mitigation_selected = true;
@@ -488,7 +587,7 @@ static void __init taa_select_mitigation(void)
static void __init taa_update_mitigation(void)
{
- if (!taa_vulnerable() || cpu_mitigations_off())
+ if (!taa_vulnerable())
return;
if (verw_clear_cpu_buf_mitigation_selected)
@@ -529,7 +628,7 @@ static void __init taa_apply_mitigation(void)
*/
setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
- if (taa_nosmt || cpu_mitigations_auto_nosmt())
+ if (taa_nosmt || smt_mitigations == SMT_MITIGATIONS_ON)
cpu_smt_disable(false);
}
}
@@ -575,8 +674,12 @@ static void __init mmio_select_mitigation(void)
}
/* Microcode will be checked in mmio_update_mitigation(). */
- if (mmio_mitigation == MMIO_MITIGATION_AUTO)
- mmio_mitigation = MMIO_MITIGATION_VERW;
+ if (mmio_mitigation == MMIO_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_MMIO_STALE_DATA))
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+ else
+ mmio_mitigation = MMIO_MITIGATION_OFF;
+ }
if (mmio_mitigation == MMIO_MITIGATION_OFF)
return;
@@ -591,7 +694,7 @@ static void __init mmio_select_mitigation(void)
static void __init mmio_update_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || cpu_mitigations_off())
+ if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))
return;
if (verw_clear_cpu_buf_mitigation_selected)
@@ -637,9 +740,9 @@ static void __init mmio_apply_mitigation(void)
* is required irrespective of SMT state.
*/
if (!(x86_arch_cap_msr & ARCH_CAP_FBSDP_NO))
- static_branch_enable(&mds_idle_clear);
+ static_branch_enable(&cpu_buf_idle_clear);
- if (mmio_nosmt || cpu_mitigations_auto_nosmt())
+ if (mmio_nosmt || smt_mitigations == SMT_MITIGATIONS_ON)
cpu_smt_disable(false);
}
@@ -680,13 +783,17 @@ static inline bool __init verw_clears_cpu_reg_file(void)
static void __init rfds_select_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_RFDS) || cpu_mitigations_off()) {
+ if (!boot_cpu_has_bug(X86_BUG_RFDS)) {
rfds_mitigation = RFDS_MITIGATION_OFF;
return;
}
- if (rfds_mitigation == RFDS_MITIGATION_AUTO)
- rfds_mitigation = RFDS_MITIGATION_VERW;
+ if (rfds_mitigation == RFDS_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_RFDS))
+ rfds_mitigation = RFDS_MITIGATION_VERW;
+ else
+ rfds_mitigation = RFDS_MITIGATION_OFF;
+ }
if (rfds_mitigation == RFDS_MITIGATION_OFF)
return;
@@ -697,7 +804,7 @@ static void __init rfds_select_mitigation(void)
static void __init rfds_update_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_RFDS) || cpu_mitigations_off())
+ if (!boot_cpu_has_bug(X86_BUG_RFDS))
return;
if (verw_clear_cpu_buf_mitigation_selected)
@@ -798,13 +905,19 @@ void update_srbds_msr(void)
static void __init srbds_select_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_SRBDS) || cpu_mitigations_off()) {
+ if (!boot_cpu_has_bug(X86_BUG_SRBDS)) {
srbds_mitigation = SRBDS_MITIGATION_OFF;
return;
}
- if (srbds_mitigation == SRBDS_MITIGATION_AUTO)
- srbds_mitigation = SRBDS_MITIGATION_FULL;
+ if (srbds_mitigation == SRBDS_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_SRBDS))
+ srbds_mitigation = SRBDS_MITIGATION_FULL;
+ else {
+ srbds_mitigation = SRBDS_MITIGATION_OFF;
+ return;
+ }
+ }
/*
* Check to see if this is one of the MDS_NO systems supporting TSX that
@@ -952,12 +1065,15 @@ static void __init gds_select_mitigation(void)
return;
}
- if (cpu_mitigations_off())
- gds_mitigation = GDS_MITIGATION_OFF;
/* Will verify below that mitigation _can_ be disabled */
-
- if (gds_mitigation == GDS_MITIGATION_AUTO)
- gds_mitigation = GDS_MITIGATION_FULL;
+ if (gds_mitigation == GDS_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_GDS))
+ gds_mitigation = GDS_MITIGATION_FULL;
+ else {
+ gds_mitigation = GDS_MITIGATION_OFF;
+ return;
+ }
+ }
/* No microcode */
if (!(x86_arch_cap_msr & ARCH_CAP_GDS_CTRL)) {
@@ -1063,13 +1179,16 @@ static bool smap_works_speculatively(void)
static void __init spectre_v1_select_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off())
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1))
+ spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE;
+
+ if (!should_mitigate_vuln(X86_BUG_SPECTRE_V1))
spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE;
}
static void __init spectre_v1_apply_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off())
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1))
return;
if (spectre_v1_mitigation == SPECTRE_V1_MITIGATION_AUTO) {
@@ -1120,6 +1239,20 @@ early_param("nospectre_v1", nospectre_v1_cmdline);
enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = SPECTRE_V2_NONE;
+/* Depends on spectre_v2 mitigation selected already */
+static inline bool cdt_possible(enum spectre_v2_mitigation mode)
+{
+ if (!IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) ||
+ !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE))
+ return false;
+
+ if (mode == SPECTRE_V2_RETPOLINE ||
+ mode == SPECTRE_V2_EIBRS_RETPOLINE)
+ return true;
+
+ return false;
+}
+
#undef pr_fmt
#define pr_fmt(fmt) "RETBleed: " fmt
@@ -1158,6 +1291,21 @@ static enum retbleed_mitigation retbleed_mitigation __ro_after_init =
static int __ro_after_init retbleed_nosmt = false;
+enum srso_mitigation {
+ SRSO_MITIGATION_NONE,
+ SRSO_MITIGATION_AUTO,
+ SRSO_MITIGATION_UCODE_NEEDED,
+ SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED,
+ SRSO_MITIGATION_MICROCODE,
+ SRSO_MITIGATION_NOSMT,
+ SRSO_MITIGATION_SAFE_RET,
+ SRSO_MITIGATION_IBPB,
+ SRSO_MITIGATION_IBPB_ON_VMEXIT,
+ SRSO_MITIGATION_BP_SPEC_REDUCE,
+};
+
+static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_AUTO;
+
static int __init retbleed_parse_cmdline(char *str)
{
if (!str)
@@ -1200,7 +1348,7 @@ early_param("retbleed", retbleed_parse_cmdline);
static void __init retbleed_select_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) {
+ if (!boot_cpu_has_bug(X86_BUG_RETBLEED)) {
retbleed_mitigation = RETBLEED_MITIGATION_NONE;
return;
}
@@ -1237,6 +1385,11 @@ static void __init retbleed_select_mitigation(void)
if (retbleed_mitigation != RETBLEED_MITIGATION_AUTO)
return;
+ if (!should_mitigate_vuln(X86_BUG_RETBLEED)) {
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
+ return;
+ }
+
/* Intel mitigation selected in retbleed_update_mitigation() */
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) {
@@ -1247,35 +1400,36 @@ static void __init retbleed_select_mitigation(void)
retbleed_mitigation = RETBLEED_MITIGATION_IBPB;
else
retbleed_mitigation = RETBLEED_MITIGATION_NONE;
+ } else if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+ /* Final mitigation depends on spectre-v2 selection */
+ if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED))
+ retbleed_mitigation = RETBLEED_MITIGATION_EIBRS;
+ else if (boot_cpu_has(X86_FEATURE_IBRS))
+ retbleed_mitigation = RETBLEED_MITIGATION_IBRS;
+ else
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
}
}
static void __init retbleed_update_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off())
+ if (!boot_cpu_has_bug(X86_BUG_RETBLEED))
return;
- if (retbleed_mitigation == RETBLEED_MITIGATION_NONE)
- goto out;
+ /* ITS can also enable stuffing */
+ if (its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF)
+ retbleed_mitigation = RETBLEED_MITIGATION_STUFF;
- /*
- * retbleed=stuff is only allowed on Intel. If stuffing can't be used
- * then a different mitigation will be selected below.
- *
- * its=stuff will also attempt to enable stuffing.
- */
- if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF ||
- its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF) {
- if (spectre_v2_enabled != SPECTRE_V2_RETPOLINE) {
- pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n");
- retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
- } else {
- if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF)
- pr_info("Retbleed mitigation updated to stuffing\n");
+ /* If SRSO is using IBPB, that works for retbleed too */
+ if (srso_mitigation == SRSO_MITIGATION_IBPB)
+ retbleed_mitigation = RETBLEED_MITIGATION_IBPB;
- retbleed_mitigation = RETBLEED_MITIGATION_STUFF;
- }
+ if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF &&
+ !cdt_possible(spectre_v2_enabled)) {
+ pr_err("WARNING: retbleed=stuff depends on retpoline\n");
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
}
+
/*
* Let IBRS trump all on Intel without affecting the effects of the
* retbleed= cmdline option except for call depth based stuffing
@@ -1294,15 +1448,11 @@ static void __init retbleed_update_mitigation(void)
if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF)
pr_err(RETBLEED_INTEL_MSG);
}
- /* If nothing has set the mitigation yet, default to NONE. */
- if (retbleed_mitigation == RETBLEED_MITIGATION_AUTO)
- retbleed_mitigation = RETBLEED_MITIGATION_NONE;
}
-out:
+
pr_info("%s\n", retbleed_strings[retbleed_mitigation]);
}
-
static void __init retbleed_apply_mitigation(void)
{
bool mitigate_smt = false;
@@ -1358,7 +1508,7 @@ static void __init retbleed_apply_mitigation(void)
}
if (mitigate_smt && !boot_cpu_has(X86_FEATURE_STIBP) &&
- (retbleed_nosmt || cpu_mitigations_auto_nosmt()))
+ (retbleed_nosmt || smt_mitigations == SMT_MITIGATIONS_ON))
cpu_smt_disable(false);
}
@@ -1403,13 +1553,17 @@ early_param("indirect_target_selection", its_parse_cmdline);
static void __init its_select_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_ITS) || cpu_mitigations_off()) {
+ if (!boot_cpu_has_bug(X86_BUG_ITS)) {
its_mitigation = ITS_MITIGATION_OFF;
return;
}
- if (its_mitigation == ITS_MITIGATION_AUTO)
- its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+ if (its_mitigation == ITS_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_ITS))
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+ else
+ its_mitigation = ITS_MITIGATION_OFF;
+ }
if (its_mitigation == ITS_MITIGATION_OFF)
return;
@@ -1440,15 +1594,17 @@ static void __init its_select_mitigation(void)
static void __init its_update_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_ITS) || cpu_mitigations_off())
+ if (!boot_cpu_has_bug(X86_BUG_ITS))
return;
switch (spectre_v2_enabled) {
case SPECTRE_V2_NONE:
- pr_err("WARNING: Spectre-v2 mitigation is off, disabling ITS\n");
+ if (its_mitigation != ITS_MITIGATION_OFF)
+ pr_err("WARNING: Spectre-v2 mitigation is off, disabling ITS\n");
its_mitigation = ITS_MITIGATION_OFF;
break;
case SPECTRE_V2_RETPOLINE:
+ case SPECTRE_V2_EIBRS_RETPOLINE:
/* Retpoline+CDT mitigates ITS */
if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF)
its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF;
@@ -1462,13 +1618,8 @@ static void __init its_update_mitigation(void)
break;
}
- /*
- * retbleed_update_mitigation() will try to do stuffing if its=stuff.
- * If it can't, such as if spectre_v2!=retpoline, then fall back to
- * aligned thunks.
- */
if (its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF &&
- retbleed_mitigation != RETBLEED_MITIGATION_STUFF)
+ !cdt_possible(spectre_v2_enabled))
its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
pr_info("%s\n", its_strings[its_mitigation]);
@@ -1476,15 +1627,127 @@ static void __init its_update_mitigation(void)
static void __init its_apply_mitigation(void)
{
- /* its=stuff forces retbleed stuffing and is enabled there. */
- if (its_mitigation != ITS_MITIGATION_ALIGNED_THUNKS)
+ switch (its_mitigation) {
+ case ITS_MITIGATION_OFF:
+ case ITS_MITIGATION_AUTO:
+ case ITS_MITIGATION_VMEXIT_ONLY:
+ break;
+ case ITS_MITIGATION_ALIGNED_THUNKS:
+ if (!boot_cpu_has(X86_FEATURE_RETPOLINE))
+ setup_force_cpu_cap(X86_FEATURE_INDIRECT_THUNK_ITS);
+
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ set_return_thunk(its_return_thunk);
+ break;
+ case ITS_MITIGATION_RETPOLINE_STUFF:
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH);
+ set_return_thunk(call_depth_return_thunk);
+ break;
+ }
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Transient Scheduler Attacks: " fmt
+
+enum tsa_mitigations {
+ TSA_MITIGATION_NONE,
+ TSA_MITIGATION_AUTO,
+ TSA_MITIGATION_UCODE_NEEDED,
+ TSA_MITIGATION_USER_KERNEL,
+ TSA_MITIGATION_VM,
+ TSA_MITIGATION_FULL,
+};
+
+static const char * const tsa_strings[] = {
+ [TSA_MITIGATION_NONE] = "Vulnerable",
+ [TSA_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
+ [TSA_MITIGATION_USER_KERNEL] = "Mitigation: Clear CPU buffers: user/kernel boundary",
+ [TSA_MITIGATION_VM] = "Mitigation: Clear CPU buffers: VM",
+ [TSA_MITIGATION_FULL] = "Mitigation: Clear CPU buffers",
+};
+
+static enum tsa_mitigations tsa_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_TSA) ? TSA_MITIGATION_AUTO : TSA_MITIGATION_NONE;
+
+static int __init tsa_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off"))
+ tsa_mitigation = TSA_MITIGATION_NONE;
+ else if (!strcmp(str, "on"))
+ tsa_mitigation = TSA_MITIGATION_FULL;
+ else if (!strcmp(str, "user"))
+ tsa_mitigation = TSA_MITIGATION_USER_KERNEL;
+ else if (!strcmp(str, "vm"))
+ tsa_mitigation = TSA_MITIGATION_VM;
+ else
+ pr_err("Ignoring unknown tsa=%s option.\n", str);
+
+ return 0;
+}
+early_param("tsa", tsa_parse_cmdline);
+
+static void __init tsa_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_TSA)) {
+ tsa_mitigation = TSA_MITIGATION_NONE;
return;
+ }
+
+ if (tsa_mitigation == TSA_MITIGATION_AUTO) {
+ bool vm = false, uk = false;
+
+ tsa_mitigation = TSA_MITIGATION_NONE;
- if (!boot_cpu_has(X86_FEATURE_RETPOLINE))
- setup_force_cpu_cap(X86_FEATURE_INDIRECT_THUNK_ITS);
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER)) {
+ tsa_mitigation = TSA_MITIGATION_USER_KERNEL;
+ uk = true;
+ }
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST)) {
+ tsa_mitigation = TSA_MITIGATION_VM;
+ vm = true;
+ }
- setup_force_cpu_cap(X86_FEATURE_RETHUNK);
- set_return_thunk(its_return_thunk);
+ if (uk && vm)
+ tsa_mitigation = TSA_MITIGATION_FULL;
+ }
+
+ if (tsa_mitigation == TSA_MITIGATION_NONE)
+ return;
+
+ if (!boot_cpu_has(X86_FEATURE_VERW_CLEAR))
+ tsa_mitigation = TSA_MITIGATION_UCODE_NEEDED;
+
+ /*
+ * No need to set verw_clear_cpu_buf_mitigation_selected - it
+ * doesn't fit all cases here and it is not needed because this
+ * is the only VERW-based mitigation on AMD.
+ */
+ pr_info("%s\n", tsa_strings[tsa_mitigation]);
+}
+
+static void __init tsa_apply_mitigation(void)
+{
+ switch (tsa_mitigation) {
+ case TSA_MITIGATION_USER_KERNEL:
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ break;
+ case TSA_MITIGATION_VM:
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM);
+ break;
+ case TSA_MITIGATION_FULL:
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM);
+ break;
+ default:
+ break;
+ }
}
#undef pr_fmt
@@ -1609,7 +1872,7 @@ static enum spectre_v2_user_cmd __init spectre_v2_parse_user_cmdline(void)
char arg[20];
int ret, i;
- if (cpu_mitigations_off() || !IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2))
+ if (!IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2))
return SPECTRE_V2_USER_CMD_NONE;
ret = cmdline_find_option(boot_command_line, "spectre_v2_user",
@@ -1647,6 +1910,13 @@ static void __init spectre_v2_user_select_mitigation(void)
spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT;
break;
case SPECTRE_V2_USER_CMD_AUTO:
+ if (!should_mitigate_vuln(X86_BUG_SPECTRE_V2_USER))
+ break;
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL;
+ if (smt_mitigations == SMT_MITIGATIONS_OFF)
+ break;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL;
+ break;
case SPECTRE_V2_USER_CMD_PRCTL:
spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL;
spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL;
@@ -1798,8 +2068,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
int ret, i;
cmd = IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_CMD_AUTO : SPECTRE_V2_CMD_NONE;
- if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") ||
- cpu_mitigations_off())
+ if (cmdline_find_option_bool(boot_command_line, "nospectre_v2"))
return SPECTRE_V2_CMD_NONE;
ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg));
@@ -2002,11 +2271,20 @@ early_param("spectre_bhi", spectre_bhi_parse_cmdline);
static void __init bhi_select_mitigation(void)
{
- if (!boot_cpu_has(X86_BUG_BHI) || cpu_mitigations_off())
+ if (!boot_cpu_has(X86_BUG_BHI))
bhi_mitigation = BHI_MITIGATION_OFF;
- if (bhi_mitigation == BHI_MITIGATION_AUTO)
- bhi_mitigation = BHI_MITIGATION_ON;
+ if (bhi_mitigation != BHI_MITIGATION_AUTO)
+ return;
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST)) {
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL))
+ bhi_mitigation = BHI_MITIGATION_ON;
+ else
+ bhi_mitigation = BHI_MITIGATION_VMEXIT_ONLY;
+ } else {
+ bhi_mitigation = BHI_MITIGATION_OFF;
+ }
}
static void __init bhi_update_mitigation(void)
@@ -2062,8 +2340,11 @@ static void __init spectre_v2_select_mitigation(void)
case SPECTRE_V2_CMD_NONE:
return;
- case SPECTRE_V2_CMD_FORCE:
case SPECTRE_V2_CMD_AUTO:
+ if (!should_mitigate_vuln(X86_BUG_SPECTRE_V2))
+ break;
+ fallthrough;
+ case SPECTRE_V2_CMD_FORCE:
if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) {
spectre_v2_enabled = SPECTRE_V2_EIBRS;
break;
@@ -2117,7 +2398,7 @@ static void __init spectre_v2_update_mitigation(void)
}
}
- if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && !cpu_mitigations_off())
+ if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
pr_info("%s\n", spectre_v2_strings[spectre_v2_enabled]);
}
@@ -2249,10 +2530,10 @@ static void update_mds_branch_idle(void)
return;
if (sched_smt_active()) {
- static_branch_enable(&mds_idle_clear);
+ static_branch_enable(&cpu_buf_idle_clear);
} else if (mmio_mitigation == MMIO_MITIGATION_OFF ||
(x86_arch_cap_msr & ARCH_CAP_FBSDP_NO)) {
- static_branch_disable(&mds_idle_clear);
+ static_branch_disable(&cpu_buf_idle_clear);
}
}
@@ -2316,6 +2597,25 @@ void cpu_bugs_smt_update(void)
break;
}
+ switch (tsa_mitigation) {
+ case TSA_MITIGATION_USER_KERNEL:
+ case TSA_MITIGATION_VM:
+ case TSA_MITIGATION_AUTO:
+ case TSA_MITIGATION_FULL:
+ /*
+ * TSA-SQ can potentially lead to info leakage between
+ * SMT threads.
+ */
+ if (sched_smt_active())
+ static_branch_enable(&cpu_buf_idle_clear);
+ else
+ static_branch_disable(&cpu_buf_idle_clear);
+ break;
+ case TSA_MITIGATION_NONE:
+ case TSA_MITIGATION_UCODE_NEEDED:
+ break;
+ }
+
mutex_unlock(&spec_ctrl_mutex);
}
@@ -2750,17 +3050,23 @@ static void override_cache_bits(struct cpuinfo_x86 *c)
static void __init l1tf_select_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_L1TF) || cpu_mitigations_off()) {
+ if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
l1tf_mitigation = L1TF_MITIGATION_OFF;
return;
}
- if (l1tf_mitigation == L1TF_MITIGATION_AUTO) {
- if (cpu_mitigations_auto_nosmt())
- l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT;
- else
- l1tf_mitigation = L1TF_MITIGATION_FLUSH;
+ if (l1tf_mitigation != L1TF_MITIGATION_AUTO)
+ return;
+
+ if (!should_mitigate_vuln(X86_BUG_L1TF)) {
+ l1tf_mitigation = L1TF_MITIGATION_OFF;
+ return;
}
+
+ if (smt_mitigations == SMT_MITIGATIONS_ON)
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT;
+ else
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH;
}
static void __init l1tf_apply_mitigation(void)
@@ -2834,31 +3140,18 @@ early_param("l1tf", l1tf_cmdline);
#undef pr_fmt
#define pr_fmt(fmt) "Speculative Return Stack Overflow: " fmt
-enum srso_mitigation {
- SRSO_MITIGATION_NONE,
- SRSO_MITIGATION_AUTO,
- SRSO_MITIGATION_UCODE_NEEDED,
- SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED,
- SRSO_MITIGATION_MICROCODE,
- SRSO_MITIGATION_SAFE_RET,
- SRSO_MITIGATION_IBPB,
- SRSO_MITIGATION_IBPB_ON_VMEXIT,
- SRSO_MITIGATION_BP_SPEC_REDUCE,
-};
-
static const char * const srso_strings[] = {
[SRSO_MITIGATION_NONE] = "Vulnerable",
[SRSO_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
[SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED] = "Vulnerable: Safe RET, no microcode",
[SRSO_MITIGATION_MICROCODE] = "Vulnerable: Microcode, no safe RET",
+ [SRSO_MITIGATION_NOSMT] = "Mitigation: SMT disabled",
[SRSO_MITIGATION_SAFE_RET] = "Mitigation: Safe RET",
[SRSO_MITIGATION_IBPB] = "Mitigation: IBPB",
[SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only",
[SRSO_MITIGATION_BP_SPEC_REDUCE] = "Mitigation: Reduced Speculation"
};
-static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_AUTO;
-
static int __init srso_parse_cmdline(char *str)
{
if (!str)
@@ -2885,35 +3178,44 @@ early_param("spec_rstack_overflow", srso_parse_cmdline);
static void __init srso_select_mitigation(void)
{
- bool has_microcode;
-
- if (!boot_cpu_has_bug(X86_BUG_SRSO) || cpu_mitigations_off())
+ if (!boot_cpu_has_bug(X86_BUG_SRSO)) {
srso_mitigation = SRSO_MITIGATION_NONE;
-
- if (srso_mitigation == SRSO_MITIGATION_NONE)
return;
+ }
- if (srso_mitigation == SRSO_MITIGATION_AUTO)
- srso_mitigation = SRSO_MITIGATION_SAFE_RET;
-
- has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE);
- if (has_microcode) {
- /*
- * Zen1/2 with SMT off aren't vulnerable after the right
- * IBPB microcode has been applied.
- */
- if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) {
- setup_force_cpu_cap(X86_FEATURE_SRSO_NO);
+ if (srso_mitigation == SRSO_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_SRSO)) {
+ srso_mitigation = SRSO_MITIGATION_SAFE_RET;
+ } else {
srso_mitigation = SRSO_MITIGATION_NONE;
return;
}
- } else {
+ }
+
+ /* Zen1/2 with SMT off aren't vulnerable to SRSO. */
+ if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) {
+ srso_mitigation = SRSO_MITIGATION_NOSMT;
+ return;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_IBPB_BRTYPE)) {
pr_warn("IBPB-extending microcode not applied!\n");
pr_warn(SRSO_NOTICE);
+
+ /*
+ * Safe-RET provides partial mitigation without microcode, but
+ * other mitigations require microcode to provide any
+ * mitigations.
+ */
+ if (srso_mitigation == SRSO_MITIGATION_SAFE_RET)
+ srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED;
+ else
+ srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED;
}
switch (srso_mitigation) {
case SRSO_MITIGATION_SAFE_RET:
+ case SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED:
if (boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO)) {
srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT;
goto ibpb_on_vmexit;
@@ -2923,9 +3225,6 @@ static void __init srso_select_mitigation(void)
pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n");
srso_mitigation = SRSO_MITIGATION_NONE;
}
-
- if (!has_microcode)
- srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED;
break;
ibpb_on_vmexit:
case SRSO_MITIGATION_IBPB_ON_VMEXIT:
@@ -2940,9 +3239,6 @@ ibpb_on_vmexit:
pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n");
srso_mitigation = SRSO_MITIGATION_NONE;
}
-
- if (!has_microcode)
- srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED;
break;
default:
break;
@@ -2957,8 +3253,7 @@ static void __init srso_update_mitigation(void)
srso_mitigation = SRSO_MITIGATION_IBPB;
if (boot_cpu_has_bug(X86_BUG_SRSO) &&
- !cpu_mitigations_off() &&
- !boot_cpu_has(X86_FEATURE_SRSO_NO))
+ !cpu_mitigations_off())
pr_info("%s\n", srso_strings[srso_mitigation]);
}
@@ -3254,9 +3549,6 @@ static ssize_t retbleed_show_state(char *buf)
static ssize_t srso_show_state(char *buf)
{
- if (boot_cpu_has(X86_FEATURE_SRSO_NO))
- return sysfs_emit(buf, "Mitigation: SMT disabled\n");
-
return sysfs_emit(buf, "%s\n", srso_strings[srso_mitigation]);
}
@@ -3265,6 +3557,11 @@ static ssize_t gds_show_state(char *buf)
return sysfs_emit(buf, "%s\n", gds_strings[gds_mitigation]);
}
+static ssize_t tsa_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", tsa_strings[tsa_mitigation]);
+}
+
static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
char *buf, unsigned int bug)
{
@@ -3328,6 +3625,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
case X86_BUG_ITS:
return its_show_state(buf);
+ case X86_BUG_TSA:
+ return tsa_show_state(buf);
+
default:
break;
}
@@ -3414,6 +3714,11 @@ ssize_t cpu_show_indirect_target_selection(struct device *dev, struct device_att
{
return cpu_show_common(dev, attr, buf, X86_BUG_ITS);
}
+
+ssize_t cpu_show_tsa(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_TSA);
+}
#endif
void __warn_thunk(void)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 8feb8fd2957a..34a054181c4d 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -26,6 +26,7 @@
#include <linux/pgtable.h>
#include <linux/stackprotector.h>
#include <linux/utsname.h>
+#include <linux/efi.h>
#include <asm/alternative.h>
#include <asm/cmdline.h>
@@ -1233,6 +1234,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
#define ITS BIT(8)
/* CPU is affected by Indirect Target Selection, but guest-host isolation is not affected */
#define ITS_NATIVE_ONLY BIT(9)
+/* CPU is affected by Transient Scheduler Attacks */
+#define TSA BIT(10)
static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE, X86_STEP_MAX, SRBDS),
@@ -1280,7 +1283,7 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_AMD(0x16, RETBLEED),
VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO),
VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO),
- VULNBL_AMD(0x19, SRSO),
+ VULNBL_AMD(0x19, SRSO | TSA),
VULNBL_AMD(0x1a, SRSO),
{}
};
@@ -1530,6 +1533,16 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
setup_force_cpu_bug(X86_BUG_ITS_NATIVE_ONLY);
}
+ if (c->x86_vendor == X86_VENDOR_AMD) {
+ if (!cpu_has(c, X86_FEATURE_TSA_SQ_NO) ||
+ !cpu_has(c, X86_FEATURE_TSA_L1_NO)) {
+ if (cpu_matches(cpu_vuln_blacklist, TSA) ||
+ /* Enable bug on Zen guests to allow for live migration. */
+ (cpu_has(c, X86_FEATURE_HYPERVISOR) && cpu_has(c, X86_FEATURE_ZEN)))
+ setup_force_cpu_bug(X86_BUG_TSA);
+ }
+ }
+
if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
return;
@@ -2243,20 +2256,16 @@ EXPORT_PER_CPU_SYMBOL(__stack_chk_guard);
#endif
#endif
-/*
- * Clear all 6 debug registers:
- */
-static void clear_all_debug_regs(void)
+static void initialize_debug_regs(void)
{
- int i;
-
- for (i = 0; i < 8; i++) {
- /* Ignore db4, db5 */
- if ((i == 4) || (i == 5))
- continue;
-
- set_debugreg(0, i);
- }
+ /* Control register first -- to make sure everything is disabled. */
+ set_debugreg(DR7_FIXED_1, 7);
+ set_debugreg(DR6_RESERVED, 6);
+ /* dr5 and dr4 don't exist */
+ set_debugreg(0, 3);
+ set_debugreg(0, 2);
+ set_debugreg(0, 1);
+ set_debugreg(0, 0);
}
#ifdef CONFIG_KGDB
@@ -2417,7 +2426,7 @@ void cpu_init(void)
load_mm_ldt(&init_mm);
- clear_all_debug_regs();
+ initialize_debug_regs();
dbg_restore_debug_regs();
doublefault_init_cpu_tss();
@@ -2530,6 +2539,12 @@ void __init arch_cpu_finalize_init(void)
fpu__init_cpu();
/*
+ * This needs to follow the FPU initializtion, since EFI depends on it.
+ */
+ if (efi_enabled(EFI_RUNTIME_SERVICES))
+ efi_enter_virtual_mode();
+
+ /*
* Ensure that access to the per CPU representation has the initial
* boot CPU configuration.
*/
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 9d852c3b2cb5..5c4eb28c3ac9 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -350,7 +350,6 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
struct thresh_restart {
struct threshold_block *b;
- int reset;
int set_lvt_off;
int lvt_off;
u16 old_limit;
@@ -432,13 +431,13 @@ static void threshold_restart_bank(void *_tr)
rdmsr(tr->b->address, lo, hi);
- if (tr->b->threshold_limit < (hi & THRESHOLD_MAX))
- tr->reset = 1; /* limit cannot be lower than err count */
-
- if (tr->reset) { /* reset err count and overflow bit */
- hi =
- (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
- (THRESHOLD_MAX - tr->b->threshold_limit);
+ /*
+ * Reset error count and overflow bit.
+ * This is done during init or after handling an interrupt.
+ */
+ if (hi & MASK_OVERFLOW_HI || tr->set_lvt_off) {
+ hi &= ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI);
+ hi |= THRESHOLD_MAX - tr->b->threshold_limit;
} else if (tr->old_limit) { /* change limit w/o reset */
int new_count = (hi & THRESHOLD_MAX) +
(tr->old_limit - tr->b->threshold_limit);
@@ -1113,13 +1112,20 @@ static const char *get_name(unsigned int cpu, unsigned int bank, struct threshol
}
bank_type = smca_get_bank_type(cpu, bank);
- if (bank_type >= N_SMCA_BANK_TYPES)
- return NULL;
if (b && (bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2)) {
if (b->block < ARRAY_SIZE(smca_umc_block_names))
return smca_umc_block_names[b->block];
- return NULL;
+ }
+
+ if (b && b->block) {
+ snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, "th_block_%u", b->block);
+ return buf_mcatype;
+ }
+
+ if (bank_type >= N_SMCA_BANK_TYPES) {
+ snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, "th_bank_%u", bank);
+ return buf_mcatype;
}
if (per_cpu(smca_bank_counts, cpu)[bank_type] == 1)
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index e9b3c5d4a52e..4da4eab56c81 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1740,6 +1740,11 @@ static void mc_poll_banks_default(void)
void (*mc_poll_banks)(void) = mc_poll_banks_default;
+static bool should_enable_timer(unsigned long iv)
+{
+ return !mca_cfg.ignore_ce && iv;
+}
+
static void mce_timer_fn(struct timer_list *t)
{
struct timer_list *cpu_t = this_cpu_ptr(&mce_timer);
@@ -1763,7 +1768,7 @@ static void mce_timer_fn(struct timer_list *t)
if (mce_get_storm_mode()) {
__start_timer(t, HZ);
- } else {
+ } else if (should_enable_timer(iv)) {
__this_cpu_write(mce_next_interval, iv);
__start_timer(t, iv);
}
@@ -2156,11 +2161,10 @@ static void mce_start_timer(struct timer_list *t)
{
unsigned long iv = check_interval * HZ;
- if (mca_cfg.ignore_ce || !iv)
- return;
-
- this_cpu_write(mce_next_interval, iv);
- __start_timer(t, iv);
+ if (should_enable_timer(iv)) {
+ this_cpu_write(mce_next_interval, iv);
+ __start_timer(t, iv);
+ }
}
static void __mcheck_cpu_setup_timer(void)
@@ -2801,15 +2805,9 @@ static int mce_cpu_dead(unsigned int cpu)
static int mce_cpu_online(unsigned int cpu)
{
struct timer_list *t = this_cpu_ptr(&mce_timer);
- int ret;
mce_device_create(cpu);
-
- ret = mce_threshold_create_device(cpu);
- if (ret) {
- mce_device_remove(cpu);
- return ret;
- }
+ mce_threshold_create_device(cpu);
mce_reenable_cpu();
mce_start_timer(t);
return 0;
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index efcf21e9552e..9b149b9c4109 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -478,6 +478,7 @@ void mce_intel_feature_init(struct cpuinfo_x86 *c)
void mce_intel_feature_clear(struct cpuinfo_x86 *c)
{
intel_clear_lmce();
+ cmci_clear();
}
bool intel_filter_mce(struct mce *m)
diff --git a/arch/x86/kernel/cpu/microcode/amd_shas.c b/arch/x86/kernel/cpu/microcode/amd_shas.c
index 2a1655b1fdd8..1fd349cfc802 100644
--- a/arch/x86/kernel/cpu/microcode/amd_shas.c
+++ b/arch/x86/kernel/cpu/microcode/amd_shas.c
@@ -231,6 +231,13 @@ static const struct patch_digest phashes[] = {
0x0d,0x5b,0x65,0x34,0x69,0xb2,0x62,0x21,
}
},
+ { 0xa0011d7, {
+ 0x35,0x07,0xcd,0x40,0x94,0xbc,0x81,0x6b,
+ 0xfc,0x61,0x56,0x1a,0xe2,0xdb,0x96,0x12,
+ 0x1c,0x1c,0x31,0xb1,0x02,0x6f,0xe5,0xd2,
+ 0xfe,0x1b,0x04,0x03,0x2c,0x8f,0x4c,0x36,
+ }
+ },
{ 0xa001223, {
0xfb,0x32,0x5f,0xc6,0x83,0x4f,0x8c,0xb8,
0xa4,0x05,0xf9,0x71,0x53,0x01,0x16,0xc4,
@@ -294,6 +301,13 @@ static const struct patch_digest phashes[] = {
0xc0,0xcd,0x33,0xf2,0x8d,0xf9,0xef,0x59,
}
},
+ { 0xa00123b, {
+ 0xef,0xa1,0x1e,0x71,0xf1,0xc3,0x2c,0xe2,
+ 0xc3,0xef,0x69,0x41,0x7a,0x54,0xca,0xc3,
+ 0x8f,0x62,0x84,0xee,0xc2,0x39,0xd9,0x28,
+ 0x95,0xa7,0x12,0x49,0x1e,0x30,0x71,0x72,
+ }
+ },
{ 0xa00820c, {
0xa8,0x0c,0x81,0xc0,0xa6,0x00,0xe7,0xf3,
0x5f,0x65,0xd3,0xb9,0x6f,0xea,0x93,0x63,
@@ -301,6 +315,13 @@ static const struct patch_digest phashes[] = {
0xe1,0x3b,0x8d,0xb2,0xf8,0x22,0x03,0xe2,
}
},
+ { 0xa00820d, {
+ 0xf9,0x2a,0xc0,0xf4,0x9e,0xa4,0x87,0xa4,
+ 0x7d,0x87,0x00,0xfd,0xab,0xda,0x19,0xca,
+ 0x26,0x51,0x32,0xc1,0x57,0x91,0xdf,0xc1,
+ 0x05,0xeb,0x01,0x7c,0x5a,0x95,0x21,0xb7,
+ }
+ },
{ 0xa10113e, {
0x05,0x3c,0x66,0xd7,0xa9,0x5a,0x33,0x10,
0x1b,0xf8,0x9c,0x8f,0xed,0xfc,0xa7,0xa0,
@@ -322,6 +343,13 @@ static const struct patch_digest phashes[] = {
0xf1,0x5e,0xb0,0xde,0xb4,0x98,0xae,0xc4,
}
},
+ { 0xa10114c, {
+ 0x9e,0xb6,0xa2,0xd9,0x87,0x38,0xc5,0x64,
+ 0xd8,0x88,0xfa,0x78,0x98,0xf9,0x6f,0x74,
+ 0x39,0x90,0x1b,0xa5,0xcf,0x5e,0xb4,0x2a,
+ 0x02,0xff,0xd4,0x8c,0x71,0x8b,0xe2,0xc0,
+ }
+ },
{ 0xa10123e, {
0x03,0xb9,0x2c,0x76,0x48,0x93,0xc9,0x18,
0xfb,0x56,0xfd,0xf7,0xe2,0x1d,0xca,0x4d,
@@ -343,6 +371,13 @@ static const struct patch_digest phashes[] = {
0x1b,0x7d,0x64,0x9d,0x4b,0x53,0x13,0x75,
}
},
+ { 0xa10124c, {
+ 0x29,0xea,0xf1,0x2c,0xb2,0xe4,0xef,0x90,
+ 0xa4,0xcd,0x1d,0x86,0x97,0x17,0x61,0x46,
+ 0xfc,0x22,0xcb,0x57,0x75,0x19,0xc8,0xcc,
+ 0x0c,0xf5,0xbc,0xac,0x81,0x9d,0x9a,0xd2,
+ }
+ },
{ 0xa108108, {
0xed,0xc2,0xec,0xa1,0x15,0xc6,0x65,0xe9,
0xd0,0xef,0x39,0xaa,0x7f,0x55,0x06,0xc6,
@@ -350,6 +385,13 @@ static const struct patch_digest phashes[] = {
0x28,0x1e,0x9c,0x59,0x69,0x99,0x4d,0x16,
}
},
+ { 0xa108109, {
+ 0x85,0xb4,0xbd,0x7c,0x49,0xa7,0xbd,0xfa,
+ 0x49,0x36,0x80,0x81,0xc5,0xb7,0x39,0x1b,
+ 0x9a,0xaa,0x50,0xde,0x9b,0xe9,0x32,0x35,
+ 0x42,0x7e,0x51,0x4f,0x52,0x2c,0x28,0x59,
+ }
+ },
{ 0xa20102d, {
0xf9,0x6e,0xf2,0x32,0xd3,0x0f,0x5f,0x11,
0x59,0xa1,0xfe,0xcc,0xcd,0x9b,0x42,0x89,
@@ -357,6 +399,13 @@ static const struct patch_digest phashes[] = {
0x8c,0xe9,0x19,0x3e,0xcc,0x3f,0x7b,0xb4,
}
},
+ { 0xa20102e, {
+ 0xbe,0x1f,0x32,0x04,0x0d,0x3c,0x9c,0xdd,
+ 0xe1,0xa4,0xbf,0x76,0x3a,0xec,0xc2,0xf6,
+ 0x11,0x00,0xa7,0xaf,0x0f,0xe5,0x02,0xc5,
+ 0x54,0x3a,0x1f,0x8c,0x16,0xb5,0xff,0xbe,
+ }
+ },
{ 0xa201210, {
0xe8,0x6d,0x51,0x6a,0x8e,0x72,0xf3,0xfe,
0x6e,0x16,0xbc,0x62,0x59,0x40,0x17,0xe9,
@@ -364,6 +413,13 @@ static const struct patch_digest phashes[] = {
0xf7,0x55,0xf0,0x13,0xbb,0x22,0xf6,0x41,
}
},
+ { 0xa201211, {
+ 0x69,0xa1,0x17,0xec,0xd0,0xf6,0x6c,0x95,
+ 0xe2,0x1e,0xc5,0x59,0x1a,0x52,0x0a,0x27,
+ 0xc4,0xed,0xd5,0x59,0x1f,0xbf,0x00,0xff,
+ 0x08,0x88,0xb5,0xe1,0x12,0xb6,0xcc,0x27,
+ }
+ },
{ 0xa404107, {
0xbb,0x04,0x4e,0x47,0xdd,0x5e,0x26,0x45,
0x1a,0xc9,0x56,0x24,0xa4,0x4c,0x82,0xb0,
@@ -371,6 +427,13 @@ static const struct patch_digest phashes[] = {
0x13,0xbc,0xc5,0x25,0xe4,0xc5,0xc3,0x99,
}
},
+ { 0xa404108, {
+ 0x69,0x67,0x43,0x06,0xf8,0x0c,0x62,0xdc,
+ 0xa4,0x21,0x30,0x4f,0x0f,0x21,0x2c,0xcb,
+ 0xcc,0x37,0xf1,0x1c,0xc3,0xf8,0x2f,0x19,
+ 0xdf,0x53,0x53,0x46,0xb1,0x15,0xea,0x00,
+ }
+ },
{ 0xa500011, {
0x23,0x3d,0x70,0x7d,0x03,0xc3,0xc4,0xf4,
0x2b,0x82,0xc6,0x05,0xda,0x80,0x0a,0xf1,
@@ -378,6 +441,13 @@ static const struct patch_digest phashes[] = {
0x11,0x5e,0x96,0x7e,0x71,0xe9,0xfc,0x74,
}
},
+ { 0xa500012, {
+ 0xeb,0x74,0x0d,0x47,0xa1,0x8e,0x09,0xe4,
+ 0x93,0x4c,0xad,0x03,0x32,0x4c,0x38,0x16,
+ 0x10,0x39,0xdd,0x06,0xaa,0xce,0xd6,0x0f,
+ 0x62,0x83,0x9d,0x8e,0x64,0x55,0xbe,0x63,
+ }
+ },
{ 0xa601209, {
0x66,0x48,0xd4,0x09,0x05,0xcb,0x29,0x32,
0x66,0xb7,0x9a,0x76,0xcd,0x11,0xf3,0x30,
@@ -385,6 +455,13 @@ static const struct patch_digest phashes[] = {
0xe8,0x73,0xe2,0xd6,0xdb,0xd2,0x77,0x1d,
}
},
+ { 0xa60120a, {
+ 0x0c,0x8b,0x3d,0xfd,0x52,0x52,0x85,0x7d,
+ 0x20,0x3a,0xe1,0x7e,0xa4,0x21,0x3b,0x7b,
+ 0x17,0x86,0xae,0xac,0x13,0xb8,0x63,0x9d,
+ 0x06,0x01,0xd0,0xa0,0x51,0x9a,0x91,0x2c,
+ }
+ },
{ 0xa704107, {
0xf3,0xc6,0x58,0x26,0xee,0xac,0x3f,0xd6,
0xce,0xa1,0x72,0x47,0x3b,0xba,0x2b,0x93,
@@ -392,6 +469,13 @@ static const struct patch_digest phashes[] = {
0x64,0x39,0x71,0x8c,0xce,0xe7,0x41,0x39,
}
},
+ { 0xa704108, {
+ 0xd7,0x55,0x15,0x2b,0xfe,0xc4,0xbc,0x93,
+ 0xec,0x91,0xa0,0xae,0x45,0xb7,0xc3,0x98,
+ 0x4e,0xff,0x61,0x77,0x88,0xc2,0x70,0x49,
+ 0xe0,0x3a,0x1d,0x84,0x38,0x52,0xbf,0x5a,
+ }
+ },
{ 0xa705206, {
0x8d,0xc0,0x76,0xbd,0x58,0x9f,0x8f,0xa4,
0x12,0x9d,0x21,0xfb,0x48,0x21,0xbc,0xe7,
@@ -399,6 +483,13 @@ static const struct patch_digest phashes[] = {
0x03,0x35,0xe9,0xbe,0xfb,0x06,0xdf,0xfc,
}
},
+ { 0xa705208, {
+ 0x30,0x1d,0x55,0x24,0xbc,0x6b,0x5a,0x19,
+ 0x0c,0x7d,0x1d,0x74,0xaa,0xd1,0xeb,0xd2,
+ 0x16,0x62,0xf7,0x5b,0xe1,0x1f,0x18,0x11,
+ 0x5c,0xf0,0x94,0x90,0x26,0xec,0x69,0xff,
+ }
+ },
{ 0xa708007, {
0x6b,0x76,0xcc,0x78,0xc5,0x8a,0xa3,0xe3,
0x32,0x2d,0x79,0xe4,0xc3,0x80,0xdb,0xb2,
@@ -406,6 +497,13 @@ static const struct patch_digest phashes[] = {
0xdf,0x92,0x73,0x84,0x87,0x3c,0x73,0x93,
}
},
+ { 0xa708008, {
+ 0x08,0x6e,0xf0,0x22,0x4b,0x8e,0xc4,0x46,
+ 0x58,0x34,0xe6,0x47,0xa2,0x28,0xfd,0xab,
+ 0x22,0x3d,0xdd,0xd8,0x52,0x9e,0x1d,0x16,
+ 0xfa,0x01,0x68,0x14,0x79,0x3e,0xe8,0x6b,
+ }
+ },
{ 0xa70c005, {
0x88,0x5d,0xfb,0x79,0x64,0xd8,0x46,0x3b,
0x4a,0x83,0x8e,0x77,0x7e,0xcf,0xb3,0x0f,
@@ -413,6 +511,13 @@ static const struct patch_digest phashes[] = {
0xee,0x49,0xac,0xe1,0x8b,0x13,0xc5,0x13,
}
},
+ { 0xa70c008, {
+ 0x0f,0xdb,0x37,0xa1,0x10,0xaf,0xd4,0x21,
+ 0x94,0x0d,0xa4,0xa2,0xe9,0x86,0x6c,0x0e,
+ 0x85,0x7c,0x36,0x30,0xa3,0x3a,0x78,0x66,
+ 0x18,0x10,0x60,0x0d,0x78,0x3d,0x44,0xd0,
+ }
+ },
{ 0xaa00116, {
0xe8,0x4c,0x2c,0x88,0xa1,0xac,0x24,0x63,
0x65,0xe5,0xaa,0x2d,0x16,0xa9,0xc3,0xf5,
@@ -441,4 +546,11 @@ static const struct patch_digest phashes[] = {
0x68,0x2f,0x46,0xee,0xfe,0xc6,0x6d,0xef,
}
},
+ { 0xaa00216, {
+ 0x79,0xfb,0x5b,0x9f,0xb6,0xe6,0xa8,0xf5,
+ 0x4e,0x7c,0x4f,0x8e,0x1d,0xad,0xd0,0x08,
+ 0xc2,0x43,0x7c,0x8b,0xe6,0xdb,0xd0,0xd2,
+ 0xe8,0x39,0x26,0xc1,0xe5,0x5a,0x48,0xf1,
+ }
+ },
};
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index fe50eb5b7c4a..b92e09a87c69 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -17,8 +17,8 @@
#define pr_fmt(fmt) "microcode: " fmt
-#include <linux/platform_device.h>
#include <linux/stop_machine.h>
+#include <linux/device/faux.h>
#include <linux/syscore_ops.h>
#include <linux/miscdevice.h>
#include <linux/capability.h>
@@ -249,7 +249,7 @@ static void reload_early_microcode(unsigned int cpu)
}
/* fake device for request_firmware */
-static struct platform_device *microcode_pdev;
+static struct faux_device *microcode_fdev;
#ifdef CONFIG_MICROCODE_LATE_LOADING
/*
@@ -690,7 +690,7 @@ static int load_late_locked(void)
if (!setup_cpus())
return -EBUSY;
- switch (microcode_ops->request_microcode_fw(0, &microcode_pdev->dev)) {
+ switch (microcode_ops->request_microcode_fw(0, &microcode_fdev->dev)) {
case UCODE_NEW:
return load_late_stop_cpus(false);
case UCODE_NEW_SAFE:
@@ -841,9 +841,9 @@ static int __init microcode_init(void)
if (early_data.new_rev)
pr_info_once("Updated early from: 0x%08x\n", early_data.old_rev);
- microcode_pdev = platform_device_register_simple("microcode", -1, NULL, 0);
- if (IS_ERR(microcode_pdev))
- return PTR_ERR(microcode_pdev);
+ microcode_fdev = faux_device_create("microcode", NULL, NULL);
+ if (!microcode_fdev)
+ return -ENODEV;
dev_root = bus_get_dev_root(&cpu_subsys);
if (dev_root) {
@@ -862,7 +862,7 @@ static int __init microcode_init(void)
return 0;
out_pdev:
- platform_device_unregister(microcode_pdev);
+ faux_device_destroy(microcode_fdev);
return error;
}
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index e2c6b471d230..8c18327eb10b 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -593,7 +593,7 @@ static void get_fixed_ranges(mtrr_type *frs)
void mtrr_save_fixed_ranges(void *info)
{
- if (boot_cpu_has(X86_FEATURE_MTRR))
+ if (mtrr_state.have_fixed)
get_fixed_ranges(mtrr_state.fixed_ranges);
}
diff --git a/arch/x86/kernel/cpu/resctrl/Makefile b/arch/x86/kernel/cpu/resctrl/Makefile
index 0c13b0befd8a..d8a04b195da2 100644
--- a/arch/x86/kernel/cpu/resctrl/Makefile
+++ b/arch/x86/kernel/cpu/resctrl/Makefile
@@ -2,4 +2,6 @@
obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o
obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o
obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK) += pseudo_lock.o
+
+# To allow define_trace.h's recursive include:
CFLAGS_pseudo_lock.o = -I$(src)
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index d987b11c168c..187d527ef73b 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -61,7 +61,6 @@ struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = {
[RDT_RESOURCE_L3] =
{
.r_resctrl = {
- .rid = RDT_RESOURCE_L3,
.name = "L3",
.ctrl_scope = RESCTRL_L3_CACHE,
.mon_scope = RESCTRL_L3_CACHE,
@@ -75,7 +74,6 @@ struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = {
[RDT_RESOURCE_L2] =
{
.r_resctrl = {
- .rid = RDT_RESOURCE_L2,
.name = "L2",
.ctrl_scope = RESCTRL_L2_CACHE,
.ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L2),
@@ -87,7 +85,6 @@ struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = {
[RDT_RESOURCE_MBA] =
{
.r_resctrl = {
- .rid = RDT_RESOURCE_MBA,
.name = "MB",
.ctrl_scope = RESCTRL_L3_CACHE,
.ctrl_domains = ctrl_domain_init(RDT_RESOURCE_MBA),
@@ -97,7 +94,6 @@ struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = {
[RDT_RESOURCE_SMBA] =
{
.r_resctrl = {
- .rid = RDT_RESOURCE_SMBA,
.name = "SMBA",
.ctrl_scope = RESCTRL_L3_CACHE,
.ctrl_domains = ctrl_domain_init(RDT_RESOURCE_SMBA),
@@ -165,21 +161,6 @@ static inline void cache_alloc_hsw_probe(void)
rdt_alloc_capable = true;
}
-bool is_mba_sc(struct rdt_resource *r)
-{
- if (!r)
- r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
-
- /*
- * The software controller support is only applicable to MBA resource.
- * Make sure to check for resource type.
- */
- if (r->rid != RDT_RESOURCE_MBA)
- return false;
-
- return r->membw.mba_sc;
-}
-
/*
* rdt_get_mb_table() - get a mapping of bandwidth(b/w) percentage values
* exposed to user interface and the h/w understandable delay values.
@@ -517,6 +498,7 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
struct rdt_hw_mon_domain *hw_dom;
struct rdt_domain_hdr *hdr;
struct rdt_mon_domain *d;
+ struct cacheinfo *ci;
int err;
lockdep_assert_held(&domain_list_lock);
@@ -544,12 +526,13 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
d = &hw_dom->d_resctrl;
d->hdr.id = id;
d->hdr.type = RESCTRL_MON_DOMAIN;
- d->ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
- if (!d->ci) {
+ ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
+ if (!ci) {
pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name);
mon_domain_free(hw_dom);
return;
}
+ d->ci_id = ci->id;
cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
arch_mon_domain_online(r, d);
@@ -738,7 +721,7 @@ struct rdt_options {
bool force_off, force_on;
};
-static struct rdt_options rdt_options[] __initdata = {
+static struct rdt_options rdt_options[] __ro_after_init = {
RDT_OPT(RDT_FLAG_CMT, "cmt", X86_FEATURE_CQM_OCCUP_LLC),
RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL),
RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL),
@@ -778,7 +761,7 @@ static int __init set_rdt_options(char *str)
}
__setup("rdt", set_rdt_options);
-bool __init rdt_cpu_has(int flag)
+bool rdt_cpu_has(int flag)
{
bool ret = boot_cpu_has(flag);
struct rdt_options *o;
@@ -798,7 +781,7 @@ bool __init rdt_cpu_has(int flag)
return ret;
}
-__init bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt)
+bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt)
{
if (!rdt_cpu_has(X86_FEATURE_BMEC))
return false;
@@ -1012,7 +995,11 @@ void resctrl_cpu_detect(struct cpuinfo_x86 *c)
static int __init resctrl_arch_late_init(void)
{
struct rdt_resource *r;
- int state, ret;
+ int state, ret, i;
+
+ /* for_each_rdt_resource() requires all rid to be initialised. */
+ for (i = 0; i < RDT_NUM_RESOURCES; i++)
+ rdt_resources_all[i].r_resctrl.rid = i;
/*
* Initialize functions(or definitions) that are different
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index 0a0ac5f6112e..1189c0df4ad7 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -16,277 +16,9 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/cpu.h>
-#include <linux/kernfs.h>
-#include <linux/seq_file.h>
-#include <linux/slab.h>
-#include <linux/tick.h>
#include "internal.h"
-struct rdt_parse_data {
- struct rdtgroup *rdtgrp;
- char *buf;
-};
-
-typedef int (ctrlval_parser_t)(struct rdt_parse_data *data,
- struct resctrl_schema *s,
- struct rdt_ctrl_domain *d);
-
-/*
- * Check whether MBA bandwidth percentage value is correct. The value is
- * checked against the minimum and max bandwidth values specified by the
- * hardware. The allocated bandwidth percentage is rounded to the next
- * control step available on the hardware.
- */
-static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r)
-{
- int ret;
- u32 bw;
-
- /*
- * Only linear delay values is supported for current Intel SKUs.
- */
- if (!r->membw.delay_linear && r->membw.arch_needs_linear) {
- rdt_last_cmd_puts("No support for non-linear MB domains\n");
- return false;
- }
-
- ret = kstrtou32(buf, 10, &bw);
- if (ret) {
- rdt_last_cmd_printf("Invalid MB value %s\n", buf);
- return false;
- }
-
- /* Nothing else to do if software controller is enabled. */
- if (is_mba_sc(r)) {
- *data = bw;
- return true;
- }
-
- if (bw < r->membw.min_bw || bw > r->membw.max_bw) {
- rdt_last_cmd_printf("MB value %u out of range [%d,%d]\n",
- bw, r->membw.min_bw, r->membw.max_bw);
- return false;
- }
-
- *data = roundup(bw, (unsigned long)r->membw.bw_gran);
- return true;
-}
-
-static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
- struct rdt_ctrl_domain *d)
-{
- struct resctrl_staged_config *cfg;
- u32 closid = data->rdtgrp->closid;
- struct rdt_resource *r = s->res;
- u32 bw_val;
-
- cfg = &d->staged_config[s->conf_type];
- if (cfg->have_new_ctrl) {
- rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id);
- return -EINVAL;
- }
-
- if (!bw_validate(data->buf, &bw_val, r))
- return -EINVAL;
-
- if (is_mba_sc(r)) {
- d->mbps_val[closid] = bw_val;
- return 0;
- }
-
- cfg->new_ctrl = bw_val;
- cfg->have_new_ctrl = true;
-
- return 0;
-}
-
-/*
- * Check whether a cache bit mask is valid.
- * On Intel CPUs, non-contiguous 1s value support is indicated by CPUID:
- * - CPUID.0x10.1:ECX[3]: L3 non-contiguous 1s value supported if 1
- * - CPUID.0x10.2:ECX[3]: L2 non-contiguous 1s value supported if 1
- *
- * Haswell does not support a non-contiguous 1s value and additionally
- * requires at least two bits set.
- * AMD allows non-contiguous bitmasks.
- */
-static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
-{
- u32 supported_bits = BIT_MASK(r->cache.cbm_len) - 1;
- unsigned int cbm_len = r->cache.cbm_len;
- unsigned long first_bit, zero_bit, val;
- int ret;
-
- ret = kstrtoul(buf, 16, &val);
- if (ret) {
- rdt_last_cmd_printf("Non-hex character in the mask %s\n", buf);
- return false;
- }
-
- if ((r->cache.min_cbm_bits > 0 && val == 0) || val > supported_bits) {
- rdt_last_cmd_puts("Mask out of range\n");
- return false;
- }
-
- first_bit = find_first_bit(&val, cbm_len);
- zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
-
- /* Are non-contiguous bitmasks allowed? */
- if (!r->cache.arch_has_sparse_bitmasks &&
- (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) {
- rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val);
- return false;
- }
-
- if ((zero_bit - first_bit) < r->cache.min_cbm_bits) {
- rdt_last_cmd_printf("Need at least %d bits in the mask\n",
- r->cache.min_cbm_bits);
- return false;
- }
-
- *data = val;
- return true;
-}
-
-/*
- * Read one cache bit mask (hex). Check that it is valid for the current
- * resource type.
- */
-static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s,
- struct rdt_ctrl_domain *d)
-{
- struct rdtgroup *rdtgrp = data->rdtgrp;
- struct resctrl_staged_config *cfg;
- struct rdt_resource *r = s->res;
- u32 cbm_val;
-
- cfg = &d->staged_config[s->conf_type];
- if (cfg->have_new_ctrl) {
- rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id);
- return -EINVAL;
- }
-
- /*
- * Cannot set up more than one pseudo-locked region in a cache
- * hierarchy.
- */
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
- rdtgroup_pseudo_locked_in_hierarchy(d)) {
- rdt_last_cmd_puts("Pseudo-locked region in hierarchy\n");
- return -EINVAL;
- }
-
- if (!cbm_validate(data->buf, &cbm_val, r))
- return -EINVAL;
-
- if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE ||
- rdtgrp->mode == RDT_MODE_SHAREABLE) &&
- rdtgroup_cbm_overlaps_pseudo_locked(d, cbm_val)) {
- rdt_last_cmd_puts("CBM overlaps with pseudo-locked region\n");
- return -EINVAL;
- }
-
- /*
- * The CBM may not overlap with the CBM of another closid if
- * either is exclusive.
- */
- if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, true)) {
- rdt_last_cmd_puts("Overlaps with exclusive group\n");
- return -EINVAL;
- }
-
- if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, false)) {
- if (rdtgrp->mode == RDT_MODE_EXCLUSIVE ||
- rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
- rdt_last_cmd_puts("Overlaps with other group\n");
- return -EINVAL;
- }
- }
-
- cfg->new_ctrl = cbm_val;
- cfg->have_new_ctrl = true;
-
- return 0;
-}
-
-/*
- * For each domain in this resource we expect to find a series of:
- * id=mask
- * separated by ";". The "id" is in decimal, and must match one of
- * the "id"s for this resource.
- */
-static int parse_line(char *line, struct resctrl_schema *s,
- struct rdtgroup *rdtgrp)
-{
- enum resctrl_conf_type t = s->conf_type;
- ctrlval_parser_t *parse_ctrlval = NULL;
- struct resctrl_staged_config *cfg;
- struct rdt_resource *r = s->res;
- struct rdt_parse_data data;
- struct rdt_ctrl_domain *d;
- char *dom = NULL, *id;
- unsigned long dom_id;
-
- /* Walking r->domains, ensure it can't race with cpuhp */
- lockdep_assert_cpus_held();
-
- switch (r->schema_fmt) {
- case RESCTRL_SCHEMA_BITMAP:
- parse_ctrlval = &parse_cbm;
- break;
- case RESCTRL_SCHEMA_RANGE:
- parse_ctrlval = &parse_bw;
- break;
- }
-
- if (WARN_ON_ONCE(!parse_ctrlval))
- return -EINVAL;
-
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
- (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) {
- rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n");
- return -EINVAL;
- }
-
-next:
- if (!line || line[0] == '\0')
- return 0;
- dom = strsep(&line, ";");
- id = strsep(&dom, "=");
- if (!dom || kstrtoul(id, 10, &dom_id)) {
- rdt_last_cmd_puts("Missing '=' or non-numeric domain\n");
- return -EINVAL;
- }
- dom = strim(dom);
- list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
- if (d->hdr.id == dom_id) {
- data.buf = dom;
- data.rdtgrp = rdtgrp;
- if (parse_ctrlval(&data, s, d))
- return -EINVAL;
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
- cfg = &d->staged_config[t];
- /*
- * In pseudo-locking setup mode and just
- * parsed a valid CBM that should be
- * pseudo-locked. Only one locked region per
- * resource group and domain so just do
- * the required initialization for single
- * region and return.
- */
- rdtgrp->plr->s = s;
- rdtgrp->plr->d = d;
- rdtgrp->plr->cbm = cfg->new_ctrl;
- d->plr = rdtgrp->plr;
- return 0;
- }
- goto next;
- }
- }
- return -EINVAL;
-}
-
int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d,
u32 closid, enum resctrl_conf_type t, u32 cfg_val)
{
@@ -351,100 +83,6 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
return 0;
}
-static int rdtgroup_parse_resource(char *resname, char *tok,
- struct rdtgroup *rdtgrp)
-{
- struct resctrl_schema *s;
-
- list_for_each_entry(s, &resctrl_schema_all, list) {
- if (!strcmp(resname, s->name) && rdtgrp->closid < s->num_closid)
- return parse_line(tok, s, rdtgrp);
- }
- rdt_last_cmd_printf("Unknown or unsupported resource name '%s'\n", resname);
- return -EINVAL;
-}
-
-ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- struct resctrl_schema *s;
- struct rdtgroup *rdtgrp;
- struct rdt_resource *r;
- char *tok, *resname;
- int ret = 0;
-
- /* Valid input requires a trailing newline */
- if (nbytes == 0 || buf[nbytes - 1] != '\n')
- return -EINVAL;
- buf[nbytes - 1] = '\0';
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (!rdtgrp) {
- rdtgroup_kn_unlock(of->kn);
- return -ENOENT;
- }
- rdt_last_cmd_clear();
-
- /*
- * No changes to pseudo-locked region allowed. It has to be removed
- * and re-created instead.
- */
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
- ret = -EINVAL;
- rdt_last_cmd_puts("Resource group is pseudo-locked\n");
- goto out;
- }
-
- rdt_staged_configs_clear();
-
- while ((tok = strsep(&buf, "\n")) != NULL) {
- resname = strim(strsep(&tok, ":"));
- if (!tok) {
- rdt_last_cmd_puts("Missing ':'\n");
- ret = -EINVAL;
- goto out;
- }
- if (tok[0] == '\0') {
- rdt_last_cmd_printf("Missing '%s' value\n", resname);
- ret = -EINVAL;
- goto out;
- }
- ret = rdtgroup_parse_resource(resname, tok, rdtgrp);
- if (ret)
- goto out;
- }
-
- list_for_each_entry(s, &resctrl_schema_all, list) {
- r = s->res;
-
- /*
- * Writes to mba_sc resources update the software controller,
- * not the control MSR.
- */
- if (is_mba_sc(r))
- continue;
-
- ret = resctrl_arch_update_domains(r, rdtgrp->closid);
- if (ret)
- goto out;
- }
-
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
- /*
- * If pseudo-locking fails we keep the resource group in
- * mode RDT_MODE_PSEUDO_LOCKSETUP with its class of service
- * active and updated for just the domain the pseudo-locked
- * region was requested for.
- */
- ret = rdtgroup_pseudo_lock_create(rdtgrp);
- }
-
-out:
- rdt_staged_configs_clear();
- rdtgroup_kn_unlock(of->kn);
- return ret ?: nbytes;
-}
-
u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d,
u32 closid, enum resctrl_conf_type type)
{
@@ -453,276 +91,3 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d,
return hw_dom->ctrl_val[idx];
}
-
-static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid)
-{
- struct rdt_resource *r = schema->res;
- struct rdt_ctrl_domain *dom;
- bool sep = false;
- u32 ctrl_val;
-
- /* Walking r->domains, ensure it can't race with cpuhp */
- lockdep_assert_cpus_held();
-
- seq_printf(s, "%*s:", max_name_width, schema->name);
- list_for_each_entry(dom, &r->ctrl_domains, hdr.list) {
- if (sep)
- seq_puts(s, ";");
-
- if (is_mba_sc(r))
- ctrl_val = dom->mbps_val[closid];
- else
- ctrl_val = resctrl_arch_get_config(r, dom, closid,
- schema->conf_type);
-
- seq_printf(s, schema->fmt_str, dom->hdr.id, ctrl_val);
- sep = true;
- }
- seq_puts(s, "\n");
-}
-
-int rdtgroup_schemata_show(struct kernfs_open_file *of,
- struct seq_file *s, void *v)
-{
- struct resctrl_schema *schema;
- struct rdtgroup *rdtgrp;
- int ret = 0;
- u32 closid;
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (rdtgrp) {
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
- list_for_each_entry(schema, &resctrl_schema_all, list) {
- seq_printf(s, "%s:uninitialized\n", schema->name);
- }
- } else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
- if (!rdtgrp->plr->d) {
- rdt_last_cmd_clear();
- rdt_last_cmd_puts("Cache domain offline\n");
- ret = -ENODEV;
- } else {
- seq_printf(s, "%s:%d=%x\n",
- rdtgrp->plr->s->res->name,
- rdtgrp->plr->d->hdr.id,
- rdtgrp->plr->cbm);
- }
- } else {
- closid = rdtgrp->closid;
- list_for_each_entry(schema, &resctrl_schema_all, list) {
- if (closid < schema->num_closid)
- show_doms(s, schema, closid);
- }
- }
- } else {
- ret = -ENOENT;
- }
- rdtgroup_kn_unlock(of->kn);
- return ret;
-}
-
-static int smp_mon_event_count(void *arg)
-{
- mon_event_count(arg);
-
- return 0;
-}
-
-ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- struct rdtgroup *rdtgrp;
- int ret = 0;
-
- /* Valid input requires a trailing newline */
- if (nbytes == 0 || buf[nbytes - 1] != '\n')
- return -EINVAL;
- buf[nbytes - 1] = '\0';
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (!rdtgrp) {
- rdtgroup_kn_unlock(of->kn);
- return -ENOENT;
- }
- rdt_last_cmd_clear();
-
- if (!strcmp(buf, "mbm_local_bytes")) {
- if (resctrl_arch_is_mbm_local_enabled())
- rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID;
- else
- ret = -EINVAL;
- } else if (!strcmp(buf, "mbm_total_bytes")) {
- if (resctrl_arch_is_mbm_total_enabled())
- rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID;
- else
- ret = -EINVAL;
- } else {
- ret = -EINVAL;
- }
-
- if (ret)
- rdt_last_cmd_printf("Unsupported event id '%s'\n", buf);
-
- rdtgroup_kn_unlock(of->kn);
-
- return ret ?: nbytes;
-}
-
-int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of,
- struct seq_file *s, void *v)
-{
- struct rdtgroup *rdtgrp;
- int ret = 0;
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
-
- if (rdtgrp) {
- switch (rdtgrp->mba_mbps_event) {
- case QOS_L3_MBM_LOCAL_EVENT_ID:
- seq_puts(s, "mbm_local_bytes\n");
- break;
- case QOS_L3_MBM_TOTAL_EVENT_ID:
- seq_puts(s, "mbm_total_bytes\n");
- break;
- default:
- pr_warn_once("Bad event %d\n", rdtgrp->mba_mbps_event);
- ret = -EINVAL;
- break;
- }
- } else {
- ret = -ENOENT;
- }
-
- rdtgroup_kn_unlock(of->kn);
-
- return ret;
-}
-
-struct rdt_domain_hdr *resctrl_find_domain(struct list_head *h, int id,
- struct list_head **pos)
-{
- struct rdt_domain_hdr *d;
- struct list_head *l;
-
- list_for_each(l, h) {
- d = list_entry(l, struct rdt_domain_hdr, list);
- /* When id is found, return its domain. */
- if (id == d->id)
- return d;
- /* Stop searching when finding id's position in sorted list. */
- if (id < d->id)
- break;
- }
-
- if (pos)
- *pos = l;
-
- return NULL;
-}
-
-void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
- struct rdt_mon_domain *d, struct rdtgroup *rdtgrp,
- cpumask_t *cpumask, int evtid, int first)
-{
- int cpu;
-
- /* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */
- lockdep_assert_cpus_held();
-
- /*
- * Setup the parameters to pass to mon_event_count() to read the data.
- */
- rr->rgrp = rdtgrp;
- rr->evtid = evtid;
- rr->r = r;
- rr->d = d;
- rr->first = first;
- rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid);
- if (IS_ERR(rr->arch_mon_ctx)) {
- rr->err = -EINVAL;
- return;
- }
-
- cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU);
-
- /*
- * cpumask_any_housekeeping() prefers housekeeping CPUs, but
- * are all the CPUs nohz_full? If yes, pick a CPU to IPI.
- * MPAM's resctrl_arch_rmid_read() is unable to read the
- * counters on some platforms if its called in IRQ context.
- */
- if (tick_nohz_full_cpu(cpu))
- smp_call_function_any(cpumask, mon_event_count, rr, 1);
- else
- smp_call_on_cpu(cpu, smp_mon_event_count, rr, false);
-
- resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx);
-}
-
-int rdtgroup_mondata_show(struct seq_file *m, void *arg)
-{
- struct kernfs_open_file *of = m->private;
- struct rdt_domain_hdr *hdr;
- struct rmid_read rr = {0};
- struct rdt_mon_domain *d;
- u32 resid, evtid, domid;
- struct rdtgroup *rdtgrp;
- struct rdt_resource *r;
- union mon_data_bits md;
- int ret = 0;
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (!rdtgrp) {
- ret = -ENOENT;
- goto out;
- }
-
- md.priv = of->kn->priv;
- resid = md.u.rid;
- domid = md.u.domid;
- evtid = md.u.evtid;
- r = resctrl_arch_get_resource(resid);
-
- if (md.u.sum) {
- /*
- * This file requires summing across all domains that share
- * the L3 cache id that was provided in the "domid" field of the
- * mon_data_bits union. Search all domains in the resource for
- * one that matches this cache id.
- */
- list_for_each_entry(d, &r->mon_domains, hdr.list) {
- if (d->ci->id == domid) {
- rr.ci = d->ci;
- mon_event_read(&rr, r, NULL, rdtgrp,
- &d->ci->shared_cpu_map, evtid, false);
- goto checkresult;
- }
- }
- ret = -ENOENT;
- goto out;
- } else {
- /*
- * This file provides data from a single domain. Search
- * the resource to find the domain with "domid".
- */
- hdr = resctrl_find_domain(&r->mon_domains, domid, NULL);
- if (!hdr || WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) {
- ret = -ENOENT;
- goto out;
- }
- d = container_of(hdr, struct rdt_mon_domain, hdr);
- mon_event_read(&rr, r, d, rdtgrp, &d->hdr.cpu_mask, evtid, false);
- }
-
-checkresult:
-
- if (rr.err == -EIO)
- seq_puts(m, "Error\n");
- else if (rr.err == -EINVAL)
- seq_puts(m, "Unavailable\n");
- else
- seq_printf(m, "%llu\n", rr.val);
-
-out:
- rdtgroup_kn_unlock(of->kn);
- return ret;
-}
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index eaae99602b61..5e3c41b36437 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -3,28 +3,21 @@
#define _ASM_X86_RESCTRL_INTERNAL_H
#include <linux/resctrl.h>
-#include <linux/sched.h>
-#include <linux/kernfs.h>
-#include <linux/fs_context.h>
-#include <linux/jump_label.h>
-#include <linux/tick.h>
-
-#include <asm/resctrl.h>
#define L3_QOS_CDP_ENABLE 0x01ULL
#define L2_QOS_CDP_ENABLE 0x01ULL
-#define CQM_LIMBOCHECK_INTERVAL 1000
-
#define MBM_CNTR_WIDTH_BASE 24
-#define MBM_OVERFLOW_INTERVAL 1000
-#define MAX_MBA_BW 100u
+
#define MBA_IS_LINEAR 0x4
+
#define MBM_CNTR_WIDTH_OFFSET_AMD 20
#define RMID_VAL_ERROR BIT_ULL(63)
+
#define RMID_VAL_UNAVAIL BIT_ULL(62)
+
/*
* With the above fields in use 62 bits remain in MSR_IA32_QM_CTR for
* data to be returned. The counter width is discovered from the hardware
@@ -33,278 +26,6 @@
#define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE)
/**
- * cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that
- * aren't marked nohz_full
- * @mask: The mask to pick a CPU from.
- * @exclude_cpu:The CPU to avoid picking.
- *
- * Returns a CPU from @mask, but not @exclude_cpu. If there are housekeeping
- * CPUs that don't use nohz_full, these are preferred. Pass
- * RESCTRL_PICK_ANY_CPU to avoid excluding any CPUs.
- *
- * When a CPU is excluded, returns >= nr_cpu_ids if no CPUs are available.
- */
-static inline unsigned int
-cpumask_any_housekeeping(const struct cpumask *mask, int exclude_cpu)
-{
- unsigned int cpu, hk_cpu;
-
- if (exclude_cpu == RESCTRL_PICK_ANY_CPU)
- cpu = cpumask_any(mask);
- else
- cpu = cpumask_any_but(mask, exclude_cpu);
-
- /* Only continue if tick_nohz_full_mask has been initialized. */
- if (!tick_nohz_full_enabled())
- return cpu;
-
- /* If the CPU picked isn't marked nohz_full nothing more needs doing. */
- if (cpu < nr_cpu_ids && !tick_nohz_full_cpu(cpu))
- return cpu;
-
- /* Try to find a CPU that isn't nohz_full to use in preference */
- hk_cpu = cpumask_nth_andnot(0, mask, tick_nohz_full_mask);
- if (hk_cpu == exclude_cpu)
- hk_cpu = cpumask_nth_andnot(1, mask, tick_nohz_full_mask);
-
- if (hk_cpu < nr_cpu_ids)
- cpu = hk_cpu;
-
- return cpu;
-}
-
-struct rdt_fs_context {
- struct kernfs_fs_context kfc;
- bool enable_cdpl2;
- bool enable_cdpl3;
- bool enable_mba_mbps;
- bool enable_debug;
-};
-
-static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc)
-{
- struct kernfs_fs_context *kfc = fc->fs_private;
-
- return container_of(kfc, struct rdt_fs_context, kfc);
-}
-
-/**
- * struct mon_evt - Entry in the event list of a resource
- * @evtid: event id
- * @name: name of the event
- * @configurable: true if the event is configurable
- * @list: entry in &rdt_resource->evt_list
- */
-struct mon_evt {
- enum resctrl_event_id evtid;
- char *name;
- bool configurable;
- struct list_head list;
-};
-
-/**
- * union mon_data_bits - Monitoring details for each event file.
- * @priv: Used to store monitoring event data in @u
- * as kernfs private data.
- * @u.rid: Resource id associated with the event file.
- * @u.evtid: Event id associated with the event file.
- * @u.sum: Set when event must be summed across multiple
- * domains.
- * @u.domid: When @u.sum is zero this is the domain to which
- * the event file belongs. When @sum is one this
- * is the id of the L3 cache that all domains to be
- * summed share.
- * @u: Name of the bit fields struct.
- */
-union mon_data_bits {
- void *priv;
- struct {
- unsigned int rid : 10;
- enum resctrl_event_id evtid : 7;
- unsigned int sum : 1;
- unsigned int domid : 14;
- } u;
-};
-
-/**
- * struct rmid_read - Data passed across smp_call*() to read event count.
- * @rgrp: Resource group for which the counter is being read. If it is a parent
- * resource group then its event count is summed with the count from all
- * its child resource groups.
- * @r: Resource describing the properties of the event being read.
- * @d: Domain that the counter should be read from. If NULL then sum all
- * domains in @r sharing L3 @ci.id
- * @evtid: Which monitor event to read.
- * @first: Initialize MBM counter when true.
- * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains.
- * @err: Error encountered when reading counter.
- * @val: Returned value of event counter. If @rgrp is a parent resource group,
- * @val includes the sum of event counts from its child resource groups.
- * If @d is NULL, @val includes the sum of all domains in @r sharing @ci.id,
- * (summed across child resource groups if @rgrp is a parent resource group).
- * @arch_mon_ctx: Hardware monitor allocated for this read request (MPAM only).
- */
-struct rmid_read {
- struct rdtgroup *rgrp;
- struct rdt_resource *r;
- struct rdt_mon_domain *d;
- enum resctrl_event_id evtid;
- bool first;
- struct cacheinfo *ci;
- int err;
- u64 val;
- void *arch_mon_ctx;
-};
-
-extern struct list_head resctrl_schema_all;
-extern bool resctrl_mounted;
-
-enum rdt_group_type {
- RDTCTRL_GROUP = 0,
- RDTMON_GROUP,
- RDT_NUM_GROUP,
-};
-
-/**
- * enum rdtgrp_mode - Mode of a RDT resource group
- * @RDT_MODE_SHAREABLE: This resource group allows sharing of its allocations
- * @RDT_MODE_EXCLUSIVE: No sharing of this resource group's allocations allowed
- * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking
- * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations
- * allowed AND the allocations are Cache Pseudo-Locked
- * @RDT_NUM_MODES: Total number of modes
- *
- * The mode of a resource group enables control over the allowed overlap
- * between allocations associated with different resource groups (classes
- * of service). User is able to modify the mode of a resource group by
- * writing to the "mode" resctrl file associated with the resource group.
- *
- * The "shareable", "exclusive", and "pseudo-locksetup" modes are set by
- * writing the appropriate text to the "mode" file. A resource group enters
- * "pseudo-locked" mode after the schemata is written while the resource
- * group is in "pseudo-locksetup" mode.
- */
-enum rdtgrp_mode {
- RDT_MODE_SHAREABLE = 0,
- RDT_MODE_EXCLUSIVE,
- RDT_MODE_PSEUDO_LOCKSETUP,
- RDT_MODE_PSEUDO_LOCKED,
-
- /* Must be last */
- RDT_NUM_MODES,
-};
-
-/**
- * struct mongroup - store mon group's data in resctrl fs.
- * @mon_data_kn: kernfs node for the mon_data directory
- * @parent: parent rdtgrp
- * @crdtgrp_list: child rdtgroup node list
- * @rmid: rmid for this rdtgroup
- */
-struct mongroup {
- struct kernfs_node *mon_data_kn;
- struct rdtgroup *parent;
- struct list_head crdtgrp_list;
- u32 rmid;
-};
-
-/**
- * struct rdtgroup - store rdtgroup's data in resctrl file system.
- * @kn: kernfs node
- * @rdtgroup_list: linked list for all rdtgroups
- * @closid: closid for this rdtgroup
- * @cpu_mask: CPUs assigned to this rdtgroup
- * @flags: status bits
- * @waitcount: how many cpus expect to find this
- * group when they acquire rdtgroup_mutex
- * @type: indicates type of this rdtgroup - either
- * monitor only or ctrl_mon group
- * @mon: mongroup related data
- * @mode: mode of resource group
- * @mba_mbps_event: input monitoring event id when mba_sc is enabled
- * @plr: pseudo-locked region
- */
-struct rdtgroup {
- struct kernfs_node *kn;
- struct list_head rdtgroup_list;
- u32 closid;
- struct cpumask cpu_mask;
- int flags;
- atomic_t waitcount;
- enum rdt_group_type type;
- struct mongroup mon;
- enum rdtgrp_mode mode;
- enum resctrl_event_id mba_mbps_event;
- struct pseudo_lock_region *plr;
-};
-
-/* rdtgroup.flags */
-#define RDT_DELETED 1
-
-/* rftype.flags */
-#define RFTYPE_FLAGS_CPUS_LIST 1
-
-/*
- * Define the file type flags for base and info directories.
- */
-#define RFTYPE_INFO BIT(0)
-#define RFTYPE_BASE BIT(1)
-#define RFTYPE_CTRL BIT(4)
-#define RFTYPE_MON BIT(5)
-#define RFTYPE_TOP BIT(6)
-#define RFTYPE_RES_CACHE BIT(8)
-#define RFTYPE_RES_MB BIT(9)
-#define RFTYPE_DEBUG BIT(10)
-#define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL)
-#define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON)
-#define RFTYPE_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP)
-#define RFTYPE_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL)
-#define RFTYPE_MON_BASE (RFTYPE_BASE | RFTYPE_MON)
-
-/* List of all resource groups */
-extern struct list_head rdt_all_groups;
-
-extern int max_name_width;
-
-/**
- * struct rftype - describe each file in the resctrl file system
- * @name: File name
- * @mode: Access mode
- * @kf_ops: File operations
- * @flags: File specific RFTYPE_FLAGS_* flags
- * @fflags: File specific RFTYPE_* flags
- * @seq_show: Show content of the file
- * @write: Write to the file
- */
-struct rftype {
- char *name;
- umode_t mode;
- const struct kernfs_ops *kf_ops;
- unsigned long flags;
- unsigned long fflags;
-
- int (*seq_show)(struct kernfs_open_file *of,
- struct seq_file *sf, void *v);
- /*
- * write() is the generic write callback which maps directly to
- * kernfs write operation and overrides all other operations.
- * Maximum write size is determined by ->max_write_len.
- */
- ssize_t (*write)(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off);
-};
-
-/**
- * struct mbm_state - status for each MBM counter in each domain
- * @prev_bw_bytes: Previous bytes value read for bandwidth calculation
- * @prev_bw: The most recent bandwidth in MBps
- */
-struct mbm_state {
- u64 prev_bw_bytes;
- u32 prev_bw;
-};
-
-/**
* struct arch_mbm_state - values used to compute resctrl_arch_rmid_read()s
* return value.
* @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes)
@@ -401,24 +122,7 @@ static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r
return container_of(r, struct rdt_hw_resource, r_resctrl);
}
-extern struct mutex rdtgroup_mutex;
-
-static inline const char *rdt_kn_name(const struct kernfs_node *kn)
-{
- return rcu_dereference_check(kn->name, lockdep_is_held(&rdtgroup_mutex));
-}
-
extern struct rdt_hw_resource rdt_resources_all[];
-extern struct rdtgroup rdtgroup_default;
-extern struct dentry *debugfs_resctrl;
-extern enum resctrl_event_id mba_mbps_default_event;
-
-static inline bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l)
-{
- return rdt_resources_all[l].cdp_enabled;
-}
-
-int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable);
void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d);
@@ -455,99 +159,14 @@ union cpuid_0x10_x_edx {
unsigned int full;
};
-void rdt_last_cmd_clear(void);
-void rdt_last_cmd_puts(const char *s);
-__printf(1, 2)
-void rdt_last_cmd_printf(const char *fmt, ...);
-
void rdt_ctrl_update(void *arg);
-struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn);
-void rdtgroup_kn_unlock(struct kernfs_node *kn);
-int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name);
-int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
- umode_t mask);
-ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off);
-int rdtgroup_schemata_show(struct kernfs_open_file *of,
- struct seq_file *s, void *v);
-ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off);
-int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of,
- struct seq_file *s, void *v);
-bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d,
- unsigned long cbm, int closid, bool exclusive);
-unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_ctrl_domain *d,
- unsigned long cbm);
-enum rdtgrp_mode rdtgroup_mode_by_closid(int closid);
-int rdtgroup_tasks_assigned(struct rdtgroup *r);
-int closids_supported(void);
-void closid_free(int closid);
-int alloc_rmid(u32 closid);
-void free_rmid(u32 closid, u32 rmid);
-int rdt_get_mon_l3_config(struct rdt_resource *r);
-void resctrl_mon_resource_exit(void);
-bool __init rdt_cpu_has(int flag);
-void mon_event_count(void *info);
-int rdtgroup_mondata_show(struct seq_file *m, void *arg);
-void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
- struct rdt_mon_domain *d, struct rdtgroup *rdtgrp,
- cpumask_t *cpumask, int evtid, int first);
-int __init resctrl_mon_resource_init(void);
-void mbm_setup_overflow_handler(struct rdt_mon_domain *dom,
- unsigned long delay_ms,
- int exclude_cpu);
-void mbm_handle_overflow(struct work_struct *work);
-void __init intel_rdt_mbm_apply_quirk(void);
-bool is_mba_sc(struct rdt_resource *r);
-void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms,
- int exclude_cpu);
-void cqm_handle_limbo(struct work_struct *work);
-bool has_busy_rmid(struct rdt_mon_domain *d);
-void __check_limbo(struct rdt_mon_domain *d, bool force_free);
-void rdt_domain_reconfigure_cdp(struct rdt_resource *r);
-void resctrl_file_fflags_init(const char *config, unsigned long fflags);
-void rdt_staged_configs_clear(void);
-bool closid_allocated(unsigned int closid);
-int resctrl_find_cleanest_closid(void);
-
-#ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK
-int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp);
-int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp);
-bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm);
-bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d);
-int rdt_pseudo_lock_init(void);
-void rdt_pseudo_lock_release(void);
-int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp);
-void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp);
-#else
-static inline int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp)
-{
- return -EOPNOTSUPP;
-}
-static inline int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
-{
- return -EOPNOTSUPP;
-}
-
-static inline bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm)
-{
- return false;
-}
+int rdt_get_mon_l3_config(struct rdt_resource *r);
-static inline bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d)
-{
- return false;
-}
+bool rdt_cpu_has(int flag);
-static inline int rdt_pseudo_lock_init(void) { return 0; }
-static inline void rdt_pseudo_lock_release(void) { }
-static inline int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
-{
- return -EOPNOTSUPP;
-}
+void __init intel_rdt_mbm_apply_quirk(void);
-static inline void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) { }
-#endif /* CONFIG_RESCTRL_FS_PSEUDO_LOCK */
+void rdt_domain_reconfigure_cdp(struct rdt_resource *r);
#endif /* _ASM_X86_RESCTRL_INTERNAL_H */
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 591b0b44d260..c261558276cd 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -18,63 +18,12 @@
#define pr_fmt(fmt) "resctrl: " fmt
#include <linux/cpu.h>
-#include <linux/module.h>
-#include <linux/sizes.h>
-#include <linux/slab.h>
+#include <linux/resctrl.h>
#include <asm/cpu_device_id.h>
#include <asm/msr.h>
-#include <asm/resctrl.h>
#include "internal.h"
-#include "trace.h"
-
-/**
- * struct rmid_entry - dirty tracking for all RMID.
- * @closid: The CLOSID for this entry.
- * @rmid: The RMID for this entry.
- * @busy: The number of domains with cached data using this RMID.
- * @list: Member of the rmid_free_lru list when busy == 0.
- *
- * Depending on the architecture the correct monitor is accessed using
- * both @closid and @rmid, or @rmid only.
- *
- * Take the rdtgroup_mutex when accessing.
- */
-struct rmid_entry {
- u32 closid;
- u32 rmid;
- int busy;
- struct list_head list;
-};
-
-/*
- * @rmid_free_lru - A least recently used list of free RMIDs
- * These RMIDs are guaranteed to have an occupancy less than the
- * threshold occupancy
- */
-static LIST_HEAD(rmid_free_lru);
-
-/*
- * @closid_num_dirty_rmid The number of dirty RMID each CLOSID has.
- * Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined.
- * Indexed by CLOSID. Protected by rdtgroup_mutex.
- */
-static u32 *closid_num_dirty_rmid;
-
-/*
- * @rmid_limbo_count - count of currently unused but (potentially)
- * dirty RMIDs.
- * This counts RMIDs that no one is currently using but that
- * may have a occupancy value > resctrl_rmid_realloc_threshold. User can
- * change the threshold occupancy value.
- */
-static unsigned int rmid_limbo_count;
-
-/*
- * @rmid_entry - The entry in the limbo and free lists.
- */
-static struct rmid_entry *rmid_ptrs;
/*
* Global boolean for rdt_monitor which is true if any
@@ -87,23 +36,12 @@ bool rdt_mon_capable;
*/
unsigned int rdt_mon_features;
-/*
- * This is the threshold cache occupancy in bytes at which we will consider an
- * RMID available for re-allocation.
- */
-unsigned int resctrl_rmid_realloc_threshold;
-
-/*
- * This is the maximum value for the reallocation threshold, in bytes.
- */
-unsigned int resctrl_rmid_realloc_limit;
-
#define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5))
static int snc_nodes_per_l3_cache = 1;
/*
- * The correction factor table is documented in Documentation/arch/x86/resctrl.rst.
+ * The correction factor table is documented in Documentation/filesystems/resctrl.rst.
* If rmid > rmid threshold, MBM total and local values should be multiplied
* by the correction factor.
*
@@ -152,6 +90,7 @@ static const struct mbm_correction_factor_table {
};
static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX;
+
static u64 mbm_cf __read_mostly;
static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
@@ -164,33 +103,6 @@ static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
}
/*
- * x86 and arm64 differ in their handling of monitoring.
- * x86's RMID are independent numbers, there is only one source of traffic
- * with an RMID value of '1'.
- * arm64's PMG extends the PARTID/CLOSID space, there are multiple sources of
- * traffic with a PMG value of '1', one for each CLOSID, meaning the RMID
- * value is no longer unique.
- * To account for this, resctrl uses an index. On x86 this is just the RMID,
- * on arm64 it encodes the CLOSID and RMID. This gives a unique number.
- *
- * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code
- * must accept an attempt to read every index.
- */
-static inline struct rmid_entry *__rmid_entry(u32 idx)
-{
- struct rmid_entry *entry;
- u32 closid, rmid;
-
- entry = &rmid_ptrs[idx];
- resctrl_arch_rmid_idx_decode(idx, &closid, &rmid);
-
- WARN_ON_ONCE(entry->closid != closid);
- WARN_ON_ONCE(entry->rmid != rmid);
-
- return entry;
-}
-
-/*
* When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by
* "snc_nodes_per_l3_cache == 1") no translation of the RMID value is
* needed. The physical RMID is the same as the logical RMID.
@@ -261,12 +173,11 @@ static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_do
return &hw_dom->arch_mbm_total[rmid];
case QOS_L3_MBM_LOCAL_EVENT_ID:
return &hw_dom->arch_mbm_local[rmid];
+ default:
+ /* Never expect to get here */
+ WARN_ON_ONCE(1);
+ return NULL;
}
-
- /* Never expect to get here */
- WARN_ON_ONCE(1);
-
- return NULL;
}
void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d,
@@ -347,769 +258,6 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d,
return 0;
}
-static void limbo_release_entry(struct rmid_entry *entry)
-{
- lockdep_assert_held(&rdtgroup_mutex);
-
- rmid_limbo_count--;
- list_add_tail(&entry->list, &rmid_free_lru);
-
- if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
- closid_num_dirty_rmid[entry->closid]--;
-}
-
-/*
- * Check the RMIDs that are marked as busy for this domain. If the
- * reported LLC occupancy is below the threshold clear the busy bit and
- * decrement the count. If the busy count gets to zero on an RMID, we
- * free the RMID
- */
-void __check_limbo(struct rdt_mon_domain *d, bool force_free)
-{
- struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
- u32 idx_limit = resctrl_arch_system_num_rmid_idx();
- struct rmid_entry *entry;
- u32 idx, cur_idx = 1;
- void *arch_mon_ctx;
- bool rmid_dirty;
- u64 val = 0;
-
- arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID);
- if (IS_ERR(arch_mon_ctx)) {
- pr_warn_ratelimited("Failed to allocate monitor context: %ld",
- PTR_ERR(arch_mon_ctx));
- return;
- }
-
- /*
- * Skip RMID 0 and start from RMID 1 and check all the RMIDs that
- * are marked as busy for occupancy < threshold. If the occupancy
- * is less than the threshold decrement the busy counter of the
- * RMID and move it to the free list when the counter reaches 0.
- */
- for (;;) {
- idx = find_next_bit(d->rmid_busy_llc, idx_limit, cur_idx);
- if (idx >= idx_limit)
- break;
-
- entry = __rmid_entry(idx);
- if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid,
- QOS_L3_OCCUP_EVENT_ID, &val,
- arch_mon_ctx)) {
- rmid_dirty = true;
- } else {
- rmid_dirty = (val >= resctrl_rmid_realloc_threshold);
-
- /*
- * x86's CLOSID and RMID are independent numbers, so the entry's
- * CLOSID is an empty CLOSID (X86_RESCTRL_EMPTY_CLOSID). On Arm the
- * RMID (PMG) extends the CLOSID (PARTID) space with bits that aren't
- * used to select the configuration. It is thus necessary to track both
- * CLOSID and RMID because there may be dependencies between them
- * on some architectures.
- */
- trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->hdr.id, val);
- }
-
- if (force_free || !rmid_dirty) {
- clear_bit(idx, d->rmid_busy_llc);
- if (!--entry->busy)
- limbo_release_entry(entry);
- }
- cur_idx = idx + 1;
- }
-
- resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx);
-}
-
-bool has_busy_rmid(struct rdt_mon_domain *d)
-{
- u32 idx_limit = resctrl_arch_system_num_rmid_idx();
-
- return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit;
-}
-
-static struct rmid_entry *resctrl_find_free_rmid(u32 closid)
-{
- struct rmid_entry *itr;
- u32 itr_idx, cmp_idx;
-
- if (list_empty(&rmid_free_lru))
- return rmid_limbo_count ? ERR_PTR(-EBUSY) : ERR_PTR(-ENOSPC);
-
- list_for_each_entry(itr, &rmid_free_lru, list) {
- /*
- * Get the index of this free RMID, and the index it would need
- * to be if it were used with this CLOSID.
- * If the CLOSID is irrelevant on this architecture, the two
- * index values are always the same on every entry and thus the
- * very first entry will be returned.
- */
- itr_idx = resctrl_arch_rmid_idx_encode(itr->closid, itr->rmid);
- cmp_idx = resctrl_arch_rmid_idx_encode(closid, itr->rmid);
-
- if (itr_idx == cmp_idx)
- return itr;
- }
-
- return ERR_PTR(-ENOSPC);
-}
-
-/**
- * resctrl_find_cleanest_closid() - Find a CLOSID where all the associated
- * RMID are clean, or the CLOSID that has
- * the most clean RMID.
- *
- * MPAM's equivalent of RMID are per-CLOSID, meaning a freshly allocated CLOSID
- * may not be able to allocate clean RMID. To avoid this the allocator will
- * choose the CLOSID with the most clean RMID.
- *
- * When the CLOSID and RMID are independent numbers, the first free CLOSID will
- * be returned.
- */
-int resctrl_find_cleanest_closid(void)
-{
- u32 cleanest_closid = ~0;
- int i = 0;
-
- lockdep_assert_held(&rdtgroup_mutex);
-
- if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
- return -EIO;
-
- for (i = 0; i < closids_supported(); i++) {
- int num_dirty;
-
- if (closid_allocated(i))
- continue;
-
- num_dirty = closid_num_dirty_rmid[i];
- if (num_dirty == 0)
- return i;
-
- if (cleanest_closid == ~0)
- cleanest_closid = i;
-
- if (num_dirty < closid_num_dirty_rmid[cleanest_closid])
- cleanest_closid = i;
- }
-
- if (cleanest_closid == ~0)
- return -ENOSPC;
-
- return cleanest_closid;
-}
-
-/*
- * For MPAM the RMID value is not unique, and has to be considered with
- * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which
- * allows all domains to be managed by a single free list.
- * Each domain also has a rmid_busy_llc to reduce the work of the limbo handler.
- */
-int alloc_rmid(u32 closid)
-{
- struct rmid_entry *entry;
-
- lockdep_assert_held(&rdtgroup_mutex);
-
- entry = resctrl_find_free_rmid(closid);
- if (IS_ERR(entry))
- return PTR_ERR(entry);
-
- list_del(&entry->list);
- return entry->rmid;
-}
-
-static void add_rmid_to_limbo(struct rmid_entry *entry)
-{
- struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
- struct rdt_mon_domain *d;
- u32 idx;
-
- lockdep_assert_held(&rdtgroup_mutex);
-
- /* Walking r->domains, ensure it can't race with cpuhp */
- lockdep_assert_cpus_held();
-
- idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid);
-
- entry->busy = 0;
- list_for_each_entry(d, &r->mon_domains, hdr.list) {
- /*
- * For the first limbo RMID in the domain,
- * setup up the limbo worker.
- */
- if (!has_busy_rmid(d))
- cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL,
- RESCTRL_PICK_ANY_CPU);
- set_bit(idx, d->rmid_busy_llc);
- entry->busy++;
- }
-
- rmid_limbo_count++;
- if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
- closid_num_dirty_rmid[entry->closid]++;
-}
-
-void free_rmid(u32 closid, u32 rmid)
-{
- u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
- struct rmid_entry *entry;
-
- lockdep_assert_held(&rdtgroup_mutex);
-
- /*
- * Do not allow the default rmid to be free'd. Comparing by index
- * allows architectures that ignore the closid parameter to avoid an
- * unnecessary check.
- */
- if (!resctrl_arch_mon_capable() ||
- idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
- RESCTRL_RESERVED_RMID))
- return;
-
- entry = __rmid_entry(idx);
-
- if (resctrl_arch_is_llc_occupancy_enabled())
- add_rmid_to_limbo(entry);
- else
- list_add_tail(&entry->list, &rmid_free_lru);
-}
-
-static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid,
- u32 rmid, enum resctrl_event_id evtid)
-{
- u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
-
- switch (evtid) {
- case QOS_L3_MBM_TOTAL_EVENT_ID:
- return &d->mbm_total[idx];
- case QOS_L3_MBM_LOCAL_EVENT_ID:
- return &d->mbm_local[idx];
- default:
- return NULL;
- }
-}
-
-static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
-{
- int cpu = smp_processor_id();
- struct rdt_mon_domain *d;
- struct mbm_state *m;
- int err, ret;
- u64 tval = 0;
-
- if (rr->first) {
- resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid);
- m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
- if (m)
- memset(m, 0, sizeof(struct mbm_state));
- return 0;
- }
-
- if (rr->d) {
- /* Reading a single domain, must be on a CPU in that domain. */
- if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask))
- return -EINVAL;
- rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid,
- rr->evtid, &tval, rr->arch_mon_ctx);
- if (rr->err)
- return rr->err;
-
- rr->val += tval;
-
- return 0;
- }
-
- /* Summing domains that share a cache, must be on a CPU for that cache. */
- if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map))
- return -EINVAL;
-
- /*
- * Legacy files must report the sum of an event across all
- * domains that share the same L3 cache instance.
- * Report success if a read from any domain succeeds, -EINVAL
- * (translated to "Unavailable" for user space) if reading from
- * all domains fail for any reason.
- */
- ret = -EINVAL;
- list_for_each_entry(d, &rr->r->mon_domains, hdr.list) {
- if (d->ci->id != rr->ci->id)
- continue;
- err = resctrl_arch_rmid_read(rr->r, d, closid, rmid,
- rr->evtid, &tval, rr->arch_mon_ctx);
- if (!err) {
- rr->val += tval;
- ret = 0;
- }
- }
-
- if (ret)
- rr->err = ret;
-
- return ret;
-}
-
-/*
- * mbm_bw_count() - Update bw count from values previously read by
- * __mon_event_count().
- * @closid: The closid used to identify the cached mbm_state.
- * @rmid: The rmid used to identify the cached mbm_state.
- * @rr: The struct rmid_read populated by __mon_event_count().
- *
- * Supporting function to calculate the memory bandwidth
- * and delta bandwidth in MBps. The chunks value previously read by
- * __mon_event_count() is compared with the chunks value from the previous
- * invocation. This must be called once per second to maintain values in MBps.
- */
-static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr)
-{
- u64 cur_bw, bytes, cur_bytes;
- struct mbm_state *m;
-
- m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
- if (WARN_ON_ONCE(!m))
- return;
-
- cur_bytes = rr->val;
- bytes = cur_bytes - m->prev_bw_bytes;
- m->prev_bw_bytes = cur_bytes;
-
- cur_bw = bytes / SZ_1M;
-
- m->prev_bw = cur_bw;
-}
-
-/*
- * This is scheduled by mon_event_read() to read the CQM/MBM counters
- * on a domain.
- */
-void mon_event_count(void *info)
-{
- struct rdtgroup *rdtgrp, *entry;
- struct rmid_read *rr = info;
- struct list_head *head;
- int ret;
-
- rdtgrp = rr->rgrp;
-
- ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr);
-
- /*
- * For Ctrl groups read data from child monitor groups and
- * add them together. Count events which are read successfully.
- * Discard the rmid_read's reporting errors.
- */
- head = &rdtgrp->mon.crdtgrp_list;
-
- if (rdtgrp->type == RDTCTRL_GROUP) {
- list_for_each_entry(entry, head, mon.crdtgrp_list) {
- if (__mon_event_count(entry->closid, entry->mon.rmid,
- rr) == 0)
- ret = 0;
- }
- }
-
- /*
- * __mon_event_count() calls for newly created monitor groups may
- * report -EINVAL/Unavailable if the monitor hasn't seen any traffic.
- * Discard error if any of the monitor event reads succeeded.
- */
- if (ret == 0)
- rr->err = 0;
-}
-
-static struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu,
- struct rdt_resource *r)
-{
- struct rdt_ctrl_domain *d;
-
- lockdep_assert_cpus_held();
-
- list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
- /* Find the domain that contains this CPU */
- if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask))
- return d;
- }
-
- return NULL;
-}
-
-/*
- * Feedback loop for MBA software controller (mba_sc)
- *
- * mba_sc is a feedback loop where we periodically read MBM counters and
- * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so
- * that:
- *
- * current bandwidth(cur_bw) < user specified bandwidth(user_bw)
- *
- * This uses the MBM counters to measure the bandwidth and MBA throttle
- * MSRs to control the bandwidth for a particular rdtgrp. It builds on the
- * fact that resctrl rdtgroups have both monitoring and control.
- *
- * The frequency of the checks is 1s and we just tag along the MBM overflow
- * timer. Having 1s interval makes the calculation of bandwidth simpler.
- *
- * Although MBA's goal is to restrict the bandwidth to a maximum, there may
- * be a need to increase the bandwidth to avoid unnecessarily restricting
- * the L2 <-> L3 traffic.
- *
- * Since MBA controls the L2 external bandwidth where as MBM measures the
- * L3 external bandwidth the following sequence could lead to such a
- * situation.
- *
- * Consider an rdtgroup which had high L3 <-> memory traffic in initial
- * phases -> mba_sc kicks in and reduced bandwidth percentage values -> but
- * after some time rdtgroup has mostly L2 <-> L3 traffic.
- *
- * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its
- * throttle MSRs already have low percentage values. To avoid
- * unnecessarily restricting such rdtgroups, we also increase the bandwidth.
- */
-static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm)
-{
- u32 closid, rmid, cur_msr_val, new_msr_val;
- struct mbm_state *pmbm_data, *cmbm_data;
- struct rdt_ctrl_domain *dom_mba;
- enum resctrl_event_id evt_id;
- struct rdt_resource *r_mba;
- struct list_head *head;
- struct rdtgroup *entry;
- u32 cur_bw, user_bw;
-
- r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
- evt_id = rgrp->mba_mbps_event;
-
- closid = rgrp->closid;
- rmid = rgrp->mon.rmid;
- pmbm_data = get_mbm_state(dom_mbm, closid, rmid, evt_id);
- if (WARN_ON_ONCE(!pmbm_data))
- return;
-
- dom_mba = get_ctrl_domain_from_cpu(smp_processor_id(), r_mba);
- if (!dom_mba) {
- pr_warn_once("Failure to get domain for MBA update\n");
- return;
- }
-
- cur_bw = pmbm_data->prev_bw;
- user_bw = dom_mba->mbps_val[closid];
-
- /* MBA resource doesn't support CDP */
- cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE);
-
- /*
- * For Ctrl groups read data from child monitor groups.
- */
- head = &rgrp->mon.crdtgrp_list;
- list_for_each_entry(entry, head, mon.crdtgrp_list) {
- cmbm_data = get_mbm_state(dom_mbm, entry->closid, entry->mon.rmid, evt_id);
- if (WARN_ON_ONCE(!cmbm_data))
- return;
- cur_bw += cmbm_data->prev_bw;
- }
-
- /*
- * Scale up/down the bandwidth linearly for the ctrl group. The
- * bandwidth step is the bandwidth granularity specified by the
- * hardware.
- * Always increase throttling if current bandwidth is above the
- * target set by user.
- * But avoid thrashing up and down on every poll by checking
- * whether a decrease in throttling is likely to push the group
- * back over target. E.g. if currently throttling to 30% of bandwidth
- * on a system with 10% granularity steps, check whether moving to
- * 40% would go past the limit by multiplying current bandwidth by
- * "(30 + 10) / 30".
- */
- if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) {
- new_msr_val = cur_msr_val - r_mba->membw.bw_gran;
- } else if (cur_msr_val < MAX_MBA_BW &&
- (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) {
- new_msr_val = cur_msr_val + r_mba->membw.bw_gran;
- } else {
- return;
- }
-
- resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val);
-}
-
-static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d,
- u32 closid, u32 rmid, enum resctrl_event_id evtid)
-{
- struct rmid_read rr = {0};
-
- rr.r = r;
- rr.d = d;
- rr.evtid = evtid;
- rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
- if (IS_ERR(rr.arch_mon_ctx)) {
- pr_warn_ratelimited("Failed to allocate monitor context: %ld",
- PTR_ERR(rr.arch_mon_ctx));
- return;
- }
-
- __mon_event_count(closid, rmid, &rr);
-
- /*
- * If the software controller is enabled, compute the
- * bandwidth for this event id.
- */
- if (is_mba_sc(NULL))
- mbm_bw_count(closid, rmid, &rr);
-
- resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
-}
-
-static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d,
- u32 closid, u32 rmid)
-{
- /*
- * This is protected from concurrent reads from user as both
- * the user and overflow handler hold the global mutex.
- */
- if (resctrl_arch_is_mbm_total_enabled())
- mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID);
-
- if (resctrl_arch_is_mbm_local_enabled())
- mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID);
-}
-
-/*
- * Handler to scan the limbo list and move the RMIDs
- * to free list whose occupancy < threshold_occupancy.
- */
-void cqm_handle_limbo(struct work_struct *work)
-{
- unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL);
- struct rdt_mon_domain *d;
-
- cpus_read_lock();
- mutex_lock(&rdtgroup_mutex);
-
- d = container_of(work, struct rdt_mon_domain, cqm_limbo.work);
-
- __check_limbo(d, false);
-
- if (has_busy_rmid(d)) {
- d->cqm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask,
- RESCTRL_PICK_ANY_CPU);
- schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo,
- delay);
- }
-
- mutex_unlock(&rdtgroup_mutex);
- cpus_read_unlock();
-}
-
-/**
- * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this
- * domain.
- * @dom: The domain the limbo handler should run for.
- * @delay_ms: How far in the future the handler should run.
- * @exclude_cpu: Which CPU the handler should not run on,
- * RESCTRL_PICK_ANY_CPU to pick any CPU.
- */
-void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms,
- int exclude_cpu)
-{
- unsigned long delay = msecs_to_jiffies(delay_ms);
- int cpu;
-
- cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu);
- dom->cqm_work_cpu = cpu;
-
- if (cpu < nr_cpu_ids)
- schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay);
-}
-
-void mbm_handle_overflow(struct work_struct *work)
-{
- unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL);
- struct rdtgroup *prgrp, *crgrp;
- struct rdt_mon_domain *d;
- struct list_head *head;
- struct rdt_resource *r;
-
- cpus_read_lock();
- mutex_lock(&rdtgroup_mutex);
-
- /*
- * If the filesystem has been unmounted this work no longer needs to
- * run.
- */
- if (!resctrl_mounted || !resctrl_arch_mon_capable())
- goto out_unlock;
-
- r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
- d = container_of(work, struct rdt_mon_domain, mbm_over.work);
-
- list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
- mbm_update(r, d, prgrp->closid, prgrp->mon.rmid);
-
- head = &prgrp->mon.crdtgrp_list;
- list_for_each_entry(crgrp, head, mon.crdtgrp_list)
- mbm_update(r, d, crgrp->closid, crgrp->mon.rmid);
-
- if (is_mba_sc(NULL))
- update_mba_bw(prgrp, d);
- }
-
- /*
- * Re-check for housekeeping CPUs. This allows the overflow handler to
- * move off a nohz_full CPU quickly.
- */
- d->mbm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask,
- RESCTRL_PICK_ANY_CPU);
- schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay);
-
-out_unlock:
- mutex_unlock(&rdtgroup_mutex);
- cpus_read_unlock();
-}
-
-/**
- * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this
- * domain.
- * @dom: The domain the overflow handler should run for.
- * @delay_ms: How far in the future the handler should run.
- * @exclude_cpu: Which CPU the handler should not run on,
- * RESCTRL_PICK_ANY_CPU to pick any CPU.
- */
-void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms,
- int exclude_cpu)
-{
- unsigned long delay = msecs_to_jiffies(delay_ms);
- int cpu;
-
- /*
- * When a domain comes online there is no guarantee the filesystem is
- * mounted. If not, there is no need to catch counter overflow.
- */
- if (!resctrl_mounted || !resctrl_arch_mon_capable())
- return;
- cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu);
- dom->mbm_work_cpu = cpu;
-
- if (cpu < nr_cpu_ids)
- schedule_delayed_work_on(cpu, &dom->mbm_over, delay);
-}
-
-static int dom_data_init(struct rdt_resource *r)
-{
- u32 idx_limit = resctrl_arch_system_num_rmid_idx();
- u32 num_closid = resctrl_arch_get_num_closid(r);
- struct rmid_entry *entry = NULL;
- int err = 0, i;
- u32 idx;
-
- mutex_lock(&rdtgroup_mutex);
- if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
- u32 *tmp;
-
- /*
- * If the architecture hasn't provided a sanitised value here,
- * this may result in larger arrays than necessary. Resctrl will
- * use a smaller system wide value based on the resources in
- * use.
- */
- tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL);
- if (!tmp) {
- err = -ENOMEM;
- goto out_unlock;
- }
-
- closid_num_dirty_rmid = tmp;
- }
-
- rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL);
- if (!rmid_ptrs) {
- if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
- kfree(closid_num_dirty_rmid);
- closid_num_dirty_rmid = NULL;
- }
- err = -ENOMEM;
- goto out_unlock;
- }
-
- for (i = 0; i < idx_limit; i++) {
- entry = &rmid_ptrs[i];
- INIT_LIST_HEAD(&entry->list);
-
- resctrl_arch_rmid_idx_decode(i, &entry->closid, &entry->rmid);
- list_add_tail(&entry->list, &rmid_free_lru);
- }
-
- /*
- * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and
- * are always allocated. These are used for the rdtgroup_default
- * control group, which will be setup later in resctrl_init().
- */
- idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
- RESCTRL_RESERVED_RMID);
- entry = __rmid_entry(idx);
- list_del(&entry->list);
-
-out_unlock:
- mutex_unlock(&rdtgroup_mutex);
-
- return err;
-}
-
-static void dom_data_exit(struct rdt_resource *r)
-{
- mutex_lock(&rdtgroup_mutex);
-
- if (!r->mon_capable)
- goto out_unlock;
-
- if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
- kfree(closid_num_dirty_rmid);
- closid_num_dirty_rmid = NULL;
- }
-
- kfree(rmid_ptrs);
- rmid_ptrs = NULL;
-
-out_unlock:
- mutex_unlock(&rdtgroup_mutex);
-}
-
-static struct mon_evt llc_occupancy_event = {
- .name = "llc_occupancy",
- .evtid = QOS_L3_OCCUP_EVENT_ID,
-};
-
-static struct mon_evt mbm_total_event = {
- .name = "mbm_total_bytes",
- .evtid = QOS_L3_MBM_TOTAL_EVENT_ID,
-};
-
-static struct mon_evt mbm_local_event = {
- .name = "mbm_local_bytes",
- .evtid = QOS_L3_MBM_LOCAL_EVENT_ID,
-};
-
-/*
- * Initialize the event list for the resource.
- *
- * Note that MBM events are also part of RDT_RESOURCE_L3 resource
- * because as per the SDM the total and local memory bandwidth
- * are enumerated as part of L3 monitoring.
- */
-static void l3_mon_evt_init(struct rdt_resource *r)
-{
- INIT_LIST_HEAD(&r->evt_list);
-
- if (resctrl_arch_is_llc_occupancy_enabled())
- list_add_tail(&llc_occupancy_event.list, &r->evt_list);
- if (resctrl_arch_is_mbm_total_enabled())
- list_add_tail(&mbm_total_event.list, &r->evt_list);
- if (resctrl_arch_is_mbm_local_enabled())
- list_add_tail(&mbm_local_event.list, &r->evt_list);
-}
-
/*
* The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1
* which indicates that RMIDs are configured in legacy mode.
@@ -1193,51 +341,6 @@ static __init int snc_get_config(void)
return ret;
}
-/**
- * resctrl_mon_resource_init() - Initialise global monitoring structures.
- *
- * Allocate and initialise global monitor resources that do not belong to a
- * specific domain. i.e. the rmid_ptrs[] used for the limbo and free lists.
- * Called once during boot after the struct rdt_resource's have been configured
- * but before the filesystem is mounted.
- * Resctrl's cpuhp callbacks may be called before this point to bring a domain
- * online.
- *
- * Returns 0 for success, or -ENOMEM.
- */
-int __init resctrl_mon_resource_init(void)
-{
- struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
- int ret;
-
- if (!r->mon_capable)
- return 0;
-
- ret = dom_data_init(r);
- if (ret)
- return ret;
-
- l3_mon_evt_init(r);
-
- if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) {
- mbm_total_event.configurable = true;
- resctrl_file_fflags_init("mbm_total_bytes_config",
- RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
- }
- if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) {
- mbm_local_event.configurable = true;
- resctrl_file_fflags_init("mbm_local_bytes_config",
- RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
- }
-
- if (resctrl_arch_is_mbm_local_enabled())
- mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID;
- else if (resctrl_arch_is_mbm_total_enabled())
- mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID;
-
- return 0;
-}
-
int __init rdt_get_mon_l3_config(struct rdt_resource *r)
{
unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
@@ -1285,13 +388,6 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r)
return 0;
}
-void resctrl_mon_resource_exit(void)
-{
- struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
-
- dom_data_exit(r);
-}
-
void __init intel_rdt_mbm_apply_quirk(void)
{
int cf_index;
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index 1190c48a16b2..de580eca3363 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -11,19 +11,13 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/cacheflush.h>
#include <linux/cpu.h>
-#include <linux/cpumask.h>
-#include <linux/debugfs.h>
-#include <linux/kthread.h>
-#include <linux/mman.h>
#include <linux/perf_event.h>
#include <linux/pm_qos.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
+#include <linux/resctrl.h>
-#include <asm/cacheflush.h>
#include <asm/cpu_device_id.h>
-#include <asm/resctrl.h>
#include <asm/perf_event.h>
#include <asm/msr.h>
@@ -31,7 +25,8 @@
#include "internal.h"
#define CREATE_TRACE_POINTS
-#include "trace.h"
+
+#include "pseudo_lock_trace.h"
/*
* The bits needed to disable hardware prefetching varies based on the
@@ -39,29 +34,6 @@
*/
static u64 prefetch_disable_bits;
-/*
- * Major number assigned to and shared by all devices exposing
- * pseudo-locked regions.
- */
-static unsigned int pseudo_lock_major;
-static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0);
-
-static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode)
-{
- const struct rdtgroup *rdtgrp;
-
- rdtgrp = dev_get_drvdata(dev);
- if (mode)
- *mode = 0600;
- guard(mutex)(&rdtgroup_mutex);
- return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdt_kn_name(rdtgrp->kn));
-}
-
-static const struct class pseudo_lock_class = {
- .name = "pseudo_lock",
- .devnode = pseudo_lock_devnode,
-};
-
/**
* resctrl_arch_get_prefetch_disable_bits - prefetch disable bits of supported
* platforms
@@ -123,298 +95,6 @@ u64 resctrl_arch_get_prefetch_disable_bits(void)
}
/**
- * pseudo_lock_minor_get - Obtain available minor number
- * @minor: Pointer to where new minor number will be stored
- *
- * A bitmask is used to track available minor numbers. Here the next free
- * minor number is marked as unavailable and returned.
- *
- * Return: 0 on success, <0 on failure.
- */
-static int pseudo_lock_minor_get(unsigned int *minor)
-{
- unsigned long first_bit;
-
- first_bit = find_first_bit(&pseudo_lock_minor_avail, MINORBITS);
-
- if (first_bit == MINORBITS)
- return -ENOSPC;
-
- __clear_bit(first_bit, &pseudo_lock_minor_avail);
- *minor = first_bit;
-
- return 0;
-}
-
-/**
- * pseudo_lock_minor_release - Return minor number to available
- * @minor: The minor number made available
- */
-static void pseudo_lock_minor_release(unsigned int minor)
-{
- __set_bit(minor, &pseudo_lock_minor_avail);
-}
-
-/**
- * region_find_by_minor - Locate a pseudo-lock region by inode minor number
- * @minor: The minor number of the device representing pseudo-locked region
- *
- * When the character device is accessed we need to determine which
- * pseudo-locked region it belongs to. This is done by matching the minor
- * number of the device to the pseudo-locked region it belongs.
- *
- * Minor numbers are assigned at the time a pseudo-locked region is associated
- * with a cache instance.
- *
- * Return: On success return pointer to resource group owning the pseudo-locked
- * region, NULL on failure.
- */
-static struct rdtgroup *region_find_by_minor(unsigned int minor)
-{
- struct rdtgroup *rdtgrp, *rdtgrp_match = NULL;
-
- list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
- if (rdtgrp->plr && rdtgrp->plr->minor == minor) {
- rdtgrp_match = rdtgrp;
- break;
- }
- }
- return rdtgrp_match;
-}
-
-/**
- * struct pseudo_lock_pm_req - A power management QoS request list entry
- * @list: Entry within the @pm_reqs list for a pseudo-locked region
- * @req: PM QoS request
- */
-struct pseudo_lock_pm_req {
- struct list_head list;
- struct dev_pm_qos_request req;
-};
-
-static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr)
-{
- struct pseudo_lock_pm_req *pm_req, *next;
-
- list_for_each_entry_safe(pm_req, next, &plr->pm_reqs, list) {
- dev_pm_qos_remove_request(&pm_req->req);
- list_del(&pm_req->list);
- kfree(pm_req);
- }
-}
-
-/**
- * pseudo_lock_cstates_constrain - Restrict cores from entering C6
- * @plr: Pseudo-locked region
- *
- * To prevent the cache from being affected by power management entering
- * C6 has to be avoided. This is accomplished by requesting a latency
- * requirement lower than lowest C6 exit latency of all supported
- * platforms as found in the cpuidle state tables in the intel_idle driver.
- * At this time it is possible to do so with a single latency requirement
- * for all supported platforms.
- *
- * Since Goldmont is supported, which is affected by X86_BUG_MONITOR,
- * the ACPI latencies need to be considered while keeping in mind that C2
- * may be set to map to deeper sleep states. In this case the latency
- * requirement needs to prevent entering C2 also.
- *
- * Return: 0 on success, <0 on failure
- */
-static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr)
-{
- struct pseudo_lock_pm_req *pm_req;
- int cpu;
- int ret;
-
- for_each_cpu(cpu, &plr->d->hdr.cpu_mask) {
- pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL);
- if (!pm_req) {
- rdt_last_cmd_puts("Failure to allocate memory for PM QoS\n");
- ret = -ENOMEM;
- goto out_err;
- }
- ret = dev_pm_qos_add_request(get_cpu_device(cpu),
- &pm_req->req,
- DEV_PM_QOS_RESUME_LATENCY,
- 30);
- if (ret < 0) {
- rdt_last_cmd_printf("Failed to add latency req CPU%d\n",
- cpu);
- kfree(pm_req);
- ret = -1;
- goto out_err;
- }
- list_add(&pm_req->list, &plr->pm_reqs);
- }
-
- return 0;
-
-out_err:
- pseudo_lock_cstates_relax(plr);
- return ret;
-}
-
-/**
- * pseudo_lock_region_clear - Reset pseudo-lock region data
- * @plr: pseudo-lock region
- *
- * All content of the pseudo-locked region is reset - any memory allocated
- * freed.
- *
- * Return: void
- */
-static void pseudo_lock_region_clear(struct pseudo_lock_region *plr)
-{
- plr->size = 0;
- plr->line_size = 0;
- kfree(plr->kmem);
- plr->kmem = NULL;
- plr->s = NULL;
- if (plr->d)
- plr->d->plr = NULL;
- plr->d = NULL;
- plr->cbm = 0;
- plr->debugfs_dir = NULL;
-}
-
-/**
- * pseudo_lock_region_init - Initialize pseudo-lock region information
- * @plr: pseudo-lock region
- *
- * Called after user provided a schemata to be pseudo-locked. From the
- * schemata the &struct pseudo_lock_region is on entry already initialized
- * with the resource, domain, and capacity bitmask. Here the information
- * required for pseudo-locking is deduced from this data and &struct
- * pseudo_lock_region initialized further. This information includes:
- * - size in bytes of the region to be pseudo-locked
- * - cache line size to know the stride with which data needs to be accessed
- * to be pseudo-locked
- * - a cpu associated with the cache instance on which the pseudo-locking
- * flow can be executed
- *
- * Return: 0 on success, <0 on failure. Descriptive error will be written
- * to last_cmd_status buffer.
- */
-static int pseudo_lock_region_init(struct pseudo_lock_region *plr)
-{
- enum resctrl_scope scope = plr->s->res->ctrl_scope;
- struct cacheinfo *ci;
- int ret;
-
- if (WARN_ON_ONCE(scope != RESCTRL_L2_CACHE && scope != RESCTRL_L3_CACHE))
- return -ENODEV;
-
- /* Pick the first cpu we find that is associated with the cache. */
- plr->cpu = cpumask_first(&plr->d->hdr.cpu_mask);
-
- if (!cpu_online(plr->cpu)) {
- rdt_last_cmd_printf("CPU %u associated with cache not online\n",
- plr->cpu);
- ret = -ENODEV;
- goto out_region;
- }
-
- ci = get_cpu_cacheinfo_level(plr->cpu, scope);
- if (ci) {
- plr->line_size = ci->coherency_line_size;
- plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm);
- return 0;
- }
-
- ret = -1;
- rdt_last_cmd_puts("Unable to determine cache line size\n");
-out_region:
- pseudo_lock_region_clear(plr);
- return ret;
-}
-
-/**
- * pseudo_lock_init - Initialize a pseudo-lock region
- * @rdtgrp: resource group to which new pseudo-locked region will belong
- *
- * A pseudo-locked region is associated with a resource group. When this
- * association is created the pseudo-locked region is initialized. The
- * details of the pseudo-locked region are not known at this time so only
- * allocation is done and association established.
- *
- * Return: 0 on success, <0 on failure
- */
-static int pseudo_lock_init(struct rdtgroup *rdtgrp)
-{
- struct pseudo_lock_region *plr;
-
- plr = kzalloc(sizeof(*plr), GFP_KERNEL);
- if (!plr)
- return -ENOMEM;
-
- init_waitqueue_head(&plr->lock_thread_wq);
- INIT_LIST_HEAD(&plr->pm_reqs);
- rdtgrp->plr = plr;
- return 0;
-}
-
-/**
- * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked
- * @plr: pseudo-lock region
- *
- * Initialize the details required to set up the pseudo-locked region and
- * allocate the contiguous memory that will be pseudo-locked to the cache.
- *
- * Return: 0 on success, <0 on failure. Descriptive error will be written
- * to last_cmd_status buffer.
- */
-static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr)
-{
- int ret;
-
- ret = pseudo_lock_region_init(plr);
- if (ret < 0)
- return ret;
-
- /*
- * We do not yet support contiguous regions larger than
- * KMALLOC_MAX_SIZE.
- */
- if (plr->size > KMALLOC_MAX_SIZE) {
- rdt_last_cmd_puts("Requested region exceeds maximum size\n");
- ret = -E2BIG;
- goto out_region;
- }
-
- plr->kmem = kzalloc(plr->size, GFP_KERNEL);
- if (!plr->kmem) {
- rdt_last_cmd_puts("Unable to allocate memory\n");
- ret = -ENOMEM;
- goto out_region;
- }
-
- ret = 0;
- goto out;
-out_region:
- pseudo_lock_region_clear(plr);
-out:
- return ret;
-}
-
-/**
- * pseudo_lock_free - Free a pseudo-locked region
- * @rdtgrp: resource group to which pseudo-locked region belonged
- *
- * The pseudo-locked region's resources have already been released, or not
- * yet created at this point. Now it can be freed and disassociated from the
- * resource group.
- *
- * Return: void
- */
-static void pseudo_lock_free(struct rdtgroup *rdtgrp)
-{
- pseudo_lock_region_clear(rdtgrp->plr);
- kfree(rdtgrp->plr);
- rdtgrp->plr = NULL;
-}
-
-/**
* resctrl_arch_pseudo_lock_fn - Load kernel memory into cache
* @_plr: the pseudo-lock region descriptor
*
@@ -544,340 +224,6 @@ int resctrl_arch_pseudo_lock_fn(void *_plr)
}
/**
- * rdtgroup_monitor_in_progress - Test if monitoring in progress
- * @rdtgrp: resource group being queried
- *
- * Return: 1 if monitor groups have been created for this resource
- * group, 0 otherwise.
- */
-static int rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp)
-{
- return !list_empty(&rdtgrp->mon.crdtgrp_list);
-}
-
-/**
- * rdtgroup_locksetup_user_restrict - Restrict user access to group
- * @rdtgrp: resource group needing access restricted
- *
- * A resource group used for cache pseudo-locking cannot have cpus or tasks
- * assigned to it. This is communicated to the user by restricting access
- * to all the files that can be used to make such changes.
- *
- * Permissions restored with rdtgroup_locksetup_user_restore()
- *
- * Return: 0 on success, <0 on failure. If a failure occurs during the
- * restriction of access an attempt will be made to restore permissions but
- * the state of the mode of these files will be uncertain when a failure
- * occurs.
- */
-static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp)
-{
- int ret;
-
- ret = rdtgroup_kn_mode_restrict(rdtgrp, "tasks");
- if (ret)
- return ret;
-
- ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus");
- if (ret)
- goto err_tasks;
-
- ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list");
- if (ret)
- goto err_cpus;
-
- if (resctrl_arch_mon_capable()) {
- ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups");
- if (ret)
- goto err_cpus_list;
- }
-
- ret = 0;
- goto out;
-
-err_cpus_list:
- rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777);
-err_cpus:
- rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777);
-err_tasks:
- rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777);
-out:
- return ret;
-}
-
-/**
- * rdtgroup_locksetup_user_restore - Restore user access to group
- * @rdtgrp: resource group needing access restored
- *
- * Restore all file access previously removed using
- * rdtgroup_locksetup_user_restrict()
- *
- * Return: 0 on success, <0 on failure. If a failure occurs during the
- * restoration of access an attempt will be made to restrict permissions
- * again but the state of the mode of these files will be uncertain when
- * a failure occurs.
- */
-static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp)
-{
- int ret;
-
- ret = rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777);
- if (ret)
- return ret;
-
- ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777);
- if (ret)
- goto err_tasks;
-
- ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777);
- if (ret)
- goto err_cpus;
-
- if (resctrl_arch_mon_capable()) {
- ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777);
- if (ret)
- goto err_cpus_list;
- }
-
- ret = 0;
- goto out;
-
-err_cpus_list:
- rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list");
-err_cpus:
- rdtgroup_kn_mode_restrict(rdtgrp, "cpus");
-err_tasks:
- rdtgroup_kn_mode_restrict(rdtgrp, "tasks");
-out:
- return ret;
-}
-
-/**
- * rdtgroup_locksetup_enter - Resource group enters locksetup mode
- * @rdtgrp: resource group requested to enter locksetup mode
- *
- * A resource group enters locksetup mode to reflect that it would be used
- * to represent a pseudo-locked region and is in the process of being set
- * up to do so. A resource group used for a pseudo-locked region would
- * lose the closid associated with it so we cannot allow it to have any
- * tasks or cpus assigned nor permit tasks or cpus to be assigned in the
- * future. Monitoring of a pseudo-locked region is not allowed either.
- *
- * The above and more restrictions on a pseudo-locked region are checked
- * for and enforced before the resource group enters the locksetup mode.
- *
- * Returns: 0 if the resource group successfully entered locksetup mode, <0
- * on failure. On failure the last_cmd_status buffer is updated with text to
- * communicate details of failure to the user.
- */
-int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp)
-{
- int ret;
-
- /*
- * The default resource group can neither be removed nor lose the
- * default closid associated with it.
- */
- if (rdtgrp == &rdtgroup_default) {
- rdt_last_cmd_puts("Cannot pseudo-lock default group\n");
- return -EINVAL;
- }
-
- /*
- * Cache Pseudo-locking not supported when CDP is enabled.
- *
- * Some things to consider if you would like to enable this
- * support (using L3 CDP as example):
- * - When CDP is enabled two separate resources are exposed,
- * L3DATA and L3CODE, but they are actually on the same cache.
- * The implication for pseudo-locking is that if a
- * pseudo-locked region is created on a domain of one
- * resource (eg. L3CODE), then a pseudo-locked region cannot
- * be created on that same domain of the other resource
- * (eg. L3DATA). This is because the creation of a
- * pseudo-locked region involves a call to wbinvd that will
- * affect all cache allocations on particular domain.
- * - Considering the previous, it may be possible to only
- * expose one of the CDP resources to pseudo-locking and
- * hide the other. For example, we could consider to only
- * expose L3DATA and since the L3 cache is unified it is
- * still possible to place instructions there are execute it.
- * - If only one region is exposed to pseudo-locking we should
- * still keep in mind that availability of a portion of cache
- * for pseudo-locking should take into account both resources.
- * Similarly, if a pseudo-locked region is created in one
- * resource, the portion of cache used by it should be made
- * unavailable to all future allocations from both resources.
- */
- if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3) ||
- resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) {
- rdt_last_cmd_puts("CDP enabled\n");
- return -EINVAL;
- }
-
- /*
- * Not knowing the bits to disable prefetching implies that this
- * platform does not support Cache Pseudo-Locking.
- */
- if (resctrl_arch_get_prefetch_disable_bits() == 0) {
- rdt_last_cmd_puts("Pseudo-locking not supported\n");
- return -EINVAL;
- }
-
- if (rdtgroup_monitor_in_progress(rdtgrp)) {
- rdt_last_cmd_puts("Monitoring in progress\n");
- return -EINVAL;
- }
-
- if (rdtgroup_tasks_assigned(rdtgrp)) {
- rdt_last_cmd_puts("Tasks assigned to resource group\n");
- return -EINVAL;
- }
-
- if (!cpumask_empty(&rdtgrp->cpu_mask)) {
- rdt_last_cmd_puts("CPUs assigned to resource group\n");
- return -EINVAL;
- }
-
- if (rdtgroup_locksetup_user_restrict(rdtgrp)) {
- rdt_last_cmd_puts("Unable to modify resctrl permissions\n");
- return -EIO;
- }
-
- ret = pseudo_lock_init(rdtgrp);
- if (ret) {
- rdt_last_cmd_puts("Unable to init pseudo-lock region\n");
- goto out_release;
- }
-
- /*
- * If this system is capable of monitoring a rmid would have been
- * allocated when the control group was created. This is not needed
- * anymore when this group would be used for pseudo-locking. This
- * is safe to call on platforms not capable of monitoring.
- */
- free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
-
- ret = 0;
- goto out;
-
-out_release:
- rdtgroup_locksetup_user_restore(rdtgrp);
-out:
- return ret;
-}
-
-/**
- * rdtgroup_locksetup_exit - resource group exist locksetup mode
- * @rdtgrp: resource group
- *
- * When a resource group exits locksetup mode the earlier restrictions are
- * lifted.
- *
- * Return: 0 on success, <0 on failure
- */
-int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
-{
- int ret;
-
- if (resctrl_arch_mon_capable()) {
- ret = alloc_rmid(rdtgrp->closid);
- if (ret < 0) {
- rdt_last_cmd_puts("Out of RMIDs\n");
- return ret;
- }
- rdtgrp->mon.rmid = ret;
- }
-
- ret = rdtgroup_locksetup_user_restore(rdtgrp);
- if (ret) {
- free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
- return ret;
- }
-
- pseudo_lock_free(rdtgrp);
- return 0;
-}
-
-/**
- * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked
- * @d: RDT domain
- * @cbm: CBM to test
- *
- * @d represents a cache instance and @cbm a capacity bitmask that is
- * considered for it. Determine if @cbm overlaps with any existing
- * pseudo-locked region on @d.
- *
- * @cbm is unsigned long, even if only 32 bits are used, to make the
- * bitmap functions work correctly.
- *
- * Return: true if @cbm overlaps with pseudo-locked region on @d, false
- * otherwise.
- */
-bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm)
-{
- unsigned int cbm_len;
- unsigned long cbm_b;
-
- if (d->plr) {
- cbm_len = d->plr->s->res->cache.cbm_len;
- cbm_b = d->plr->cbm;
- if (bitmap_intersects(&cbm, &cbm_b, cbm_len))
- return true;
- }
- return false;
-}
-
-/**
- * rdtgroup_pseudo_locked_in_hierarchy - Pseudo-locked region in cache hierarchy
- * @d: RDT domain under test
- *
- * The setup of a pseudo-locked region affects all cache instances within
- * the hierarchy of the region. It is thus essential to know if any
- * pseudo-locked regions exist within a cache hierarchy to prevent any
- * attempts to create new pseudo-locked regions in the same hierarchy.
- *
- * Return: true if a pseudo-locked region exists in the hierarchy of @d or
- * if it is not possible to test due to memory allocation issue,
- * false otherwise.
- */
-bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d)
-{
- struct rdt_ctrl_domain *d_i;
- cpumask_var_t cpu_with_psl;
- struct rdt_resource *r;
- bool ret = false;
-
- /* Walking r->domains, ensure it can't race with cpuhp */
- lockdep_assert_cpus_held();
-
- if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL))
- return true;
-
- /*
- * First determine which cpus have pseudo-locked regions
- * associated with them.
- */
- for_each_alloc_capable_rdt_resource(r) {
- list_for_each_entry(d_i, &r->ctrl_domains, hdr.list) {
- if (d_i->plr)
- cpumask_or(cpu_with_psl, cpu_with_psl,
- &d_i->hdr.cpu_mask);
- }
- }
-
- /*
- * Next test if new pseudo-locked region would intersect with
- * existing region.
- */
- if (cpumask_intersects(&d->hdr.cpu_mask, cpu_with_psl))
- ret = true;
-
- free_cpumask_var(cpu_with_psl);
- return ret;
-}
-
-/**
* resctrl_arch_measure_cycles_lat_fn - Measure cycle latency to read
* pseudo-locked memory
* @_plr: pseudo-lock region to measure
@@ -1169,433 +515,3 @@ out:
wake_up_interruptible(&plr->lock_thread_wq);
return 0;
}
-
-/**
- * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region
- * @rdtgrp: Resource group to which the pseudo-locked region belongs.
- * @sel: Selector of which measurement to perform on a pseudo-locked region.
- *
- * The measurement of latency to access a pseudo-locked region should be
- * done from a cpu that is associated with that pseudo-locked region.
- * Determine which cpu is associated with this region and start a thread on
- * that cpu to perform the measurement, wait for that thread to complete.
- *
- * Return: 0 on success, <0 on failure
- */
-static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
-{
- struct pseudo_lock_region *plr = rdtgrp->plr;
- struct task_struct *thread;
- unsigned int cpu;
- int ret = -1;
-
- cpus_read_lock();
- mutex_lock(&rdtgroup_mutex);
-
- if (rdtgrp->flags & RDT_DELETED) {
- ret = -ENODEV;
- goto out;
- }
-
- if (!plr->d) {
- ret = -ENODEV;
- goto out;
- }
-
- plr->thread_done = 0;
- cpu = cpumask_first(&plr->d->hdr.cpu_mask);
- if (!cpu_online(cpu)) {
- ret = -ENODEV;
- goto out;
- }
-
- plr->cpu = cpu;
-
- if (sel == 1)
- thread = kthread_run_on_cpu(resctrl_arch_measure_cycles_lat_fn,
- plr, cpu, "pseudo_lock_measure/%u");
- else if (sel == 2)
- thread = kthread_run_on_cpu(resctrl_arch_measure_l2_residency,
- plr, cpu, "pseudo_lock_measure/%u");
- else if (sel == 3)
- thread = kthread_run_on_cpu(resctrl_arch_measure_l3_residency,
- plr, cpu, "pseudo_lock_measure/%u");
- else
- goto out;
-
- if (IS_ERR(thread)) {
- ret = PTR_ERR(thread);
- goto out;
- }
-
- ret = wait_event_interruptible(plr->lock_thread_wq,
- plr->thread_done == 1);
- if (ret < 0)
- goto out;
-
- ret = 0;
-
-out:
- mutex_unlock(&rdtgroup_mutex);
- cpus_read_unlock();
- return ret;
-}
-
-static ssize_t pseudo_lock_measure_trigger(struct file *file,
- const char __user *user_buf,
- size_t count, loff_t *ppos)
-{
- struct rdtgroup *rdtgrp = file->private_data;
- size_t buf_size;
- char buf[32];
- int ret;
- int sel;
-
- buf_size = min(count, (sizeof(buf) - 1));
- if (copy_from_user(buf, user_buf, buf_size))
- return -EFAULT;
-
- buf[buf_size] = '\0';
- ret = kstrtoint(buf, 10, &sel);
- if (ret == 0) {
- if (sel != 1 && sel != 2 && sel != 3)
- return -EINVAL;
- ret = debugfs_file_get(file->f_path.dentry);
- if (ret)
- return ret;
- ret = pseudo_lock_measure_cycles(rdtgrp, sel);
- if (ret == 0)
- ret = count;
- debugfs_file_put(file->f_path.dentry);
- }
-
- return ret;
-}
-
-static const struct file_operations pseudo_measure_fops = {
- .write = pseudo_lock_measure_trigger,
- .open = simple_open,
- .llseek = default_llseek,
-};
-
-/**
- * rdtgroup_pseudo_lock_create - Create a pseudo-locked region
- * @rdtgrp: resource group to which pseudo-lock region belongs
- *
- * Called when a resource group in the pseudo-locksetup mode receives a
- * valid schemata that should be pseudo-locked. Since the resource group is
- * in pseudo-locksetup mode the &struct pseudo_lock_region has already been
- * allocated and initialized with the essential information. If a failure
- * occurs the resource group remains in the pseudo-locksetup mode with the
- * &struct pseudo_lock_region associated with it, but cleared from all
- * information and ready for the user to re-attempt pseudo-locking by
- * writing the schemata again.
- *
- * Return: 0 if the pseudo-locked region was successfully pseudo-locked, <0
- * on failure. Descriptive error will be written to last_cmd_status buffer.
- */
-int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
-{
- struct pseudo_lock_region *plr = rdtgrp->plr;
- struct task_struct *thread;
- unsigned int new_minor;
- struct device *dev;
- char *kn_name __free(kfree) = NULL;
- int ret;
-
- ret = pseudo_lock_region_alloc(plr);
- if (ret < 0)
- return ret;
-
- ret = pseudo_lock_cstates_constrain(plr);
- if (ret < 0) {
- ret = -EINVAL;
- goto out_region;
- }
- kn_name = kstrdup(rdt_kn_name(rdtgrp->kn), GFP_KERNEL);
- if (!kn_name) {
- ret = -ENOMEM;
- goto out_cstates;
- }
-
- plr->thread_done = 0;
-
- thread = kthread_run_on_cpu(resctrl_arch_pseudo_lock_fn, plr,
- plr->cpu, "pseudo_lock/%u");
- if (IS_ERR(thread)) {
- ret = PTR_ERR(thread);
- rdt_last_cmd_printf("Locking thread returned error %d\n", ret);
- goto out_cstates;
- }
-
- ret = wait_event_interruptible(plr->lock_thread_wq,
- plr->thread_done == 1);
- if (ret < 0) {
- /*
- * If the thread does not get on the CPU for whatever
- * reason and the process which sets up the region is
- * interrupted then this will leave the thread in runnable
- * state and once it gets on the CPU it will dereference
- * the cleared, but not freed, plr struct resulting in an
- * empty pseudo-locking loop.
- */
- rdt_last_cmd_puts("Locking thread interrupted\n");
- goto out_cstates;
- }
-
- ret = pseudo_lock_minor_get(&new_minor);
- if (ret < 0) {
- rdt_last_cmd_puts("Unable to obtain a new minor number\n");
- goto out_cstates;
- }
-
- /*
- * Unlock access but do not release the reference. The
- * pseudo-locked region will still be here on return.
- *
- * The mutex has to be released temporarily to avoid a potential
- * deadlock with the mm->mmap_lock which is obtained in the
- * device_create() and debugfs_create_dir() callpath below as well as
- * before the mmap() callback is called.
- */
- mutex_unlock(&rdtgroup_mutex);
-
- if (!IS_ERR_OR_NULL(debugfs_resctrl)) {
- plr->debugfs_dir = debugfs_create_dir(kn_name, debugfs_resctrl);
- if (!IS_ERR_OR_NULL(plr->debugfs_dir))
- debugfs_create_file("pseudo_lock_measure", 0200,
- plr->debugfs_dir, rdtgrp,
- &pseudo_measure_fops);
- }
-
- dev = device_create(&pseudo_lock_class, NULL,
- MKDEV(pseudo_lock_major, new_minor),
- rdtgrp, "%s", kn_name);
-
- mutex_lock(&rdtgroup_mutex);
-
- if (IS_ERR(dev)) {
- ret = PTR_ERR(dev);
- rdt_last_cmd_printf("Failed to create character device: %d\n",
- ret);
- goto out_debugfs;
- }
-
- /* We released the mutex - check if group was removed while we did so */
- if (rdtgrp->flags & RDT_DELETED) {
- ret = -ENODEV;
- goto out_device;
- }
-
- plr->minor = new_minor;
-
- rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED;
- closid_free(rdtgrp->closid);
- rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0444);
- rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0444);
-
- ret = 0;
- goto out;
-
-out_device:
- device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor));
-out_debugfs:
- debugfs_remove_recursive(plr->debugfs_dir);
- pseudo_lock_minor_release(new_minor);
-out_cstates:
- pseudo_lock_cstates_relax(plr);
-out_region:
- pseudo_lock_region_clear(plr);
-out:
- return ret;
-}
-
-/**
- * rdtgroup_pseudo_lock_remove - Remove a pseudo-locked region
- * @rdtgrp: resource group to which the pseudo-locked region belongs
- *
- * The removal of a pseudo-locked region can be initiated when the resource
- * group is removed from user space via a "rmdir" from userspace or the
- * unmount of the resctrl filesystem. On removal the resource group does
- * not go back to pseudo-locksetup mode before it is removed, instead it is
- * removed directly. There is thus asymmetry with the creation where the
- * &struct pseudo_lock_region is removed here while it was not created in
- * rdtgroup_pseudo_lock_create().
- *
- * Return: void
- */
-void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp)
-{
- struct pseudo_lock_region *plr = rdtgrp->plr;
-
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
- /*
- * Default group cannot be a pseudo-locked region so we can
- * free closid here.
- */
- closid_free(rdtgrp->closid);
- goto free;
- }
-
- pseudo_lock_cstates_relax(plr);
- debugfs_remove_recursive(rdtgrp->plr->debugfs_dir);
- device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor));
- pseudo_lock_minor_release(plr->minor);
-
-free:
- pseudo_lock_free(rdtgrp);
-}
-
-static int pseudo_lock_dev_open(struct inode *inode, struct file *filp)
-{
- struct rdtgroup *rdtgrp;
-
- mutex_lock(&rdtgroup_mutex);
-
- rdtgrp = region_find_by_minor(iminor(inode));
- if (!rdtgrp) {
- mutex_unlock(&rdtgroup_mutex);
- return -ENODEV;
- }
-
- filp->private_data = rdtgrp;
- atomic_inc(&rdtgrp->waitcount);
- /* Perform a non-seekable open - llseek is not supported */
- filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
-
- mutex_unlock(&rdtgroup_mutex);
-
- return 0;
-}
-
-static int pseudo_lock_dev_release(struct inode *inode, struct file *filp)
-{
- struct rdtgroup *rdtgrp;
-
- mutex_lock(&rdtgroup_mutex);
- rdtgrp = filp->private_data;
- WARN_ON(!rdtgrp);
- if (!rdtgrp) {
- mutex_unlock(&rdtgroup_mutex);
- return -ENODEV;
- }
- filp->private_data = NULL;
- atomic_dec(&rdtgrp->waitcount);
- mutex_unlock(&rdtgroup_mutex);
- return 0;
-}
-
-static int pseudo_lock_dev_mremap(struct vm_area_struct *area)
-{
- /* Not supported */
- return -EINVAL;
-}
-
-static const struct vm_operations_struct pseudo_mmap_ops = {
- .mremap = pseudo_lock_dev_mremap,
-};
-
-static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma)
-{
- unsigned long vsize = vma->vm_end - vma->vm_start;
- unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
- struct pseudo_lock_region *plr;
- struct rdtgroup *rdtgrp;
- unsigned long physical;
- unsigned long psize;
-
- mutex_lock(&rdtgroup_mutex);
-
- rdtgrp = filp->private_data;
- WARN_ON(!rdtgrp);
- if (!rdtgrp) {
- mutex_unlock(&rdtgroup_mutex);
- return -ENODEV;
- }
-
- plr = rdtgrp->plr;
-
- if (!plr->d) {
- mutex_unlock(&rdtgroup_mutex);
- return -ENODEV;
- }
-
- /*
- * Task is required to run with affinity to the cpus associated
- * with the pseudo-locked region. If this is not the case the task
- * may be scheduled elsewhere and invalidate entries in the
- * pseudo-locked region.
- */
- if (!cpumask_subset(current->cpus_ptr, &plr->d->hdr.cpu_mask)) {
- mutex_unlock(&rdtgroup_mutex);
- return -EINVAL;
- }
-
- physical = __pa(plr->kmem) >> PAGE_SHIFT;
- psize = plr->size - off;
-
- if (off > plr->size) {
- mutex_unlock(&rdtgroup_mutex);
- return -ENOSPC;
- }
-
- /*
- * Ensure changes are carried directly to the memory being mapped,
- * do not allow copy-on-write mapping.
- */
- if (!(vma->vm_flags & VM_SHARED)) {
- mutex_unlock(&rdtgroup_mutex);
- return -EINVAL;
- }
-
- if (vsize > psize) {
- mutex_unlock(&rdtgroup_mutex);
- return -ENOSPC;
- }
-
- memset(plr->kmem + off, 0, vsize);
-
- if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff,
- vsize, vma->vm_page_prot)) {
- mutex_unlock(&rdtgroup_mutex);
- return -EAGAIN;
- }
- vma->vm_ops = &pseudo_mmap_ops;
- mutex_unlock(&rdtgroup_mutex);
- return 0;
-}
-
-static const struct file_operations pseudo_lock_dev_fops = {
- .owner = THIS_MODULE,
- .read = NULL,
- .write = NULL,
- .open = pseudo_lock_dev_open,
- .release = pseudo_lock_dev_release,
- .mmap = pseudo_lock_dev_mmap,
-};
-
-int rdt_pseudo_lock_init(void)
-{
- int ret;
-
- ret = register_chrdev(0, "pseudo_lock", &pseudo_lock_dev_fops);
- if (ret < 0)
- return ret;
-
- pseudo_lock_major = ret;
-
- ret = class_register(&pseudo_lock_class);
- if (ret) {
- unregister_chrdev(pseudo_lock_major, "pseudo_lock");
- return ret;
- }
-
- return 0;
-}
-
-void rdt_pseudo_lock_release(void)
-{
- class_unregister(&pseudo_lock_class);
- unregister_chrdev(pseudo_lock_major, "pseudo_lock");
- pseudo_lock_major = 0;
-}
diff --git a/arch/x86/kernel/cpu/resctrl/trace.h b/arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h
index 2a506316b303..7c8aef08010f 100644
--- a/arch/x86/kernel/cpu/resctrl/trace.h
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h
@@ -2,8 +2,8 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM resctrl
-#if !defined(_TRACE_RESCTRL_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_RESCTRL_H
+#if !defined(_X86_RESCTRL_PSEUDO_LOCK_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _X86_RESCTRL_PSEUDO_LOCK_TRACE_H
#include <linux/tracepoint.h>
@@ -35,25 +35,11 @@ TRACE_EVENT(pseudo_lock_l3,
TP_printk("hits=%llu miss=%llu",
__entry->l3_hits, __entry->l3_miss));
-TRACE_EVENT(mon_llc_occupancy_limbo,
- TP_PROTO(u32 ctrl_hw_id, u32 mon_hw_id, int domain_id, u64 llc_occupancy_bytes),
- TP_ARGS(ctrl_hw_id, mon_hw_id, domain_id, llc_occupancy_bytes),
- TP_STRUCT__entry(__field(u32, ctrl_hw_id)
- __field(u32, mon_hw_id)
- __field(int, domain_id)
- __field(u64, llc_occupancy_bytes)),
- TP_fast_assign(__entry->ctrl_hw_id = ctrl_hw_id;
- __entry->mon_hw_id = mon_hw_id;
- __entry->domain_id = domain_id;
- __entry->llc_occupancy_bytes = llc_occupancy_bytes;),
- TP_printk("ctrl_hw_id=%u mon_hw_id=%u domain_id=%d llc_occupancy_bytes=%llu",
- __entry->ctrl_hw_id, __entry->mon_hw_id, __entry->domain_id,
- __entry->llc_occupancy_bytes)
- );
-
-#endif /* _TRACE_RESCTRL_H */
+#endif /* _X86_RESCTRL_PSEUDO_LOCK_TRACE_H */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
-#define TRACE_INCLUDE_FILE trace
+
+#define TRACE_INCLUDE_FILE pseudo_lock_trace
+
#include <trace/define_trace.h>
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index c85ace29ea3a..885026468440 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -18,6 +18,7 @@
#include <linux/fs_parser.h>
#include <linux/sysfs.h>
#include <linux/kernfs.h>
+#include <linux/resctrl.h>
#include <linux/seq_buf.h>
#include <linux/seq_file.h>
#include <linux/sched/signal.h>
@@ -29,341 +30,16 @@
#include <uapi/linux/magic.h>
#include <asm/msr.h>
-#include <asm/resctrl.h>
#include "internal.h"
DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
-DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key);
-DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
-
-/* Mutex to protect rdtgroup access. */
-DEFINE_MUTEX(rdtgroup_mutex);
-
-static struct kernfs_root *rdt_root;
-struct rdtgroup rdtgroup_default;
-LIST_HEAD(rdt_all_groups);
-
-/* list of entries for the schemata file */
-LIST_HEAD(resctrl_schema_all);
-
-/* The filesystem can only be mounted once. */
-bool resctrl_mounted;
-
-/* Kernel fs node for "info" directory under root */
-static struct kernfs_node *kn_info;
-
-/* Kernel fs node for "mon_groups" directory under root */
-static struct kernfs_node *kn_mongrp;
-
-/* Kernel fs node for "mon_data" directory under root */
-static struct kernfs_node *kn_mondata;
-
-/*
- * Used to store the max resource name width to display the schemata names in
- * a tabular format.
- */
-int max_name_width;
-
-static struct seq_buf last_cmd_status;
-static char last_cmd_status_buf[512];
-
-static int rdtgroup_setup_root(struct rdt_fs_context *ctx);
-static void rdtgroup_destroy_root(void);
-
-struct dentry *debugfs_resctrl;
-
-/*
- * Memory bandwidth monitoring event to use for the default CTRL_MON group
- * and each new CTRL_MON group created by the user. Only relevant when
- * the filesystem is mounted with the "mba_MBps" option so it does not
- * matter that it remains uninitialized on systems that do not support
- * the "mba_MBps" option.
- */
-enum resctrl_event_id mba_mbps_default_event;
-
-static bool resctrl_debug;
-
-void rdt_last_cmd_clear(void)
-{
- lockdep_assert_held(&rdtgroup_mutex);
- seq_buf_clear(&last_cmd_status);
-}
-
-void rdt_last_cmd_puts(const char *s)
-{
- lockdep_assert_held(&rdtgroup_mutex);
- seq_buf_puts(&last_cmd_status, s);
-}
-
-void rdt_last_cmd_printf(const char *fmt, ...)
-{
- va_list ap;
-
- va_start(ap, fmt);
- lockdep_assert_held(&rdtgroup_mutex);
- seq_buf_vprintf(&last_cmd_status, fmt, ap);
- va_end(ap);
-}
-
-void rdt_staged_configs_clear(void)
-{
- struct rdt_ctrl_domain *dom;
- struct rdt_resource *r;
-
- lockdep_assert_held(&rdtgroup_mutex);
-
- for_each_alloc_capable_rdt_resource(r) {
- list_for_each_entry(dom, &r->ctrl_domains, hdr.list)
- memset(dom->staged_config, 0, sizeof(dom->staged_config));
- }
-}
-
-static bool resctrl_is_mbm_enabled(void)
-{
- return (resctrl_arch_is_mbm_total_enabled() ||
- resctrl_arch_is_mbm_local_enabled());
-}
-
-static bool resctrl_is_mbm_event(int e)
-{
- return (e >= QOS_L3_MBM_TOTAL_EVENT_ID &&
- e <= QOS_L3_MBM_LOCAL_EVENT_ID);
-}
-
-/*
- * Trivial allocator for CLOSIDs. Since h/w only supports a small number,
- * we can keep a bitmap of free CLOSIDs in a single integer.
- *
- * Using a global CLOSID across all resources has some advantages and
- * some drawbacks:
- * + We can simply set current's closid to assign a task to a resource
- * group.
- * + Context switch code can avoid extra memory references deciding which
- * CLOSID to load into the PQR_ASSOC MSR
- * - We give up some options in configuring resource groups across multi-socket
- * systems.
- * - Our choices on how to configure each resource become progressively more
- * limited as the number of resources grows.
- */
-static unsigned long closid_free_map;
-static int closid_free_map_len;
-
-int closids_supported(void)
-{
- return closid_free_map_len;
-}
-
-static void closid_init(void)
-{
- struct resctrl_schema *s;
- u32 rdt_min_closid = 32;
-
- /* Compute rdt_min_closid across all resources */
- list_for_each_entry(s, &resctrl_schema_all, list)
- rdt_min_closid = min(rdt_min_closid, s->num_closid);
-
- closid_free_map = BIT_MASK(rdt_min_closid) - 1;
-
- /* RESCTRL_RESERVED_CLOSID is always reserved for the default group */
- __clear_bit(RESCTRL_RESERVED_CLOSID, &closid_free_map);
- closid_free_map_len = rdt_min_closid;
-}
-
-static int closid_alloc(void)
-{
- int cleanest_closid;
- u32 closid;
-
- lockdep_assert_held(&rdtgroup_mutex);
-
- if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) &&
- resctrl_arch_is_llc_occupancy_enabled()) {
- cleanest_closid = resctrl_find_cleanest_closid();
- if (cleanest_closid < 0)
- return cleanest_closid;
- closid = cleanest_closid;
- } else {
- closid = ffs(closid_free_map);
- if (closid == 0)
- return -ENOSPC;
- closid--;
- }
- __clear_bit(closid, &closid_free_map);
-
- return closid;
-}
-
-void closid_free(int closid)
-{
- lockdep_assert_held(&rdtgroup_mutex);
-
- __set_bit(closid, &closid_free_map);
-}
-
-/**
- * closid_allocated - test if provided closid is in use
- * @closid: closid to be tested
- *
- * Return: true if @closid is currently associated with a resource group,
- * false if @closid is free
- */
-bool closid_allocated(unsigned int closid)
-{
- lockdep_assert_held(&rdtgroup_mutex);
-
- return !test_bit(closid, &closid_free_map);
-}
-
-/**
- * rdtgroup_mode_by_closid - Return mode of resource group with closid
- * @closid: closid if the resource group
- *
- * Each resource group is associated with a @closid. Here the mode
- * of a resource group can be queried by searching for it using its closid.
- *
- * Return: mode as &enum rdtgrp_mode of resource group with closid @closid
- */
-enum rdtgrp_mode rdtgroup_mode_by_closid(int closid)
-{
- struct rdtgroup *rdtgrp;
-
- list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
- if (rdtgrp->closid == closid)
- return rdtgrp->mode;
- }
-
- return RDT_NUM_MODES;
-}
-
-static const char * const rdt_mode_str[] = {
- [RDT_MODE_SHAREABLE] = "shareable",
- [RDT_MODE_EXCLUSIVE] = "exclusive",
- [RDT_MODE_PSEUDO_LOCKSETUP] = "pseudo-locksetup",
- [RDT_MODE_PSEUDO_LOCKED] = "pseudo-locked",
-};
-
-/**
- * rdtgroup_mode_str - Return the string representation of mode
- * @mode: the resource group mode as &enum rdtgroup_mode
- *
- * Return: string representation of valid mode, "unknown" otherwise
- */
-static const char *rdtgroup_mode_str(enum rdtgrp_mode mode)
-{
- if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES)
- return "unknown";
-
- return rdt_mode_str[mode];
-}
-/* set uid and gid of rdtgroup dirs and files to that of the creator */
-static int rdtgroup_kn_set_ugid(struct kernfs_node *kn)
-{
- struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
- .ia_uid = current_fsuid(),
- .ia_gid = current_fsgid(), };
-
- if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
- gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
- return 0;
-
- return kernfs_setattr(kn, &iattr);
-}
-
-static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft)
-{
- struct kernfs_node *kn;
- int ret;
-
- kn = __kernfs_create_file(parent_kn, rft->name, rft->mode,
- GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
- 0, rft->kf_ops, rft, NULL, NULL);
- if (IS_ERR(kn))
- return PTR_ERR(kn);
-
- ret = rdtgroup_kn_set_ugid(kn);
- if (ret) {
- kernfs_remove(kn);
- return ret;
- }
-
- return 0;
-}
-
-static int rdtgroup_seqfile_show(struct seq_file *m, void *arg)
-{
- struct kernfs_open_file *of = m->private;
- struct rftype *rft = of->kn->priv;
-
- if (rft->seq_show)
- return rft->seq_show(of, m, arg);
- return 0;
-}
-
-static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf,
- size_t nbytes, loff_t off)
-{
- struct rftype *rft = of->kn->priv;
-
- if (rft->write)
- return rft->write(of, buf, nbytes, off);
-
- return -EINVAL;
-}
-
-static const struct kernfs_ops rdtgroup_kf_single_ops = {
- .atomic_write_len = PAGE_SIZE,
- .write = rdtgroup_file_write,
- .seq_show = rdtgroup_seqfile_show,
-};
-
-static const struct kernfs_ops kf_mondata_ops = {
- .atomic_write_len = PAGE_SIZE,
- .seq_show = rdtgroup_mondata_show,
-};
-
-static bool is_cpu_list(struct kernfs_open_file *of)
-{
- struct rftype *rft = of->kn->priv;
-
- return rft->flags & RFTYPE_FLAGS_CPUS_LIST;
-}
-
-static int rdtgroup_cpus_show(struct kernfs_open_file *of,
- struct seq_file *s, void *v)
-{
- struct rdtgroup *rdtgrp;
- struct cpumask *mask;
- int ret = 0;
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
-
- if (rdtgrp) {
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
- if (!rdtgrp->plr->d) {
- rdt_last_cmd_clear();
- rdt_last_cmd_puts("Cache domain offline\n");
- ret = -ENODEV;
- } else {
- mask = &rdtgrp->plr->d->hdr.cpu_mask;
- seq_printf(s, is_cpu_list(of) ?
- "%*pbl\n" : "%*pb\n",
- cpumask_pr_args(mask));
- }
- } else {
- seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n",
- cpumask_pr_args(&rdtgrp->cpu_mask));
- }
- } else {
- ret = -ENOENT;
- }
- rdtgroup_kn_unlock(of->kn);
+DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key);
- return ret;
-}
+DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
/*
- * This is safe against resctrl_sched_in() called from __switch_to()
+ * This is safe against resctrl_arch_sched_in() called from __switch_to()
* because __switch_to() is executed with interrupts disabled. A local call
* from update_closid_rmid() is protected against __switch_to() because
* preemption is disabled.
@@ -382,1223 +58,7 @@ void resctrl_arch_sync_cpu_closid_rmid(void *info)
* executing task might have its own closid selected. Just reuse
* the context switch code.
*/
- resctrl_sched_in(current);
-}
-
-/*
- * Update the PGR_ASSOC MSR on all cpus in @cpu_mask,
- *
- * Per task closids/rmids must have been set up before calling this function.
- * @r may be NULL.
- */
-static void
-update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r)
-{
- struct resctrl_cpu_defaults defaults, *p = NULL;
-
- if (r) {
- defaults.closid = r->closid;
- defaults.rmid = r->mon.rmid;
- p = &defaults;
- }
-
- on_each_cpu_mask(cpu_mask, resctrl_arch_sync_cpu_closid_rmid, p, 1);
-}
-
-static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
- cpumask_var_t tmpmask)
-{
- struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp;
- struct list_head *head;
-
- /* Check whether cpus belong to parent ctrl group */
- cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask);
- if (!cpumask_empty(tmpmask)) {
- rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n");
- return -EINVAL;
- }
-
- /* Check whether cpus are dropped from this group */
- cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
- if (!cpumask_empty(tmpmask)) {
- /* Give any dropped cpus to parent rdtgroup */
- cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask);
- update_closid_rmid(tmpmask, prgrp);
- }
-
- /*
- * If we added cpus, remove them from previous group that owned them
- * and update per-cpu rmid
- */
- cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
- if (!cpumask_empty(tmpmask)) {
- head = &prgrp->mon.crdtgrp_list;
- list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
- if (crgrp == rdtgrp)
- continue;
- cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask,
- tmpmask);
- }
- update_closid_rmid(tmpmask, rdtgrp);
- }
-
- /* Done pushing/pulling - update this group with new mask */
- cpumask_copy(&rdtgrp->cpu_mask, newmask);
-
- return 0;
-}
-
-static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m)
-{
- struct rdtgroup *crgrp;
-
- cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m);
- /* update the child mon group masks as well*/
- list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list)
- cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask);
-}
-
-static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
- cpumask_var_t tmpmask, cpumask_var_t tmpmask1)
-{
- struct rdtgroup *r, *crgrp;
- struct list_head *head;
-
- /* Check whether cpus are dropped from this group */
- cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
- if (!cpumask_empty(tmpmask)) {
- /* Can't drop from default group */
- if (rdtgrp == &rdtgroup_default) {
- rdt_last_cmd_puts("Can't drop CPUs from default group\n");
- return -EINVAL;
- }
-
- /* Give any dropped cpus to rdtgroup_default */
- cpumask_or(&rdtgroup_default.cpu_mask,
- &rdtgroup_default.cpu_mask, tmpmask);
- update_closid_rmid(tmpmask, &rdtgroup_default);
- }
-
- /*
- * If we added cpus, remove them from previous group and
- * the prev group's child groups that owned them
- * and update per-cpu closid/rmid.
- */
- cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
- if (!cpumask_empty(tmpmask)) {
- list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
- if (r == rdtgrp)
- continue;
- cpumask_and(tmpmask1, &r->cpu_mask, tmpmask);
- if (!cpumask_empty(tmpmask1))
- cpumask_rdtgrp_clear(r, tmpmask1);
- }
- update_closid_rmid(tmpmask, rdtgrp);
- }
-
- /* Done pushing/pulling - update this group with new mask */
- cpumask_copy(&rdtgrp->cpu_mask, newmask);
-
- /*
- * Clear child mon group masks since there is a new parent mask
- * now and update the rmid for the cpus the child lost.
- */
- head = &rdtgrp->mon.crdtgrp_list;
- list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
- cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask);
- update_closid_rmid(tmpmask, rdtgrp);
- cpumask_clear(&crgrp->cpu_mask);
- }
-
- return 0;
-}
-
-static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- cpumask_var_t tmpmask, newmask, tmpmask1;
- struct rdtgroup *rdtgrp;
- int ret;
-
- if (!buf)
- return -EINVAL;
-
- if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
- return -ENOMEM;
- if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) {
- free_cpumask_var(tmpmask);
- return -ENOMEM;
- }
- if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) {
- free_cpumask_var(tmpmask);
- free_cpumask_var(newmask);
- return -ENOMEM;
- }
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (!rdtgrp) {
- ret = -ENOENT;
- goto unlock;
- }
-
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
- rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
- ret = -EINVAL;
- rdt_last_cmd_puts("Pseudo-locking in progress\n");
- goto unlock;
- }
-
- if (is_cpu_list(of))
- ret = cpulist_parse(buf, newmask);
- else
- ret = cpumask_parse(buf, newmask);
-
- if (ret) {
- rdt_last_cmd_puts("Bad CPU list/mask\n");
- goto unlock;
- }
-
- /* check that user didn't specify any offline cpus */
- cpumask_andnot(tmpmask, newmask, cpu_online_mask);
- if (!cpumask_empty(tmpmask)) {
- ret = -EINVAL;
- rdt_last_cmd_puts("Can only assign online CPUs\n");
- goto unlock;
- }
-
- if (rdtgrp->type == RDTCTRL_GROUP)
- ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1);
- else if (rdtgrp->type == RDTMON_GROUP)
- ret = cpus_mon_write(rdtgrp, newmask, tmpmask);
- else
- ret = -EINVAL;
-
-unlock:
- rdtgroup_kn_unlock(of->kn);
- free_cpumask_var(tmpmask);
- free_cpumask_var(newmask);
- free_cpumask_var(tmpmask1);
-
- return ret ?: nbytes;
-}
-
-/**
- * rdtgroup_remove - the helper to remove resource group safely
- * @rdtgrp: resource group to remove
- *
- * On resource group creation via a mkdir, an extra kernfs_node reference is
- * taken to ensure that the rdtgroup structure remains accessible for the
- * rdtgroup_kn_unlock() calls where it is removed.
- *
- * Drop the extra reference here, then free the rdtgroup structure.
- *
- * Return: void
- */
-static void rdtgroup_remove(struct rdtgroup *rdtgrp)
-{
- kernfs_put(rdtgrp->kn);
- kfree(rdtgrp);
-}
-
-static void _update_task_closid_rmid(void *task)
-{
- /*
- * If the task is still current on this CPU, update PQR_ASSOC MSR.
- * Otherwise, the MSR is updated when the task is scheduled in.
- */
- if (task == current)
- resctrl_sched_in(task);
-}
-
-static void update_task_closid_rmid(struct task_struct *t)
-{
- if (IS_ENABLED(CONFIG_SMP) && task_curr(t))
- smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1);
- else
- _update_task_closid_rmid(t);
-}
-
-static bool task_in_rdtgroup(struct task_struct *tsk, struct rdtgroup *rdtgrp)
-{
- u32 closid, rmid = rdtgrp->mon.rmid;
-
- if (rdtgrp->type == RDTCTRL_GROUP)
- closid = rdtgrp->closid;
- else if (rdtgrp->type == RDTMON_GROUP)
- closid = rdtgrp->mon.parent->closid;
- else
- return false;
-
- return resctrl_arch_match_closid(tsk, closid) &&
- resctrl_arch_match_rmid(tsk, closid, rmid);
-}
-
-static int __rdtgroup_move_task(struct task_struct *tsk,
- struct rdtgroup *rdtgrp)
-{
- /* If the task is already in rdtgrp, no need to move the task. */
- if (task_in_rdtgroup(tsk, rdtgrp))
- return 0;
-
- /*
- * Set the task's closid/rmid before the PQR_ASSOC MSR can be
- * updated by them.
- *
- * For ctrl_mon groups, move both closid and rmid.
- * For monitor groups, can move the tasks only from
- * their parent CTRL group.
- */
- if (rdtgrp->type == RDTMON_GROUP &&
- !resctrl_arch_match_closid(tsk, rdtgrp->mon.parent->closid)) {
- rdt_last_cmd_puts("Can't move task to different control group\n");
- return -EINVAL;
- }
-
- if (rdtgrp->type == RDTMON_GROUP)
- resctrl_arch_set_closid_rmid(tsk, rdtgrp->mon.parent->closid,
- rdtgrp->mon.rmid);
- else
- resctrl_arch_set_closid_rmid(tsk, rdtgrp->closid,
- rdtgrp->mon.rmid);
-
- /*
- * Ensure the task's closid and rmid are written before determining if
- * the task is current that will decide if it will be interrupted.
- * This pairs with the full barrier between the rq->curr update and
- * resctrl_sched_in() during context switch.
- */
- smp_mb();
-
- /*
- * By now, the task's closid and rmid are set. If the task is current
- * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource
- * group go into effect. If the task is not current, the MSR will be
- * updated when the task is scheduled in.
- */
- update_task_closid_rmid(tsk);
-
- return 0;
-}
-
-static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
-{
- return (resctrl_arch_alloc_capable() && (r->type == RDTCTRL_GROUP) &&
- resctrl_arch_match_closid(t, r->closid));
-}
-
-static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r)
-{
- return (resctrl_arch_mon_capable() && (r->type == RDTMON_GROUP) &&
- resctrl_arch_match_rmid(t, r->mon.parent->closid,
- r->mon.rmid));
-}
-
-/**
- * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group
- * @r: Resource group
- *
- * Return: 1 if tasks have been assigned to @r, 0 otherwise
- */
-int rdtgroup_tasks_assigned(struct rdtgroup *r)
-{
- struct task_struct *p, *t;
- int ret = 0;
-
- lockdep_assert_held(&rdtgroup_mutex);
-
- rcu_read_lock();
- for_each_process_thread(p, t) {
- if (is_closid_match(t, r) || is_rmid_match(t, r)) {
- ret = 1;
- break;
- }
- }
- rcu_read_unlock();
-
- return ret;
-}
-
-static int rdtgroup_task_write_permission(struct task_struct *task,
- struct kernfs_open_file *of)
-{
- const struct cred *tcred = get_task_cred(task);
- const struct cred *cred = current_cred();
- int ret = 0;
-
- /*
- * Even if we're attaching all tasks in the thread group, we only
- * need to check permissions on one of them.
- */
- if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
- !uid_eq(cred->euid, tcred->uid) &&
- !uid_eq(cred->euid, tcred->suid)) {
- rdt_last_cmd_printf("No permission to move task %d\n", task->pid);
- ret = -EPERM;
- }
-
- put_cred(tcred);
- return ret;
-}
-
-static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp,
- struct kernfs_open_file *of)
-{
- struct task_struct *tsk;
- int ret;
-
- rcu_read_lock();
- if (pid) {
- tsk = find_task_by_vpid(pid);
- if (!tsk) {
- rcu_read_unlock();
- rdt_last_cmd_printf("No task %d\n", pid);
- return -ESRCH;
- }
- } else {
- tsk = current;
- }
-
- get_task_struct(tsk);
- rcu_read_unlock();
-
- ret = rdtgroup_task_write_permission(tsk, of);
- if (!ret)
- ret = __rdtgroup_move_task(tsk, rdtgrp);
-
- put_task_struct(tsk);
- return ret;
-}
-
-static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- struct rdtgroup *rdtgrp;
- char *pid_str;
- int ret = 0;
- pid_t pid;
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (!rdtgrp) {
- rdtgroup_kn_unlock(of->kn);
- return -ENOENT;
- }
- rdt_last_cmd_clear();
-
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
- rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
- ret = -EINVAL;
- rdt_last_cmd_puts("Pseudo-locking in progress\n");
- goto unlock;
- }
-
- while (buf && buf[0] != '\0' && buf[0] != '\n') {
- pid_str = strim(strsep(&buf, ","));
-
- if (kstrtoint(pid_str, 0, &pid)) {
- rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str);
- ret = -EINVAL;
- break;
- }
-
- if (pid < 0) {
- rdt_last_cmd_printf("Invalid pid %d\n", pid);
- ret = -EINVAL;
- break;
- }
-
- ret = rdtgroup_move_task(pid, rdtgrp, of);
- if (ret) {
- rdt_last_cmd_printf("Error while processing task %d\n", pid);
- break;
- }
- }
-
-unlock:
- rdtgroup_kn_unlock(of->kn);
-
- return ret ?: nbytes;
-}
-
-static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s)
-{
- struct task_struct *p, *t;
- pid_t pid;
-
- rcu_read_lock();
- for_each_process_thread(p, t) {
- if (is_closid_match(t, r) || is_rmid_match(t, r)) {
- pid = task_pid_vnr(t);
- if (pid)
- seq_printf(s, "%d\n", pid);
- }
- }
- rcu_read_unlock();
-}
-
-static int rdtgroup_tasks_show(struct kernfs_open_file *of,
- struct seq_file *s, void *v)
-{
- struct rdtgroup *rdtgrp;
- int ret = 0;
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (rdtgrp)
- show_rdt_tasks(rdtgrp, s);
- else
- ret = -ENOENT;
- rdtgroup_kn_unlock(of->kn);
-
- return ret;
-}
-
-static int rdtgroup_closid_show(struct kernfs_open_file *of,
- struct seq_file *s, void *v)
-{
- struct rdtgroup *rdtgrp;
- int ret = 0;
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (rdtgrp)
- seq_printf(s, "%u\n", rdtgrp->closid);
- else
- ret = -ENOENT;
- rdtgroup_kn_unlock(of->kn);
-
- return ret;
-}
-
-static int rdtgroup_rmid_show(struct kernfs_open_file *of,
- struct seq_file *s, void *v)
-{
- struct rdtgroup *rdtgrp;
- int ret = 0;
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (rdtgrp)
- seq_printf(s, "%u\n", rdtgrp->mon.rmid);
- else
- ret = -ENOENT;
- rdtgroup_kn_unlock(of->kn);
-
- return ret;
-}
-
-#ifdef CONFIG_PROC_CPU_RESCTRL
-
-/*
- * A task can only be part of one resctrl control group and of one monitor
- * group which is associated to that control group.
- *
- * 1) res:
- * mon:
- *
- * resctrl is not available.
- *
- * 2) res:/
- * mon:
- *
- * Task is part of the root resctrl control group, and it is not associated
- * to any monitor group.
- *
- * 3) res:/
- * mon:mon0
- *
- * Task is part of the root resctrl control group and monitor group mon0.
- *
- * 4) res:group0
- * mon:
- *
- * Task is part of resctrl control group group0, and it is not associated
- * to any monitor group.
- *
- * 5) res:group0
- * mon:mon1
- *
- * Task is part of resctrl control group group0 and monitor group mon1.
- */
-int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns,
- struct pid *pid, struct task_struct *tsk)
-{
- struct rdtgroup *rdtg;
- int ret = 0;
-
- mutex_lock(&rdtgroup_mutex);
-
- /* Return empty if resctrl has not been mounted. */
- if (!resctrl_mounted) {
- seq_puts(s, "res:\nmon:\n");
- goto unlock;
- }
-
- list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) {
- struct rdtgroup *crg;
-
- /*
- * Task information is only relevant for shareable
- * and exclusive groups.
- */
- if (rdtg->mode != RDT_MODE_SHAREABLE &&
- rdtg->mode != RDT_MODE_EXCLUSIVE)
- continue;
-
- if (!resctrl_arch_match_closid(tsk, rdtg->closid))
- continue;
-
- seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "",
- rdt_kn_name(rdtg->kn));
- seq_puts(s, "mon:");
- list_for_each_entry(crg, &rdtg->mon.crdtgrp_list,
- mon.crdtgrp_list) {
- if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid,
- crg->mon.rmid))
- continue;
- seq_printf(s, "%s", rdt_kn_name(crg->kn));
- break;
- }
- seq_putc(s, '\n');
- goto unlock;
- }
- /*
- * The above search should succeed. Otherwise return
- * with an error.
- */
- ret = -ENOENT;
-unlock:
- mutex_unlock(&rdtgroup_mutex);
-
- return ret;
-}
-#endif
-
-static int rdt_last_cmd_status_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- int len;
-
- mutex_lock(&rdtgroup_mutex);
- len = seq_buf_used(&last_cmd_status);
- if (len)
- seq_printf(seq, "%.*s", len, last_cmd_status_buf);
- else
- seq_puts(seq, "ok\n");
- mutex_unlock(&rdtgroup_mutex);
- return 0;
-}
-
-static void *rdt_kn_parent_priv(struct kernfs_node *kn)
-{
- /*
- * The parent pointer is only valid within RCU section since it can be
- * replaced.
- */
- guard(rcu)();
- return rcu_dereference(kn->__parent)->priv;
-}
-
-static int rdt_num_closids_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
-
- seq_printf(seq, "%u\n", s->num_closid);
- return 0;
-}
-
-static int rdt_default_ctrl_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
- struct rdt_resource *r = s->res;
-
- seq_printf(seq, "%x\n", resctrl_get_default_ctrl(r));
- return 0;
-}
-
-static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
- struct rdt_resource *r = s->res;
-
- seq_printf(seq, "%u\n", r->cache.min_cbm_bits);
- return 0;
-}
-
-static int rdt_shareable_bits_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
- struct rdt_resource *r = s->res;
-
- seq_printf(seq, "%x\n", r->cache.shareable_bits);
- return 0;
-}
-
-/*
- * rdt_bit_usage_show - Display current usage of resources
- *
- * A domain is a shared resource that can now be allocated differently. Here
- * we display the current regions of the domain as an annotated bitmask.
- * For each domain of this resource its allocation bitmask
- * is annotated as below to indicate the current usage of the corresponding bit:
- * 0 - currently unused
- * X - currently available for sharing and used by software and hardware
- * H - currently used by hardware only but available for software use
- * S - currently used and shareable by software only
- * E - currently used exclusively by one resource group
- * P - currently pseudo-locked by one resource group
- */
-static int rdt_bit_usage_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
- /*
- * Use unsigned long even though only 32 bits are used to ensure
- * test_bit() is used safely.
- */
- unsigned long sw_shareable = 0, hw_shareable = 0;
- unsigned long exclusive = 0, pseudo_locked = 0;
- struct rdt_resource *r = s->res;
- struct rdt_ctrl_domain *dom;
- int i, hwb, swb, excl, psl;
- enum rdtgrp_mode mode;
- bool sep = false;
- u32 ctrl_val;
-
- cpus_read_lock();
- mutex_lock(&rdtgroup_mutex);
- hw_shareable = r->cache.shareable_bits;
- list_for_each_entry(dom, &r->ctrl_domains, hdr.list) {
- if (sep)
- seq_putc(seq, ';');
- sw_shareable = 0;
- exclusive = 0;
- seq_printf(seq, "%d=", dom->hdr.id);
- for (i = 0; i < closids_supported(); i++) {
- if (!closid_allocated(i))
- continue;
- ctrl_val = resctrl_arch_get_config(r, dom, i,
- s->conf_type);
- mode = rdtgroup_mode_by_closid(i);
- switch (mode) {
- case RDT_MODE_SHAREABLE:
- sw_shareable |= ctrl_val;
- break;
- case RDT_MODE_EXCLUSIVE:
- exclusive |= ctrl_val;
- break;
- case RDT_MODE_PSEUDO_LOCKSETUP:
- /*
- * RDT_MODE_PSEUDO_LOCKSETUP is possible
- * here but not included since the CBM
- * associated with this CLOSID in this mode
- * is not initialized and no task or cpu can be
- * assigned this CLOSID.
- */
- break;
- case RDT_MODE_PSEUDO_LOCKED:
- case RDT_NUM_MODES:
- WARN(1,
- "invalid mode for closid %d\n", i);
- break;
- }
- }
- for (i = r->cache.cbm_len - 1; i >= 0; i--) {
- pseudo_locked = dom->plr ? dom->plr->cbm : 0;
- hwb = test_bit(i, &hw_shareable);
- swb = test_bit(i, &sw_shareable);
- excl = test_bit(i, &exclusive);
- psl = test_bit(i, &pseudo_locked);
- if (hwb && swb)
- seq_putc(seq, 'X');
- else if (hwb && !swb)
- seq_putc(seq, 'H');
- else if (!hwb && swb)
- seq_putc(seq, 'S');
- else if (excl)
- seq_putc(seq, 'E');
- else if (psl)
- seq_putc(seq, 'P');
- else /* Unused bits remain */
- seq_putc(seq, '0');
- }
- sep = true;
- }
- seq_putc(seq, '\n');
- mutex_unlock(&rdtgroup_mutex);
- cpus_read_unlock();
- return 0;
-}
-
-static int rdt_min_bw_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
- struct rdt_resource *r = s->res;
-
- seq_printf(seq, "%u\n", r->membw.min_bw);
- return 0;
-}
-
-static int rdt_num_rmids_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
-
- seq_printf(seq, "%d\n", r->num_rmid);
-
- return 0;
-}
-
-static int rdt_mon_features_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
- struct mon_evt *mevt;
-
- list_for_each_entry(mevt, &r->evt_list, list) {
- seq_printf(seq, "%s\n", mevt->name);
- if (mevt->configurable)
- seq_printf(seq, "%s_config\n", mevt->name);
- }
-
- return 0;
-}
-
-static int rdt_bw_gran_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
- struct rdt_resource *r = s->res;
-
- seq_printf(seq, "%u\n", r->membw.bw_gran);
- return 0;
-}
-
-static int rdt_delay_linear_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
- struct rdt_resource *r = s->res;
-
- seq_printf(seq, "%u\n", r->membw.delay_linear);
- return 0;
-}
-
-static int max_threshold_occ_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold);
-
- return 0;
-}
-
-static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
- struct rdt_resource *r = s->res;
-
- switch (r->membw.throttle_mode) {
- case THREAD_THROTTLE_PER_THREAD:
- seq_puts(seq, "per-thread\n");
- return 0;
- case THREAD_THROTTLE_MAX:
- seq_puts(seq, "max\n");
- return 0;
- case THREAD_THROTTLE_UNDEFINED:
- seq_puts(seq, "undefined\n");
- return 0;
- }
-
- WARN_ON_ONCE(1);
-
- return 0;
-}
-
-static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- unsigned int bytes;
- int ret;
-
- ret = kstrtouint(buf, 0, &bytes);
- if (ret)
- return ret;
-
- if (bytes > resctrl_rmid_realloc_limit)
- return -EINVAL;
-
- resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(bytes);
-
- return nbytes;
-}
-
-/*
- * rdtgroup_mode_show - Display mode of this resource group
- */
-static int rdtgroup_mode_show(struct kernfs_open_file *of,
- struct seq_file *s, void *v)
-{
- struct rdtgroup *rdtgrp;
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (!rdtgrp) {
- rdtgroup_kn_unlock(of->kn);
- return -ENOENT;
- }
-
- seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode));
-
- rdtgroup_kn_unlock(of->kn);
- return 0;
-}
-
-static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type)
-{
- switch (my_type) {
- case CDP_CODE:
- return CDP_DATA;
- case CDP_DATA:
- return CDP_CODE;
- default:
- case CDP_NONE:
- return CDP_NONE;
- }
-}
-
-static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
- struct rdt_resource *r = s->res;
-
- seq_printf(seq, "%u\n", r->cache.arch_has_sparse_bitmasks);
-
- return 0;
-}
-
-/**
- * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other
- * @r: Resource to which domain instance @d belongs.
- * @d: The domain instance for which @closid is being tested.
- * @cbm: Capacity bitmask being tested.
- * @closid: Intended closid for @cbm.
- * @type: CDP type of @r.
- * @exclusive: Only check if overlaps with exclusive resource groups
- *
- * Checks if provided @cbm intended to be used for @closid on domain
- * @d overlaps with any other closids or other hardware usage associated
- * with this domain. If @exclusive is true then only overlaps with
- * resource groups in exclusive mode will be considered. If @exclusive
- * is false then overlaps with any resource group or hardware entities
- * will be considered.
- *
- * @cbm is unsigned long, even if only 32 bits are used, to make the
- * bitmap functions work correctly.
- *
- * Return: false if CBM does not overlap, true if it does.
- */
-static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_ctrl_domain *d,
- unsigned long cbm, int closid,
- enum resctrl_conf_type type, bool exclusive)
-{
- enum rdtgrp_mode mode;
- unsigned long ctrl_b;
- int i;
-
- /* Check for any overlap with regions used by hardware directly */
- if (!exclusive) {
- ctrl_b = r->cache.shareable_bits;
- if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len))
- return true;
- }
-
- /* Check for overlap with other resource groups */
- for (i = 0; i < closids_supported(); i++) {
- ctrl_b = resctrl_arch_get_config(r, d, i, type);
- mode = rdtgroup_mode_by_closid(i);
- if (closid_allocated(i) && i != closid &&
- mode != RDT_MODE_PSEUDO_LOCKSETUP) {
- if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) {
- if (exclusive) {
- if (mode == RDT_MODE_EXCLUSIVE)
- return true;
- continue;
- }
- return true;
- }
- }
- }
-
- return false;
-}
-
-/**
- * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware
- * @s: Schema for the resource to which domain instance @d belongs.
- * @d: The domain instance for which @closid is being tested.
- * @cbm: Capacity bitmask being tested.
- * @closid: Intended closid for @cbm.
- * @exclusive: Only check if overlaps with exclusive resource groups
- *
- * Resources that can be allocated using a CBM can use the CBM to control
- * the overlap of these allocations. rdtgroup_cmb_overlaps() is the test
- * for overlap. Overlap test is not limited to the specific resource for
- * which the CBM is intended though - when dealing with CDP resources that
- * share the underlying hardware the overlap check should be performed on
- * the CDP resource sharing the hardware also.
- *
- * Refer to description of __rdtgroup_cbm_overlaps() for the details of the
- * overlap test.
- *
- * Return: true if CBM overlap detected, false if there is no overlap
- */
-bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d,
- unsigned long cbm, int closid, bool exclusive)
-{
- enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type);
- struct rdt_resource *r = s->res;
-
- if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, s->conf_type,
- exclusive))
- return true;
-
- if (!resctrl_arch_get_cdp_enabled(r->rid))
- return false;
- return __rdtgroup_cbm_overlaps(r, d, cbm, closid, peer_type, exclusive);
-}
-
-/**
- * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive
- * @rdtgrp: Resource group identified through its closid.
- *
- * An exclusive resource group implies that there should be no sharing of
- * its allocated resources. At the time this group is considered to be
- * exclusive this test can determine if its current schemata supports this
- * setting by testing for overlap with all other resource groups.
- *
- * Return: true if resource group can be exclusive, false if there is overlap
- * with allocations of other resource groups and thus this resource group
- * cannot be exclusive.
- */
-static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
-{
- int closid = rdtgrp->closid;
- struct rdt_ctrl_domain *d;
- struct resctrl_schema *s;
- struct rdt_resource *r;
- bool has_cache = false;
- u32 ctrl;
-
- /* Walking r->domains, ensure it can't race with cpuhp */
- lockdep_assert_cpus_held();
-
- list_for_each_entry(s, &resctrl_schema_all, list) {
- r = s->res;
- if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)
- continue;
- has_cache = true;
- list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
- ctrl = resctrl_arch_get_config(r, d, closid,
- s->conf_type);
- if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) {
- rdt_last_cmd_puts("Schemata overlaps\n");
- return false;
- }
- }
- }
-
- if (!has_cache) {
- rdt_last_cmd_puts("Cannot be exclusive without CAT/CDP\n");
- return false;
- }
-
- return true;
-}
-
-/*
- * rdtgroup_mode_write - Modify the resource group's mode
- */
-static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- struct rdtgroup *rdtgrp;
- enum rdtgrp_mode mode;
- int ret = 0;
-
- /* Valid input requires a trailing newline */
- if (nbytes == 0 || buf[nbytes - 1] != '\n')
- return -EINVAL;
- buf[nbytes - 1] = '\0';
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (!rdtgrp) {
- rdtgroup_kn_unlock(of->kn);
- return -ENOENT;
- }
-
- rdt_last_cmd_clear();
-
- mode = rdtgrp->mode;
-
- if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) ||
- (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) ||
- (!strcmp(buf, "pseudo-locksetup") &&
- mode == RDT_MODE_PSEUDO_LOCKSETUP) ||
- (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED))
- goto out;
-
- if (mode == RDT_MODE_PSEUDO_LOCKED) {
- rdt_last_cmd_puts("Cannot change pseudo-locked group\n");
- ret = -EINVAL;
- goto out;
- }
-
- if (!strcmp(buf, "shareable")) {
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
- ret = rdtgroup_locksetup_exit(rdtgrp);
- if (ret)
- goto out;
- }
- rdtgrp->mode = RDT_MODE_SHAREABLE;
- } else if (!strcmp(buf, "exclusive")) {
- if (!rdtgroup_mode_test_exclusive(rdtgrp)) {
- ret = -EINVAL;
- goto out;
- }
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
- ret = rdtgroup_locksetup_exit(rdtgrp);
- if (ret)
- goto out;
- }
- rdtgrp->mode = RDT_MODE_EXCLUSIVE;
- } else if (IS_ENABLED(CONFIG_RESCTRL_FS_PSEUDO_LOCK) &&
- !strcmp(buf, "pseudo-locksetup")) {
- ret = rdtgroup_locksetup_enter(rdtgrp);
- if (ret)
- goto out;
- rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP;
- } else {
- rdt_last_cmd_puts("Unknown or unsupported mode\n");
- ret = -EINVAL;
- }
-
-out:
- rdtgroup_kn_unlock(of->kn);
- return ret ?: nbytes;
-}
-
-/**
- * rdtgroup_cbm_to_size - Translate CBM to size in bytes
- * @r: RDT resource to which @d belongs.
- * @d: RDT domain instance.
- * @cbm: bitmask for which the size should be computed.
- *
- * The bitmask provided associated with the RDT domain instance @d will be
- * translated into how many bytes it represents. The size in bytes is
- * computed by first dividing the total cache size by the CBM length to
- * determine how many bytes each bit in the bitmask represents. The result
- * is multiplied with the number of bits set in the bitmask.
- *
- * @cbm is unsigned long, even if only 32 bits are used to make the
- * bitmap functions work correctly.
- */
-unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
- struct rdt_ctrl_domain *d, unsigned long cbm)
-{
- unsigned int size = 0;
- struct cacheinfo *ci;
- int num_b;
-
- if (WARN_ON_ONCE(r->ctrl_scope != RESCTRL_L2_CACHE && r->ctrl_scope != RESCTRL_L3_CACHE))
- return size;
-
- num_b = bitmap_weight(&cbm, r->cache.cbm_len);
- ci = get_cpu_cacheinfo_level(cpumask_any(&d->hdr.cpu_mask), r->ctrl_scope);
- if (ci)
- size = ci->size / r->cache.cbm_len * num_b;
-
- return size;
-}
-
-/*
- * rdtgroup_size_show - Display size in bytes of allocated regions
- *
- * The "size" file mirrors the layout of the "schemata" file, printing the
- * size in bytes of each region instead of the capacity bitmask.
- */
-static int rdtgroup_size_show(struct kernfs_open_file *of,
- struct seq_file *s, void *v)
-{
- struct resctrl_schema *schema;
- enum resctrl_conf_type type;
- struct rdt_ctrl_domain *d;
- struct rdtgroup *rdtgrp;
- struct rdt_resource *r;
- unsigned int size;
- int ret = 0;
- u32 closid;
- bool sep;
- u32 ctrl;
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (!rdtgrp) {
- rdtgroup_kn_unlock(of->kn);
- return -ENOENT;
- }
-
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
- if (!rdtgrp->plr->d) {
- rdt_last_cmd_clear();
- rdt_last_cmd_puts("Cache domain offline\n");
- ret = -ENODEV;
- } else {
- seq_printf(s, "%*s:", max_name_width,
- rdtgrp->plr->s->name);
- size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res,
- rdtgrp->plr->d,
- rdtgrp->plr->cbm);
- seq_printf(s, "%d=%u\n", rdtgrp->plr->d->hdr.id, size);
- }
- goto out;
- }
-
- closid = rdtgrp->closid;
-
- list_for_each_entry(schema, &resctrl_schema_all, list) {
- r = schema->res;
- type = schema->conf_type;
- sep = false;
- seq_printf(s, "%*s:", max_name_width, schema->name);
- list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
- if (sep)
- seq_putc(s, ';');
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
- size = 0;
- } else {
- if (is_mba_sc(r))
- ctrl = d->mbps_val[closid];
- else
- ctrl = resctrl_arch_get_config(r, d,
- closid,
- type);
- if (r->rid == RDT_RESOURCE_MBA ||
- r->rid == RDT_RESOURCE_SMBA)
- size = ctrl;
- else
- size = rdtgroup_cbm_to_size(r, d, ctrl);
- }
- seq_printf(s, "%d=%u", d->hdr.id, size);
- sep = true;
- }
- seq_putc(s, '\n');
- }
-
-out:
- rdtgroup_kn_unlock(of->kn);
-
- return ret;
+ resctrl_arch_sched_in(current);
}
#define INVALID_CONFIG_INDEX UINT_MAX
@@ -1642,62 +102,6 @@ void resctrl_arch_mon_event_config_read(void *_config_info)
config_info->mon_config = msrval & MAX_EVT_CONFIG_BITS;
}
-static void mondata_config_read(struct resctrl_mon_config_info *mon_info)
-{
- smp_call_function_any(&mon_info->d->hdr.cpu_mask,
- resctrl_arch_mon_event_config_read, mon_info, 1);
-}
-
-static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid)
-{
- struct resctrl_mon_config_info mon_info;
- struct rdt_mon_domain *dom;
- bool sep = false;
-
- cpus_read_lock();
- mutex_lock(&rdtgroup_mutex);
-
- list_for_each_entry(dom, &r->mon_domains, hdr.list) {
- if (sep)
- seq_puts(s, ";");
-
- memset(&mon_info, 0, sizeof(struct resctrl_mon_config_info));
- mon_info.r = r;
- mon_info.d = dom;
- mon_info.evtid = evtid;
- mondata_config_read(&mon_info);
-
- seq_printf(s, "%d=0x%02x", dom->hdr.id, mon_info.mon_config);
- sep = true;
- }
- seq_puts(s, "\n");
-
- mutex_unlock(&rdtgroup_mutex);
- cpus_read_unlock();
-
- return 0;
-}
-
-static int mbm_total_bytes_config_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
-
- mbm_config_show(seq, r, QOS_L3_MBM_TOTAL_EVENT_ID);
-
- return 0;
-}
-
-static int mbm_local_bytes_config_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
-
- mbm_config_show(seq, r, QOS_L3_MBM_LOCAL_EVENT_ID);
-
- return 0;
-}
-
void resctrl_arch_mon_event_config_write(void *_config_info)
{
struct resctrl_mon_config_info *config_info = _config_info;
@@ -1711,618 +115,6 @@ void resctrl_arch_mon_event_config_write(void *_config_info)
wrmsrq(MSR_IA32_EVT_CFG_BASE + index, config_info->mon_config);
}
-static void mbm_config_write_domain(struct rdt_resource *r,
- struct rdt_mon_domain *d, u32 evtid, u32 val)
-{
- struct resctrl_mon_config_info mon_info = {0};
-
- /*
- * Read the current config value first. If both are the same then
- * no need to write it again.
- */
- mon_info.r = r;
- mon_info.d = d;
- mon_info.evtid = evtid;
- mondata_config_read(&mon_info);
- if (mon_info.mon_config == val)
- return;
-
- mon_info.mon_config = val;
-
- /*
- * Update MSR_IA32_EVT_CFG_BASE MSR on one of the CPUs in the
- * domain. The MSRs offset from MSR MSR_IA32_EVT_CFG_BASE
- * are scoped at the domain level. Writing any of these MSRs
- * on one CPU is observed by all the CPUs in the domain.
- */
- smp_call_function_any(&d->hdr.cpu_mask, resctrl_arch_mon_event_config_write,
- &mon_info, 1);
-
- /*
- * When an Event Configuration is changed, the bandwidth counters
- * for all RMIDs and Events will be cleared by the hardware. The
- * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for
- * every RMID on the next read to any event for every RMID.
- * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62)
- * cleared while it is tracked by the hardware. Clear the
- * mbm_local and mbm_total counts for all the RMIDs.
- */
- resctrl_arch_reset_rmid_all(r, d);
-}
-
-static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid)
-{
- char *dom_str = NULL, *id_str;
- unsigned long dom_id, val;
- struct rdt_mon_domain *d;
-
- /* Walking r->domains, ensure it can't race with cpuhp */
- lockdep_assert_cpus_held();
-
-next:
- if (!tok || tok[0] == '\0')
- return 0;
-
- /* Start processing the strings for each domain */
- dom_str = strim(strsep(&tok, ";"));
- id_str = strsep(&dom_str, "=");
-
- if (!id_str || kstrtoul(id_str, 10, &dom_id)) {
- rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n");
- return -EINVAL;
- }
-
- if (!dom_str || kstrtoul(dom_str, 16, &val)) {
- rdt_last_cmd_puts("Non-numeric event configuration value\n");
- return -EINVAL;
- }
-
- /* Value from user cannot be more than the supported set of events */
- if ((val & r->mbm_cfg_mask) != val) {
- rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n",
- r->mbm_cfg_mask);
- return -EINVAL;
- }
-
- list_for_each_entry(d, &r->mon_domains, hdr.list) {
- if (d->hdr.id == dom_id) {
- mbm_config_write_domain(r, d, evtid, val);
- goto next;
- }
- }
-
- return -EINVAL;
-}
-
-static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes,
- loff_t off)
-{
- struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
- int ret;
-
- /* Valid input requires a trailing newline */
- if (nbytes == 0 || buf[nbytes - 1] != '\n')
- return -EINVAL;
-
- cpus_read_lock();
- mutex_lock(&rdtgroup_mutex);
-
- rdt_last_cmd_clear();
-
- buf[nbytes - 1] = '\0';
-
- ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID);
-
- mutex_unlock(&rdtgroup_mutex);
- cpus_read_unlock();
-
- return ret ?: nbytes;
-}
-
-static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes,
- loff_t off)
-{
- struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
- int ret;
-
- /* Valid input requires a trailing newline */
- if (nbytes == 0 || buf[nbytes - 1] != '\n')
- return -EINVAL;
-
- cpus_read_lock();
- mutex_lock(&rdtgroup_mutex);
-
- rdt_last_cmd_clear();
-
- buf[nbytes - 1] = '\0';
-
- ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID);
-
- mutex_unlock(&rdtgroup_mutex);
- cpus_read_unlock();
-
- return ret ?: nbytes;
-}
-
-/* rdtgroup information files for one cache resource. */
-static struct rftype res_common_files[] = {
- {
- .name = "last_cmd_status",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_last_cmd_status_show,
- .fflags = RFTYPE_TOP_INFO,
- },
- {
- .name = "num_closids",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_num_closids_show,
- .fflags = RFTYPE_CTRL_INFO,
- },
- {
- .name = "mon_features",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_mon_features_show,
- .fflags = RFTYPE_MON_INFO,
- },
- {
- .name = "num_rmids",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_num_rmids_show,
- .fflags = RFTYPE_MON_INFO,
- },
- {
- .name = "cbm_mask",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_default_ctrl_show,
- .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
- },
- {
- .name = "min_cbm_bits",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_min_cbm_bits_show,
- .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
- },
- {
- .name = "shareable_bits",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_shareable_bits_show,
- .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
- },
- {
- .name = "bit_usage",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_bit_usage_show,
- .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
- },
- {
- .name = "min_bandwidth",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_min_bw_show,
- .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
- },
- {
- .name = "bandwidth_gran",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_bw_gran_show,
- .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
- },
- {
- .name = "delay_linear",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_delay_linear_show,
- .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
- },
- /*
- * Platform specific which (if any) capabilities are provided by
- * thread_throttle_mode. Defer "fflags" initialization to platform
- * discovery.
- */
- {
- .name = "thread_throttle_mode",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_thread_throttle_mode_show,
- },
- {
- .name = "max_threshold_occupancy",
- .mode = 0644,
- .kf_ops = &rdtgroup_kf_single_ops,
- .write = max_threshold_occ_write,
- .seq_show = max_threshold_occ_show,
- .fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE,
- },
- {
- .name = "mbm_total_bytes_config",
- .mode = 0644,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = mbm_total_bytes_config_show,
- .write = mbm_total_bytes_config_write,
- },
- {
- .name = "mbm_local_bytes_config",
- .mode = 0644,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = mbm_local_bytes_config_show,
- .write = mbm_local_bytes_config_write,
- },
- {
- .name = "cpus",
- .mode = 0644,
- .kf_ops = &rdtgroup_kf_single_ops,
- .write = rdtgroup_cpus_write,
- .seq_show = rdtgroup_cpus_show,
- .fflags = RFTYPE_BASE,
- },
- {
- .name = "cpus_list",
- .mode = 0644,
- .kf_ops = &rdtgroup_kf_single_ops,
- .write = rdtgroup_cpus_write,
- .seq_show = rdtgroup_cpus_show,
- .flags = RFTYPE_FLAGS_CPUS_LIST,
- .fflags = RFTYPE_BASE,
- },
- {
- .name = "tasks",
- .mode = 0644,
- .kf_ops = &rdtgroup_kf_single_ops,
- .write = rdtgroup_tasks_write,
- .seq_show = rdtgroup_tasks_show,
- .fflags = RFTYPE_BASE,
- },
- {
- .name = "mon_hw_id",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdtgroup_rmid_show,
- .fflags = RFTYPE_MON_BASE | RFTYPE_DEBUG,
- },
- {
- .name = "schemata",
- .mode = 0644,
- .kf_ops = &rdtgroup_kf_single_ops,
- .write = rdtgroup_schemata_write,
- .seq_show = rdtgroup_schemata_show,
- .fflags = RFTYPE_CTRL_BASE,
- },
- {
- .name = "mba_MBps_event",
- .mode = 0644,
- .kf_ops = &rdtgroup_kf_single_ops,
- .write = rdtgroup_mba_mbps_event_write,
- .seq_show = rdtgroup_mba_mbps_event_show,
- },
- {
- .name = "mode",
- .mode = 0644,
- .kf_ops = &rdtgroup_kf_single_ops,
- .write = rdtgroup_mode_write,
- .seq_show = rdtgroup_mode_show,
- .fflags = RFTYPE_CTRL_BASE,
- },
- {
- .name = "size",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdtgroup_size_show,
- .fflags = RFTYPE_CTRL_BASE,
- },
- {
- .name = "sparse_masks",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_has_sparse_bitmasks_show,
- .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
- },
- {
- .name = "ctrl_hw_id",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdtgroup_closid_show,
- .fflags = RFTYPE_CTRL_BASE | RFTYPE_DEBUG,
- },
-
-};
-
-static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags)
-{
- struct rftype *rfts, *rft;
- int ret, len;
-
- rfts = res_common_files;
- len = ARRAY_SIZE(res_common_files);
-
- lockdep_assert_held(&rdtgroup_mutex);
-
- if (resctrl_debug)
- fflags |= RFTYPE_DEBUG;
-
- for (rft = rfts; rft < rfts + len; rft++) {
- if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) {
- ret = rdtgroup_add_file(kn, rft);
- if (ret)
- goto error;
- }
- }
-
- return 0;
-error:
- pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
- while (--rft >= rfts) {
- if ((fflags & rft->fflags) == rft->fflags)
- kernfs_remove_by_name(kn, rft->name);
- }
- return ret;
-}
-
-static struct rftype *rdtgroup_get_rftype_by_name(const char *name)
-{
- struct rftype *rfts, *rft;
- int len;
-
- rfts = res_common_files;
- len = ARRAY_SIZE(res_common_files);
-
- for (rft = rfts; rft < rfts + len; rft++) {
- if (!strcmp(rft->name, name))
- return rft;
- }
-
- return NULL;
-}
-
-static void thread_throttle_mode_init(void)
-{
- enum membw_throttle_mode throttle_mode = THREAD_THROTTLE_UNDEFINED;
- struct rdt_resource *r_mba, *r_smba;
-
- r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
- if (r_mba->alloc_capable &&
- r_mba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED)
- throttle_mode = r_mba->membw.throttle_mode;
-
- r_smba = resctrl_arch_get_resource(RDT_RESOURCE_SMBA);
- if (r_smba->alloc_capable &&
- r_smba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED)
- throttle_mode = r_smba->membw.throttle_mode;
-
- if (throttle_mode == THREAD_THROTTLE_UNDEFINED)
- return;
-
- resctrl_file_fflags_init("thread_throttle_mode",
- RFTYPE_CTRL_INFO | RFTYPE_RES_MB);
-}
-
-void resctrl_file_fflags_init(const char *config, unsigned long fflags)
-{
- struct rftype *rft;
-
- rft = rdtgroup_get_rftype_by_name(config);
- if (rft)
- rft->fflags = fflags;
-}
-
-/**
- * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file
- * @r: The resource group with which the file is associated.
- * @name: Name of the file
- *
- * The permissions of named resctrl file, directory, or link are modified
- * to not allow read, write, or execute by any user.
- *
- * WARNING: This function is intended to communicate to the user that the
- * resctrl file has been locked down - that it is not relevant to the
- * particular state the system finds itself in. It should not be relied
- * on to protect from user access because after the file's permissions
- * are restricted the user can still change the permissions using chmod
- * from the command line.
- *
- * Return: 0 on success, <0 on failure.
- */
-int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name)
-{
- struct iattr iattr = {.ia_valid = ATTR_MODE,};
- struct kernfs_node *kn;
- int ret = 0;
-
- kn = kernfs_find_and_get_ns(r->kn, name, NULL);
- if (!kn)
- return -ENOENT;
-
- switch (kernfs_type(kn)) {
- case KERNFS_DIR:
- iattr.ia_mode = S_IFDIR;
- break;
- case KERNFS_FILE:
- iattr.ia_mode = S_IFREG;
- break;
- case KERNFS_LINK:
- iattr.ia_mode = S_IFLNK;
- break;
- }
-
- ret = kernfs_setattr(kn, &iattr);
- kernfs_put(kn);
- return ret;
-}
-
-/**
- * rdtgroup_kn_mode_restore - Restore user access to named resctrl file
- * @r: The resource group with which the file is associated.
- * @name: Name of the file
- * @mask: Mask of permissions that should be restored
- *
- * Restore the permissions of the named file. If @name is a directory the
- * permissions of its parent will be used.
- *
- * Return: 0 on success, <0 on failure.
- */
-int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
- umode_t mask)
-{
- struct iattr iattr = {.ia_valid = ATTR_MODE,};
- struct kernfs_node *kn, *parent;
- struct rftype *rfts, *rft;
- int ret, len;
-
- rfts = res_common_files;
- len = ARRAY_SIZE(res_common_files);
-
- for (rft = rfts; rft < rfts + len; rft++) {
- if (!strcmp(rft->name, name))
- iattr.ia_mode = rft->mode & mask;
- }
-
- kn = kernfs_find_and_get_ns(r->kn, name, NULL);
- if (!kn)
- return -ENOENT;
-
- switch (kernfs_type(kn)) {
- case KERNFS_DIR:
- parent = kernfs_get_parent(kn);
- if (parent) {
- iattr.ia_mode |= parent->mode;
- kernfs_put(parent);
- }
- iattr.ia_mode |= S_IFDIR;
- break;
- case KERNFS_FILE:
- iattr.ia_mode |= S_IFREG;
- break;
- case KERNFS_LINK:
- iattr.ia_mode |= S_IFLNK;
- break;
- }
-
- ret = kernfs_setattr(kn, &iattr);
- kernfs_put(kn);
- return ret;
-}
-
-static int rdtgroup_mkdir_info_resdir(void *priv, char *name,
- unsigned long fflags)
-{
- struct kernfs_node *kn_subdir;
- int ret;
-
- kn_subdir = kernfs_create_dir(kn_info, name,
- kn_info->mode, priv);
- if (IS_ERR(kn_subdir))
- return PTR_ERR(kn_subdir);
-
- ret = rdtgroup_kn_set_ugid(kn_subdir);
- if (ret)
- return ret;
-
- ret = rdtgroup_add_files(kn_subdir, fflags);
- if (!ret)
- kernfs_activate(kn_subdir);
-
- return ret;
-}
-
-static unsigned long fflags_from_resource(struct rdt_resource *r)
-{
- switch (r->rid) {
- case RDT_RESOURCE_L3:
- case RDT_RESOURCE_L2:
- return RFTYPE_RES_CACHE;
- case RDT_RESOURCE_MBA:
- case RDT_RESOURCE_SMBA:
- return RFTYPE_RES_MB;
- }
-
- return WARN_ON_ONCE(1);
-}
-
-static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
-{
- struct resctrl_schema *s;
- struct rdt_resource *r;
- unsigned long fflags;
- char name[32];
- int ret;
-
- /* create the directory */
- kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
- if (IS_ERR(kn_info))
- return PTR_ERR(kn_info);
-
- ret = rdtgroup_add_files(kn_info, RFTYPE_TOP_INFO);
- if (ret)
- goto out_destroy;
-
- /* loop over enabled controls, these are all alloc_capable */
- list_for_each_entry(s, &resctrl_schema_all, list) {
- r = s->res;
- fflags = fflags_from_resource(r) | RFTYPE_CTRL_INFO;
- ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags);
- if (ret)
- goto out_destroy;
- }
-
- for_each_mon_capable_rdt_resource(r) {
- fflags = fflags_from_resource(r) | RFTYPE_MON_INFO;
- sprintf(name, "%s_MON", r->name);
- ret = rdtgroup_mkdir_info_resdir(r, name, fflags);
- if (ret)
- goto out_destroy;
- }
-
- ret = rdtgroup_kn_set_ugid(kn_info);
- if (ret)
- goto out_destroy;
-
- kernfs_activate(kn_info);
-
- return 0;
-
-out_destroy:
- kernfs_remove(kn_info);
- return ret;
-}
-
-static int
-mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp,
- char *name, struct kernfs_node **dest_kn)
-{
- struct kernfs_node *kn;
- int ret;
-
- /* create the directory */
- kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
- if (IS_ERR(kn))
- return PTR_ERR(kn);
-
- if (dest_kn)
- *dest_kn = kn;
-
- ret = rdtgroup_kn_set_ugid(kn);
- if (ret)
- goto out_destroy;
-
- kernfs_activate(kn);
-
- return 0;
-
-out_destroy:
- kernfs_remove(kn);
- return ret;
-}
-
static void l3_qos_cfg_update(void *arg)
{
bool *enable = arg;
@@ -2337,11 +129,6 @@ static void l2_qos_cfg_update(void *arg)
wrmsrq(MSR_IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL);
}
-static inline bool is_mba_linear(void)
-{
- return resctrl_arch_get_resource(RDT_RESOURCE_MBA)->membw.delay_linear;
-}
-
static int set_cache_qos_cfg(int level, bool enable)
{
void (*update)(void *arg);
@@ -2397,76 +184,6 @@ void rdt_domain_reconfigure_cdp(struct rdt_resource *r)
l3_qos_cfg_update(&hw_res->cdp_enabled);
}
-static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_ctrl_domain *d)
-{
- u32 num_closid = resctrl_arch_get_num_closid(r);
- int cpu = cpumask_any(&d->hdr.cpu_mask);
- int i;
-
- d->mbps_val = kcalloc_node(num_closid, sizeof(*d->mbps_val),
- GFP_KERNEL, cpu_to_node(cpu));
- if (!d->mbps_val)
- return -ENOMEM;
-
- for (i = 0; i < num_closid; i++)
- d->mbps_val[i] = MBA_MAX_MBPS;
-
- return 0;
-}
-
-static void mba_sc_domain_destroy(struct rdt_resource *r,
- struct rdt_ctrl_domain *d)
-{
- kfree(d->mbps_val);
- d->mbps_val = NULL;
-}
-
-/*
- * MBA software controller is supported only if
- * MBM is supported and MBA is in linear scale,
- * and the MBM monitor scope is the same as MBA
- * control scope.
- */
-static bool supports_mba_mbps(void)
-{
- struct rdt_resource *rmbm = resctrl_arch_get_resource(RDT_RESOURCE_L3);
- struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
-
- return (resctrl_is_mbm_enabled() &&
- r->alloc_capable && is_mba_linear() &&
- r->ctrl_scope == rmbm->mon_scope);
-}
-
-/*
- * Enable or disable the MBA software controller
- * which helps user specify bandwidth in MBps.
- */
-static int set_mba_sc(bool mba_sc)
-{
- struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
- u32 num_closid = resctrl_arch_get_num_closid(r);
- struct rdt_ctrl_domain *d;
- unsigned long fflags;
- int i;
-
- if (!supports_mba_mbps() || mba_sc == is_mba_sc(r))
- return -EINVAL;
-
- r->membw.mba_sc = mba_sc;
-
- rdtgroup_default.mba_mbps_event = mba_mbps_default_event;
-
- list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
- for (i = 0; i < num_closid; i++)
- d->mbps_val[i] = MBA_MAX_MBPS;
- }
-
- fflags = mba_sc ? RFTYPE_CTRL_BASE | RFTYPE_MON_BASE : 0;
- resctrl_file_fflags_init("mba_MBps_event", fflags);
-
- return 0;
-}
-
static int cdp_enable(int level)
{
struct rdt_resource *r_l = &rdt_resources_all[level].r_resctrl;
@@ -2507,419 +224,9 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable)
return 0;
}
-/*
- * We don't allow rdtgroup directories to be created anywhere
- * except the root directory. Thus when looking for the rdtgroup
- * structure for a kernfs node we are either looking at a directory,
- * in which case the rdtgroup structure is pointed at by the "priv"
- * field, otherwise we have a file, and need only look to the parent
- * to find the rdtgroup.
- */
-static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn)
-{
- if (kernfs_type(kn) == KERNFS_DIR) {
- /*
- * All the resource directories use "kn->priv"
- * to point to the "struct rdtgroup" for the
- * resource. "info" and its subdirectories don't
- * have rdtgroup structures, so return NULL here.
- */
- if (kn == kn_info ||
- rcu_access_pointer(kn->__parent) == kn_info)
- return NULL;
- else
- return kn->priv;
- } else {
- return rdt_kn_parent_priv(kn);
- }
-}
-
-static void rdtgroup_kn_get(struct rdtgroup *rdtgrp, struct kernfs_node *kn)
-{
- atomic_inc(&rdtgrp->waitcount);
- kernfs_break_active_protection(kn);
-}
-
-static void rdtgroup_kn_put(struct rdtgroup *rdtgrp, struct kernfs_node *kn)
-{
- if (atomic_dec_and_test(&rdtgrp->waitcount) &&
- (rdtgrp->flags & RDT_DELETED)) {
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
- rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
- rdtgroup_pseudo_lock_remove(rdtgrp);
- kernfs_unbreak_active_protection(kn);
- rdtgroup_remove(rdtgrp);
- } else {
- kernfs_unbreak_active_protection(kn);
- }
-}
-
-struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn)
-{
- struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
-
- if (!rdtgrp)
- return NULL;
-
- rdtgroup_kn_get(rdtgrp, kn);
-
- cpus_read_lock();
- mutex_lock(&rdtgroup_mutex);
-
- /* Was this group deleted while we waited? */
- if (rdtgrp->flags & RDT_DELETED)
- return NULL;
-
- return rdtgrp;
-}
-
-void rdtgroup_kn_unlock(struct kernfs_node *kn)
-{
- struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
-
- if (!rdtgrp)
- return;
-
- mutex_unlock(&rdtgroup_mutex);
- cpus_read_unlock();
-
- rdtgroup_kn_put(rdtgrp, kn);
-}
-
-static int mkdir_mondata_all(struct kernfs_node *parent_kn,
- struct rdtgroup *prgrp,
- struct kernfs_node **mon_data_kn);
-
-static void rdt_disable_ctx(void)
-{
- resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
- resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
- set_mba_sc(false);
-
- resctrl_debug = false;
-}
-
-static int rdt_enable_ctx(struct rdt_fs_context *ctx)
-{
- int ret = 0;
-
- if (ctx->enable_cdpl2) {
- ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true);
- if (ret)
- goto out_done;
- }
-
- if (ctx->enable_cdpl3) {
- ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true);
- if (ret)
- goto out_cdpl2;
- }
-
- if (ctx->enable_mba_mbps) {
- ret = set_mba_sc(true);
- if (ret)
- goto out_cdpl3;
- }
-
- if (ctx->enable_debug)
- resctrl_debug = true;
-
- return 0;
-
-out_cdpl3:
- resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
-out_cdpl2:
- resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
-out_done:
- return ret;
-}
-
-static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type)
-{
- struct resctrl_schema *s;
- const char *suffix = "";
- int ret, cl;
-
- s = kzalloc(sizeof(*s), GFP_KERNEL);
- if (!s)
- return -ENOMEM;
-
- s->res = r;
- s->num_closid = resctrl_arch_get_num_closid(r);
- if (resctrl_arch_get_cdp_enabled(r->rid))
- s->num_closid /= 2;
-
- s->conf_type = type;
- switch (type) {
- case CDP_CODE:
- suffix = "CODE";
- break;
- case CDP_DATA:
- suffix = "DATA";
- break;
- case CDP_NONE:
- suffix = "";
- break;
- }
-
- ret = snprintf(s->name, sizeof(s->name), "%s%s", r->name, suffix);
- if (ret >= sizeof(s->name)) {
- kfree(s);
- return -EINVAL;
- }
-
- cl = strlen(s->name);
-
- /*
- * If CDP is supported by this resource, but not enabled,
- * include the suffix. This ensures the tabular format of the
- * schemata file does not change between mounts of the filesystem.
- */
- if (r->cdp_capable && !resctrl_arch_get_cdp_enabled(r->rid))
- cl += 4;
-
- if (cl > max_name_width)
- max_name_width = cl;
-
- switch (r->schema_fmt) {
- case RESCTRL_SCHEMA_BITMAP:
- s->fmt_str = "%d=%x";
- break;
- case RESCTRL_SCHEMA_RANGE:
- s->fmt_str = "%d=%u";
- break;
- }
-
- if (WARN_ON_ONCE(!s->fmt_str)) {
- kfree(s);
- return -EINVAL;
- }
-
- INIT_LIST_HEAD(&s->list);
- list_add(&s->list, &resctrl_schema_all);
-
- return 0;
-}
-
-static int schemata_list_create(void)
-{
- struct rdt_resource *r;
- int ret = 0;
-
- for_each_alloc_capable_rdt_resource(r) {
- if (resctrl_arch_get_cdp_enabled(r->rid)) {
- ret = schemata_list_add(r, CDP_CODE);
- if (ret)
- break;
-
- ret = schemata_list_add(r, CDP_DATA);
- } else {
- ret = schemata_list_add(r, CDP_NONE);
- }
-
- if (ret)
- break;
- }
-
- return ret;
-}
-
-static void schemata_list_destroy(void)
-{
- struct resctrl_schema *s, *tmp;
-
- list_for_each_entry_safe(s, tmp, &resctrl_schema_all, list) {
- list_del(&s->list);
- kfree(s);
- }
-}
-
-static int rdt_get_tree(struct fs_context *fc)
-{
- struct rdt_fs_context *ctx = rdt_fc2context(fc);
- unsigned long flags = RFTYPE_CTRL_BASE;
- struct rdt_mon_domain *dom;
- struct rdt_resource *r;
- int ret;
-
- cpus_read_lock();
- mutex_lock(&rdtgroup_mutex);
- /*
- * resctrl file system can only be mounted once.
- */
- if (resctrl_mounted) {
- ret = -EBUSY;
- goto out;
- }
-
- ret = rdtgroup_setup_root(ctx);
- if (ret)
- goto out;
-
- ret = rdt_enable_ctx(ctx);
- if (ret)
- goto out_root;
-
- ret = schemata_list_create();
- if (ret) {
- schemata_list_destroy();
- goto out_ctx;
- }
-
- closid_init();
-
- if (resctrl_arch_mon_capable())
- flags |= RFTYPE_MON;
-
- ret = rdtgroup_add_files(rdtgroup_default.kn, flags);
- if (ret)
- goto out_schemata_free;
-
- kernfs_activate(rdtgroup_default.kn);
-
- ret = rdtgroup_create_info_dir(rdtgroup_default.kn);
- if (ret < 0)
- goto out_schemata_free;
-
- if (resctrl_arch_mon_capable()) {
- ret = mongroup_create_dir(rdtgroup_default.kn,
- &rdtgroup_default, "mon_groups",
- &kn_mongrp);
- if (ret < 0)
- goto out_info;
-
- ret = mkdir_mondata_all(rdtgroup_default.kn,
- &rdtgroup_default, &kn_mondata);
- if (ret < 0)
- goto out_mongrp;
- rdtgroup_default.mon.mon_data_kn = kn_mondata;
- }
-
- ret = rdt_pseudo_lock_init();
- if (ret)
- goto out_mondata;
-
- ret = kernfs_get_tree(fc);
- if (ret < 0)
- goto out_psl;
-
- if (resctrl_arch_alloc_capable())
- resctrl_arch_enable_alloc();
- if (resctrl_arch_mon_capable())
- resctrl_arch_enable_mon();
-
- if (resctrl_arch_alloc_capable() || resctrl_arch_mon_capable())
- resctrl_mounted = true;
-
- if (resctrl_is_mbm_enabled()) {
- r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
- list_for_each_entry(dom, &r->mon_domains, hdr.list)
- mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL,
- RESCTRL_PICK_ANY_CPU);
- }
-
- goto out;
-
-out_psl:
- rdt_pseudo_lock_release();
-out_mondata:
- if (resctrl_arch_mon_capable())
- kernfs_remove(kn_mondata);
-out_mongrp:
- if (resctrl_arch_mon_capable())
- kernfs_remove(kn_mongrp);
-out_info:
- kernfs_remove(kn_info);
-out_schemata_free:
- schemata_list_destroy();
-out_ctx:
- rdt_disable_ctx();
-out_root:
- rdtgroup_destroy_root();
-out:
- rdt_last_cmd_clear();
- mutex_unlock(&rdtgroup_mutex);
- cpus_read_unlock();
- return ret;
-}
-
-enum rdt_param {
- Opt_cdp,
- Opt_cdpl2,
- Opt_mba_mbps,
- Opt_debug,
- nr__rdt_params
-};
-
-static const struct fs_parameter_spec rdt_fs_parameters[] = {
- fsparam_flag("cdp", Opt_cdp),
- fsparam_flag("cdpl2", Opt_cdpl2),
- fsparam_flag("mba_MBps", Opt_mba_mbps),
- fsparam_flag("debug", Opt_debug),
- {}
-};
-
-static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
-{
- struct rdt_fs_context *ctx = rdt_fc2context(fc);
- struct fs_parse_result result;
- const char *msg;
- int opt;
-
- opt = fs_parse(fc, rdt_fs_parameters, param, &result);
- if (opt < 0)
- return opt;
-
- switch (opt) {
- case Opt_cdp:
- ctx->enable_cdpl3 = true;
- return 0;
- case Opt_cdpl2:
- ctx->enable_cdpl2 = true;
- return 0;
- case Opt_mba_mbps:
- msg = "mba_MBps requires MBM and linear scale MBA at L3 scope";
- if (!supports_mba_mbps())
- return invalfc(fc, msg);
- ctx->enable_mba_mbps = true;
- return 0;
- case Opt_debug:
- ctx->enable_debug = true;
- return 0;
- }
-
- return -EINVAL;
-}
-
-static void rdt_fs_context_free(struct fs_context *fc)
+bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l)
{
- struct rdt_fs_context *ctx = rdt_fc2context(fc);
-
- kernfs_free_fs_context(fc);
- kfree(ctx);
-}
-
-static const struct fs_context_operations rdt_fs_context_ops = {
- .free = rdt_fs_context_free,
- .parse_param = rdt_parse_param,
- .get_tree = rdt_get_tree,
-};
-
-static int rdt_init_fs_context(struct fs_context *fc)
-{
- struct rdt_fs_context *ctx;
-
- ctx = kzalloc(sizeof(struct rdt_fs_context), GFP_KERNEL);
- if (!ctx)
- return -ENOMEM;
-
- ctx->kfc.magic = RDTGROUP_SUPER_MAGIC;
- fc->fs_private = &ctx->kfc;
- fc->ops = &rdt_fs_context_ops;
- put_user_ns(fc->user_ns);
- fc->user_ns = get_user_ns(&init_user_ns);
- fc->global = true;
- return 0;
+ return rdt_resources_all[l].cdp_enabled;
}
void resctrl_arch_reset_all_ctrls(struct rdt_resource *r)
@@ -2953,1460 +260,3 @@ void resctrl_arch_reset_all_ctrls(struct rdt_resource *r)
return;
}
-
-/*
- * Move tasks from one to the other group. If @from is NULL, then all tasks
- * in the systems are moved unconditionally (used for teardown).
- *
- * If @mask is not NULL the cpus on which moved tasks are running are set
- * in that mask so the update smp function call is restricted to affected
- * cpus.
- */
-static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
- struct cpumask *mask)
-{
- struct task_struct *p, *t;
-
- read_lock(&tasklist_lock);
- for_each_process_thread(p, t) {
- if (!from || is_closid_match(t, from) ||
- is_rmid_match(t, from)) {
- resctrl_arch_set_closid_rmid(t, to->closid,
- to->mon.rmid);
-
- /*
- * Order the closid/rmid stores above before the loads
- * in task_curr(). This pairs with the full barrier
- * between the rq->curr update and resctrl_sched_in()
- * during context switch.
- */
- smp_mb();
-
- /*
- * If the task is on a CPU, set the CPU in the mask.
- * The detection is inaccurate as tasks might move or
- * schedule before the smp function call takes place.
- * In such a case the function call is pointless, but
- * there is no other side effect.
- */
- if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t))
- cpumask_set_cpu(task_cpu(t), mask);
- }
- }
- read_unlock(&tasklist_lock);
-}
-
-static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
-{
- struct rdtgroup *sentry, *stmp;
- struct list_head *head;
-
- head = &rdtgrp->mon.crdtgrp_list;
- list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
- free_rmid(sentry->closid, sentry->mon.rmid);
- list_del(&sentry->mon.crdtgrp_list);
-
- if (atomic_read(&sentry->waitcount) != 0)
- sentry->flags = RDT_DELETED;
- else
- rdtgroup_remove(sentry);
- }
-}
-
-/*
- * Forcibly remove all of subdirectories under root.
- */
-static void rmdir_all_sub(void)
-{
- struct rdtgroup *rdtgrp, *tmp;
-
- /* Move all tasks to the default resource group */
- rdt_move_group_tasks(NULL, &rdtgroup_default, NULL);
-
- list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) {
- /* Free any child rmids */
- free_all_child_rdtgrp(rdtgrp);
-
- /* Remove each rdtgroup other than root */
- if (rdtgrp == &rdtgroup_default)
- continue;
-
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
- rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
- rdtgroup_pseudo_lock_remove(rdtgrp);
-
- /*
- * Give any CPUs back to the default group. We cannot copy
- * cpu_online_mask because a CPU might have executed the
- * offline callback already, but is still marked online.
- */
- cpumask_or(&rdtgroup_default.cpu_mask,
- &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
-
- free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
-
- kernfs_remove(rdtgrp->kn);
- list_del(&rdtgrp->rdtgroup_list);
-
- if (atomic_read(&rdtgrp->waitcount) != 0)
- rdtgrp->flags = RDT_DELETED;
- else
- rdtgroup_remove(rdtgrp);
- }
- /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
- update_closid_rmid(cpu_online_mask, &rdtgroup_default);
-
- kernfs_remove(kn_info);
- kernfs_remove(kn_mongrp);
- kernfs_remove(kn_mondata);
-}
-
-static void rdt_kill_sb(struct super_block *sb)
-{
- struct rdt_resource *r;
-
- cpus_read_lock();
- mutex_lock(&rdtgroup_mutex);
-
- rdt_disable_ctx();
-
- /* Put everything back to default values. */
- for_each_alloc_capable_rdt_resource(r)
- resctrl_arch_reset_all_ctrls(r);
-
- rmdir_all_sub();
- rdt_pseudo_lock_release();
- rdtgroup_default.mode = RDT_MODE_SHAREABLE;
- schemata_list_destroy();
- rdtgroup_destroy_root();
- if (resctrl_arch_alloc_capable())
- resctrl_arch_disable_alloc();
- if (resctrl_arch_mon_capable())
- resctrl_arch_disable_mon();
- resctrl_mounted = false;
- kernfs_kill_sb(sb);
- mutex_unlock(&rdtgroup_mutex);
- cpus_read_unlock();
-}
-
-static struct file_system_type rdt_fs_type = {
- .name = "resctrl",
- .init_fs_context = rdt_init_fs_context,
- .parameters = rdt_fs_parameters,
- .kill_sb = rdt_kill_sb,
-};
-
-static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
- void *priv)
-{
- struct kernfs_node *kn;
- int ret = 0;
-
- kn = __kernfs_create_file(parent_kn, name, 0444,
- GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
- &kf_mondata_ops, priv, NULL, NULL);
- if (IS_ERR(kn))
- return PTR_ERR(kn);
-
- ret = rdtgroup_kn_set_ugid(kn);
- if (ret) {
- kernfs_remove(kn);
- return ret;
- }
-
- return ret;
-}
-
-static void mon_rmdir_one_subdir(struct kernfs_node *pkn, char *name, char *subname)
-{
- struct kernfs_node *kn;
-
- kn = kernfs_find_and_get(pkn, name);
- if (!kn)
- return;
- kernfs_put(kn);
-
- if (kn->dir.subdirs <= 1)
- kernfs_remove(kn);
- else
- kernfs_remove_by_name(kn, subname);
-}
-
-/*
- * Remove all subdirectories of mon_data of ctrl_mon groups
- * and monitor groups for the given domain.
- * Remove files and directories containing "sum" of domain data
- * when last domain being summed is removed.
- */
-static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
- struct rdt_mon_domain *d)
-{
- struct rdtgroup *prgrp, *crgrp;
- char subname[32];
- bool snc_mode;
- char name[32];
-
- snc_mode = r->mon_scope == RESCTRL_L3_NODE;
- sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id);
- if (snc_mode)
- sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id);
-
- list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
- mon_rmdir_one_subdir(prgrp->mon.mon_data_kn, name, subname);
-
- list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list)
- mon_rmdir_one_subdir(crgrp->mon.mon_data_kn, name, subname);
- }
-}
-
-static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d,
- struct rdt_resource *r, struct rdtgroup *prgrp,
- bool do_sum)
-{
- struct rmid_read rr = {0};
- union mon_data_bits priv;
- struct mon_evt *mevt;
- int ret;
-
- if (WARN_ON(list_empty(&r->evt_list)))
- return -EPERM;
-
- priv.u.rid = r->rid;
- priv.u.domid = do_sum ? d->ci->id : d->hdr.id;
- priv.u.sum = do_sum;
- list_for_each_entry(mevt, &r->evt_list, list) {
- priv.u.evtid = mevt->evtid;
- ret = mon_addfile(kn, mevt->name, priv.priv);
- if (ret)
- return ret;
-
- if (!do_sum && resctrl_is_mbm_event(mevt->evtid))
- mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true);
- }
-
- return 0;
-}
-
-static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
- struct rdt_mon_domain *d,
- struct rdt_resource *r, struct rdtgroup *prgrp)
-{
- struct kernfs_node *kn, *ckn;
- char name[32];
- bool snc_mode;
- int ret = 0;
-
- lockdep_assert_held(&rdtgroup_mutex);
-
- snc_mode = r->mon_scope == RESCTRL_L3_NODE;
- sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id);
- kn = kernfs_find_and_get(parent_kn, name);
- if (kn) {
- /*
- * rdtgroup_mutex will prevent this directory from being
- * removed. No need to keep this hold.
- */
- kernfs_put(kn);
- } else {
- kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
- if (IS_ERR(kn))
- return PTR_ERR(kn);
-
- ret = rdtgroup_kn_set_ugid(kn);
- if (ret)
- goto out_destroy;
- ret = mon_add_all_files(kn, d, r, prgrp, snc_mode);
- if (ret)
- goto out_destroy;
- }
-
- if (snc_mode) {
- sprintf(name, "mon_sub_%s_%02d", r->name, d->hdr.id);
- ckn = kernfs_create_dir(kn, name, parent_kn->mode, prgrp);
- if (IS_ERR(ckn)) {
- ret = -EINVAL;
- goto out_destroy;
- }
-
- ret = rdtgroup_kn_set_ugid(ckn);
- if (ret)
- goto out_destroy;
-
- ret = mon_add_all_files(ckn, d, r, prgrp, false);
- if (ret)
- goto out_destroy;
- }
-
- kernfs_activate(kn);
- return 0;
-
-out_destroy:
- kernfs_remove(kn);
- return ret;
-}
-
-/*
- * Add all subdirectories of mon_data for "ctrl_mon" groups
- * and "monitor" groups with given domain id.
- */
-static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
- struct rdt_mon_domain *d)
-{
- struct kernfs_node *parent_kn;
- struct rdtgroup *prgrp, *crgrp;
- struct list_head *head;
-
- list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
- parent_kn = prgrp->mon.mon_data_kn;
- mkdir_mondata_subdir(parent_kn, d, r, prgrp);
-
- head = &prgrp->mon.crdtgrp_list;
- list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
- parent_kn = crgrp->mon.mon_data_kn;
- mkdir_mondata_subdir(parent_kn, d, r, crgrp);
- }
- }
-}
-
-static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn,
- struct rdt_resource *r,
- struct rdtgroup *prgrp)
-{
- struct rdt_mon_domain *dom;
- int ret;
-
- /* Walking r->domains, ensure it can't race with cpuhp */
- lockdep_assert_cpus_held();
-
- list_for_each_entry(dom, &r->mon_domains, hdr.list) {
- ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-/*
- * This creates a directory mon_data which contains the monitored data.
- *
- * mon_data has one directory for each domain which are named
- * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data
- * with L3 domain looks as below:
- * ./mon_data:
- * mon_L3_00
- * mon_L3_01
- * mon_L3_02
- * ...
- *
- * Each domain directory has one file per event:
- * ./mon_L3_00/:
- * llc_occupancy
- *
- */
-static int mkdir_mondata_all(struct kernfs_node *parent_kn,
- struct rdtgroup *prgrp,
- struct kernfs_node **dest_kn)
-{
- struct rdt_resource *r;
- struct kernfs_node *kn;
- int ret;
-
- /*
- * Create the mon_data directory first.
- */
- ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn);
- if (ret)
- return ret;
-
- if (dest_kn)
- *dest_kn = kn;
-
- /*
- * Create the subdirectories for each domain. Note that all events
- * in a domain like L3 are grouped into a resource whose domain is L3
- */
- for_each_mon_capable_rdt_resource(r) {
- ret = mkdir_mondata_subdir_alldom(kn, r, prgrp);
- if (ret)
- goto out_destroy;
- }
-
- return 0;
-
-out_destroy:
- kernfs_remove(kn);
- return ret;
-}
-
-/**
- * cbm_ensure_valid - Enforce validity on provided CBM
- * @_val: Candidate CBM
- * @r: RDT resource to which the CBM belongs
- *
- * The provided CBM represents all cache portions available for use. This
- * may be represented by a bitmap that does not consist of contiguous ones
- * and thus be an invalid CBM.
- * Here the provided CBM is forced to be a valid CBM by only considering
- * the first set of contiguous bits as valid and clearing all bits.
- * The intention here is to provide a valid default CBM with which a new
- * resource group is initialized. The user can follow this with a
- * modification to the CBM if the default does not satisfy the
- * requirements.
- */
-static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r)
-{
- unsigned int cbm_len = r->cache.cbm_len;
- unsigned long first_bit, zero_bit;
- unsigned long val = _val;
-
- if (!val)
- return 0;
-
- first_bit = find_first_bit(&val, cbm_len);
- zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
-
- /* Clear any remaining bits to ensure contiguous region */
- bitmap_clear(&val, zero_bit, cbm_len - zero_bit);
- return (u32)val;
-}
-
-/*
- * Initialize cache resources per RDT domain
- *
- * Set the RDT domain up to start off with all usable allocations. That is,
- * all shareable and unused bits. All-zero CBM is invalid.
- */
-static int __init_one_rdt_domain(struct rdt_ctrl_domain *d, struct resctrl_schema *s,
- u32 closid)
-{
- enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type);
- enum resctrl_conf_type t = s->conf_type;
- struct resctrl_staged_config *cfg;
- struct rdt_resource *r = s->res;
- u32 used_b = 0, unused_b = 0;
- unsigned long tmp_cbm;
- enum rdtgrp_mode mode;
- u32 peer_ctl, ctrl_val;
- int i;
-
- cfg = &d->staged_config[t];
- cfg->have_new_ctrl = false;
- cfg->new_ctrl = r->cache.shareable_bits;
- used_b = r->cache.shareable_bits;
- for (i = 0; i < closids_supported(); i++) {
- if (closid_allocated(i) && i != closid) {
- mode = rdtgroup_mode_by_closid(i);
- if (mode == RDT_MODE_PSEUDO_LOCKSETUP)
- /*
- * ctrl values for locksetup aren't relevant
- * until the schemata is written, and the mode
- * becomes RDT_MODE_PSEUDO_LOCKED.
- */
- continue;
- /*
- * If CDP is active include peer domain's
- * usage to ensure there is no overlap
- * with an exclusive group.
- */
- if (resctrl_arch_get_cdp_enabled(r->rid))
- peer_ctl = resctrl_arch_get_config(r, d, i,
- peer_type);
- else
- peer_ctl = 0;
- ctrl_val = resctrl_arch_get_config(r, d, i,
- s->conf_type);
- used_b |= ctrl_val | peer_ctl;
- if (mode == RDT_MODE_SHAREABLE)
- cfg->new_ctrl |= ctrl_val | peer_ctl;
- }
- }
- if (d->plr && d->plr->cbm > 0)
- used_b |= d->plr->cbm;
- unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1);
- unused_b &= BIT_MASK(r->cache.cbm_len) - 1;
- cfg->new_ctrl |= unused_b;
- /*
- * Force the initial CBM to be valid, user can
- * modify the CBM based on system availability.
- */
- cfg->new_ctrl = cbm_ensure_valid(cfg->new_ctrl, r);
- /*
- * Assign the u32 CBM to an unsigned long to ensure that
- * bitmap_weight() does not access out-of-bound memory.
- */
- tmp_cbm = cfg->new_ctrl;
- if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) {
- rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->hdr.id);
- return -ENOSPC;
- }
- cfg->have_new_ctrl = true;
-
- return 0;
-}
-
-/*
- * Initialize cache resources with default values.
- *
- * A new RDT group is being created on an allocation capable (CAT)
- * supporting system. Set this group up to start off with all usable
- * allocations.
- *
- * If there are no more shareable bits available on any domain then
- * the entire allocation will fail.
- */
-static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid)
-{
- struct rdt_ctrl_domain *d;
- int ret;
-
- list_for_each_entry(d, &s->res->ctrl_domains, hdr.list) {
- ret = __init_one_rdt_domain(d, s, closid);
- if (ret < 0)
- return ret;
- }
-
- return 0;
-}
-
-/* Initialize MBA resource with default values. */
-static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid)
-{
- struct resctrl_staged_config *cfg;
- struct rdt_ctrl_domain *d;
-
- list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
- if (is_mba_sc(r)) {
- d->mbps_val[closid] = MBA_MAX_MBPS;
- continue;
- }
-
- cfg = &d->staged_config[CDP_NONE];
- cfg->new_ctrl = resctrl_get_default_ctrl(r);
- cfg->have_new_ctrl = true;
- }
-}
-
-/* Initialize the RDT group's allocations. */
-static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
-{
- struct resctrl_schema *s;
- struct rdt_resource *r;
- int ret = 0;
-
- rdt_staged_configs_clear();
-
- list_for_each_entry(s, &resctrl_schema_all, list) {
- r = s->res;
- if (r->rid == RDT_RESOURCE_MBA ||
- r->rid == RDT_RESOURCE_SMBA) {
- rdtgroup_init_mba(r, rdtgrp->closid);
- if (is_mba_sc(r))
- continue;
- } else {
- ret = rdtgroup_init_cat(s, rdtgrp->closid);
- if (ret < 0)
- goto out;
- }
-
- ret = resctrl_arch_update_domains(r, rdtgrp->closid);
- if (ret < 0) {
- rdt_last_cmd_puts("Failed to initialize allocations\n");
- goto out;
- }
-
- }
-
- rdtgrp->mode = RDT_MODE_SHAREABLE;
-
-out:
- rdt_staged_configs_clear();
- return ret;
-}
-
-static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp)
-{
- int ret;
-
- if (!resctrl_arch_mon_capable())
- return 0;
-
- ret = alloc_rmid(rdtgrp->closid);
- if (ret < 0) {
- rdt_last_cmd_puts("Out of RMIDs\n");
- return ret;
- }
- rdtgrp->mon.rmid = ret;
-
- ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn);
- if (ret) {
- rdt_last_cmd_puts("kernfs subdir error\n");
- free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
- return ret;
- }
-
- return 0;
-}
-
-static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp)
-{
- if (resctrl_arch_mon_capable())
- free_rmid(rgrp->closid, rgrp->mon.rmid);
-}
-
-/*
- * We allow creating mon groups only with in a directory called "mon_groups"
- * which is present in every ctrl_mon group. Check if this is a valid
- * "mon_groups" directory.
- *
- * 1. The directory should be named "mon_groups".
- * 2. The mon group itself should "not" be named "mon_groups".
- * This makes sure "mon_groups" directory always has a ctrl_mon group
- * as parent.
- */
-static bool is_mon_groups(struct kernfs_node *kn, const char *name)
-{
- return (!strcmp(rdt_kn_name(kn), "mon_groups") &&
- strcmp(name, "mon_groups"));
-}
-
-static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
- const char *name, umode_t mode,
- enum rdt_group_type rtype, struct rdtgroup **r)
-{
- struct rdtgroup *prdtgrp, *rdtgrp;
- unsigned long files = 0;
- struct kernfs_node *kn;
- int ret;
-
- prdtgrp = rdtgroup_kn_lock_live(parent_kn);
- if (!prdtgrp) {
- ret = -ENODEV;
- goto out_unlock;
- }
-
- /*
- * Check that the parent directory for a monitor group is a "mon_groups"
- * directory.
- */
- if (rtype == RDTMON_GROUP && !is_mon_groups(parent_kn, name)) {
- ret = -EPERM;
- goto out_unlock;
- }
-
- if (rtype == RDTMON_GROUP &&
- (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
- prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) {
- ret = -EINVAL;
- rdt_last_cmd_puts("Pseudo-locking in progress\n");
- goto out_unlock;
- }
-
- /* allocate the rdtgroup. */
- rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL);
- if (!rdtgrp) {
- ret = -ENOSPC;
- rdt_last_cmd_puts("Kernel out of memory\n");
- goto out_unlock;
- }
- *r = rdtgrp;
- rdtgrp->mon.parent = prdtgrp;
- rdtgrp->type = rtype;
- INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list);
-
- /* kernfs creates the directory for rdtgrp */
- kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp);
- if (IS_ERR(kn)) {
- ret = PTR_ERR(kn);
- rdt_last_cmd_puts("kernfs create error\n");
- goto out_free_rgrp;
- }
- rdtgrp->kn = kn;
-
- /*
- * kernfs_remove() will drop the reference count on "kn" which
- * will free it. But we still need it to stick around for the
- * rdtgroup_kn_unlock(kn) call. Take one extra reference here,
- * which will be dropped by kernfs_put() in rdtgroup_remove().
- */
- kernfs_get(kn);
-
- ret = rdtgroup_kn_set_ugid(kn);
- if (ret) {
- rdt_last_cmd_puts("kernfs perm error\n");
- goto out_destroy;
- }
-
- if (rtype == RDTCTRL_GROUP) {
- files = RFTYPE_BASE | RFTYPE_CTRL;
- if (resctrl_arch_mon_capable())
- files |= RFTYPE_MON;
- } else {
- files = RFTYPE_BASE | RFTYPE_MON;
- }
-
- ret = rdtgroup_add_files(kn, files);
- if (ret) {
- rdt_last_cmd_puts("kernfs fill error\n");
- goto out_destroy;
- }
-
- /*
- * The caller unlocks the parent_kn upon success.
- */
- return 0;
-
-out_destroy:
- kernfs_put(rdtgrp->kn);
- kernfs_remove(rdtgrp->kn);
-out_free_rgrp:
- kfree(rdtgrp);
-out_unlock:
- rdtgroup_kn_unlock(parent_kn);
- return ret;
-}
-
-static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
-{
- kernfs_remove(rgrp->kn);
- rdtgroup_remove(rgrp);
-}
-
-/*
- * Create a monitor group under "mon_groups" directory of a control
- * and monitor group(ctrl_mon). This is a resource group
- * to monitor a subset of tasks and cpus in its parent ctrl_mon group.
- */
-static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
- const char *name, umode_t mode)
-{
- struct rdtgroup *rdtgrp, *prgrp;
- int ret;
-
- ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp);
- if (ret)
- return ret;
-
- prgrp = rdtgrp->mon.parent;
- rdtgrp->closid = prgrp->closid;
-
- ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp);
- if (ret) {
- mkdir_rdt_prepare_clean(rdtgrp);
- goto out_unlock;
- }
-
- kernfs_activate(rdtgrp->kn);
-
- /*
- * Add the rdtgrp to the list of rdtgrps the parent
- * ctrl_mon group has to track.
- */
- list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list);
-
-out_unlock:
- rdtgroup_kn_unlock(parent_kn);
- return ret;
-}
-
-/*
- * These are rdtgroups created under the root directory. Can be used
- * to allocate and monitor resources.
- */
-static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
- const char *name, umode_t mode)
-{
- struct rdtgroup *rdtgrp;
- struct kernfs_node *kn;
- u32 closid;
- int ret;
-
- ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp);
- if (ret)
- return ret;
-
- kn = rdtgrp->kn;
- ret = closid_alloc();
- if (ret < 0) {
- rdt_last_cmd_puts("Out of CLOSIDs\n");
- goto out_common_fail;
- }
- closid = ret;
- ret = 0;
-
- rdtgrp->closid = closid;
-
- ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp);
- if (ret)
- goto out_closid_free;
-
- kernfs_activate(rdtgrp->kn);
-
- ret = rdtgroup_init_alloc(rdtgrp);
- if (ret < 0)
- goto out_rmid_free;
-
- list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
-
- if (resctrl_arch_mon_capable()) {
- /*
- * Create an empty mon_groups directory to hold the subset
- * of tasks and cpus to monitor.
- */
- ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL);
- if (ret) {
- rdt_last_cmd_puts("kernfs subdir error\n");
- goto out_del_list;
- }
- if (is_mba_sc(NULL))
- rdtgrp->mba_mbps_event = mba_mbps_default_event;
- }
-
- goto out_unlock;
-
-out_del_list:
- list_del(&rdtgrp->rdtgroup_list);
-out_rmid_free:
- mkdir_rdt_prepare_rmid_free(rdtgrp);
-out_closid_free:
- closid_free(closid);
-out_common_fail:
- mkdir_rdt_prepare_clean(rdtgrp);
-out_unlock:
- rdtgroup_kn_unlock(parent_kn);
- return ret;
-}
-
-static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
- umode_t mode)
-{
- /* Do not accept '\n' to avoid unparsable situation. */
- if (strchr(name, '\n'))
- return -EINVAL;
-
- /*
- * If the parent directory is the root directory and RDT
- * allocation is supported, add a control and monitoring
- * subdirectory
- */
- if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn)
- return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode);
-
- /* Else, attempt to add a monitoring subdirectory. */
- if (resctrl_arch_mon_capable())
- return rdtgroup_mkdir_mon(parent_kn, name, mode);
-
- return -EPERM;
-}
-
-static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
-{
- struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
- u32 closid, rmid;
- int cpu;
-
- /* Give any tasks back to the parent group */
- rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask);
-
- /*
- * Update per cpu closid/rmid of the moved CPUs first.
- * Note: the closid will not change, but the arch code still needs it.
- */
- closid = prdtgrp->closid;
- rmid = prdtgrp->mon.rmid;
- for_each_cpu(cpu, &rdtgrp->cpu_mask)
- resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid);
-
- /*
- * Update the MSR on moved CPUs and CPUs which have moved
- * task running on them.
- */
- cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
- update_closid_rmid(tmpmask, NULL);
-
- rdtgrp->flags = RDT_DELETED;
- free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
-
- /*
- * Remove the rdtgrp from the parent ctrl_mon group's list
- */
- WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
- list_del(&rdtgrp->mon.crdtgrp_list);
-
- kernfs_remove(rdtgrp->kn);
-
- return 0;
-}
-
-static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp)
-{
- rdtgrp->flags = RDT_DELETED;
- list_del(&rdtgrp->rdtgroup_list);
-
- kernfs_remove(rdtgrp->kn);
- return 0;
-}
-
-static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
-{
- u32 closid, rmid;
- int cpu;
-
- /* Give any tasks back to the default group */
- rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask);
-
- /* Give any CPUs back to the default group */
- cpumask_or(&rdtgroup_default.cpu_mask,
- &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
-
- /* Update per cpu closid and rmid of the moved CPUs first */
- closid = rdtgroup_default.closid;
- rmid = rdtgroup_default.mon.rmid;
- for_each_cpu(cpu, &rdtgrp->cpu_mask)
- resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid);
-
- /*
- * Update the MSR on moved CPUs and CPUs which have moved
- * task running on them.
- */
- cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
- update_closid_rmid(tmpmask, NULL);
-
- free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
- closid_free(rdtgrp->closid);
-
- rdtgroup_ctrl_remove(rdtgrp);
-
- /*
- * Free all the child monitor group rmids.
- */
- free_all_child_rdtgrp(rdtgrp);
-
- return 0;
-}
-
-static struct kernfs_node *rdt_kn_parent(struct kernfs_node *kn)
-{
- /*
- * Valid within the RCU section it was obtained or while rdtgroup_mutex
- * is held.
- */
- return rcu_dereference_check(kn->__parent, lockdep_is_held(&rdtgroup_mutex));
-}
-
-static int rdtgroup_rmdir(struct kernfs_node *kn)
-{
- struct kernfs_node *parent_kn;
- struct rdtgroup *rdtgrp;
- cpumask_var_t tmpmask;
- int ret = 0;
-
- if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
- return -ENOMEM;
-
- rdtgrp = rdtgroup_kn_lock_live(kn);
- if (!rdtgrp) {
- ret = -EPERM;
- goto out;
- }
- parent_kn = rdt_kn_parent(kn);
-
- /*
- * If the rdtgroup is a ctrl_mon group and parent directory
- * is the root directory, remove the ctrl_mon group.
- *
- * If the rdtgroup is a mon group and parent directory
- * is a valid "mon_groups" directory, remove the mon group.
- */
- if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn &&
- rdtgrp != &rdtgroup_default) {
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
- rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
- ret = rdtgroup_ctrl_remove(rdtgrp);
- } else {
- ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask);
- }
- } else if (rdtgrp->type == RDTMON_GROUP &&
- is_mon_groups(parent_kn, rdt_kn_name(kn))) {
- ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask);
- } else {
- ret = -EPERM;
- }
-
-out:
- rdtgroup_kn_unlock(kn);
- free_cpumask_var(tmpmask);
- return ret;
-}
-
-/**
- * mongrp_reparent() - replace parent CTRL_MON group of a MON group
- * @rdtgrp: the MON group whose parent should be replaced
- * @new_prdtgrp: replacement parent CTRL_MON group for @rdtgrp
- * @cpus: cpumask provided by the caller for use during this call
- *
- * Replaces the parent CTRL_MON group for a MON group, resulting in all member
- * tasks' CLOSID immediately changing to that of the new parent group.
- * Monitoring data for the group is unaffected by this operation.
- */
-static void mongrp_reparent(struct rdtgroup *rdtgrp,
- struct rdtgroup *new_prdtgrp,
- cpumask_var_t cpus)
-{
- struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
-
- WARN_ON(rdtgrp->type != RDTMON_GROUP);
- WARN_ON(new_prdtgrp->type != RDTCTRL_GROUP);
-
- /* Nothing to do when simply renaming a MON group. */
- if (prdtgrp == new_prdtgrp)
- return;
-
- WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
- list_move_tail(&rdtgrp->mon.crdtgrp_list,
- &new_prdtgrp->mon.crdtgrp_list);
-
- rdtgrp->mon.parent = new_prdtgrp;
- rdtgrp->closid = new_prdtgrp->closid;
-
- /* Propagate updated closid to all tasks in this group. */
- rdt_move_group_tasks(rdtgrp, rdtgrp, cpus);
-
- update_closid_rmid(cpus, NULL);
-}
-
-static int rdtgroup_rename(struct kernfs_node *kn,
- struct kernfs_node *new_parent, const char *new_name)
-{
- struct kernfs_node *kn_parent;
- struct rdtgroup *new_prdtgrp;
- struct rdtgroup *rdtgrp;
- cpumask_var_t tmpmask;
- int ret;
-
- rdtgrp = kernfs_to_rdtgroup(kn);
- new_prdtgrp = kernfs_to_rdtgroup(new_parent);
- if (!rdtgrp || !new_prdtgrp)
- return -ENOENT;
-
- /* Release both kernfs active_refs before obtaining rdtgroup mutex. */
- rdtgroup_kn_get(rdtgrp, kn);
- rdtgroup_kn_get(new_prdtgrp, new_parent);
-
- mutex_lock(&rdtgroup_mutex);
-
- rdt_last_cmd_clear();
-
- /*
- * Don't allow kernfs_to_rdtgroup() to return a parent rdtgroup if
- * either kernfs_node is a file.
- */
- if (kernfs_type(kn) != KERNFS_DIR ||
- kernfs_type(new_parent) != KERNFS_DIR) {
- rdt_last_cmd_puts("Source and destination must be directories");
- ret = -EPERM;
- goto out;
- }
-
- if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) {
- ret = -ENOENT;
- goto out;
- }
-
- kn_parent = rdt_kn_parent(kn);
- if (rdtgrp->type != RDTMON_GROUP || !kn_parent ||
- !is_mon_groups(kn_parent, rdt_kn_name(kn))) {
- rdt_last_cmd_puts("Source must be a MON group\n");
- ret = -EPERM;
- goto out;
- }
-
- if (!is_mon_groups(new_parent, new_name)) {
- rdt_last_cmd_puts("Destination must be a mon_groups subdirectory\n");
- ret = -EPERM;
- goto out;
- }
-
- /*
- * If the MON group is monitoring CPUs, the CPUs must be assigned to the
- * current parent CTRL_MON group and therefore cannot be assigned to
- * the new parent, making the move illegal.
- */
- if (!cpumask_empty(&rdtgrp->cpu_mask) &&
- rdtgrp->mon.parent != new_prdtgrp) {
- rdt_last_cmd_puts("Cannot move a MON group that monitors CPUs\n");
- ret = -EPERM;
- goto out;
- }
-
- /*
- * Allocate the cpumask for use in mongrp_reparent() to avoid the
- * possibility of failing to allocate it after kernfs_rename() has
- * succeeded.
- */
- if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) {
- ret = -ENOMEM;
- goto out;
- }
-
- /*
- * Perform all input validation and allocations needed to ensure
- * mongrp_reparent() will succeed before calling kernfs_rename(),
- * otherwise it would be necessary to revert this call if
- * mongrp_reparent() failed.
- */
- ret = kernfs_rename(kn, new_parent, new_name);
- if (!ret)
- mongrp_reparent(rdtgrp, new_prdtgrp, tmpmask);
-
- free_cpumask_var(tmpmask);
-
-out:
- mutex_unlock(&rdtgroup_mutex);
- rdtgroup_kn_put(rdtgrp, kn);
- rdtgroup_kn_put(new_prdtgrp, new_parent);
- return ret;
-}
-
-static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
-{
- if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3))
- seq_puts(seq, ",cdp");
-
- if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2))
- seq_puts(seq, ",cdpl2");
-
- if (is_mba_sc(resctrl_arch_get_resource(RDT_RESOURCE_MBA)))
- seq_puts(seq, ",mba_MBps");
-
- if (resctrl_debug)
- seq_puts(seq, ",debug");
-
- return 0;
-}
-
-static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = {
- .mkdir = rdtgroup_mkdir,
- .rmdir = rdtgroup_rmdir,
- .rename = rdtgroup_rename,
- .show_options = rdtgroup_show_options,
-};
-
-static int rdtgroup_setup_root(struct rdt_fs_context *ctx)
-{
- rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops,
- KERNFS_ROOT_CREATE_DEACTIVATED |
- KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
- &rdtgroup_default);
- if (IS_ERR(rdt_root))
- return PTR_ERR(rdt_root);
-
- ctx->kfc.root = rdt_root;
- rdtgroup_default.kn = kernfs_root_to_node(rdt_root);
-
- return 0;
-}
-
-static void rdtgroup_destroy_root(void)
-{
- kernfs_destroy_root(rdt_root);
- rdtgroup_default.kn = NULL;
-}
-
-static void __init rdtgroup_setup_default(void)
-{
- mutex_lock(&rdtgroup_mutex);
-
- rdtgroup_default.closid = RESCTRL_RESERVED_CLOSID;
- rdtgroup_default.mon.rmid = RESCTRL_RESERVED_RMID;
- rdtgroup_default.type = RDTCTRL_GROUP;
- INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list);
-
- list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups);
-
- mutex_unlock(&rdtgroup_mutex);
-}
-
-static void domain_destroy_mon_state(struct rdt_mon_domain *d)
-{
- bitmap_free(d->rmid_busy_llc);
- kfree(d->mbm_total);
- kfree(d->mbm_local);
-}
-
-void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d)
-{
- mutex_lock(&rdtgroup_mutex);
-
- if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA)
- mba_sc_domain_destroy(r, d);
-
- mutex_unlock(&rdtgroup_mutex);
-}
-
-void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d)
-{
- mutex_lock(&rdtgroup_mutex);
-
- /*
- * If resctrl is mounted, remove all the
- * per domain monitor data directories.
- */
- if (resctrl_mounted && resctrl_arch_mon_capable())
- rmdir_mondata_subdir_allrdtgrp(r, d);
-
- if (resctrl_is_mbm_enabled())
- cancel_delayed_work(&d->mbm_over);
- if (resctrl_arch_is_llc_occupancy_enabled() && has_busy_rmid(d)) {
- /*
- * When a package is going down, forcefully
- * decrement rmid->ebusy. There is no way to know
- * that the L3 was flushed and hence may lead to
- * incorrect counts in rare scenarios, but leaving
- * the RMID as busy creates RMID leaks if the
- * package never comes back.
- */
- __check_limbo(d, true);
- cancel_delayed_work(&d->cqm_limbo);
- }
-
- domain_destroy_mon_state(d);
-
- mutex_unlock(&rdtgroup_mutex);
-}
-
-/**
- * domain_setup_mon_state() - Initialise domain monitoring structures.
- * @r: The resource for the newly online domain.
- * @d: The newly online domain.
- *
- * Allocate monitor resources that belong to this domain.
- * Called when the first CPU of a domain comes online, regardless of whether
- * the filesystem is mounted.
- * During boot this may be called before global allocations have been made by
- * resctrl_mon_resource_init().
- *
- * Returns 0 for success, or -ENOMEM.
- */
-static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d)
-{
- u32 idx_limit = resctrl_arch_system_num_rmid_idx();
- size_t tsize;
-
- if (resctrl_arch_is_llc_occupancy_enabled()) {
- d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL);
- if (!d->rmid_busy_llc)
- return -ENOMEM;
- }
- if (resctrl_arch_is_mbm_total_enabled()) {
- tsize = sizeof(*d->mbm_total);
- d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL);
- if (!d->mbm_total) {
- bitmap_free(d->rmid_busy_llc);
- return -ENOMEM;
- }
- }
- if (resctrl_arch_is_mbm_local_enabled()) {
- tsize = sizeof(*d->mbm_local);
- d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL);
- if (!d->mbm_local) {
- bitmap_free(d->rmid_busy_llc);
- kfree(d->mbm_total);
- return -ENOMEM;
- }
- }
-
- return 0;
-}
-
-int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d)
-{
- int err = 0;
-
- mutex_lock(&rdtgroup_mutex);
-
- if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) {
- /* RDT_RESOURCE_MBA is never mon_capable */
- err = mba_sc_domain_allocate(r, d);
- }
-
- mutex_unlock(&rdtgroup_mutex);
-
- return err;
-}
-
-int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d)
-{
- int err;
-
- mutex_lock(&rdtgroup_mutex);
-
- err = domain_setup_mon_state(r, d);
- if (err)
- goto out_unlock;
-
- if (resctrl_is_mbm_enabled()) {
- INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow);
- mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL,
- RESCTRL_PICK_ANY_CPU);
- }
-
- if (resctrl_arch_is_llc_occupancy_enabled())
- INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo);
-
- /*
- * If the filesystem is not mounted then only the default resource group
- * exists. Creation of its directories is deferred until mount time
- * by rdt_get_tree() calling mkdir_mondata_all().
- * If resctrl is mounted, add per domain monitor data directories.
- */
- if (resctrl_mounted && resctrl_arch_mon_capable())
- mkdir_mondata_subdir_allrdtgrp(r, d);
-
-out_unlock:
- mutex_unlock(&rdtgroup_mutex);
-
- return err;
-}
-
-void resctrl_online_cpu(unsigned int cpu)
-{
- mutex_lock(&rdtgroup_mutex);
- /* The CPU is set in default rdtgroup after online. */
- cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask);
- mutex_unlock(&rdtgroup_mutex);
-}
-
-static void clear_childcpus(struct rdtgroup *r, unsigned int cpu)
-{
- struct rdtgroup *cr;
-
- list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) {
- if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask))
- break;
- }
-}
-
-static struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu,
- struct rdt_resource *r)
-{
- struct rdt_mon_domain *d;
-
- lockdep_assert_cpus_held();
-
- list_for_each_entry(d, &r->mon_domains, hdr.list) {
- /* Find the domain that contains this CPU */
- if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask))
- return d;
- }
-
- return NULL;
-}
-
-void resctrl_offline_cpu(unsigned int cpu)
-{
- struct rdt_resource *l3 = resctrl_arch_get_resource(RDT_RESOURCE_L3);
- struct rdt_mon_domain *d;
- struct rdtgroup *rdtgrp;
-
- mutex_lock(&rdtgroup_mutex);
- list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
- if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) {
- clear_childcpus(rdtgrp, cpu);
- break;
- }
- }
-
- if (!l3->mon_capable)
- goto out_unlock;
-
- d = get_mon_domain_from_cpu(cpu, l3);
- if (d) {
- if (resctrl_is_mbm_enabled() && cpu == d->mbm_work_cpu) {
- cancel_delayed_work(&d->mbm_over);
- mbm_setup_overflow_handler(d, 0, cpu);
- }
- if (resctrl_arch_is_llc_occupancy_enabled() &&
- cpu == d->cqm_work_cpu && has_busy_rmid(d)) {
- cancel_delayed_work(&d->cqm_limbo);
- cqm_setup_limbo_handler(d, 0, cpu);
- }
- }
-
-out_unlock:
- mutex_unlock(&rdtgroup_mutex);
-}
-
-/*
- * resctrl_init - resctrl filesystem initialization
- *
- * Setup resctrl file system including set up root, create mount point,
- * register resctrl filesystem, and initialize files under root directory.
- *
- * Return: 0 on success or -errno
- */
-int __init resctrl_init(void)
-{
- int ret = 0;
-
- seq_buf_init(&last_cmd_status, last_cmd_status_buf,
- sizeof(last_cmd_status_buf));
-
- rdtgroup_setup_default();
-
- thread_throttle_mode_init();
-
- ret = resctrl_mon_resource_init();
- if (ret)
- return ret;
-
- ret = sysfs_create_mount_point(fs_kobj, "resctrl");
- if (ret) {
- resctrl_mon_resource_exit();
- return ret;
- }
-
- ret = register_filesystem(&rdt_fs_type);
- if (ret)
- goto cleanup_mountpoint;
-
- /*
- * Adding the resctrl debugfs directory here may not be ideal since
- * it would let the resctrl debugfs directory appear on the debugfs
- * filesystem before the resctrl filesystem is mounted.
- * It may also be ok since that would enable debugging of RDT before
- * resctrl is mounted.
- * The reason why the debugfs directory is created here and not in
- * rdt_get_tree() is because rdt_get_tree() takes rdtgroup_mutex and
- * during the debugfs directory creation also &sb->s_type->i_mutex_key
- * (the lockdep class of inode->i_rwsem). Other filesystem
- * interactions (eg. SyS_getdents) have the lock ordering:
- * &sb->s_type->i_mutex_key --> &mm->mmap_lock
- * During mmap(), called with &mm->mmap_lock, the rdtgroup_mutex
- * is taken, thus creating dependency:
- * &mm->mmap_lock --> rdtgroup_mutex for the latter that can cause
- * issues considering the other two lock dependencies.
- * By creating the debugfs directory here we avoid a dependency
- * that may cause deadlock (even though file operations cannot
- * occur until the filesystem is mounted, but I do not know how to
- * tell lockdep that).
- */
- debugfs_resctrl = debugfs_create_dir("resctrl", NULL);
-
- return 0;
-
-cleanup_mountpoint:
- sysfs_remove_mount_point(fs_kobj, "resctrl");
- resctrl_mon_resource_exit();
-
- return ret;
-}
-
-void __exit resctrl_exit(void)
-{
- debugfs_remove_recursive(debugfs_resctrl);
- unregister_filesystem(&rdt_fs_type);
- sysfs_remove_mount_point(fs_kobj, "resctrl");
-
- resctrl_mon_resource_exit();
-}
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index dbf6d71bdf18..b4a1f6732a3a 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -50,6 +50,8 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 },
{ X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 },
{ X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 },
+ { X86_FEATURE_TSA_SQ_NO, CPUID_ECX, 1, 0x80000021, 0 },
+ { X86_FEATURE_TSA_L1_NO, CPUID_ECX, 2, 0x80000021, 0 },
{ X86_FEATURE_AMD_WORKLOAD_CLASS, CPUID_EAX, 22, 0x80000021, 0 },
{ X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 },
{ X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 },
diff --git a/arch/x86/kernel/cpu/sgx/driver.h b/arch/x86/kernel/cpu/sgx/driver.h
index 4eddb4d571ef..30f39f92c98f 100644
--- a/arch/x86/kernel/cpu/sgx/driver.h
+++ b/arch/x86/kernel/cpu/sgx/driver.h
@@ -2,7 +2,6 @@
#ifndef __ARCH_SGX_DRIVER_H__
#define __ARCH_SGX_DRIVER_H__
-#include <crypto/hash.h>
#include <linux/kref.h>
#include <linux/mmu_notifier.h>
#include <linux/radix-tree.h>
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index 776a20172867..66f1efa16fbb 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -3,6 +3,7 @@
#include <asm/mman.h>
#include <asm/sgx.h>
+#include <crypto/sha2.h>
#include <linux/mman.h>
#include <linux/delay.h>
#include <linux/file.h>
@@ -463,31 +464,6 @@ static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg)
return ret;
}
-static int __sgx_get_key_hash(struct crypto_shash *tfm, const void *modulus,
- void *hash)
-{
- SHASH_DESC_ON_STACK(shash, tfm);
-
- shash->tfm = tfm;
-
- return crypto_shash_digest(shash, modulus, SGX_MODULUS_SIZE, hash);
-}
-
-static int sgx_get_key_hash(const void *modulus, void *hash)
-{
- struct crypto_shash *tfm;
- int ret;
-
- tfm = crypto_alloc_shash("sha256", 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(tfm))
- return PTR_ERR(tfm);
-
- ret = __sgx_get_key_hash(tfm, modulus, hash);
-
- crypto_free_shash(tfm);
- return ret;
-}
-
static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
void *token)
{
@@ -523,9 +499,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
sgx_xfrm_reserved_mask)
return -EINVAL;
- ret = sgx_get_key_hash(sigstruct->modulus, mrsigner);
- if (ret)
- return ret;
+ sha256(sigstruct->modulus, SGX_MODULUS_SIZE, (u8 *)mrsigner);
mutex_lock(&encl->lock);
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 6722b2fc82cf..2de01b379aa3 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -720,6 +720,8 @@ int arch_memory_failure(unsigned long pfn, int flags)
goto out;
}
+ sgx_unmark_page_reclaimable(page);
+
/*
* TBD: Add additional plumbing to enable pre-emptive
* action for asynchronous poison notification. Until
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 0be61c45400c..bcb534688dfe 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -278,6 +278,7 @@ static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem,
unsigned long long mend)
{
unsigned long start, end;
+ int ret;
cmem->ranges[0].start = mstart;
cmem->ranges[0].end = mend;
@@ -286,22 +287,43 @@ static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem,
/* Exclude elf header region */
start = image->elf_load_addr;
end = start + image->elf_headers_sz - 1;
- return crash_exclude_mem_range(cmem, start, end);
+ ret = crash_exclude_mem_range(cmem, start, end);
+
+ if (ret)
+ return ret;
+
+ /* Exclude dm crypt keys region */
+ if (image->dm_crypt_keys_addr) {
+ start = image->dm_crypt_keys_addr;
+ end = start + image->dm_crypt_keys_sz - 1;
+ return crash_exclude_mem_range(cmem, start, end);
+ }
+
+ return ret;
}
/* Prepare memory map for crash dump kernel */
int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
{
+ unsigned int nr_ranges = 0;
int i, ret = 0;
unsigned long flags;
struct e820_entry ei;
struct crash_memmap_data cmd;
struct crash_mem *cmem;
- cmem = vzalloc(struct_size(cmem, ranges, 1));
+ /*
+ * Using random kexec_buf for passing dm crypt keys may cause a range
+ * split. So use two slots here.
+ */
+ nr_ranges = 2;
+ cmem = vzalloc(struct_size(cmem, ranges, nr_ranges));
if (!cmem)
return -ENOMEM;
+ cmem->max_nr_ranges = nr_ranges;
+ cmem->nr_ranges = 0;
+
memset(&cmd, 0, sizeof(struct crash_memmap_data));
cmd.params = params;
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 9920122018a0..c3acbd26408b 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1300,6 +1300,24 @@ void __init e820__memblock_setup(void)
}
/*
+ * At this point memblock is only allowed to allocate from memory
+ * below 1M (aka ISA_END_ADDRESS) up until direct map is completely set
+ * up in init_mem_mapping().
+ *
+ * KHO kernels are special and use only scratch memory for memblock
+ * allocations, but memory below 1M is ignored by kernel after early
+ * boot and cannot be naturally marked as scratch.
+ *
+ * To allow allocation of the real-mode trampoline and a few (if any)
+ * other very early allocations from below 1M forcibly mark the memory
+ * below 1M as scratch.
+ *
+ * After real mode trampoline is allocated, we clear that scratch
+ * marking.
+ */
+ memblock_mark_kho_scratch(0, SZ_1M);
+
+ /*
* 32-bit systems are limited to 4BG of memory even with HIGHMEM and
* to even less without it.
* Discard memory after max_pfn - the actual limit detected at runtime.
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index ea138583dd92..aefd412a23dc 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -37,6 +37,7 @@ DEFINE_PER_CPU(u64, xfd_state);
/* The FPU state configuration data for kernel and user space */
struct fpu_state_config fpu_kernel_cfg __ro_after_init;
struct fpu_state_config fpu_user_cfg __ro_after_init;
+struct vcpu_fpu_config guest_default_cfg __ro_after_init;
/*
* Represents the initial FPU state. It's mostly (but not completely) zeroes,
@@ -217,7 +218,7 @@ void fpu_reset_from_exception_fixup(void)
}
#if IS_ENABLED(CONFIG_KVM)
-static void __fpstate_reset(struct fpstate *fpstate, u64 xfd);
+static void __fpstate_reset(struct fpstate *fpstate);
static void fpu_lock_guest_permissions(void)
{
@@ -242,19 +243,21 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu)
struct fpstate *fpstate;
unsigned int size;
- size = fpu_kernel_cfg.default_size + ALIGN(offsetof(struct fpstate, regs), 64);
+ size = guest_default_cfg.size + ALIGN(offsetof(struct fpstate, regs), 64);
+
fpstate = vzalloc(size);
if (!fpstate)
return false;
- /* Leave xfd to 0 (the reset value defined by spec) */
- __fpstate_reset(fpstate, 0);
- fpstate_init_user(fpstate);
+ /* Initialize indicators to reflect properties of the fpstate */
fpstate->is_valloc = true;
fpstate->is_guest = true;
+ __fpstate_reset(fpstate);
+ fpstate_init_user(fpstate);
+
gfpu->fpstate = fpstate;
- gfpu->xfeatures = fpu_kernel_cfg.default_features;
+ gfpu->xfeatures = guest_default_cfg.features;
/*
* KVM sets the FP+SSE bits in the XSAVE header when copying FPU state
@@ -541,28 +544,50 @@ void fpstate_init_user(struct fpstate *fpstate)
fpstate_init_fstate(fpstate);
}
-static void __fpstate_reset(struct fpstate *fpstate, u64 xfd)
+static void __fpstate_reset(struct fpstate *fpstate)
{
- /* Initialize sizes and feature masks */
- fpstate->size = fpu_kernel_cfg.default_size;
+ /*
+ * Supervisor features (and thus sizes) may diverge between guest
+ * FPUs and host FPUs, as some supervisor features are supported
+ * for guests despite not being utilized by the host. User
+ * features and sizes are always identical, which allows for
+ * common guest and userspace ABI.
+ *
+ * For the host, set XFD to the kernel's desired initialization
+ * value. For guests, set XFD to its architectural RESET value.
+ */
+ if (fpstate->is_guest) {
+ fpstate->size = guest_default_cfg.size;
+ fpstate->xfeatures = guest_default_cfg.features;
+ fpstate->xfd = 0;
+ } else {
+ fpstate->size = fpu_kernel_cfg.default_size;
+ fpstate->xfeatures = fpu_kernel_cfg.default_features;
+ fpstate->xfd = init_fpstate.xfd;
+ }
+
fpstate->user_size = fpu_user_cfg.default_size;
- fpstate->xfeatures = fpu_kernel_cfg.default_features;
fpstate->user_xfeatures = fpu_user_cfg.default_features;
- fpstate->xfd = xfd;
}
void fpstate_reset(struct fpu *fpu)
{
/* Set the fpstate pointer to the default fpstate */
fpu->fpstate = &fpu->__fpstate;
- __fpstate_reset(fpu->fpstate, init_fpstate.xfd);
+ __fpstate_reset(fpu->fpstate);
/* Initialize the permission related info in fpu */
fpu->perm.__state_perm = fpu_kernel_cfg.default_features;
fpu->perm.__state_size = fpu_kernel_cfg.default_size;
fpu->perm.__user_state_size = fpu_user_cfg.default_size;
- /* Same defaults for guests */
- fpu->guest_perm = fpu->perm;
+
+ fpu->guest_perm.__state_perm = guest_default_cfg.features;
+ fpu->guest_perm.__state_size = guest_default_cfg.size;
+ /*
+ * User features and sizes are always identical between host and
+ * guest FPUs, which allows for common guest and userspace ABI.
+ */
+ fpu->guest_perm.__user_state_size = fpu_user_cfg.default_size;
}
static inline void fpu_inherit_perms(struct fpu *dst_fpu)
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 99db41bf9fa6..ff988b9ea39f 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -205,6 +205,7 @@ static void __init fpu__init_system_xstate_size_legacy(void)
fpu_kernel_cfg.default_size = size;
fpu_user_cfg.max_size = size;
fpu_user_cfg.default_size = size;
+ guest_default_cfg.size = size;
}
/*
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 9aa9ac8399ae..12ed75c1b567 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -57,7 +57,7 @@ static const char *xfeature_names[] =
"Protection Keys User registers",
"PASID state",
"Control-flow User registers",
- "Control-flow Kernel registers (unused)",
+ "Control-flow Kernel registers (KVM only)",
"unknown xstate feature",
"unknown xstate feature",
"unknown xstate feature",
@@ -81,6 +81,7 @@ static unsigned short xsave_cpuid_features[] __initdata = {
[XFEATURE_PKRU] = X86_FEATURE_OSPKE,
[XFEATURE_PASID] = X86_FEATURE_ENQCMD,
[XFEATURE_CET_USER] = X86_FEATURE_SHSTK,
+ [XFEATURE_CET_KERNEL] = X86_FEATURE_SHSTK,
[XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE,
[XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE,
[XFEATURE_APX] = X86_FEATURE_APX,
@@ -372,6 +373,7 @@ static __init void os_xrstor_booting(struct xregs_state *xstate)
XFEATURE_MASK_BNDCSR | \
XFEATURE_MASK_PASID | \
XFEATURE_MASK_CET_USER | \
+ XFEATURE_MASK_CET_KERNEL | \
XFEATURE_MASK_XTILE | \
XFEATURE_MASK_APX)
@@ -573,6 +575,7 @@ static bool __init check_xstate_against_struct(int nr)
case XFEATURE_PASID: return XCHECK_SZ(sz, nr, struct ia32_pasid_state);
case XFEATURE_XTILE_CFG: return XCHECK_SZ(sz, nr, struct xtile_cfg);
case XFEATURE_CET_USER: return XCHECK_SZ(sz, nr, struct cet_user_state);
+ case XFEATURE_CET_KERNEL: return XCHECK_SZ(sz, nr, struct cet_supervisor_state);
case XFEATURE_APX: return XCHECK_SZ(sz, nr, struct apx_state);
case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true;
default:
@@ -743,6 +746,9 @@ static int __init init_xstate_size(void)
fpu_user_cfg.default_size =
xstate_calculate_size(fpu_user_cfg.default_features, false);
+ guest_default_cfg.size =
+ xstate_calculate_size(guest_default_cfg.features, compacted);
+
return 0;
}
@@ -763,6 +769,7 @@ static void __init fpu__init_disable_system_xstate(unsigned int legacy_size)
fpu_kernel_cfg.default_size = legacy_size;
fpu_user_cfg.max_size = legacy_size;
fpu_user_cfg.default_size = legacy_size;
+ guest_default_cfg.size = legacy_size;
/*
* Prevent enabling the static branch which enables writes to the
@@ -773,6 +780,24 @@ static void __init fpu__init_disable_system_xstate(unsigned int legacy_size)
fpstate_reset(x86_task_fpu(current));
}
+static u64 __init host_default_mask(void)
+{
+ /*
+ * Exclude dynamic features (require userspace opt-in) and features
+ * that are supported only for KVM guests.
+ */
+ return ~((u64)XFEATURE_MASK_USER_DYNAMIC | XFEATURE_MASK_GUEST_SUPERVISOR);
+}
+
+static u64 __init guest_default_mask(void)
+{
+ /*
+ * Exclude dynamic features, which require userspace opt-in even
+ * for KVM guests.
+ */
+ return ~(u64)XFEATURE_MASK_USER_DYNAMIC;
+}
+
/*
* Enable and initialize the xsave feature.
* Called once per system bootup.
@@ -855,12 +880,13 @@ void __init fpu__init_system_xstate(unsigned int legacy_size)
fpu_user_cfg.max_features = fpu_kernel_cfg.max_features;
fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
- /* Clean out dynamic features from default */
- fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features;
- fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;
-
- fpu_user_cfg.default_features = fpu_user_cfg.max_features;
- fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;
+ /*
+ * Now, given maximum feature set, determine default values by
+ * applying default masks.
+ */
+ fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features & host_default_mask();
+ fpu_user_cfg.default_features = fpu_user_cfg.max_features & host_default_mask();
+ guest_default_cfg.features = fpu_kernel_cfg.max_features & guest_default_mask();
/* Store it for paranoia check at the end */
xfeatures = fpu_kernel_cfg.max_features;
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 6290dd120f5e..ff40f09ad911 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -33,8 +33,9 @@ void io_bitmap_share(struct task_struct *tsk)
set_tsk_thread_flag(tsk, TIF_IO_BITMAP);
}
-static void task_update_io_bitmap(struct task_struct *tsk)
+static void task_update_io_bitmap(void)
{
+ struct task_struct *tsk = current;
struct thread_struct *t = &tsk->thread;
if (t->iopl_emul == 3 || t->io_bitmap) {
@@ -54,7 +55,12 @@ void io_bitmap_exit(struct task_struct *tsk)
struct io_bitmap *iobm = tsk->thread.io_bitmap;
tsk->thread.io_bitmap = NULL;
- task_update_io_bitmap(tsk);
+ /*
+ * Don't touch the TSS when invoked on a failed fork(). TSS
+ * reflects the state of @current and not the state of @tsk.
+ */
+ if (tsk == current)
+ task_update_io_bitmap();
if (iobm && refcount_dec_and_test(&iobm->refcnt))
kfree(iobm);
}
@@ -192,8 +198,7 @@ SYSCALL_DEFINE1(iopl, unsigned int, level)
}
t->iopl_emul = level;
- task_update_io_bitmap(current);
-
+ task_update_io_bitmap();
return 0;
}
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 81f9b78e0f7b..9ed29ff10e59 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -380,61 +380,18 @@ void intel_posted_msi_init(void)
this_cpu_write(posted_msi_pi_desc.ndst, destination);
}
-/*
- * De-multiplexing posted interrupts is on the performance path, the code
- * below is written to optimize the cache performance based on the following
- * considerations:
- * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently
- * accessed by both CPU and IOMMU.
- * 2.During posted MSI processing, the CPU needs to do 64-bit read and xchg
- * for checking and clearing posted interrupt request (PIR), a 256 bit field
- * within the PID.
- * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache
- * line when posting interrupts and setting control bits.
- * 4.The CPU can access the cache line a magnitude faster than the IOMMU.
- * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID
- * cache line. The cache line states after each operation are as follows:
- * CPU IOMMU PID Cache line state
- * ---------------------------------------------------------------
- *...read64 exclusive
- *...lock xchg64 modified
- *... post/atomic swap invalid
- *...-------------------------------------------------------------
- *
- * To reduce L1 data cache miss, it is important to avoid contention with
- * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used
- * to dispatch interrupt handlers.
- *
- * In addition, the code is trying to keep the cache line state consistent
- * as much as possible. e.g. when making a copy and clearing the PIR
- * (assuming non-zero PIR bits are present in the entire PIR), it does:
- * read, read, read, read, xchg, xchg, xchg, xchg
- * instead of:
- * read, xchg, read, xchg, read, xchg, read, xchg
- */
-static __always_inline bool handle_pending_pir(u64 *pir, struct pt_regs *regs)
+static __always_inline bool handle_pending_pir(unsigned long *pir, struct pt_regs *regs)
{
- int i, vec = FIRST_EXTERNAL_VECTOR;
- unsigned long pir_copy[4];
- bool handled = false;
+ unsigned long pir_copy[NR_PIR_WORDS];
+ int vec = FIRST_EXTERNAL_VECTOR;
- for (i = 0; i < 4; i++)
- pir_copy[i] = pir[i];
-
- for (i = 0; i < 4; i++) {
- if (!pir_copy[i])
- continue;
+ if (!pi_harvest_pir(pir, pir_copy))
+ return false;
- pir_copy[i] = arch_xchg(&pir[i], 0);
- handled = true;
- }
-
- if (handled) {
- for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR)
- call_irq_handler(vec, regs);
- }
+ for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR)
+ call_irq_handler(vec, regs);
- return handled;
+ return true;
}
/*
@@ -464,7 +421,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification)
* MAX_POSTED_MSI_COALESCING_LOOP - 1 loops are executed here.
*/
while (++i < MAX_POSTED_MSI_COALESCING_LOOP) {
- if (!handle_pending_pir(pid->pir64, regs))
+ if (!handle_pending_pir(pid->pir, regs))
break;
}
@@ -479,7 +436,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification)
* process PIR bits one last time such that handling the new interrupts
* are not delayed until the next IRQ.
*/
- handle_pending_pir(pid->pir64, regs);
+ handle_pending_pir(pid->pir, regs);
apic_eoi();
irq_exit();
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
index 9cea1fc36c18..243a769fdd97 100644
--- a/arch/x86/kernel/itmt.c
+++ b/arch/x86/kernel/itmt.c
@@ -59,6 +59,18 @@ static ssize_t sched_itmt_enabled_write(struct file *filp,
return result;
}
+static int sched_core_priority_show(struct seq_file *s, void *unused)
+{
+ int cpu;
+
+ seq_puts(s, "CPU #\tPriority\n");
+ for_each_possible_cpu(cpu)
+ seq_printf(s, "%d\t%d\n", cpu, arch_asym_cpu_priority(cpu));
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(sched_core_priority);
+
static const struct file_operations dfs_sched_itmt_fops = {
.read = debugfs_read_file_bool,
.write = sched_itmt_enabled_write,
@@ -67,6 +79,7 @@ static const struct file_operations dfs_sched_itmt_fops = {
};
static struct dentry *dfs_sched_itmt;
+static struct dentry *dfs_sched_core_prio;
/**
* sched_set_itmt_support() - Indicate platform supports ITMT
@@ -102,6 +115,14 @@ int sched_set_itmt_support(void)
return -ENOMEM;
}
+ dfs_sched_core_prio = debugfs_create_file("sched_core_priority", 0644,
+ arch_debugfs_dir, NULL,
+ &sched_core_priority_fops);
+ if (IS_ERR_OR_NULL(dfs_sched_core_prio)) {
+ dfs_sched_core_prio = NULL;
+ return -ENOMEM;
+ }
+
sched_itmt_capable = true;
sysctl_sched_itmt_enabled = 1;
@@ -133,6 +154,8 @@ void sched_clear_itmt_support(void)
debugfs_remove(dfs_sched_itmt);
dfs_sched_itmt = NULL;
+ debugfs_remove(dfs_sched_core_prio);
+ dfs_sched_core_prio = NULL;
if (sysctl_sched_itmt_enabled) {
/* disable sched_itmt if we are no longer ITMT capable */
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 68530fad05f7..24a41f0e0cf1 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -27,6 +27,8 @@
#include <asm/kexec-bzimage64.h>
#define MAX_ELFCOREHDR_STR_LEN 30 /* elfcorehdr=0x<64bit-value> */
+#define MAX_DMCRYPTKEYS_STR_LEN 31 /* dmcryptkeys=0x<64bit-value> */
+
/*
* Defines lowest physical address for various segments. Not sure where
@@ -76,6 +78,10 @@ static int setup_cmdline(struct kimage *image, struct boot_params *params,
if (image->type == KEXEC_TYPE_CRASH) {
len = sprintf(cmdline_ptr,
"elfcorehdr=0x%lx ", image->elf_load_addr);
+
+ if (image->dm_crypt_keys_addr != 0)
+ len += sprintf(cmdline_ptr + len,
+ "dmcryptkeys=0x%lx ", image->dm_crypt_keys_addr);
}
memcpy(cmdline_ptr + len, cmdline, cmdline_len);
cmdline_len += len;
@@ -233,6 +239,32 @@ setup_ima_state(const struct kimage *image, struct boot_params *params,
#endif /* CONFIG_IMA_KEXEC */
}
+static void setup_kho(const struct kimage *image, struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int setup_data_offset)
+{
+ struct setup_data *sd = (void *)params + setup_data_offset;
+ struct kho_data *kho = (void *)sd + sizeof(*sd);
+
+ if (!IS_ENABLED(CONFIG_KEXEC_HANDOVER))
+ return;
+
+ sd->type = SETUP_KEXEC_KHO;
+ sd->len = sizeof(struct kho_data);
+
+ /* Only add if we have all KHO images in place */
+ if (!image->kho.fdt || !image->kho.scratch)
+ return;
+
+ /* Add setup data */
+ kho->fdt_addr = image->kho.fdt;
+ kho->fdt_size = PAGE_SIZE;
+ kho->scratch_addr = image->kho.scratch->mem;
+ kho->scratch_size = image->kho.scratch->bufsz;
+ sd->next = params->hdr.setup_data;
+ params->hdr.setup_data = params_load_addr + setup_data_offset;
+}
+
static int
setup_boot_parameters(struct kimage *image, struct boot_params *params,
unsigned long params_load_addr,
@@ -312,6 +344,13 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params,
sizeof(struct ima_setup_data);
}
+ if (IS_ENABLED(CONFIG_KEXEC_HANDOVER)) {
+ /* Setup space to store preservation metadata */
+ setup_kho(image, params, params_load_addr, setup_data_offset);
+ setup_data_offset += sizeof(struct setup_data) +
+ sizeof(struct kho_data);
+ }
+
/* Setup RNG seed */
setup_rng_seed(params, params_load_addr, setup_data_offset);
@@ -441,6 +480,19 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
ret = crash_load_segments(image);
if (ret)
return ERR_PTR(ret);
+ ret = crash_load_dm_crypt_keys(image);
+ if (ret == -ENOENT) {
+ kexec_dprintk("No dm crypt key to load\n");
+ } else if (ret) {
+ pr_err("Failed to load dm crypt keys\n");
+ return ERR_PTR(ret);
+ }
+ if (image->dm_crypt_keys_addr &&
+ cmdline_len + MAX_ELFCOREHDR_STR_LEN + MAX_DMCRYPTKEYS_STR_LEN >
+ header->cmdline_size) {
+ pr_err("Appending dmcryptkeys=<addr> to command line exceeds maximum allowed length\n");
+ return ERR_PTR(-EINVAL);
+ }
}
#endif
@@ -468,6 +520,8 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
efi_map_sz = efi_get_runtime_map_size();
params_cmdline_sz = sizeof(struct boot_params) + cmdline_len +
MAX_ELFCOREHDR_STR_LEN;
+ if (image->dm_crypt_keys_addr)
+ params_cmdline_sz += MAX_DMCRYPTKEYS_STR_LEN;
params_cmdline_sz = ALIGN(params_cmdline_sz, 16);
kbuf.bufsz = params_cmdline_sz + ALIGN(efi_map_sz, 16) +
sizeof(struct setup_data) +
@@ -479,6 +533,10 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
kbuf.bufsz += sizeof(struct setup_data) +
sizeof(struct ima_setup_data);
+ if (IS_ENABLED(CONFIG_KEXEC_HANDOVER))
+ kbuf.bufsz += sizeof(struct setup_data) +
+ sizeof(struct kho_data);
+
params = kzalloc(kbuf.bufsz, GFP_KERNEL);
if (!params)
return ERR_PTR(-ENOMEM);
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 102641fd2172..8b1a9733d13e 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -385,7 +385,7 @@ static void kgdb_disable_hw_debug(struct pt_regs *regs)
struct perf_event *bp;
/* Disable hardware debugging while we are in kgdb: */
- set_debugreg(0UL, 7);
+ set_debugreg(DR7_FIXED_1, 7);
for (i = 0; i < HBP_NUM; i++) {
if (!breakinfo[i].enabled)
continue;
diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c
index b68d4be9464e..d547de9b3ed8 100644
--- a/arch/x86/kernel/ksysfs.c
+++ b/arch/x86/kernel/ksysfs.c
@@ -40,7 +40,7 @@ static const struct bin_attribute boot_params_data_attr = {
.name = "data",
.mode = S_IRUGO,
},
- .read_new = boot_params_data_read,
+ .read = boot_params_data_read,
.size = sizeof(boot_params),
};
@@ -56,7 +56,7 @@ static const struct bin_attribute *const boot_params_data_attrs[] = {
static const struct attribute_group boot_params_attr_group = {
.attrs = boot_params_version_attrs,
- .bin_attrs_new = boot_params_data_attrs,
+ .bin_attrs = boot_params_data_attrs,
};
static int kobj_to_setup_data_nr(struct kobject *kobj, int *nr)
@@ -250,7 +250,7 @@ static struct bin_attribute data_attr __ro_after_init = {
.name = "data",
.mode = S_IRUGO,
},
- .read_new = setup_data_data_read,
+ .read = setup_data_data_read,
};
static struct attribute *setup_data_type_attrs[] = {
@@ -265,7 +265,7 @@ static const struct bin_attribute *const setup_data_data_attrs[] = {
static const struct attribute_group setup_data_attr_group = {
.attrs = setup_data_type_attrs,
- .bin_attrs_new = setup_data_data_attrs,
+ .bin_attrs = setup_data_data_attrs,
};
static int __init create_setup_data_node(struct kobject *parent,
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 921c1c783bc1..8ae750cde0c6 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -420,7 +420,7 @@ static u64 kvm_steal_clock(int cpu)
return steal;
}
-static inline void __set_percpu_decrypted(void *ptr, unsigned long size)
+static inline __init void __set_percpu_decrypted(void *ptr, unsigned long size)
{
early_set_memory_decrypted((unsigned long) ptr, size);
}
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 949c9e4bfad2..697fb99406e6 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -630,13 +630,35 @@ static void kexec_mark_crashkres(bool protect)
kexec_mark_range(control, crashk_res.end, protect);
}
+/* make the memory storing dm crypt keys in/accessible */
+static void kexec_mark_dm_crypt_keys(bool protect)
+{
+ unsigned long start_paddr, end_paddr;
+ unsigned int nr_pages;
+
+ if (kexec_crash_image->dm_crypt_keys_addr) {
+ start_paddr = kexec_crash_image->dm_crypt_keys_addr;
+ end_paddr = start_paddr + kexec_crash_image->dm_crypt_keys_sz - 1;
+ nr_pages = (PAGE_ALIGN(end_paddr) - PAGE_ALIGN_DOWN(start_paddr))/PAGE_SIZE;
+ if (protect)
+ set_memory_np((unsigned long)phys_to_virt(start_paddr), nr_pages);
+ else
+ __set_memory_prot(
+ (unsigned long)phys_to_virt(start_paddr),
+ nr_pages,
+ __pgprot(_PAGE_PRESENT | _PAGE_NX | _PAGE_RW));
+ }
+}
+
void arch_kexec_protect_crashkres(void)
{
kexec_mark_crashkres(true);
+ kexec_mark_dm_crypt_keys(true);
}
void arch_kexec_unprotect_crashkres(void)
{
+ kexec_mark_dm_crypt_keys(false);
kexec_mark_crashkres(false);
}
#endif
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index c1d2dac72b9c..1b7960cf6eb0 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -176,6 +176,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
frame->ret_addr = (unsigned long) ret_from_fork_asm;
p->thread.sp = (unsigned long) fork_frame;
p->thread.io_bitmap = NULL;
+ clear_tsk_thread_flag(p, TIF_IO_BITMAP);
p->thread.iopl_warn = 0;
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
@@ -333,13 +334,21 @@ DEFINE_PER_CPU(u64, msr_misc_features_shadow);
static void set_cpuid_faulting(bool on)
{
- u64 msrval;
- msrval = this_cpu_read(msr_misc_features_shadow);
- msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT;
- msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT);
- this_cpu_write(msr_misc_features_shadow, msrval);
- wrmsrq(MSR_MISC_FEATURES_ENABLES, msrval);
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+ u64 msrval;
+
+ msrval = this_cpu_read(msr_misc_features_shadow);
+ msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT;
+ msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT);
+ this_cpu_write(msr_misc_features_shadow, msrval);
+ wrmsrq(MSR_MISC_FEATURES_ENABLES, msrval);
+ } else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+ if (on)
+ msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_CPUID_USER_DIS_BIT);
+ else
+ msr_clear_bit(MSR_K7_HWCR, MSR_K7_HWCR_CPUID_USER_DIS_BIT);
+ }
}
static void disable_cpuid(void)
@@ -464,6 +473,11 @@ void native_tss_update_io_bitmap(void)
} else {
struct io_bitmap *iobm = t->io_bitmap;
+ if (WARN_ON_ONCE(!iobm)) {
+ clear_thread_flag(TIF_IO_BITMAP);
+ native_tss_invalidate_io_bitmap();
+ }
+
/*
* Only copy bitmap data when the sequence number differs. The
* update time is accounted to the incoming task.
@@ -901,16 +915,24 @@ static __init bool prefer_mwait_c1_over_halt(void)
*/
static __cpuidle void mwait_idle(void)
{
+ if (need_resched())
+ return;
+
+ x86_idle_clear_cpu_buffers();
+
if (!current_set_polling_and_test()) {
const void *addr = &current_thread_info()->flags;
alternative_input("", "clflush (%[addr])", X86_BUG_CLFLUSH_MONITOR, [addr] "a" (addr));
__monitor(addr, 0, 0);
- if (!need_resched()) {
- __sti_mwait(0, 0);
- raw_local_irq_disable();
- }
+ if (need_resched())
+ goto out;
+
+ __sti_mwait(0, 0);
+ raw_local_irq_disable();
}
+
+out:
__current_clr_polling();
}
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 9bd4fa694da5..3ef15c2f152f 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -93,7 +93,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
/* Only print out debug registers if they are in their non-default state. */
if ((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
- (d6 == DR6_RESERVED) && (d7 == 0x400))
+ (d6 == DR6_RESERVED) && (d7 == DR7_FIXED_1))
return;
printk("%sDR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
@@ -208,7 +208,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
raw_cpu_write(current_task, next_p);
/* Load the Intel cache allocation PQR MSR. */
- resctrl_sched_in(next_p);
+ resctrl_arch_sched_in(next_p);
return prev_p;
}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index f39ff02e498d..52a5c03c353c 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -133,7 +133,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
/* Only print out debug registers if they are in their non-default state. */
if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
- (d6 == DR6_RESERVED) && (d7 == 0x400))) {
+ (d6 == DR6_RESERVED) && (d7 == DR7_FIXED_1))) {
printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n",
log_lvl, d0, d1, d2);
printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n",
@@ -705,7 +705,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
}
/* Load the Intel cache allocation PQR MSR. */
- resctrl_sched_in(next_p);
+ resctrl_arch_sched_in(next_p);
+
+ /* Reset hw history on AMD CPUs */
+ if (cpu_feature_enabled(X86_FEATURE_AMD_WORKLOAD_CLASS))
+ wrmsrl(MSR_AMD_WORKLOAD_HRST, 0x1);
return prev_p;
}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 095f04bdabdc..3dcadc13f09a 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1236,7 +1236,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
static struct user_regset x86_64_regsets[] __ro_after_init = {
[REGSET64_GENERAL] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = sizeof(struct user_regs_struct) / sizeof(long),
.size = sizeof(long),
.align = sizeof(long),
@@ -1244,7 +1244,7 @@ static struct user_regset x86_64_regsets[] __ro_after_init = {
.set = genregs_set
},
[REGSET64_FP] = {
- .core_note_type = NT_PRFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG),
.n = sizeof(struct fxregs_state) / sizeof(long),
.size = sizeof(long),
.align = sizeof(long),
@@ -1253,7 +1253,7 @@ static struct user_regset x86_64_regsets[] __ro_after_init = {
.set = xfpregs_set
},
[REGSET64_XSTATE] = {
- .core_note_type = NT_X86_XSTATE,
+ USER_REGSET_NOTE_TYPE(X86_XSTATE),
.size = sizeof(u64),
.align = sizeof(u64),
.active = xstateregs_active,
@@ -1261,7 +1261,7 @@ static struct user_regset x86_64_regsets[] __ro_after_init = {
.set = xstateregs_set
},
[REGSET64_IOPERM] = {
- .core_note_type = NT_386_IOPERM,
+ USER_REGSET_NOTE_TYPE(386_IOPERM),
.n = IO_BITMAP_LONGS,
.size = sizeof(long),
.align = sizeof(long),
@@ -1270,7 +1270,7 @@ static struct user_regset x86_64_regsets[] __ro_after_init = {
},
#ifdef CONFIG_X86_USER_SHADOW_STACK
[REGSET64_SSP] = {
- .core_note_type = NT_X86_SHSTK,
+ USER_REGSET_NOTE_TYPE(X86_SHSTK),
.n = 1,
.size = sizeof(u64),
.align = sizeof(u64),
@@ -1297,7 +1297,7 @@ static const struct user_regset_view user_x86_64_view = {
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
static struct user_regset x86_32_regsets[] __ro_after_init = {
[REGSET32_GENERAL] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = sizeof(struct user_regs_struct32) / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
@@ -1305,7 +1305,7 @@ static struct user_regset x86_32_regsets[] __ro_after_init = {
.set = genregs32_set
},
[REGSET32_FP] = {
- .core_note_type = NT_PRFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG),
.n = sizeof(struct user_i387_ia32_struct) / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
@@ -1314,7 +1314,7 @@ static struct user_regset x86_32_regsets[] __ro_after_init = {
.set = fpregs_set
},
[REGSET32_XFP] = {
- .core_note_type = NT_PRXFPREG,
+ USER_REGSET_NOTE_TYPE(PRXFPREG),
.n = sizeof(struct fxregs_state) / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
@@ -1323,7 +1323,7 @@ static struct user_regset x86_32_regsets[] __ro_after_init = {
.set = xfpregs_set
},
[REGSET32_XSTATE] = {
- .core_note_type = NT_X86_XSTATE,
+ USER_REGSET_NOTE_TYPE(X86_XSTATE),
.size = sizeof(u64),
.align = sizeof(u64),
.active = xstateregs_active,
@@ -1331,7 +1331,7 @@ static struct user_regset x86_32_regsets[] __ro_after_init = {
.set = xstateregs_set
},
[REGSET32_TLS] = {
- .core_note_type = NT_386_TLS,
+ USER_REGSET_NOTE_TYPE(386_TLS),
.n = GDT_ENTRY_TLS_ENTRIES,
.bias = GDT_ENTRY_TLS_MIN,
.size = sizeof(struct user_desc),
@@ -1341,7 +1341,7 @@ static struct user_regset x86_32_regsets[] __ro_after_init = {
.set = regset_tls_set
},
[REGSET32_IOPERM] = {
- .core_note_type = NT_386_IOPERM,
+ USER_REGSET_NOTE_TYPE(386_IOPERM),
.n = IO_BITMAP_BYTES / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 7d9ed79a93c0..0792f31961ac 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -213,8 +213,10 @@ arch_initcall(init_x86_sysctl);
*/
struct screen_info screen_info;
EXPORT_SYMBOL(screen_info);
+#if defined(CONFIG_FIRMWARE_EDID)
struct edid_info edid_info;
EXPORT_SYMBOL_GPL(edid_info);
+#endif
extern int root_mountflags;
@@ -282,8 +284,8 @@ static void __init cleanup_highmap(void)
static void __init reserve_brk(void)
{
if (_brk_end > _brk_start)
- memblock_reserve(__pa_symbol(_brk_start),
- _brk_end - _brk_start);
+ memblock_reserve_kern(__pa_symbol(_brk_start),
+ _brk_end - _brk_start);
/* Mark brk area as locked down and no longer taking any
new allocations */
@@ -356,7 +358,7 @@ static void __init early_reserve_initrd(void)
!ramdisk_image || !ramdisk_size)
return; /* No initrd provided by bootloader */
- memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
+ memblock_reserve_kern(ramdisk_image, ramdisk_end - ramdisk_image);
}
static void __init reserve_initrd(void)
@@ -409,7 +411,7 @@ static void __init add_early_ima_buffer(u64 phys_addr)
}
if (data->size) {
- memblock_reserve(data->addr, data->size);
+ memblock_reserve_kern(data->addr, data->size);
ima_kexec_buffer_phys = data->addr;
ima_kexec_buffer_size = data->size;
}
@@ -447,6 +449,29 @@ int __init ima_get_kexec_buffer(void **addr, size_t *size)
}
#endif
+static void __init add_kho(u64 phys_addr, u32 data_len)
+{
+ struct kho_data *kho;
+ u64 addr = phys_addr + sizeof(struct setup_data);
+ u64 size = data_len - sizeof(struct setup_data);
+
+ if (!IS_ENABLED(CONFIG_KEXEC_HANDOVER)) {
+ pr_warn("Passed KHO data, but CONFIG_KEXEC_HANDOVER not set. Ignoring.\n");
+ return;
+ }
+
+ kho = early_memremap(addr, size);
+ if (!kho) {
+ pr_warn("setup: failed to memremap kho data (0x%llx, 0x%llx)\n",
+ addr, size);
+ return;
+ }
+
+ kho_populate(kho->fdt_addr, kho->fdt_size, kho->scratch_addr, kho->scratch_size);
+
+ early_memunmap(kho, size);
+}
+
static void __init parse_setup_data(void)
{
struct setup_data *data;
@@ -475,6 +500,9 @@ static void __init parse_setup_data(void)
case SETUP_IMA:
add_early_ima_buffer(pa_data);
break;
+ case SETUP_KEXEC_KHO:
+ add_kho(pa_data, data_len);
+ break;
case SETUP_RNG_SEED:
data = early_memremap(pa_data, data_len);
add_bootloader_randomness(data->data, data->len);
@@ -499,7 +527,9 @@ static void __init parse_boot_params(void)
{
ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
screen_info = boot_params.screen_info;
+#if defined(CONFIG_FIRMWARE_EDID)
edid_info = boot_params.edid_info;
+#endif
#ifdef CONFIG_X86_32
apm_info.bios = boot_params.apm_bios_info;
ist_info = boot_params.ist_info;
@@ -549,7 +579,7 @@ static void __init memblock_x86_reserve_range_setup_data(void)
len = sizeof(*data);
pa_next = data->next;
- memblock_reserve(pa_data, sizeof(*data) + data->len);
+ memblock_reserve_kern(pa_data, sizeof(*data) + data->len);
if (data->type == SETUP_INDIRECT) {
len += data->len;
@@ -563,7 +593,7 @@ static void __init memblock_x86_reserve_range_setup_data(void)
indirect = (struct setup_indirect *)data->data;
if (indirect->type != SETUP_INDIRECT)
- memblock_reserve(indirect->addr, indirect->len);
+ memblock_reserve_kern(indirect->addr, indirect->len);
}
pa_data = pa_next;
@@ -766,8 +796,8 @@ static void __init early_reserve_memory(void)
* __end_of_kernel_reserve symbol must be explicitly reserved with a
* separate memblock_reserve() or they will be discarded.
*/
- memblock_reserve(__pa_symbol(_text),
- (unsigned long)__end_of_kernel_reserve - (unsigned long)_text);
+ memblock_reserve_kern(__pa_symbol(_text),
+ (unsigned long)__end_of_kernel_reserve - (unsigned long)_text);
/*
* The first 4Kb of memory is a BIOS owned area, but generally it is
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index 98123ff10506..42bbc42bd350 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -152,6 +152,8 @@ SYSCALL32_DEFINE0(sigreturn)
struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8);
sigset_t set;
+ prevent_single_step_upon_eretu(regs);
+
if (!access_ok(frame, sizeof(*frame)))
goto badframe;
if (__get_user(set.sig[0], &frame->sc.oldmask)
@@ -175,6 +177,8 @@ SYSCALL32_DEFINE0(rt_sigreturn)
struct rt_sigframe_ia32 __user *frame;
sigset_t set;
+ prevent_single_step_upon_eretu(regs);
+
frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4);
if (!access_ok(frame, sizeof(*frame)))
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index ee9453891901..d483b585c6c6 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -250,6 +250,8 @@ SYSCALL_DEFINE0(rt_sigreturn)
sigset_t set;
unsigned long uc_flags;
+ prevent_single_step_upon_eretu(regs);
+
frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
if (!access_ok(frame, sizeof(*frame)))
goto badframe;
@@ -366,6 +368,8 @@ COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn)
sigset_t set;
unsigned long uc_flags;
+ prevent_single_step_upon_eretu(regs);
+
frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8);
if (!access_ok(frame, sizeof(*frame)))
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 18266cc3d98c..b014e6d229f9 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -299,3 +299,27 @@ struct smp_ops smp_ops = {
.send_call_func_single_ipi = native_send_call_func_single_ipi,
};
EXPORT_SYMBOL_GPL(smp_ops);
+
+int arch_cpu_rescan_dead_smt_siblings(void)
+{
+ enum cpuhp_smt_control old = cpu_smt_control;
+ int ret;
+
+ /*
+ * If SMT has been disabled and SMT siblings are in HLT, bring them back
+ * online and offline them again so that they end up in MWAIT proper.
+ *
+ * Called with hotplug enabled.
+ */
+ if (old != CPU_SMT_DISABLED && old != CPU_SMT_FORCE_DISABLED)
+ return 0;
+
+ ret = cpuhp_smt_enable();
+ if (ret)
+ return ret;
+
+ ret = cpuhp_smt_disable(old);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(arch_cpu_rescan_dead_smt_siblings);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index b90d872aa0c8..33e166f6ab12 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -478,44 +478,41 @@ static int x86_cluster_flags(void)
*/
static bool x86_has_numa_in_package;
-static struct sched_domain_topology_level x86_topology[6];
-
-static void __init build_sched_topology(void)
-{
- int i = 0;
-
-#ifdef CONFIG_SCHED_SMT
- x86_topology[i++] = (struct sched_domain_topology_level){
- cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT)
- };
-#endif
+static struct sched_domain_topology_level x86_topology[] = {
+ SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT),
#ifdef CONFIG_SCHED_CLUSTER
- x86_topology[i++] = (struct sched_domain_topology_level){
- cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS)
- };
+ SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS),
#endif
#ifdef CONFIG_SCHED_MC
- x86_topology[i++] = (struct sched_domain_topology_level){
- cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC)
- };
+ SDTL_INIT(cpu_coregroup_mask, x86_core_flags, MC),
#endif
+ SDTL_INIT(cpu_cpu_mask, x86_sched_itmt_flags, PKG),
+ { NULL },
+};
+
+static void __init build_sched_topology(void)
+{
+ struct sched_domain_topology_level *topology = x86_topology;
+
/*
- * When there is NUMA topology inside the package skip the PKG domain
- * since the NUMA domains will auto-magically create the right spanning
- * domains based on the SLIT.
+ * When there is NUMA topology inside the package invalidate the
+ * PKG domain since the NUMA domains will auto-magically create the
+ * right spanning domains based on the SLIT.
*/
- if (!x86_has_numa_in_package) {
- x86_topology[i++] = (struct sched_domain_topology_level){
- cpu_cpu_mask, x86_sched_itmt_flags, SD_INIT_NAME(PKG)
- };
+ if (x86_has_numa_in_package) {
+ unsigned int pkgdom = ARRAY_SIZE(x86_topology) - 2;
+
+ memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom]));
}
/*
- * There must be one trailing NULL entry left.
+ * Drop the SMT domains if there is only one thread per-core
+ * since it'll get degenerated by the scheduler anyways.
*/
- BUG_ON(i >= ARRAY_SIZE(x86_topology)-1);
+ if (cpu_smt_num_threads <= 1)
+ ++topology;
- set_sched_topology(x86_topology);
+ set_sched_topology(topology);
}
void set_cpu_sibling_map(int cpu)
@@ -695,7 +692,7 @@ static void send_init_sequence(u32 phys_apicid)
/*
* Wake up AP by INIT, INIT, STARTUP sequence.
*/
-static int wakeup_secondary_cpu_via_init(u32 phys_apicid, unsigned long start_eip)
+static int wakeup_secondary_cpu_via_init(u32 phys_apicid, unsigned long start_eip, unsigned int cpu)
{
unsigned long send_status = 0, accept_status = 0;
int num_starts, j, maxlvt;
@@ -842,7 +839,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle)
* Returns zero if startup was successfully sent, else error code from
* ->wakeup_secondary_cpu.
*/
-static int do_boot_cpu(u32 apicid, int cpu, struct task_struct *idle)
+static int do_boot_cpu(u32 apicid, unsigned int cpu, struct task_struct *idle)
{
unsigned long start_ip = real_mode_header->trampoline_start;
int ret;
@@ -896,11 +893,11 @@ static int do_boot_cpu(u32 apicid, int cpu, struct task_struct *idle)
* - Use an INIT boot APIC message
*/
if (apic->wakeup_secondary_cpu_64)
- ret = apic->wakeup_secondary_cpu_64(apicid, start_ip);
+ ret = apic->wakeup_secondary_cpu_64(apicid, start_ip, cpu);
else if (apic->wakeup_secondary_cpu)
- ret = apic->wakeup_secondary_cpu(apicid, start_ip);
+ ret = apic->wakeup_secondary_cpu(apicid, start_ip, cpu);
else
- ret = wakeup_secondary_cpu_via_init(apicid, start_ip);
+ ret = wakeup_secondary_cpu_via_init(apicid, start_ip, cpu);
/* If the wakeup mechanism failed, cleanup the warm reset vector */
if (ret)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 94c0236963c6..36354b470590 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -352,7 +352,7 @@ static noinstr bool handle_bug(struct pt_regs *regs)
case BUG_UD1_UBSAN:
if (IS_ENABLED(CONFIG_UBSAN_TRAP)) {
pr_crit("%s at %pS\n",
- report_ubsan_failure(regs, ud_imm),
+ report_ubsan_failure(ud_imm),
(void *)regs->ip);
}
break;
@@ -1022,24 +1022,32 @@ static bool is_sysenter_singlestep(struct pt_regs *regs)
#endif
}
-static __always_inline unsigned long debug_read_clear_dr6(void)
+static __always_inline unsigned long debug_read_reset_dr6(void)
{
unsigned long dr6;
+ get_debugreg(dr6, 6);
+ dr6 ^= DR6_RESERVED; /* Flip to positive polarity */
+
/*
* The Intel SDM says:
*
- * Certain debug exceptions may clear bits 0-3. The remaining
- * contents of the DR6 register are never cleared by the
- * processor. To avoid confusion in identifying debug
- * exceptions, debug handlers should clear the register before
- * returning to the interrupted task.
+ * Certain debug exceptions may clear bits 0-3 of DR6.
+ *
+ * BLD induced #DB clears DR6.BLD and any other debug
+ * exception doesn't modify DR6.BLD.
*
- * Keep it simple: clear DR6 immediately.
+ * RTM induced #DB clears DR6.RTM and any other debug
+ * exception sets DR6.RTM.
+ *
+ * To avoid confusion in identifying debug exceptions,
+ * debug handlers should set DR6.BLD and DR6.RTM, and
+ * clear other DR6 bits before returning.
+ *
+ * Keep it simple: write DR6 with its architectural reset
+ * value 0xFFFF0FF0, defined as DR6_RESERVED, immediately.
*/
- get_debugreg(dr6, 6);
set_debugreg(DR6_RESERVED, 6);
- dr6 ^= DR6_RESERVED; /* Flip to positive polarity */
return dr6;
}
@@ -1239,13 +1247,13 @@ out:
/* IST stack entry */
DEFINE_IDTENTRY_DEBUG(exc_debug)
{
- exc_debug_kernel(regs, debug_read_clear_dr6());
+ exc_debug_kernel(regs, debug_read_reset_dr6());
}
/* User entry, runs on regular task stack */
DEFINE_IDTENTRY_DEBUG_USER(exc_debug)
{
- exc_debug_user(regs, debug_read_clear_dr6());
+ exc_debug_user(regs, debug_read_reset_dr6());
}
#ifdef CONFIG_X86_FRED
@@ -1264,7 +1272,7 @@ DEFINE_FREDENTRY_DEBUG(exc_debug)
{
/*
* FRED #DB stores DR6 on the stack in the format which
- * debug_read_clear_dr6() returns for the IDT entry points.
+ * debug_read_reset_dr6() returns for the IDT entry points.
*/
unsigned long dr6 = fred_event_data(regs);
@@ -1279,7 +1287,7 @@ DEFINE_FREDENTRY_DEBUG(exc_debug)
/* 32 bit does not have separate entry points. */
DEFINE_IDTENTRY_RAW(exc_debug)
{
- unsigned long dr6 = debug_read_clear_dr6();
+ unsigned long dr6 = debug_read_reset_dr6();
if (user_mode(regs))
exc_debug_user(regs, dr6);
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index fe8ea8c097de..2c86673155c9 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -95,6 +95,8 @@ config KVM_SW_PROTECTED_VM
config KVM_INTEL
tristate "KVM for Intel (and compatible) processors support"
depends on KVM && IA32_FEAT_CTL
+ select KVM_GENERIC_PRIVATE_MEM if INTEL_TDX_HOST
+ select KVM_GENERIC_MEMORY_ATTRIBUTES if INTEL_TDX_HOST
help
Provides support for KVM on processors equipped with Intel's VT
extensions, a.k.a. Virtual Machine Extensions (VMX).
@@ -129,6 +131,16 @@ config X86_SGX_KVM
If unsure, say N.
+config KVM_INTEL_TDX
+ bool "Intel Trust Domain Extensions (TDX) support"
+ default y
+ depends on INTEL_TDX_HOST
+ help
+ Provides support for launching Intel Trust Domain Extensions (TDX)
+ confidential VMs on Intel processors.
+
+ If unsure, say N.
+
config KVM_AMD
tristate "KVM for AMD processors support"
depends on KVM && (CPU_SUP_AMD || CPU_SUP_HYGON)
@@ -154,6 +166,16 @@ config KVM_AMD_SEV
Encrypted State (SEV-ES), and Secure Encrypted Virtualization with
Secure Nested Paging (SEV-SNP) technologies on AMD processors.
+config KVM_IOAPIC
+ bool "I/O APIC, PIC, and PIT emulation"
+ default y
+ depends on KVM
+ help
+ Provides support for KVM to emulate an I/O APIC, PIC, and PIT, i.e.
+ for full in-kernel APIC emulation.
+
+ If unsure, say Y.
+
config KVM_SMM
bool "System Management Mode emulation"
default y
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index f9dddb8cb466..c4b8950c7abe 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -5,12 +5,11 @@ ccflags-$(CONFIG_KVM_WERROR) += -Werror
include $(srctree)/virt/kvm/Makefile.kvm
-kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \
- i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
- debugfs.o mmu/mmu.o mmu/page_track.o \
- mmu/spte.o
+kvm-y += x86.o emulate.o irq.o lapic.o cpuid.o pmu.o mtrr.o \
+ debugfs.o mmu/mmu.o mmu/page_track.o mmu/spte.o
kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o
+kvm-$(CONFIG_KVM_IOAPIC) += i8259.o i8254.o ioapic.o
kvm-$(CONFIG_KVM_HYPERV) += hyperv.o
kvm-$(CONFIG_KVM_XEN) += xen.o
kvm-$(CONFIG_KVM_SMM) += smm.o
@@ -20,6 +19,7 @@ kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o
kvm-intel-$(CONFIG_KVM_HYPERV) += vmx/hyperv.o vmx/hyperv_evmcs.o
+kvm-intel-$(CONFIG_KVM_INTEL_TDX) += vmx/tdx.o
kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index ecd85f4801cc..e2836a255b16 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -81,17 +81,8 @@ u32 xstate_required_size(u64 xstate_bv, bool compacted)
return ret;
}
-/*
- * Magic value used by KVM when querying userspace-provided CPUID entries and
- * doesn't care about the CPIUD index because the index of the function in
- * question is not significant. Note, this magic value must have at least one
- * bit set in bits[63:32] and must be consumed as a u64 by cpuid_entry2_find()
- * to avoid false positives when processing guest CPUID input.
- */
-#define KVM_CPUID_INDEX_NOT_SIGNIFICANT -1ull
-
-static struct kvm_cpuid_entry2 *cpuid_entry2_find(struct kvm_vcpu *vcpu,
- u32 function, u64 index)
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2(
+ struct kvm_cpuid_entry2 *entries, int nent, u32 function, u64 index)
{
struct kvm_cpuid_entry2 *e;
int i;
@@ -108,8 +99,8 @@ static struct kvm_cpuid_entry2 *cpuid_entry2_find(struct kvm_vcpu *vcpu,
*/
lockdep_assert_irqs_enabled();
- for (i = 0; i < vcpu->arch.cpuid_nent; i++) {
- e = &vcpu->arch.cpuid_entries[i];
+ for (i = 0; i < nent; i++) {
+ e = &entries[i];
if (e->function != function)
continue;
@@ -140,26 +131,7 @@ static struct kvm_cpuid_entry2 *cpuid_entry2_find(struct kvm_vcpu *vcpu,
return NULL;
}
-
-struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu,
- u32 function, u32 index)
-{
- return cpuid_entry2_find(vcpu, function, index);
-}
-EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry_index);
-
-struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
- u32 function)
-{
- return cpuid_entry2_find(vcpu, function, KVM_CPUID_INDEX_NOT_SIGNIFICANT);
-}
-EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
-
-/*
- * cpuid_entry2_find() and KVM_CPUID_INDEX_NOT_SIGNIFICANT should never be used
- * directly outside of kvm_find_cpuid_entry() and kvm_find_cpuid_entry_index().
- */
-#undef KVM_CPUID_INDEX_NOT_SIGNIFICANT
+EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry2);
static int kvm_check_cpuid(struct kvm_vcpu *vcpu)
{
@@ -492,6 +464,20 @@ not_found:
return 36;
}
+int cpuid_query_maxguestphyaddr(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x80000000);
+ if (!best || best->eax < 0x80000008)
+ goto not_found;
+ best = kvm_find_cpuid_entry(vcpu, 0x80000008);
+ if (best)
+ return (best->eax >> 16) & 0xff;
+not_found:
+ return 0;
+}
+
/*
* This "raw" version returns the reserved GPA bits without any adjustments for
* encryption technologies that usurp bits. The raw mask should be used if and
@@ -992,6 +978,8 @@ void kvm_set_cpu_caps(void)
F(FZRM),
F(FSRS),
F(FSRC),
+ F(WRMSRNS),
+ X86_64_F(LKGS),
F(AMX_FP16),
F(AVX_IFMA),
F(LAM),
@@ -1107,6 +1095,7 @@ void kvm_set_cpu_caps(void)
F(AMD_SSB_NO),
F(AMD_STIBP),
F(AMD_STIBP_ALWAYS_ON),
+ F(AMD_IBRS_SAME_MODE),
F(AMD_PSFD),
F(AMD_IBPB_RET),
);
@@ -1164,6 +1153,7 @@ void kvm_set_cpu_caps(void)
kvm_cpu_cap_init(CPUID_8000_0021_EAX,
F(NO_NESTED_DATA_BP),
+ F(WRMSR_XX_BASE_NS),
/*
* Synthesize "LFENCE is serializing" into the AMD-defined entry
* in KVM's supported CPUID, i.e. if the feature is reported as
@@ -1176,17 +1166,27 @@ void kvm_set_cpu_caps(void)
*/
SYNTHESIZED_F(LFENCE_RDTSC),
/* SmmPgCfgLock */
+ /* 4: Resv */
+ SYNTHESIZED_F(VERW_CLEAR),
F(NULL_SEL_CLR_BASE),
+ /* UpperAddressIgnore */
F(AUTOIBRS),
+ F(PREFETCHI),
EMULATED_F(NO_SMM_CTL_MSR),
/* PrefetchCtlMsr */
- F(WRMSR_XX_BASE_NS),
+ /* GpOnUserCpuid */
+ /* EPSF */
SYNTHESIZED_F(SBPB),
SYNTHESIZED_F(IBPB_BRTYPE),
SYNTHESIZED_F(SRSO_NO),
F(SRSO_USER_KERNEL_NO),
);
+ kvm_cpu_cap_init(CPUID_8000_0021_ECX,
+ SYNTHESIZED_F(TSA_SQ_NO),
+ SYNTHESIZED_F(TSA_L1_NO),
+ );
+
kvm_cpu_cap_init(CPUID_8000_0022_EAX,
F(PERFMON_V2),
);
@@ -1756,8 +1756,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
break;
case 0x80000021:
- entry->ebx = entry->ecx = entry->edx = 0;
+ entry->ebx = entry->edx = 0;
cpuid_entry_override(entry, CPUID_8000_0021_EAX);
+ cpuid_entry_override(entry, CPUID_8000_0021_ECX);
break;
/* AMD Extended Performance Monitoring and Debug */
case 0x80000022: {
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index d2884162a46a..d3f5ae15a7ca 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -11,10 +11,34 @@ extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
void kvm_set_cpu_caps(void);
void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu);
-struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu,
- u32 function, u32 index);
-struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
- u32 function);
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2(struct kvm_cpuid_entry2 *entries,
+ int nent, u32 function, u64 index);
+/*
+ * Magic value used by KVM when querying userspace-provided CPUID entries and
+ * doesn't care about the CPIUD index because the index of the function in
+ * question is not significant. Note, this magic value must have at least one
+ * bit set in bits[63:32] and must be consumed as a u64 by kvm_find_cpuid_entry2()
+ * to avoid false positives when processing guest CPUID input.
+ *
+ * KVM_CPUID_INDEX_NOT_SIGNIFICANT should never be used directly outside of
+ * kvm_find_cpuid_entry2() and kvm_find_cpuid_entry().
+ */
+#define KVM_CPUID_INDEX_NOT_SIGNIFICANT -1ull
+
+static inline struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu,
+ u32 function, u32 index)
+{
+ return kvm_find_cpuid_entry2(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent,
+ function, index);
+}
+
+static inline struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
+ u32 function)
+{
+ return kvm_find_cpuid_entry2(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent,
+ function, KVM_CPUID_INDEX_NOT_SIGNIFICANT);
+}
+
int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 __user *entries,
unsigned int type);
@@ -34,6 +58,7 @@ void __init kvm_init_xstate_sizes(void);
u32 xstate_required_size(u64 xstate_bv, bool compacted);
int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
+int cpuid_query_maxguestphyaddr(struct kvm_vcpu *vcpu);
u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu);
static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 24f0318c50d7..72b19a88a776 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -497,15 +497,19 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint)
return ret;
}
-int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vpidx, u32 sint)
+int kvm_hv_synic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status)
{
struct kvm_vcpu_hv_synic *synic;
- synic = synic_get(kvm, vpidx);
+ if (!level)
+ return -1;
+
+ synic = synic_get(kvm, e->hv_sint.vcpu);
if (!synic)
return -EINVAL;
- return synic_set_irq(synic, sint);
+ return synic_set_irq(synic, e->hv_sint.sint);
}
void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector)
@@ -1979,6 +1983,9 @@ int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
if (entries[i] == KVM_HV_TLB_FLUSHALL_ENTRY)
goto out_flush_all;
+ if (is_noncanonical_invlpg_address(entries[i], vcpu))
+ continue;
+
/*
* Lower 12 bits of 'address' encode the number of additional
* pages to flush.
@@ -2001,11 +2008,11 @@ out_flush_all:
static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
{
struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ unsigned long *vcpu_mask = hv_vcpu->vcpu_mask;
u64 *sparse_banks = hv_vcpu->sparse_banks;
struct kvm *kvm = vcpu->kvm;
struct hv_tlb_flush_ex flush_ex;
struct hv_tlb_flush flush;
- DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo;
/*
* Normally, there can be no more than 'KVM_HV_TLB_FLUSH_FIFO_SIZE'
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 913bfc96959c..6ce160ffa678 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -103,7 +103,8 @@ static inline bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu)
int kvm_hv_hypercall(struct kvm_vcpu *vcpu);
void kvm_hv_irq_routing_update(struct kvm *kvm);
-int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint);
+int kvm_hv_synic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status);
void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector);
int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages);
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 739aa6c0d0c3..d1b79b418c05 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -248,8 +248,8 @@ static void pit_do_work(struct kthread_work *work)
if (atomic_read(&ps->reinject) && !atomic_xchg(&ps->irq_ack, 0))
return;
- kvm_set_irq(kvm, pit->irq_source_id, 0, 1, false);
- kvm_set_irq(kvm, pit->irq_source_id, 0, 0, false);
+ kvm_set_irq(kvm, KVM_PIT_IRQ_SOURCE_ID, 0, 1, false);
+ kvm_set_irq(kvm, KVM_PIT_IRQ_SOURCE_ID, 0, 0, false);
/*
* Provides NMI watchdog support via Virtual Wire mode.
@@ -288,7 +288,7 @@ static inline void kvm_pit_reset_reinject(struct kvm_pit *pit)
atomic_set(&pit->pit_state.irq_ack, 1);
}
-void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject)
+static void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject)
{
struct kvm_kpit_state *ps = &pit->pit_state;
struct kvm *kvm = pit->kvm;
@@ -400,8 +400,8 @@ static void pit_load_count(struct kvm_pit *pit, int channel, u32 val)
}
}
-void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
- int hpet_legacy_start)
+static void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
+ int hpet_legacy_start)
{
u8 saved_mode;
@@ -649,6 +649,79 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
kvm_pit_reset_reinject(pit);
}
+int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
+{
+ struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
+
+ BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
+
+ mutex_lock(&kps->lock);
+ memcpy(ps, &kps->channels, sizeof(*ps));
+ mutex_unlock(&kps->lock);
+ return 0;
+}
+
+int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
+{
+ int i;
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ mutex_lock(&pit->pit_state.lock);
+ memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
+ for (i = 0; i < 3; i++)
+ kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
+ mutex_unlock(&pit->pit_state.lock);
+ return 0;
+}
+
+int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
+{
+ mutex_lock(&kvm->arch.vpit->pit_state.lock);
+ memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
+ sizeof(ps->channels));
+ ps->flags = kvm->arch.vpit->pit_state.flags;
+ mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ memset(&ps->reserved, 0, sizeof(ps->reserved));
+ return 0;
+}
+
+int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
+{
+ int start = 0;
+ int i;
+ u32 prev_legacy, cur_legacy;
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ mutex_lock(&pit->pit_state.lock);
+ prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
+ cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
+ if (!prev_legacy && cur_legacy)
+ start = 1;
+ memcpy(&pit->pit_state.channels, &ps->channels,
+ sizeof(pit->pit_state.channels));
+ pit->pit_state.flags = ps->flags;
+ for (i = 0; i < 3; i++)
+ kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
+ start && i == 0);
+ mutex_unlock(&pit->pit_state.lock);
+ return 0;
+}
+
+int kvm_vm_ioctl_reinject(struct kvm *kvm, struct kvm_reinject_control *control)
+{
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ /* pit->pit_state.lock was overloaded to prevent userspace from getting
+ * an inconsistent state after running multiple KVM_REINJECT_CONTROL
+ * ioctls in parallel. Use a separate lock if that ioctl isn't rare.
+ */
+ mutex_lock(&pit->pit_state.lock);
+ kvm_pit_set_reinject(pit, control->pit_reinject);
+ mutex_unlock(&pit->pit_state.lock);
+
+ return 0;
+}
+
static const struct kvm_io_device_ops pit_dev_ops = {
.read = pit_ioport_read,
.write = pit_ioport_write,
@@ -671,10 +744,6 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
if (!pit)
return NULL;
- pit->irq_source_id = kvm_request_irq_source_id(kvm);
- if (pit->irq_source_id < 0)
- goto fail_request;
-
mutex_init(&pit->pit_state.lock);
pid = get_pid(task_tgid(current));
@@ -726,8 +795,6 @@ fail_register_pit:
kvm_pit_set_reinject(pit, false);
kthread_destroy_worker(pit->worker);
fail_kthread:
- kvm_free_irq_source_id(kvm, pit->irq_source_id);
-fail_request:
kfree(pit);
return NULL;
}
@@ -744,7 +811,6 @@ void kvm_free_pit(struct kvm *kvm)
kvm_pit_set_reinject(pit, false);
hrtimer_cancel(&pit->pit_state.timer);
kthread_destroy_worker(pit->worker);
- kvm_free_irq_source_id(kvm, pit->irq_source_id);
kfree(pit);
}
}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index a768212ba821..60fa499d2f8a 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -6,6 +6,11 @@
#include <kvm/iodev.h>
+#include <uapi/asm/kvm.h>
+
+#include "ioapic.h"
+
+#ifdef CONFIG_KVM_IOAPIC
struct kvm_kpit_channel_state {
u32 count; /* can be 65536 */
u16 latched_count;
@@ -42,7 +47,6 @@ struct kvm_pit {
struct kvm_io_device speaker_dev;
struct kvm *kvm;
struct kvm_kpit_state pit_state;
- int irq_source_id;
struct kvm_irq_mask_notifier mask_notifier;
struct kthread_worker *worker;
struct kthread_work expired;
@@ -55,11 +59,14 @@ struct kvm_pit {
#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100
#define KVM_PIT_CHANNEL_MASK 0x3
+int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps);
+int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps);
+int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps);
+int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps);
+int kvm_vm_ioctl_reinject(struct kvm *kvm, struct kvm_reinject_control *control);
+
struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
void kvm_free_pit(struct kvm *kvm);
-
-void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
- int hpet_legacy_start);
-void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject);
+#endif /* CONFIG_KVM_IOAPIC */
#endif
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index a8fb19940975..2ac7f1678c46 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -31,6 +31,8 @@
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/bitops.h>
+
+#include "ioapic.h"
#include "irq.h"
#include <linux/kvm_host.h>
@@ -185,8 +187,11 @@ void kvm_pic_update_irq(struct kvm_pic *s)
pic_unlock(s);
}
-int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level)
+int kvm_pic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status)
{
+ struct kvm_pic *s = kvm->arch.vpic;
+ int irq = e->irqchip.pin;
int ret, irq_level;
BUG_ON(irq < 0 || irq >= PIC_NUM_PINS);
@@ -203,16 +208,6 @@ int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level)
return ret;
}
-void kvm_pic_clear_all(struct kvm_pic *s, int irq_source_id)
-{
- int i;
-
- pic_lock(s);
- for (i = 0; i < PIC_NUM_PINS; i++)
- __clear_bit(irq_source_id, &s->irq_states[i]);
- pic_unlock(s);
-}
-
/*
* acknowledge interrupt 'irq'
*/
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 995eb5054360..2b5d389bca5f 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -41,11 +41,11 @@
#include <asm/processor.h>
#include <asm/page.h>
#include <asm/current.h>
-#include <trace/events/kvm.h>
#include "ioapic.h"
#include "lapic.h"
#include "irq.h"
+#include "trace.h"
static int ioapic_service(struct kvm_ioapic *vioapic, int irq,
bool line_status);
@@ -296,11 +296,8 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors)
index == RTC_GSI) {
u16 dm = kvm_lapic_irq_dest_mode(!!e->fields.dest_mode);
- if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT,
- e->fields.dest_id, dm) ||
- kvm_apic_pending_eoi(vcpu, e->fields.vector))
- __set_bit(e->fields.vector,
- ioapic_handled_vectors);
+ kvm_scan_ioapic_irq(vcpu, e->fields.dest_id, dm,
+ e->fields.vector, ioapic_handled_vectors);
}
}
spin_unlock(&ioapic->lock);
@@ -313,6 +310,42 @@ void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm)
kvm_make_scan_ioapic_request(kvm);
}
+void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+ mutex_lock(&kvm->irq_lock);
+ kimn->irq = irq;
+ hlist_add_head_rcu(&kimn->link, &ioapic->mask_notifier_list);
+ mutex_unlock(&kvm->irq_lock);
+}
+
+void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn)
+{
+ mutex_lock(&kvm->irq_lock);
+ hlist_del_rcu(&kimn->link);
+ mutex_unlock(&kvm->irq_lock);
+ synchronize_srcu(&kvm->irq_srcu);
+}
+
+void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
+ bool mask)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+ struct kvm_irq_mask_notifier *kimn;
+ int idx, gsi;
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
+ if (gsi != -1)
+ hlist_for_each_entry_rcu(kimn, &ioapic->mask_notifier_list, link)
+ if (kimn->irq == gsi)
+ kimn->func(kimn, mask);
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+}
+
static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
{
unsigned index;
@@ -482,9 +515,11 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
return ret;
}
-int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
- int level, bool line_status)
+int kvm_ioapic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status)
{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+ int irq = e->irqchip.pin;
int ret, irq_level;
BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS);
@@ -499,16 +534,6 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
return ret;
}
-void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id)
-{
- int i;
-
- spin_lock(&ioapic->lock);
- for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++)
- __clear_bit(irq_source_id, &ioapic->irq_states[i]);
- spin_unlock(&ioapic->lock);
-}
-
static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
{
int i;
@@ -721,6 +746,7 @@ int kvm_ioapic_init(struct kvm *kvm)
return -ENOMEM;
spin_lock_init(&ioapic->lock);
INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work);
+ INIT_HLIST_HEAD(&ioapic->mask_notifier_list);
kvm->arch.vioapic = ioapic;
kvm_ioapic_reset(ioapic);
kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index 539333ac4b38..bf28dbc11ff6 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -86,8 +86,24 @@ struct kvm_ioapic {
struct delayed_work eoi_inject;
u32 irq_eoi[IOAPIC_NUM_PINS];
u32 irr_delivered;
+
+ /* reads protected by irq_srcu, writes by irq_lock */
+ struct hlist_head mask_notifier_list;
+};
+
+struct kvm_irq_mask_notifier {
+ void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked);
+ int irq;
+ struct hlist_node link;
};
+void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn);
+void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn);
+void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
+ bool mask);
+
#ifdef DEBUG
#define ASSERT(x) \
do { \
@@ -103,7 +119,7 @@ do { \
static inline int ioapic_in_kernel(struct kvm *kvm)
{
- return irqchip_kernel(kvm);
+ return irqchip_full(kvm);
}
void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
@@ -111,13 +127,15 @@ void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector,
int trigger_mode);
int kvm_ioapic_init(struct kvm *kvm);
void kvm_ioapic_destroy(struct kvm *kvm);
-int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
- int level, bool line_status);
-void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id);
+int kvm_ioapic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status);
+
void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
ulong *ioapic_handled_vectors);
void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
ulong *ioapic_handled_vectors);
+void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode,
+ u8 vector, unsigned long *ioapic_handled_vectors);
#endif
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 63f66c51975a..16da89259011 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -11,9 +11,12 @@
#include <linux/export.h>
#include <linux/kvm_host.h>
+#include <linux/kvm_irqfd.h>
+#include "hyperv.h"
+#include "ioapic.h"
#include "irq.h"
-#include "i8254.h"
+#include "trace.h"
#include "x86.h"
#include "xen.h"
@@ -41,6 +44,14 @@ static int pending_userspace_extint(struct kvm_vcpu *v)
return v->arch.pending_external_vector != -1;
}
+static int get_userspace_extint(struct kvm_vcpu *vcpu)
+{
+ int vector = vcpu->arch.pending_external_vector;
+
+ vcpu->arch.pending_external_vector = -1;
+ return vector;
+}
+
/*
* check if there is pending interrupt from
* non-APIC source without intack.
@@ -67,10 +78,13 @@ int kvm_cpu_has_extint(struct kvm_vcpu *v)
if (!kvm_apic_accept_pic_intr(v))
return 0;
- if (irqchip_split(v->kvm))
- return pending_userspace_extint(v);
- else
+#ifdef CONFIG_KVM_IOAPIC
+ if (pic_in_kernel(v->kvm))
return v->kvm->arch.vpic->output;
+#endif
+
+ WARN_ON_ONCE(!irqchip_split(v->kvm));
+ return pending_userspace_extint(v);
}
/*
@@ -100,6 +114,9 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
if (kvm_cpu_has_extint(v))
return 1;
+ if (lapic_in_kernel(v) && v->arch.apic->guest_apic_protected)
+ return kvm_x86_call(protected_apic_has_interrupt)(v);
+
return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
}
EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
@@ -123,13 +140,13 @@ int kvm_cpu_get_extint(struct kvm_vcpu *v)
return v->kvm->arch.xen.upcall_vector;
#endif
- if (irqchip_split(v->kvm)) {
- int vector = v->arch.pending_external_vector;
-
- v->arch.pending_external_vector = -1;
- return vector;
- } else
+#ifdef CONFIG_KVM_IOAPIC
+ if (pic_in_kernel(v->kvm))
return kvm_pic_read_irq(v->kvm); /* PIC */
+#endif
+
+ WARN_ON_ONCE(!irqchip_split(v->kvm));
+ return get_userspace_extint(v);
}
EXPORT_SYMBOL_GPL(kvm_cpu_get_extint);
@@ -160,7 +177,9 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
{
__kvm_migrate_apic_timer(vcpu);
+#ifdef CONFIG_KVM_IOAPIC
__kvm_migrate_pit_timer(vcpu);
+#endif
kvm_x86_call(migrate_timers)(vcpu);
}
@@ -168,10 +187,532 @@ bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
{
bool resample = args->flags & KVM_IRQFD_FLAG_RESAMPLE;
- return resample ? irqchip_kernel(kvm) : irqchip_in_kernel(kvm);
+ return resample ? irqchip_full(kvm) : irqchip_in_kernel(kvm);
}
bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
{
return irqchip_in_kernel(kvm);
}
+
+int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq, struct dest_map *dest_map)
+{
+ int r = -1;
+ struct kvm_vcpu *vcpu, *lowest = NULL;
+ unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
+ unsigned int dest_vcpus = 0;
+
+ if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
+ return r;
+
+ if (irq->dest_mode == APIC_DEST_PHYSICAL &&
+ irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) {
+ pr_info("apic: phys broadcast and lowest prio\n");
+ irq->delivery_mode = APIC_DM_FIXED;
+ }
+
+ memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap));
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+
+ if (!kvm_apic_match_dest(vcpu, src, irq->shorthand,
+ irq->dest_id, irq->dest_mode))
+ continue;
+
+ if (!kvm_lowest_prio_delivery(irq)) {
+ if (r < 0)
+ r = 0;
+ r += kvm_apic_set_irq(vcpu, irq, dest_map);
+ } else if (kvm_apic_sw_enabled(vcpu->arch.apic)) {
+ if (!kvm_vector_hashing_enabled()) {
+ if (!lowest)
+ lowest = vcpu;
+ else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
+ lowest = vcpu;
+ } else {
+ __set_bit(i, dest_vcpu_bitmap);
+ dest_vcpus++;
+ }
+ }
+ }
+
+ if (dest_vcpus != 0) {
+ int idx = kvm_vector_to_index(irq->vector, dest_vcpus,
+ dest_vcpu_bitmap, KVM_MAX_VCPUS);
+
+ lowest = kvm_get_vcpu(kvm, idx);
+ }
+
+ if (lowest)
+ r = kvm_apic_set_irq(lowest, irq, dest_map);
+
+ return r;
+}
+
+static void kvm_msi_to_lapic_irq(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
+ struct kvm_lapic_irq *irq)
+{
+ struct msi_msg msg = { .address_lo = e->msi.address_lo,
+ .address_hi = e->msi.address_hi,
+ .data = e->msi.data };
+
+ trace_kvm_msi_set_irq(msg.address_lo | (kvm->arch.x2apic_format ?
+ (u64)msg.address_hi << 32 : 0), msg.data);
+
+ irq->dest_id = x86_msi_msg_get_destid(&msg, kvm->arch.x2apic_format);
+ irq->vector = msg.arch_data.vector;
+ irq->dest_mode = kvm_lapic_irq_dest_mode(msg.arch_addr_lo.dest_mode_logical);
+ irq->trig_mode = msg.arch_data.is_level;
+ irq->delivery_mode = msg.arch_data.delivery_mode << 8;
+ irq->msi_redir_hint = msg.arch_addr_lo.redirect_hint;
+ irq->level = 1;
+ irq->shorthand = APIC_DEST_NOSHORT;
+}
+
+static inline bool kvm_msi_route_invalid(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e)
+{
+ return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff);
+}
+
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level, bool line_status)
+{
+ struct kvm_lapic_irq irq;
+
+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+
+ if (!level)
+ return -1;
+
+ kvm_msi_to_lapic_irq(kvm, e, &irq);
+
+ return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);
+}
+
+int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level,
+ bool line_status)
+{
+ struct kvm_lapic_irq irq;
+ int r;
+
+ switch (e->type) {
+#ifdef CONFIG_KVM_HYPERV
+ case KVM_IRQ_ROUTING_HV_SINT:
+ return kvm_hv_synic_set_irq(e, kvm, irq_source_id, level,
+ line_status);
+#endif
+
+ case KVM_IRQ_ROUTING_MSI:
+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+
+ kvm_msi_to_lapic_irq(kvm, e, &irq);
+
+ if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
+ return r;
+ break;
+
+#ifdef CONFIG_KVM_XEN
+ case KVM_IRQ_ROUTING_XEN_EVTCHN:
+ if (!level)
+ return -1;
+
+ return kvm_xen_set_evtchn_fast(&e->xen_evtchn, kvm);
+#endif
+ default:
+ break;
+ }
+
+ return -EWOULDBLOCK;
+}
+
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
+ bool line_status)
+{
+ if (!irqchip_in_kernel(kvm))
+ return -ENXIO;
+
+ irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+ irq_event->irq, irq_event->level,
+ line_status);
+ return 0;
+}
+
+bool kvm_arch_can_set_irq_routing(struct kvm *kvm)
+{
+ return irqchip_in_kernel(kvm);
+}
+
+int kvm_set_routing_entry(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
+ const struct kvm_irq_routing_entry *ue)
+{
+ /* We can't check irqchip_in_kernel() here as some callers are
+ * currently initializing the irqchip. Other callers should therefore
+ * check kvm_arch_can_set_irq_routing() before calling this function.
+ */
+ switch (ue->type) {
+#ifdef CONFIG_KVM_IOAPIC
+ case KVM_IRQ_ROUTING_IRQCHIP:
+ if (irqchip_split(kvm))
+ return -EINVAL;
+ e->irqchip.pin = ue->u.irqchip.pin;
+ switch (ue->u.irqchip.irqchip) {
+ case KVM_IRQCHIP_PIC_SLAVE:
+ e->irqchip.pin += PIC_NUM_PINS / 2;
+ fallthrough;
+ case KVM_IRQCHIP_PIC_MASTER:
+ if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2)
+ return -EINVAL;
+ e->set = kvm_pic_set_irq;
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS)
+ return -EINVAL;
+ e->set = kvm_ioapic_set_irq;
+ break;
+ default:
+ return -EINVAL;
+ }
+ e->irqchip.irqchip = ue->u.irqchip.irqchip;
+ break;
+#endif
+ case KVM_IRQ_ROUTING_MSI:
+ e->set = kvm_set_msi;
+ e->msi.address_lo = ue->u.msi.address_lo;
+ e->msi.address_hi = ue->u.msi.address_hi;
+ e->msi.data = ue->u.msi.data;
+
+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+ break;
+#ifdef CONFIG_KVM_HYPERV
+ case KVM_IRQ_ROUTING_HV_SINT:
+ e->set = kvm_hv_synic_set_irq;
+ e->hv_sint.vcpu = ue->u.hv_sint.vcpu;
+ e->hv_sint.sint = ue->u.hv_sint.sint;
+ break;
+#endif
+#ifdef CONFIG_KVM_XEN
+ case KVM_IRQ_ROUTING_XEN_EVTCHN:
+ return kvm_xen_setup_evtchn(kvm, e, ue);
+#endif
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu)
+{
+ int r = 0;
+ unsigned long i;
+ struct kvm_vcpu *vcpu;
+
+ if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu))
+ return true;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+
+ if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand,
+ irq->dest_id, irq->dest_mode))
+ continue;
+
+ if (++r == 2)
+ return false;
+
+ *dest_vcpu = vcpu;
+ }
+
+ return r == 1;
+}
+EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu);
+
+void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode,
+ u8 vector, unsigned long *ioapic_handled_vectors)
+{
+ /*
+ * Intercept EOI if the vCPU is the target of the new IRQ routing, or
+ * the vCPU has a pending IRQ from the old routing, i.e. if the vCPU
+ * may receive a level-triggered IRQ in the future, or already received
+ * level-triggered IRQ. The EOI needs to be intercepted and forwarded
+ * to I/O APIC emulation so that the IRQ can be de-asserted.
+ */
+ if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, dest_id, dest_mode)) {
+ __set_bit(vector, ioapic_handled_vectors);
+ } else if (kvm_apic_pending_eoi(vcpu, vector)) {
+ __set_bit(vector, ioapic_handled_vectors);
+
+ /*
+ * Track the highest pending EOI for which the vCPU is NOT the
+ * target in the new routing. Only the EOI for the IRQ that is
+ * in-flight (for the old routing) needs to be intercepted, any
+ * future IRQs that arrive on this vCPU will be coincidental to
+ * the level-triggered routing and don't need to be intercepted.
+ */
+ if ((int)vector > vcpu->arch.highest_stale_pending_ioapic_eoi)
+ vcpu->arch.highest_stale_pending_ioapic_eoi = vector;
+ }
+}
+
+void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
+ ulong *ioapic_handled_vectors)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_kernel_irq_routing_entry *entry;
+ struct kvm_irq_routing_table *table;
+ u32 i, nr_ioapic_pins;
+ int idx;
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+ nr_ioapic_pins = min_t(u32, table->nr_rt_entries,
+ kvm->arch.nr_reserved_ioapic_pins);
+ for (i = 0; i < nr_ioapic_pins; ++i) {
+ hlist_for_each_entry(entry, &table->map[i], link) {
+ struct kvm_lapic_irq irq;
+
+ if (entry->type != KVM_IRQ_ROUTING_MSI)
+ continue;
+
+ kvm_msi_to_lapic_irq(vcpu->kvm, entry, &irq);
+
+ if (!irq.trig_mode)
+ continue;
+
+ kvm_scan_ioapic_irq(vcpu, irq.dest_id, irq.dest_mode,
+ irq.vector, ioapic_handled_vectors);
+ }
+ }
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+}
+
+void kvm_arch_irq_routing_update(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_HYPERV
+ kvm_hv_irq_routing_update(kvm);
+#endif
+
+ if (irqchip_split(kvm))
+ kvm_make_scan_ioapic_request(kvm);
+}
+
+static int kvm_pi_update_irte(struct kvm_kernel_irqfd *irqfd,
+ struct kvm_kernel_irq_routing_entry *entry)
+{
+ unsigned int host_irq = irqfd->producer->irq;
+ struct kvm *kvm = irqfd->kvm;
+ struct kvm_vcpu *vcpu = NULL;
+ struct kvm_lapic_irq irq;
+ int r;
+
+ if (WARN_ON_ONCE(!irqchip_in_kernel(kvm) || !kvm_arch_has_irq_bypass()))
+ return -EINVAL;
+
+ if (entry && entry->type == KVM_IRQ_ROUTING_MSI) {
+ kvm_msi_to_lapic_irq(kvm, entry, &irq);
+
+ /*
+ * Force remapped mode if hardware doesn't support posting the
+ * virtual interrupt to a vCPU. Only IRQs are postable (NMIs,
+ * SMIs, etc. are not), and neither AMD nor Intel IOMMUs support
+ * posting multicast/broadcast IRQs. If the interrupt can't be
+ * posted, the device MSI needs to be routed to the host so that
+ * the guest's desired interrupt can be synthesized by KVM.
+ *
+ * This means that KVM can only post lowest-priority interrupts
+ * if they have a single CPU as the destination, e.g. only if
+ * the guest has affined the interrupt to a single vCPU.
+ */
+ if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
+ !kvm_irq_is_postable(&irq))
+ vcpu = NULL;
+ }
+
+ if (!irqfd->irq_bypass_vcpu && !vcpu)
+ return 0;
+
+ r = kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, host_irq, irqfd->gsi,
+ vcpu, irq.vector);
+ if (r) {
+ WARN_ON_ONCE(irqfd->irq_bypass_vcpu && !vcpu);
+ irqfd->irq_bypass_vcpu = NULL;
+ return r;
+ }
+
+ irqfd->irq_bypass_vcpu = vcpu;
+
+ trace_kvm_pi_irte_update(host_irq, vcpu, irqfd->gsi, irq.vector, !!vcpu);
+ return 0;
+}
+
+int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+ struct kvm *kvm = irqfd->kvm;
+ int ret = 0;
+
+ spin_lock_irq(&kvm->irqfds.lock);
+ irqfd->producer = prod;
+
+ if (!kvm->arch.nr_possible_bypass_irqs++)
+ kvm_x86_call(pi_start_bypass)(kvm);
+
+ if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) {
+ ret = kvm_pi_update_irte(irqfd, &irqfd->irq_entry);
+ if (ret)
+ kvm->arch.nr_possible_bypass_irqs--;
+ }
+ spin_unlock_irq(&kvm->irqfds.lock);
+
+ return ret;
+}
+
+void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+ struct kvm *kvm = irqfd->kvm;
+ int ret;
+
+ WARN_ON(irqfd->producer != prod);
+
+ /*
+ * If the producer of an IRQ that is currently being posted to a vCPU
+ * is unregistered, change the associated IRTE back to remapped mode as
+ * the IRQ has been released (or repurposed) by the device driver, i.e.
+ * KVM must relinquish control of the IRTE.
+ */
+ spin_lock_irq(&kvm->irqfds.lock);
+
+ if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) {
+ ret = kvm_pi_update_irte(irqfd, NULL);
+ if (ret)
+ pr_info("irq bypass consumer (eventfd %p) unregistration fails: %d\n",
+ irqfd->consumer.eventfd, ret);
+ }
+ irqfd->producer = NULL;
+
+ kvm->arch.nr_possible_bypass_irqs--;
+
+ spin_unlock_irq(&kvm->irqfds.lock);
+}
+
+void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
+ struct kvm_kernel_irq_routing_entry *old,
+ struct kvm_kernel_irq_routing_entry *new)
+{
+ if (new->type != KVM_IRQ_ROUTING_MSI &&
+ old->type != KVM_IRQ_ROUTING_MSI)
+ return;
+
+ if (old->type == KVM_IRQ_ROUTING_MSI &&
+ new->type == KVM_IRQ_ROUTING_MSI &&
+ !memcmp(&old->msi, &new->msi, sizeof(new->msi)))
+ return;
+
+ kvm_pi_update_irte(irqfd, new);
+}
+
+#ifdef CONFIG_KVM_IOAPIC
+#define IOAPIC_ROUTING_ENTRY(irq) \
+ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
+ .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
+#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq)
+
+#define PIC_ROUTING_ENTRY(irq) \
+ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
+ .u.irqchip = { .irqchip = SELECT_PIC(irq), .pin = (irq) % 8 } }
+#define ROUTING_ENTRY2(irq) \
+ IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq)
+
+static const struct kvm_irq_routing_entry default_routing[] = {
+ ROUTING_ENTRY2(0), ROUTING_ENTRY2(1),
+ ROUTING_ENTRY2(2), ROUTING_ENTRY2(3),
+ ROUTING_ENTRY2(4), ROUTING_ENTRY2(5),
+ ROUTING_ENTRY2(6), ROUTING_ENTRY2(7),
+ ROUTING_ENTRY2(8), ROUTING_ENTRY2(9),
+ ROUTING_ENTRY2(10), ROUTING_ENTRY2(11),
+ ROUTING_ENTRY2(12), ROUTING_ENTRY2(13),
+ ROUTING_ENTRY2(14), ROUTING_ENTRY2(15),
+ ROUTING_ENTRY1(16), ROUTING_ENTRY1(17),
+ ROUTING_ENTRY1(18), ROUTING_ENTRY1(19),
+ ROUTING_ENTRY1(20), ROUTING_ENTRY1(21),
+ ROUTING_ENTRY1(22), ROUTING_ENTRY1(23),
+};
+
+int kvm_setup_default_ioapic_and_pic_routing(struct kvm *kvm)
+{
+ return kvm_set_irq_routing(kvm, default_routing,
+ ARRAY_SIZE(default_routing), 0);
+}
+
+int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+{
+ struct kvm_pic *pic = kvm->arch.vpic;
+ int r;
+
+ r = 0;
+ switch (chip->chip_id) {
+ case KVM_IRQCHIP_PIC_MASTER:
+ memcpy(&chip->chip.pic, &pic->pics[0],
+ sizeof(struct kvm_pic_state));
+ break;
+ case KVM_IRQCHIP_PIC_SLAVE:
+ memcpy(&chip->chip.pic, &pic->pics[1],
+ sizeof(struct kvm_pic_state));
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ kvm_get_ioapic(kvm, &chip->chip.ioapic);
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ return r;
+}
+
+int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+{
+ struct kvm_pic *pic = kvm->arch.vpic;
+ int r;
+
+ r = 0;
+ switch (chip->chip_id) {
+ case KVM_IRQCHIP_PIC_MASTER:
+ spin_lock(&pic->lock);
+ memcpy(&pic->pics[0], &chip->chip.pic,
+ sizeof(struct kvm_pic_state));
+ spin_unlock(&pic->lock);
+ break;
+ case KVM_IRQCHIP_PIC_SLAVE:
+ spin_lock(&pic->lock);
+ memcpy(&pic->pics[1], &chip->chip.pic,
+ sizeof(struct kvm_pic_state));
+ spin_unlock(&pic->lock);
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ kvm_set_ioapic(kvm, &chip->chip.ioapic);
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ kvm_pic_update_irq(pic);
+ return r;
+}
+#endif
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 76d46b2f41dd..5e62c1f79ce6 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -18,6 +18,8 @@
#include <kvm/iodev.h>
#include "lapic.h"
+#ifdef CONFIG_KVM_IOAPIC
+
#define PIC_NUM_PINS 16
#define SELECT_PIC(irq) \
((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE)
@@ -63,17 +65,15 @@ int kvm_pic_init(struct kvm *kvm);
void kvm_pic_destroy(struct kvm *kvm);
int kvm_pic_read_irq(struct kvm *kvm);
void kvm_pic_update_irq(struct kvm_pic *s);
+int kvm_pic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status);
-static inline int irqchip_split(struct kvm *kvm)
-{
- int mode = kvm->arch.irqchip_mode;
+int kvm_setup_default_ioapic_and_pic_routing(struct kvm *kvm);
- /* Matches smp_wmb() when setting irqchip_mode */
- smp_rmb();
- return mode == KVM_IRQCHIP_SPLIT;
-}
+int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip);
+int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip);
-static inline int irqchip_kernel(struct kvm *kvm)
+static inline int irqchip_full(struct kvm *kvm)
{
int mode = kvm->arch.irqchip_mode;
@@ -81,10 +81,26 @@ static inline int irqchip_kernel(struct kvm *kvm)
smp_rmb();
return mode == KVM_IRQCHIP_KERNEL;
}
+#else /* CONFIG_KVM_IOAPIC */
+static __always_inline int irqchip_full(struct kvm *kvm)
+{
+ return false;
+}
+#endif
static inline int pic_in_kernel(struct kvm *kvm)
{
- return irqchip_kernel(kvm);
+ return irqchip_full(kvm);
+}
+
+
+static inline int irqchip_split(struct kvm *kvm)
+{
+ int mode = kvm->arch.irqchip_mode;
+
+ /* Matches smp_wmb() when setting irqchip_mode */
+ smp_rmb();
+ return mode == KVM_IRQCHIP_SPLIT;
}
static inline int irqchip_in_kernel(struct kvm *kvm)
@@ -105,7 +121,6 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
int apic_has_pending_timer(struct kvm_vcpu *vcpu);
-int kvm_setup_default_irq_routing(struct kvm *kvm);
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq,
struct dest_map *dest_map);
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
deleted file mode 100644
index 8136695f7b96..000000000000
--- a/arch/x86/kvm/irq_comm.c
+++ /dev/null
@@ -1,442 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * irq_comm.c: Common API for in kernel interrupt controller
- * Copyright (c) 2007, Intel Corporation.
- *
- * Authors:
- * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
- *
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
- */
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/kvm_host.h>
-#include <linux/slab.h>
-#include <linux/export.h>
-#include <linux/rculist.h>
-
-#include <trace/events/kvm.h>
-
-#include "irq.h"
-
-#include "ioapic.h"
-
-#include "lapic.h"
-
-#include "hyperv.h"
-#include "x86.h"
-#include "xen.h"
-
-static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm, int irq_source_id, int level,
- bool line_status)
-{
- struct kvm_pic *pic = kvm->arch.vpic;
- return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level);
-}
-
-static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm, int irq_source_id, int level,
- bool line_status)
-{
- struct kvm_ioapic *ioapic = kvm->arch.vioapic;
- return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level,
- line_status);
-}
-
-int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
- struct kvm_lapic_irq *irq, struct dest_map *dest_map)
-{
- int r = -1;
- struct kvm_vcpu *vcpu, *lowest = NULL;
- unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
- unsigned int dest_vcpus = 0;
-
- if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
- return r;
-
- if (irq->dest_mode == APIC_DEST_PHYSICAL &&
- irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) {
- pr_info("apic: phys broadcast and lowest prio\n");
- irq->delivery_mode = APIC_DM_FIXED;
- }
-
- memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap));
-
- kvm_for_each_vcpu(i, vcpu, kvm) {
- if (!kvm_apic_present(vcpu))
- continue;
-
- if (!kvm_apic_match_dest(vcpu, src, irq->shorthand,
- irq->dest_id, irq->dest_mode))
- continue;
-
- if (!kvm_lowest_prio_delivery(irq)) {
- if (r < 0)
- r = 0;
- r += kvm_apic_set_irq(vcpu, irq, dest_map);
- } else if (kvm_apic_sw_enabled(vcpu->arch.apic)) {
- if (!kvm_vector_hashing_enabled()) {
- if (!lowest)
- lowest = vcpu;
- else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
- lowest = vcpu;
- } else {
- __set_bit(i, dest_vcpu_bitmap);
- dest_vcpus++;
- }
- }
- }
-
- if (dest_vcpus != 0) {
- int idx = kvm_vector_to_index(irq->vector, dest_vcpus,
- dest_vcpu_bitmap, KVM_MAX_VCPUS);
-
- lowest = kvm_get_vcpu(kvm, idx);
- }
-
- if (lowest)
- r = kvm_apic_set_irq(lowest, irq, dest_map);
-
- return r;
-}
-
-void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
- struct kvm_lapic_irq *irq)
-{
- struct msi_msg msg = { .address_lo = e->msi.address_lo,
- .address_hi = e->msi.address_hi,
- .data = e->msi.data };
-
- trace_kvm_msi_set_irq(msg.address_lo | (kvm->arch.x2apic_format ?
- (u64)msg.address_hi << 32 : 0), msg.data);
-
- irq->dest_id = x86_msi_msg_get_destid(&msg, kvm->arch.x2apic_format);
- irq->vector = msg.arch_data.vector;
- irq->dest_mode = kvm_lapic_irq_dest_mode(msg.arch_addr_lo.dest_mode_logical);
- irq->trig_mode = msg.arch_data.is_level;
- irq->delivery_mode = msg.arch_data.delivery_mode << 8;
- irq->msi_redir_hint = msg.arch_addr_lo.redirect_hint;
- irq->level = 1;
- irq->shorthand = APIC_DEST_NOSHORT;
-}
-EXPORT_SYMBOL_GPL(kvm_set_msi_irq);
-
-static inline bool kvm_msi_route_invalid(struct kvm *kvm,
- struct kvm_kernel_irq_routing_entry *e)
-{
- return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff);
-}
-
-int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm, int irq_source_id, int level, bool line_status)
-{
- struct kvm_lapic_irq irq;
-
- if (kvm_msi_route_invalid(kvm, e))
- return -EINVAL;
-
- if (!level)
- return -1;
-
- kvm_set_msi_irq(kvm, e, &irq);
-
- return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);
-}
-
-#ifdef CONFIG_KVM_HYPERV
-static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm, int irq_source_id, int level,
- bool line_status)
-{
- if (!level)
- return -1;
-
- return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint);
-}
-#endif
-
-int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm, int irq_source_id, int level,
- bool line_status)
-{
- struct kvm_lapic_irq irq;
- int r;
-
- switch (e->type) {
-#ifdef CONFIG_KVM_HYPERV
- case KVM_IRQ_ROUTING_HV_SINT:
- return kvm_hv_set_sint(e, kvm, irq_source_id, level,
- line_status);
-#endif
-
- case KVM_IRQ_ROUTING_MSI:
- if (kvm_msi_route_invalid(kvm, e))
- return -EINVAL;
-
- kvm_set_msi_irq(kvm, e, &irq);
-
- if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
- return r;
- break;
-
-#ifdef CONFIG_KVM_XEN
- case KVM_IRQ_ROUTING_XEN_EVTCHN:
- if (!level)
- return -1;
-
- return kvm_xen_set_evtchn_fast(&e->xen_evtchn, kvm);
-#endif
- default:
- break;
- }
-
- return -EWOULDBLOCK;
-}
-
-int kvm_request_irq_source_id(struct kvm *kvm)
-{
- unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
- int irq_source_id;
-
- mutex_lock(&kvm->irq_lock);
- irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG);
-
- if (irq_source_id >= BITS_PER_LONG) {
- pr_warn("exhausted allocatable IRQ sources!\n");
- irq_source_id = -EFAULT;
- goto unlock;
- }
-
- ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
- ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
- set_bit(irq_source_id, bitmap);
-unlock:
- mutex_unlock(&kvm->irq_lock);
-
- return irq_source_id;
-}
-
-void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
-{
- ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
- ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
-
- mutex_lock(&kvm->irq_lock);
- if (irq_source_id < 0 ||
- irq_source_id >= BITS_PER_LONG) {
- pr_err("IRQ source ID out of range!\n");
- goto unlock;
- }
- clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
- if (!irqchip_kernel(kvm))
- goto unlock;
-
- kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id);
- kvm_pic_clear_all(kvm->arch.vpic, irq_source_id);
-unlock:
- mutex_unlock(&kvm->irq_lock);
-}
-
-void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
- struct kvm_irq_mask_notifier *kimn)
-{
- mutex_lock(&kvm->irq_lock);
- kimn->irq = irq;
- hlist_add_head_rcu(&kimn->link, &kvm->arch.mask_notifier_list);
- mutex_unlock(&kvm->irq_lock);
-}
-
-void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
- struct kvm_irq_mask_notifier *kimn)
-{
- mutex_lock(&kvm->irq_lock);
- hlist_del_rcu(&kimn->link);
- mutex_unlock(&kvm->irq_lock);
- synchronize_srcu(&kvm->irq_srcu);
-}
-
-void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
- bool mask)
-{
- struct kvm_irq_mask_notifier *kimn;
- int idx, gsi;
-
- idx = srcu_read_lock(&kvm->irq_srcu);
- gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
- if (gsi != -1)
- hlist_for_each_entry_rcu(kimn, &kvm->arch.mask_notifier_list, link)
- if (kimn->irq == gsi)
- kimn->func(kimn, mask);
- srcu_read_unlock(&kvm->irq_srcu, idx);
-}
-
-bool kvm_arch_can_set_irq_routing(struct kvm *kvm)
-{
- return irqchip_in_kernel(kvm);
-}
-
-int kvm_set_routing_entry(struct kvm *kvm,
- struct kvm_kernel_irq_routing_entry *e,
- const struct kvm_irq_routing_entry *ue)
-{
- /* We can't check irqchip_in_kernel() here as some callers are
- * currently initializing the irqchip. Other callers should therefore
- * check kvm_arch_can_set_irq_routing() before calling this function.
- */
- switch (ue->type) {
- case KVM_IRQ_ROUTING_IRQCHIP:
- if (irqchip_split(kvm))
- return -EINVAL;
- e->irqchip.pin = ue->u.irqchip.pin;
- switch (ue->u.irqchip.irqchip) {
- case KVM_IRQCHIP_PIC_SLAVE:
- e->irqchip.pin += PIC_NUM_PINS / 2;
- fallthrough;
- case KVM_IRQCHIP_PIC_MASTER:
- if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2)
- return -EINVAL;
- e->set = kvm_set_pic_irq;
- break;
- case KVM_IRQCHIP_IOAPIC:
- if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS)
- return -EINVAL;
- e->set = kvm_set_ioapic_irq;
- break;
- default:
- return -EINVAL;
- }
- e->irqchip.irqchip = ue->u.irqchip.irqchip;
- break;
- case KVM_IRQ_ROUTING_MSI:
- e->set = kvm_set_msi;
- e->msi.address_lo = ue->u.msi.address_lo;
- e->msi.address_hi = ue->u.msi.address_hi;
- e->msi.data = ue->u.msi.data;
-
- if (kvm_msi_route_invalid(kvm, e))
- return -EINVAL;
- break;
-#ifdef CONFIG_KVM_HYPERV
- case KVM_IRQ_ROUTING_HV_SINT:
- e->set = kvm_hv_set_sint;
- e->hv_sint.vcpu = ue->u.hv_sint.vcpu;
- e->hv_sint.sint = ue->u.hv_sint.sint;
- break;
-#endif
-#ifdef CONFIG_KVM_XEN
- case KVM_IRQ_ROUTING_XEN_EVTCHN:
- return kvm_xen_setup_evtchn(kvm, e, ue);
-#endif
- default:
- return -EINVAL;
- }
-
- return 0;
-}
-
-bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
- struct kvm_vcpu **dest_vcpu)
-{
- int r = 0;
- unsigned long i;
- struct kvm_vcpu *vcpu;
-
- if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu))
- return true;
-
- kvm_for_each_vcpu(i, vcpu, kvm) {
- if (!kvm_apic_present(vcpu))
- continue;
-
- if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand,
- irq->dest_id, irq->dest_mode))
- continue;
-
- if (++r == 2)
- return false;
-
- *dest_vcpu = vcpu;
- }
-
- return r == 1;
-}
-EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu);
-
-#define IOAPIC_ROUTING_ENTRY(irq) \
- { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
- .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
-#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq)
-
-#define PIC_ROUTING_ENTRY(irq) \
- { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
- .u.irqchip = { .irqchip = SELECT_PIC(irq), .pin = (irq) % 8 } }
-#define ROUTING_ENTRY2(irq) \
- IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq)
-
-static const struct kvm_irq_routing_entry default_routing[] = {
- ROUTING_ENTRY2(0), ROUTING_ENTRY2(1),
- ROUTING_ENTRY2(2), ROUTING_ENTRY2(3),
- ROUTING_ENTRY2(4), ROUTING_ENTRY2(5),
- ROUTING_ENTRY2(6), ROUTING_ENTRY2(7),
- ROUTING_ENTRY2(8), ROUTING_ENTRY2(9),
- ROUTING_ENTRY2(10), ROUTING_ENTRY2(11),
- ROUTING_ENTRY2(12), ROUTING_ENTRY2(13),
- ROUTING_ENTRY2(14), ROUTING_ENTRY2(15),
- ROUTING_ENTRY1(16), ROUTING_ENTRY1(17),
- ROUTING_ENTRY1(18), ROUTING_ENTRY1(19),
- ROUTING_ENTRY1(20), ROUTING_ENTRY1(21),
- ROUTING_ENTRY1(22), ROUTING_ENTRY1(23),
-};
-
-int kvm_setup_default_irq_routing(struct kvm *kvm)
-{
- return kvm_set_irq_routing(kvm, default_routing,
- ARRAY_SIZE(default_routing), 0);
-}
-
-void kvm_arch_post_irq_routing_update(struct kvm *kvm)
-{
- if (!irqchip_split(kvm))
- return;
- kvm_make_scan_ioapic_request(kvm);
-}
-
-void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
- ulong *ioapic_handled_vectors)
-{
- struct kvm *kvm = vcpu->kvm;
- struct kvm_kernel_irq_routing_entry *entry;
- struct kvm_irq_routing_table *table;
- u32 i, nr_ioapic_pins;
- int idx;
-
- idx = srcu_read_lock(&kvm->irq_srcu);
- table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
- nr_ioapic_pins = min_t(u32, table->nr_rt_entries,
- kvm->arch.nr_reserved_ioapic_pins);
- for (i = 0; i < nr_ioapic_pins; ++i) {
- hlist_for_each_entry(entry, &table->map[i], link) {
- struct kvm_lapic_irq irq;
-
- if (entry->type != KVM_IRQ_ROUTING_MSI)
- continue;
-
- kvm_set_msi_irq(vcpu->kvm, entry, &irq);
-
- if (irq.trig_mode &&
- (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT,
- irq.dest_id, irq.dest_mode) ||
- kvm_apic_pending_eoi(vcpu, irq.vector)))
- __set_bit(irq.vector, ioapic_handled_vectors);
- }
- }
- srcu_read_unlock(&kvm->irq_srcu, idx);
-}
-
-void kvm_arch_irq_routing_update(struct kvm *kvm)
-{
-#ifdef CONFIG_KVM_HYPERV
- kvm_hv_irq_routing_update(kvm);
-#endif
-}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 28e3317124fd..8172c2042dd6 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -27,6 +27,7 @@
#include <linux/export.h>
#include <linux/math64.h>
#include <linux/slab.h>
+#include <asm/apic.h>
#include <asm/processor.h>
#include <asm/mce.h>
#include <asm/msr.h>
@@ -55,9 +56,6 @@
/* 14 is the version for Xeon and Pentium 8.4.8*/
#define APIC_VERSION 0x14UL
#define LAPIC_MMIO_LENGTH (1 << 12)
-/* followed define is not in apicdef.h */
-#define MAX_APIC_VECTOR 256
-#define APIC_VECTORS_PER_REG 32
/*
* Enable local APIC timer advancement (tscdeadline mode only) with adaptive
@@ -79,42 +77,20 @@ module_param(lapic_timer_advance, bool, 0444);
static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data);
static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data);
-static inline void __kvm_lapic_set_reg(char *regs, int reg_off, u32 val)
-{
- *((u32 *) (regs + reg_off)) = val;
-}
-
static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
{
- __kvm_lapic_set_reg(apic->regs, reg_off, val);
-}
-
-static __always_inline u64 __kvm_lapic_get_reg64(char *regs, int reg)
-{
- BUILD_BUG_ON(reg != APIC_ICR);
- return *((u64 *) (regs + reg));
+ apic_set_reg(apic->regs, reg_off, val);
}
static __always_inline u64 kvm_lapic_get_reg64(struct kvm_lapic *apic, int reg)
{
- return __kvm_lapic_get_reg64(apic->regs, reg);
-}
-
-static __always_inline void __kvm_lapic_set_reg64(char *regs, int reg, u64 val)
-{
- BUILD_BUG_ON(reg != APIC_ICR);
- *((u64 *) (regs + reg)) = val;
+ return apic_get_reg64(apic->regs, reg);
}
static __always_inline void kvm_lapic_set_reg64(struct kvm_lapic *apic,
int reg, u64 val)
{
- __kvm_lapic_set_reg64(apic->regs, reg, val);
-}
-
-static inline int apic_test_vector(int vec, void *bitmap)
-{
- return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+ apic_set_reg64(apic->regs, reg, val);
}
bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector)
@@ -125,16 +101,6 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector)
apic_test_vector(vector, apic->regs + APIC_IRR);
}
-static inline int __apic_test_and_set_vector(int vec, void *bitmap)
-{
- return __test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
-static inline int __apic_test_and_clear_vector(int vec, void *bitmap)
-{
- return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu);
@@ -626,21 +592,6 @@ static const unsigned int apic_lvt_mask[KVM_APIC_MAX_NR_LVT_ENTRIES] = {
[LVT_CMCI] = LVT_MASK | APIC_MODE_MASK
};
-static int find_highest_vector(void *bitmap)
-{
- int vec;
- u32 *reg;
-
- for (vec = MAX_APIC_VECTOR - APIC_VECTORS_PER_REG;
- vec >= 0; vec -= APIC_VECTORS_PER_REG) {
- reg = bitmap + REG_POS(vec);
- if (*reg)
- return __fls(*reg) + vec;
- }
-
- return -1;
-}
-
static u8 count_vectors(void *bitmap)
{
int vec;
@@ -648,34 +599,36 @@ static u8 count_vectors(void *bitmap)
u8 count = 0;
for (vec = 0; vec < MAX_APIC_VECTOR; vec += APIC_VECTORS_PER_REG) {
- reg = bitmap + REG_POS(vec);
+ reg = bitmap + APIC_VECTOR_TO_REG_OFFSET(vec);
count += hweight32(*reg);
}
return count;
}
-bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr)
+bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr)
{
+ unsigned long pir_vals[NR_PIR_WORDS];
+ u32 *__pir = (void *)pir_vals;
u32 i, vec;
- u32 pir_val, irr_val, prev_irr_val;
+ u32 irr_val, prev_irr_val;
int max_updated_irr;
max_updated_irr = -1;
*max_irr = -1;
+ if (!pi_harvest_pir(pir, pir_vals))
+ return false;
+
for (i = vec = 0; i <= 7; i++, vec += 32) {
u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10);
- irr_val = *p_irr;
- pir_val = READ_ONCE(pir[i]);
-
- if (pir_val) {
- pir_val = xchg(&pir[i], 0);
+ irr_val = READ_ONCE(*p_irr);
+ if (__pir[i]) {
prev_irr_val = irr_val;
do {
- irr_val = prev_irr_val | pir_val;
+ irr_val = prev_irr_val | __pir[i];
} while (prev_irr_val != irr_val &&
!try_cmpxchg(p_irr, &prev_irr_val, irr_val));
@@ -691,7 +644,7 @@ bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr)
}
EXPORT_SYMBOL_GPL(__kvm_apic_update_irr);
-bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr)
+bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr)
{
struct kvm_lapic *apic = vcpu->arch.apic;
bool irr_updated = __kvm_apic_update_irr(pir, apic->regs, max_irr);
@@ -704,7 +657,7 @@ EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
static inline int apic_search_irr(struct kvm_lapic *apic)
{
- return find_highest_vector(apic->regs + APIC_IRR);
+ return apic_find_highest_vector(apic->regs + APIC_IRR);
}
static inline int apic_find_highest_irr(struct kvm_lapic *apic)
@@ -727,10 +680,10 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
{
if (unlikely(apic->apicv_active)) {
- kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
+ apic_clear_vector(vec, apic->regs + APIC_IRR);
} else {
apic->irr_pending = false;
- kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
+ apic_clear_vector(vec, apic->regs + APIC_IRR);
if (apic_search_irr(apic) != -1)
apic->irr_pending = true;
}
@@ -742,9 +695,15 @@ void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec)
}
EXPORT_SYMBOL_GPL(kvm_apic_clear_irr);
+static void *apic_vector_to_isr(int vec, struct kvm_lapic *apic)
+{
+ return apic->regs + APIC_ISR + APIC_VECTOR_TO_REG_OFFSET(vec);
+}
+
static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
{
- if (__apic_test_and_set_vector(vec, apic->regs + APIC_ISR))
+ if (__test_and_set_bit(APIC_VECTOR_TO_BIT_NUMBER(vec),
+ apic_vector_to_isr(vec, apic)))
return;
/*
@@ -779,7 +738,7 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic)
if (likely(apic->highest_isr_cache != -1))
return apic->highest_isr_cache;
- result = find_highest_vector(apic->regs + APIC_ISR);
+ result = apic_find_highest_vector(apic->regs + APIC_ISR);
ASSERT(result == -1 || result >= 16);
return result;
@@ -787,7 +746,8 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic)
static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
{
- if (!__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR))
+ if (!__test_and_clear_bit(APIC_VECTOR_TO_BIT_NUMBER(vec),
+ apic_vector_to_isr(vec, apic)))
return;
/*
@@ -1330,11 +1290,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) {
if (trig_mode)
- kvm_lapic_set_vector(vector,
- apic->regs + APIC_TMR);
+ apic_set_vector(vector, apic->regs + APIC_TMR);
else
- kvm_lapic_clear_vector(vector,
- apic->regs + APIC_TMR);
+ apic_clear_vector(vector, apic->regs + APIC_TMR);
}
kvm_x86_call(deliver_interrupt)(apic, delivery_mode,
@@ -1453,12 +1411,20 @@ static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector)
static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
{
- int trigger_mode;
+ int __maybe_unused trigger_mode;
/* Eoi the ioapic only if the ioapic doesn't own the vector. */
if (!kvm_ioapic_handles_vector(apic, vector))
return;
+ /*
+ * If the intercepted EOI is for an IRQ that was pending from previous
+ * routing, then re-scan the I/O APIC routes as EOIs for the IRQ likely
+ * no longer need to be intercepted.
+ */
+ if (apic->vcpu->arch.highest_stale_pending_ioapic_eoi == vector)
+ kvm_make_request(KVM_REQ_SCAN_IOAPIC, apic->vcpu);
+
/* Request a KVM exit to inform the userspace IOAPIC. */
if (irqchip_split(apic->vcpu->kvm)) {
apic->vcpu->arch.pending_ioapic_eoi = vector;
@@ -1466,12 +1432,14 @@ static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
return;
}
+#ifdef CONFIG_KVM_IOAPIC
if (apic_test_vector(vector, apic->regs + APIC_TMR))
trigger_mode = IOAPIC_LEVEL_TRIG;
else
trigger_mode = IOAPIC_EDGE_TRIG;
kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode);
+#endif
}
static int apic_set_eoi(struct kvm_lapic *apic)
@@ -1790,8 +1758,17 @@ static void apic_update_lvtt(struct kvm_lapic *apic)
static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- u32 reg = kvm_lapic_get_reg(apic, APIC_LVTT);
+ u32 reg;
+ /*
+ * Assume a timer IRQ was "injected" if the APIC is protected. KVM's
+ * copy of the vIRR is bogus, it's the responsibility of the caller to
+ * precisely check whether or not a timer IRQ is pending.
+ */
+ if (apic->guest_apic_protected)
+ return true;
+
+ reg = kvm_lapic_get_reg(apic, APIC_LVTT);
if (kvm_apic_hw_enabled(apic)) {
int vec = reg & APIC_VECTOR_MASK;
void *bitmap = apic->regs + APIC_ISR;
@@ -2650,6 +2627,7 @@ int kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value, bool host_initiated)
kvm_recalculate_apic_map(vcpu->kvm);
return 0;
}
+EXPORT_SYMBOL_GPL(kvm_apic_set_base);
void kvm_apic_update_apicv(struct kvm_vcpu *vcpu)
{
@@ -2958,6 +2936,9 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
if (!kvm_apic_present(vcpu))
return -1;
+ if (apic->guest_apic_protected)
+ return -1;
+
__apic_update_ppr(apic, &ppr);
return apic_has_interrupt_for_ppr(apic, ppr);
}
@@ -3061,12 +3042,12 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
if (!kvm_x86_ops.x2apic_icr_is_split) {
if (set) {
- icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) |
- (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32;
- __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr);
+ icr = apic_get_reg(s->regs, APIC_ICR) |
+ (u64)apic_get_reg(s->regs, APIC_ICR2) << 32;
+ apic_set_reg64(s->regs, APIC_ICR, icr);
} else {
- icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
- __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
+ icr = apic_get_reg64(s->regs, APIC_ICR);
+ apic_set_reg(s->regs, APIC_ICR2, icr >> 32);
}
}
}
@@ -3082,8 +3063,7 @@ int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
* Get calculated timer current count for remaining timer period (if
* any) and store it in the returned register set.
*/
- __kvm_lapic_set_reg(s->regs, APIC_TMCCT,
- __apic_read(vcpu->arch.apic, APIC_TMCCT));
+ apic_set_reg(s->regs, APIC_TMCCT, __apic_read(vcpu->arch.apic, APIC_TMCCT));
return kvm_apic_state_fixup(vcpu, s, false);
}
@@ -3123,8 +3103,11 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
}
kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+#ifdef CONFIG_KVM_IOAPIC
if (ioapic_in_kernel(vcpu->kvm))
kvm_rtc_eoi_tracking_restore_one(vcpu);
+#endif
vcpu->arch.apic_arb_prio = 0;
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 1a8553ebdb42..72de14527698 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -4,6 +4,8 @@
#include <kvm/iodev.h>
+#include <asm/apic.h>
+
#include <linux/kvm_host.h>
#include "hyperv.h"
@@ -21,6 +23,8 @@
#define APIC_BROADCAST 0xFF
#define X2APIC_BROADCAST 0xFFFFFFFFul
+#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
+
enum lapic_mode {
LAPIC_MODE_DISABLED = 0,
LAPIC_MODE_INVALID = X2APIC_ENABLE,
@@ -65,6 +69,8 @@ struct kvm_lapic {
bool sw_enabled;
bool irr_pending;
bool lvt0_in_nmi_mode;
+ /* Select registers in the vAPIC cannot be read/written. */
+ bool guest_apic_protected;
/* Number of bits set in ISR. */
s16 isr_count;
/* The highest vector set in ISR; if -1 - invalid, must scan ISR. */
@@ -101,8 +107,8 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
int shorthand, unsigned int dest, int dest_mode);
int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec);
-bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr);
-bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr);
+bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr);
+bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr);
void kvm_apic_update_ppr(struct kvm_vcpu *vcpu);
int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
struct dest_map *dest_map);
@@ -143,22 +149,9 @@ void kvm_lapic_exit(void);
u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic);
-#define VEC_POS(v) ((v) & (32 - 1))
-#define REG_POS(v) (((v) >> 5) << 4)
-
-static inline void kvm_lapic_clear_vector(int vec, void *bitmap)
-{
- clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
-static inline void kvm_lapic_set_vector(int vec, void *bitmap)
-{
- set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
static inline void kvm_lapic_set_irr(int vec, struct kvm_lapic *apic)
{
- kvm_lapic_set_vector(vec, apic->regs + APIC_IRR);
+ apic_set_vector(vec, apic->regs + APIC_IRR);
/*
* irr_pending must be true if any interrupt is pending; set it after
* APIC_IRR to avoid race with apic_clear_irr
@@ -166,14 +159,9 @@ static inline void kvm_lapic_set_irr(int vec, struct kvm_lapic *apic)
apic->irr_pending = true;
}
-static inline u32 __kvm_lapic_get_reg(char *regs, int reg_off)
-{
- return *((u32 *) (regs + reg_off));
-}
-
static inline u32 kvm_lapic_get_reg(struct kvm_lapic *apic, int reg_off)
{
- return __kvm_lapic_get_reg(apic->regs, reg_off);
+ return apic_get_reg(apic->regs, reg_off);
}
DECLARE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index f2b36d32ef40..b4b6860ab971 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -79,6 +79,7 @@ static inline gfn_t kvm_mmu_max_gfn(void)
u8 kvm_mmu_get_max_tdp_level(void);
void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
+void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value);
void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask);
void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
@@ -234,7 +235,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
return -(u32)fault & errcode;
}
-bool kvm_mmu_may_ignore_guest_pat(void);
+bool kvm_mmu_may_ignore_guest_pat(struct kvm *kvm);
int kvm_mmu_post_init_vm(struct kvm *kvm);
void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
@@ -256,6 +257,9 @@ extern bool tdp_mmu_enabled;
#define tdp_mmu_enabled false
#endif
+bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa);
+int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level);
+
static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
{
return !tdp_mmu_enabled || kvm_shadow_root_allocated(kvm);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 8d1b632e33d2..6e838cb6c9e1 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -110,6 +110,7 @@ static bool __ro_after_init tdp_mmu_allowed;
#ifdef CONFIG_X86_64
bool __read_mostly tdp_mmu_enabled = true;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0444);
+EXPORT_SYMBOL_GPL(tdp_mmu_enabled);
#endif
static int max_huge_page_level __read_mostly;
@@ -1456,15 +1457,15 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
* enabled but it chooses between clearing the Dirty bit and Writeable
* bit based on the context.
*/
- if (kvm_x86_ops.cpu_dirty_log_size)
+ if (kvm->arch.cpu_dirty_log_size)
kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
else
kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}
-int kvm_cpu_dirty_log_size(void)
+int kvm_cpu_dirty_log_size(struct kvm *kvm)
{
- return kvm_x86_ops.cpu_dirty_log_size;
+ return kvm->arch.cpu_dirty_log_size;
}
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
@@ -1982,14 +1983,35 @@ static bool sp_has_gptes(struct kvm_mmu_page *sp)
return true;
}
+static __ro_after_init HLIST_HEAD(empty_page_hash);
+
+static struct hlist_head *kvm_get_mmu_page_hash(struct kvm *kvm, gfn_t gfn)
+{
+ /*
+ * Ensure the load of the hash table pointer itself is ordered before
+ * loads to walk the table. The pointer is set at runtime outside of
+ * mmu_lock when the TDP MMU is enabled, i.e. when the hash table of
+ * shadow pages becomes necessary only when KVM needs to shadow L1's
+ * TDP for an L2 guest. Pairs with the smp_store_release() in
+ * kvm_mmu_alloc_page_hash().
+ */
+ struct hlist_head *page_hash = smp_load_acquire(&kvm->arch.mmu_page_hash);
+
+ lockdep_assert_held(&kvm->mmu_lock);
+
+ if (!page_hash)
+ return &empty_page_hash;
+
+ return &page_hash[kvm_page_table_hashfn(gfn)];
+}
+
#define for_each_valid_sp(_kvm, _sp, _list) \
hlist_for_each_entry(_sp, _list, hash_link) \
if (is_obsolete_sp((_kvm), (_sp))) { \
} else
#define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn) \
- for_each_valid_sp(_kvm, _sp, \
- &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \
+ for_each_valid_sp(_kvm, _sp, kvm_get_mmu_page_hash(_kvm, _gfn)) \
if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else
static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
@@ -2357,6 +2379,12 @@ static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm,
struct kvm_mmu_page *sp;
bool created = false;
+ /*
+ * No need for memory barriers, unlike in kvm_get_mmu_page_hash(), as
+ * mmu_page_hash must be set prior to creating the first shadow root,
+ * i.e. reaching this point is fully serialized by slots_arch_lock.
+ */
+ BUG_ON(!kvm->arch.mmu_page_hash);
sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
sp = kvm_mmu_find_shadow_page(kvm, vcpu, gfn, sp_list, role);
@@ -3019,7 +3047,8 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
}
if (is_shadow_present_pte(*sptep)) {
- if (prefetch)
+ if (prefetch && is_last_spte(*sptep, level) &&
+ pfn == spte_to_pfn(*sptep))
return RET_PF_SPURIOUS;
/*
@@ -3033,7 +3062,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
child = spte_to_child_sp(pte);
drop_parent_pte(vcpu->kvm, child, sptep);
flush = true;
- } else if (pfn != spte_to_pfn(*sptep)) {
+ } else if (WARN_ON_ONCE(pfn != spte_to_pfn(*sptep))) {
drop_spte(vcpu->kvm, sptep);
flush = true;
} else
@@ -3880,6 +3909,28 @@ out_unlock:
return r;
}
+static int kvm_mmu_alloc_page_hash(struct kvm *kvm)
+{
+ struct hlist_head *h;
+
+ if (kvm->arch.mmu_page_hash)
+ return 0;
+
+ h = kvcalloc(KVM_NUM_MMU_PAGES, sizeof(*h), GFP_KERNEL_ACCOUNT);
+ if (!h)
+ return -ENOMEM;
+
+ /*
+ * Ensure the hash table pointer is set only after all stores to zero
+ * the memory are retired. Pairs with the smp_load_acquire() in
+ * kvm_get_mmu_page_hash(). Note, mmu_lock must be held for write to
+ * add (or remove) shadow pages, and so readers are guaranteed to see
+ * an empty list for their current mmu_lock critical section.
+ */
+ smp_store_release(&kvm->arch.mmu_page_hash, h);
+ return 0;
+}
+
static int mmu_first_shadow_root_alloc(struct kvm *kvm)
{
struct kvm_memslots *slots;
@@ -3899,9 +3950,13 @@ static int mmu_first_shadow_root_alloc(struct kvm *kvm)
if (kvm_shadow_root_allocated(kvm))
goto out_unlock;
+ r = kvm_mmu_alloc_page_hash(kvm);
+ if (r)
+ goto out_unlock;
+
/*
- * Check if anything actually needs to be allocated, e.g. all metadata
- * will be allocated upfront if TDP is disabled.
+ * Check if memslot metadata actually needs to be allocated, e.g. all
+ * metadata will be allocated upfront if TDP is disabled.
*/
if (kvm_memslots_have_rmaps(kvm) &&
kvm_page_track_write_tracking_enabled(kvm))
@@ -4835,19 +4890,6 @@ out_unlock:
}
#endif
-bool kvm_mmu_may_ignore_guest_pat(void)
-{
- /*
- * When EPT is enabled (shadow_memtype_mask is non-zero), and the VM
- * has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is to
- * honor the memtype from the guest's PAT so that guest accesses to
- * memory that is DMA'd aren't cached against the guest's wishes. As a
- * result, KVM _may_ ignore guest PAT, whereas without non-coherent DMA,
- * KVM _always_ ignores guest PAT (when EPT is enabled).
- */
- return shadow_memtype_mask;
-}
-
int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
#ifdef CONFIG_X86_64
@@ -4858,8 +4900,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
return direct_page_fault(vcpu, fault);
}
-static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
- u8 *level)
+int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level)
{
int r;
@@ -4873,6 +4914,10 @@ static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
do {
if (signal_pending(current))
return -EINTR;
+
+ if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu))
+ return -EIO;
+
cond_resched();
r = kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL, level);
} while (r == RET_PF_RETRY);
@@ -4897,18 +4942,23 @@ static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
return -EIO;
}
}
+EXPORT_SYMBOL_GPL(kvm_tdp_map_page);
long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
struct kvm_pre_fault_memory *range)
{
u64 error_code = PFERR_GUEST_FINAL_MASK;
u8 level = PG_LEVEL_4K;
+ u64 direct_bits;
u64 end;
int r;
if (!vcpu->kvm->arch.pre_fault_allowed)
return -EOPNOTSUPP;
+ if (kvm_is_gfn_alias(vcpu->kvm, gpa_to_gfn(range->gpa)))
+ return -EINVAL;
+
/*
* reload is efficient when called repeatedly, so we can do it on
* every iteration.
@@ -4917,15 +4967,18 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
if (r)
return r;
+ direct_bits = 0;
if (kvm_arch_has_private_mem(vcpu->kvm) &&
kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa)))
error_code |= PFERR_PRIVATE_ACCESS;
+ else
+ direct_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
/*
* Shadow paging uses GVA for kvm page fault, so restrict to
* two-dimensional paging.
*/
- r = kvm_tdp_map_page(vcpu, range->gpa, error_code, &level);
+ r = kvm_tdp_map_page(vcpu, range->gpa | direct_bits, error_code, &level);
if (r < 0)
return r;
@@ -5589,12 +5642,19 @@ void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
{
+ int maxpa;
+
+ if (vcpu->kvm->arch.vm_type == KVM_X86_TDX_VM)
+ maxpa = cpuid_query_maxguestphyaddr(vcpu);
+ else
+ maxpa = cpuid_maxphyaddr(vcpu);
+
/* tdp_root_level is architecture forced level, use it if nonzero */
if (tdp_root_level)
return tdp_root_level;
/* Use 5-level TDP if and only if it's useful/necessary. */
- if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
+ if (max_tdp_level == 5 && maxpa <= 48)
return 4;
return max_tdp_level;
@@ -5913,6 +5973,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
out:
return r;
}
+EXPORT_SYMBOL_GPL(kvm_mmu_load);
void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
@@ -6674,15 +6735,22 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
kvm_tdp_mmu_zap_invalidated_roots(kvm, true);
}
-void kvm_mmu_init_vm(struct kvm *kvm)
+int kvm_mmu_init_vm(struct kvm *kvm)
{
+ int r;
+
kvm->arch.shadow_mmio_value = shadow_mmio_value;
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
- if (tdp_mmu_enabled)
+ if (tdp_mmu_enabled) {
kvm_mmu_init_tdp_mmu(kvm);
+ } else {
+ r = kvm_mmu_alloc_page_hash(kvm);
+ if (r)
+ return r;
+ }
kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache;
kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO;
@@ -6691,6 +6759,7 @@ void kvm_mmu_init_vm(struct kvm *kvm)
kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache;
kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO;
+ return 0;
}
static void mmu_free_vm_memory_caches(struct kvm *kvm)
@@ -6702,6 +6771,8 @@ static void mmu_free_vm_memory_caches(struct kvm *kvm)
void kvm_mmu_uninit_vm(struct kvm *kvm)
{
+ kvfree(kvm->arch.mmu_page_hash);
+
if (tdp_mmu_enabled)
kvm_mmu_uninit_tdp_mmu(kvm);
@@ -7239,6 +7310,7 @@ static void kvm_mmu_zap_memslot(struct kvm *kvm,
.start = slot->base_gfn,
.end = slot->base_gfn + slot->npages,
.may_block = true,
+ .attr_filter = KVM_FILTER_PRIVATE | KVM_FILTER_SHARED,
};
bool flush;
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 75f00598289d..65f3c89d7c5d 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -103,6 +103,9 @@ struct kvm_mmu_page {
int root_count;
refcount_t tdp_mmu_root_count;
};
+
+ bool has_mapped_host_mmio;
+
union {
/* These two members aren't used for TDP MMU */
struct {
@@ -187,7 +190,8 @@ static inline gfn_t kvm_gfn_root_bits(const struct kvm *kvm, const struct kvm_mm
return kvm_gfn_direct_bits(kvm);
}
-static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp)
+static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm *kvm,
+ struct kvm_mmu_page *sp)
{
/*
* When using the EPT page-modification log, the GPAs in the CPU dirty
@@ -197,7 +201,7 @@ static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp)
* being enabled is mandatory as the bits used to denote WP-only SPTEs
* are reserved for PAE paging (32-bit KVM).
*/
- return kvm_x86_ops.cpu_dirty_log_size && sp->role.guest_mode;
+ return kvm->arch.cpu_dirty_log_size && sp->role.guest_mode;
}
static inline gfn_t gfn_round_for_level(gfn_t gfn, int level)
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index 561c331fd6ec..1b17b12393a8 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -172,6 +172,9 @@ static int kvm_enable_external_write_tracking(struct kvm *kvm)
struct kvm_memory_slot *slot;
int r = 0, i, bkt;
+ if (kvm->arch.vm_type == KVM_X86_TDX_VM)
+ return -EOPNOTSUPP;
+
mutex_lock(&kvm->slots_arch_lock);
/*
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 68e323568e95..ed762bb4b007 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -804,9 +804,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
if (r != RET_PF_CONTINUE)
return r;
+#if PTTYPE != PTTYPE_EPT
/*
- * Do not change pte_access if the pfn is a mmio page, otherwise
- * we will cache the incorrect access into mmio spte.
+ * Treat the guest PTE protections as writable, supervisor-only if this
+ * is a supervisor write fault and CR0.WP=0 (supervisor accesses ignore
+ * PTE.W if CR0.WP=0). Don't change the access type for emulated MMIO,
+ * otherwise KVM will cache incorrect access information in the SPTE.
*/
if (fault->write && !(walker.pte_access & ACC_WRITE_MASK) &&
!is_cr0_wp(vcpu->arch.mmu) && !fault->user && fault->slot) {
@@ -822,6 +825,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
if (is_cr4_smep(vcpu->arch.mmu))
walker.pte_access &= ~ACC_EXEC_MASK;
}
+#endif
r = RET_PF_RETRY;
write_lock(&vcpu->kvm->mmu_lock);
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 0f9f47b4ab0e..df31039b5d63 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -37,7 +37,6 @@ u64 __read_mostly shadow_mmio_value;
u64 __read_mostly shadow_mmio_mask;
u64 __read_mostly shadow_mmio_access_mask;
u64 __read_mostly shadow_present_mask;
-u64 __read_mostly shadow_memtype_mask;
u64 __read_mostly shadow_me_value;
u64 __read_mostly shadow_me_mask;
u64 __read_mostly shadow_acc_track_mask;
@@ -96,8 +95,6 @@ u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
u64 spte = generation_mmio_spte_mask(gen);
u64 gpa = gfn << PAGE_SHIFT;
- WARN_ON_ONCE(!vcpu->kvm->arch.shadow_mmio_value);
-
access &= shadow_mmio_access_mask;
spte |= vcpu->kvm->arch.shadow_mmio_value | access;
spte |= gpa | shadow_nonpresent_or_rsvd_mask;
@@ -107,7 +104,7 @@ u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
return spte;
}
-static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
+static bool __kvm_is_mmio_pfn(kvm_pfn_t pfn)
{
if (pfn_valid(pfn))
return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
@@ -128,6 +125,35 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
E820_TYPE_RAM);
}
+static bool kvm_is_mmio_pfn(kvm_pfn_t pfn, int *is_host_mmio)
+{
+ /*
+ * Determining if a PFN is host MMIO is relative expensive. Cache the
+ * result locally (in the sole caller) to avoid doing the full query
+ * multiple times when creating a single SPTE.
+ */
+ if (*is_host_mmio < 0)
+ *is_host_mmio = __kvm_is_mmio_pfn(pfn);
+
+ return *is_host_mmio;
+}
+
+static void kvm_track_host_mmio_mapping(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
+
+ if (root)
+ WRITE_ONCE(root->has_mapped_host_mmio, true);
+ else
+ WRITE_ONCE(vcpu->kvm->arch.has_mapped_host_mmio, true);
+
+ /*
+ * Force vCPUs to exit and flush CPU buffers if the vCPU is using the
+ * affected root(s).
+ */
+ kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
+}
+
/*
* Returns true if the SPTE needs to be updated atomically due to having bits
* that may be changed without holding mmu_lock, and for which KVM must not
@@ -165,6 +191,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
{
int level = sp->role.level;
u64 spte = SPTE_MMU_PRESENT_MASK;
+ int is_host_mmio = -1;
bool wrprot = false;
/*
@@ -177,7 +204,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
if (sp->role.ad_disabled)
spte |= SPTE_TDP_AD_DISABLED;
- else if (kvm_mmu_page_ad_need_write_protect(sp))
+ else if (kvm_mmu_page_ad_need_write_protect(vcpu->kvm, sp))
spte |= SPTE_TDP_AD_WRPROT_ONLY;
spte |= shadow_present_mask;
@@ -212,15 +239,15 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
if (level > PG_LEVEL_4K)
spte |= PT_PAGE_SIZE_MASK;
- if (shadow_memtype_mask)
+ if (kvm_x86_ops.get_mt_mask)
spte |= kvm_x86_call(get_mt_mask)(vcpu, gfn,
- kvm_is_mmio_pfn(pfn));
+ kvm_is_mmio_pfn(pfn, &is_host_mmio));
if (host_writable)
spte |= shadow_host_writable_mask;
else
pte_access &= ~ACC_WRITE_MASK;
- if (shadow_me_value && !kvm_is_mmio_pfn(pfn))
+ if (shadow_me_value && !kvm_is_mmio_pfn(pfn, &is_host_mmio))
spte |= shadow_me_value;
spte |= (u64)pfn << PAGE_SHIFT;
@@ -265,6 +292,11 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
mark_page_dirty_in_slot(vcpu->kvm, slot, gfn);
}
+ if (static_branch_unlikely(&cpu_buf_vm_clear) &&
+ !kvm_vcpu_can_access_host_mmio(vcpu) &&
+ kvm_is_mmio_pfn(pfn, &is_host_mmio))
+ kvm_track_host_mmio_mapping(vcpu);
+
*new_spte = spte;
return wrprot;
}
@@ -440,6 +472,12 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask)
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
+void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value)
+{
+ kvm->arch.shadow_mmio_value = mmio_value;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_value);
+
void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask)
{
/* shadow_me_value must be a subset of shadow_me_mask */
@@ -463,13 +501,7 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
/* VMX_EPT_SUPPRESS_VE_BIT is needed for W or X violation. */
shadow_present_mask =
(has_exec_only ? 0ull : VMX_EPT_READABLE_MASK) | VMX_EPT_SUPPRESS_VE_BIT;
- /*
- * EPT overrides the host MTRRs, and so KVM must program the desired
- * memtype directly into the SPTEs. Note, this mask is just the mask
- * of all bits that factor into the memtype, the actual memtype must be
- * dynamically calculated, e.g. to ensure host MMIO is mapped UC.
- */
- shadow_memtype_mask = VMX_EPT_MT_MASK | VMX_EPT_IPAT_BIT;
+
shadow_acc_track_mask = VMX_EPT_RWX_MASK;
shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE;
shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE;
@@ -521,12 +553,6 @@ void kvm_mmu_reset_all_pte_masks(void)
shadow_x_mask = 0;
shadow_present_mask = PT_PRESENT_MASK;
- /*
- * For shadow paging and NPT, KVM uses PAT entry '0' to encode WB
- * memtype in the SPTEs, i.e. relies on host MTRRs to provide the
- * correct memtype (WB is the "weakest" memtype).
- */
- shadow_memtype_mask = 0;
shadow_acc_track_mask = 0;
shadow_me_mask = 0;
shadow_me_value = 0;
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 79cdceba9857..3133f066927e 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -187,7 +187,6 @@ extern u64 __read_mostly shadow_mmio_value;
extern u64 __read_mostly shadow_mmio_mask;
extern u64 __read_mostly shadow_mmio_access_mask;
extern u64 __read_mostly shadow_present_mask;
-extern u64 __read_mostly shadow_memtype_mask;
extern u64 __read_mostly shadow_me_value;
extern u64 __read_mostly shadow_me_mask;
@@ -281,6 +280,16 @@ static inline bool is_mirror_sptep(tdp_ptep_t sptep)
return is_mirror_sp(sptep_to_sp(rcu_dereference(sptep)));
}
+static inline bool kvm_vcpu_can_access_host_mmio(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
+
+ if (root)
+ return READ_ONCE(root->has_mapped_host_mmio);
+
+ return READ_ONCE(vcpu->kvm->arch.has_mapped_host_mmio);
+}
+
static inline bool is_mmio_spte(struct kvm *kvm, u64 spte)
{
return (spte & shadow_mmio_mask) == kvm->arch.shadow_mmio_value &&
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 21a3b8166242..7f3d7229b2c1 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -378,7 +378,7 @@ static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
/* Zapping leaf spte is allowed only when write lock is held. */
lockdep_assert_held_write(&kvm->mmu_lock);
/* Because write lock is held, operation should success. */
- ret = static_call(kvm_x86_remove_external_spte)(kvm, gfn, level, old_pfn);
+ ret = kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_pfn);
KVM_BUG_ON(ret, kvm);
}
@@ -485,8 +485,8 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
}
if (is_mirror_sp(sp) &&
- WARN_ON(static_call(kvm_x86_free_external_spt)(kvm, base_gfn, sp->role.level,
- sp->external_spt))) {
+ WARN_ON(kvm_x86_call(free_external_spt)(kvm, base_gfn, sp->role.level,
+ sp->external_spt))) {
/*
* Failed to free page table page in mirror page table and
* there is nothing to do further.
@@ -538,12 +538,12 @@ static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sp
* external page table, or leaf.
*/
if (is_leaf) {
- ret = static_call(kvm_x86_set_external_spte)(kvm, gfn, level, new_pfn);
+ ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_pfn);
} else {
void *external_spt = get_external_spt(gfn, new_spte, level);
KVM_BUG_ON(!external_spt, kvm);
- ret = static_call(kvm_x86_link_external_spt)(kvm, gfn, level, external_spt);
+ ret = kvm_x86_call(link_external_spt)(kvm, gfn, level, external_spt);
}
if (ret)
__kvm_tdp_mmu_write_spte(sptep, old_spte);
@@ -1153,13 +1153,12 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
return RET_PF_RETRY;
- if (fault->prefetch && is_shadow_present_pte(iter->old_spte))
- return RET_PF_SPURIOUS;
-
if (is_shadow_present_pte(iter->old_spte) &&
- is_access_allowed(fault, iter->old_spte) &&
- is_last_spte(iter->old_spte, iter->level))
+ (fault->prefetch || is_access_allowed(fault, iter->old_spte)) &&
+ is_last_spte(iter->old_spte, iter->level)) {
+ WARN_ON_ONCE(fault->pfn != spte_to_pfn(iter->old_spte));
return RET_PF_SPURIOUS;
+ }
if (unlikely(!fault->slot))
new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
@@ -1630,21 +1629,21 @@ void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
}
}
-static bool tdp_mmu_need_write_protect(struct kvm_mmu_page *sp)
+static bool tdp_mmu_need_write_protect(struct kvm *kvm, struct kvm_mmu_page *sp)
{
/*
* All TDP MMU shadow pages share the same role as their root, aside
* from level, so it is valid to key off any shadow page to determine if
* write protection is needed for an entire tree.
*/
- return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled;
+ return kvm_mmu_page_ad_need_write_protect(kvm, sp) || !kvm_ad_enabled;
}
static void clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t start, gfn_t end)
{
- const u64 dbit = tdp_mmu_need_write_protect(root) ? PT_WRITABLE_MASK :
- shadow_dirty_mask;
+ const u64 dbit = tdp_mmu_need_write_protect(kvm, root) ?
+ PT_WRITABLE_MASK : shadow_dirty_mask;
struct tdp_iter iter;
rcu_read_lock();
@@ -1689,8 +1688,8 @@ void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t gfn, unsigned long mask, bool wrprot)
{
- const u64 dbit = (wrprot || tdp_mmu_need_write_protect(root)) ? PT_WRITABLE_MASK :
- shadow_dirty_mask;
+ const u64 dbit = (wrprot || tdp_mmu_need_write_protect(kvm, root)) ?
+ PT_WRITABLE_MASK : shadow_dirty_mask;
struct tdp_iter iter;
lockdep_assert_held_write(&kvm->mmu_lock);
@@ -1911,16 +1910,13 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
*
* Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
*/
-int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
- int *root_level)
+static int __kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
+ struct kvm_mmu_page *root)
{
- struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
struct tdp_iter iter;
gfn_t gfn = addr >> PAGE_SHIFT;
int leaf = -1;
- *root_level = vcpu->arch.mmu->root_role.level;
-
for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
leaf = iter.level;
sptes[leaf] = iter.old_spte;
@@ -1929,6 +1925,36 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
return leaf;
}
+int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
+ int *root_level)
+{
+ struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
+ *root_level = vcpu->arch.mmu->root_role.level;
+
+ return __kvm_tdp_mmu_get_walk(vcpu, addr, sptes, root);
+}
+
+bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa)
+{
+ struct kvm *kvm = vcpu->kvm;
+ bool is_direct = kvm_is_addr_direct(kvm, gpa);
+ hpa_t root = is_direct ? vcpu->arch.mmu->root.hpa :
+ vcpu->arch.mmu->mirror_root_hpa;
+ u64 sptes[PT64_ROOT_MAX_LEVEL + 1], spte;
+ int leaf;
+
+ lockdep_assert_held(&kvm->mmu_lock);
+ rcu_read_lock();
+ leaf = __kvm_tdp_mmu_get_walk(vcpu, gpa, sptes, root_to_sp(root));
+ rcu_read_unlock();
+ if (leaf < 0)
+ return false;
+
+ spte = sptes[leaf];
+ return is_shadow_present_pte(spte) && is_last_spte(spte, leaf);
+}
+EXPORT_SYMBOL_GPL(kvm_tdp_mmu_gpa_is_mapped);
+
/*
* Returns the last level spte pointer of the shadow page walk for the given
* gpa, and sets *spte to the spte value. This spte may be non-preset. If no
diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h
index fde0ae986003..c53b92379e6e 100644
--- a/arch/x86/kvm/reverse_cpuid.h
+++ b/arch/x86/kvm/reverse_cpuid.h
@@ -52,6 +52,10 @@
/* CPUID level 0x80000022 (EAX) */
#define KVM_X86_FEATURE_PERFMON_V2 KVM_X86_FEATURE(CPUID_8000_0022_EAX, 0)
+/* CPUID level 0x80000021 (ECX) */
+#define KVM_X86_FEATURE_TSA_SQ_NO KVM_X86_FEATURE(CPUID_8000_0021_ECX, 1)
+#define KVM_X86_FEATURE_TSA_L1_NO KVM_X86_FEATURE(CPUID_8000_0021_ECX, 2)
+
struct cpuid_reg {
u32 function;
u32 index;
@@ -82,6 +86,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
[CPUID_8000_0022_EAX] = {0x80000022, 0, CPUID_EAX},
[CPUID_7_2_EDX] = { 7, 2, CPUID_EDX},
[CPUID_24_0_EBX] = { 0x24, 0, CPUID_EBX},
+ [CPUID_8000_0021_ECX] = {0x80000021, 0, CPUID_ECX},
};
/*
@@ -121,6 +126,8 @@ static __always_inline u32 __feature_translate(int x86_feature)
KVM_X86_TRANSLATE_FEATURE(PERFMON_V2);
KVM_X86_TRANSLATE_FEATURE(RRSBA_CTRL);
KVM_X86_TRANSLATE_FEATURE(BHI_CTRL);
+ KVM_X86_TRANSLATE_FEATURE(TSA_SQ_NO);
+ KVM_X86_TRANSLATE_FEATURE(TSA_L1_NO);
default:
return x86_feature;
}
diff --git a/arch/x86/kvm/smm.h b/arch/x86/kvm/smm.h
index a1cf2ac5bd78..551703fbe200 100644
--- a/arch/x86/kvm/smm.h
+++ b/arch/x86/kvm/smm.h
@@ -142,6 +142,9 @@ union kvm_smram {
static inline int kvm_inject_smi(struct kvm_vcpu *vcpu)
{
+ if (!kvm_x86_call(has_emulated_msr)(vcpu->kvm, MSR_IA32_SMBASE))
+ return -ENOTTY;
+
kvm_make_request(KVM_REQ_SMI, vcpu);
return 0;
}
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 067f8e3f5a0d..a34c5c3b164e 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -18,6 +18,7 @@
#include <linux/hashtable.h>
#include <linux/amd-iommu.h>
#include <linux/kvm_host.h>
+#include <linux/kvm_irqfd.h>
#include <asm/irq_remapping.h>
#include <asm/msr.h>
@@ -29,36 +30,39 @@
#include "svm.h"
/*
- * Encode the arbitrary VM ID and the vCPU's default APIC ID, i.e the vCPU ID,
- * into the GATag so that KVM can retrieve the correct vCPU from a GALog entry
- * if an interrupt can't be delivered, e.g. because the vCPU isn't running.
+ * Encode the arbitrary VM ID and the vCPU's _index_ into the GATag so that
+ * KVM can retrieve the correct vCPU from a GALog entry if an interrupt can't
+ * be delivered, e.g. because the vCPU isn't running. Use the vCPU's index
+ * instead of its ID (a.k.a. its default APIC ID), as KVM is guaranteed a fast
+ * lookup on the index, where as vCPUs whose index doesn't match their ID need
+ * to walk the entire xarray of vCPUs in the worst case scenario.
*
- * For the vCPU ID, use however many bits are currently allowed for the max
+ * For the vCPU index, use however many bits are currently allowed for the max
* guest physical APIC ID (limited by the size of the physical ID table), and
* use whatever bits remain to assign arbitrary AVIC IDs to VMs. Note, the
* size of the GATag is defined by hardware (32 bits), but is an opaque value
* as far as hardware is concerned.
*/
-#define AVIC_VCPU_ID_MASK AVIC_PHYSICAL_MAX_INDEX_MASK
+#define AVIC_VCPU_IDX_MASK AVIC_PHYSICAL_MAX_INDEX_MASK
#define AVIC_VM_ID_SHIFT HWEIGHT32(AVIC_PHYSICAL_MAX_INDEX_MASK)
#define AVIC_VM_ID_MASK (GENMASK(31, AVIC_VM_ID_SHIFT) >> AVIC_VM_ID_SHIFT)
#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VM_ID_SHIFT) & AVIC_VM_ID_MASK)
-#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK)
+#define AVIC_GATAG_TO_VCPUIDX(x) (x & AVIC_VCPU_IDX_MASK)
-#define __AVIC_GATAG(vm_id, vcpu_id) ((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \
- ((vcpu_id) & AVIC_VCPU_ID_MASK))
-#define AVIC_GATAG(vm_id, vcpu_id) \
+#define __AVIC_GATAG(vm_id, vcpu_idx) ((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \
+ ((vcpu_idx) & AVIC_VCPU_IDX_MASK))
+#define AVIC_GATAG(vm_id, vcpu_idx) \
({ \
- u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_id); \
+ u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_idx); \
\
- WARN_ON_ONCE(AVIC_GATAG_TO_VCPUID(ga_tag) != (vcpu_id)); \
+ WARN_ON_ONCE(AVIC_GATAG_TO_VCPUIDX(ga_tag) != (vcpu_idx)); \
WARN_ON_ONCE(AVIC_GATAG_TO_VMID(ga_tag) != (vm_id)); \
ga_tag; \
})
-static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_ID_MASK) == -1u);
+static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_IDX_MASK) == -1u);
static bool force_avic;
module_param_unsafe(force_avic, bool, 0444);
@@ -75,14 +79,6 @@ static bool next_vm_id_wrapped = 0;
static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
bool x2avic_enabled;
-/*
- * This is a wrapper of struct amd_iommu_ir_data.
- */
-struct amd_svm_iommu_ir {
- struct list_head node; /* Used by SVM for per-vcpu ir_list */
- void *data; /* Storing pointer to struct amd_ir_data */
-};
-
static void avic_activate_vmcb(struct vcpu_svm *svm)
{
struct vmcb *vmcb = svm->vmcb01.ptr;
@@ -147,16 +143,16 @@ int avic_ga_log_notifier(u32 ga_tag)
struct kvm_svm *kvm_svm;
struct kvm_vcpu *vcpu = NULL;
u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
- u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
+ u32 vcpu_idx = AVIC_GATAG_TO_VCPUIDX(ga_tag);
- pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
- trace_kvm_avic_ga_log(vm_id, vcpu_id);
+ pr_debug("SVM: %s: vm_id=%#x, vcpu_idx=%#x\n", __func__, vm_id, vcpu_idx);
+ trace_kvm_avic_ga_log(vm_id, vcpu_idx);
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
if (kvm_svm->avic_vm_id != vm_id)
continue;
- vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id);
+ vcpu = kvm_get_vcpu(&kvm_svm->kvm, vcpu_idx);
break;
}
spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
@@ -180,10 +176,8 @@ void avic_vm_destroy(struct kvm *kvm)
if (!enable_apicv)
return;
- if (kvm_svm->avic_logical_id_table_page)
- __free_page(kvm_svm->avic_logical_id_table_page);
- if (kvm_svm->avic_physical_id_table_page)
- __free_page(kvm_svm->avic_physical_id_table_page);
+ free_page((unsigned long)kvm_svm->avic_logical_id_table);
+ free_page((unsigned long)kvm_svm->avic_physical_id_table);
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
hash_del(&kvm_svm->hnode);
@@ -196,27 +190,19 @@ int avic_vm_init(struct kvm *kvm)
int err = -ENOMEM;
struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
struct kvm_svm *k2;
- struct page *p_page;
- struct page *l_page;
u32 vm_id;
if (!enable_apicv)
return 0;
- /* Allocating physical APIC ID table (4KB) */
- p_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
- if (!p_page)
+ kvm_svm->avic_physical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!kvm_svm->avic_physical_id_table)
goto free_avic;
- kvm_svm->avic_physical_id_table_page = p_page;
-
- /* Allocating logical APIC ID table (4KB) */
- l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
- if (!l_page)
+ kvm_svm->avic_logical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!kvm_svm->avic_logical_id_table)
goto free_avic;
- kvm_svm->avic_logical_id_table_page = l_page;
-
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
again:
vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
@@ -242,17 +228,19 @@ free_avic:
return err;
}
+static phys_addr_t avic_get_backing_page_address(struct vcpu_svm *svm)
+{
+ return __sme_set(__pa(svm->vcpu.arch.apic->regs));
+}
+
void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
{
struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
- phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
- phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
- phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page));
- vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
- vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
- vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
- vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK;
+ vmcb->control.avic_backing_page = avic_get_backing_page_address(svm);
+ vmcb->control.avic_logical_id = __sme_set(__pa(kvm_svm->avic_logical_id_table));
+ vmcb->control.avic_physical_id = __sme_set(__pa(kvm_svm->avic_physical_id_table));
+ vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE;
if (kvm_apicv_activated(svm->vcpu.kvm))
avic_activate_vmcb(svm);
@@ -260,32 +248,31 @@ void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
avic_deactivate_vmcb(svm);
}
-static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
- unsigned int index)
-{
- u64 *avic_physical_id_table;
- struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
-
- if ((!x2avic_enabled && index > AVIC_MAX_PHYSICAL_ID) ||
- (index > X2AVIC_MAX_PHYSICAL_ID))
- return NULL;
-
- avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);
-
- return &avic_physical_id_table[index];
-}
-
static int avic_init_backing_page(struct kvm_vcpu *vcpu)
{
- u64 *entry, new_entry;
- int id = vcpu->vcpu_id;
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
struct vcpu_svm *svm = to_svm(vcpu);
+ u32 id = vcpu->vcpu_id;
+ u64 new_entry;
+ /*
+ * Inhibit AVIC if the vCPU ID is bigger than what is supported by AVIC
+ * hardware. Immediately clear apicv_active, i.e. don't wait until the
+ * KVM_REQ_APICV_UPDATE request is processed on the first KVM_RUN, as
+ * avic_vcpu_load() expects to be called if and only if the vCPU has
+ * fully initialized AVIC.
+ */
if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) ||
- (id > X2AVIC_MAX_PHYSICAL_ID))
- return -EINVAL;
+ (id > X2AVIC_MAX_PHYSICAL_ID)) {
+ kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG);
+ vcpu->arch.apic->apicv_active = false;
+ return 0;
+ }
+
+ BUILD_BUG_ON((AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE ||
+ (X2AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE);
- if (!vcpu->arch.apic->regs)
+ if (WARN_ON_ONCE(!vcpu->arch.apic->regs))
return -EINVAL;
if (kvm_apicv_activated(vcpu->kvm)) {
@@ -302,19 +289,21 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
return ret;
}
- svm->avic_backing_page = virt_to_page(vcpu->arch.apic->regs);
+ /* Note, fls64() returns the bit position, +1. */
+ BUILD_BUG_ON(__PHYSICAL_MASK_SHIFT >
+ fls64(AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK));
/* Setting AVIC backing page address in the phy APIC ID table */
- entry = avic_get_physical_id_entry(vcpu, id);
- if (!entry)
- return -EINVAL;
+ new_entry = avic_get_backing_page_address(svm) |
+ AVIC_PHYSICAL_ID_ENTRY_VALID_MASK;
+ svm->avic_physical_id_entry = new_entry;
- new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
- AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
- AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
- WRITE_ONCE(*entry, new_entry);
-
- svm->avic_physical_id_cache = entry;
+ /*
+ * Initialize the real table, as vCPUs must have a valid entry in order
+ * for broadcast IPIs to function correctly (broadcast IPIs ignore
+ * invalid entries, i.e. aren't guaranteed to generate a VM-Exit).
+ */
+ WRITE_ONCE(kvm_svm->avic_physical_id_table[id], new_entry);
return 0;
}
@@ -448,7 +437,7 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source
if (apic_x2apic_mode(source))
avic_logical_id_table = NULL;
else
- avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page);
+ avic_logical_id_table = kvm_svm->avic_logical_id_table;
/*
* AVIC is inhibited if vCPUs aren't mapped 1:1 with logical
@@ -550,7 +539,6 @@ unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
{
struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
- u32 *logical_apic_id_table;
u32 cluster, index;
ldr = GET_APIC_LOGICAL_ID(ldr);
@@ -571,9 +559,7 @@ static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
return NULL;
index += (cluster << 2);
- logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page);
-
- return &logical_apic_id_table[index];
+ return &kvm_svm->avic_logical_id_table[index];
}
static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
@@ -722,6 +708,9 @@ int avic_init_vcpu(struct vcpu_svm *svm)
int ret;
struct kvm_vcpu *vcpu = &svm->vcpu;
+ INIT_LIST_HEAD(&svm->ir_list);
+ spin_lock_init(&svm->ir_list_lock);
+
if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm))
return 0;
@@ -729,8 +718,6 @@ int avic_init_vcpu(struct vcpu_svm *svm)
if (ret)
return ret;
- INIT_LIST_HEAD(&svm->ir_list);
- spin_lock_init(&svm->ir_list_lock);
svm->dfr_reg = APIC_DFR_FLAT;
return ret;
@@ -742,316 +729,161 @@ void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
avic_handle_ldr_update(vcpu);
}
-static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
+static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd)
{
- int ret = 0;
+ struct kvm_vcpu *vcpu = irqfd->irq_bypass_vcpu;
unsigned long flags;
- struct amd_svm_iommu_ir *ir;
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (!kvm_arch_has_assigned_device(vcpu->kvm))
- return 0;
- /*
- * Here, we go through the per-vcpu ir_list to update all existing
- * interrupt remapping table entry targeting this vcpu.
- */
- spin_lock_irqsave(&svm->ir_list_lock, flags);
-
- if (list_empty(&svm->ir_list))
- goto out;
+ if (!vcpu)
+ return;
- list_for_each_entry(ir, &svm->ir_list, node) {
- if (activate)
- ret = amd_iommu_activate_guest_mode(ir->data);
- else
- ret = amd_iommu_deactivate_guest_mode(ir->data);
- if (ret)
- break;
- }
-out:
- spin_unlock_irqrestore(&svm->ir_list_lock, flags);
- return ret;
+ spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags);
+ list_del(&irqfd->vcpu_list);
+ spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags);
}
-static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+ unsigned int host_irq, uint32_t guest_irq,
+ struct kvm_vcpu *vcpu, u32 vector)
{
- unsigned long flags;
- struct amd_svm_iommu_ir *cur;
-
- spin_lock_irqsave(&svm->ir_list_lock, flags);
- list_for_each_entry(cur, &svm->ir_list, node) {
- if (cur->data != pi->ir_data)
- continue;
- list_del(&cur->node);
- kfree(cur);
- break;
- }
- spin_unlock_irqrestore(&svm->ir_list_lock, flags);
-}
-
-static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
-{
- int ret = 0;
- unsigned long flags;
- struct amd_svm_iommu_ir *ir;
- u64 entry;
-
- if (WARN_ON_ONCE(!pi->ir_data))
- return -EINVAL;
-
- /**
- * In some cases, the existing irte is updated and re-set,
- * so we need to check here if it's already been * added
- * to the ir_list.
- */
- if (pi->prev_ga_tag) {
- struct kvm *kvm = svm->vcpu.kvm;
- u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
- struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
- struct vcpu_svm *prev_svm;
-
- if (!prev_vcpu) {
- ret = -EINVAL;
- goto out;
- }
-
- prev_svm = to_svm(prev_vcpu);
- svm_ir_list_del(prev_svm, pi);
- }
-
- /**
- * Allocating new amd_iommu_pi_data, which will get
- * add to the per-vcpu ir_list.
- */
- ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_ATOMIC | __GFP_ACCOUNT);
- if (!ir) {
- ret = -ENOMEM;
- goto out;
- }
- ir->data = pi->ir_data;
-
- spin_lock_irqsave(&svm->ir_list_lock, flags);
-
/*
- * Update the target pCPU for IOMMU doorbells if the vCPU is running.
- * If the vCPU is NOT running, i.e. is blocking or scheduled out, KVM
- * will update the pCPU info when the vCPU awkened and/or scheduled in.
- * See also avic_vcpu_load().
+ * If the IRQ was affined to a different vCPU, remove the IRTE metadata
+ * from the *previous* vCPU's list.
*/
- entry = READ_ONCE(*(svm->avic_physical_id_cache));
- if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
- amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK,
- true, pi->ir_data);
-
- list_add(&ir->node, &svm->ir_list);
- spin_unlock_irqrestore(&svm->ir_list_lock, flags);
-out:
- return ret;
-}
+ svm_ir_list_del(irqfd);
-/*
- * Note:
- * The HW cannot support posting multicast/broadcast
- * interrupts to a vCPU. So, we still use legacy interrupt
- * remapping for these kind of interrupts.
- *
- * For lowest-priority interrupts, we only support
- * those with single CPU as the destination, e.g. user
- * configures the interrupts via /proc/irq or uses
- * irqbalance to make the interrupts single-CPU.
- */
-static int
-get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
- struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
-{
- struct kvm_lapic_irq irq;
- struct kvm_vcpu *vcpu = NULL;
-
- kvm_set_msi_irq(kvm, e, &irq);
-
- if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
- !kvm_irq_is_postable(&irq)) {
- pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
- __func__, irq.vector);
- return -1;
- }
-
- pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
- irq.vector);
- *svm = to_svm(vcpu);
- vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page));
- vcpu_info->vector = irq.vector;
-
- return 0;
-}
-
-/*
- * avic_pi_update_irte - set IRTE for Posted-Interrupts
- *
- * @kvm: kvm
- * @host_irq: host irq of the interrupt
- * @guest_irq: gsi of the interrupt
- * @set: set or unset PI
- * returns 0 on success, < 0 on failure
- */
-int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set)
-{
- struct kvm_kernel_irq_routing_entry *e;
- struct kvm_irq_routing_table *irq_rt;
- bool enable_remapped_mode = true;
- int idx, ret = 0;
-
- if (!kvm_arch_has_assigned_device(kvm) || !kvm_arch_has_irq_bypass())
- return 0;
-
- pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
- __func__, host_irq, guest_irq, set);
-
- idx = srcu_read_lock(&kvm->irq_srcu);
- irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
-
- if (guest_irq >= irq_rt->nr_rt_entries ||
- hlist_empty(&irq_rt->map[guest_irq])) {
- pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
- guest_irq, irq_rt->nr_rt_entries);
- goto out;
- }
-
- hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
- struct vcpu_data vcpu_info;
- struct vcpu_svm *svm = NULL;
+ if (vcpu) {
+ /*
+ * Try to enable guest_mode in IRTE, unless AVIC is inhibited,
+ * in which case configure the IRTE for legacy mode, but track
+ * the IRTE metadata so that it can be converted to guest mode
+ * if AVIC is enabled/uninhibited in the future.
+ */
+ struct amd_iommu_pi_data pi_data = {
+ .ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
+ vcpu->vcpu_idx),
+ .is_guest_mode = kvm_vcpu_apicv_active(vcpu),
+ .vapic_addr = avic_get_backing_page_address(to_svm(vcpu)),
+ .vector = vector,
+ };
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 entry;
+ int ret;
- if (e->type != KVM_IRQ_ROUTING_MSI)
- continue;
+ /*
+ * Prevent the vCPU from being scheduled out or migrated until
+ * the IRTE is updated and its metadata has been added to the
+ * list of IRQs being posted to the vCPU, to ensure the IRTE
+ * isn't programmed with stale pCPU/IsRunning information.
+ */
+ guard(spinlock_irqsave)(&svm->ir_list_lock);
- /**
- * Here, we setup with legacy mode in the following cases:
- * 1. When cannot target interrupt to a specific vcpu.
- * 2. Unsetting posted interrupt.
- * 3. APIC virtualization is disabled for the vcpu.
- * 4. IRQ has incompatible delivery mode (SMI, INIT, etc)
+ /*
+ * Update the target pCPU for IOMMU doorbells if the vCPU is
+ * running. If the vCPU is NOT running, i.e. is blocking or
+ * scheduled out, KVM will update the pCPU info when the vCPU
+ * is awakened and/or scheduled in. See also avic_vcpu_load().
*/
- if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
- kvm_vcpu_apicv_active(&svm->vcpu)) {
- struct amd_iommu_pi_data pi;
-
- enable_remapped_mode = false;
-
- /* Try to enable guest_mode in IRTE */
- pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
- AVIC_HPA_MASK);
- pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
- svm->vcpu.vcpu_id);
- pi.is_guest_mode = true;
- pi.vcpu_data = &vcpu_info;
- ret = irq_set_vcpu_affinity(host_irq, &pi);
-
- /**
- * Here, we successfully setting up vcpu affinity in
- * IOMMU guest mode. Now, we need to store the posted
- * interrupt information in a per-vcpu ir_list so that
- * we can reference to them directly when we update vcpu
- * scheduling information in IOMMU irte.
- */
- if (!ret && pi.is_guest_mode)
- svm_ir_list_add(svm, &pi);
+ entry = svm->avic_physical_id_entry;
+ if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) {
+ pi_data.cpu = entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
+ } else {
+ pi_data.cpu = -1;
+ pi_data.ga_log_intr = entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR;
}
- if (!ret && svm) {
- trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id,
- e->gsi, vcpu_info.vector,
- vcpu_info.pi_desc_addr, set);
- }
+ ret = irq_set_vcpu_affinity(host_irq, &pi_data);
+ if (ret)
+ return ret;
- if (ret < 0) {
- pr_err("%s: failed to update PI IRTE\n", __func__);
- goto out;
+ /*
+ * Revert to legacy mode if the IOMMU didn't provide metadata
+ * for the IRTE, which KVM needs to keep the IRTE up-to-date,
+ * e.g. if the vCPU is migrated or AVIC is disabled.
+ */
+ if (WARN_ON_ONCE(!pi_data.ir_data)) {
+ irq_set_vcpu_affinity(host_irq, NULL);
+ return -EIO;
}
- }
- ret = 0;
- if (enable_remapped_mode) {
- /* Use legacy mode in IRTE */
- struct amd_iommu_pi_data pi;
+ irqfd->irq_bypass_data = pi_data.ir_data;
+ list_add(&irqfd->vcpu_list, &svm->ir_list);
+ return 0;
+ }
+ return irq_set_vcpu_affinity(host_irq, NULL);
+}
- /**
- * Here, pi is used to:
- * - Tell IOMMU to use legacy mode for this interrupt.
- * - Retrieve ga_tag of prior interrupt remapping data.
- */
- pi.prev_ga_tag = 0;
- pi.is_guest_mode = false;
- ret = irq_set_vcpu_affinity(host_irq, &pi);
+enum avic_vcpu_action {
+ /*
+ * There is no need to differentiate between activate and deactivate,
+ * as KVM only refreshes AVIC state when the vCPU is scheduled in and
+ * isn't blocking, i.e. the pCPU must always be (in)valid when AVIC is
+ * being (de)activated.
+ */
+ AVIC_TOGGLE_ON_OFF = BIT(0),
+ AVIC_ACTIVATE = AVIC_TOGGLE_ON_OFF,
+ AVIC_DEACTIVATE = AVIC_TOGGLE_ON_OFF,
- /**
- * Check if the posted interrupt was previously
- * setup with the guest_mode by checking if the ga_tag
- * was cached. If so, we need to clean up the per-vcpu
- * ir_list.
- */
- if (!ret && pi.prev_ga_tag) {
- int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
- struct kvm_vcpu *vcpu;
+ /*
+ * No unique action is required to deal with a vCPU that stops/starts
+ * running. A vCPU that starts running by definition stops blocking as
+ * well, and a vCPU that stops running can't have been blocking, i.e.
+ * doesn't need to toggle GALogIntr.
+ */
+ AVIC_START_RUNNING = 0,
+ AVIC_STOP_RUNNING = 0,
- vcpu = kvm_get_vcpu_by_id(kvm, id);
- if (vcpu)
- svm_ir_list_del(to_svm(vcpu), &pi);
- }
- }
-out:
- srcu_read_unlock(&kvm->irq_srcu, idx);
- return ret;
-}
+ /*
+ * When a vCPU starts blocking, KVM needs to set the GALogIntr flag
+ * int all associated IRTEs so that KVM can wake the vCPU if an IRQ is
+ * sent to the vCPU.
+ */
+ AVIC_START_BLOCKING = BIT(1),
+};
-static inline int
-avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
+static void avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu,
+ enum avic_vcpu_action action)
{
- int ret = 0;
- struct amd_svm_iommu_ir *ir;
+ bool ga_log_intr = (action & AVIC_START_BLOCKING);
struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_kernel_irqfd *irqfd;
lockdep_assert_held(&svm->ir_list_lock);
- if (!kvm_arch_has_assigned_device(vcpu->kvm))
- return 0;
-
/*
* Here, we go through the per-vcpu ir_list to update all existing
* interrupt remapping table entry targeting this vcpu.
*/
if (list_empty(&svm->ir_list))
- return 0;
+ return;
- list_for_each_entry(ir, &svm->ir_list, node) {
- ret = amd_iommu_update_ga(cpu, r, ir->data);
- if (ret)
- return ret;
+ list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) {
+ void *data = irqfd->irq_bypass_data;
+
+ if (!(action & AVIC_TOGGLE_ON_OFF))
+ WARN_ON_ONCE(amd_iommu_update_ga(data, cpu, ga_log_intr));
+ else if (cpu >= 0)
+ WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, cpu, ga_log_intr));
+ else
+ WARN_ON_ONCE(amd_iommu_deactivate_guest_mode(data));
}
- return 0;
}
-void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu,
+ enum avic_vcpu_action action)
{
- u64 entry;
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
int h_physical_id = kvm_cpu_get_apicid(cpu);
struct vcpu_svm *svm = to_svm(vcpu);
unsigned long flags;
+ u64 entry;
lockdep_assert_preemption_disabled();
if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
return;
- /*
- * No need to update anything if the vCPU is blocking, i.e. if the vCPU
- * is being scheduled in after being preempted. The CPU entries in the
- * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
- * If the vCPU was migrated, its new CPU value will be stuffed when the
- * vCPU unblocks.
- */
- if (kvm_vcpu_is_blocking(vcpu))
+ if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE))
return;
/*
@@ -1063,38 +895,57 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
*/
spin_lock_irqsave(&svm->ir_list_lock, flags);
- entry = READ_ONCE(*(svm->avic_physical_id_cache));
+ entry = svm->avic_physical_id_entry;
WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
- entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
+ entry &= ~(AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK |
+ AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR);
entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
- WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
- avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
+ svm->avic_physical_id_entry = entry;
+
+ /*
+ * If IPI virtualization is disabled, clear IsRunning when updating the
+ * actual Physical ID table, so that the CPU never sees IsRunning=1.
+ * Keep the APIC ID up-to-date in the entry to minimize the chances of
+ * things going sideways if hardware peeks at the ID.
+ */
+ if (!enable_ipiv)
+ entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+
+ WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry);
+
+ avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, action);
spin_unlock_irqrestore(&svm->ir_list_lock, flags);
}
-void avic_vcpu_put(struct kvm_vcpu *vcpu)
+void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
- u64 entry;
+ /*
+ * No need to update anything if the vCPU is blocking, i.e. if the vCPU
+ * is being scheduled in after being preempted. The CPU entries in the
+ * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
+ * If the vCPU was migrated, its new CPU value will be stuffed when the
+ * vCPU unblocks.
+ */
+ if (kvm_vcpu_is_blocking(vcpu))
+ return;
+
+ __avic_vcpu_load(vcpu, cpu, AVIC_START_RUNNING);
+}
+
+static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action)
+{
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
struct vcpu_svm *svm = to_svm(vcpu);
unsigned long flags;
+ u64 entry = svm->avic_physical_id_entry;
lockdep_assert_preemption_disabled();
- /*
- * Note, reading the Physical ID entry outside of ir_list_lock is safe
- * as only the pCPU that has loaded (or is loading) the vCPU is allowed
- * to modify the entry, and preemption is disabled. I.e. the vCPU
- * can't be scheduled out and thus avic_vcpu_{put,load}() can't run
- * recursively.
- */
- entry = READ_ONCE(*(svm->avic_physical_id_cache));
-
- /* Nothing to do if IsRunning == '0' due to vCPU blocking. */
- if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK))
+ if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE))
return;
/*
@@ -1107,13 +958,62 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu)
*/
spin_lock_irqsave(&svm->ir_list_lock, flags);
- avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
+ avic_update_iommu_vcpu_affinity(vcpu, -1, action);
+
+ WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR);
+ /*
+ * Keep the previous APIC ID in the entry so that a rogue doorbell from
+ * hardware is at least restricted to a CPU associated with the vCPU.
+ */
entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
- WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+
+ if (enable_ipiv)
+ WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry);
+
+ /*
+ * Note! Don't set AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR in the table as
+ * it's a synthetic flag that usurps an unused should-be-zero bit.
+ */
+ if (action & AVIC_START_BLOCKING)
+ entry |= AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR;
+
+ svm->avic_physical_id_entry = entry;
spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+}
+
+void avic_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Note, reading the Physical ID entry outside of ir_list_lock is safe
+ * as only the pCPU that has loaded (or is loading) the vCPU is allowed
+ * to modify the entry, and preemption is disabled. I.e. the vCPU
+ * can't be scheduled out and thus avic_vcpu_{put,load}() can't run
+ * recursively.
+ */
+ u64 entry = to_svm(vcpu)->avic_physical_id_entry;
+
+ /*
+ * Nothing to do if IsRunning == '0' due to vCPU blocking, i.e. if the
+ * vCPU is preempted while its in the process of blocking. WARN if the
+ * vCPU wasn't running and isn't blocking, KVM shouldn't attempt to put
+ * the AVIC if it wasn't previously loaded.
+ */
+ if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) {
+ if (WARN_ON_ONCE(!kvm_vcpu_is_blocking(vcpu)))
+ return;
+ /*
+ * The vCPU was preempted while blocking, ensure its IRTEs are
+ * configured to generate GA Log Interrupts.
+ */
+ if (!(WARN_ON_ONCE(!(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR))))
+ return;
+ }
+
+ __avic_vcpu_put(vcpu, kvm_vcpu_is_blocking(vcpu) ? AVIC_START_BLOCKING :
+ AVIC_STOP_RUNNING);
}
void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
@@ -1142,19 +1042,18 @@ void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
- bool activated = kvm_vcpu_apicv_active(vcpu);
-
if (!enable_apicv)
return;
+ /* APICv should only be toggled on/off while the vCPU is running. */
+ WARN_ON_ONCE(kvm_vcpu_is_blocking(vcpu));
+
avic_refresh_virtual_apic_mode(vcpu);
- if (activated)
- avic_vcpu_load(vcpu, vcpu->cpu);
+ if (kvm_vcpu_apicv_active(vcpu))
+ __avic_vcpu_load(vcpu, vcpu->cpu, AVIC_ACTIVATE);
else
- avic_vcpu_put(vcpu);
-
- avic_set_pi_irte_mode(vcpu, activated);
+ __avic_vcpu_put(vcpu, AVIC_DEACTIVATE);
}
void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
@@ -1162,20 +1061,25 @@ void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
if (!kvm_vcpu_apicv_active(vcpu))
return;
- /*
- * Unload the AVIC when the vCPU is about to block, _before_
- * the vCPU actually blocks.
- *
- * Any IRQs that arrive before IsRunning=0 will not cause an
- * incomplete IPI vmexit on the source, therefore vIRR will also
- * be checked by kvm_vcpu_check_block() before blocking. The
- * memory barrier implicit in set_current_state orders writing
- * IsRunning=0 before reading the vIRR. The processor needs a
- * matching memory barrier on interrupt delivery between writing
- * IRR and reading IsRunning; the lack of this barrier might be
- * the cause of errata #1235).
- */
- avic_vcpu_put(vcpu);
+ /*
+ * Unload the AVIC when the vCPU is about to block, _before_ the vCPU
+ * actually blocks.
+ *
+ * Note, any IRQs that arrive before IsRunning=0 will not cause an
+ * incomplete IPI vmexit on the source; kvm_vcpu_check_block() handles
+ * this by checking vIRR one last time before blocking. The memory
+ * barrier implicit in set_current_state orders writing IsRunning=0
+ * before reading the vIRR. The processor needs a matching memory
+ * barrier on interrupt delivery between writing IRR and reading
+ * IsRunning; the lack of this barrier might be the cause of errata #1235).
+ *
+ * Clear IsRunning=0 even if guest IRQs are disabled, i.e. even if KVM
+ * doesn't need to detect events for scheduling purposes. The doorbell
+ * used to signal running vCPUs cannot be blocked, i.e. will perturb the
+ * CPU and cause noisy neighbor problems if the VM is sending interrupts
+ * to the vCPU while it's scheduled out.
+ */
+ __avic_vcpu_put(vcpu, AVIC_START_BLOCKING);
}
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
@@ -1228,6 +1132,14 @@ bool avic_hardware_setup(void)
if (x2avic_enabled)
pr_info("x2AVIC enabled\n");
+ /*
+ * Disable IPI virtualization for AMD Family 17h CPUs (Zen1 and Zen2)
+ * due to erratum 1235, which results in missed VM-Exits on the sender
+ * and thus missed wake events for blocking vCPUs due to the CPU
+ * failing to see a software update to clear IsRunning.
+ */
+ enable_ipiv = enable_ipiv && boot_cpu_data.x86 != 0x17;
+
amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
return true;
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 834b67672d50..b7fd2e869998 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -185,12 +185,87 @@ void recalc_intercepts(struct vcpu_svm *svm)
}
/*
+ * This array (and its actual size) holds the set of offsets (indexing by chunk
+ * size) to process when merging vmcb12's MSRPM with vmcb01's MSRPM. Note, the
+ * set of MSRs for which interception is disabled in vmcb01 is per-vCPU, e.g.
+ * based on CPUID features. This array only tracks MSRs that *might* be passed
+ * through to the guest.
+ *
+ * Hardcode the capacity of the array based on the maximum number of _offsets_.
+ * MSRs are batched together, so there are fewer offsets than MSRs.
+ */
+static int nested_svm_msrpm_merge_offsets[7] __ro_after_init;
+static int nested_svm_nr_msrpm_merge_offsets __ro_after_init;
+typedef unsigned long nsvm_msrpm_merge_t;
+
+int __init nested_svm_init_msrpm_merge_offsets(void)
+{
+ static const u32 merge_msrs[] __initconst = {
+ MSR_STAR,
+ MSR_IA32_SYSENTER_CS,
+ MSR_IA32_SYSENTER_EIP,
+ MSR_IA32_SYSENTER_ESP,
+ #ifdef CONFIG_X86_64
+ MSR_GS_BASE,
+ MSR_FS_BASE,
+ MSR_KERNEL_GS_BASE,
+ MSR_LSTAR,
+ MSR_CSTAR,
+ MSR_SYSCALL_MASK,
+ #endif
+ MSR_IA32_SPEC_CTRL,
+ MSR_IA32_PRED_CMD,
+ MSR_IA32_FLUSH_CMD,
+ MSR_IA32_APERF,
+ MSR_IA32_MPERF,
+ MSR_IA32_LASTBRANCHFROMIP,
+ MSR_IA32_LASTBRANCHTOIP,
+ MSR_IA32_LASTINTFROMIP,
+ MSR_IA32_LASTINTTOIP,
+ };
+ int i, j;
+
+ for (i = 0; i < ARRAY_SIZE(merge_msrs); i++) {
+ int bit_nr = svm_msrpm_bit_nr(merge_msrs[i]);
+ u32 offset;
+
+ if (WARN_ON(bit_nr < 0))
+ return -EIO;
+
+ /*
+ * Merging is done in chunks to reduce the number of accesses
+ * to L1's bitmap.
+ */
+ offset = bit_nr / BITS_PER_BYTE / sizeof(nsvm_msrpm_merge_t);
+
+ for (j = 0; j < nested_svm_nr_msrpm_merge_offsets; j++) {
+ if (nested_svm_msrpm_merge_offsets[j] == offset)
+ break;
+ }
+
+ if (j < nested_svm_nr_msrpm_merge_offsets)
+ continue;
+
+ if (WARN_ON(j >= ARRAY_SIZE(nested_svm_msrpm_merge_offsets)))
+ return -EIO;
+
+ nested_svm_msrpm_merge_offsets[j] = offset;
+ nested_svm_nr_msrpm_merge_offsets++;
+ }
+
+ return 0;
+}
+
+/*
* Merge L0's (KVM) and L1's (Nested VMCB) MSR permission bitmaps. The function
* is optimized in that it only merges the parts where KVM MSR permission bitmap
* may contain zero bits.
*/
-static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
+static bool nested_svm_merge_msrpm(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ nsvm_msrpm_merge_t *msrpm02 = svm->nested.msrpm;
+ nsvm_msrpm_merge_t *msrpm01 = svm->msrpm;
int i;
/*
@@ -205,7 +280,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
if (!svm->nested.force_msr_bitmap_recalc) {
struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments;
- if (kvm_hv_hypercall_enabled(&svm->vcpu) &&
+ if (kvm_hv_hypercall_enabled(vcpu) &&
hve->hv_enlightenments_control.msr_bitmap &&
(svm->nested.ctl.clean & BIT(HV_VMCB_NESTED_ENLIGHTENMENTS)))
goto set_msrpm_base_pa;
@@ -215,25 +290,17 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
return true;
- for (i = 0; i < MSRPM_OFFSETS; i++) {
- u32 value, p;
- u64 offset;
-
- if (msrpm_offsets[i] == 0xffffffff)
- break;
-
- p = msrpm_offsets[i];
+ for (i = 0; i < nested_svm_nr_msrpm_merge_offsets; i++) {
+ const int p = nested_svm_msrpm_merge_offsets[i];
+ nsvm_msrpm_merge_t l1_val;
+ gpa_t gpa;
- /* x2apic msrs are intercepted always for the nested guest */
- if (is_x2apic_msrpm_offset(p))
- continue;
-
- offset = svm->nested.ctl.msrpm_base_pa + (p * 4);
+ gpa = svm->nested.ctl.msrpm_base_pa + (p * sizeof(l1_val));
- if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))
+ if (kvm_vcpu_read_guest(vcpu, gpa, &l1_val, sizeof(l1_val)))
return false;
- svm->nested.msrpm[p] = svm->msrpm[p] | value;
+ msrpm02[p] = msrpm01[p] | l1_val;
}
svm->nested.force_msr_bitmap_recalc = false;
@@ -678,6 +745,33 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa;
+ /*
+ * Stash vmcb02's counter if the guest hasn't moved past the guilty
+ * instruction; otherwise, reset the counter to '0'.
+ *
+ * In order to detect if L2 has made forward progress or not, track the
+ * RIP at which a bus lock has occurred on a per-vmcb12 basis. If RIP
+ * is changed, guest has clearly made forward progress, bus_lock_counter
+ * still remained '1', so reset bus_lock_counter to '0'. Eg. In the
+ * scenario, where a buslock happened in L1 before VMRUN, the bus lock
+ * firmly happened on an instruction in the past. Even if vmcb01's
+ * counter is still '1', (because the guilty instruction got patched),
+ * the vCPU has clearly made forward progress and so KVM should reset
+ * vmcb02's counter to '0'.
+ *
+ * If the RIP hasn't changed, stash the bus lock counter at nested VMRUN
+ * to prevent the same guilty instruction from triggering a VM-Exit. Eg.
+ * if userspace rate-limits the vCPU, then it's entirely possible that
+ * L1's tick interrupt is pending by the time userspace re-runs the
+ * vCPU. If KVM unconditionally clears the counter on VMRUN, then when
+ * L1 re-enters L2, the same instruction will trigger a VM-Exit and the
+ * entire cycle start over.
+ */
+ if (vmcb02->save.rip && (svm->nested.ctl.bus_lock_rip == vmcb02->save.rip))
+ vmcb02->control.bus_lock_counter = 1;
+ else
+ vmcb02->control.bus_lock_counter = 0;
+
/* Done at vmrun: asid. */
/* Also overwritten later if necessary. */
@@ -910,7 +1004,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, true))
goto out_exit_err;
- if (nested_svm_vmrun_msrpm(svm))
+ if (nested_svm_merge_msrpm(vcpu))
goto out;
out_exit_err:
@@ -1039,8 +1133,17 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
}
+ /*
+ * Invalidate bus_lock_rip unless KVM is still waiting for the guest
+ * to make forward progress before re-enabling bus lock detection.
+ */
+ if (!vmcb02->control.bus_lock_counter)
+ svm->nested.ctl.bus_lock_rip = INVALID_GPA;
+
nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
+ kvm_nested_vmexit_handle_ibrs(vcpu);
+
svm_switch_vmcb(svm, &svm->vmcb01);
/*
@@ -1194,7 +1297,6 @@ int svm_allocate_nested(struct vcpu_svm *svm)
svm->nested.msrpm = svm_vcpu_alloc_msrpm();
if (!svm->nested.msrpm)
goto err_free_vmcb02;
- svm_vcpu_init_msrpm(&svm->vcpu, svm->nested.msrpm);
svm->nested.initialized = true;
return 0;
@@ -1254,26 +1356,26 @@ void svm_leave_nested(struct kvm_vcpu *vcpu)
static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
{
- u32 offset, msr, value;
- int write, mask;
+ gpa_t base = svm->nested.ctl.msrpm_base_pa;
+ int write, bit_nr;
+ u8 value, mask;
+ u32 msr;
if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
return NESTED_EXIT_HOST;
msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
- offset = svm_msrpm_offset(msr);
+ bit_nr = svm_msrpm_bit_nr(msr);
write = svm->vmcb->control.exit_info_1 & 1;
- mask = 1 << ((2 * (msr & 0xf)) + write);
- if (offset == MSR_INVALID)
+ if (bit_nr < 0)
return NESTED_EXIT_DONE;
- /* Offset is in 32 bit units but need in 8 bit units */
- offset *= 4;
-
- if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.ctl.msrpm_base_pa + offset, &value, 4))
+ if (kvm_vcpu_read_guest(&svm->vcpu, base + bit_nr / BITS_PER_BYTE,
+ &value, sizeof(value)))
return NESTED_EXIT_DONE;
+ mask = BIT(write) << (bit_nr & (BITS_PER_BYTE - 1));
return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
}
@@ -1783,13 +1885,11 @@ out_free:
static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
{
- struct vcpu_svm *svm = to_svm(vcpu);
-
if (WARN_ON(!is_guest_mode(vcpu)))
return true;
if (!vcpu->arch.pdptrs_from_userspace &&
- !nested_npt_enabled(svm) && is_pae_paging(vcpu))
+ !nested_npt_enabled(to_svm(vcpu)) && is_pae_paging(vcpu))
/*
* Reload the guest's PDPTRs since after a migration
* the guest CR3 might be restored prior to setting the nested
@@ -1798,7 +1898,7 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
return false;
- if (!nested_svm_vmrun_msrpm(svm)) {
+ if (!nested_svm_merge_msrpm(vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror =
KVM_INTERNAL_ERROR_EMULATION;
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 1aa0f07d3a63..2fbdebf79fbb 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -117,6 +117,7 @@ static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid)
*/
down_write(&sev_deactivate_lock);
+ /* SNP firmware requires use of WBINVD for ASID recycling. */
wbinvd_on_all_cpus();
if (sev_snp_enabled)
@@ -446,7 +447,12 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
init_args.probe = false;
ret = sev_platform_init(&init_args);
if (ret)
- goto e_free;
+ goto e_free_asid;
+
+ if (!zalloc_cpumask_var(&sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
+ ret = -ENOMEM;
+ goto e_free_asid;
+ }
/* This needs to happen after SEV/SNP firmware initialization. */
if (vm_type == KVM_X86_SNP_VM) {
@@ -464,6 +470,8 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
return 0;
e_free:
+ free_cpumask_var(sev->have_run_cpus);
+e_free_asid:
argp->error = init_args.error;
sev_asid_free(sev);
sev->asid = 0;
@@ -561,6 +569,8 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
return -EFAULT;
+ sev->policy = params.policy;
+
memset(&start, 0, sizeof(start));
dh_blob = NULL;
@@ -706,6 +716,33 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages)
}
}
+static void sev_writeback_caches(struct kvm *kvm)
+{
+ /*
+ * Note, the caller is responsible for ensuring correctness if the mask
+ * can be modified, e.g. if a CPU could be doing VMRUN.
+ */
+ if (cpumask_empty(to_kvm_sev_info(kvm)->have_run_cpus))
+ return;
+
+ /*
+ * Ensure that all dirty guest tagged cache entries are written back
+ * before releasing the pages back to the system for use. CLFLUSH will
+ * not do this without SME_COHERENT, and flushing many cache lines
+ * individually is slower than blasting WBINVD for large VMs, so issue
+ * WBNOINVD (or WBINVD if the "no invalidate" variant is unsupported)
+ * on CPUs that have done VMRUN, i.e. may have dirtied data using the
+ * VM's ASID.
+ *
+ * For simplicity, never remove CPUs from the bitmap. Ideally, KVM
+ * would clear the mask when flushing caches, but doing so requires
+ * serializing multiple calls and having responding CPUs (to the IPI)
+ * mark themselves as still running if they are running (or about to
+ * run) a vCPU for the VM.
+ */
+ wbnoinvd_on_cpus_mask(to_kvm_sev_info(kvm)->have_run_cpus);
+}
+
static unsigned long get_num_contig_pages(unsigned long idx,
struct page **inpages, unsigned long npages)
{
@@ -1593,11 +1630,11 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
/* allocate memory for header and transport buffer */
ret = -ENOMEM;
- hdr = kzalloc(params.hdr_len, GFP_KERNEL_ACCOUNT);
+ hdr = kzalloc(params.hdr_len, GFP_KERNEL);
if (!hdr)
goto e_unpin;
- trans_data = kzalloc(params.trans_len, GFP_KERNEL_ACCOUNT);
+ trans_data = kzalloc(params.trans_len, GFP_KERNEL);
if (!trans_data)
goto e_free_hdr;
@@ -1883,70 +1920,6 @@ static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
atomic_set_release(&src_sev->migration_in_progress, 0);
}
-/* vCPU mutex subclasses. */
-enum sev_migration_role {
- SEV_MIGRATION_SOURCE = 0,
- SEV_MIGRATION_TARGET,
- SEV_NR_MIGRATION_ROLES,
-};
-
-static int sev_lock_vcpus_for_migration(struct kvm *kvm,
- enum sev_migration_role role)
-{
- struct kvm_vcpu *vcpu;
- unsigned long i, j;
-
- kvm_for_each_vcpu(i, vcpu, kvm) {
- if (mutex_lock_killable_nested(&vcpu->mutex, role))
- goto out_unlock;
-
-#ifdef CONFIG_PROVE_LOCKING
- if (!i)
- /*
- * Reset the role to one that avoids colliding with
- * the role used for the first vcpu mutex.
- */
- role = SEV_NR_MIGRATION_ROLES;
- else
- mutex_release(&vcpu->mutex.dep_map, _THIS_IP_);
-#endif
- }
-
- return 0;
-
-out_unlock:
-
- kvm_for_each_vcpu(j, vcpu, kvm) {
- if (i == j)
- break;
-
-#ifdef CONFIG_PROVE_LOCKING
- if (j)
- mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_);
-#endif
-
- mutex_unlock(&vcpu->mutex);
- }
- return -EINTR;
-}
-
-static void sev_unlock_vcpus_for_migration(struct kvm *kvm)
-{
- struct kvm_vcpu *vcpu;
- unsigned long i;
- bool first = true;
-
- kvm_for_each_vcpu(i, vcpu, kvm) {
- if (first)
- first = false;
- else
- mutex_acquire(&vcpu->mutex.dep_map,
- SEV_NR_MIGRATION_ROLES, 0, _THIS_IP_);
-
- mutex_unlock(&vcpu->mutex);
- }
-}
-
static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm)
{
struct kvm_sev_info *dst = to_kvm_sev_info(dst_kvm);
@@ -2033,6 +2006,10 @@ static int sev_check_source_vcpus(struct kvm *dst, struct kvm *src)
struct kvm_vcpu *src_vcpu;
unsigned long i;
+ if (src->created_vcpus != atomic_read(&src->online_vcpus) ||
+ dst->created_vcpus != atomic_read(&dst->online_vcpus))
+ return -EBUSY;
+
if (!sev_es_guest(src))
return 0;
@@ -2084,10 +2061,10 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
charged = true;
}
- ret = sev_lock_vcpus_for_migration(kvm, SEV_MIGRATION_SOURCE);
+ ret = kvm_lock_all_vcpus(kvm);
if (ret)
goto out_dst_cgroup;
- ret = sev_lock_vcpus_for_migration(source_kvm, SEV_MIGRATION_TARGET);
+ ret = kvm_lock_all_vcpus(source_kvm);
if (ret)
goto out_dst_vcpu;
@@ -2095,15 +2072,26 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
if (ret)
goto out_source_vcpu;
+ /*
+ * Allocate a new have_run_cpus for the destination, i.e. don't copy
+ * the set of CPUs from the source. If a CPU was used to run a vCPU in
+ * the source VM but is never used for the destination VM, then the CPU
+ * can only have cached memory that was accessible to the source VM.
+ */
+ if (!zalloc_cpumask_var(&dst_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
+ ret = -ENOMEM;
+ goto out_source_vcpu;
+ }
+
sev_migrate_from(kvm, source_kvm);
kvm_vm_dead(source_kvm);
cg_cleanup_sev = src_sev;
ret = 0;
out_source_vcpu:
- sev_unlock_vcpus_for_migration(source_kvm);
+ kvm_unlock_all_vcpus(source_kvm);
out_dst_vcpu:
- sev_unlock_vcpus_for_migration(kvm);
+ kvm_unlock_all_vcpus(kvm);
out_dst_cgroup:
/* Operates on the source on success, on the destination on failure. */
if (charged)
@@ -2193,12 +2181,10 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
return -EINVAL;
/* Check for policy bits that must be set */
- if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO) ||
- !(params.policy & SNP_POLICY_MASK_SMT))
+ if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO))
return -EINVAL;
- if (params.policy & SNP_POLICY_MASK_SINGLE_SOCKET)
- return -EINVAL;
+ sev->policy = params.policy;
sev->snp_context = snp_context_create(kvm, argp);
if (!sev->snp_context)
@@ -2754,12 +2740,7 @@ int sev_mem_enc_unregister_region(struct kvm *kvm,
goto failed;
}
- /*
- * Ensure that all guest tagged cache entries are flushed before
- * releasing the pages back to the system for use. CLFLUSH will
- * not do this, so issue a WBINVD.
- */
- wbinvd_on_all_cpus();
+ sev_writeback_caches(kvm);
__unregister_enc_region_locked(kvm, region);
@@ -2801,13 +2782,18 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
goto e_unlock;
}
+ mirror_sev = to_kvm_sev_info(kvm);
+ if (!zalloc_cpumask_var(&mirror_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
+ ret = -ENOMEM;
+ goto e_unlock;
+ }
+
/*
* The mirror kvm holds an enc_context_owner ref so its asid can't
* disappear until we're done with it
*/
source_sev = to_kvm_sev_info(source_kvm);
kvm_get_kvm(source_kvm);
- mirror_sev = to_kvm_sev_info(kvm);
list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms);
/* Set enc_context_owner and copy its encryption context over */
@@ -2869,7 +2855,13 @@ void sev_vm_destroy(struct kvm *kvm)
WARN_ON(!list_empty(&sev->mirror_vms));
- /* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */
+ free_cpumask_var(sev->have_run_cpus);
+
+ /*
+ * If this is a mirror VM, remove it from the owner's list of a mirrors
+ * and skip ASID cleanup (the ASID is tied to the lifetime of the owner).
+ * Note, mirror VMs don't support registering encrypted regions.
+ */
if (is_mirroring_enc_context(kvm)) {
struct kvm *owner_kvm = sev->enc_context_owner;
@@ -2880,12 +2872,6 @@ void sev_vm_destroy(struct kvm *kvm)
return;
}
- /*
- * Ensure that all guest tagged cache entries are flushed before
- * releasing the pages back to the system for use. CLFLUSH will
- * not do this, so issue a WBINVD.
- */
- wbinvd_on_all_cpus();
/*
* if userspace was terminated before unregistering the memory regions
@@ -2931,6 +2917,33 @@ void __init sev_set_cpu_caps(void)
}
}
+static bool is_sev_snp_initialized(void)
+{
+ struct sev_user_data_snp_status *status;
+ struct sev_data_snp_addr buf;
+ bool initialized = false;
+ int ret, error = 0;
+
+ status = snp_alloc_firmware_page(GFP_KERNEL | __GFP_ZERO);
+ if (!status)
+ return false;
+
+ buf.address = __psp_pa(status);
+ ret = sev_do_cmd(SEV_CMD_SNP_PLATFORM_STATUS, &buf, &error);
+ if (ret) {
+ pr_err("SEV: SNP_PLATFORM_STATUS failed ret=%d, fw_error=%d (%#x)\n",
+ ret, error, error);
+ goto out;
+ }
+
+ initialized = !!status->state;
+
+out:
+ snp_free_firmware_page(status);
+
+ return initialized;
+}
+
void __init sev_hardware_setup(void)
{
unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count;
@@ -3035,6 +3048,14 @@ void __init sev_hardware_setup(void)
sev_snp_supported = sev_snp_enabled && cc_platform_has(CC_ATTR_HOST_SEV_SNP);
out:
+ if (sev_enabled) {
+ init_args.probe = true;
+ if (sev_platform_init(&init_args))
+ sev_supported = sev_es_supported = sev_snp_supported = false;
+ else if (sev_snp_supported)
+ sev_snp_supported = is_sev_snp_initialized();
+ }
+
if (boot_cpu_has(X86_FEATURE_SEV))
pr_info("SEV %s (ASIDs %u - %u)\n",
sev_supported ? min_sev_asid <= max_sev_asid ? "enabled" :
@@ -3061,15 +3082,6 @@ out:
sev_supported_vmsa_features = 0;
if (sev_es_debug_swap_enabled)
sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP;
-
- if (!sev_enabled)
- return;
-
- /*
- * Do both SNP and SEV initialization at KVM module load.
- */
- init_args.probe = true;
- sev_platform_init(&init_args);
}
void sev_hardware_unsetup(void)
@@ -3129,30 +3141,29 @@ static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va)
/*
* VM Page Flush takes a host virtual address and a guest ASID. Fall
- * back to WBINVD if this faults so as not to make any problems worse
- * by leaving stale encrypted data in the cache.
+ * back to full writeback of caches if this faults so as not to make
+ * any problems worse by leaving stale encrypted data in the cache.
*/
if (WARN_ON_ONCE(wrmsrq_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid)))
- goto do_wbinvd;
+ goto do_sev_writeback_caches;
return;
-do_wbinvd:
- wbinvd_on_all_cpus();
+do_sev_writeback_caches:
+ sev_writeback_caches(vcpu->kvm);
}
void sev_guest_memory_reclaimed(struct kvm *kvm)
{
/*
* With SNP+gmem, private/encrypted memory is unreachable via the
- * hva-based mmu notifiers, so these events are only actually
- * pertaining to shared pages where there is no need to perform
- * the WBINVD to flush associated caches.
+ * hva-based mmu notifiers, i.e. these events are explicitly scoped to
+ * shared pages, where there's no need to flush caches.
*/
if (!sev_guest(kvm) || sev_snp_guest(kvm))
return;
- wbinvd_on_all_cpus();
+ sev_writeback_caches(kvm);
}
void sev_free_vcpu(struct kvm_vcpu *vcpu)
@@ -3484,6 +3495,15 @@ int pre_sev_run(struct vcpu_svm *svm, int cpu)
if (sev_es_guest(kvm) && !VALID_PAGE(svm->vmcb->control.vmsa_pa))
return -EINVAL;
+ /*
+ * To optimize cache flushes when memory is reclaimed from an SEV VM,
+ * track physical CPUs that enter the guest for SEV VMs and thus can
+ * have encrypted, dirty data in the cache, and flush caches only for
+ * CPUs that have entered the guest.
+ */
+ if (!cpumask_test_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus))
+ cpumask_set_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus);
+
/* Assign the asid allocated with this SEV guest */
svm->asid = asid;
@@ -3916,9 +3936,9 @@ void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu)
* From this point forward, the VMSA will always be a guest-mapped page
* rather than the initial one allocated by KVM in svm->sev_es.vmsa. In
* theory, svm->sev_es.vmsa could be free'd and cleaned up here, but
- * that involves cleanups like wbinvd_on_all_cpus() which would ideally
- * be handled during teardown rather than guest boot. Deferring that
- * also allows the existing logic for SEV-ES VMSAs to be re-used with
+ * that involves cleanups like flushing caches, which would ideally be
+ * handled during teardown rather than guest boot. Deferring that also
+ * allows the existing logic for SEV-ES VMSAs to be re-used with
* minimal SNP-specific changes.
*/
svm->sev_es.snp_has_guest_vmsa = true;
@@ -4007,10 +4027,8 @@ static int sev_snp_ap_creation(struct vcpu_svm *svm)
* Unless Creation is deferred until INIT, signal the vCPU to update
* its state.
*/
- if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT) {
- kvm_make_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu);
- kvm_vcpu_kick(target_vcpu);
- }
+ if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT)
+ kvm_make_request_and_kick(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu);
return 0;
}
@@ -4422,16 +4440,17 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in)
count, in);
}
-static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm)
+void sev_es_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *vcpu = &svm->vcpu;
+ /* Clear intercepts on MSRs that are context switched by hardware. */
+ svm_disable_intercept_for_msr(vcpu, MSR_AMD64_SEV_ES_GHCB, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_EFER, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_IA32_CR_PAT, MSR_TYPE_RW);
- if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) {
- bool v_tsc_aux = guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) ||
- guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID);
-
- set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, v_tsc_aux, v_tsc_aux);
- }
+ if (boot_cpu_has(X86_FEATURE_V_TSC_AUX))
+ svm_set_intercept_for_msr(vcpu, MSR_TSC_AUX, MSR_TYPE_RW,
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID));
/*
* For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if
@@ -4445,11 +4464,9 @@ static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm)
* XSAVES being exposed to the guest so that KVM can at least honor
* guest CPUID for RDMSR and WRMSR.
*/
- if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) &&
- guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 1, 1);
- else
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 0, 0);
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_XSS, MSR_TYPE_RW,
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) ||
+ !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES));
}
void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm)
@@ -4461,15 +4478,12 @@ void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm)
best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
if (best)
vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
-
- if (sev_es_guest(svm->vcpu.kvm))
- sev_es_vcpu_after_set_cpuid(svm);
}
static void sev_es_init_vmcb(struct vcpu_svm *svm)
{
+ struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm);
struct vmcb *vmcb = svm->vmcb01.ptr;
- struct kvm_vcpu *vcpu = &svm->vcpu;
svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE;
@@ -4480,8 +4494,16 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm)
* the VMSA will be NULL if this vCPU is the destination for intrahost
* migration, and will be copied later.
*/
- if (svm->sev_es.vmsa && !svm->sev_es.snp_has_guest_vmsa)
- svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa);
+ if (!svm->sev_es.snp_has_guest_vmsa) {
+ if (svm->sev_es.vmsa)
+ svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa);
+ else
+ svm->vmcb->control.vmsa_pa = INVALID_PAGE;
+ }
+
+ if (cpu_feature_enabled(X86_FEATURE_ALLOWED_SEV_FEATURES))
+ svm->vmcb->control.allowed_sev_features = sev->vmsa_features |
+ VMCB_ALLOWED_SEV_FEATURES_VALID;
/* Can't intercept CR register access, HV can't modify CR registers */
svm_clr_intercept(svm, INTERCEPT_CR0_READ);
@@ -4519,10 +4541,6 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm)
/* Can't intercept XSETBV, HV can't modify XCR0 directly */
svm_clr_intercept(svm, INTERCEPT_XSETBV);
-
- /* Clear intercepts on selected MSRs */
- set_msr_interception(vcpu, svm->msrpm, MSR_EFER, 1, 1);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_CR_PAT, 1, 1);
}
void sev_init_vmcb(struct vcpu_svm *svm)
@@ -4911,7 +4929,7 @@ void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
/*
* SEV-ES avoids host/guest cache coherency issues through
- * WBINVD hooks issued via MMU notifiers during run-time, and
+ * WBNOINVD hooks issued via MMU notifiers during run-time, and
* KVM's VM destroy path at shutdown. Those MMU notifier events
* don't cover gmem since there is no requirement to map pages
* to a HVA in order to use them for a running guest. While the
@@ -4943,3 +4961,97 @@ int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
return level;
}
+
+struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_save_area *vmsa;
+ struct kvm_sev_info *sev;
+ int error = 0;
+ int ret;
+
+ if (!sev_es_guest(vcpu->kvm))
+ return NULL;
+
+ /*
+ * If the VMSA has not yet been encrypted, return a pointer to the
+ * current un-encrypted VMSA.
+ */
+ if (!vcpu->arch.guest_state_protected)
+ return (struct vmcb_save_area *)svm->sev_es.vmsa;
+
+ sev = to_kvm_sev_info(vcpu->kvm);
+
+ /* Check if the SEV policy allows debugging */
+ if (sev_snp_guest(vcpu->kvm)) {
+ if (!(sev->policy & SNP_POLICY_DEBUG))
+ return NULL;
+ } else {
+ if (sev->policy & SEV_POLICY_NODBG)
+ return NULL;
+ }
+
+ if (sev_snp_guest(vcpu->kvm)) {
+ struct sev_data_snp_dbg dbg = {0};
+
+ vmsa = snp_alloc_firmware_page(__GFP_ZERO);
+ if (!vmsa)
+ return NULL;
+
+ dbg.gctx_paddr = __psp_pa(sev->snp_context);
+ dbg.src_addr = svm->vmcb->control.vmsa_pa;
+ dbg.dst_addr = __psp_pa(vmsa);
+
+ ret = sev_do_cmd(SEV_CMD_SNP_DBG_DECRYPT, &dbg, &error);
+
+ /*
+ * Return the target page to a hypervisor page no matter what.
+ * If this fails, the page can't be used, so leak it and don't
+ * try to use it.
+ */
+ if (snp_page_reclaim(vcpu->kvm, PHYS_PFN(__pa(vmsa))))
+ return NULL;
+
+ if (ret) {
+ pr_err("SEV: SNP_DBG_DECRYPT failed ret=%d, fw_error=%d (%#x)\n",
+ ret, error, error);
+ free_page((unsigned long)vmsa);
+
+ return NULL;
+ }
+ } else {
+ struct sev_data_dbg dbg = {0};
+ struct page *vmsa_page;
+
+ vmsa_page = alloc_page(GFP_KERNEL);
+ if (!vmsa_page)
+ return NULL;
+
+ vmsa = page_address(vmsa_page);
+
+ dbg.handle = sev->handle;
+ dbg.src_addr = svm->vmcb->control.vmsa_pa;
+ dbg.dst_addr = __psp_pa(vmsa);
+ dbg.len = PAGE_SIZE;
+
+ ret = sev_do_cmd(SEV_CMD_DBG_DECRYPT, &dbg, &error);
+ if (ret) {
+ pr_err("SEV: SEV_CMD_DBG_DECRYPT failed ret=%d, fw_error=%d (0x%x)\n",
+ ret, error, error);
+ __free_page(vmsa_page);
+
+ return NULL;
+ }
+ }
+
+ return vmsa;
+}
+
+void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa)
+{
+ /* If the VMSA has not yet been encrypted, nothing was allocated */
+ if (!vcpu->arch.guest_state_protected || !vmsa)
+ return;
+
+ free_page((unsigned long)vmsa);
+}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 67fee545d42a..d9931c6c4bc6 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -29,6 +29,7 @@
#include <linux/cc_platform.h>
#include <linux/smp.h>
#include <linux/string_choices.h>
+#include <linux/mutex.h>
#include <asm/apic.h>
#include <asm/msr.h>
@@ -71,8 +72,6 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
static bool erratum_383_found __read_mostly;
-u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
-
/*
* Set osvw_len to higher value when updated Revision Guides
* are published and we know what the new status bits are
@@ -81,72 +80,6 @@ static uint64_t osvw_len = 4, osvw_status;
static DEFINE_PER_CPU(u64, current_tsc_ratio);
-#define X2APIC_MSR(x) (APIC_BASE_MSR + (x >> 4))
-
-static const struct svm_direct_access_msrs {
- u32 index; /* Index of the MSR */
- bool always; /* True if intercept is initially cleared */
-} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
- { .index = MSR_STAR, .always = true },
- { .index = MSR_IA32_SYSENTER_CS, .always = true },
- { .index = MSR_IA32_SYSENTER_EIP, .always = false },
- { .index = MSR_IA32_SYSENTER_ESP, .always = false },
-#ifdef CONFIG_X86_64
- { .index = MSR_GS_BASE, .always = true },
- { .index = MSR_FS_BASE, .always = true },
- { .index = MSR_KERNEL_GS_BASE, .always = true },
- { .index = MSR_LSTAR, .always = true },
- { .index = MSR_CSTAR, .always = true },
- { .index = MSR_SYSCALL_MASK, .always = true },
-#endif
- { .index = MSR_IA32_SPEC_CTRL, .always = false },
- { .index = MSR_IA32_PRED_CMD, .always = false },
- { .index = MSR_IA32_FLUSH_CMD, .always = false },
- { .index = MSR_IA32_DEBUGCTLMSR, .always = false },
- { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
- { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
- { .index = MSR_IA32_LASTINTFROMIP, .always = false },
- { .index = MSR_IA32_LASTINTTOIP, .always = false },
- { .index = MSR_IA32_XSS, .always = false },
- { .index = MSR_EFER, .always = false },
- { .index = MSR_IA32_CR_PAT, .always = false },
- { .index = MSR_AMD64_SEV_ES_GHCB, .always = true },
- { .index = MSR_TSC_AUX, .always = false },
- { .index = X2APIC_MSR(APIC_ID), .always = false },
- { .index = X2APIC_MSR(APIC_LVR), .always = false },
- { .index = X2APIC_MSR(APIC_TASKPRI), .always = false },
- { .index = X2APIC_MSR(APIC_ARBPRI), .always = false },
- { .index = X2APIC_MSR(APIC_PROCPRI), .always = false },
- { .index = X2APIC_MSR(APIC_EOI), .always = false },
- { .index = X2APIC_MSR(APIC_RRR), .always = false },
- { .index = X2APIC_MSR(APIC_LDR), .always = false },
- { .index = X2APIC_MSR(APIC_DFR), .always = false },
- { .index = X2APIC_MSR(APIC_SPIV), .always = false },
- { .index = X2APIC_MSR(APIC_ISR), .always = false },
- { .index = X2APIC_MSR(APIC_TMR), .always = false },
- { .index = X2APIC_MSR(APIC_IRR), .always = false },
- { .index = X2APIC_MSR(APIC_ESR), .always = false },
- { .index = X2APIC_MSR(APIC_ICR), .always = false },
- { .index = X2APIC_MSR(APIC_ICR2), .always = false },
-
- /*
- * Note:
- * AMD does not virtualize APIC TSC-deadline timer mode, but it is
- * emulated by KVM. When setting APIC LVTT (0x832) register bit 18,
- * the AVIC hardware would generate GP fault. Therefore, always
- * intercept the MSR 0x832, and do not setup direct_access_msr.
- */
- { .index = X2APIC_MSR(APIC_LVTTHMR), .always = false },
- { .index = X2APIC_MSR(APIC_LVTPC), .always = false },
- { .index = X2APIC_MSR(APIC_LVT0), .always = false },
- { .index = X2APIC_MSR(APIC_LVT1), .always = false },
- { .index = X2APIC_MSR(APIC_LVTERR), .always = false },
- { .index = X2APIC_MSR(APIC_TMICT), .always = false },
- { .index = X2APIC_MSR(APIC_TMCCT), .always = false },
- { .index = X2APIC_MSR(APIC_TDCR), .always = false },
- { .index = MSR_INVALID, .always = false },
-};
-
/*
* These 2 parameters are used to config the controls for Pause-Loop Exiting:
* pause_filter_count: On processors that support Pause filtering(indicated
@@ -231,6 +164,9 @@ module_param(tsc_scaling, int, 0444);
*/
static bool avic;
module_param(avic, bool, 0444);
+module_param(enable_ipiv, bool, 0444);
+
+module_param(enable_device_posted_irqs, bool, 0444);
bool __read_mostly dump_invalid_vmcb;
module_param(dump_invalid_vmcb, bool, 0644);
@@ -250,6 +186,8 @@ static unsigned long iopm_base;
DEFINE_PER_CPU(struct svm_cpu_data, svm_data);
+static DEFINE_MUTEX(vmcb_dump_mutex);
+
/*
* Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via
* the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
@@ -259,33 +197,6 @@ DEFINE_PER_CPU(struct svm_cpu_data, svm_data);
*/
static int tsc_aux_uret_slot __read_mostly = -1;
-static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
-
-#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
-#define MSRS_RANGE_SIZE 2048
-#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
-
-u32 svm_msrpm_offset(u32 msr)
-{
- u32 offset;
- int i;
-
- for (i = 0; i < NUM_MSR_MAPS; i++) {
- if (msr < msrpm_ranges[i] ||
- msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
- continue;
-
- offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
- offset += (i * MSRS_RANGE_SIZE); /* add range offset */
-
- /* Now we have the u8 offset - but need the u32 offset */
- return offset / 4;
- }
-
- /* MSR not in any range */
- return MSR_INVALID;
-}
-
static int get_npt_level(void)
{
#ifdef CONFIG_X86_64
@@ -752,50 +663,8 @@ static void clr_dr_intercepts(struct vcpu_svm *svm)
recalc_intercepts(svm);
}
-static int direct_access_msr_slot(u32 msr)
-{
- u32 i;
-
- for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
- if (direct_access_msrs[i].index == msr)
- return i;
-
- return -ENOENT;
-}
-
-static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read,
- int write)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- int slot = direct_access_msr_slot(msr);
-
- if (slot == -ENOENT)
- return;
-
- /* Set the shadow bitmaps to the desired intercept states */
- if (read)
- set_bit(slot, svm->shadow_msr_intercept.read);
- else
- clear_bit(slot, svm->shadow_msr_intercept.read);
-
- if (write)
- set_bit(slot, svm->shadow_msr_intercept.write);
- else
- clear_bit(slot, svm->shadow_msr_intercept.write);
-}
-
-static bool valid_msr_intercept(u32 index)
-{
- return direct_access_msr_slot(index) != -ENOENT;
-}
-
static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
{
- u8 bit_write;
- unsigned long tmp;
- u32 offset;
- u32 *msrpm;
-
/*
* For non-nested case:
* If the L01 MSR bitmap does not intercept the MSR, then we need to
@@ -805,90 +674,102 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
* If the L02 MSR bitmap does not intercept the MSR, then we need to
* save it.
*/
- msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
- to_svm(vcpu)->msrpm;
-
- offset = svm_msrpm_offset(msr);
- bit_write = 2 * (msr & 0x0f) + 1;
- tmp = msrpm[offset];
+ void *msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm :
+ to_svm(vcpu)->msrpm;
- BUG_ON(offset == MSR_INVALID);
-
- return test_bit(bit_write, &tmp);
+ return svm_test_msr_bitmap_write(msrpm, msr);
}
-static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
- u32 msr, int read, int write)
+void svm_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set)
{
struct vcpu_svm *svm = to_svm(vcpu);
- u8 bit_read, bit_write;
- unsigned long tmp;
- u32 offset;
-
- /*
- * If this warning triggers extend the direct_access_msrs list at the
- * beginning of the file
- */
- WARN_ON(!valid_msr_intercept(msr));
-
- /* Enforce non allowed MSRs to trap */
- if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
- read = 0;
+ void *msrpm = svm->msrpm;
- if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
- write = 0;
-
- offset = svm_msrpm_offset(msr);
- bit_read = 2 * (msr & 0x0f);
- bit_write = 2 * (msr & 0x0f) + 1;
- tmp = msrpm[offset];
-
- BUG_ON(offset == MSR_INVALID);
-
- read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp);
- write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
+ /* Don't disable interception for MSRs userspace wants to handle. */
+ if (type & MSR_TYPE_R) {
+ if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
+ svm_clear_msr_bitmap_read(msrpm, msr);
+ else
+ svm_set_msr_bitmap_read(msrpm, msr);
+ }
- msrpm[offset] = tmp;
+ if (type & MSR_TYPE_W) {
+ if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
+ svm_clear_msr_bitmap_write(msrpm, msr);
+ else
+ svm_set_msr_bitmap_write(msrpm, msr);
+ }
svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
svm->nested.force_msr_bitmap_recalc = true;
}
-void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
- int read, int write)
-{
- set_shadow_msr_intercept(vcpu, msr, read, write);
- set_msr_interception_bitmap(vcpu, msrpm, msr, read, write);
-}
-
-u32 *svm_vcpu_alloc_msrpm(void)
+void *svm_alloc_permissions_map(unsigned long size, gfp_t gfp_mask)
{
- unsigned int order = get_order(MSRPM_SIZE);
- struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order);
- u32 *msrpm;
+ unsigned int order = get_order(size);
+ struct page *pages = alloc_pages(gfp_mask, order);
+ void *pm;
if (!pages)
return NULL;
- msrpm = page_address(pages);
- memset(msrpm, 0xff, PAGE_SIZE * (1 << order));
+ /*
+ * Set all bits in the permissions map so that all MSR and I/O accesses
+ * are intercepted by default.
+ */
+ pm = page_address(pages);
+ memset(pm, 0xff, PAGE_SIZE * (1 << order));
- return msrpm;
+ return pm;
}
-void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
+static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu)
{
- int i;
+ bool intercept = !(to_svm(vcpu)->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK);
- for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
- if (!direct_access_msrs[i].always)
- continue;
- set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1);
- }
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHFROMIP, MSR_TYPE_RW, intercept);
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHTOIP, MSR_TYPE_RW, intercept);
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTFROMIP, MSR_TYPE_RW, intercept);
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTTOIP, MSR_TYPE_RW, intercept);
+
+ if (sev_es_guest(vcpu->kvm))
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_DEBUGCTLMSR, MSR_TYPE_RW, intercept);
}
void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept)
{
+ static const u32 x2avic_passthrough_msrs[] = {
+ X2APIC_MSR(APIC_ID),
+ X2APIC_MSR(APIC_LVR),
+ X2APIC_MSR(APIC_TASKPRI),
+ X2APIC_MSR(APIC_ARBPRI),
+ X2APIC_MSR(APIC_PROCPRI),
+ X2APIC_MSR(APIC_EOI),
+ X2APIC_MSR(APIC_RRR),
+ X2APIC_MSR(APIC_LDR),
+ X2APIC_MSR(APIC_DFR),
+ X2APIC_MSR(APIC_SPIV),
+ X2APIC_MSR(APIC_ISR),
+ X2APIC_MSR(APIC_TMR),
+ X2APIC_MSR(APIC_IRR),
+ X2APIC_MSR(APIC_ESR),
+ X2APIC_MSR(APIC_ICR),
+ X2APIC_MSR(APIC_ICR2),
+
+ /*
+ * Note! Always intercept LVTT, as TSC-deadline timer mode
+ * isn't virtualized by hardware, and the CPU will generate a
+ * #GP instead of a #VMEXIT.
+ */
+ X2APIC_MSR(APIC_LVTTHMR),
+ X2APIC_MSR(APIC_LVTPC),
+ X2APIC_MSR(APIC_LVT0),
+ X2APIC_MSR(APIC_LVT1),
+ X2APIC_MSR(APIC_LVTERR),
+ X2APIC_MSR(APIC_TMICT),
+ X2APIC_MSR(APIC_TMCCT),
+ X2APIC_MSR(APIC_TDCR),
+ };
int i;
if (intercept == svm->x2avic_msrs_intercepted)
@@ -897,84 +778,79 @@ void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept)
if (!x2avic_enabled)
return;
- for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) {
- int index = direct_access_msrs[i].index;
-
- if ((index < APIC_BASE_MSR) ||
- (index > APIC_BASE_MSR + 0xff))
- continue;
- set_msr_interception(&svm->vcpu, svm->msrpm, index,
- !intercept, !intercept);
- }
+ for (i = 0; i < ARRAY_SIZE(x2avic_passthrough_msrs); i++)
+ svm_set_intercept_for_msr(&svm->vcpu, x2avic_passthrough_msrs[i],
+ MSR_TYPE_RW, intercept);
svm->x2avic_msrs_intercepted = intercept;
}
-void svm_vcpu_free_msrpm(u32 *msrpm)
+void svm_vcpu_free_msrpm(void *msrpm)
{
__free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
}
-static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
+static void svm_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- u32 i;
- /*
- * Set intercept permissions for all direct access MSRs again. They
- * will automatically get filtered through the MSR filter, so we are
- * back in sync after this.
- */
- for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
- u32 msr = direct_access_msrs[i].index;
- u32 read = test_bit(i, svm->shadow_msr_intercept.read);
- u32 write = test_bit(i, svm->shadow_msr_intercept.write);
+ svm_disable_intercept_for_msr(vcpu, MSR_STAR, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
- set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write);
- }
-}
-
-static void add_msr_offset(u32 offset)
-{
- int i;
-
- for (i = 0; i < MSRPM_OFFSETS; ++i) {
-
- /* Offset already in list? */
- if (msrpm_offsets[i] == offset)
- return;
+#ifdef CONFIG_X86_64
+ svm_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_LSTAR, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_CSTAR, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_SYSCALL_MASK, MSR_TYPE_RW);
+#endif
- /* Slot used by another offset? */
- if (msrpm_offsets[i] != MSR_INVALID)
- continue;
+ if (lbrv)
+ svm_recalc_lbr_msr_intercepts(vcpu);
- /* Add offset to list */
- msrpm_offsets[i] = offset;
+ if (cpu_feature_enabled(X86_FEATURE_IBPB))
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
+ !guest_has_pred_cmd_msr(vcpu));
- return;
- }
+ if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D))
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
/*
- * If this BUG triggers the msrpm_offsets table has an overflow. Just
- * increase MSRPM_OFFSETS in this case.
+ * Disable interception of SPEC_CTRL if KVM doesn't need to manually
+ * context switch the MSR (SPEC_CTRL is virtualized by the CPU), or if
+ * the guest has a non-zero SPEC_CTRL value, i.e. is likely actively
+ * using SPEC_CTRL.
*/
- BUG();
-}
-
-static void init_msrpm_offsets(void)
-{
- int i;
-
- memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
+ if (cpu_feature_enabled(X86_FEATURE_V_SPEC_CTRL))
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
+ !guest_has_spec_ctrl_msr(vcpu));
+ else
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
+ !svm->spec_ctrl);
- for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
- u32 offset;
+ /*
+ * Intercept SYSENTER_EIP and SYSENTER_ESP when emulating an Intel CPU,
+ * as AMD hardware only store 32 bits, whereas Intel CPUs track 64 bits.
+ */
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW,
+ guest_cpuid_is_intel_compatible(vcpu));
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW,
+ guest_cpuid_is_intel_compatible(vcpu));
+
+ if (kvm_aperfmperf_in_guest(vcpu->kvm)) {
+ svm_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R);
+ svm_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R);
+ }
- offset = svm_msrpm_offset(direct_access_msrs[i].index);
- BUG_ON(offset == MSR_INVALID);
+ if (sev_es_guest(vcpu->kvm))
+ sev_es_recalc_msr_intercepts(vcpu);
- add_msr_offset(offset);
- }
+ /*
+ * x2APIC intercepts are modified on-demand and cannot be filtered by
+ * userspace.
+ */
}
void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
@@ -993,13 +869,7 @@ void svm_enable_lbrv(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
-
- if (sev_es_guest(vcpu->kvm))
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_DEBUGCTLMSR, 1, 1);
+ svm_recalc_lbr_msr_intercepts(vcpu);
/* Move the LBR msrs to the vmcb02 so that the guest can see them. */
if (is_guest_mode(vcpu))
@@ -1011,12 +881,8 @@ static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm);
-
svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
+ svm_recalc_lbr_msr_intercepts(vcpu);
/*
* Move the LBR msrs back to the vmcb01 to avoid copying them
@@ -1171,9 +1037,10 @@ void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu)
}
/* Evaluate instruction intercepts that depend on guest CPUID features. */
-static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
- struct vcpu_svm *svm)
+static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
/*
* Intercept INVPCID if shadow paging is enabled to sync/free shadow
* roots, or if INVPCID is disabled in the guest to inject #UD.
@@ -1192,24 +1059,11 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
else
svm_set_intercept(svm, INTERCEPT_RDTSCP);
}
-}
-
-static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
if (guest_cpuid_is_intel_compatible(vcpu)) {
- /*
- * We must intercept SYSENTER_EIP and SYSENTER_ESP
- * accesses because the processor only stores 32 bits.
- * For the same reason we cannot use virtual VMLOAD/VMSAVE.
- */
svm_set_intercept(svm, INTERCEPT_VMLOAD);
svm_set_intercept(svm, INTERCEPT_VMSAVE);
svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
-
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
} else {
/*
* If hardware supports Virtual VMLOAD VMSAVE then enable it
@@ -1220,12 +1074,15 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
svm_clr_intercept(svm, INTERCEPT_VMSAVE);
svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
}
- /* No need to intercept these MSRs */
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
}
}
+static void svm_recalc_intercepts_after_set_cpuid(struct kvm_vcpu *vcpu)
+{
+ svm_recalc_instruction_intercepts(vcpu);
+ svm_recalc_msr_intercepts(vcpu);
+}
+
static void init_vmcb(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1348,15 +1205,6 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
svm_clr_intercept(svm, INTERCEPT_PAUSE);
}
- svm_recalc_instruction_intercepts(vcpu, svm);
-
- /*
- * If the host supports V_SPEC_CTRL then disable the interception
- * of MSR_IA32_SPEC_CTRL.
- */
- if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
-
if (kvm_vcpu_apicv_active(vcpu))
avic_init_vmcb(svm, vmcb);
@@ -1369,11 +1217,15 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
}
+ if (vcpu->kvm->arch.bus_lock_detection_enabled)
+ svm_set_intercept(svm, INTERCEPT_BUSLOCK);
+
if (sev_guest(vcpu->kvm))
sev_init_vmcb(svm);
svm_hv_init_vmcb(vmcb);
- init_vmcb_after_set_cpuid(vcpu);
+
+ svm_recalc_intercepts_after_set_cpuid(vcpu);
vmcb_mark_all_dirty(vmcb);
@@ -1384,8 +1236,6 @@ static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- svm_vcpu_init_msrpm(vcpu, svm->msrpm);
-
svm_init_osvw(vcpu);
if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
@@ -1478,24 +1328,11 @@ out:
return err;
}
-static void svm_clear_current_vmcb(struct vmcb *vmcb)
-{
- int i;
-
- for_each_online_cpu(i)
- cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL);
-}
-
static void svm_vcpu_free(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- /*
- * The vmcb page can be recycled, causing a false negative in
- * svm_vcpu_load(). So, ensure that no logical CPU has this
- * vmcb page recorded as its current vmcb.
- */
- svm_clear_current_vmcb(svm->vmcb);
+ WARN_ON_ONCE(!list_empty(&svm->ir_list));
svm_leave_nested(vcpu);
svm_free_nested(svm);
@@ -1503,7 +1340,7 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu)
sev_free_vcpu(vcpu);
__free_page(__sme_pa_to_page(svm->vmcb01.pa));
- __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
+ svm_vcpu_free_msrpm(svm->msrpm);
}
#ifdef CONFIG_CPU_MITIGATIONS
@@ -1610,19 +1447,9 @@ static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
- struct vcpu_svm *svm = to_svm(vcpu);
- struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
-
if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
shrink_ple_window(vcpu);
- if (sd->current_vmcb != svm->vmcb) {
- sd->current_vmcb = svm->vmcb;
-
- if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT) &&
- static_branch_likely(&switch_vcpu_ibpb))
- indirect_branch_prediction_barrier();
- }
if (kvm_vcpu_apicv_active(vcpu))
avic_vcpu_load(vcpu, cpu);
}
@@ -2897,12 +2724,11 @@ static int svm_get_feature_msr(u32 msr, u64 *data)
return 0;
}
-static bool
-sev_es_prevent_msr_access(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+static bool sev_es_prevent_msr_access(struct kvm_vcpu *vcpu,
+ struct msr_data *msr_info)
{
return sev_es_guest(vcpu->kvm) &&
vcpu->arch.guest_state_protected &&
- svm_msrpm_offset(msr_info->index) != MSR_INVALID &&
!msr_write_intercepted(vcpu, msr_info->index);
}
@@ -3133,11 +2959,11 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
*
* For nested:
* The handling of the MSR bitmap for L2 guests is done in
- * nested_svm_vmrun_msrpm.
+ * nested_svm_merge_msrpm().
* We update the L1 MSR bit as well since it will end up
* touching the MSR anyway now.
*/
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
+ svm_disable_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);
break;
case MSR_AMD64_VIRT_SPEC_CTRL:
if (!msr->host_initiated &&
@@ -3203,8 +3029,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
/*
* TSC_AUX is usually changed only during boot and never read
- * directly. Intercept TSC_AUX instead of exposing it to the
- * guest via direct_access_msrs, and switch it via user return.
+ * directly. Intercept TSC_AUX and switch it via user return.
*/
preempt_disable();
ret = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
@@ -3221,17 +3046,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
}
/*
- * AMD changed the architectural behavior of bits 5:2. On CPUs
- * without BusLockTrap, bits 5:2 control "external pins", but
- * on CPUs that support BusLockDetect, bit 2 enables BusLockTrap
- * and bits 5:3 are reserved-to-zero. Sadly, old KVM allowed
- * the guest to set bits 5:2 despite not actually virtualizing
- * Performance-Monitoring/Breakpoint external pins. Drop bits
- * 5:2 for backwards compatibility.
- */
- data &= ~GENMASK(5, 2);
-
- /*
* Suppress BTF as KVM doesn't virtualize BTF, but there's no
* way to communicate lack of support to the guest.
*/
@@ -3361,6 +3175,37 @@ static int invpcid_interception(struct kvm_vcpu *vcpu)
return kvm_handle_invpcid(vcpu, type, gva);
}
+static inline int complete_userspace_buslock(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * If userspace has NOT changed RIP, then KVM's ABI is to let the guest
+ * execute the bus-locking instruction. Set the bus lock counter to '1'
+ * to effectively step past the bus lock.
+ */
+ if (kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))
+ svm->vmcb->control.bus_lock_counter = 1;
+
+ return 1;
+}
+
+static int bus_lock_exit(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
+ vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
+
+ vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu);
+ vcpu->arch.complete_userspace_io = complete_userspace_buslock;
+
+ if (is_guest_mode(vcpu))
+ svm->nested.ctl.bus_lock_rip = vcpu->arch.cui_linear_rip;
+
+ return 0;
+}
+
static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[SVM_EXIT_READ_CR0] = cr_interception,
[SVM_EXIT_READ_CR3] = cr_interception,
@@ -3430,6 +3275,7 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[SVM_EXIT_INVPCID] = invpcid_interception,
[SVM_EXIT_IDLE_HLT] = kvm_emulate_halt,
[SVM_EXIT_NPF] = npf_interception,
+ [SVM_EXIT_BUS_LOCK] = bus_lock_exit,
[SVM_EXIT_RSM] = rsm_interception,
[SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
[SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception,
@@ -3444,14 +3290,21 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
struct vmcb_control_area *control = &svm->vmcb->control;
struct vmcb_save_area *save = &svm->vmcb->save;
struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;
+ char *vm_type;
if (!dump_invalid_vmcb) {
pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
return;
}
- pr_err("VMCB %p, last attempted VMRUN on CPU %d\n",
- svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
+ guard(mutex)(&vmcb_dump_mutex);
+
+ vm_type = sev_snp_guest(vcpu->kvm) ? "SEV-SNP" :
+ sev_es_guest(vcpu->kvm) ? "SEV-ES" :
+ sev_guest(vcpu->kvm) ? "SEV" : "SVM";
+
+ pr_err("%s vCPU%u VMCB %p, last attempted VMRUN on CPU %d\n",
+ vm_type, vcpu->vcpu_id, svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
pr_err("VMCB Control Area:\n");
pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
@@ -3489,6 +3342,17 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa);
+ pr_err("%-20s%016llx\n", "allowed_sev_features:", control->allowed_sev_features);
+ pr_err("%-20s%016llx\n", "guest_sev_features:", control->guest_sev_features);
+
+ if (sev_es_guest(vcpu->kvm)) {
+ save = sev_decrypt_vmsa(vcpu);
+ if (!save)
+ goto no_vmsa;
+
+ save01 = save;
+ }
+
pr_err("VMCB State Save Area:\n");
pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
"es:",
@@ -3559,6 +3423,63 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
pr_err("%-15s %016llx %-13s %016llx\n",
"excp_from:", save->last_excp_from,
"excp_to:", save->last_excp_to);
+
+ if (sev_es_guest(vcpu->kvm)) {
+ struct sev_es_save_area *vmsa = (struct sev_es_save_area *)save;
+
+ pr_err("%-15s %016llx\n",
+ "sev_features", vmsa->sev_features);
+
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "rax:", vmsa->rax, "rbx:", vmsa->rbx);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "rcx:", vmsa->rcx, "rdx:", vmsa->rdx);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "rsi:", vmsa->rsi, "rdi:", vmsa->rdi);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "rbp:", vmsa->rbp, "rsp:", vmsa->rsp);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "r8:", vmsa->r8, "r9:", vmsa->r9);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "r10:", vmsa->r10, "r11:", vmsa->r11);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "r12:", vmsa->r12, "r13:", vmsa->r13);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "r14:", vmsa->r14, "r15:", vmsa->r15);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "xcr0:", vmsa->xcr0, "xss:", vmsa->xss);
+ } else {
+ pr_err("%-15s %016llx %-13s %016lx\n",
+ "rax:", save->rax, "rbx:",
+ vcpu->arch.regs[VCPU_REGS_RBX]);
+ pr_err("%-15s %016lx %-13s %016lx\n",
+ "rcx:", vcpu->arch.regs[VCPU_REGS_RCX],
+ "rdx:", vcpu->arch.regs[VCPU_REGS_RDX]);
+ pr_err("%-15s %016lx %-13s %016lx\n",
+ "rsi:", vcpu->arch.regs[VCPU_REGS_RSI],
+ "rdi:", vcpu->arch.regs[VCPU_REGS_RDI]);
+ pr_err("%-15s %016lx %-13s %016llx\n",
+ "rbp:", vcpu->arch.regs[VCPU_REGS_RBP],
+ "rsp:", save->rsp);
+#ifdef CONFIG_X86_64
+ pr_err("%-15s %016lx %-13s %016lx\n",
+ "r8:", vcpu->arch.regs[VCPU_REGS_R8],
+ "r9:", vcpu->arch.regs[VCPU_REGS_R9]);
+ pr_err("%-15s %016lx %-13s %016lx\n",
+ "r10:", vcpu->arch.regs[VCPU_REGS_R10],
+ "r11:", vcpu->arch.regs[VCPU_REGS_R11]);
+ pr_err("%-15s %016lx %-13s %016lx\n",
+ "r12:", vcpu->arch.regs[VCPU_REGS_R12],
+ "r13:", vcpu->arch.regs[VCPU_REGS_R13]);
+ pr_err("%-15s %016lx %-13s %016lx\n",
+ "r14:", vcpu->arch.regs[VCPU_REGS_R14],
+ "r15:", vcpu->arch.regs[VCPU_REGS_R15]);
+#endif
+ }
+
+no_vmsa:
+ if (sev_es_guest(vcpu->kvm))
+ sev_free_decrypted_vmsa(vcpu, save);
}
static bool svm_check_exit_valid(u64 exit_code)
@@ -3595,6 +3516,10 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
return kvm_emulate_halt(vcpu);
else if (exit_code == SVM_EXIT_NPF)
return npf_interception(vcpu);
+#ifdef CONFIG_KVM_AMD_SEV
+ else if (exit_code == SVM_EXIT_VMGEXIT)
+ return sev_handle_vmgexit(vcpu);
+#endif
#endif
return svm_exit_handlers[exit_code](vcpu);
}
@@ -4306,9 +4231,9 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
guest_state_exit_irqoff();
}
-static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
- bool force_immediate_exit)
+static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
{
+ bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
struct vcpu_svm *svm = to_svm(vcpu);
bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
@@ -4355,10 +4280,13 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
svm_hv_update_vp_id(svm->vmcb, vcpu);
/*
- * Run with all-zero DR6 unless needed, so that we can get the exact cause
- * of a #DB.
+ * Run with all-zero DR6 unless the guest can write DR6 freely, so that
+ * KVM can get the exact cause of a #DB. Note, loading guest DR6 from
+ * KVM's snapshot is only necessary when DR accesses won't exit.
*/
- if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
+ if (unlikely(run_flags & KVM_RUN_LOAD_GUEST_DR6))
+ svm_set_dr6(vcpu, vcpu->arch.dr6);
+ else if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
svm_set_dr6(vcpu, DR6_ACTIVE_LOW);
clgi();
@@ -4538,20 +4466,10 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
if (guest_cpuid_is_intel_compatible(vcpu))
guest_cpu_cap_clear(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
- svm_recalc_instruction_intercepts(vcpu, svm);
-
- if (boot_cpu_has(X86_FEATURE_IBPB))
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0,
- !!guest_has_pred_cmd_msr(vcpu));
-
- if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0,
- !!guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
-
if (sev_guest(vcpu->kvm))
sev_vcpu_after_set_cpuid(svm);
- init_vmcb_after_set_cpuid(vcpu);
+ svm_recalc_intercepts_after_set_cpuid(vcpu);
}
static bool svm_has_wbinvd_exit(void)
@@ -5102,7 +5020,7 @@ static int svm_vm_init(struct kvm *kvm)
}
if (!pause_filter_count || !pause_filter_thresh)
- kvm->arch.pause_in_guest = true;
+ kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE);
if (enable_apicv) {
int ret = avic_vm_init(kvm);
@@ -5169,7 +5087,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.set_idt = svm_set_idt,
.get_gdt = svm_get_gdt,
.set_gdt = svm_set_gdt,
- .set_dr6 = svm_set_dr6,
.set_dr7 = svm_set_dr7,
.sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
.cache_reg = svm_cache_reg,
@@ -5254,7 +5171,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.apic_init_signal_blocked = svm_apic_init_signal_blocked,
- .msr_filter_changed = svm_msr_filter_changed,
+ .recalc_msr_intercepts = svm_recalc_msr_intercepts,
.complete_emulated_msr = svm_complete_emulated_msr,
.vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
@@ -5356,6 +5273,9 @@ static __init void svm_set_cpu_caps(void)
kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
}
+ if (cpu_feature_enabled(X86_FEATURE_BUS_LOCK_THRESHOLD))
+ kvm_caps.has_bus_lock_exit = true;
+
/* CPUID 0x80000008 */
if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
boot_cpu_has(X86_FEATURE_AMD_SSBD))
@@ -5387,11 +5307,8 @@ static __init void svm_set_cpu_caps(void)
static __init int svm_hardware_setup(void)
{
- int cpu;
- struct page *iopm_pages;
void *iopm_va;
- int r;
- unsigned int order = get_order(IOPM_SIZE);
+ int cpu, r;
/*
* NX is required for shadow paging and for NPT if the NX huge pages
@@ -5403,17 +5320,6 @@ static __init int svm_hardware_setup(void)
}
kvm_enable_efer_bits(EFER_NX);
- iopm_pages = alloc_pages(GFP_KERNEL, order);
-
- if (!iopm_pages)
- return -ENOMEM;
-
- iopm_va = page_address(iopm_pages);
- memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
- iopm_base = __sme_page_pa(iopm_pages);
-
- init_msrpm_offsets();
-
kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
XFEATURE_MASK_BNDCSR);
@@ -5447,6 +5353,10 @@ static __init int svm_hardware_setup(void)
if (nested) {
pr_info("Nested Virtualization enabled\n");
kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
+
+ r = nested_svm_init_msrpm_merge_offsets();
+ if (r)
+ return r;
}
/*
@@ -5478,6 +5388,13 @@ static __init int svm_hardware_setup(void)
else
pr_info("LBR virtualization supported\n");
}
+
+ iopm_va = svm_alloc_permissions_map(IOPM_SIZE, GFP_KERNEL);
+ if (!iopm_va)
+ return -ENOMEM;
+
+ iopm_base = __sme_set(__pa(iopm_va));
+
/*
* Note, SEV setup consumes npt_enabled and enable_mmio_caching (which
* may be modified by svm_adjust_mmio_mask()), as well as nrips.
@@ -5495,6 +5412,7 @@ static __init int svm_hardware_setup(void)
enable_apicv = avic = avic && avic_hardware_setup();
if (!enable_apicv) {
+ enable_ipiv = false;
svm_x86_ops.vcpu_blocking = NULL;
svm_x86_ops.vcpu_unblocking = NULL;
svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
@@ -5551,6 +5469,7 @@ static __init int svm_hardware_setup(void)
*/
allow_smaller_maxphyaddr = !npt_enabled;
+ kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_CD_NW_CLEARED;
return 0;
err:
@@ -5575,6 +5494,8 @@ static int __init svm_init(void)
{
int r;
+ KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_svm);
+
__unused_size_checks();
if (!kvm_is_svm_supported())
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index f16b068c4228..58b9d168e0c8 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -44,9 +44,6 @@ static inline struct page *__sme_pa_to_page(unsigned long pa)
#define IOPM_SIZE PAGE_SIZE * 3
#define MSRPM_SIZE PAGE_SIZE * 2
-#define MAX_DIRECT_ACCESS_MSRS 48
-#define MSRPM_OFFSETS 32
-extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
extern bool npt_enabled;
extern int nrips;
extern int vgif;
@@ -98,6 +95,7 @@ struct kvm_sev_info {
unsigned int asid; /* ASID used for this guest */
unsigned int handle; /* SEV firmware handle */
int fd; /* SEV device fd */
+ unsigned long policy;
unsigned long pages_locked; /* Number of pages locked */
struct list_head regions_list; /* List of registered regions */
u64 ap_jump_table; /* SEV-ES AP Jump Table address */
@@ -112,15 +110,19 @@ struct kvm_sev_info {
void *guest_req_buf; /* Bounce buffer for SNP Guest Request input */
void *guest_resp_buf; /* Bounce buffer for SNP Guest Request output */
struct mutex guest_req_mutex; /* Must acquire before using bounce buffers */
+ cpumask_var_t have_run_cpus; /* CPUs that have done VMRUN for this VM. */
};
+#define SEV_POLICY_NODBG BIT_ULL(0)
+#define SNP_POLICY_DEBUG BIT_ULL(19)
+
struct kvm_svm {
struct kvm kvm;
/* Struct members for AVIC */
u32 avic_vm_id;
- struct page *avic_logical_id_table_page;
- struct page *avic_physical_id_table_page;
+ u32 *avic_logical_id_table;
+ u64 *avic_physical_id_table;
struct hlist_node hnode;
struct kvm_sev_info sev_info;
@@ -169,6 +171,7 @@ struct vmcb_ctrl_area_cached {
u64 nested_cr3;
u64 virt_ext;
u32 clean;
+ u64 bus_lock_rip;
union {
#if IS_ENABLED(CONFIG_HYPERV) || IS_ENABLED(CONFIG_KVM_HYPERV)
struct hv_vmcb_enlightenments hv_enlightenments;
@@ -184,8 +187,11 @@ struct svm_nested_state {
u64 vmcb12_gpa;
u64 last_vmcb12_gpa;
- /* These are the merged vectors */
- u32 *msrpm;
+ /*
+ * The MSR permissions map used for vmcb02, which is the merge result
+ * of vmcb01 and vmcb12
+ */
+ void *msrpm;
/* A VMRUN has started but has not yet been performed, so
* we cannot inject a nested vmexit yet. */
@@ -266,7 +272,7 @@ struct vcpu_svm {
*/
u64 virt_spec_ctrl;
- u32 *msrpm;
+ void *msrpm;
ulong nmi_iret_rip;
@@ -301,24 +307,26 @@ struct vcpu_svm {
u32 ldr_reg;
u32 dfr_reg;
- struct page *avic_backing_page;
- u64 *avic_physical_id_cache;
+
+ /* This is essentially a shadow of the vCPU's actual entry in the
+ * Physical ID table that is programmed into the VMCB, i.e. that is
+ * seen by the CPU. If IPI virtualization is disabled, IsRunning is
+ * only ever set in the shadow, i.e. is never propagated to the "real"
+ * table, so that hardware never sees IsRunning=1.
+ */
+ u64 avic_physical_id_entry;
/*
- * Per-vcpu list of struct amd_svm_iommu_ir:
- * This is used mainly to store interrupt remapping information used
- * when update the vcpu affinity. This avoids the need to scan for
- * IRTE and try to match ga_tag in the IOMMU driver.
+ * Per-vCPU list of irqfds that are eligible to post IRQs directly to
+ * the vCPU (a.k.a. device posted IRQs, a.k.a. IRQ bypass). The list
+ * is used to reconfigure IRTEs when the vCPU is loaded/put (to set the
+ * target pCPU), when AVIC is toggled on/off (to (de)activate bypass),
+ * and if the irqfd becomes ineligible for posting (to put the IRTE
+ * back into remapped mode).
*/
struct list_head ir_list;
spinlock_t ir_list_lock;
- /* Save desired MSR intercept (read: pass-through) state */
- struct {
- DECLARE_BITMAP(read, MAX_DIRECT_ACCESS_MSRS);
- DECLARE_BITMAP(write, MAX_DIRECT_ACCESS_MSRS);
- } shadow_msr_intercept;
-
struct vcpu_sev_es_state sev_es;
bool guest_state_loaded;
@@ -340,8 +348,6 @@ struct svm_cpu_data {
struct vmcb *save_area;
unsigned long save_area_pa;
- struct vmcb *current_vmcb;
-
/* index = sev_asid, value = vmcb pointer */
struct vmcb **sev_vmcbs;
};
@@ -610,17 +616,74 @@ static inline void svm_vmgexit_no_action(struct vcpu_svm *svm, u64 data)
svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_NO_ACTION, data);
}
-/* svm.c */
-#define MSR_INVALID 0xffffffffU
+/*
+ * The MSRPM is 8KiB in size, divided into four 2KiB ranges (the fourth range
+ * is reserved). Each MSR within a range is covered by two bits, one each for
+ * read (bit 0) and write (bit 1), where a bit value of '1' means intercepted.
+ */
+#define SVM_MSRPM_BYTES_PER_RANGE 2048
+#define SVM_BITS_PER_MSR 2
+#define SVM_MSRS_PER_BYTE (BITS_PER_BYTE / SVM_BITS_PER_MSR)
+#define SVM_MSRS_PER_RANGE (SVM_MSRPM_BYTES_PER_RANGE * SVM_MSRS_PER_BYTE)
+static_assert(SVM_MSRS_PER_RANGE == 8192);
+#define SVM_MSRPM_OFFSET_MASK (SVM_MSRS_PER_RANGE - 1)
+
+static __always_inline int svm_msrpm_bit_nr(u32 msr)
+{
+ int range_nr;
+
+ switch (msr & ~SVM_MSRPM_OFFSET_MASK) {
+ case 0:
+ range_nr = 0;
+ break;
+ case 0xc0000000:
+ range_nr = 1;
+ break;
+ case 0xc0010000:
+ range_nr = 2;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return range_nr * SVM_MSRPM_BYTES_PER_RANGE * BITS_PER_BYTE +
+ (msr & SVM_MSRPM_OFFSET_MASK) * SVM_BITS_PER_MSR;
+}
+
+#define __BUILD_SVM_MSR_BITMAP_HELPER(rtype, action, bitop, access, bit_rw) \
+static inline rtype svm_##action##_msr_bitmap_##access(unsigned long *bitmap, \
+ u32 msr) \
+{ \
+ int bit_nr; \
+ \
+ bit_nr = svm_msrpm_bit_nr(msr); \
+ if (bit_nr < 0) \
+ return (rtype)true; \
+ \
+ return bitop##_bit(bit_nr + bit_rw, bitmap); \
+}
+
+#define BUILD_SVM_MSR_BITMAP_HELPERS(ret_type, action, bitop) \
+ __BUILD_SVM_MSR_BITMAP_HELPER(ret_type, action, bitop, read, 0) \
+ __BUILD_SVM_MSR_BITMAP_HELPER(ret_type, action, bitop, write, 1)
+
+BUILD_SVM_MSR_BITMAP_HELPERS(bool, test, test)
+BUILD_SVM_MSR_BITMAP_HELPERS(void, clear, __clear)
+BUILD_SVM_MSR_BITMAP_HELPERS(void, set, __set)
#define DEBUGCTL_RESERVED_BITS (~DEBUGCTLMSR_LBR)
+/* svm.c */
extern bool dump_invalid_vmcb;
-u32 svm_msrpm_offset(u32 msr);
-u32 *svm_vcpu_alloc_msrpm(void);
-void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm);
-void svm_vcpu_free_msrpm(u32 *msrpm);
+void *svm_alloc_permissions_map(unsigned long size, gfp_t gfp_mask);
+
+static inline void *svm_vcpu_alloc_msrpm(void)
+{
+ return svm_alloc_permissions_map(MSRPM_SIZE, GFP_KERNEL_ACCOUNT);
+}
+
+void svm_vcpu_free_msrpm(void *msrpm);
void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb);
void svm_enable_lbrv(struct kvm_vcpu *vcpu);
void svm_update_lbrv(struct kvm_vcpu *vcpu);
@@ -640,6 +703,20 @@ void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool disable);
void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
int trig_mode, int vec);
+void svm_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set);
+
+static inline void svm_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
+ u32 msr, int type)
+{
+ svm_set_intercept_for_msr(vcpu, msr, type, false);
+}
+
+static inline void svm_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
+ u32 msr, int type)
+{
+ svm_set_intercept_for_msr(vcpu, msr, type, true);
+}
+
/* nested.c */
#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
@@ -668,6 +745,8 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_NMI);
}
+int __init nested_svm_init_msrpm_merge_offsets(void);
+
int enter_svm_guest_mode(struct kvm_vcpu *vcpu,
u64 vmcb_gpa, struct vmcb *vmcb12, bool from_vmrun);
void svm_leave_nested(struct kvm_vcpu *vcpu);
@@ -718,7 +797,8 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \
BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) | \
- BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED) \
+ BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED) | \
+ BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG) \
)
bool avic_hardware_setup(void);
@@ -733,8 +813,9 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
void avic_vcpu_put(struct kvm_vcpu *vcpu);
void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu);
void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
-int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set);
+int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+ unsigned int host_irq, uint32_t guest_irq,
+ struct kvm_vcpu *vcpu, u32 vector);
void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
void avic_ring_doorbell(struct kvm_vcpu *vcpu);
@@ -749,6 +830,7 @@ void sev_init_vmcb(struct vcpu_svm *svm);
void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm);
int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
void sev_es_vcpu_reset(struct vcpu_svm *svm);
+void sev_es_recalc_msr_intercepts(struct kvm_vcpu *vcpu);
void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa);
void sev_es_unmap_ghcb(struct vcpu_svm *svm);
@@ -785,6 +867,8 @@ void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu);
int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order);
void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end);
int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn);
+struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu);
+void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa);
#else
static inline struct page *snp_safe_alloc_page_node(int node, gfp_t gfp)
{
@@ -816,6 +900,11 @@ static inline int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
return 0;
}
+static inline struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu)
+{
+ return NULL;
+}
+static inline void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa) {}
#endif
/* vmenter.S */
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index 0c61153b275f..235c4af6b692 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -169,6 +169,9 @@ SYM_FUNC_START(__svm_vcpu_run)
#endif
mov VCPU_RDI(%_ASM_DI), %_ASM_DI
+ /* Clobbers EFLAGS.ZF */
+ VM_CLEAR_CPU_BUFFERS
+
/* Enter guest mode */
3: vmrun %_ASM_AX
4:
@@ -335,6 +338,9 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
mov SVM_current_vmcb(%rdi), %rax
mov KVM_VMCB_pa(%rax), %rax
+ /* Clobbers EFLAGS.ZF */
+ VM_CLEAR_CPU_BUFFERS
+
/* Enter guest mode */
1: vmrun %rax
2:
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index ba736cbb0587..57d79fd31df0 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -260,6 +260,86 @@ TRACE_EVENT(kvm_cpuid,
__entry->used_max_basic ? ", used max basic" : "")
);
+#define kvm_deliver_mode \
+ {0x0, "Fixed"}, \
+ {0x1, "LowPrio"}, \
+ {0x2, "SMI"}, \
+ {0x3, "Res3"}, \
+ {0x4, "NMI"}, \
+ {0x5, "INIT"}, \
+ {0x6, "SIPI"}, \
+ {0x7, "ExtINT"}
+
+#ifdef CONFIG_KVM_IOAPIC
+TRACE_EVENT(kvm_ioapic_set_irq,
+ TP_PROTO(__u64 e, int pin, bool coalesced),
+ TP_ARGS(e, pin, coalesced),
+
+ TP_STRUCT__entry(
+ __field( __u64, e )
+ __field( int, pin )
+ __field( bool, coalesced )
+ ),
+
+ TP_fast_assign(
+ __entry->e = e;
+ __entry->pin = pin;
+ __entry->coalesced = coalesced;
+ ),
+
+ TP_printk("pin %u dst %x vec %u (%s|%s|%s%s)%s",
+ __entry->pin, (u8)(__entry->e >> 56), (u8)__entry->e,
+ __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
+ (__entry->e & (1<<11)) ? "logical" : "physical",
+ (__entry->e & (1<<15)) ? "level" : "edge",
+ (__entry->e & (1<<16)) ? "|masked" : "",
+ __entry->coalesced ? " (coalesced)" : "")
+);
+
+TRACE_EVENT(kvm_ioapic_delayed_eoi_inj,
+ TP_PROTO(__u64 e),
+ TP_ARGS(e),
+
+ TP_STRUCT__entry(
+ __field( __u64, e )
+ ),
+
+ TP_fast_assign(
+ __entry->e = e;
+ ),
+
+ TP_printk("dst %x vec %u (%s|%s|%s%s)",
+ (u8)(__entry->e >> 56), (u8)__entry->e,
+ __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
+ (__entry->e & (1<<11)) ? "logical" : "physical",
+ (__entry->e & (1<<15)) ? "level" : "edge",
+ (__entry->e & (1<<16)) ? "|masked" : "")
+);
+#endif
+
+TRACE_EVENT(kvm_msi_set_irq,
+ TP_PROTO(__u64 address, __u64 data),
+ TP_ARGS(address, data),
+
+ TP_STRUCT__entry(
+ __field( __u64, address )
+ __field( __u64, data )
+ ),
+
+ TP_fast_assign(
+ __entry->address = address;
+ __entry->data = data;
+ ),
+
+ TP_printk("dst %llx vec %u (%s|%s|%s%s)",
+ (u8)(__entry->address >> 12) | ((__entry->address >> 32) & 0xffffff00),
+ (u8)__entry->data,
+ __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode),
+ (__entry->address & (1<<2)) ? "logical" : "physical",
+ (__entry->data & (1<<15)) ? "level" : "edge",
+ (__entry->address & (1<<3)) ? "|rh" : "")
+);
+
#define AREG(x) { APIC_##x, "APIC_" #x }
#define kvm_trace_symbol_apic \
@@ -1096,37 +1176,32 @@ TRACE_EVENT(kvm_smm_transition,
* Tracepoint for VT-d posted-interrupts and AMD-Vi Guest Virtual APIC.
*/
TRACE_EVENT(kvm_pi_irte_update,
- TP_PROTO(unsigned int host_irq, unsigned int vcpu_id,
- unsigned int gsi, unsigned int gvec,
- u64 pi_desc_addr, bool set),
- TP_ARGS(host_irq, vcpu_id, gsi, gvec, pi_desc_addr, set),
+ TP_PROTO(unsigned int host_irq, struct kvm_vcpu *vcpu,
+ unsigned int gsi, unsigned int gvec, bool set),
+ TP_ARGS(host_irq, vcpu, gsi, gvec, set),
TP_STRUCT__entry(
__field( unsigned int, host_irq )
- __field( unsigned int, vcpu_id )
+ __field( int, vcpu_id )
__field( unsigned int, gsi )
__field( unsigned int, gvec )
- __field( u64, pi_desc_addr )
__field( bool, set )
),
TP_fast_assign(
__entry->host_irq = host_irq;
- __entry->vcpu_id = vcpu_id;
+ __entry->vcpu_id = vcpu ? vcpu->vcpu_id : -1;
__entry->gsi = gsi;
__entry->gvec = gvec;
- __entry->pi_desc_addr = pi_desc_addr;
__entry->set = set;
),
- TP_printk("PI is %s for irq %u, vcpu %u, gsi: 0x%x, "
- "gvec: 0x%x, pi_desc_addr: 0x%llx",
+ TP_printk("PI is %s for irq %u, vcpu %d, gsi: 0x%x, gvec: 0x%x",
__entry->set ? "enabled and being updated" : "disabled",
__entry->host_irq,
__entry->vcpu_id,
__entry->gsi,
- __entry->gvec,
- __entry->pi_desc_addr)
+ __entry->gvec)
);
/*
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index cb6588238f46..5316c27f6099 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -15,7 +15,6 @@ extern bool __read_mostly enable_ept;
extern bool __read_mostly enable_unrestricted_guest;
extern bool __read_mostly enable_ept_ad_bits;
extern bool __read_mostly enable_pml;
-extern bool __read_mostly enable_ipiv;
extern int __read_mostly pt_mode;
#define PT_MODE_SYSTEM 0
diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h
new file mode 100644
index 000000000000..bc5ece76533a
--- /dev/null
+++ b/arch/x86/kvm/vmx/common.h
@@ -0,0 +1,180 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __KVM_X86_VMX_COMMON_H
+#define __KVM_X86_VMX_COMMON_H
+
+#include <linux/kvm_host.h>
+#include <asm/posted_intr.h>
+
+#include "mmu.h"
+
+union vmx_exit_reason {
+ struct {
+ u32 basic : 16;
+ u32 reserved16 : 1;
+ u32 reserved17 : 1;
+ u32 reserved18 : 1;
+ u32 reserved19 : 1;
+ u32 reserved20 : 1;
+ u32 reserved21 : 1;
+ u32 reserved22 : 1;
+ u32 reserved23 : 1;
+ u32 reserved24 : 1;
+ u32 reserved25 : 1;
+ u32 bus_lock_detected : 1;
+ u32 enclave_mode : 1;
+ u32 smi_pending_mtf : 1;
+ u32 smi_from_vmx_root : 1;
+ u32 reserved30 : 1;
+ u32 failed_vmentry : 1;
+ };
+ u32 full;
+};
+
+struct vcpu_vt {
+ /* Posted interrupt descriptor */
+ struct pi_desc pi_desc;
+
+ /* Used if this vCPU is waiting for PI notification wakeup. */
+ struct list_head pi_wakeup_list;
+
+ union vmx_exit_reason exit_reason;
+
+ unsigned long exit_qualification;
+ u32 exit_intr_info;
+
+ /*
+ * If true, guest state has been loaded into hardware, and host state
+ * saved into vcpu_{vt,vmx,tdx}. If false, host state is loaded into
+ * hardware.
+ */
+ bool guest_state_loaded;
+ bool emulation_required;
+
+#ifdef CONFIG_X86_64
+ u64 msr_host_kernel_gs_base;
+#endif
+};
+
+#ifdef CONFIG_KVM_INTEL_TDX
+
+static __always_inline bool is_td(struct kvm *kvm)
+{
+ return kvm->arch.vm_type == KVM_X86_TDX_VM;
+}
+
+static __always_inline bool is_td_vcpu(struct kvm_vcpu *vcpu)
+{
+ return is_td(vcpu->kvm);
+}
+
+#else
+
+static __always_inline bool is_td(struct kvm *kvm) { return false; }
+static __always_inline bool is_td_vcpu(struct kvm_vcpu *vcpu) { return false; }
+
+#endif
+
+static inline bool vt_is_tdx_private_gpa(struct kvm *kvm, gpa_t gpa)
+{
+ /* For TDX the direct mask is the shared mask. */
+ return !kvm_is_addr_direct(kvm, gpa);
+}
+
+static inline int __vmx_handle_ept_violation(struct kvm_vcpu *vcpu, gpa_t gpa,
+ unsigned long exit_qualification)
+{
+ u64 error_code;
+
+ /* Is it a read fault? */
+ error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
+ ? PFERR_USER_MASK : 0;
+ /* Is it a write fault? */
+ error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
+ ? PFERR_WRITE_MASK : 0;
+ /* Is it a fetch fault? */
+ error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
+ ? PFERR_FETCH_MASK : 0;
+ /* ept page table entry is present? */
+ error_code |= (exit_qualification & EPT_VIOLATION_PROT_MASK)
+ ? PFERR_PRESENT_MASK : 0;
+
+ if (error_code & EPT_VIOLATION_GVA_IS_VALID)
+ error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ?
+ PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
+
+ if (vt_is_tdx_private_gpa(vcpu->kvm, gpa))
+ error_code |= PFERR_PRIVATE_ACCESS;
+
+ return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
+}
+
+static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
+ int pi_vec)
+{
+#ifdef CONFIG_SMP
+ if (vcpu->mode == IN_GUEST_MODE) {
+ /*
+ * The vector of the virtual has already been set in the PIR.
+ * Send a notification event to deliver the virtual interrupt
+ * unless the vCPU is the currently running vCPU, i.e. the
+ * event is being sent from a fastpath VM-Exit handler, in
+ * which case the PIR will be synced to the vIRR before
+ * re-entering the guest.
+ *
+ * When the target is not the running vCPU, the following
+ * possibilities emerge:
+ *
+ * Case 1: vCPU stays in non-root mode. Sending a notification
+ * event posts the interrupt to the vCPU.
+ *
+ * Case 2: vCPU exits to root mode and is still runnable. The
+ * PIR will be synced to the vIRR before re-entering the guest.
+ * Sending a notification event is ok as the host IRQ handler
+ * will ignore the spurious event.
+ *
+ * Case 3: vCPU exits to root mode and is blocked. vcpu_block()
+ * has already synced PIR to vIRR and never blocks the vCPU if
+ * the vIRR is not empty. Therefore, a blocked vCPU here does
+ * not wait for any requested interrupts in PIR, and sending a
+ * notification event also results in a benign, spurious event.
+ */
+
+ if (vcpu != kvm_get_running_vcpu())
+ __apic_send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
+ return;
+ }
+#endif
+ /*
+ * The vCPU isn't in the guest; wake the vCPU in case it is blocking,
+ * otherwise do nothing as KVM will grab the highest priority pending
+ * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest().
+ */
+ kvm_vcpu_wake_up(vcpu);
+}
+
+/*
+ * Post an interrupt to a vCPU's PIR and trigger the vCPU to process the
+ * interrupt if necessary.
+ */
+static inline void __vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu,
+ struct pi_desc *pi_desc, int vector)
+{
+ if (pi_test_and_set_pir(vector, pi_desc))
+ return;
+
+ /* If a previous notification has sent the IPI, nothing to do. */
+ if (pi_test_and_set_on(pi_desc))
+ return;
+
+ /*
+ * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*()
+ * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is
+ * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a
+ * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE.
+ */
+ kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR);
+}
+
+noinstr void vmx_handle_nmi(struct kvm_vcpu *vcpu);
+
+#endif /* __KVM_X86_VMX_COMMON_H */
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 43ee9ed11291..dbab1c15b0cd 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -3,9 +3,848 @@
#include "x86_ops.h"
#include "vmx.h"
+#include "mmu.h"
#include "nested.h"
#include "pmu.h"
#include "posted_intr.h"
+#include "tdx.h"
+#include "tdx_arch.h"
+
+#ifdef CONFIG_KVM_INTEL_TDX
+static_assert(offsetof(struct vcpu_vmx, vt) == offsetof(struct vcpu_tdx, vt));
+
+static void vt_disable_virtualization_cpu(void)
+{
+ /* Note, TDX *and* VMX need to be disabled if TDX is enabled. */
+ if (enable_tdx)
+ tdx_disable_virtualization_cpu();
+ vmx_disable_virtualization_cpu();
+}
+
+static __init int vt_hardware_setup(void)
+{
+ int ret;
+
+ ret = vmx_hardware_setup();
+ if (ret)
+ return ret;
+
+ if (enable_tdx)
+ tdx_hardware_setup();
+
+ return 0;
+}
+
+static int vt_vm_init(struct kvm *kvm)
+{
+ if (is_td(kvm))
+ return tdx_vm_init(kvm);
+
+ return vmx_vm_init(kvm);
+}
+
+static void vt_vm_pre_destroy(struct kvm *kvm)
+{
+ if (is_td(kvm))
+ return tdx_mmu_release_hkid(kvm);
+}
+
+static void vt_vm_destroy(struct kvm *kvm)
+{
+ if (is_td(kvm))
+ return tdx_vm_destroy(kvm);
+
+ vmx_vm_destroy(kvm);
+}
+
+static int vt_vcpu_precreate(struct kvm *kvm)
+{
+ if (is_td(kvm))
+ return 0;
+
+ return vmx_vcpu_precreate(kvm);
+}
+
+static int vt_vcpu_create(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return tdx_vcpu_create(vcpu);
+
+ return vmx_vcpu_create(vcpu);
+}
+
+static void vt_vcpu_free(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu)) {
+ tdx_vcpu_free(vcpu);
+ return;
+ }
+
+ vmx_vcpu_free(vcpu);
+}
+
+static void vt_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
+{
+ if (is_td_vcpu(vcpu)) {
+ tdx_vcpu_reset(vcpu, init_event);
+ return;
+ }
+
+ vmx_vcpu_reset(vcpu, init_event);
+}
+
+static void vt_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ if (is_td_vcpu(vcpu)) {
+ tdx_vcpu_load(vcpu, cpu);
+ return;
+ }
+
+ vmx_vcpu_load(vcpu, cpu);
+}
+
+static void vt_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Basic TDX does not support feature PML. KVM does not enable PML in
+ * TD's VMCS, nor does it allocate or flush PML buffer for TDX.
+ */
+ if (WARN_ON_ONCE(is_td_vcpu(vcpu)))
+ return;
+
+ vmx_update_cpu_dirty_logging(vcpu);
+}
+
+static void vt_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu)) {
+ tdx_prepare_switch_to_guest(vcpu);
+ return;
+ }
+
+ vmx_prepare_switch_to_guest(vcpu);
+}
+
+static void vt_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu)) {
+ tdx_vcpu_put(vcpu);
+ return;
+ }
+
+ vmx_vcpu_put(vcpu);
+}
+
+static int vt_vcpu_pre_run(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return tdx_vcpu_pre_run(vcpu);
+
+ return vmx_vcpu_pre_run(vcpu);
+}
+
+static fastpath_t vt_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+{
+ if (is_td_vcpu(vcpu))
+ return tdx_vcpu_run(vcpu, run_flags);
+
+ return vmx_vcpu_run(vcpu, run_flags);
+}
+
+static int vt_handle_exit(struct kvm_vcpu *vcpu,
+ enum exit_fastpath_completion fastpath)
+{
+ if (is_td_vcpu(vcpu))
+ return tdx_handle_exit(vcpu, fastpath);
+
+ return vmx_handle_exit(vcpu, fastpath);
+}
+
+static int vt_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ if (unlikely(is_td_vcpu(vcpu)))
+ return tdx_set_msr(vcpu, msr_info);
+
+ return vmx_set_msr(vcpu, msr_info);
+}
+
+/*
+ * The kvm parameter can be NULL (module initialization, or invocation before
+ * VM creation). Be sure to check the kvm parameter before using it.
+ */
+static bool vt_has_emulated_msr(struct kvm *kvm, u32 index)
+{
+ if (kvm && is_td(kvm))
+ return tdx_has_emulated_msr(index);
+
+ return vmx_has_emulated_msr(kvm, index);
+}
+
+static int vt_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ if (unlikely(is_td_vcpu(vcpu)))
+ return tdx_get_msr(vcpu, msr_info);
+
+ return vmx_get_msr(vcpu, msr_info);
+}
+
+static void vt_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
+{
+ /*
+ * TDX doesn't allow VMM to configure interception of MSR accesses.
+ * TDX guest requests MSR accesses by calling TDVMCALL. The MSR
+ * filters will be applied when handling the TDVMCALL for RDMSR/WRMSR
+ * if the userspace has set any.
+ */
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_recalc_msr_intercepts(vcpu);
+}
+
+static int vt_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
+{
+ if (is_td_vcpu(vcpu))
+ return tdx_complete_emulated_msr(vcpu, err);
+
+ return vmx_complete_emulated_msr(vcpu, err);
+}
+
+#ifdef CONFIG_KVM_SMM
+static int vt_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+ if (KVM_BUG_ON(is_td_vcpu(vcpu), vcpu->kvm))
+ return 0;
+
+ return vmx_smi_allowed(vcpu, for_injection);
+}
+
+static int vt_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
+{
+ if (KVM_BUG_ON(is_td_vcpu(vcpu), vcpu->kvm))
+ return 0;
+
+ return vmx_enter_smm(vcpu, smram);
+}
+
+static int vt_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
+{
+ if (KVM_BUG_ON(is_td_vcpu(vcpu), vcpu->kvm))
+ return 0;
+
+ return vmx_leave_smm(vcpu, smram);
+}
+
+static void vt_enable_smi_window(struct kvm_vcpu *vcpu)
+{
+ if (KVM_BUG_ON(is_td_vcpu(vcpu), vcpu->kvm))
+ return;
+
+ /* RSM will cause a vmexit anyway. */
+ vmx_enable_smi_window(vcpu);
+}
+#endif
+
+static int vt_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
+{
+ /*
+ * For TDX, this can only be triggered for MMIO emulation. Let the
+ * guest retry after installing the SPTE with suppress #VE bit cleared,
+ * so that the guest will receive #VE when retry. The guest is expected
+ * to call TDG.VP.VMCALL<MMIO> to request VMM to do MMIO emulation on
+ * #VE.
+ */
+ if (is_td_vcpu(vcpu))
+ return X86EMUL_RETRY_INSTR;
+
+ return vmx_check_emulate_instruction(vcpu, emul_type, insn, insn_len);
+}
+
+static bool vt_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
+{
+ /*
+ * INIT and SIPI are always blocked for TDX, i.e., INIT handling and
+ * the OP vcpu_deliver_sipi_vector() won't be called.
+ */
+ if (is_td_vcpu(vcpu))
+ return true;
+
+ return vmx_apic_init_signal_blocked(vcpu);
+}
+
+static void vt_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
+{
+ /* Only x2APIC mode is supported for TD. */
+ if (is_td_vcpu(vcpu))
+ return;
+
+ return vmx_set_virtual_apic_mode(vcpu);
+}
+
+static void vt_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ return vmx_hwapic_isr_update(vcpu, max_isr);
+}
+
+static int vt_sync_pir_to_irr(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return -1;
+
+ return vmx_sync_pir_to_irr(vcpu);
+}
+
+static void vt_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector)
+{
+ if (is_td_vcpu(apic->vcpu)) {
+ tdx_deliver_interrupt(apic, delivery_mode, trig_mode,
+ vector);
+ return;
+ }
+
+ vmx_deliver_interrupt(apic, delivery_mode, trig_mode, vector);
+}
+
+static void vt_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_vcpu_after_set_cpuid(vcpu);
+}
+
+static void vt_update_exception_bitmap(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_update_exception_bitmap(vcpu);
+}
+
+static u64 vt_get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+ if (is_td_vcpu(vcpu))
+ return 0;
+
+ return vmx_get_segment_base(vcpu, seg);
+}
+
+static void vt_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var,
+ int seg)
+{
+ if (is_td_vcpu(vcpu)) {
+ memset(var, 0, sizeof(*var));
+ return;
+ }
+
+ vmx_get_segment(vcpu, var, seg);
+}
+
+static void vt_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var,
+ int seg)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_set_segment(vcpu, var, seg);
+}
+
+static int vt_get_cpl(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return 0;
+
+ return vmx_get_cpl(vcpu);
+}
+
+static int vt_get_cpl_no_cache(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return 0;
+
+ return vmx_get_cpl_no_cache(vcpu);
+}
+
+static void vt_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+{
+ if (is_td_vcpu(vcpu)) {
+ *db = 0;
+ *l = 0;
+ return;
+ }
+
+ vmx_get_cs_db_l_bits(vcpu, db, l);
+}
+
+static bool vt_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ if (is_td_vcpu(vcpu))
+ return true;
+
+ return vmx_is_valid_cr0(vcpu, cr0);
+}
+
+static void vt_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_set_cr0(vcpu, cr0);
+}
+
+static bool vt_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ if (is_td_vcpu(vcpu))
+ return true;
+
+ return vmx_is_valid_cr4(vcpu, cr4);
+}
+
+static void vt_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_set_cr4(vcpu, cr4);
+}
+
+static int vt_set_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+ if (is_td_vcpu(vcpu))
+ return 0;
+
+ return vmx_set_efer(vcpu, efer);
+}
+
+static void vt_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ if (is_td_vcpu(vcpu)) {
+ memset(dt, 0, sizeof(*dt));
+ return;
+ }
+
+ vmx_get_idt(vcpu, dt);
+}
+
+static void vt_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_set_idt(vcpu, dt);
+}
+
+static void vt_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ if (is_td_vcpu(vcpu)) {
+ memset(dt, 0, sizeof(*dt));
+ return;
+ }
+
+ vmx_get_gdt(vcpu, dt);
+}
+
+static void vt_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_set_gdt(vcpu, dt);
+}
+
+static void vt_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_set_dr7(vcpu, val);
+}
+
+static void vt_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+{
+ /*
+ * MOV-DR exiting is always cleared for TD guest, even in debug mode.
+ * Thus KVM_DEBUGREG_WONT_EXIT can never be set and it should never
+ * reach here for TD vcpu.
+ */
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_sync_dirty_debug_regs(vcpu);
+}
+
+static void vt_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
+{
+ if (WARN_ON_ONCE(is_td_vcpu(vcpu)))
+ return;
+
+ vmx_cache_reg(vcpu, reg);
+}
+
+static unsigned long vt_get_rflags(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return 0;
+
+ return vmx_get_rflags(vcpu);
+}
+
+static void vt_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_set_rflags(vcpu, rflags);
+}
+
+static bool vt_get_if_flag(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return false;
+
+ return vmx_get_if_flag(vcpu);
+}
+
+static void vt_flush_tlb_all(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu)) {
+ tdx_flush_tlb_all(vcpu);
+ return;
+ }
+
+ vmx_flush_tlb_all(vcpu);
+}
+
+static void vt_flush_tlb_current(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu)) {
+ tdx_flush_tlb_current(vcpu);
+ return;
+ }
+
+ vmx_flush_tlb_current(vcpu);
+}
+
+static void vt_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_flush_tlb_gva(vcpu, addr);
+}
+
+static void vt_flush_tlb_guest(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_flush_tlb_guest(vcpu);
+}
+
+static void vt_inject_nmi(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu)) {
+ tdx_inject_nmi(vcpu);
+ return;
+ }
+
+ vmx_inject_nmi(vcpu);
+}
+
+static int vt_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+ /*
+ * The TDX module manages NMI windows and NMI reinjection, and hides NMI
+ * blocking, all KVM can do is throw an NMI over the wall.
+ */
+ if (is_td_vcpu(vcpu))
+ return true;
+
+ return vmx_nmi_allowed(vcpu, for_injection);
+}
+
+static bool vt_get_nmi_mask(struct kvm_vcpu *vcpu)
+{
+ /*
+ * KVM can't get NMI blocking status for TDX guest, assume NMIs are
+ * always unmasked.
+ */
+ if (is_td_vcpu(vcpu))
+ return false;
+
+ return vmx_get_nmi_mask(vcpu);
+}
+
+static void vt_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_set_nmi_mask(vcpu, masked);
+}
+
+static void vt_enable_nmi_window(struct kvm_vcpu *vcpu)
+{
+ /* Refer to the comments in tdx_inject_nmi(). */
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_enable_nmi_window(vcpu);
+}
+
+static void vt_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
+ int pgd_level)
+{
+ if (is_td_vcpu(vcpu)) {
+ tdx_load_mmu_pgd(vcpu, root_hpa, pgd_level);
+ return;
+ }
+
+ vmx_load_mmu_pgd(vcpu, root_hpa, pgd_level);
+}
+
+static void vt_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_set_interrupt_shadow(vcpu, mask);
+}
+
+static u32 vt_get_interrupt_shadow(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return 0;
+
+ return vmx_get_interrupt_shadow(vcpu);
+}
+
+static void vt_patch_hypercall(struct kvm_vcpu *vcpu,
+ unsigned char *hypercall)
+{
+ /*
+ * Because guest memory is protected, guest can't be patched. TD kernel
+ * is modified to use TDG.VP.VMCALL for hypercall.
+ */
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_patch_hypercall(vcpu, hypercall);
+}
+
+static void vt_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_inject_irq(vcpu, reinjected);
+}
+
+static void vt_inject_exception(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_inject_exception(vcpu);
+}
+
+static void vt_cancel_injection(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_cancel_injection(vcpu);
+}
+
+static int vt_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+ if (is_td_vcpu(vcpu))
+ return tdx_interrupt_allowed(vcpu);
+
+ return vmx_interrupt_allowed(vcpu, for_injection);
+}
+
+static void vt_enable_irq_window(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_enable_irq_window(vcpu);
+}
+
+static void vt_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code)
+{
+ *intr_info = 0;
+ *error_code = 0;
+
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_get_entry_info(vcpu, intr_info, error_code);
+}
+
+static void vt_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
+{
+ if (is_td_vcpu(vcpu)) {
+ tdx_get_exit_info(vcpu, reason, info1, info2, intr_info,
+ error_code);
+ return;
+ }
+
+ vmx_get_exit_info(vcpu, reason, info1, info2, intr_info, error_code);
+}
+
+static void vt_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_update_cr8_intercept(vcpu, tpr, irr);
+}
+
+static void vt_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_set_apic_access_page_addr(vcpu);
+}
+
+static void vt_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu)) {
+ KVM_BUG_ON(!kvm_vcpu_apicv_active(vcpu), vcpu->kvm);
+ return;
+ }
+
+ vmx_refresh_apicv_exec_ctrl(vcpu);
+}
+
+static void vt_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_load_eoi_exitmap(vcpu, eoi_exit_bitmap);
+}
+
+static int vt_set_tss_addr(struct kvm *kvm, unsigned int addr)
+{
+ if (is_td(kvm))
+ return 0;
+
+ return vmx_set_tss_addr(kvm, addr);
+}
+
+static int vt_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
+{
+ if (is_td(kvm))
+ return 0;
+
+ return vmx_set_identity_map_addr(kvm, ident_addr);
+}
+
+static u64 vt_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
+{
+ /* TDX doesn't support L2 guest at the moment. */
+ if (is_td_vcpu(vcpu))
+ return 0;
+
+ return vmx_get_l2_tsc_offset(vcpu);
+}
+
+static u64 vt_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
+{
+ /* TDX doesn't support L2 guest at the moment. */
+ if (is_td_vcpu(vcpu))
+ return 0;
+
+ return vmx_get_l2_tsc_multiplier(vcpu);
+}
+
+static void vt_write_tsc_offset(struct kvm_vcpu *vcpu)
+{
+ /* In TDX, tsc offset can't be changed. */
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_write_tsc_offset(vcpu);
+}
+
+static void vt_write_tsc_multiplier(struct kvm_vcpu *vcpu)
+{
+ /* In TDX, tsc multiplier can't be changed. */
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_write_tsc_multiplier(vcpu);
+}
+
+#ifdef CONFIG_X86_64
+static int vt_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
+ bool *expired)
+{
+ /* VMX-preemption timer isn't available for TDX. */
+ if (is_td_vcpu(vcpu))
+ return -EINVAL;
+
+ return vmx_set_hv_timer(vcpu, guest_deadline_tsc, expired);
+}
+
+static void vt_cancel_hv_timer(struct kvm_vcpu *vcpu)
+{
+ /* VMX-preemption timer can't be set. See vt_set_hv_timer(). */
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_cancel_hv_timer(vcpu);
+}
+#endif
+
+static void vt_setup_mce(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_setup_mce(vcpu);
+}
+
+static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
+{
+ if (!is_td(kvm))
+ return -ENOTTY;
+
+ return tdx_vm_ioctl(kvm, argp);
+}
+
+static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
+{
+ if (!is_td_vcpu(vcpu))
+ return -EINVAL;
+
+ return tdx_vcpu_ioctl(vcpu, argp);
+}
+
+static int vt_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
+{
+ if (is_td(kvm))
+ return tdx_gmem_private_max_mapping_level(kvm, pfn);
+
+ return 0;
+}
+
+#define vt_op(name) vt_##name
+#define vt_op_tdx_only(name) vt_##name
+#else /* CONFIG_KVM_INTEL_TDX */
+#define vt_op(name) vmx_##name
+#define vt_op_tdx_only(name) NULL
+#endif /* CONFIG_KVM_INTEL_TDX */
#define VMX_REQUIRED_APICV_INHIBITS \
(BIT(APICV_INHIBIT_REASON_DISABLED) | \
@@ -24,147 +863,210 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.hardware_unsetup = vmx_hardware_unsetup,
.enable_virtualization_cpu = vmx_enable_virtualization_cpu,
- .disable_virtualization_cpu = vmx_disable_virtualization_cpu,
+ .disable_virtualization_cpu = vt_op(disable_virtualization_cpu),
.emergency_disable_virtualization_cpu = vmx_emergency_disable_virtualization_cpu,
- .has_emulated_msr = vmx_has_emulated_msr,
+ .has_emulated_msr = vt_op(has_emulated_msr),
.vm_size = sizeof(struct kvm_vmx),
- .vm_init = vmx_vm_init,
- .vm_destroy = vmx_vm_destroy,
- .vcpu_precreate = vmx_vcpu_precreate,
- .vcpu_create = vmx_vcpu_create,
- .vcpu_free = vmx_vcpu_free,
- .vcpu_reset = vmx_vcpu_reset,
+ .vm_init = vt_op(vm_init),
+ .vm_destroy = vt_op(vm_destroy),
+ .vm_pre_destroy = vt_op_tdx_only(vm_pre_destroy),
- .prepare_switch_to_guest = vmx_prepare_switch_to_guest,
- .vcpu_load = vmx_vcpu_load,
- .vcpu_put = vmx_vcpu_put,
+ .vcpu_precreate = vt_op(vcpu_precreate),
+ .vcpu_create = vt_op(vcpu_create),
+ .vcpu_free = vt_op(vcpu_free),
+ .vcpu_reset = vt_op(vcpu_reset),
- .update_exception_bitmap = vmx_update_exception_bitmap,
+ .prepare_switch_to_guest = vt_op(prepare_switch_to_guest),
+ .vcpu_load = vt_op(vcpu_load),
+ .vcpu_put = vt_op(vcpu_put),
+
+ .HOST_OWNED_DEBUGCTL = VMX_HOST_OWNED_DEBUGCTL_BITS,
+
+ .update_exception_bitmap = vt_op(update_exception_bitmap),
.get_feature_msr = vmx_get_feature_msr,
- .get_msr = vmx_get_msr,
- .set_msr = vmx_set_msr,
- .get_segment_base = vmx_get_segment_base,
- .get_segment = vmx_get_segment,
- .set_segment = vmx_set_segment,
- .get_cpl = vmx_get_cpl,
- .get_cpl_no_cache = vmx_get_cpl_no_cache,
- .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
- .is_valid_cr0 = vmx_is_valid_cr0,
- .set_cr0 = vmx_set_cr0,
- .is_valid_cr4 = vmx_is_valid_cr4,
- .set_cr4 = vmx_set_cr4,
- .set_efer = vmx_set_efer,
- .get_idt = vmx_get_idt,
- .set_idt = vmx_set_idt,
- .get_gdt = vmx_get_gdt,
- .set_gdt = vmx_set_gdt,
- .set_dr6 = vmx_set_dr6,
- .set_dr7 = vmx_set_dr7,
- .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
- .cache_reg = vmx_cache_reg,
- .get_rflags = vmx_get_rflags,
- .set_rflags = vmx_set_rflags,
- .get_if_flag = vmx_get_if_flag,
-
- .flush_tlb_all = vmx_flush_tlb_all,
- .flush_tlb_current = vmx_flush_tlb_current,
- .flush_tlb_gva = vmx_flush_tlb_gva,
- .flush_tlb_guest = vmx_flush_tlb_guest,
-
- .vcpu_pre_run = vmx_vcpu_pre_run,
- .vcpu_run = vmx_vcpu_run,
- .handle_exit = vmx_handle_exit,
+ .get_msr = vt_op(get_msr),
+ .set_msr = vt_op(set_msr),
+
+ .get_segment_base = vt_op(get_segment_base),
+ .get_segment = vt_op(get_segment),
+ .set_segment = vt_op(set_segment),
+ .get_cpl = vt_op(get_cpl),
+ .get_cpl_no_cache = vt_op(get_cpl_no_cache),
+ .get_cs_db_l_bits = vt_op(get_cs_db_l_bits),
+ .is_valid_cr0 = vt_op(is_valid_cr0),
+ .set_cr0 = vt_op(set_cr0),
+ .is_valid_cr4 = vt_op(is_valid_cr4),
+ .set_cr4 = vt_op(set_cr4),
+ .set_efer = vt_op(set_efer),
+ .get_idt = vt_op(get_idt),
+ .set_idt = vt_op(set_idt),
+ .get_gdt = vt_op(get_gdt),
+ .set_gdt = vt_op(set_gdt),
+ .set_dr7 = vt_op(set_dr7),
+ .sync_dirty_debug_regs = vt_op(sync_dirty_debug_regs),
+ .cache_reg = vt_op(cache_reg),
+ .get_rflags = vt_op(get_rflags),
+ .set_rflags = vt_op(set_rflags),
+ .get_if_flag = vt_op(get_if_flag),
+
+ .flush_tlb_all = vt_op(flush_tlb_all),
+ .flush_tlb_current = vt_op(flush_tlb_current),
+ .flush_tlb_gva = vt_op(flush_tlb_gva),
+ .flush_tlb_guest = vt_op(flush_tlb_guest),
+
+ .vcpu_pre_run = vt_op(vcpu_pre_run),
+ .vcpu_run = vt_op(vcpu_run),
+ .handle_exit = vt_op(handle_exit),
.skip_emulated_instruction = vmx_skip_emulated_instruction,
.update_emulated_instruction = vmx_update_emulated_instruction,
- .set_interrupt_shadow = vmx_set_interrupt_shadow,
- .get_interrupt_shadow = vmx_get_interrupt_shadow,
- .patch_hypercall = vmx_patch_hypercall,
- .inject_irq = vmx_inject_irq,
- .inject_nmi = vmx_inject_nmi,
- .inject_exception = vmx_inject_exception,
- .cancel_injection = vmx_cancel_injection,
- .interrupt_allowed = vmx_interrupt_allowed,
- .nmi_allowed = vmx_nmi_allowed,
- .get_nmi_mask = vmx_get_nmi_mask,
- .set_nmi_mask = vmx_set_nmi_mask,
- .enable_nmi_window = vmx_enable_nmi_window,
- .enable_irq_window = vmx_enable_irq_window,
- .update_cr8_intercept = vmx_update_cr8_intercept,
+ .set_interrupt_shadow = vt_op(set_interrupt_shadow),
+ .get_interrupt_shadow = vt_op(get_interrupt_shadow),
+ .patch_hypercall = vt_op(patch_hypercall),
+ .inject_irq = vt_op(inject_irq),
+ .inject_nmi = vt_op(inject_nmi),
+ .inject_exception = vt_op(inject_exception),
+ .cancel_injection = vt_op(cancel_injection),
+ .interrupt_allowed = vt_op(interrupt_allowed),
+ .nmi_allowed = vt_op(nmi_allowed),
+ .get_nmi_mask = vt_op(get_nmi_mask),
+ .set_nmi_mask = vt_op(set_nmi_mask),
+ .enable_nmi_window = vt_op(enable_nmi_window),
+ .enable_irq_window = vt_op(enable_irq_window),
+ .update_cr8_intercept = vt_op(update_cr8_intercept),
.x2apic_icr_is_split = false,
- .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
- .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
- .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
- .load_eoi_exitmap = vmx_load_eoi_exitmap,
- .apicv_pre_state_restore = vmx_apicv_pre_state_restore,
+ .set_virtual_apic_mode = vt_op(set_virtual_apic_mode),
+ .set_apic_access_page_addr = vt_op(set_apic_access_page_addr),
+ .refresh_apicv_exec_ctrl = vt_op(refresh_apicv_exec_ctrl),
+ .load_eoi_exitmap = vt_op(load_eoi_exitmap),
+ .apicv_pre_state_restore = pi_apicv_pre_state_restore,
.required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
- .hwapic_isr_update = vmx_hwapic_isr_update,
- .sync_pir_to_irr = vmx_sync_pir_to_irr,
- .deliver_interrupt = vmx_deliver_interrupt,
+ .hwapic_isr_update = vt_op(hwapic_isr_update),
+ .sync_pir_to_irr = vt_op(sync_pir_to_irr),
+ .deliver_interrupt = vt_op(deliver_interrupt),
.dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
- .set_tss_addr = vmx_set_tss_addr,
- .set_identity_map_addr = vmx_set_identity_map_addr,
+ .set_tss_addr = vt_op(set_tss_addr),
+ .set_identity_map_addr = vt_op(set_identity_map_addr),
.get_mt_mask = vmx_get_mt_mask,
- .get_exit_info = vmx_get_exit_info,
- .get_entry_info = vmx_get_entry_info,
+ .get_exit_info = vt_op(get_exit_info),
+ .get_entry_info = vt_op(get_entry_info),
- .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
+ .vcpu_after_set_cpuid = vt_op(vcpu_after_set_cpuid),
.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
- .get_l2_tsc_offset = vmx_get_l2_tsc_offset,
- .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
- .write_tsc_offset = vmx_write_tsc_offset,
- .write_tsc_multiplier = vmx_write_tsc_multiplier,
+ .get_l2_tsc_offset = vt_op(get_l2_tsc_offset),
+ .get_l2_tsc_multiplier = vt_op(get_l2_tsc_multiplier),
+ .write_tsc_offset = vt_op(write_tsc_offset),
+ .write_tsc_multiplier = vt_op(write_tsc_multiplier),
- .load_mmu_pgd = vmx_load_mmu_pgd,
+ .load_mmu_pgd = vt_op(load_mmu_pgd),
.check_intercept = vmx_check_intercept,
.handle_exit_irqoff = vmx_handle_exit_irqoff,
- .cpu_dirty_log_size = PML_LOG_NR_ENTRIES,
- .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
+ .update_cpu_dirty_logging = vt_op(update_cpu_dirty_logging),
.nested_ops = &vmx_nested_ops,
.pi_update_irte = vmx_pi_update_irte,
- .pi_start_assignment = vmx_pi_start_assignment,
+ .pi_start_bypass = vmx_pi_start_bypass,
#ifdef CONFIG_X86_64
- .set_hv_timer = vmx_set_hv_timer,
- .cancel_hv_timer = vmx_cancel_hv_timer,
+ .set_hv_timer = vt_op(set_hv_timer),
+ .cancel_hv_timer = vt_op(cancel_hv_timer),
#endif
- .setup_mce = vmx_setup_mce,
+ .setup_mce = vt_op(setup_mce),
#ifdef CONFIG_KVM_SMM
- .smi_allowed = vmx_smi_allowed,
- .enter_smm = vmx_enter_smm,
- .leave_smm = vmx_leave_smm,
- .enable_smi_window = vmx_enable_smi_window,
+ .smi_allowed = vt_op(smi_allowed),
+ .enter_smm = vt_op(enter_smm),
+ .leave_smm = vt_op(leave_smm),
+ .enable_smi_window = vt_op(enable_smi_window),
#endif
- .check_emulate_instruction = vmx_check_emulate_instruction,
- .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
+ .check_emulate_instruction = vt_op(check_emulate_instruction),
+ .apic_init_signal_blocked = vt_op(apic_init_signal_blocked),
.migrate_timers = vmx_migrate_timers,
- .msr_filter_changed = vmx_msr_filter_changed,
- .complete_emulated_msr = kvm_complete_insn_gp,
+ .recalc_msr_intercepts = vt_op(recalc_msr_intercepts),
+ .complete_emulated_msr = vt_op(complete_emulated_msr),
.vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
.get_untagged_addr = vmx_get_untagged_addr,
+
+ .mem_enc_ioctl = vt_op_tdx_only(mem_enc_ioctl),
+ .vcpu_mem_enc_ioctl = vt_op_tdx_only(vcpu_mem_enc_ioctl),
+
+ .private_max_mapping_level = vt_op_tdx_only(gmem_private_max_mapping_level)
};
struct kvm_x86_init_ops vt_init_ops __initdata = {
- .hardware_setup = vmx_hardware_setup,
+ .hardware_setup = vt_op(hardware_setup),
.handle_intel_pt_intr = NULL,
.runtime_ops = &vt_x86_ops,
.pmu_ops = &intel_pmu_ops,
};
+
+static void __exit vt_exit(void)
+{
+ kvm_exit();
+ tdx_cleanup();
+ vmx_exit();
+}
+module_exit(vt_exit);
+
+static int __init vt_init(void)
+{
+ unsigned vcpu_size, vcpu_align;
+ int r;
+
+ r = vmx_init();
+ if (r)
+ return r;
+
+ /* tdx_init() has been taken */
+ r = tdx_bringup();
+ if (r)
+ goto err_tdx_bringup;
+
+ /*
+ * TDX and VMX have different vCPU structures. Calculate the
+ * maximum size/align so that kvm_init() can use the larger
+ * values to create the kmem_vcpu_cache.
+ */
+ vcpu_size = sizeof(struct vcpu_vmx);
+ vcpu_align = __alignof__(struct vcpu_vmx);
+ if (enable_tdx) {
+ vcpu_size = max_t(unsigned, vcpu_size,
+ sizeof(struct vcpu_tdx));
+ vcpu_align = max_t(unsigned, vcpu_align,
+ __alignof__(struct vcpu_tdx));
+ kvm_caps.supported_vm_types |= BIT(KVM_X86_TDX_VM);
+ }
+
+ /*
+ * Common KVM initialization _must_ come last, after this, /dev/kvm is
+ * exposed to userspace!
+ */
+ r = kvm_init(vcpu_size, vcpu_align, THIS_MODULE);
+ if (r)
+ goto err_kvm_init;
+
+ return 0;
+
+err_kvm_init:
+ tdx_cleanup();
+err_tdx_bringup:
+ vmx_exit();
+ return r;
+}
+module_init(vt_init);
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index d268224227f0..b8ea1969113d 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -276,7 +276,7 @@ static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
{
struct vmcs_host_state *dest, *src;
- if (unlikely(!vmx->guest_state_loaded))
+ if (unlikely(!vmx->vt.guest_state_loaded))
return;
src = &prev->host_state;
@@ -302,7 +302,7 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
cpu = get_cpu();
prev = vmx->loaded_vmcs;
vmx->loaded_vmcs = vmcs;
- vmx_vcpu_load_vmcs(vcpu, cpu, prev);
+ vmx_vcpu_load_vmcs(vcpu, cpu);
vmx_sync_vmcs_host_state(vmx, prev);
put_cpu();
@@ -426,7 +426,7 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
* tables also changed, but KVM should not treat EPT Misconfig
* VM-Exits as writes.
*/
- WARN_ON_ONCE(vmx->exit_reason.basic != EXIT_REASON_EPT_VIOLATION);
+ WARN_ON_ONCE(vmx->vt.exit_reason.basic != EXIT_REASON_EPT_VIOLATION);
/*
* PML Full and EPT Violation VM-Exits both use bit 12 to report
@@ -715,6 +715,12 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
MSR_IA32_FLUSH_CMD, MSR_TYPE_W);
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_APERF, MSR_TYPE_R);
+
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_MPERF, MSR_TYPE_R);
+
kvm_vcpu_unmap(vcpu, &map);
vmx->nested.force_msr_bitmap_recalc = false;
@@ -825,12 +831,30 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
return 0;
}
+static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
+ vmx->nested.msrs.misc_high);
+
+ return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
+}
+
static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
u32 count, u64 addr)
{
if (count == 0)
return 0;
+ /*
+ * Exceeding the limit results in architecturally _undefined_ behavior,
+ * i.e. KVM is allowed to do literally anything in response to a bad
+ * limit. Immediately generate a consistency check so that code that
+ * consumes the count doesn't need to worry about extreme edge cases.
+ */
+ if (count > nested_vmx_max_atomic_switch_msrs(vcpu))
+ return -EINVAL;
+
if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
!kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
return -EINVAL;
@@ -941,15 +965,6 @@ static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
return 0;
}
-static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
- vmx->nested.msrs.misc_high);
-
- return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
-}
-
/*
* Load guest's/host's msr at nested entry/exit.
* return 0 for success, entry index for failure.
@@ -966,7 +981,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
for (i = 0; i < count; i++) {
- if (unlikely(i >= max_msr_list_size))
+ if (WARN_ON_ONCE(i >= max_msr_list_size))
goto fail;
if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
@@ -1054,7 +1069,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
for (i = 0; i < count; i++) {
- if (unlikely(i >= max_msr_list_size))
+ if (WARN_ON_ONCE(i >= max_msr_list_size))
return -EINVAL;
if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
@@ -2654,10 +2669,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
if (vmx->nested.nested_run_pending &&
(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
- vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
+ vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl &
+ vmx_get_supported_debugctl(vcpu, false));
} else {
kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
- vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl);
+ vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl);
}
if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
@@ -3147,7 +3163,8 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
return -EINVAL;
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
- CC(!kvm_dr7_valid(vmcs12->guest_dr7)))
+ (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) ||
+ CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false))))
return -EINVAL;
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
@@ -3521,7 +3538,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
if (!vmx->nested.nested_run_pending ||
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
- vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read();
if (kvm_mpx_supported() &&
(!vmx->nested.nested_run_pending ||
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
@@ -4521,12 +4538,12 @@ static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
cpu = get_cpu();
vmx->loaded_vmcs = &vmx->nested.vmcs02;
- vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01);
+ vmx_vcpu_load_vmcs(vcpu, cpu);
sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
vmx->loaded_vmcs = &vmx->vmcs01;
- vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02);
+ vmx_vcpu_load_vmcs(vcpu, cpu);
put_cpu();
}
@@ -4599,6 +4616,12 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
+ /*
+ * Note! Save DR7, but intentionally don't grab DEBUGCTL from vmcs02.
+ * Writes to DEBUGCTL that aren't intercepted by L1 are immediately
+ * propagated to vmcs12 (see vmx_set_msr()), as the value loaded into
+ * vmcs02 doesn't strictly track vmcs12.
+ */
if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
vmcs12->guest_dr7 = vcpu->arch.dr7;
@@ -4623,7 +4646,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
{
/* update exit information fields: */
vmcs12->vm_exit_reason = vm_exit_reason;
- if (to_vmx(vcpu)->exit_reason.enclave_mode)
+ if (vmx_get_exit_reason(vcpu).enclave_mode)
vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE;
vmcs12->exit_qualification = exit_qualification;
@@ -4789,13 +4812,13 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
__vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR);
kvm_set_dr(vcpu, 7, 0x400);
- vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+ vmx_guest_debugctl_write(vcpu, 0);
if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
vmcs12->vm_exit_msr_load_count))
nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
- to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu);
+ to_vt(vcpu)->emulation_required = vmx_emulation_required(vcpu);
}
static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
@@ -4844,6 +4867,9 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
}
+ /* Reload DEBUGCTL to ensure vmcs01 has a fresh FREEZE_IN_SMM value. */
+ vmx_reload_guest_debugctl(vcpu);
+
/*
* Note that calling vmx_set_{efer,cr0,cr4} is important as they
* handle a variety of side effects to KVM's software model.
@@ -5021,16 +5047,7 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
- /*
- * If IBRS is advertised to the vCPU, KVM must flush the indirect
- * branch predictors when transitioning from L2 to L1, as L1 expects
- * hardware (KVM in this case) to provide separate predictor modes.
- * Bare metal isolates VMX root (host) from VMX non-root (guest), but
- * doesn't isolate different VMCSs, i.e. in this case, doesn't provide
- * separate modes for L2 vs L1.
- */
- if (guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL))
- indirect_branch_prediction_barrier();
+ kvm_nested_vmexit_handle_ibrs(vcpu);
/* Update any VMCS fields that might have changed while L2 ran */
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
@@ -6128,7 +6145,7 @@ fail:
* nested VM-Exit. Pass the original exit reason, i.e. don't hardcode
* EXIT_REASON_VMFUNC as the exit reason.
*/
- nested_vmx_vmexit(vcpu, vmx->exit_reason.full,
+ nested_vmx_vmexit(vcpu, vmx->vt.exit_reason.full,
vmx_get_intr_info(vcpu),
vmx_get_exit_qual(vcpu));
return 1;
@@ -6573,7 +6590,7 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- union vmx_exit_reason exit_reason = vmx->exit_reason;
+ union vmx_exit_reason exit_reason = vmx->vt.exit_reason;
unsigned long exit_qual;
u32 exit_intr_info;
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 231a9633359c..0b173602821b 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -20,6 +20,7 @@
#include "lapic.h"
#include "nested.h"
#include "pmu.h"
+#include "tdx.h"
/*
* Perf's "BASE" is wildly misleading, architectural PMUs use bits 31:16 of ECX
@@ -35,6 +36,24 @@
#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0)
+static struct lbr_desc *vcpu_to_lbr_desc(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return NULL;
+
+ return &to_vmx(vcpu)->lbr_desc;
+}
+
+static struct x86_pmu_lbr *vcpu_to_lbr_records(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return NULL;
+
+ return &to_vmx(vcpu)->lbr_desc.records;
+}
+
+#pragma GCC poison to_vmx
+
static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
{
struct kvm_pmc *pmc;
@@ -130,6 +149,22 @@ static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr)
return get_gp_pmc(pmu, msr, MSR_IA32_PMC0);
}
+static bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return false;
+
+ return cpuid_model_is_consistent(vcpu);
+}
+
+bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return false;
+
+ return !!vcpu_to_lbr_records(vcpu)->nr;
+}
+
static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index)
{
struct x86_pmu_lbr *records = vcpu_to_lbr_records(vcpu);
@@ -195,6 +230,9 @@ static inline void intel_pmu_release_guest_lbr_event(struct kvm_vcpu *vcpu)
{
struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+ if (!lbr_desc)
+ return;
+
if (lbr_desc->event) {
perf_event_release_kernel(lbr_desc->event);
lbr_desc->event = NULL;
@@ -236,6 +274,9 @@ int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu)
PERF_SAMPLE_BRANCH_USER,
};
+ if (WARN_ON_ONCE(!lbr_desc))
+ return 0;
+
if (unlikely(lbr_desc->event)) {
__set_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use);
return 0;
@@ -467,6 +508,9 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
u64 perf_capabilities;
u64 counter_rsvd;
+ if (!lbr_desc)
+ return;
+
memset(&lbr_desc->records, 0, sizeof(lbr_desc->records));
/*
@@ -543,7 +587,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters);
perf_capabilities = vcpu_get_perf_capabilities(vcpu);
- if (cpuid_model_is_consistent(vcpu) &&
+ if (intel_pmu_lbr_is_compatible(vcpu) &&
(perf_capabilities & PMU_CAP_LBR_FMT))
memcpy(&lbr_desc->records, &vmx_lbr_caps, sizeof(vmx_lbr_caps));
else
@@ -571,6 +615,9 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu)
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+ if (!lbr_desc)
+ return;
+
for (i = 0; i < KVM_MAX_NR_INTEL_GP_COUNTERS; i++) {
pmu->gp_counters[i].type = KVM_PMC_GP;
pmu->gp_counters[i].vcpu = vcpu;
@@ -606,11 +653,11 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
*/
static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu)
{
- u64 data = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ u64 data = vmx_guest_debugctl_read();
if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) {
data &= ~DEBUGCTLMSR_LBR;
- vmcs_write64(GUEST_IA32_DEBUGCTL, data);
+ vmx_guest_debugctl_write(vcpu, data);
}
}
@@ -678,9 +725,12 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+ if (WARN_ON_ONCE(!lbr_desc))
+ return;
+
if (!lbr_desc->event) {
vmx_disable_lbr_msrs_passthrough(vcpu);
- if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR)
+ if (vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR)
goto warn;
if (test_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use))
goto warn;
@@ -702,7 +752,7 @@ warn:
static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
{
- if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR))
+ if (!(vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR))
intel_pmu_release_guest_lbr_event(vcpu);
}
diff --git a/arch/x86/kvm/vmx/pmu_intel.h b/arch/x86/kvm/vmx/pmu_intel.h
new file mode 100644
index 000000000000..5620d0882cdc
--- /dev/null
+++ b/arch/x86/kvm/vmx/pmu_intel.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_PMU_INTEL_H
+#define __KVM_X86_VMX_PMU_INTEL_H
+
+#include <linux/kvm_host.h>
+
+bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu);
+int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu);
+
+struct lbr_desc {
+ /* Basic info about guest LBR records. */
+ struct x86_pmu_lbr records;
+
+ /*
+ * Emulate LBR feature via passthrough LBR registers when the
+ * per-vcpu guest LBR event is scheduled on the current pcpu.
+ *
+ * The records may be inaccurate if the host reclaims the LBR.
+ */
+ struct perf_event *event;
+
+ /* True if LBRs are marked as not intercepted in the MSR bitmap */
+ bool msr_passthrough;
+};
+
+extern struct x86_pmu_lbr vmx_lbr_caps;
+
+#endif /* __KVM_X86_VMX_PMU_INTEL_H */
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index d70e5b90087d..4a6d9a17da23 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -2,6 +2,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
+#include <linux/kvm_irqfd.h>
#include <asm/irq_remapping.h>
#include <asm/cpu.h>
@@ -11,6 +12,7 @@
#include "posted_intr.h"
#include "trace.h"
#include "vmx.h"
+#include "tdx.h"
/*
* Maintain a per-CPU list of vCPUs that need to be awakened by wakeup_handler()
@@ -33,9 +35,9 @@ static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock);
#define PI_LOCK_SCHED_OUT SINGLE_DEPTH_NESTING
-static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
+static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
{
- return &(to_vmx(vcpu)->pi_desc);
+ return &(to_vt(vcpu)->pi_desc);
}
static int pi_try_set_control(struct pi_desc *pi_desc, u64 *pold, u64 new)
@@ -55,7 +57,7 @@ static int pi_try_set_control(struct pi_desc *pi_desc, u64 *pold, u64 new)
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
- struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vcpu_vt *vt = to_vt(vcpu);
struct pi_desc old, new;
unsigned long flags;
unsigned int dest;
@@ -71,13 +73,10 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
/*
* If the vCPU wasn't on the wakeup list and wasn't migrated, then the
* full update can be skipped as neither the vector nor the destination
- * needs to be changed.
+ * needs to be changed. Clear SN even if there is no assigned device,
+ * again for simplicity.
*/
if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) {
- /*
- * Clear SN if it was set due to being preempted. Again, do
- * this even if there is no assigned device for simplicity.
- */
if (pi_test_and_clear_sn(pi_desc))
goto after_clear_sn;
return;
@@ -102,7 +101,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
*/
raw_spin_lock(spinlock);
spin_acquire(&spinlock->dep_map, PI_LOCK_SCHED_OUT, 0, _RET_IP_);
- list_del(&vmx->pi_wakeup_list);
+ list_del(&vt->pi_wakeup_list);
spin_release(&spinlock->dep_map, _RET_IP_);
raw_spin_unlock(spinlock);
}
@@ -147,9 +146,13 @@ after_clear_sn:
static bool vmx_can_use_vtd_pi(struct kvm *kvm)
{
- return irqchip_in_kernel(kvm) && enable_apicv &&
- kvm_arch_has_assigned_device(kvm) &&
- irq_remapping_cap(IRQ_POSTING_CAP);
+ /*
+ * Note, reading the number of possible bypass IRQs can race with a
+ * bypass IRQ being attached to the VM. vmx_pi_start_bypass() ensures
+ * blockng vCPUs will see an elevated count or get KVM_REQ_UNBLOCK.
+ */
+ return irqchip_in_kernel(kvm) && kvm_arch_has_irq_bypass() &&
+ READ_ONCE(kvm->arch.nr_possible_bypass_irqs);
}
/*
@@ -159,7 +162,7 @@ static bool vmx_can_use_vtd_pi(struct kvm *kvm)
static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
- struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vcpu_vt *vt = to_vt(vcpu);
struct pi_desc old, new;
lockdep_assert_irqs_disabled();
@@ -178,7 +181,7 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
*/
raw_spin_lock_nested(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu),
PI_LOCK_SCHED_OUT);
- list_add_tail(&vmx->pi_wakeup_list,
+ list_add_tail(&vt->pi_wakeup_list,
&per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu));
raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
@@ -213,7 +216,8 @@ static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu)
* notification vector is switched to the one that calls
* back to the pi_wakeup_handler() function.
*/
- return vmx_can_use_ipiv(vcpu) || vmx_can_use_vtd_pi(vcpu->kvm);
+ return (vmx_can_use_ipiv(vcpu) && !is_td_vcpu(vcpu)) ||
+ vmx_can_use_vtd_pi(vcpu->kvm);
}
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
@@ -223,15 +227,23 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
if (!vmx_needs_pi_wakeup(vcpu))
return;
- if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu))
- pi_enable_wakeup_handler(vcpu);
-
/*
- * Set SN when the vCPU is preempted. Note, the vCPU can both be seen
- * as blocking and preempted, e.g. if it's preempted between setting
- * its wait state and manually scheduling out.
+ * If the vCPU is blocking with IRQs enabled and ISN'T being preempted,
+ * enable the wakeup handler so that notification IRQ wakes the vCPU as
+ * expected. There is no need to enable the wakeup handler if the vCPU
+ * is preempted between setting its wait state and manually scheduling
+ * out, as the task is still runnable, i.e. doesn't need a wake event
+ * from KVM to be scheduled in.
+ *
+ * If the wakeup handler isn't being enabled, Suppress Notifications as
+ * the cost of propagating PIR.IRR to PID.ON is negligible compared to
+ * the cost of a spurious IRQ, and vCPU put/load is a slow path.
*/
- if (vcpu->preempted)
+ if (!vcpu->preempted && kvm_vcpu_is_blocking(vcpu) &&
+ ((is_td_vcpu(vcpu) && tdx_interrupt_allowed(vcpu)) ||
+ (!is_td_vcpu(vcpu) && !vmx_interrupt_blocked(vcpu))))
+ pi_enable_wakeup_handler(vcpu);
+ else
pi_set_sn(pi_desc);
}
@@ -243,13 +255,13 @@ void pi_wakeup_handler(void)
int cpu = smp_processor_id();
struct list_head *wakeup_list = &per_cpu(wakeup_vcpus_on_cpu, cpu);
raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, cpu);
- struct vcpu_vmx *vmx;
+ struct vcpu_vt *vt;
raw_spin_lock(spinlock);
- list_for_each_entry(vmx, wakeup_list, pi_wakeup_list) {
+ list_for_each_entry(vt, wakeup_list, pi_wakeup_list) {
- if (pi_test_on(&vmx->pi_desc))
- kvm_vcpu_wake_up(&vmx->vcpu);
+ if (pi_test_on(&vt->pi_desc))
+ kvm_vcpu_wake_up(vt_to_vcpu(vt));
}
raw_spin_unlock(spinlock);
}
@@ -260,6 +272,14 @@ void __init pi_init_cpu(int cpu)
raw_spin_lock_init(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
}
+void pi_apicv_pre_state_restore(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi = vcpu_to_pi_desc(vcpu);
+
+ pi_clear_on(pi);
+ memset(pi->pir, 0, sizeof(pi->pir));
+}
+
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
@@ -270,99 +290,30 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
/*
- * Bail out of the block loop if the VM has an assigned
- * device, but the blocking vCPU didn't reconfigure the
- * PI.NV to the wakeup vector, i.e. the assigned device
- * came along after the initial check in vmx_vcpu_pi_put().
+ * Kick all vCPUs when the first possible bypass IRQ is attached to a VM, as
+ * blocking vCPUs may scheduled out without reconfiguring PID.NV to the wakeup
+ * vector, i.e. if the bypass IRQ came along after vmx_vcpu_pi_put().
*/
-void vmx_pi_start_assignment(struct kvm *kvm)
+void vmx_pi_start_bypass(struct kvm *kvm)
{
- if (!irq_remapping_cap(IRQ_POSTING_CAP))
+ if (WARN_ON_ONCE(!vmx_can_use_vtd_pi(kvm)))
return;
kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK);
}
-/*
- * vmx_pi_update_irte - set IRTE for Posted-Interrupts
- *
- * @kvm: kvm
- * @host_irq: host irq of the interrupt
- * @guest_irq: gsi of the interrupt
- * @set: set or unset PI
- * returns 0 on success, < 0 on failure
- */
-int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set)
+int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+ unsigned int host_irq, uint32_t guest_irq,
+ struct kvm_vcpu *vcpu, u32 vector)
{
- struct kvm_kernel_irq_routing_entry *e;
- struct kvm_irq_routing_table *irq_rt;
- bool enable_remapped_mode = true;
- struct kvm_lapic_irq irq;
- struct kvm_vcpu *vcpu;
- struct vcpu_data vcpu_info;
- int idx, ret = 0;
-
- if (!vmx_can_use_vtd_pi(kvm))
- return 0;
-
- idx = srcu_read_lock(&kvm->irq_srcu);
- irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
- if (guest_irq >= irq_rt->nr_rt_entries ||
- hlist_empty(&irq_rt->map[guest_irq])) {
- pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
- guest_irq, irq_rt->nr_rt_entries);
- goto out;
+ if (vcpu) {
+ struct intel_iommu_pi_data pi_data = {
+ .pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)),
+ .vector = vector,
+ };
+
+ return irq_set_vcpu_affinity(host_irq, &pi_data);
+ } else {
+ return irq_set_vcpu_affinity(host_irq, NULL);
}
-
- hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
- if (e->type != KVM_IRQ_ROUTING_MSI)
- continue;
- /*
- * VT-d PI cannot support posting multicast/broadcast
- * interrupts to a vCPU, we still use interrupt remapping
- * for these kind of interrupts.
- *
- * For lowest-priority interrupts, we only support
- * those with single CPU as the destination, e.g. user
- * configures the interrupts via /proc/irq or uses
- * irqbalance to make the interrupts single-CPU.
- *
- * We will support full lowest-priority interrupt later.
- *
- * In addition, we can only inject generic interrupts using
- * the PI mechanism, refuse to route others through it.
- */
-
- kvm_set_msi_irq(kvm, e, &irq);
- if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
- !kvm_irq_is_postable(&irq))
- continue;
-
- vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
- vcpu_info.vector = irq.vector;
-
- trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
- vcpu_info.vector, vcpu_info.pi_desc_addr, set);
-
- if (!set)
- continue;
-
- enable_remapped_mode = false;
-
- ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
- if (ret < 0) {
- printk(KERN_INFO "%s: failed to update PI IRTE\n",
- __func__);
- goto out;
- }
- }
-
- if (enable_remapped_mode)
- ret = irq_set_vcpu_affinity(host_irq, NULL);
-
- ret = 0;
-out:
- srcu_read_unlock(&kvm->irq_srcu, idx);
- return ret;
}
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
index ad9116a99bcc..a4af39948cf0 100644
--- a/arch/x86/kvm/vmx/posted_intr.h
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -3,22 +3,27 @@
#define __KVM_X86_VMX_POSTED_INTR_H
#include <linux/bitmap.h>
+#include <linux/find.h>
+#include <linux/kvm_host.h>
+
#include <asm/posted_intr.h>
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);
void pi_wakeup_handler(void);
void __init pi_init_cpu(int cpu);
+void pi_apicv_pre_state_restore(struct kvm_vcpu *vcpu);
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
-int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set);
-void vmx_pi_start_assignment(struct kvm *kvm);
+int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+ unsigned int host_irq, uint32_t guest_irq,
+ struct kvm_vcpu *vcpu, u32 vector);
+void vmx_pi_start_bypass(struct kvm *kvm);
static inline int pi_find_highest_vector(struct pi_desc *pi_desc)
{
int vec;
- vec = find_last_bit((unsigned long *)pi_desc->pir, 256);
+ vec = find_last_bit(pi_desc->pir, 256);
return vec < 256 ? vec : -1;
}
diff --git a/arch/x86/kvm/vmx/run_flags.h b/arch/x86/kvm/vmx/run_flags.h
index 6a9bfdfbb6e5..2f20fb170def 100644
--- a/arch/x86/kvm/vmx/run_flags.h
+++ b/arch/x86/kvm/vmx/run_flags.h
@@ -2,10 +2,12 @@
#ifndef __KVM_X86_VMX_RUN_FLAGS_H
#define __KVM_X86_VMX_RUN_FLAGS_H
-#define VMX_RUN_VMRESUME_SHIFT 0
-#define VMX_RUN_SAVE_SPEC_CTRL_SHIFT 1
+#define VMX_RUN_VMRESUME_SHIFT 0
+#define VMX_RUN_SAVE_SPEC_CTRL_SHIFT 1
+#define VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO_SHIFT 2
-#define VMX_RUN_VMRESUME BIT(VMX_RUN_VMRESUME_SHIFT)
-#define VMX_RUN_SAVE_SPEC_CTRL BIT(VMX_RUN_SAVE_SPEC_CTRL_SHIFT)
+#define VMX_RUN_VMRESUME BIT(VMX_RUN_VMRESUME_SHIFT)
+#define VMX_RUN_SAVE_SPEC_CTRL BIT(VMX_RUN_SAVE_SPEC_CTRL_SHIFT)
+#define VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO BIT(VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO_SHIFT)
#endif /* __KVM_X86_VMX_RUN_FLAGS_H */
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
new file mode 100644
index 000000000000..66744f5768c8
--- /dev/null
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -0,0 +1,3643 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cleanup.h>
+#include <linux/cpu.h>
+#include <asm/cpufeature.h>
+#include <asm/fpu/xcr.h>
+#include <linux/misc_cgroup.h>
+#include <linux/mmu_context.h>
+#include <asm/tdx.h>
+#include "capabilities.h"
+#include "mmu.h"
+#include "x86_ops.h"
+#include "lapic.h"
+#include "tdx.h"
+#include "vmx.h"
+#include "mmu/spte.h"
+#include "common.h"
+#include "posted_intr.h"
+#include "irq.h"
+#include <trace/events/kvm.h>
+#include "trace.h"
+
+#pragma GCC poison to_vmx
+
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#define pr_tdx_error(__fn, __err) \
+ pr_err_ratelimited("SEAMCALL %s failed: 0x%llx\n", #__fn, __err)
+
+#define __pr_tdx_error_N(__fn_str, __err, __fmt, ...) \
+ pr_err_ratelimited("SEAMCALL " __fn_str " failed: 0x%llx, " __fmt, __err, __VA_ARGS__)
+
+#define pr_tdx_error_1(__fn, __err, __rcx) \
+ __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx\n", __rcx)
+
+#define pr_tdx_error_2(__fn, __err, __rcx, __rdx) \
+ __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx\n", __rcx, __rdx)
+
+#define pr_tdx_error_3(__fn, __err, __rcx, __rdx, __r8) \
+ __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx, r8 0x%llx\n", __rcx, __rdx, __r8)
+
+bool enable_tdx __ro_after_init;
+module_param_named(tdx, enable_tdx, bool, 0444);
+
+#define TDX_SHARED_BIT_PWL_5 gpa_to_gfn(BIT_ULL(51))
+#define TDX_SHARED_BIT_PWL_4 gpa_to_gfn(BIT_ULL(47))
+
+static enum cpuhp_state tdx_cpuhp_state;
+
+static const struct tdx_sys_info *tdx_sysinfo;
+
+void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err)
+{
+ KVM_BUG_ON(1, tdx->vcpu.kvm);
+ pr_err("TDH_VP_RD[%s.0x%x] failed 0x%llx\n", uclass, field, err);
+}
+
+void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field,
+ u64 val, u64 err)
+{
+ KVM_BUG_ON(1, tdx->vcpu.kvm);
+ pr_err("TDH_VP_WR[%s.0x%x]%s0x%llx failed: 0x%llx\n", uclass, field, op, val, err);
+}
+
+#define KVM_SUPPORTED_TD_ATTRS (TDX_TD_ATTR_SEPT_VE_DISABLE)
+
+static __always_inline struct kvm_tdx *to_kvm_tdx(struct kvm *kvm)
+{
+ return container_of(kvm, struct kvm_tdx, kvm);
+}
+
+static __always_inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu)
+{
+ return container_of(vcpu, struct vcpu_tdx, vcpu);
+}
+
+static u64 tdx_get_supported_attrs(const struct tdx_sys_info_td_conf *td_conf)
+{
+ u64 val = KVM_SUPPORTED_TD_ATTRS;
+
+ if ((val & td_conf->attributes_fixed1) != td_conf->attributes_fixed1)
+ return 0;
+
+ val &= td_conf->attributes_fixed0;
+
+ return val;
+}
+
+static u64 tdx_get_supported_xfam(const struct tdx_sys_info_td_conf *td_conf)
+{
+ u64 val = kvm_caps.supported_xcr0 | kvm_caps.supported_xss;
+
+ if ((val & td_conf->xfam_fixed1) != td_conf->xfam_fixed1)
+ return 0;
+
+ val &= td_conf->xfam_fixed0;
+
+ return val;
+}
+
+static int tdx_get_guest_phys_addr_bits(const u32 eax)
+{
+ return (eax & GENMASK(23, 16)) >> 16;
+}
+
+static u32 tdx_set_guest_phys_addr_bits(const u32 eax, int addr_bits)
+{
+ return (eax & ~GENMASK(23, 16)) | (addr_bits & 0xff) << 16;
+}
+
+#define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM))
+
+static bool has_tsx(const struct kvm_cpuid_entry2 *entry)
+{
+ return entry->function == 7 && entry->index == 0 &&
+ (entry->ebx & TDX_FEATURE_TSX);
+}
+
+static void clear_tsx(struct kvm_cpuid_entry2 *entry)
+{
+ entry->ebx &= ~TDX_FEATURE_TSX;
+}
+
+static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry)
+{
+ return entry->function == 7 && entry->index == 0 &&
+ (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG));
+}
+
+static void clear_waitpkg(struct kvm_cpuid_entry2 *entry)
+{
+ entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG);
+}
+
+static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
+{
+ if (has_tsx(entry))
+ clear_tsx(entry);
+
+ if (has_waitpkg(entry))
+ clear_waitpkg(entry);
+}
+
+static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry)
+{
+ return has_tsx(entry) || has_waitpkg(entry);
+}
+
+#define KVM_TDX_CPUID_NO_SUBLEAF ((__u32)-1)
+
+static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx)
+{
+ const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
+
+ entry->function = (u32)td_conf->cpuid_config_leaves[idx];
+ entry->index = td_conf->cpuid_config_leaves[idx] >> 32;
+ entry->eax = (u32)td_conf->cpuid_config_values[idx][0];
+ entry->ebx = td_conf->cpuid_config_values[idx][0] >> 32;
+ entry->ecx = (u32)td_conf->cpuid_config_values[idx][1];
+ entry->edx = td_conf->cpuid_config_values[idx][1] >> 32;
+
+ if (entry->index == KVM_TDX_CPUID_NO_SUBLEAF)
+ entry->index = 0;
+
+ /*
+ * The TDX module doesn't allow configuring the guest phys addr bits
+ * (EAX[23:16]). However, KVM uses it as an interface to the userspace
+ * to configure the GPAW. Report these bits as configurable.
+ */
+ if (entry->function == 0x80000008)
+ entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff);
+
+ tdx_clear_unsupported_cpuid(entry);
+}
+
+#define TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT BIT(1)
+
+static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
+ struct kvm_tdx_capabilities *caps)
+{
+ int i;
+
+ caps->supported_attrs = tdx_get_supported_attrs(td_conf);
+ if (!caps->supported_attrs)
+ return -EIO;
+
+ caps->supported_xfam = tdx_get_supported_xfam(td_conf);
+ if (!caps->supported_xfam)
+ return -EIO;
+
+ caps->cpuid.nent = td_conf->num_cpuid_config;
+
+ caps->user_tdvmcallinfo_1_r11 =
+ TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT;
+
+ for (i = 0; i < td_conf->num_cpuid_config; i++)
+ td_init_cpuid_entry2(&caps->cpuid.entries[i], i);
+
+ return 0;
+}
+
+/*
+ * Some SEAMCALLs acquire the TDX module globally, and can fail with
+ * TDX_OPERAND_BUSY. Use a global mutex to serialize these SEAMCALLs.
+ */
+static DEFINE_MUTEX(tdx_lock);
+
+static atomic_t nr_configured_hkid;
+
+static bool tdx_operand_busy(u64 err)
+{
+ return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY;
+}
+
+
+/*
+ * A per-CPU list of TD vCPUs associated with a given CPU.
+ * Protected by interrupt mask. Only manipulated by the CPU owning this per-CPU
+ * list.
+ * - When a vCPU is loaded onto a CPU, it is removed from the per-CPU list of
+ * the old CPU during the IPI callback running on the old CPU, and then added
+ * to the per-CPU list of the new CPU.
+ * - When a TD is tearing down, all vCPUs are disassociated from their current
+ * running CPUs and removed from the per-CPU list during the IPI callback
+ * running on those CPUs.
+ * - When a CPU is brought down, traverse the per-CPU list to disassociate all
+ * associated TD vCPUs and remove them from the per-CPU list.
+ */
+static DEFINE_PER_CPU(struct list_head, associated_tdvcpus);
+
+static __always_inline unsigned long tdvmcall_exit_type(struct kvm_vcpu *vcpu)
+{
+ return to_tdx(vcpu)->vp_enter_args.r10;
+}
+
+static __always_inline unsigned long tdvmcall_leaf(struct kvm_vcpu *vcpu)
+{
+ return to_tdx(vcpu)->vp_enter_args.r11;
+}
+
+static __always_inline void tdvmcall_set_return_code(struct kvm_vcpu *vcpu,
+ long val)
+{
+ to_tdx(vcpu)->vp_enter_args.r10 = val;
+}
+
+static __always_inline void tdvmcall_set_return_val(struct kvm_vcpu *vcpu,
+ unsigned long val)
+{
+ to_tdx(vcpu)->vp_enter_args.r11 = val;
+}
+
+static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx)
+{
+ tdx_guest_keyid_free(kvm_tdx->hkid);
+ kvm_tdx->hkid = -1;
+ atomic_dec(&nr_configured_hkid);
+ misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
+ put_misc_cg(kvm_tdx->misc_cg);
+ kvm_tdx->misc_cg = NULL;
+}
+
+static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
+{
+ return kvm_tdx->hkid > 0;
+}
+
+static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
+{
+ lockdep_assert_irqs_disabled();
+
+ list_del(&to_tdx(vcpu)->cpu_list);
+
+ /*
+ * Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1,
+ * otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU
+ * to its list before it's deleted from this CPU's list.
+ */
+ smp_wmb();
+
+ vcpu->cpu = -1;
+}
+
+static void tdx_clear_page(struct page *page)
+{
+ const void *zero_page = (const void *) page_to_virt(ZERO_PAGE(0));
+ void *dest = page_to_virt(page);
+ unsigned long i;
+
+ /*
+ * The page could have been poisoned. MOVDIR64B also clears
+ * the poison bit so the kernel can safely use the page again.
+ */
+ for (i = 0; i < PAGE_SIZE; i += 64)
+ movdir64b(dest + i, zero_page);
+ /*
+ * MOVDIR64B store uses WC buffer. Prevent following memory reads
+ * from seeing potentially poisoned cache.
+ */
+ __mb();
+}
+
+static void tdx_no_vcpus_enter_start(struct kvm *kvm)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true);
+
+ kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
+}
+
+static void tdx_no_vcpus_enter_stop(struct kvm *kvm)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false);
+}
+
+/* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
+static int __tdx_reclaim_page(struct page *page)
+{
+ u64 err, rcx, rdx, r8;
+
+ err = tdh_phymem_page_reclaim(page, &rcx, &rdx, &r8);
+
+ /*
+ * No need to check for TDX_OPERAND_BUSY; all TD pages are freed
+ * before the HKID is released and control pages have also been
+ * released at this point, so there is no possibility of contention.
+ */
+ if (WARN_ON_ONCE(err)) {
+ pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8);
+ return -EIO;
+ }
+ return 0;
+}
+
+static int tdx_reclaim_page(struct page *page)
+{
+ int r;
+
+ r = __tdx_reclaim_page(page);
+ if (!r)
+ tdx_clear_page(page);
+ return r;
+}
+
+
+/*
+ * Reclaim the TD control page(s) which are crypto-protected by TDX guest's
+ * private KeyID. Assume the cache associated with the TDX private KeyID has
+ * been flushed.
+ */
+static void tdx_reclaim_control_page(struct page *ctrl_page)
+{
+ /*
+ * Leak the page if the kernel failed to reclaim the page.
+ * The kernel cannot use it safely anymore.
+ */
+ if (tdx_reclaim_page(ctrl_page))
+ return;
+
+ __free_page(ctrl_page);
+}
+
+struct tdx_flush_vp_arg {
+ struct kvm_vcpu *vcpu;
+ u64 err;
+};
+
+static void tdx_flush_vp(void *_arg)
+{
+ struct tdx_flush_vp_arg *arg = _arg;
+ struct kvm_vcpu *vcpu = arg->vcpu;
+ u64 err;
+
+ arg->err = 0;
+ lockdep_assert_irqs_disabled();
+
+ /* Task migration can race with CPU offlining. */
+ if (unlikely(vcpu->cpu != raw_smp_processor_id()))
+ return;
+
+ /*
+ * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized. The
+ * list tracking still needs to be updated so that it's correct if/when
+ * the vCPU does get initialized.
+ */
+ if (to_tdx(vcpu)->state != VCPU_TD_STATE_UNINITIALIZED) {
+ /*
+ * No need to retry. TDX Resources needed for TDH.VP.FLUSH are:
+ * TDVPR as exclusive, TDR as shared, and TDCS as shared. This
+ * vp flush function is called when destructing vCPU/TD or vCPU
+ * migration. No other thread uses TDVPR in those cases.
+ */
+ err = tdh_vp_flush(&to_tdx(vcpu)->vp);
+ if (unlikely(err && err != TDX_VCPU_NOT_ASSOCIATED)) {
+ /*
+ * This function is called in IPI context. Do not use
+ * printk to avoid console semaphore.
+ * The caller prints out the error message, instead.
+ */
+ if (err)
+ arg->err = err;
+ }
+ }
+
+ tdx_disassociate_vp(vcpu);
+}
+
+static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu)
+{
+ struct tdx_flush_vp_arg arg = {
+ .vcpu = vcpu,
+ };
+ int cpu = vcpu->cpu;
+
+ if (unlikely(cpu == -1))
+ return;
+
+ smp_call_function_single(cpu, tdx_flush_vp, &arg, 1);
+ if (KVM_BUG_ON(arg.err, vcpu->kvm))
+ pr_tdx_error(TDH_VP_FLUSH, arg.err);
+}
+
+void tdx_disable_virtualization_cpu(void)
+{
+ int cpu = raw_smp_processor_id();
+ struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu);
+ struct tdx_flush_vp_arg arg;
+ struct vcpu_tdx *tdx, *tmp;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ /* Safe variant needed as tdx_disassociate_vp() deletes the entry. */
+ list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list) {
+ arg.vcpu = &tdx->vcpu;
+ tdx_flush_vp(&arg);
+ }
+ local_irq_restore(flags);
+}
+
+#define TDX_SEAMCALL_RETRIES 10000
+
+static void smp_func_do_phymem_cache_wb(void *unused)
+{
+ u64 err = 0;
+ bool resume;
+ int i;
+
+ /*
+ * TDH.PHYMEM.CACHE.WB flushes caches associated with any TDX private
+ * KeyID on the package or core. The TDX module may not finish the
+ * cache flush but return TDX_INTERRUPTED_RESUMEABLE instead. The
+ * kernel should retry it until it returns success w/o rescheduling.
+ */
+ for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) {
+ resume = !!err;
+ err = tdh_phymem_cache_wb(resume);
+ switch (err) {
+ case TDX_INTERRUPTED_RESUMABLE:
+ continue;
+ case TDX_NO_HKID_READY_TO_WBCACHE:
+ err = TDX_SUCCESS; /* Already done by other thread */
+ fallthrough;
+ default:
+ goto out;
+ }
+ }
+
+out:
+ if (WARN_ON_ONCE(err))
+ pr_tdx_error(TDH_PHYMEM_CACHE_WB, err);
+}
+
+void tdx_mmu_release_hkid(struct kvm *kvm)
+{
+ bool packages_allocated, targets_allocated;
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ cpumask_var_t packages, targets;
+ struct kvm_vcpu *vcpu;
+ unsigned long j;
+ int i;
+ u64 err;
+
+ if (!is_hkid_assigned(kvm_tdx))
+ return;
+
+ packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
+ targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
+ cpus_read_lock();
+
+ kvm_for_each_vcpu(j, vcpu, kvm)
+ tdx_flush_vp_on_cpu(vcpu);
+
+ /*
+ * TDH.PHYMEM.CACHE.WB tries to acquire the TDX module global lock
+ * and can fail with TDX_OPERAND_BUSY when it fails to get the lock.
+ * Multiple TDX guests can be destroyed simultaneously. Take the
+ * mutex to prevent it from getting error.
+ */
+ mutex_lock(&tdx_lock);
+
+ /*
+ * Releasing HKID is in vm_destroy().
+ * After the above flushing vps, there should be no more vCPU
+ * associations, as all vCPU fds have been released at this stage.
+ */
+ err = tdh_mng_vpflushdone(&kvm_tdx->td);
+ if (err == TDX_FLUSHVP_NOT_DONE)
+ goto out;
+ if (KVM_BUG_ON(err, kvm)) {
+ pr_tdx_error(TDH_MNG_VPFLUSHDONE, err);
+ pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
+ kvm_tdx->hkid);
+ goto out;
+ }
+
+ for_each_online_cpu(i) {
+ if (packages_allocated &&
+ cpumask_test_and_set_cpu(topology_physical_package_id(i),
+ packages))
+ continue;
+ if (targets_allocated)
+ cpumask_set_cpu(i, targets);
+ }
+ if (targets_allocated)
+ on_each_cpu_mask(targets, smp_func_do_phymem_cache_wb, NULL, true);
+ else
+ on_each_cpu(smp_func_do_phymem_cache_wb, NULL, true);
+ /*
+ * In the case of error in smp_func_do_phymem_cache_wb(), the following
+ * tdh_mng_key_freeid() will fail.
+ */
+ err = tdh_mng_key_freeid(&kvm_tdx->td);
+ if (KVM_BUG_ON(err, kvm)) {
+ pr_tdx_error(TDH_MNG_KEY_FREEID, err);
+ pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
+ kvm_tdx->hkid);
+ } else {
+ tdx_hkid_free(kvm_tdx);
+ }
+
+out:
+ mutex_unlock(&tdx_lock);
+ cpus_read_unlock();
+ free_cpumask_var(targets);
+ free_cpumask_var(packages);
+}
+
+static void tdx_reclaim_td_control_pages(struct kvm *kvm)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ u64 err;
+ int i;
+
+ /*
+ * tdx_mmu_release_hkid() failed to reclaim HKID. Something went wrong
+ * heavily with TDX module. Give up freeing TD pages. As the function
+ * already warned, don't warn it again.
+ */
+ if (is_hkid_assigned(kvm_tdx))
+ return;
+
+ if (kvm_tdx->td.tdcs_pages) {
+ for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
+ if (!kvm_tdx->td.tdcs_pages[i])
+ continue;
+
+ tdx_reclaim_control_page(kvm_tdx->td.tdcs_pages[i]);
+ }
+ kfree(kvm_tdx->td.tdcs_pages);
+ kvm_tdx->td.tdcs_pages = NULL;
+ }
+
+ if (!kvm_tdx->td.tdr_page)
+ return;
+
+ if (__tdx_reclaim_page(kvm_tdx->td.tdr_page))
+ return;
+
+ /*
+ * Use a SEAMCALL to ask the TDX module to flush the cache based on the
+ * KeyID. TDX module may access TDR while operating on TD (Especially
+ * when it is reclaiming TDCS).
+ */
+ err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td);
+ if (KVM_BUG_ON(err, kvm)) {
+ pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
+ return;
+ }
+ tdx_clear_page(kvm_tdx->td.tdr_page);
+
+ __free_page(kvm_tdx->td.tdr_page);
+ kvm_tdx->td.tdr_page = NULL;
+}
+
+void tdx_vm_destroy(struct kvm *kvm)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+
+ tdx_reclaim_td_control_pages(kvm);
+
+ kvm_tdx->state = TD_STATE_UNINITIALIZED;
+}
+
+static int tdx_do_tdh_mng_key_config(void *param)
+{
+ struct kvm_tdx *kvm_tdx = param;
+ u64 err;
+
+ /* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */
+ err = tdh_mng_key_config(&kvm_tdx->td);
+
+ if (KVM_BUG_ON(err, &kvm_tdx->kvm)) {
+ pr_tdx_error(TDH_MNG_KEY_CONFIG, err);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+int tdx_vm_init(struct kvm *kvm)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+
+ kvm->arch.has_protected_state = true;
+ kvm->arch.has_private_mem = true;
+ kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT;
+
+ /*
+ * Because guest TD is protected, VMM can't parse the instruction in TD.
+ * Instead, guest uses MMIO hypercall. For unmodified device driver,
+ * #VE needs to be injected for MMIO and #VE handler in TD converts MMIO
+ * instruction into MMIO hypercall.
+ *
+ * SPTE value for MMIO needs to be setup so that #VE is injected into
+ * TD instead of triggering EPT MISCONFIG.
+ * - RWX=0 so that EPT violation is triggered.
+ * - suppress #VE bit is cleared to inject #VE.
+ */
+ kvm_mmu_set_mmio_spte_value(kvm, 0);
+
+ /*
+ * TDX has its own limit of maximum vCPUs it can support for all
+ * TDX guests in addition to KVM_MAX_VCPUS. TDX module reports
+ * such limit via the MAX_VCPU_PER_TD global metadata. In
+ * practice, it reflects the number of logical CPUs that ALL
+ * platforms that the TDX module supports can possibly have.
+ *
+ * Limit TDX guest's maximum vCPUs to the number of logical CPUs
+ * the platform has. Simply forwarding the MAX_VCPU_PER_TD to
+ * userspace would result in an unpredictable ABI.
+ */
+ kvm->max_vcpus = min_t(int, kvm->max_vcpus, num_present_cpus());
+
+ kvm_tdx->state = TD_STATE_UNINITIALIZED;
+
+ return 0;
+}
+
+int tdx_vcpu_create(struct kvm_vcpu *vcpu)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+
+ if (kvm_tdx->state != TD_STATE_INITIALIZED)
+ return -EIO;
+
+ /*
+ * TDX module mandates APICv, which requires an in-kernel local APIC.
+ * Disallow an in-kernel I/O APIC, because level-triggered interrupts
+ * and thus the I/O APIC as a whole can't be faithfully emulated in KVM.
+ */
+ if (!irqchip_split(vcpu->kvm))
+ return -EINVAL;
+
+ fpstate_set_confidential(&vcpu->arch.guest_fpu);
+ vcpu->arch.apic->guest_apic_protected = true;
+ INIT_LIST_HEAD(&tdx->vt.pi_wakeup_list);
+
+ vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX;
+
+ vcpu->arch.switch_db_regs = KVM_DEBUGREG_AUTO_SWITCH;
+ vcpu->arch.cr0_guest_owned_bits = -1ul;
+ vcpu->arch.cr4_guest_owned_bits = -1ul;
+
+ /* KVM can't change TSC offset/multiplier as TDX module manages them. */
+ vcpu->arch.guest_tsc_protected = true;
+ vcpu->arch.tsc_offset = kvm_tdx->tsc_offset;
+ vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset;
+ vcpu->arch.tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
+ vcpu->arch.l1_tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
+
+ vcpu->arch.guest_state_protected =
+ !(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTR_DEBUG);
+
+ if ((kvm_tdx->xfam & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE)
+ vcpu->arch.xfd_no_write_intercept = true;
+
+ tdx->vt.pi_desc.nv = POSTED_INTR_VECTOR;
+ __pi_set_sn(&tdx->vt.pi_desc);
+
+ tdx->state = VCPU_TD_STATE_UNINITIALIZED;
+
+ return 0;
+}
+
+void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+
+ vmx_vcpu_pi_load(vcpu, cpu);
+ if (vcpu->cpu == cpu || !is_hkid_assigned(to_kvm_tdx(vcpu->kvm)))
+ return;
+
+ tdx_flush_vp_on_cpu(vcpu);
+
+ KVM_BUG_ON(cpu != raw_smp_processor_id(), vcpu->kvm);
+ local_irq_disable();
+ /*
+ * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure
+ * vcpu->cpu is read before tdx->cpu_list.
+ */
+ smp_rmb();
+
+ list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu));
+ local_irq_enable();
+}
+
+bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+ /*
+ * KVM can't get the interrupt status of TDX guest and it assumes
+ * interrupt is always allowed unless TDX guest calls TDVMCALL with HLT,
+ * which passes the interrupt blocked flag.
+ */
+ return vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
+ !to_tdx(vcpu)->vp_enter_args.r12;
+}
+
+static bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
+{
+ u64 vcpu_state_details;
+
+ if (pi_has_pending_interrupt(vcpu))
+ return true;
+
+ /*
+ * Only check RVI pending for HALTED case with IRQ enabled.
+ * For non-HLT cases, KVM doesn't care about STI/SS shadows. And if the
+ * interrupt was pending before TD exit, then it _must_ be blocked,
+ * otherwise the interrupt would have been serviced at the instruction
+ * boundary.
+ */
+ if (vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
+ to_tdx(vcpu)->vp_enter_args.r12)
+ return false;
+
+ vcpu_state_details =
+ td_state_non_arch_read64(to_tdx(vcpu), TD_VCPU_STATE_DETAILS_NON_ARCH);
+
+ return tdx_vcpu_state_details_intr_pending(vcpu_state_details);
+}
+
+/*
+ * Compared to vmx_prepare_switch_to_guest(), there is not much to do
+ * as SEAMCALL/SEAMRET calls take care of most of save and restore.
+ */
+void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vt *vt = to_vt(vcpu);
+
+ if (vt->guest_state_loaded)
+ return;
+
+ if (likely(is_64bit_mm(current->mm)))
+ vt->msr_host_kernel_gs_base = current->thread.gsbase;
+ else
+ vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
+
+ vt->guest_state_loaded = true;
+}
+
+struct tdx_uret_msr {
+ u32 msr;
+ unsigned int slot;
+ u64 defval;
+};
+
+static struct tdx_uret_msr tdx_uret_msrs[] = {
+ {.msr = MSR_SYSCALL_MASK, .defval = 0x20200 },
+ {.msr = MSR_STAR,},
+ {.msr = MSR_LSTAR,},
+ {.msr = MSR_TSC_AUX,},
+};
+
+static void tdx_user_return_msr_update_cache(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++)
+ kvm_user_return_msr_update_cache(tdx_uret_msrs[i].slot,
+ tdx_uret_msrs[i].defval);
+}
+
+static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vt *vt = to_vt(vcpu);
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+
+ if (!vt->guest_state_loaded)
+ return;
+
+ ++vcpu->stat.host_state_reload;
+ wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base);
+
+ if (tdx->guest_entered) {
+ tdx_user_return_msr_update_cache();
+ tdx->guest_entered = false;
+ }
+
+ vt->guest_state_loaded = false;
+}
+
+void tdx_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ vmx_vcpu_pi_put(vcpu);
+ tdx_prepare_switch_to_host(vcpu);
+}
+
+void tdx_vcpu_free(struct kvm_vcpu *vcpu)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ int i;
+
+ /*
+ * It is not possible to reclaim pages while hkid is assigned. It might
+ * be assigned if:
+ * 1. the TD VM is being destroyed but freeing hkid failed, in which
+ * case the pages are leaked
+ * 2. TD VCPU creation failed and this on the error path, in which case
+ * there is nothing to do anyway
+ */
+ if (is_hkid_assigned(kvm_tdx))
+ return;
+
+ if (tdx->vp.tdcx_pages) {
+ for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
+ if (tdx->vp.tdcx_pages[i])
+ tdx_reclaim_control_page(tdx->vp.tdcx_pages[i]);
+ }
+ kfree(tdx->vp.tdcx_pages);
+ tdx->vp.tdcx_pages = NULL;
+ }
+ if (tdx->vp.tdvpr_page) {
+ tdx_reclaim_control_page(tdx->vp.tdvpr_page);
+ tdx->vp.tdvpr_page = 0;
+ }
+
+ tdx->state = VCPU_TD_STATE_UNINITIALIZED;
+}
+
+int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu)
+{
+ if (unlikely(to_tdx(vcpu)->state != VCPU_TD_STATE_INITIALIZED ||
+ to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE))
+ return -EINVAL;
+
+ return 1;
+}
+
+static __always_inline u32 tdcall_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
+{
+ switch (tdvmcall_leaf(vcpu)) {
+ case EXIT_REASON_CPUID:
+ case EXIT_REASON_HLT:
+ case EXIT_REASON_IO_INSTRUCTION:
+ case EXIT_REASON_MSR_READ:
+ case EXIT_REASON_MSR_WRITE:
+ return tdvmcall_leaf(vcpu);
+ case EXIT_REASON_EPT_VIOLATION:
+ return EXIT_REASON_EPT_MISCONFIG;
+ default:
+ break;
+ }
+
+ return EXIT_REASON_TDCALL;
+}
+
+static __always_inline u32 tdx_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ u32 exit_reason;
+
+ switch (tdx->vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) {
+ case TDX_SUCCESS:
+ case TDX_NON_RECOVERABLE_VCPU:
+ case TDX_NON_RECOVERABLE_TD:
+ case TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE:
+ case TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE:
+ break;
+ default:
+ return -1u;
+ }
+
+ exit_reason = tdx->vp_enter_ret;
+
+ switch (exit_reason) {
+ case EXIT_REASON_TDCALL:
+ if (tdvmcall_exit_type(vcpu))
+ return EXIT_REASON_VMCALL;
+
+ return tdcall_to_vmx_exit_reason(vcpu);
+ case EXIT_REASON_EPT_MISCONFIG:
+ /*
+ * Defer KVM_BUG_ON() until tdx_handle_exit() because this is in
+ * non-instrumentable code with interrupts disabled.
+ */
+ return -1u;
+ default:
+ break;
+ }
+
+ return exit_reason;
+}
+
+static noinstr void tdx_vcpu_enter_exit(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ struct vcpu_vt *vt = to_vt(vcpu);
+
+ guest_state_enter_irqoff();
+
+ tdx->vp_enter_ret = tdh_vp_enter(&tdx->vp, &tdx->vp_enter_args);
+
+ vt->exit_reason.full = tdx_to_vmx_exit_reason(vcpu);
+
+ vt->exit_qualification = tdx->vp_enter_args.rcx;
+ tdx->ext_exit_qualification = tdx->vp_enter_args.rdx;
+ tdx->exit_gpa = tdx->vp_enter_args.r8;
+ vt->exit_intr_info = tdx->vp_enter_args.r9;
+
+ vmx_handle_nmi(vcpu);
+
+ guest_state_exit_irqoff();
+}
+
+static bool tdx_failed_vmentry(struct kvm_vcpu *vcpu)
+{
+ return vmx_get_exit_reason(vcpu).failed_vmentry &&
+ vmx_get_exit_reason(vcpu).full != -1u;
+}
+
+static fastpath_t tdx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+{
+ u64 vp_enter_ret = to_tdx(vcpu)->vp_enter_ret;
+
+ /*
+ * TDX_OPERAND_BUSY could be returned for SEPT due to 0-step mitigation
+ * or for TD EPOCH due to contention with TDH.MEM.TRACK on TDH.VP.ENTER.
+ *
+ * When KVM requests KVM_REQ_OUTSIDE_GUEST_MODE, which has both
+ * KVM_REQUEST_WAIT and KVM_REQUEST_NO_ACTION set, it requires target
+ * vCPUs leaving fastpath so that interrupt can be enabled to ensure the
+ * IPIs can be delivered. Return EXIT_FASTPATH_EXIT_HANDLED instead of
+ * EXIT_FASTPATH_REENTER_GUEST to exit fastpath, otherwise, the
+ * requester may be blocked endlessly.
+ */
+ if (unlikely(tdx_operand_busy(vp_enter_ret)))
+ return EXIT_FASTPATH_EXIT_HANDLED;
+
+ return EXIT_FASTPATH_NONE;
+}
+
+#define TDX_REGS_AVAIL_SET (BIT_ULL(VCPU_EXREG_EXIT_INFO_1) | \
+ BIT_ULL(VCPU_EXREG_EXIT_INFO_2) | \
+ BIT_ULL(VCPU_REGS_RAX) | \
+ BIT_ULL(VCPU_REGS_RBX) | \
+ BIT_ULL(VCPU_REGS_RCX) | \
+ BIT_ULL(VCPU_REGS_RDX) | \
+ BIT_ULL(VCPU_REGS_RBP) | \
+ BIT_ULL(VCPU_REGS_RSI) | \
+ BIT_ULL(VCPU_REGS_RDI) | \
+ BIT_ULL(VCPU_REGS_R8) | \
+ BIT_ULL(VCPU_REGS_R9) | \
+ BIT_ULL(VCPU_REGS_R10) | \
+ BIT_ULL(VCPU_REGS_R11) | \
+ BIT_ULL(VCPU_REGS_R12) | \
+ BIT_ULL(VCPU_REGS_R13) | \
+ BIT_ULL(VCPU_REGS_R14) | \
+ BIT_ULL(VCPU_REGS_R15))
+
+static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+
+ /*
+ * All TDX hosts support PKRU; but even if they didn't,
+ * vcpu->arch.host_pkru would be 0 and the wrpkru would be
+ * skipped.
+ */
+ if (vcpu->arch.host_pkru != 0)
+ wrpkru(vcpu->arch.host_pkru);
+
+ if (kvm_host.xcr0 != (kvm_tdx->xfam & kvm_caps.supported_xcr0))
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
+
+ /*
+ * Likewise, even if a TDX hosts didn't support XSS both arms of
+ * the comparison would be 0 and the wrmsrl would be skipped.
+ */
+ if (kvm_host.xss != (kvm_tdx->xfam & kvm_caps.supported_xss))
+ wrmsrl(MSR_IA32_XSS, kvm_host.xss);
+}
+
+#define TDX_DEBUGCTL_PRESERVED (DEBUGCTLMSR_BTF | \
+ DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI | \
+ DEBUGCTLMSR_FREEZE_IN_SMM)
+
+fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ struct vcpu_vt *vt = to_vt(vcpu);
+
+ /*
+ * WARN if KVM wants to force an immediate exit, as the TDX module does
+ * not guarantee entry into the guest, i.e. it's possible for KVM to
+ * _think_ it completed entry to the guest and forced an immediate exit
+ * without actually having done so. Luckily, KVM never needs to force
+ * an immediate exit for TDX (KVM can't do direct event injection, so
+ * just WARN and continue on.
+ */
+ WARN_ON_ONCE(run_flags);
+
+ /*
+ * Wait until retry of SEPT-zap-related SEAMCALL completes before
+ * allowing vCPU entry to avoid contention with tdh_vp_enter() and
+ * TDCALLs.
+ */
+ if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap)))
+ return EXIT_FASTPATH_EXIT_HANDLED;
+
+ trace_kvm_entry(vcpu, run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT);
+
+ if (pi_test_on(&vt->pi_desc)) {
+ apic->send_IPI_self(POSTED_INTR_VECTOR);
+
+ if (pi_test_pir(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTT) &
+ APIC_VECTOR_MASK, &vt->pi_desc))
+ kvm_wait_lapic_expire(vcpu);
+ }
+
+ tdx_vcpu_enter_exit(vcpu);
+
+ if (vcpu->arch.host_debugctl & ~TDX_DEBUGCTL_PRESERVED)
+ update_debugctlmsr(vcpu->arch.host_debugctl);
+
+ tdx_load_host_xsave_state(vcpu);
+ tdx->guest_entered = true;
+
+ vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET;
+
+ if (unlikely(tdx->vp_enter_ret == EXIT_REASON_EPT_MISCONFIG))
+ return EXIT_FASTPATH_NONE;
+
+ if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR))
+ return EXIT_FASTPATH_NONE;
+
+ if (unlikely(vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY))
+ kvm_machine_check();
+
+ trace_kvm_exit(vcpu, KVM_ISA_VMX);
+
+ if (unlikely(tdx_failed_vmentry(vcpu)))
+ return EXIT_FASTPATH_NONE;
+
+ return tdx_exit_handlers_fastpath(vcpu);
+}
+
+void tdx_inject_nmi(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.nmi_injections;
+ td_management_write8(to_tdx(vcpu), TD_VCPU_PEND_NMI, 1);
+ /*
+ * From KVM's perspective, NMI injection is completed right after
+ * writing to PEND_NMI. KVM doesn't care whether an NMI is injected by
+ * the TDX module or not.
+ */
+ vcpu->arch.nmi_injected = false;
+ /*
+ * TDX doesn't support KVM to request NMI window exit. If there is
+ * still a pending vNMI, KVM is not able to inject it along with the
+ * one pending in TDX module in a back-to-back way. Since the previous
+ * vNMI is still pending in TDX module, i.e. it has not been delivered
+ * to TDX guest yet, it's OK to collapse the pending vNMI into the
+ * previous one. The guest is expected to handle all the NMI sources
+ * when handling the first vNMI.
+ */
+ vcpu->arch.nmi_pending = 0;
+}
+
+static int tdx_handle_exception_nmi(struct kvm_vcpu *vcpu)
+{
+ u32 intr_info = vmx_get_intr_info(vcpu);
+
+ /*
+ * Machine checks are handled by handle_exception_irqoff(), or by
+ * tdx_handle_exit() with TDX_NON_RECOVERABLE set if a #MC occurs on
+ * VM-Entry. NMIs are handled by tdx_vcpu_enter_exit().
+ */
+ if (is_nmi(intr_info) || is_machine_check(intr_info))
+ return 1;
+
+ vcpu->run->exit_reason = KVM_EXIT_EXCEPTION;
+ vcpu->run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
+ vcpu->run->ex.error_code = 0;
+
+ return 0;
+}
+
+static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
+{
+ tdvmcall_set_return_code(vcpu, vcpu->run->hypercall.ret);
+ return 1;
+}
+
+static int tdx_emulate_vmcall(struct kvm_vcpu *vcpu)
+{
+ kvm_rax_write(vcpu, to_tdx(vcpu)->vp_enter_args.r10);
+ kvm_rbx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r11);
+ kvm_rcx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r12);
+ kvm_rdx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r13);
+ kvm_rsi_write(vcpu, to_tdx(vcpu)->vp_enter_args.r14);
+
+ return __kvm_emulate_hypercall(vcpu, 0, complete_hypercall_exit);
+}
+
+/*
+ * Split into chunks and check interrupt pending between chunks. This allows
+ * for timely injection of interrupts to prevent issues with guest lockup
+ * detection.
+ */
+#define TDX_MAP_GPA_MAX_LEN (2 * 1024 * 1024)
+static void __tdx_map_gpa(struct vcpu_tdx *tdx);
+
+static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+
+ if (vcpu->run->hypercall.ret) {
+ tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+ tdx->vp_enter_args.r11 = tdx->map_gpa_next;
+ return 1;
+ }
+
+ tdx->map_gpa_next += TDX_MAP_GPA_MAX_LEN;
+ if (tdx->map_gpa_next >= tdx->map_gpa_end)
+ return 1;
+
+ /*
+ * Stop processing the remaining part if there is a pending interrupt,
+ * which could be qualified to deliver. Skip checking pending RVI for
+ * TDVMCALL_MAP_GPA, see comments in tdx_protected_apic_has_interrupt().
+ */
+ if (kvm_vcpu_has_events(vcpu)) {
+ tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY);
+ tdx->vp_enter_args.r11 = tdx->map_gpa_next;
+ return 1;
+ }
+
+ __tdx_map_gpa(tdx);
+ return 0;
+}
+
+static void __tdx_map_gpa(struct vcpu_tdx *tdx)
+{
+ u64 gpa = tdx->map_gpa_next;
+ u64 size = tdx->map_gpa_end - tdx->map_gpa_next;
+
+ if (size > TDX_MAP_GPA_MAX_LEN)
+ size = TDX_MAP_GPA_MAX_LEN;
+
+ tdx->vcpu.run->exit_reason = KVM_EXIT_HYPERCALL;
+ tdx->vcpu.run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
+ /*
+ * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
+ * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
+ * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting
+ * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU.
+ */
+ tdx->vcpu.run->hypercall.ret = 0;
+ tdx->vcpu.run->hypercall.args[0] = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
+ tdx->vcpu.run->hypercall.args[1] = size / PAGE_SIZE;
+ tdx->vcpu.run->hypercall.args[2] = vt_is_tdx_private_gpa(tdx->vcpu.kvm, gpa) ?
+ KVM_MAP_GPA_RANGE_ENCRYPTED :
+ KVM_MAP_GPA_RANGE_DECRYPTED;
+ tdx->vcpu.run->hypercall.flags = KVM_EXIT_HYPERCALL_LONG_MODE;
+
+ tdx->vcpu.arch.complete_userspace_io = tdx_complete_vmcall_map_gpa;
+}
+
+static int tdx_map_gpa(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ u64 gpa = tdx->vp_enter_args.r12;
+ u64 size = tdx->vp_enter_args.r13;
+ u64 ret;
+
+ /*
+ * Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires
+ * userspace to enable KVM_CAP_EXIT_HYPERCALL with KVM_HC_MAP_GPA_RANGE
+ * bit set. This is a base call so it should always be supported, but
+ * KVM has no way to ensure that userspace implements the GHCI correctly.
+ * So if KVM_HC_MAP_GPA_RANGE does not cause a VMEXIT, return an error
+ * to the guest.
+ */
+ if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
+ ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
+ goto error;
+ }
+
+ if (gpa + size <= gpa || !kvm_vcpu_is_legal_gpa(vcpu, gpa) ||
+ !kvm_vcpu_is_legal_gpa(vcpu, gpa + size - 1) ||
+ (vt_is_tdx_private_gpa(vcpu->kvm, gpa) !=
+ vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))) {
+ ret = TDVMCALL_STATUS_INVALID_OPERAND;
+ goto error;
+ }
+
+ if (!PAGE_ALIGNED(gpa) || !PAGE_ALIGNED(size)) {
+ ret = TDVMCALL_STATUS_ALIGN_ERROR;
+ goto error;
+ }
+
+ tdx->map_gpa_end = gpa + size;
+ tdx->map_gpa_next = gpa;
+
+ __tdx_map_gpa(tdx);
+ return 0;
+
+error:
+ tdvmcall_set_return_code(vcpu, ret);
+ tdx->vp_enter_args.r11 = gpa;
+ return 1;
+}
+
+static int tdx_report_fatal_error(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ u64 *regs = vcpu->run->system_event.data;
+ u64 *module_regs = &tdx->vp_enter_args.r8;
+ int index = VCPU_REGS_RAX;
+
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_TDX_FATAL;
+ vcpu->run->system_event.ndata = 16;
+
+ /* Dump 16 general-purpose registers to userspace in ascending order. */
+ regs[index++] = tdx->vp_enter_ret;
+ regs[index++] = tdx->vp_enter_args.rcx;
+ regs[index++] = tdx->vp_enter_args.rdx;
+ regs[index++] = tdx->vp_enter_args.rbx;
+ regs[index++] = 0;
+ regs[index++] = 0;
+ regs[index++] = tdx->vp_enter_args.rsi;
+ regs[index] = tdx->vp_enter_args.rdi;
+ for (index = 0; index < 8; index++)
+ regs[VCPU_REGS_R8 + index] = module_regs[index];
+
+ return 0;
+}
+
+static int tdx_emulate_cpuid(struct kvm_vcpu *vcpu)
+{
+ u32 eax, ebx, ecx, edx;
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+
+ /* EAX and ECX for cpuid is stored in R12 and R13. */
+ eax = tdx->vp_enter_args.r12;
+ ecx = tdx->vp_enter_args.r13;
+
+ kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false);
+
+ tdx->vp_enter_args.r12 = eax;
+ tdx->vp_enter_args.r13 = ebx;
+ tdx->vp_enter_args.r14 = ecx;
+ tdx->vp_enter_args.r15 = edx;
+
+ return 1;
+}
+
+static int tdx_complete_pio_out(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.pio.count = 0;
+ return 1;
+}
+
+static int tdx_complete_pio_in(struct kvm_vcpu *vcpu)
+{
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+ unsigned long val = 0;
+ int ret;
+
+ ret = ctxt->ops->pio_in_emulated(ctxt, vcpu->arch.pio.size,
+ vcpu->arch.pio.port, &val, 1);
+
+ WARN_ON_ONCE(!ret);
+
+ tdvmcall_set_return_val(vcpu, val);
+
+ return 1;
+}
+
+static int tdx_emulate_io(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+ unsigned long val = 0;
+ unsigned int port;
+ u64 size, write;
+ int ret;
+
+ ++vcpu->stat.io_exits;
+
+ size = tdx->vp_enter_args.r12;
+ write = tdx->vp_enter_args.r13;
+ port = tdx->vp_enter_args.r14;
+
+ if ((write != 0 && write != 1) || (size != 1 && size != 2 && size != 4)) {
+ tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+ return 1;
+ }
+
+ if (write) {
+ val = tdx->vp_enter_args.r15;
+ ret = ctxt->ops->pio_out_emulated(ctxt, size, port, &val, 1);
+ } else {
+ ret = ctxt->ops->pio_in_emulated(ctxt, size, port, &val, 1);
+ }
+
+ if (!ret)
+ vcpu->arch.complete_userspace_io = write ? tdx_complete_pio_out :
+ tdx_complete_pio_in;
+ else if (!write)
+ tdvmcall_set_return_val(vcpu, val);
+
+ return ret;
+}
+
+static int tdx_complete_mmio_read(struct kvm_vcpu *vcpu)
+{
+ unsigned long val = 0;
+ gpa_t gpa;
+ int size;
+
+ gpa = vcpu->mmio_fragments[0].gpa;
+ size = vcpu->mmio_fragments[0].len;
+
+ memcpy(&val, vcpu->run->mmio.data, size);
+ tdvmcall_set_return_val(vcpu, val);
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
+ return 1;
+}
+
+static inline int tdx_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, int size,
+ unsigned long val)
+{
+ if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
+ trace_kvm_fast_mmio(gpa);
+ return 0;
+ }
+
+ trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, size, gpa, &val);
+ if (kvm_io_bus_write(vcpu, KVM_MMIO_BUS, gpa, size, &val))
+ return -EOPNOTSUPP;
+
+ return 0;
+}
+
+static inline int tdx_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, int size)
+{
+ unsigned long val;
+
+ if (kvm_io_bus_read(vcpu, KVM_MMIO_BUS, gpa, size, &val))
+ return -EOPNOTSUPP;
+
+ tdvmcall_set_return_val(vcpu, val);
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
+ return 0;
+}
+
+static int tdx_emulate_mmio(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ int size, write, r;
+ unsigned long val;
+ gpa_t gpa;
+
+ size = tdx->vp_enter_args.r12;
+ write = tdx->vp_enter_args.r13;
+ gpa = tdx->vp_enter_args.r14;
+ val = write ? tdx->vp_enter_args.r15 : 0;
+
+ if (size != 1 && size != 2 && size != 4 && size != 8)
+ goto error;
+ if (write != 0 && write != 1)
+ goto error;
+
+ /*
+ * TDG.VP.VMCALL<MMIO> allows only shared GPA, it makes no sense to
+ * do MMIO emulation for private GPA.
+ */
+ if (vt_is_tdx_private_gpa(vcpu->kvm, gpa) ||
+ vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))
+ goto error;
+
+ gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
+
+ if (write)
+ r = tdx_mmio_write(vcpu, gpa, size, val);
+ else
+ r = tdx_mmio_read(vcpu, gpa, size);
+ if (!r)
+ /* Kernel completed device emulation. */
+ return 1;
+
+ /* Request the device emulation to userspace device model. */
+ vcpu->mmio_is_write = write;
+ if (!write)
+ vcpu->arch.complete_userspace_io = tdx_complete_mmio_read;
+
+ vcpu->run->mmio.phys_addr = gpa;
+ vcpu->run->mmio.len = size;
+ vcpu->run->mmio.is_write = write;
+ vcpu->run->exit_reason = KVM_EXIT_MMIO;
+
+ if (write) {
+ memcpy(vcpu->run->mmio.data, &val, size);
+ } else {
+ vcpu->mmio_fragments[0].gpa = gpa;
+ vcpu->mmio_fragments[0].len = size;
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, size, gpa, NULL);
+ }
+ return 0;
+
+error:
+ tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+ return 1;
+}
+
+static int tdx_complete_get_td_vm_call_info(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+
+ tdvmcall_set_return_code(vcpu, vcpu->run->tdx.get_tdvmcall_info.ret);
+
+ /*
+ * For now, there is no TDVMCALL beyond GHCI base API supported by KVM
+ * directly without the support from userspace, just set the value
+ * returned from userspace.
+ */
+ tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11;
+ tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12;
+ tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13;
+ tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14;
+
+ return 1;
+}
+
+static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+
+ switch (tdx->vp_enter_args.r12) {
+ case 0:
+ tdx->vp_enter_args.r11 = 0;
+ tdx->vp_enter_args.r12 = 0;
+ tdx->vp_enter_args.r13 = 0;
+ tdx->vp_enter_args.r14 = 0;
+ tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUCCESS);
+ return 1;
+ case 1:
+ vcpu->run->tdx.get_tdvmcall_info.leaf = tdx->vp_enter_args.r12;
+ vcpu->run->exit_reason = KVM_EXIT_TDX;
+ vcpu->run->tdx.flags = 0;
+ vcpu->run->tdx.nr = TDVMCALL_GET_TD_VM_CALL_INFO;
+ vcpu->run->tdx.get_tdvmcall_info.ret = TDVMCALL_STATUS_SUCCESS;
+ vcpu->run->tdx.get_tdvmcall_info.r11 = 0;
+ vcpu->run->tdx.get_tdvmcall_info.r12 = 0;
+ vcpu->run->tdx.get_tdvmcall_info.r13 = 0;
+ vcpu->run->tdx.get_tdvmcall_info.r14 = 0;
+ vcpu->arch.complete_userspace_io = tdx_complete_get_td_vm_call_info;
+ return 0;
+ default:
+ tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+ return 1;
+ }
+}
+
+static int tdx_complete_simple(struct kvm_vcpu *vcpu)
+{
+ tdvmcall_set_return_code(vcpu, vcpu->run->tdx.unknown.ret);
+ return 1;
+}
+
+static int tdx_get_quote(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ u64 gpa = tdx->vp_enter_args.r12;
+ u64 size = tdx->vp_enter_args.r13;
+
+ /* The gpa of buffer must have shared bit set. */
+ if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
+ tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+ return 1;
+ }
+
+ vcpu->run->exit_reason = KVM_EXIT_TDX;
+ vcpu->run->tdx.flags = 0;
+ vcpu->run->tdx.nr = TDVMCALL_GET_QUOTE;
+ vcpu->run->tdx.get_quote.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
+ vcpu->run->tdx.get_quote.gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
+ vcpu->run->tdx.get_quote.size = size;
+
+ vcpu->arch.complete_userspace_io = tdx_complete_simple;
+
+ return 0;
+}
+
+static int tdx_setup_event_notify_interrupt(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ u64 vector = tdx->vp_enter_args.r12;
+
+ if (vector < 32 || vector > 255) {
+ tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+ return 1;
+ }
+
+ vcpu->run->exit_reason = KVM_EXIT_TDX;
+ vcpu->run->tdx.flags = 0;
+ vcpu->run->tdx.nr = TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT;
+ vcpu->run->tdx.setup_event_notify.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
+ vcpu->run->tdx.setup_event_notify.vector = vector;
+
+ vcpu->arch.complete_userspace_io = tdx_complete_simple;
+
+ return 0;
+}
+
+static int handle_tdvmcall(struct kvm_vcpu *vcpu)
+{
+ switch (tdvmcall_leaf(vcpu)) {
+ case TDVMCALL_MAP_GPA:
+ return tdx_map_gpa(vcpu);
+ case TDVMCALL_REPORT_FATAL_ERROR:
+ return tdx_report_fatal_error(vcpu);
+ case TDVMCALL_GET_TD_VM_CALL_INFO:
+ return tdx_get_td_vm_call_info(vcpu);
+ case TDVMCALL_GET_QUOTE:
+ return tdx_get_quote(vcpu);
+ case TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT:
+ return tdx_setup_event_notify_interrupt(vcpu);
+ default:
+ break;
+ }
+
+ tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED);
+ return 1;
+}
+
+void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
+{
+ u64 shared_bit = (pgd_level == 5) ? TDX_SHARED_BIT_PWL_5 :
+ TDX_SHARED_BIT_PWL_4;
+
+ if (KVM_BUG_ON(shared_bit != kvm_gfn_direct_bits(vcpu->kvm), vcpu->kvm))
+ return;
+
+ td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
+}
+
+static void tdx_unpin(struct kvm *kvm, struct page *page)
+{
+ put_page(page);
+}
+
+static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, struct page *page)
+{
+ int tdx_level = pg_level_to_tdx_sept_level(level);
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ gpa_t gpa = gfn_to_gpa(gfn);
+ u64 entry, level_state;
+ u64 err;
+
+ err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state);
+ if (unlikely(tdx_operand_busy(err))) {
+ tdx_unpin(kvm, page);
+ return -EBUSY;
+ }
+
+ if (KVM_BUG_ON(err, kvm)) {
+ pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state);
+ tdx_unpin(kvm, page);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+/*
+ * KVM_TDX_INIT_MEM_REGION calls kvm_gmem_populate() to map guest pages; the
+ * callback tdx_gmem_post_populate() then maps pages into private memory.
+ * through the a seamcall TDH.MEM.PAGE.ADD(). The SEAMCALL also requires the
+ * private EPT structures for the page to have been built before, which is
+ * done via kvm_tdp_map_page(). nr_premapped counts the number of pages that
+ * were added to the EPT structures but not added with TDH.MEM.PAGE.ADD().
+ * The counter has to be zero on KVM_TDX_FINALIZE_VM, to ensure that there
+ * are no half-initialized shared EPT pages.
+ */
+static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, kvm_pfn_t pfn)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+
+ if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm))
+ return -EINVAL;
+
+ /* nr_premapped will be decreased when tdh_mem_page_add() is called. */
+ atomic64_inc(&kvm_tdx->nr_premapped);
+ return 0;
+}
+
+static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, kvm_pfn_t pfn)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ struct page *page = pfn_to_page(pfn);
+
+ /* TODO: handle large pages. */
+ if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
+ return -EINVAL;
+
+ /*
+ * Because guest_memfd doesn't support page migration with
+ * a_ops->migrate_folio (yet), no callback is triggered for KVM on page
+ * migration. Until guest_memfd supports page migration, prevent page
+ * migration.
+ * TODO: Once guest_memfd introduces callback on page migration,
+ * implement it and remove get_page/put_page().
+ */
+ get_page(page);
+
+ /*
+ * Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching
+ * barrier in tdx_td_finalize().
+ */
+ smp_rmb();
+ if (likely(kvm_tdx->state == TD_STATE_RUNNABLE))
+ return tdx_mem_page_aug(kvm, gfn, level, page);
+
+ return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn);
+}
+
+static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, struct page *page)
+{
+ int tdx_level = pg_level_to_tdx_sept_level(level);
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ gpa_t gpa = gfn_to_gpa(gfn);
+ u64 err, entry, level_state;
+
+ /* TODO: handle large pages. */
+ if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
+ return -EINVAL;
+
+ if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm))
+ return -EINVAL;
+
+ /*
+ * When zapping private page, write lock is held. So no race condition
+ * with other vcpu sept operation.
+ * Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs.
+ */
+ err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
+ &level_state);
+
+ if (unlikely(tdx_operand_busy(err))) {
+ /*
+ * The second retry is expected to succeed after kicking off all
+ * other vCPUs and prevent them from invoking TDH.VP.ENTER.
+ */
+ tdx_no_vcpus_enter_start(kvm);
+ err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
+ &level_state);
+ tdx_no_vcpus_enter_stop(kvm);
+ }
+
+ if (KVM_BUG_ON(err, kvm)) {
+ pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state);
+ return -EIO;
+ }
+
+ err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page);
+
+ if (KVM_BUG_ON(err, kvm)) {
+ pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
+ return -EIO;
+ }
+ tdx_clear_page(page);
+ tdx_unpin(kvm, page);
+ return 0;
+}
+
+static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, void *private_spt)
+{
+ int tdx_level = pg_level_to_tdx_sept_level(level);
+ gpa_t gpa = gfn_to_gpa(gfn);
+ struct page *page = virt_to_page(private_spt);
+ u64 err, entry, level_state;
+
+ err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, tdx_level, page, &entry,
+ &level_state);
+ if (unlikely(tdx_operand_busy(err)))
+ return -EBUSY;
+
+ if (KVM_BUG_ON(err, kvm)) {
+ pr_tdx_error_2(TDH_MEM_SEPT_ADD, err, entry, level_state);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+/*
+ * Check if the error returned from a SEPT zap SEAMCALL is due to that a page is
+ * mapped by KVM_TDX_INIT_MEM_REGION without tdh_mem_page_add() being called
+ * successfully.
+ *
+ * Since tdh_mem_sept_add() must have been invoked successfully before a
+ * non-leaf entry present in the mirrored page table, the SEPT ZAP related
+ * SEAMCALLs should not encounter err TDX_EPT_WALK_FAILED. They should instead
+ * find TDX_EPT_ENTRY_STATE_INCORRECT due to an empty leaf entry found in the
+ * SEPT.
+ *
+ * Further check if the returned entry from SEPT walking is with RWX permissions
+ * to filter out anything unexpected.
+ *
+ * Note: @level is pg_level, not the tdx_level. The tdx_level extracted from
+ * level_state returned from a SEAMCALL error is the same as that passed into
+ * the SEAMCALL.
+ */
+static int tdx_is_sept_zap_err_due_to_premap(struct kvm_tdx *kvm_tdx, u64 err,
+ u64 entry, int level)
+{
+ if (!err || kvm_tdx->state == TD_STATE_RUNNABLE)
+ return false;
+
+ if (err != (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX))
+ return false;
+
+ if ((is_last_spte(entry, level) && (entry & VMX_EPT_RWX_MASK)))
+ return false;
+
+ return true;
+}
+
+static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, struct page *page)
+{
+ int tdx_level = pg_level_to_tdx_sept_level(level);
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ gpa_t gpa = gfn_to_gpa(gfn) & KVM_HPAGE_MASK(level);
+ u64 err, entry, level_state;
+
+ /* For now large page isn't supported yet. */
+ WARN_ON_ONCE(level != PG_LEVEL_4K);
+
+ err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
+
+ if (unlikely(tdx_operand_busy(err))) {
+ /* After no vCPUs enter, the second retry is expected to succeed */
+ tdx_no_vcpus_enter_start(kvm);
+ err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
+ tdx_no_vcpus_enter_stop(kvm);
+ }
+ if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) &&
+ !KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) {
+ atomic64_dec(&kvm_tdx->nr_premapped);
+ tdx_unpin(kvm, page);
+ return 0;
+ }
+
+ if (KVM_BUG_ON(err, kvm)) {
+ pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state);
+ return -EIO;
+ }
+ return 1;
+}
+
+/*
+ * Ensure shared and private EPTs to be flushed on all vCPUs.
+ * tdh_mem_track() is the only caller that increases TD epoch. An increase in
+ * the TD epoch (e.g., to value "N + 1") is successful only if no vCPUs are
+ * running in guest mode with the value "N - 1".
+ *
+ * A successful execution of tdh_mem_track() ensures that vCPUs can only run in
+ * guest mode with TD epoch value "N" if no TD exit occurs after the TD epoch
+ * being increased to "N + 1".
+ *
+ * Kicking off all vCPUs after that further results in no vCPUs can run in guest
+ * mode with TD epoch value "N", which unblocks the next tdh_mem_track() (e.g.
+ * to increase TD epoch to "N + 2").
+ *
+ * TDX module will flush EPT on the next TD enter and make vCPUs to run in
+ * guest mode with TD epoch value "N + 1".
+ *
+ * kvm_make_all_cpus_request() guarantees all vCPUs are out of guest mode by
+ * waiting empty IPI handler ack_kick().
+ *
+ * No action is required to the vCPUs being kicked off since the kicking off
+ * occurs certainly after TD epoch increment and before the next
+ * tdh_mem_track().
+ */
+static void tdx_track(struct kvm *kvm)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ u64 err;
+
+ /* If TD isn't finalized, it's before any vcpu running. */
+ if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
+ return;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ err = tdh_mem_track(&kvm_tdx->td);
+ if (unlikely(tdx_operand_busy(err))) {
+ /* After no vCPUs enter, the second retry is expected to succeed */
+ tdx_no_vcpus_enter_start(kvm);
+ err = tdh_mem_track(&kvm_tdx->td);
+ tdx_no_vcpus_enter_stop(kvm);
+ }
+
+ if (KVM_BUG_ON(err, kvm))
+ pr_tdx_error(TDH_MEM_TRACK, err);
+
+ kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
+}
+
+static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, void *private_spt)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+
+ /*
+ * free_external_spt() is only called after hkid is freed when TD is
+ * tearing down.
+ * KVM doesn't (yet) zap page table pages in mirror page table while
+ * TD is active, though guest pages mapped in mirror page table could be
+ * zapped during TD is active, e.g. for shared <-> private conversion
+ * and slot move/deletion.
+ */
+ if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm))
+ return -EINVAL;
+
+ /*
+ * The HKID assigned to this TD was already freed and cache was
+ * already flushed. We don't have to flush again.
+ */
+ return tdx_reclaim_page(virt_to_page(private_spt));
+}
+
+static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, kvm_pfn_t pfn)
+{
+ struct page *page = pfn_to_page(pfn);
+ int ret;
+
+ /*
+ * HKID is released after all private pages have been removed, and set
+ * before any might be populated. Warn if zapping is attempted when
+ * there can't be anything populated in the private EPT.
+ */
+ if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm))
+ return -EINVAL;
+
+ ret = tdx_sept_zap_private_spte(kvm, gfn, level, page);
+ if (ret <= 0)
+ return ret;
+
+ /*
+ * TDX requires TLB tracking before dropping private page. Do
+ * it here, although it is also done later.
+ */
+ tdx_track(kvm);
+
+ return tdx_sept_drop_private_spte(kvm, gfn, level, page);
+}
+
+void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector)
+{
+ struct kvm_vcpu *vcpu = apic->vcpu;
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+
+ /* TDX supports only posted interrupt. No lapic emulation. */
+ __vmx_deliver_posted_interrupt(vcpu, &tdx->vt.pi_desc, vector);
+
+ trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
+}
+
+static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcpu)
+{
+ u64 eeq_type = to_tdx(vcpu)->ext_exit_qualification & TDX_EXT_EXIT_QUAL_TYPE_MASK;
+ u64 eq = vmx_get_exit_qual(vcpu);
+
+ if (eeq_type != TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION)
+ return false;
+
+ return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN);
+}
+
+static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qual;
+ gpa_t gpa = to_tdx(vcpu)->exit_gpa;
+ bool local_retry = false;
+ int ret;
+
+ if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
+ if (tdx_is_sept_violation_unexpected_pending(vcpu)) {
+ pr_warn("Guest access before accepting 0x%llx on vCPU %d\n",
+ gpa, vcpu->vcpu_id);
+ kvm_vm_dead(vcpu->kvm);
+ return -EIO;
+ }
+ /*
+ * Always treat SEPT violations as write faults. Ignore the
+ * EXIT_QUALIFICATION reported by TDX-SEAM for SEPT violations.
+ * TD private pages are always RWX in the SEPT tables,
+ * i.e. they're always mapped writable. Just as importantly,
+ * treating SEPT violations as write faults is necessary to
+ * avoid COW allocations, which will cause TDAUGPAGE failures
+ * due to aliasing a single HPA to multiple GPAs.
+ */
+ exit_qual = EPT_VIOLATION_ACC_WRITE;
+
+ /* Only private GPA triggers zero-step mitigation */
+ local_retry = true;
+ } else {
+ exit_qual = vmx_get_exit_qual(vcpu);
+ /*
+ * EPT violation due to instruction fetch should never be
+ * triggered from shared memory in TDX guest. If such EPT
+ * violation occurs, treat it as broken hardware.
+ */
+ if (KVM_BUG_ON(exit_qual & EPT_VIOLATION_ACC_INSTR, vcpu->kvm))
+ return -EIO;
+ }
+
+ trace_kvm_page_fault(vcpu, gpa, exit_qual);
+
+ /*
+ * To minimize TDH.VP.ENTER invocations, retry locally for private GPA
+ * mapping in TDX.
+ *
+ * KVM may return RET_PF_RETRY for private GPA due to
+ * - contentions when atomically updating SPTEs of the mirror page table
+ * - in-progress GFN invalidation or memslot removal.
+ * - TDX_OPERAND_BUSY error from TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD,
+ * caused by contentions with TDH.VP.ENTER (with zero-step mitigation)
+ * or certain TDCALLs.
+ *
+ * If TDH.VP.ENTER is invoked more times than the threshold set by the
+ * TDX module before KVM resolves the private GPA mapping, the TDX
+ * module will activate zero-step mitigation during TDH.VP.ENTER. This
+ * process acquires an SEPT tree lock in the TDX module, leading to
+ * further contentions with TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD
+ * operations on other vCPUs.
+ *
+ * Breaking out of local retries for kvm_vcpu_has_events() is for
+ * interrupt injection. kvm_vcpu_has_events() should not see pending
+ * events for TDX. Since KVM can't determine if IRQs (or NMIs) are
+ * blocked by TDs, false positives are inevitable i.e., KVM may re-enter
+ * the guest even if the IRQ/NMI can't be delivered.
+ *
+ * Note: even without breaking out of local retries, zero-step
+ * mitigation may still occur due to
+ * - invoking of TDH.VP.ENTER after KVM_EXIT_MEMORY_FAULT,
+ * - a single RIP causing EPT violations for more GFNs than the
+ * threshold count.
+ * This is safe, as triggering zero-step mitigation only introduces
+ * contentions to page installation SEAMCALLs on other vCPUs, which will
+ * handle retries locally in their EPT violation handlers.
+ */
+ while (1) {
+ ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual);
+
+ if (ret != RET_PF_RETRY || !local_retry)
+ break;
+
+ if (kvm_vcpu_has_events(vcpu) || signal_pending(current))
+ break;
+
+ if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
+ ret = -EIO;
+ break;
+ }
+
+ cond_resched();
+ }
+ return ret;
+}
+
+int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
+{
+ if (err) {
+ tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+ return 1;
+ }
+
+ if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MSR_READ)
+ tdvmcall_set_return_val(vcpu, kvm_read_edx_eax(vcpu));
+
+ return 1;
+}
+
+
+int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ u64 vp_enter_ret = tdx->vp_enter_ret;
+ union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
+
+ if (fastpath != EXIT_FASTPATH_NONE)
+ return 1;
+
+ if (unlikely(vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) {
+ KVM_BUG_ON(1, vcpu->kvm);
+ return -EIO;
+ }
+
+ /*
+ * Handle TDX SW errors, including TDX_SEAMCALL_UD, TDX_SEAMCALL_GP and
+ * TDX_SEAMCALL_VMFAILINVALID.
+ */
+ if (unlikely((vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) {
+ KVM_BUG_ON(!kvm_rebooting, vcpu->kvm);
+ goto unhandled_exit;
+ }
+
+ if (unlikely(tdx_failed_vmentry(vcpu))) {
+ /*
+ * If the guest state is protected, that means off-TD debug is
+ * not enabled, TDX_NON_RECOVERABLE must be set.
+ */
+ WARN_ON_ONCE(vcpu->arch.guest_state_protected &&
+ !(vp_enter_ret & TDX_NON_RECOVERABLE));
+ vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason.full;
+ vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
+ return 0;
+ }
+
+ if (unlikely(vp_enter_ret & (TDX_ERROR | TDX_NON_RECOVERABLE)) &&
+ exit_reason.basic != EXIT_REASON_TRIPLE_FAULT) {
+ kvm_pr_unimpl("TD vp_enter_ret 0x%llx\n", vp_enter_ret);
+ goto unhandled_exit;
+ }
+
+ WARN_ON_ONCE(exit_reason.basic != EXIT_REASON_TRIPLE_FAULT &&
+ (vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) != TDX_SUCCESS);
+
+ switch (exit_reason.basic) {
+ case EXIT_REASON_TRIPLE_FAULT:
+ vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+ vcpu->mmio_needed = 0;
+ return 0;
+ case EXIT_REASON_EXCEPTION_NMI:
+ return tdx_handle_exception_nmi(vcpu);
+ case EXIT_REASON_EXTERNAL_INTERRUPT:
+ ++vcpu->stat.irq_exits;
+ return 1;
+ case EXIT_REASON_CPUID:
+ return tdx_emulate_cpuid(vcpu);
+ case EXIT_REASON_HLT:
+ return kvm_emulate_halt_noskip(vcpu);
+ case EXIT_REASON_TDCALL:
+ return handle_tdvmcall(vcpu);
+ case EXIT_REASON_VMCALL:
+ return tdx_emulate_vmcall(vcpu);
+ case EXIT_REASON_IO_INSTRUCTION:
+ return tdx_emulate_io(vcpu);
+ case EXIT_REASON_MSR_READ:
+ kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
+ return kvm_emulate_rdmsr(vcpu);
+ case EXIT_REASON_MSR_WRITE:
+ kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
+ kvm_rax_write(vcpu, tdx->vp_enter_args.r13 & -1u);
+ kvm_rdx_write(vcpu, tdx->vp_enter_args.r13 >> 32);
+ return kvm_emulate_wrmsr(vcpu);
+ case EXIT_REASON_EPT_MISCONFIG:
+ return tdx_emulate_mmio(vcpu);
+ case EXIT_REASON_EPT_VIOLATION:
+ return tdx_handle_ept_violation(vcpu);
+ case EXIT_REASON_OTHER_SMI:
+ /*
+ * Unlike VMX, SMI in SEAM non-root mode (i.e. when
+ * TD guest vCPU is running) will cause VM exit to TDX module,
+ * then SEAMRET to KVM. Once it exits to KVM, SMI is delivered
+ * and handled by kernel handler right away.
+ *
+ * The Other SMI exit can also be caused by the SEAM non-root
+ * machine check delivered via Machine Check System Management
+ * Interrupt (MSMI), but it has already been handled by the
+ * kernel machine check handler, i.e., the memory page has been
+ * marked as poisoned and it won't be freed to the free list
+ * when the TDX guest is terminated (the TDX module marks the
+ * guest as dead and prevent it from further running when
+ * machine check happens in SEAM non-root).
+ *
+ * - A MSMI will not reach here, it's handled as non_recoverable
+ * case above.
+ * - If it's not an MSMI, no need to do anything here.
+ */
+ return 1;
+ default:
+ break;
+ }
+
+unhandled_exit:
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
+ vcpu->run->internal.ndata = 2;
+ vcpu->run->internal.data[0] = vp_enter_ret;
+ vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
+ return 0;
+}
+
+void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+
+ *reason = tdx->vt.exit_reason.full;
+ if (*reason != -1u) {
+ *info1 = vmx_get_exit_qual(vcpu);
+ *info2 = tdx->ext_exit_qualification;
+ *intr_info = vmx_get_intr_info(vcpu);
+ } else {
+ *info1 = 0;
+ *info2 = 0;
+ *intr_info = 0;
+ }
+
+ *error_code = 0;
+}
+
+bool tdx_has_emulated_msr(u32 index)
+{
+ switch (index) {
+ case MSR_IA32_UCODE_REV:
+ case MSR_IA32_ARCH_CAPABILITIES:
+ case MSR_IA32_POWER_CTL:
+ case MSR_IA32_CR_PAT:
+ case MSR_MTRRcap:
+ case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
+ case MSR_MTRRdefType:
+ case MSR_IA32_TSC_DEADLINE:
+ case MSR_IA32_MISC_ENABLE:
+ case MSR_PLATFORM_INFO:
+ case MSR_MISC_FEATURES_ENABLES:
+ case MSR_IA32_APICBASE:
+ case MSR_EFER:
+ case MSR_IA32_FEAT_CTL:
+ case MSR_IA32_MCG_CAP:
+ case MSR_IA32_MCG_STATUS:
+ case MSR_IA32_MCG_CTL:
+ case MSR_IA32_MCG_EXT_CTL:
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+ case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
+ /* MSR_IA32_MCx_{CTL, STATUS, ADDR, MISC, CTL2} */
+ case MSR_KVM_POLL_CONTROL:
+ return true;
+ case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
+ /*
+ * x2APIC registers that are virtualized by the CPU can't be
+ * emulated, KVM doesn't have access to the virtual APIC page.
+ */
+ switch (index) {
+ case X2APIC_MSR(APIC_TASKPRI):
+ case X2APIC_MSR(APIC_PROCPRI):
+ case X2APIC_MSR(APIC_EOI):
+ case X2APIC_MSR(APIC_ISR) ... X2APIC_MSR(APIC_ISR + APIC_ISR_NR):
+ case X2APIC_MSR(APIC_TMR) ... X2APIC_MSR(APIC_TMR + APIC_ISR_NR):
+ case X2APIC_MSR(APIC_IRR) ... X2APIC_MSR(APIC_IRR + APIC_ISR_NR):
+ return false;
+ default:
+ return true;
+ }
+ default:
+ return false;
+ }
+}
+
+static bool tdx_is_read_only_msr(u32 index)
+{
+ return index == MSR_IA32_APICBASE || index == MSR_EFER ||
+ index == MSR_IA32_FEAT_CTL;
+}
+
+int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
+{
+ switch (msr->index) {
+ case MSR_IA32_FEAT_CTL:
+ /*
+ * MCE and MCA are advertised via cpuid. Guest kernel could
+ * check if LMCE is enabled or not.
+ */
+ msr->data = FEAT_CTL_LOCKED;
+ if (vcpu->arch.mcg_cap & MCG_LMCE_P)
+ msr->data |= FEAT_CTL_LMCE_ENABLED;
+ return 0;
+ case MSR_IA32_MCG_EXT_CTL:
+ if (!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P))
+ return 1;
+ msr->data = vcpu->arch.mcg_ext_ctl;
+ return 0;
+ default:
+ if (!tdx_has_emulated_msr(msr->index))
+ return 1;
+
+ return kvm_get_msr_common(vcpu, msr);
+ }
+}
+
+int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
+{
+ switch (msr->index) {
+ case MSR_IA32_MCG_EXT_CTL:
+ if ((!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) ||
+ (msr->data & ~MCG_EXT_CTL_LMCE_EN))
+ return 1;
+ vcpu->arch.mcg_ext_ctl = msr->data;
+ return 0;
+ default:
+ if (tdx_is_read_only_msr(msr->index))
+ return 1;
+
+ if (!tdx_has_emulated_msr(msr->index))
+ return 1;
+
+ return kvm_set_msr_common(vcpu, msr);
+ }
+}
+
+static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
+{
+ const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
+ struct kvm_tdx_capabilities __user *user_caps;
+ struct kvm_tdx_capabilities *caps = NULL;
+ u32 nr_user_entries;
+ int ret = 0;
+
+ /* flags is reserved for future use */
+ if (cmd->flags)
+ return -EINVAL;
+
+ caps = kzalloc(sizeof(*caps) +
+ sizeof(struct kvm_cpuid_entry2) * td_conf->num_cpuid_config,
+ GFP_KERNEL);
+ if (!caps)
+ return -ENOMEM;
+
+ user_caps = u64_to_user_ptr(cmd->data);
+ if (get_user(nr_user_entries, &user_caps->cpuid.nent)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (nr_user_entries < td_conf->num_cpuid_config) {
+ ret = -E2BIG;
+ goto out;
+ }
+
+ ret = init_kvm_tdx_caps(td_conf, caps);
+ if (ret)
+ goto out;
+
+ if (copy_to_user(user_caps, caps, sizeof(*caps))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (copy_to_user(user_caps->cpuid.entries, caps->cpuid.entries,
+ caps->cpuid.nent *
+ sizeof(caps->cpuid.entries[0])))
+ ret = -EFAULT;
+
+out:
+ /* kfree() accepts NULL. */
+ kfree(caps);
+ return ret;
+}
+
+/*
+ * KVM reports guest physical address in CPUID.0x800000008.EAX[23:16], which is
+ * similar to TDX's GPAW. Use this field as the interface for userspace to
+ * configure the GPAW and EPT level for TDs.
+ *
+ * Only values 48 and 52 are supported. Value 52 means GPAW-52 and EPT level
+ * 5, Value 48 means GPAW-48 and EPT level 4. For value 48, GPAW-48 is always
+ * supported. Value 52 is only supported when the platform supports 5 level
+ * EPT.
+ */
+static int setup_tdparams_eptp_controls(struct kvm_cpuid2 *cpuid,
+ struct td_params *td_params)
+{
+ const struct kvm_cpuid_entry2 *entry;
+ int guest_pa;
+
+ entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x80000008, 0);
+ if (!entry)
+ return -EINVAL;
+
+ guest_pa = tdx_get_guest_phys_addr_bits(entry->eax);
+
+ if (guest_pa != 48 && guest_pa != 52)
+ return -EINVAL;
+
+ if (guest_pa == 52 && !cpu_has_vmx_ept_5levels())
+ return -EINVAL;
+
+ td_params->eptp_controls = VMX_EPTP_MT_WB;
+ if (guest_pa == 52) {
+ td_params->eptp_controls |= VMX_EPTP_PWL_5;
+ td_params->config_flags |= TDX_CONFIG_FLAGS_MAX_GPAW;
+ } else {
+ td_params->eptp_controls |= VMX_EPTP_PWL_4;
+ }
+
+ return 0;
+}
+
+static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid,
+ struct td_params *td_params)
+{
+ const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
+ const struct kvm_cpuid_entry2 *entry;
+ struct tdx_cpuid_value *value;
+ int i, copy_cnt = 0;
+
+ /*
+ * td_params.cpuid_values: The number and the order of cpuid_value must
+ * be same to the one of struct tdsysinfo.{num_cpuid_config, cpuid_configs}
+ * It's assumed that td_params was zeroed.
+ */
+ for (i = 0; i < td_conf->num_cpuid_config; i++) {
+ struct kvm_cpuid_entry2 tmp;
+
+ td_init_cpuid_entry2(&tmp, i);
+
+ entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent,
+ tmp.function, tmp.index);
+ if (!entry)
+ continue;
+
+ if (tdx_unsupported_cpuid(entry))
+ return -EINVAL;
+
+ copy_cnt++;
+
+ value = &td_params->cpuid_values[i];
+ value->eax = entry->eax;
+ value->ebx = entry->ebx;
+ value->ecx = entry->ecx;
+ value->edx = entry->edx;
+
+ /*
+ * TDX module does not accept nonzero bits 16..23 for the
+ * CPUID[0x80000008].EAX, see setup_tdparams_eptp_controls().
+ */
+ if (tmp.function == 0x80000008)
+ value->eax = tdx_set_guest_phys_addr_bits(value->eax, 0);
+ }
+
+ /*
+ * Rely on the TDX module to reject invalid configuration, but it can't
+ * check of leafs that don't have a proper slot in td_params->cpuid_values
+ * to stick then. So fail if there were entries that didn't get copied to
+ * td_params.
+ */
+ if (copy_cnt != cpuid->nent)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
+ struct kvm_tdx_init_vm *init_vm)
+{
+ const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
+ struct kvm_cpuid2 *cpuid = &init_vm->cpuid;
+ int ret;
+
+ if (kvm->created_vcpus)
+ return -EBUSY;
+
+ if (init_vm->attributes & ~tdx_get_supported_attrs(td_conf))
+ return -EINVAL;
+
+ if (init_vm->xfam & ~tdx_get_supported_xfam(td_conf))
+ return -EINVAL;
+
+ td_params->max_vcpus = kvm->max_vcpus;
+ td_params->attributes = init_vm->attributes | td_conf->attributes_fixed1;
+ td_params->xfam = init_vm->xfam | td_conf->xfam_fixed1;
+
+ td_params->config_flags = TDX_CONFIG_FLAGS_NO_RBP_MOD;
+ td_params->tsc_frequency = TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz);
+
+ ret = setup_tdparams_eptp_controls(cpuid, td_params);
+ if (ret)
+ return ret;
+
+ ret = setup_tdparams_cpuids(cpuid, td_params);
+ if (ret)
+ return ret;
+
+#define MEMCPY_SAME_SIZE(dst, src) \
+ do { \
+ BUILD_BUG_ON(sizeof(dst) != sizeof(src)); \
+ memcpy((dst), (src), sizeof(dst)); \
+ } while (0)
+
+ MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid);
+ MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner);
+ MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig);
+
+ return 0;
+}
+
+static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
+ u64 *seamcall_err)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ cpumask_var_t packages;
+ struct page **tdcs_pages = NULL;
+ struct page *tdr_page;
+ int ret, i;
+ u64 err, rcx;
+
+ *seamcall_err = 0;
+ ret = tdx_guest_keyid_alloc();
+ if (ret < 0)
+ return ret;
+ kvm_tdx->hkid = ret;
+ kvm_tdx->misc_cg = get_current_misc_cg();
+ ret = misc_cg_try_charge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
+ if (ret)
+ goto free_hkid;
+
+ ret = -ENOMEM;
+
+ atomic_inc(&nr_configured_hkid);
+
+ tdr_page = alloc_page(GFP_KERNEL);
+ if (!tdr_page)
+ goto free_hkid;
+
+ kvm_tdx->td.tdcs_nr_pages = tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE;
+ /* TDVPS = TDVPR(4K page) + TDCX(multiple 4K pages), -1 for TDVPR. */
+ kvm_tdx->td.tdcx_nr_pages = tdx_sysinfo->td_ctrl.tdvps_base_size / PAGE_SIZE - 1;
+ tdcs_pages = kcalloc(kvm_tdx->td.tdcs_nr_pages, sizeof(*kvm_tdx->td.tdcs_pages),
+ GFP_KERNEL | __GFP_ZERO);
+ if (!tdcs_pages)
+ goto free_tdr;
+
+ for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
+ tdcs_pages[i] = alloc_page(GFP_KERNEL);
+ if (!tdcs_pages[i])
+ goto free_tdcs;
+ }
+
+ if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
+ goto free_tdcs;
+
+ cpus_read_lock();
+
+ /*
+ * Need at least one CPU of the package to be online in order to
+ * program all packages for host key id. Check it.
+ */
+ for_each_present_cpu(i)
+ cpumask_set_cpu(topology_physical_package_id(i), packages);
+ for_each_online_cpu(i)
+ cpumask_clear_cpu(topology_physical_package_id(i), packages);
+ if (!cpumask_empty(packages)) {
+ ret = -EIO;
+ /*
+ * Because it's hard for human operator to figure out the
+ * reason, warn it.
+ */
+#define MSG_ALLPKG "All packages need to have online CPU to create TD. Online CPU and retry.\n"
+ pr_warn_ratelimited(MSG_ALLPKG);
+ goto free_packages;
+ }
+
+ /*
+ * TDH.MNG.CREATE tries to grab the global TDX module and fails
+ * with TDX_OPERAND_BUSY when it fails to grab. Take the global
+ * lock to prevent it from failure.
+ */
+ mutex_lock(&tdx_lock);
+ kvm_tdx->td.tdr_page = tdr_page;
+ err = tdh_mng_create(&kvm_tdx->td, kvm_tdx->hkid);
+ mutex_unlock(&tdx_lock);
+
+ if (err == TDX_RND_NO_ENTROPY) {
+ ret = -EAGAIN;
+ goto free_packages;
+ }
+
+ if (WARN_ON_ONCE(err)) {
+ pr_tdx_error(TDH_MNG_CREATE, err);
+ ret = -EIO;
+ goto free_packages;
+ }
+
+ for_each_online_cpu(i) {
+ int pkg = topology_physical_package_id(i);
+
+ if (cpumask_test_and_set_cpu(pkg, packages))
+ continue;
+
+ /*
+ * Program the memory controller in the package with an
+ * encryption key associated to a TDX private host key id
+ * assigned to this TDR. Concurrent operations on same memory
+ * controller results in TDX_OPERAND_BUSY. No locking needed
+ * beyond the cpus_read_lock() above as it serializes against
+ * hotplug and the first online CPU of the package is always
+ * used. We never have two CPUs in the same socket trying to
+ * program the key.
+ */
+ ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
+ kvm_tdx, true);
+ if (ret)
+ break;
+ }
+ cpus_read_unlock();
+ free_cpumask_var(packages);
+ if (ret) {
+ i = 0;
+ goto teardown;
+ }
+
+ kvm_tdx->td.tdcs_pages = tdcs_pages;
+ for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
+ err = tdh_mng_addcx(&kvm_tdx->td, tdcs_pages[i]);
+ if (err == TDX_RND_NO_ENTROPY) {
+ /* Here it's hard to allow userspace to retry. */
+ ret = -EAGAIN;
+ goto teardown;
+ }
+ if (WARN_ON_ONCE(err)) {
+ pr_tdx_error(TDH_MNG_ADDCX, err);
+ ret = -EIO;
+ goto teardown;
+ }
+ }
+
+ err = tdh_mng_init(&kvm_tdx->td, __pa(td_params), &rcx);
+ if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) {
+ /*
+ * Because a user gives operands, don't warn.
+ * Return a hint to the user because it's sometimes hard for the
+ * user to figure out which operand is invalid. SEAMCALL status
+ * code includes which operand caused invalid operand error.
+ */
+ *seamcall_err = err;
+ ret = -EINVAL;
+ goto teardown;
+ } else if (WARN_ON_ONCE(err)) {
+ pr_tdx_error_1(TDH_MNG_INIT, err, rcx);
+ ret = -EIO;
+ goto teardown;
+ }
+
+ return 0;
+
+ /*
+ * The sequence for freeing resources from a partially initialized TD
+ * varies based on where in the initialization flow failure occurred.
+ * Simply use the full teardown and destroy, which naturally play nice
+ * with partial initialization.
+ */
+teardown:
+ /* Only free pages not yet added, so start at 'i' */
+ for (; i < kvm_tdx->td.tdcs_nr_pages; i++) {
+ if (tdcs_pages[i]) {
+ __free_page(tdcs_pages[i]);
+ tdcs_pages[i] = NULL;
+ }
+ }
+ if (!kvm_tdx->td.tdcs_pages)
+ kfree(tdcs_pages);
+
+ tdx_mmu_release_hkid(kvm);
+ tdx_reclaim_td_control_pages(kvm);
+
+ return ret;
+
+free_packages:
+ cpus_read_unlock();
+ free_cpumask_var(packages);
+
+free_tdcs:
+ for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
+ if (tdcs_pages[i])
+ __free_page(tdcs_pages[i]);
+ }
+ kfree(tdcs_pages);
+ kvm_tdx->td.tdcs_pages = NULL;
+
+free_tdr:
+ if (tdr_page)
+ __free_page(tdr_page);
+ kvm_tdx->td.tdr_page = 0;
+
+free_hkid:
+ tdx_hkid_free(kvm_tdx);
+
+ return ret;
+}
+
+static u64 tdx_td_metadata_field_read(struct kvm_tdx *tdx, u64 field_id,
+ u64 *data)
+{
+ u64 err;
+
+ err = tdh_mng_rd(&tdx->td, field_id, data);
+
+ return err;
+}
+
+#define TDX_MD_UNREADABLE_LEAF_MASK GENMASK(30, 7)
+#define TDX_MD_UNREADABLE_SUBLEAF_MASK GENMASK(31, 7)
+
+static int tdx_read_cpuid(struct kvm_vcpu *vcpu, u32 leaf, u32 sub_leaf,
+ bool sub_leaf_set, int *entry_index,
+ struct kvm_cpuid_entry2 *out)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+ u64 field_id = TD_MD_FIELD_ID_CPUID_VALUES;
+ u64 ebx_eax, edx_ecx;
+ u64 err = 0;
+
+ if (sub_leaf > 0b1111111)
+ return -EINVAL;
+
+ if (*entry_index >= KVM_MAX_CPUID_ENTRIES)
+ return -EINVAL;
+
+ if (leaf & TDX_MD_UNREADABLE_LEAF_MASK ||
+ sub_leaf & TDX_MD_UNREADABLE_SUBLEAF_MASK)
+ return -EINVAL;
+
+ /*
+ * bit 23:17, REVSERVED: reserved, must be 0;
+ * bit 16, LEAF_31: leaf number bit 31;
+ * bit 15:9, LEAF_6_0: leaf number bits 6:0, leaf bits 30:7 are
+ * implicitly 0;
+ * bit 8, SUBLEAF_NA: sub-leaf not applicable flag;
+ * bit 7:1, SUBLEAF_6_0: sub-leaf number bits 6:0. If SUBLEAF_NA is 1,
+ * the SUBLEAF_6_0 is all-1.
+ * sub-leaf bits 31:7 are implicitly 0;
+ * bit 0, ELEMENT_I: Element index within field;
+ */
+ field_id |= ((leaf & 0x80000000) ? 1 : 0) << 16;
+ field_id |= (leaf & 0x7f) << 9;
+ if (sub_leaf_set)
+ field_id |= (sub_leaf & 0x7f) << 1;
+ else
+ field_id |= 0x1fe;
+
+ err = tdx_td_metadata_field_read(kvm_tdx, field_id, &ebx_eax);
+ if (err) //TODO check for specific errors
+ goto err_out;
+
+ out->eax = (u32) ebx_eax;
+ out->ebx = (u32) (ebx_eax >> 32);
+
+ field_id++;
+ err = tdx_td_metadata_field_read(kvm_tdx, field_id, &edx_ecx);
+ /*
+ * It's weird that reading edx_ecx fails while reading ebx_eax
+ * succeeded.
+ */
+ if (WARN_ON_ONCE(err))
+ goto err_out;
+
+ out->ecx = (u32) edx_ecx;
+ out->edx = (u32) (edx_ecx >> 32);
+
+ out->function = leaf;
+ out->index = sub_leaf;
+ out->flags |= sub_leaf_set ? KVM_CPUID_FLAG_SIGNIFCANT_INDEX : 0;
+
+ /*
+ * Work around missing support on old TDX modules, fetch
+ * guest maxpa from gfn_direct_bits.
+ */
+ if (leaf == 0x80000008) {
+ gpa_t gpa_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
+ unsigned int g_maxpa = __ffs(gpa_bits) + 1;
+
+ out->eax = tdx_set_guest_phys_addr_bits(out->eax, g_maxpa);
+ }
+
+ (*entry_index)++;
+
+ return 0;
+
+err_out:
+ out->eax = 0;
+ out->ebx = 0;
+ out->ecx = 0;
+ out->edx = 0;
+
+ return -EIO;
+}
+
+static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ struct kvm_tdx_init_vm *init_vm;
+ struct td_params *td_params = NULL;
+ int ret;
+
+ BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid));
+ BUILD_BUG_ON(sizeof(struct td_params) != 1024);
+
+ if (kvm_tdx->state != TD_STATE_UNINITIALIZED)
+ return -EINVAL;
+
+ if (cmd->flags)
+ return -EINVAL;
+
+ init_vm = kmalloc(sizeof(*init_vm) +
+ sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES,
+ GFP_KERNEL);
+ if (!init_vm)
+ return -ENOMEM;
+
+ if (copy_from_user(init_vm, u64_to_user_ptr(cmd->data), sizeof(*init_vm))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (init_vm->cpuid.nent > KVM_MAX_CPUID_ENTRIES) {
+ ret = -E2BIG;
+ goto out;
+ }
+
+ if (copy_from_user(init_vm->cpuid.entries,
+ u64_to_user_ptr(cmd->data) + sizeof(*init_vm),
+ flex_array_size(init_vm, cpuid.entries, init_vm->cpuid.nent))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (init_vm->cpuid.padding) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ td_params = kzalloc(sizeof(struct td_params), GFP_KERNEL);
+ if (!td_params) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = setup_tdparams(kvm, td_params, init_vm);
+ if (ret)
+ goto out;
+
+ ret = __tdx_td_init(kvm, td_params, &cmd->hw_error);
+ if (ret)
+ goto out;
+
+ kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET);
+ kvm_tdx->tsc_multiplier = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_MULTIPLIER);
+ kvm_tdx->attributes = td_params->attributes;
+ kvm_tdx->xfam = td_params->xfam;
+
+ if (td_params->config_flags & TDX_CONFIG_FLAGS_MAX_GPAW)
+ kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_5;
+ else
+ kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_4;
+
+ kvm_tdx->state = TD_STATE_INITIALIZED;
+out:
+ /* kfree() accepts NULL. */
+ kfree(init_vm);
+ kfree(td_params);
+
+ return ret;
+}
+
+void tdx_flush_tlb_current(struct kvm_vcpu *vcpu)
+{
+ /*
+ * flush_tlb_current() is invoked when the first time for the vcpu to
+ * run or when root of shared EPT is invalidated.
+ * KVM only needs to flush shared EPT because the TDX module handles TLB
+ * invalidation for private EPT in tdh_vp_enter();
+ *
+ * A single context invalidation for shared EPT can be performed here.
+ * However, this single context invalidation requires the private EPTP
+ * rather than the shared EPTP to flush shared EPT, as shared EPT uses
+ * private EPTP as its ASID for TLB invalidation.
+ *
+ * To avoid reading back private EPTP, perform a global invalidation for
+ * shared EPT instead to keep this function simple.
+ */
+ ept_sync_global();
+}
+
+void tdx_flush_tlb_all(struct kvm_vcpu *vcpu)
+{
+ /*
+ * TDX has called tdx_track() in tdx_sept_remove_private_spte() to
+ * ensure that private EPT will be flushed on the next TD enter. No need
+ * to call tdx_track() here again even when this callback is a result of
+ * zapping private EPT.
+ *
+ * Due to the lack of the context to determine which EPT has been
+ * affected by zapping, invoke invept() directly here for both shared
+ * EPT and private EPT for simplicity, though it's not necessary for
+ * private EPT.
+ */
+ ept_sync_global();
+}
+
+static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+
+ guard(mutex)(&kvm->slots_lock);
+
+ if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
+ return -EINVAL;
+ /*
+ * Pages are pending for KVM_TDX_INIT_MEM_REGION to issue
+ * TDH.MEM.PAGE.ADD().
+ */
+ if (atomic64_read(&kvm_tdx->nr_premapped))
+ return -EINVAL;
+
+ cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td);
+ if (tdx_operand_busy(cmd->hw_error))
+ return -EBUSY;
+ if (KVM_BUG_ON(cmd->hw_error, kvm)) {
+ pr_tdx_error(TDH_MR_FINALIZE, cmd->hw_error);
+ return -EIO;
+ }
+
+ kvm_tdx->state = TD_STATE_RUNNABLE;
+ /* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */
+ smp_wmb();
+ kvm->arch.pre_fault_allowed = true;
+ return 0;
+}
+
+int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
+{
+ struct kvm_tdx_cmd tdx_cmd;
+ int r;
+
+ if (copy_from_user(&tdx_cmd, argp, sizeof(struct kvm_tdx_cmd)))
+ return -EFAULT;
+
+ /*
+ * Userspace should never set hw_error. It is used to fill
+ * hardware-defined error by the kernel.
+ */
+ if (tdx_cmd.hw_error)
+ return -EINVAL;
+
+ mutex_lock(&kvm->lock);
+
+ switch (tdx_cmd.id) {
+ case KVM_TDX_CAPABILITIES:
+ r = tdx_get_capabilities(&tdx_cmd);
+ break;
+ case KVM_TDX_INIT_VM:
+ r = tdx_td_init(kvm, &tdx_cmd);
+ break;
+ case KVM_TDX_FINALIZE_VM:
+ r = tdx_td_finalize(kvm, &tdx_cmd);
+ break;
+ default:
+ r = -EINVAL;
+ goto out;
+ }
+
+ if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd)))
+ r = -EFAULT;
+
+out:
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+/* VMM can pass one 64bit auxiliary data to vcpu via RCX for guest BIOS. */
+static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ struct page *page;
+ int ret, i;
+ u64 err;
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+ tdx->vp.tdvpr_page = page;
+
+ tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages),
+ GFP_KERNEL);
+ if (!tdx->vp.tdcx_pages) {
+ ret = -ENOMEM;
+ goto free_tdvpr;
+ }
+
+ for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
+ page = alloc_page(GFP_KERNEL);
+ if (!page) {
+ ret = -ENOMEM;
+ goto free_tdcx;
+ }
+ tdx->vp.tdcx_pages[i] = page;
+ }
+
+ err = tdh_vp_create(&kvm_tdx->td, &tdx->vp);
+ if (KVM_BUG_ON(err, vcpu->kvm)) {
+ ret = -EIO;
+ pr_tdx_error(TDH_VP_CREATE, err);
+ goto free_tdcx;
+ }
+
+ for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
+ err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]);
+ if (KVM_BUG_ON(err, vcpu->kvm)) {
+ pr_tdx_error(TDH_VP_ADDCX, err);
+ /*
+ * Pages already added are reclaimed by the vcpu_free
+ * method, but the rest are freed here.
+ */
+ for (; i < kvm_tdx->td.tdcx_nr_pages; i++) {
+ __free_page(tdx->vp.tdcx_pages[i]);
+ tdx->vp.tdcx_pages[i] = NULL;
+ }
+ return -EIO;
+ }
+ }
+
+ err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
+ if (KVM_BUG_ON(err, vcpu->kvm)) {
+ pr_tdx_error(TDH_VP_INIT, err);
+ return -EIO;
+ }
+
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
+ return 0;
+
+free_tdcx:
+ for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
+ if (tdx->vp.tdcx_pages[i])
+ __free_page(tdx->vp.tdcx_pages[i]);
+ tdx->vp.tdcx_pages[i] = NULL;
+ }
+ kfree(tdx->vp.tdcx_pages);
+ tdx->vp.tdcx_pages = NULL;
+
+free_tdvpr:
+ if (tdx->vp.tdvpr_page)
+ __free_page(tdx->vp.tdvpr_page);
+ tdx->vp.tdvpr_page = 0;
+
+ return ret;
+}
+
+/* Sometimes reads multipple subleafs. Return how many enties were written. */
+static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_index,
+ struct kvm_cpuid_entry2 *output_e)
+{
+ int sub_leaf = 0;
+ int ret;
+
+ /* First try without a subleaf */
+ ret = tdx_read_cpuid(vcpu, leaf, 0, false, entry_index, output_e);
+
+ /* If success, or invalid leaf, just give up */
+ if (ret != -EIO)
+ return ret;
+
+ /*
+ * If the try without a subleaf failed, try reading subleafs until
+ * failure. The TDX module only supports 6 bits of subleaf index.
+ */
+ while (1) {
+ /* Keep reading subleafs until there is a failure. */
+ if (tdx_read_cpuid(vcpu, leaf, sub_leaf, true, entry_index, output_e))
+ return !sub_leaf;
+
+ sub_leaf++;
+ output_e++;
+ }
+
+ return 0;
+}
+
+static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
+{
+ struct kvm_cpuid2 __user *output, *td_cpuid;
+ int r = 0, i = 0, leaf;
+ u32 level;
+
+ output = u64_to_user_ptr(cmd->data);
+ td_cpuid = kzalloc(sizeof(*td_cpuid) +
+ sizeof(output->entries[0]) * KVM_MAX_CPUID_ENTRIES,
+ GFP_KERNEL);
+ if (!td_cpuid)
+ return -ENOMEM;
+
+ if (copy_from_user(td_cpuid, output, sizeof(*output))) {
+ r = -EFAULT;
+ goto out;
+ }
+
+ /* Read max CPUID for normal range */
+ if (tdx_vcpu_get_cpuid_leaf(vcpu, 0, &i, &td_cpuid->entries[i])) {
+ r = -EIO;
+ goto out;
+ }
+ level = td_cpuid->entries[0].eax;
+
+ for (leaf = 1; leaf <= level; leaf++)
+ tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);
+
+ /* Read max CPUID for extended range */
+ if (tdx_vcpu_get_cpuid_leaf(vcpu, 0x80000000, &i, &td_cpuid->entries[i])) {
+ r = -EIO;
+ goto out;
+ }
+ level = td_cpuid->entries[i - 1].eax;
+
+ for (leaf = 0x80000001; leaf <= level; leaf++)
+ tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);
+
+ if (td_cpuid->nent < i)
+ r = -E2BIG;
+ td_cpuid->nent = i;
+
+ if (copy_to_user(output, td_cpuid, sizeof(*output))) {
+ r = -EFAULT;
+ goto out;
+ }
+
+ if (r == -E2BIG)
+ goto out;
+
+ if (copy_to_user(output->entries, td_cpuid->entries,
+ td_cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
+ r = -EFAULT;
+
+out:
+ kfree(td_cpuid);
+
+ return r;
+}
+
+static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
+{
+ u64 apic_base;
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ int ret;
+
+ if (cmd->flags)
+ return -EINVAL;
+
+ if (tdx->state != VCPU_TD_STATE_UNINITIALIZED)
+ return -EINVAL;
+
+ /*
+ * TDX requires X2APIC, userspace is responsible for configuring guest
+ * CPUID accordingly.
+ */
+ apic_base = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC |
+ (kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0);
+ if (kvm_apic_set_base(vcpu, apic_base, true))
+ return -EINVAL;
+
+ ret = tdx_td_vcpu_init(vcpu, (u64)cmd->data);
+ if (ret)
+ return ret;
+
+ td_vmcs_write16(tdx, POSTED_INTR_NV, POSTED_INTR_VECTOR);
+ td_vmcs_write64(tdx, POSTED_INTR_DESC_ADDR, __pa(&tdx->vt.pi_desc));
+ td_vmcs_setbit32(tdx, PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_POSTED_INTR);
+
+ tdx->state = VCPU_TD_STATE_INITIALIZED;
+
+ return 0;
+}
+
+void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
+{
+ /*
+ * Yell on INIT, as TDX doesn't support INIT, i.e. KVM should drop all
+ * INIT events.
+ *
+ * Defer initializing vCPU for RESET state until KVM_TDX_INIT_VCPU, as
+ * userspace needs to define the vCPU model before KVM can initialize
+ * vCPU state, e.g. to enable x2APIC.
+ */
+ WARN_ON_ONCE(init_event);
+}
+
+struct tdx_gmem_post_populate_arg {
+ struct kvm_vcpu *vcpu;
+ __u32 flags;
+};
+
+static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
+ void __user *src, int order, void *_arg)
+{
+ u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS;
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ struct tdx_gmem_post_populate_arg *arg = _arg;
+ struct kvm_vcpu *vcpu = arg->vcpu;
+ gpa_t gpa = gfn_to_gpa(gfn);
+ u8 level = PG_LEVEL_4K;
+ struct page *src_page;
+ int ret, i;
+ u64 err, entry, level_state;
+
+ /*
+ * Get the source page if it has been faulted in. Return failure if the
+ * source page has been swapped out or unmapped in primary memory.
+ */
+ ret = get_user_pages_fast((unsigned long)src, 1, 0, &src_page);
+ if (ret < 0)
+ return ret;
+ if (ret != 1)
+ return -ENOMEM;
+
+ ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * The private mem cannot be zapped after kvm_tdp_map_page()
+ * because all paths are covered by slots_lock and the
+ * filemap invalidate lock. Check that they are indeed enough.
+ */
+ if (IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
+ scoped_guard(read_lock, &kvm->mmu_lock) {
+ if (KVM_BUG_ON(!kvm_tdp_mmu_gpa_is_mapped(vcpu, gpa), kvm)) {
+ ret = -EIO;
+ goto out;
+ }
+ }
+ }
+
+ ret = 0;
+ err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn),
+ src_page, &entry, &level_state);
+ if (err) {
+ ret = unlikely(tdx_operand_busy(err)) ? -EBUSY : -EIO;
+ goto out;
+ }
+
+ if (!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm))
+ atomic64_dec(&kvm_tdx->nr_premapped);
+
+ if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) {
+ for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
+ err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry,
+ &level_state);
+ if (err) {
+ ret = -EIO;
+ break;
+ }
+ }
+ }
+
+out:
+ put_page(src_page);
+ return ret;
+}
+
+static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ struct kvm_tdx_init_mem_region region;
+ struct tdx_gmem_post_populate_arg arg;
+ long gmem_ret;
+ int ret;
+
+ if (tdx->state != VCPU_TD_STATE_INITIALIZED)
+ return -EINVAL;
+
+ guard(mutex)(&kvm->slots_lock);
+
+ /* Once TD is finalized, the initial guest memory is fixed. */
+ if (kvm_tdx->state == TD_STATE_RUNNABLE)
+ return -EINVAL;
+
+ if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION)
+ return -EINVAL;
+
+ if (copy_from_user(&region, u64_to_user_ptr(cmd->data), sizeof(region)))
+ return -EFAULT;
+
+ if (!PAGE_ALIGNED(region.source_addr) || !PAGE_ALIGNED(region.gpa) ||
+ !region.nr_pages ||
+ region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa ||
+ !vt_is_tdx_private_gpa(kvm, region.gpa) ||
+ !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1))
+ return -EINVAL;
+
+ kvm_mmu_reload(vcpu);
+ ret = 0;
+ while (region.nr_pages) {
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+
+ arg = (struct tdx_gmem_post_populate_arg) {
+ .vcpu = vcpu,
+ .flags = cmd->flags,
+ };
+ gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa),
+ u64_to_user_ptr(region.source_addr),
+ 1, tdx_gmem_post_populate, &arg);
+ if (gmem_ret < 0) {
+ ret = gmem_ret;
+ break;
+ }
+
+ if (gmem_ret != 1) {
+ ret = -EIO;
+ break;
+ }
+
+ region.source_addr += PAGE_SIZE;
+ region.gpa += PAGE_SIZE;
+ region.nr_pages--;
+
+ cond_resched();
+ }
+
+ if (copy_to_user(u64_to_user_ptr(cmd->data), &region, sizeof(region)))
+ ret = -EFAULT;
+ return ret;
+}
+
+int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+ struct kvm_tdx_cmd cmd;
+ int ret;
+
+ if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
+ return -EINVAL;
+
+ if (copy_from_user(&cmd, argp, sizeof(cmd)))
+ return -EFAULT;
+
+ if (cmd.hw_error)
+ return -EINVAL;
+
+ switch (cmd.id) {
+ case KVM_TDX_INIT_VCPU:
+ ret = tdx_vcpu_init(vcpu, &cmd);
+ break;
+ case KVM_TDX_INIT_MEM_REGION:
+ ret = tdx_vcpu_init_mem_region(vcpu, &cmd);
+ break;
+ case KVM_TDX_GET_CPUID:
+ ret = tdx_vcpu_get_cpuid(vcpu, &cmd);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
+{
+ return PG_LEVEL_4K;
+}
+
+static int tdx_online_cpu(unsigned int cpu)
+{
+ unsigned long flags;
+ int r;
+
+ /* Sanity check CPU is already in post-VMXON */
+ WARN_ON_ONCE(!(cr4_read_shadow() & X86_CR4_VMXE));
+
+ local_irq_save(flags);
+ r = tdx_cpu_enable();
+ local_irq_restore(flags);
+
+ return r;
+}
+
+static int tdx_offline_cpu(unsigned int cpu)
+{
+ int i;
+
+ /* No TD is running. Allow any cpu to be offline. */
+ if (!atomic_read(&nr_configured_hkid))
+ return 0;
+
+ /*
+ * In order to reclaim TDX HKID, (i.e. when deleting guest TD), need to
+ * call TDH.PHYMEM.PAGE.WBINVD on all packages to program all memory
+ * controller with pconfig. If we have active TDX HKID, refuse to
+ * offline the last online cpu.
+ */
+ for_each_online_cpu(i) {
+ /*
+ * Found another online cpu on the same package.
+ * Allow to offline.
+ */
+ if (i != cpu && topology_physical_package_id(i) ==
+ topology_physical_package_id(cpu))
+ return 0;
+ }
+
+ /*
+ * This is the last cpu of this package. Don't offline it.
+ *
+ * Because it's hard for human operator to understand the
+ * reason, warn it.
+ */
+#define MSG_ALLPKG_ONLINE \
+ "TDX requires all packages to have an online CPU. Delete all TDs in order to offline all CPUs of a package.\n"
+ pr_warn_ratelimited(MSG_ALLPKG_ONLINE);
+ return -EBUSY;
+}
+
+static void __do_tdx_cleanup(void)
+{
+ /*
+ * Once TDX module is initialized, it cannot be disabled and
+ * re-initialized again w/o runtime update (which isn't
+ * supported by kernel). Only need to remove the cpuhp here.
+ * The TDX host core code tracks TDX status and can handle
+ * 'multiple enabling' scenario.
+ */
+ WARN_ON_ONCE(!tdx_cpuhp_state);
+ cpuhp_remove_state_nocalls_cpuslocked(tdx_cpuhp_state);
+ tdx_cpuhp_state = 0;
+}
+
+static void __tdx_cleanup(void)
+{
+ cpus_read_lock();
+ __do_tdx_cleanup();
+ cpus_read_unlock();
+}
+
+static int __init __do_tdx_bringup(void)
+{
+ int r;
+
+ /*
+ * TDX-specific cpuhp callback to call tdx_cpu_enable() on all
+ * online CPUs before calling tdx_enable(), and on any new
+ * going-online CPU to make sure it is ready for TDX guest.
+ */
+ r = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN,
+ "kvm/cpu/tdx:online",
+ tdx_online_cpu, tdx_offline_cpu);
+ if (r < 0)
+ return r;
+
+ tdx_cpuhp_state = r;
+
+ r = tdx_enable();
+ if (r)
+ __do_tdx_cleanup();
+
+ return r;
+}
+
+static int __init __tdx_bringup(void)
+{
+ const struct tdx_sys_info_td_conf *td_conf;
+ int r, i;
+
+ for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) {
+ /*
+ * Check if MSRs (tdx_uret_msrs) can be saved/restored
+ * before returning to user space.
+ *
+ * this_cpu_ptr(user_return_msrs)->registered isn't checked
+ * because the registration is done at vcpu runtime by
+ * tdx_user_return_msr_update_cache().
+ */
+ tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr);
+ if (tdx_uret_msrs[i].slot == -1) {
+ /* If any MSR isn't supported, it is a KVM bug */
+ pr_err("MSR %x isn't included by kvm_find_user_return_msr\n",
+ tdx_uret_msrs[i].msr);
+ return -EIO;
+ }
+ }
+
+ /*
+ * Enabling TDX requires enabling hardware virtualization first,
+ * as making SEAMCALLs requires CPU being in post-VMXON state.
+ */
+ r = kvm_enable_virtualization();
+ if (r)
+ return r;
+
+ cpus_read_lock();
+ r = __do_tdx_bringup();
+ cpus_read_unlock();
+
+ if (r)
+ goto tdx_bringup_err;
+
+ /* Get TDX global information for later use */
+ tdx_sysinfo = tdx_get_sysinfo();
+ if (WARN_ON_ONCE(!tdx_sysinfo)) {
+ r = -EINVAL;
+ goto get_sysinfo_err;
+ }
+
+ /* Check TDX module and KVM capabilities */
+ if (!tdx_get_supported_attrs(&tdx_sysinfo->td_conf) ||
+ !tdx_get_supported_xfam(&tdx_sysinfo->td_conf))
+ goto get_sysinfo_err;
+
+ if (!(tdx_sysinfo->features.tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM))
+ goto get_sysinfo_err;
+
+ /*
+ * TDX has its own limit of maximum vCPUs it can support for all
+ * TDX guests in addition to KVM_MAX_VCPUS. Userspace needs to
+ * query TDX guest's maximum vCPUs by checking KVM_CAP_MAX_VCPU
+ * extension on per-VM basis.
+ *
+ * TDX module reports such limit via the MAX_VCPU_PER_TD global
+ * metadata. Different modules may report different values.
+ * Some old module may also not support this metadata (in which
+ * case this limit is U16_MAX).
+ *
+ * In practice, the reported value reflects the maximum logical
+ * CPUs that ALL the platforms that the module supports can
+ * possibly have.
+ *
+ * Simply forwarding the MAX_VCPU_PER_TD to userspace could
+ * result in an unpredictable ABI. KVM instead always advertise
+ * the number of logical CPUs the platform has as the maximum
+ * vCPUs for TDX guests.
+ *
+ * Make sure MAX_VCPU_PER_TD reported by TDX module is not
+ * smaller than the number of logical CPUs, otherwise KVM will
+ * report an unsupported value to userspace.
+ *
+ * Note, a platform with TDX enabled in the BIOS cannot support
+ * physical CPU hotplug, and TDX requires the BIOS has marked
+ * all logical CPUs in MADT table as enabled. Just use
+ * num_present_cpus() for the number of logical CPUs.
+ */
+ td_conf = &tdx_sysinfo->td_conf;
+ if (td_conf->max_vcpus_per_td < num_present_cpus()) {
+ pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n",
+ td_conf->max_vcpus_per_td, num_present_cpus());
+ r = -EINVAL;
+ goto get_sysinfo_err;
+ }
+
+ if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids())) {
+ r = -EINVAL;
+ goto get_sysinfo_err;
+ }
+
+ /*
+ * Leave hardware virtualization enabled after TDX is enabled
+ * successfully. TDX CPU hotplug depends on this.
+ */
+ return 0;
+
+get_sysinfo_err:
+ __tdx_cleanup();
+tdx_bringup_err:
+ kvm_disable_virtualization();
+ return r;
+}
+
+void tdx_cleanup(void)
+{
+ if (enable_tdx) {
+ misc_cg_set_capacity(MISC_CG_RES_TDX, 0);
+ __tdx_cleanup();
+ kvm_disable_virtualization();
+ }
+}
+
+int __init tdx_bringup(void)
+{
+ int r, i;
+
+ /* tdx_disable_virtualization_cpu() uses associated_tdvcpus. */
+ for_each_possible_cpu(i)
+ INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, i));
+
+ if (!enable_tdx)
+ return 0;
+
+ if (!enable_ept) {
+ pr_err("EPT is required for TDX\n");
+ goto success_disable_tdx;
+ }
+
+ if (!tdp_mmu_enabled || !enable_mmio_caching || !enable_ept_ad_bits) {
+ pr_err("TDP MMU and MMIO caching and EPT A/D bit is required for TDX\n");
+ goto success_disable_tdx;
+ }
+
+ if (!enable_apicv) {
+ pr_err("APICv is required for TDX\n");
+ goto success_disable_tdx;
+ }
+
+ if (!cpu_feature_enabled(X86_FEATURE_OSXSAVE)) {
+ pr_err("tdx: OSXSAVE is required for TDX\n");
+ goto success_disable_tdx;
+ }
+
+ if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
+ pr_err("tdx: MOVDIR64B is required for TDX\n");
+ goto success_disable_tdx;
+ }
+
+ if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) {
+ pr_err("Self-snoop is required for TDX\n");
+ goto success_disable_tdx;
+ }
+
+ if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) {
+ pr_err("tdx: no TDX private KeyIDs available\n");
+ goto success_disable_tdx;
+ }
+
+ if (!enable_virt_at_load) {
+ pr_err("tdx: tdx requires kvm.enable_virt_at_load=1\n");
+ goto success_disable_tdx;
+ }
+
+ /*
+ * Ideally KVM should probe whether TDX module has been loaded
+ * first and then try to bring it up. But TDX needs to use SEAMCALL
+ * to probe whether the module is loaded (there is no CPUID or MSR
+ * for that), and making SEAMCALL requires enabling virtualization
+ * first, just like the rest steps of bringing up TDX module.
+ *
+ * So, for simplicity do everything in __tdx_bringup(); the first
+ * SEAMCALL will return -ENODEV when the module is not loaded. The
+ * only complication is having to make sure that initialization
+ * SEAMCALLs don't return TDX_SEAMCALL_VMFAILINVALID in other
+ * cases.
+ */
+ r = __tdx_bringup();
+ if (r) {
+ /*
+ * Disable TDX only but don't fail to load module if the TDX
+ * module could not be loaded. No need to print message saying
+ * "module is not loaded" because it was printed when the first
+ * SEAMCALL failed. Don't bother unwinding the S-EPT hooks or
+ * vm_size, as kvm_x86_ops have already been finalized (and are
+ * intentionally not exported). The S-EPT code is unreachable,
+ * and allocating a few more bytes per VM in a should-be-rare
+ * failure scenario is a non-issue.
+ */
+ if (r == -ENODEV)
+ goto success_disable_tdx;
+
+ enable_tdx = 0;
+ }
+
+ return r;
+
+success_disable_tdx:
+ enable_tdx = 0;
+ return 0;
+}
+
+void __init tdx_hardware_setup(void)
+{
+ KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_tdx);
+
+ /*
+ * Note, if the TDX module can't be loaded, KVM TDX support will be
+ * disabled but KVM will continue loading (see tdx_bringup()).
+ */
+ vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, sizeof(struct kvm_tdx));
+
+ vt_x86_ops.link_external_spt = tdx_sept_link_private_spt;
+ vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
+ vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
+ vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
+ vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt;
+}
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
new file mode 100644
index 000000000000..ca39a9391db1
--- /dev/null
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -0,0 +1,205 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_TDX_H
+#define __KVM_X86_VMX_TDX_H
+
+#include "tdx_arch.h"
+#include "tdx_errno.h"
+
+#ifdef CONFIG_KVM_INTEL_TDX
+#include "common.h"
+
+void tdx_hardware_setup(void);
+int tdx_bringup(void);
+void tdx_cleanup(void);
+
+extern bool enable_tdx;
+
+/* TDX module hardware states. These follow the TDX module OP_STATEs. */
+enum kvm_tdx_state {
+ TD_STATE_UNINITIALIZED = 0,
+ TD_STATE_INITIALIZED,
+ TD_STATE_RUNNABLE,
+};
+
+struct kvm_tdx {
+ struct kvm kvm;
+
+ struct misc_cg *misc_cg;
+ int hkid;
+ enum kvm_tdx_state state;
+
+ u64 attributes;
+ u64 xfam;
+
+ u64 tsc_offset;
+ u64 tsc_multiplier;
+
+ struct tdx_td td;
+
+ /* For KVM_TDX_INIT_MEM_REGION. */
+ atomic64_t nr_premapped;
+
+ /*
+ * Prevent vCPUs from TD entry to ensure SEPT zap related SEAMCALLs do
+ * not contend with tdh_vp_enter() and TDCALLs.
+ * Set/unset is protected with kvm->mmu_lock.
+ */
+ bool wait_for_sept_zap;
+};
+
+/* TDX module vCPU states */
+enum vcpu_tdx_state {
+ VCPU_TD_STATE_UNINITIALIZED = 0,
+ VCPU_TD_STATE_INITIALIZED,
+};
+
+struct vcpu_tdx {
+ struct kvm_vcpu vcpu;
+ struct vcpu_vt vt;
+ u64 ext_exit_qualification;
+ gpa_t exit_gpa;
+ struct tdx_module_args vp_enter_args;
+
+ struct tdx_vp vp;
+
+ struct list_head cpu_list;
+
+ u64 vp_enter_ret;
+
+ enum vcpu_tdx_state state;
+ bool guest_entered;
+
+ u64 map_gpa_next;
+ u64 map_gpa_end;
+};
+
+void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err);
+void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field,
+ u64 val, u64 err);
+
+static __always_inline u64 td_tdcs_exec_read64(struct kvm_tdx *kvm_tdx, u32 field)
+{
+ u64 err, data;
+
+ err = tdh_mng_rd(&kvm_tdx->td, TDCS_EXEC(field), &data);
+ if (unlikely(err)) {
+ pr_err("TDH_MNG_RD[EXEC.0x%x] failed: 0x%llx\n", field, err);
+ return 0;
+ }
+ return data;
+}
+
+static __always_inline void tdvps_vmcs_check(u32 field, u8 bits)
+{
+#define VMCS_ENC_ACCESS_TYPE_MASK 0x1UL
+#define VMCS_ENC_ACCESS_TYPE_FULL 0x0UL
+#define VMCS_ENC_ACCESS_TYPE_HIGH 0x1UL
+#define VMCS_ENC_ACCESS_TYPE(field) ((field) & VMCS_ENC_ACCESS_TYPE_MASK)
+
+ /* TDX is 64bit only. HIGH field isn't supported. */
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) &&
+ VMCS_ENC_ACCESS_TYPE(field) == VMCS_ENC_ACCESS_TYPE_HIGH,
+ "Read/Write to TD VMCS *_HIGH fields not supported");
+
+ BUILD_BUG_ON(bits != 16 && bits != 32 && bits != 64);
+
+#define VMCS_ENC_WIDTH_MASK GENMASK(14, 13)
+#define VMCS_ENC_WIDTH_16BIT (0UL << 13)
+#define VMCS_ENC_WIDTH_64BIT (1UL << 13)
+#define VMCS_ENC_WIDTH_32BIT (2UL << 13)
+#define VMCS_ENC_WIDTH_NATURAL (3UL << 13)
+#define VMCS_ENC_WIDTH(field) ((field) & VMCS_ENC_WIDTH_MASK)
+
+ /* TDX is 64bit only. i.e. natural width = 64bit. */
+ BUILD_BUG_ON_MSG(bits != 64 && __builtin_constant_p(field) &&
+ (VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_64BIT ||
+ VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_NATURAL),
+ "Invalid TD VMCS access for 64-bit field");
+ BUILD_BUG_ON_MSG(bits != 32 && __builtin_constant_p(field) &&
+ VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_32BIT,
+ "Invalid TD VMCS access for 32-bit field");
+ BUILD_BUG_ON_MSG(bits != 16 && __builtin_constant_p(field) &&
+ VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_16BIT,
+ "Invalid TD VMCS access for 16-bit field");
+}
+
+static __always_inline void tdvps_management_check(u64 field, u8 bits) {}
+static __always_inline void tdvps_state_non_arch_check(u64 field, u8 bits) {}
+
+#define TDX_BUILD_TDVPS_ACCESSORS(bits, uclass, lclass) \
+static __always_inline u##bits td_##lclass##_read##bits(struct vcpu_tdx *tdx, \
+ u32 field) \
+{ \
+ u64 err, data; \
+ \
+ tdvps_##lclass##_check(field, bits); \
+ err = tdh_vp_rd(&tdx->vp, TDVPS_##uclass(field), &data); \
+ if (unlikely(err)) { \
+ tdh_vp_rd_failed(tdx, #uclass, field, err); \
+ return 0; \
+ } \
+ return (u##bits)data; \
+} \
+static __always_inline void td_##lclass##_write##bits(struct vcpu_tdx *tdx, \
+ u32 field, u##bits val) \
+{ \
+ u64 err; \
+ \
+ tdvps_##lclass##_check(field, bits); \
+ err = tdh_vp_wr(&tdx->vp, TDVPS_##uclass(field), val, \
+ GENMASK_ULL(bits - 1, 0)); \
+ if (unlikely(err)) \
+ tdh_vp_wr_failed(tdx, #uclass, " = ", field, (u64)val, err); \
+} \
+static __always_inline void td_##lclass##_setbit##bits(struct vcpu_tdx *tdx, \
+ u32 field, u64 bit) \
+{ \
+ u64 err; \
+ \
+ tdvps_##lclass##_check(field, bits); \
+ err = tdh_vp_wr(&tdx->vp, TDVPS_##uclass(field), bit, bit); \
+ if (unlikely(err)) \
+ tdh_vp_wr_failed(tdx, #uclass, " |= ", field, bit, err); \
+} \
+static __always_inline void td_##lclass##_clearbit##bits(struct vcpu_tdx *tdx, \
+ u32 field, u64 bit) \
+{ \
+ u64 err; \
+ \
+ tdvps_##lclass##_check(field, bits); \
+ err = tdh_vp_wr(&tdx->vp, TDVPS_##uclass(field), 0, bit); \
+ if (unlikely(err)) \
+ tdh_vp_wr_failed(tdx, #uclass, " &= ~", field, bit, err);\
+}
+
+
+bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu);
+int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err);
+
+TDX_BUILD_TDVPS_ACCESSORS(16, VMCS, vmcs);
+TDX_BUILD_TDVPS_ACCESSORS(32, VMCS, vmcs);
+TDX_BUILD_TDVPS_ACCESSORS(64, VMCS, vmcs);
+
+TDX_BUILD_TDVPS_ACCESSORS(8, MANAGEMENT, management);
+TDX_BUILD_TDVPS_ACCESSORS(64, STATE_NON_ARCH, state_non_arch);
+
+#else
+static inline int tdx_bringup(void) { return 0; }
+static inline void tdx_cleanup(void) {}
+
+#define enable_tdx 0
+
+struct kvm_tdx {
+ struct kvm kvm;
+};
+
+struct vcpu_tdx {
+ struct kvm_vcpu vcpu;
+};
+
+static inline bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu) { return false; }
+static inline int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) { return 0; }
+
+#endif
+
+#endif
diff --git a/arch/x86/kvm/vmx/tdx_arch.h b/arch/x86/kvm/vmx/tdx_arch.h
new file mode 100644
index 000000000000..a30e880849e3
--- /dev/null
+++ b/arch/x86/kvm/vmx/tdx_arch.h
@@ -0,0 +1,167 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* architectural constants/data definitions for TDX SEAMCALLs */
+
+#ifndef __KVM_X86_TDX_ARCH_H
+#define __KVM_X86_TDX_ARCH_H
+
+#include <linux/types.h>
+
+/* TDX control structure (TDR/TDCS/TDVPS) field access codes */
+#define TDX_NON_ARCH BIT_ULL(63)
+#define TDX_CLASS_SHIFT 56
+#define TDX_FIELD_MASK GENMASK_ULL(31, 0)
+
+#define __BUILD_TDX_FIELD(non_arch, class, field) \
+ (((non_arch) ? TDX_NON_ARCH : 0) | \
+ ((u64)(class) << TDX_CLASS_SHIFT) | \
+ ((u64)(field) & TDX_FIELD_MASK))
+
+#define BUILD_TDX_FIELD(class, field) \
+ __BUILD_TDX_FIELD(false, (class), (field))
+
+#define BUILD_TDX_FIELD_NON_ARCH(class, field) \
+ __BUILD_TDX_FIELD(true, (class), (field))
+
+
+/* Class code for TD */
+#define TD_CLASS_EXECUTION_CONTROLS 17ULL
+
+/* Class code for TDVPS */
+#define TDVPS_CLASS_VMCS 0ULL
+#define TDVPS_CLASS_GUEST_GPR 16ULL
+#define TDVPS_CLASS_OTHER_GUEST 17ULL
+#define TDVPS_CLASS_MANAGEMENT 32ULL
+
+enum tdx_tdcs_execution_control {
+ TD_TDCS_EXEC_TSC_OFFSET = 10,
+ TD_TDCS_EXEC_TSC_MULTIPLIER = 11,
+};
+
+enum tdx_vcpu_guest_other_state {
+ TD_VCPU_STATE_DETAILS_NON_ARCH = 0x100,
+};
+
+#define TDX_VCPU_STATE_DETAILS_INTR_PENDING BIT_ULL(0)
+
+static inline bool tdx_vcpu_state_details_intr_pending(u64 vcpu_state_details)
+{
+ return !!(vcpu_state_details & TDX_VCPU_STATE_DETAILS_INTR_PENDING);
+}
+
+/* @field is any of enum tdx_tdcs_execution_control */
+#define TDCS_EXEC(field) BUILD_TDX_FIELD(TD_CLASS_EXECUTION_CONTROLS, (field))
+
+/* @field is the VMCS field encoding */
+#define TDVPS_VMCS(field) BUILD_TDX_FIELD(TDVPS_CLASS_VMCS, (field))
+
+/* @field is any of enum tdx_guest_other_state */
+#define TDVPS_STATE(field) BUILD_TDX_FIELD(TDVPS_CLASS_OTHER_GUEST, (field))
+#define TDVPS_STATE_NON_ARCH(field) BUILD_TDX_FIELD_NON_ARCH(TDVPS_CLASS_OTHER_GUEST, (field))
+
+/* Management class fields */
+enum tdx_vcpu_guest_management {
+ TD_VCPU_PEND_NMI = 11,
+};
+
+/* @field is any of enum tdx_vcpu_guest_management */
+#define TDVPS_MANAGEMENT(field) BUILD_TDX_FIELD(TDVPS_CLASS_MANAGEMENT, (field))
+
+#define TDX_EXTENDMR_CHUNKSIZE 256
+
+struct tdx_cpuid_value {
+ u32 eax;
+ u32 ebx;
+ u32 ecx;
+ u32 edx;
+} __packed;
+
+#define TDX_TD_ATTR_DEBUG BIT_ULL(0)
+#define TDX_TD_ATTR_SEPT_VE_DISABLE BIT_ULL(28)
+#define TDX_TD_ATTR_PKS BIT_ULL(30)
+#define TDX_TD_ATTR_KL BIT_ULL(31)
+#define TDX_TD_ATTR_PERFMON BIT_ULL(63)
+
+#define TDX_EXT_EXIT_QUAL_TYPE_MASK GENMASK(3, 0)
+#define TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION 6
+/*
+ * TD_PARAMS is provided as an input to TDH_MNG_INIT, the size of which is 1024B.
+ */
+struct td_params {
+ u64 attributes;
+ u64 xfam;
+ u16 max_vcpus;
+ u8 reserved0[6];
+
+ u64 eptp_controls;
+ u64 config_flags;
+ u16 tsc_frequency;
+ u8 reserved1[38];
+
+ u64 mrconfigid[6];
+ u64 mrowner[6];
+ u64 mrownerconfig[6];
+ u64 reserved2[4];
+
+ union {
+ DECLARE_FLEX_ARRAY(struct tdx_cpuid_value, cpuid_values);
+ u8 reserved3[768];
+ };
+} __packed __aligned(1024);
+
+/*
+ * Guest uses MAX_PA for GPAW when set.
+ * 0: GPA.SHARED bit is GPA[47]
+ * 1: GPA.SHARED bit is GPA[51]
+ */
+#define TDX_CONFIG_FLAGS_MAX_GPAW BIT_ULL(0)
+
+/*
+ * TDH.VP.ENTER, TDG.VP.VMCALL preserves RBP
+ * 0: RBP can be used for TDG.VP.VMCALL input. RBP is clobbered.
+ * 1: RBP can't be used for TDG.VP.VMCALL input. RBP is preserved.
+ */
+#define TDX_CONFIG_FLAGS_NO_RBP_MOD BIT_ULL(2)
+
+
+/*
+ * TDX requires the frequency to be defined in units of 25MHz, which is the
+ * frequency of the core crystal clock on TDX-capable platforms, i.e. the TDX
+ * module can only program frequencies that are multiples of 25MHz. The
+ * frequency must be between 100mhz and 10ghz (inclusive).
+ */
+#define TDX_TSC_KHZ_TO_25MHZ(tsc_in_khz) ((tsc_in_khz) / (25 * 1000))
+#define TDX_TSC_25MHZ_TO_KHZ(tsc_in_25mhz) ((tsc_in_25mhz) * (25 * 1000))
+#define TDX_MIN_TSC_FREQUENCY_KHZ (100 * 1000)
+#define TDX_MAX_TSC_FREQUENCY_KHZ (10 * 1000 * 1000)
+
+/* Additional Secure EPT entry information */
+#define TDX_SEPT_LEVEL_MASK GENMASK_ULL(2, 0)
+#define TDX_SEPT_STATE_MASK GENMASK_ULL(15, 8)
+#define TDX_SEPT_STATE_SHIFT 8
+
+enum tdx_sept_entry_state {
+ TDX_SEPT_FREE = 0,
+ TDX_SEPT_BLOCKED = 1,
+ TDX_SEPT_PENDING = 2,
+ TDX_SEPT_PENDING_BLOCKED = 3,
+ TDX_SEPT_PRESENT = 4,
+};
+
+static inline u8 tdx_get_sept_level(u64 sept_entry_info)
+{
+ return sept_entry_info & TDX_SEPT_LEVEL_MASK;
+}
+
+static inline u8 tdx_get_sept_state(u64 sept_entry_info)
+{
+ return (sept_entry_info & TDX_SEPT_STATE_MASK) >> TDX_SEPT_STATE_SHIFT;
+}
+
+#define MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM BIT_ULL(20)
+
+/*
+ * TD scope metadata field ID.
+ */
+#define TD_MD_FIELD_ID_CPUID_VALUES 0x9410000300000000ULL
+
+#endif /* __KVM_X86_TDX_ARCH_H */
diff --git a/arch/x86/kvm/vmx/tdx_errno.h b/arch/x86/kvm/vmx/tdx_errno.h
new file mode 100644
index 000000000000..6ff4672c4181
--- /dev/null
+++ b/arch/x86/kvm/vmx/tdx_errno.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* architectural status code for SEAMCALL */
+
+#ifndef __KVM_X86_TDX_ERRNO_H
+#define __KVM_X86_TDX_ERRNO_H
+
+#define TDX_SEAMCALL_STATUS_MASK 0xFFFFFFFF00000000ULL
+
+/*
+ * TDX SEAMCALL Status Codes (returned in RAX)
+ */
+#define TDX_NON_RECOVERABLE_VCPU 0x4000000100000000ULL
+#define TDX_NON_RECOVERABLE_TD 0x4000000200000000ULL
+#define TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE 0x6000000500000000ULL
+#define TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE 0x6000000700000000ULL
+#define TDX_INTERRUPTED_RESUMABLE 0x8000000300000000ULL
+#define TDX_OPERAND_INVALID 0xC000010000000000ULL
+#define TDX_OPERAND_BUSY 0x8000020000000000ULL
+#define TDX_PREVIOUS_TLB_EPOCH_BUSY 0x8000020100000000ULL
+#define TDX_PAGE_METADATA_INCORRECT 0xC000030000000000ULL
+#define TDX_VCPU_NOT_ASSOCIATED 0x8000070200000000ULL
+#define TDX_KEY_GENERATION_FAILED 0x8000080000000000ULL
+#define TDX_KEY_STATE_INCORRECT 0xC000081100000000ULL
+#define TDX_KEY_CONFIGURED 0x0000081500000000ULL
+#define TDX_NO_HKID_READY_TO_WBCACHE 0x0000082100000000ULL
+#define TDX_FLUSHVP_NOT_DONE 0x8000082400000000ULL
+#define TDX_EPT_WALK_FAILED 0xC0000B0000000000ULL
+#define TDX_EPT_ENTRY_STATE_INCORRECT 0xC0000B0D00000000ULL
+#define TDX_METADATA_FIELD_NOT_READABLE 0xC0000C0200000000ULL
+
+/*
+ * TDX module operand ID, appears in 31:0 part of error code as
+ * detail information
+ */
+#define TDX_OPERAND_ID_RCX 0x01
+#define TDX_OPERAND_ID_TDR 0x80
+#define TDX_OPERAND_ID_SEPT 0x92
+#define TDX_OPERAND_ID_TD_EPOCH 0xa9
+
+#endif /* __KVM_X86_TDX_ERRNO_H */
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index f6986dee6f8c..0a6cf5bff2aa 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -59,8 +59,7 @@
* without the explicit restore, thinks the stack is getting walloped.
* Using an unwind hint is problematic due to x86-64's dynamic alignment.
*/
- mov %_ASM_BP, %_ASM_SP
- pop %_ASM_BP
+ leave
RET
.endm
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 157c23db22be..aa157fe5b7b3 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -54,6 +54,7 @@
#include <trace/events/ipi.h>
#include "capabilities.h"
+#include "common.h"
#include "cpuid.h"
#include "hyperv.h"
#include "kvm_onhyperv.h"
@@ -74,6 +75,8 @@
#include "vmx_onhyperv.h"
#include "posted_intr.h"
+#include "mmu/spte.h"
+
MODULE_AUTHOR("Qumranet");
MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions");
MODULE_LICENSE("GPL");
@@ -112,10 +115,10 @@ static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, 0444);
module_param(enable_apicv, bool, 0444);
-
-bool __read_mostly enable_ipiv = true;
module_param(enable_ipiv, bool, 0444);
+module_param(enable_device_posted_irqs, bool, 0444);
+
/*
* If nested=1, nested virtualization is supported, i.e., guests may use
* VMX and be a hypervisor for its own guests. If nested=0, guests may not
@@ -165,31 +168,6 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
RTIT_STATUS_BYTECNT))
/*
- * List of MSRs that can be directly passed to the guest.
- * In addition to these x2apic, PT and LBR MSRs are handled specially.
- */
-static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
- MSR_IA32_SPEC_CTRL,
- MSR_IA32_PRED_CMD,
- MSR_IA32_FLUSH_CMD,
- MSR_IA32_TSC,
-#ifdef CONFIG_X86_64
- MSR_FS_BASE,
- MSR_GS_BASE,
- MSR_KERNEL_GS_BASE,
- MSR_IA32_XFD,
- MSR_IA32_XFD_ERR,
-#endif
- MSR_IA32_SYSENTER_CS,
- MSR_IA32_SYSENTER_ESP,
- MSR_IA32_SYSENTER_EIP,
- MSR_CORE_C1_RES,
- MSR_CORE_C3_RESIDENCY,
- MSR_CORE_C6_RESIDENCY,
- MSR_CORE_C7_RESIDENCY,
-};
-
-/*
* These 2 parameters are used to config the controls for Pause-Loop Exiting:
* ple_gap: upper bound on the amount of time between two successive
* executions of PAUSE in a loop. Also indicate if ple enabled.
@@ -671,40 +649,6 @@ static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
return flexpriority_enabled && lapic_in_kernel(vcpu);
}
-static int vmx_get_passthrough_msr_slot(u32 msr)
-{
- int i;
-
- switch (msr) {
- case 0x800 ... 0x8ff:
- /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */
- return -ENOENT;
- case MSR_IA32_RTIT_STATUS:
- case MSR_IA32_RTIT_OUTPUT_BASE:
- case MSR_IA32_RTIT_OUTPUT_MASK:
- case MSR_IA32_RTIT_CR3_MATCH:
- case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
- /* PT MSRs. These are handled in pt_update_intercept_for_msr() */
- case MSR_LBR_SELECT:
- case MSR_LBR_TOS:
- case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31:
- case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31:
- case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31:
- case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8:
- case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8:
- /* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */
- return -ENOENT;
- }
-
- for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
- if (vmx_possible_passthrough_msrs[i] == msr)
- return i;
- }
-
- WARN(1, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr);
- return -ENOENT;
-}
-
struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
{
int i;
@@ -771,8 +715,11 @@ void vmx_emergency_disable_virtualization_cpu(void)
return;
list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
- loaded_vmcss_on_cpu_link)
+ loaded_vmcss_on_cpu_link) {
vmcs_clear(v->vmcs);
+ if (v->shadow_vmcs)
+ vmcs_clear(v->shadow_vmcs);
+ }
kvm_cpu_vmxoff();
}
@@ -957,6 +904,10 @@ unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))
flags |= VMX_RUN_SAVE_SPEC_CTRL;
+ if (static_branch_unlikely(&cpu_buf_vm_clear) &&
+ kvm_vcpu_can_access_host_mmio(&vmx->vcpu))
+ flags |= VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO;
+
return flags;
}
@@ -1283,6 +1234,7 @@ void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vcpu_vt *vt = to_vt(vcpu);
struct vmcs_host_state *host_state;
#ifdef CONFIG_X86_64
int cpu = raw_smp_processor_id();
@@ -1311,7 +1263,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
if (vmx->nested.need_vmcs12_to_shadow_sync)
nested_sync_vmcs12_to_shadow(vcpu);
- if (vmx->guest_state_loaded)
+ if (vt->guest_state_loaded)
return;
host_state = &vmx->loaded_vmcs->host_state;
@@ -1332,12 +1284,12 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
fs_sel = current->thread.fsindex;
gs_sel = current->thread.gsindex;
fs_base = current->thread.fsbase;
- vmx->msr_host_kernel_gs_base = current->thread.gsbase;
+ vt->msr_host_kernel_gs_base = current->thread.gsbase;
} else {
savesegment(fs, fs_sel);
savesegment(gs, gs_sel);
fs_base = read_msr(MSR_FS_BASE);
- vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
+ vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
}
wrmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
@@ -1349,14 +1301,14 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
#endif
vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
- vmx->guest_state_loaded = true;
+ vt->guest_state_loaded = true;
}
static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
{
struct vmcs_host_state *host_state;
- if (!vmx->guest_state_loaded)
+ if (!vmx->vt.guest_state_loaded)
return;
host_state = &vmx->loaded_vmcs->host_state;
@@ -1384,10 +1336,10 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
#endif
invalidate_tss_limit();
#ifdef CONFIG_X86_64
- wrmsrq(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+ wrmsrq(MSR_KERNEL_GS_BASE, vmx->vt.msr_host_kernel_gs_base);
#endif
load_fixmap_gdt(raw_smp_processor_id());
- vmx->guest_state_loaded = false;
+ vmx->vt.guest_state_loaded = false;
vmx->guest_uret_msrs_loaded = false;
}
@@ -1395,7 +1347,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
{
preempt_disable();
- if (vmx->guest_state_loaded)
+ if (vmx->vt.guest_state_loaded)
rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
preempt_enable();
return vmx->msr_guest_kernel_gs_base;
@@ -1404,7 +1356,7 @@ static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
{
preempt_disable();
- if (vmx->guest_state_loaded)
+ if (vmx->vt.guest_state_loaded)
wrmsrq(MSR_KERNEL_GS_BASE, data);
preempt_enable();
vmx->msr_guest_kernel_gs_base = data;
@@ -1443,8 +1395,7 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
}
}
-void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
- struct loaded_vmcs *buddy)
+void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
@@ -1471,17 +1422,6 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
if (prev != vmx->loaded_vmcs->vmcs) {
per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
vmcs_load(vmx->loaded_vmcs->vmcs);
-
- /*
- * No indirect branch prediction barrier needed when switching
- * the active VMCS within a vCPU, unless IBRS is advertised to
- * the vCPU. To minimize the number of IBPBs executed, KVM
- * performs IBPB on nested VM-Exit (a single nested transition
- * may switch the active VMCS multiple times).
- */
- if (static_branch_likely(&switch_vcpu_ibpb) &&
- (!buddy || WARN_ON_ONCE(buddy->vmcs != prev)))
- indirect_branch_prediction_barrier();
}
if (!already_loaded) {
@@ -1520,7 +1460,7 @@ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
shrink_ple_window(vcpu);
- vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
+ vmx_vcpu_load_vmcs(vcpu, cpu);
vmx_vcpu_pi_load(vcpu, cpu);
}
@@ -1581,7 +1521,7 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
vmcs_writel(GUEST_RFLAGS, rflags);
if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
- vmx->emulation_required = vmx_emulation_required(vcpu);
+ vmx->vt.emulation_required = vmx_emulation_required(vcpu);
}
bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
@@ -1701,7 +1641,7 @@ int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
* so that guest userspace can't DoS the guest simply by triggering
* emulation (enclaves are CPL3 only).
*/
- if (to_vmx(vcpu)->exit_reason.enclave_mode) {
+ if (vmx_get_exit_reason(vcpu).enclave_mode) {
kvm_queue_exception(vcpu, UD_VECTOR);
return X86EMUL_PROPAGATE_FAULT;
}
@@ -1716,7 +1656,7 @@ int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
- union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason;
+ union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
unsigned long rip, orig_rip;
u32 instr_len;
@@ -1863,7 +1803,7 @@ void vmx_inject_exception(struct kvm_vcpu *vcpu)
return;
}
- WARN_ON_ONCE(vmx->emulation_required);
+ WARN_ON_ONCE(vmx->vt.emulation_required);
if (kvm_exception_is_soft(ex->vector)) {
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
@@ -2154,7 +2094,7 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
break;
case MSR_IA32_DEBUGCTLMSR:
- msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ msr_info->data = vmx_guest_debugctl_read();
break;
default:
find_uret_msr:
@@ -2179,7 +2119,7 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
return (unsigned long)data;
}
-static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
+u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
{
u64 debugctl = 0;
@@ -2191,9 +2131,25 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
(host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
+ if (boot_cpu_has(X86_FEATURE_RTM) &&
+ (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_RTM)))
+ debugctl |= DEBUGCTLMSR_RTM_DEBUG;
+
return debugctl;
}
+bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated)
+{
+ u64 invalid;
+
+ invalid = data & ~vmx_get_supported_debugctl(vcpu, host_initiated);
+ if (invalid & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) {
+ kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
+ invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR);
+ }
+ return !invalid;
+}
+
/*
* Writes msr value into the appropriate "register".
* Returns 0 on success, non-0 otherwise.
@@ -2262,29 +2218,22 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
vmcs_writel(GUEST_SYSENTER_ESP, data);
break;
- case MSR_IA32_DEBUGCTLMSR: {
- u64 invalid;
-
- invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
- if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
- kvm_pr_unimpl_wrmsr(vcpu, msr_index, data);
- data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
- invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
- }
-
- if (invalid)
+ case MSR_IA32_DEBUGCTLMSR:
+ if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated))
return 1;
+ data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
+
if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
VM_EXIT_SAVE_DEBUG_CONTROLS)
get_vmcs12(vcpu)->guest_ia32_debugctl = data;
- vmcs_write64(GUEST_IA32_DEBUGCTL, data);
+ vmx_guest_debugctl_write(vcpu, data);
+
if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
(data & DEBUGCTLMSR_LBR))
intel_pmu_create_guest_lbr_event(vcpu);
return 0;
- }
case MSR_IA32_BNDCFGS:
if (!kvm_mpx_supported() ||
(!msr_info->host_initiated &&
@@ -3406,7 +3355,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
}
/* depends on vcpu->arch.cr0 to be set to a new value */
- vmx->emulation_required = vmx_emulation_required(vcpu);
+ vmx->vt.emulation_required = vmx_emulation_required(vcpu);
}
static int vmx_get_max_ept_level(void)
@@ -3669,7 +3618,7 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
{
__vmx_set_segment(vcpu, var, seg);
- to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu);
+ to_vmx(vcpu)->vt.emulation_required = vmx_emulation_required(vcpu);
}
void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
@@ -4018,76 +3967,29 @@ static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
vmx->nested.force_msr_bitmap_recalc = true;
}
-void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
+void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
- int idx;
if (!cpu_has_vmx_msr_bitmap())
return;
vmx_msr_bitmap_l01_changed(vmx);
- /*
- * Mark the desired intercept state in shadow bitmap, this is needed
- * for resync when the MSR filters change.
- */
- idx = vmx_get_passthrough_msr_slot(msr);
- if (idx >= 0) {
- if (type & MSR_TYPE_R)
- clear_bit(idx, vmx->shadow_msr_intercept.read);
- if (type & MSR_TYPE_W)
- clear_bit(idx, vmx->shadow_msr_intercept.write);
- }
-
- if ((type & MSR_TYPE_R) &&
- !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) {
- vmx_set_msr_bitmap_read(msr_bitmap, msr);
- type &= ~MSR_TYPE_R;
- }
-
- if ((type & MSR_TYPE_W) &&
- !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) {
- vmx_set_msr_bitmap_write(msr_bitmap, msr);
- type &= ~MSR_TYPE_W;
+ if (type & MSR_TYPE_R) {
+ if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
+ vmx_clear_msr_bitmap_read(msr_bitmap, msr);
+ else
+ vmx_set_msr_bitmap_read(msr_bitmap, msr);
}
- if (type & MSR_TYPE_R)
- vmx_clear_msr_bitmap_read(msr_bitmap, msr);
-
- if (type & MSR_TYPE_W)
- vmx_clear_msr_bitmap_write(msr_bitmap, msr);
-}
-
-void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
- int idx;
-
- if (!cpu_has_vmx_msr_bitmap())
- return;
-
- vmx_msr_bitmap_l01_changed(vmx);
-
- /*
- * Mark the desired intercept state in shadow bitmap, this is needed
- * for resync when the MSR filter changes.
- */
- idx = vmx_get_passthrough_msr_slot(msr);
- if (idx >= 0) {
- if (type & MSR_TYPE_R)
- set_bit(idx, vmx->shadow_msr_intercept.read);
- if (type & MSR_TYPE_W)
- set_bit(idx, vmx->shadow_msr_intercept.write);
+ if (type & MSR_TYPE_W) {
+ if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
+ vmx_clear_msr_bitmap_write(msr_bitmap, msr);
+ else
+ vmx_set_msr_bitmap_write(msr_bitmap, msr);
}
-
- if (type & MSR_TYPE_R)
- vmx_set_msr_bitmap_read(msr_bitmap, msr);
-
- if (type & MSR_TYPE_W)
- vmx_set_msr_bitmap_write(msr_bitmap, msr);
}
static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
@@ -4166,79 +4068,57 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
}
}
-void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
+void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 i;
-
if (!cpu_has_vmx_msr_bitmap())
return;
- /*
- * Redo intercept permissions for MSRs that KVM is passing through to
- * the guest. Disabling interception will check the new MSR filter and
- * ensure that KVM enables interception if usersepace wants to filter
- * the MSR. MSRs that KVM is already intercepting don't need to be
- * refreshed since KVM is going to intercept them regardless of what
- * userspace wants.
- */
- for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
- u32 msr = vmx_possible_passthrough_msrs[i];
-
- if (!test_bit(i, vmx->shadow_msr_intercept.read))
- vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R);
-
- if (!test_bit(i, vmx->shadow_msr_intercept.write))
- vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
+#ifdef CONFIG_X86_64
+ vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+#endif
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
+ if (kvm_cstate_in_guest(vcpu->kvm)) {
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
+ }
+ if (kvm_aperfmperf_in_guest(vcpu->kvm)) {
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R);
}
/* PT MSRs can be passed through iff PT is exposed to the guest. */
if (vmx_pt_mode_is_host_guest())
pt_update_intercept_for_msr(vcpu);
-}
-static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
- int pi_vec)
-{
-#ifdef CONFIG_SMP
- if (vcpu->mode == IN_GUEST_MODE) {
- /*
- * The vector of the virtual has already been set in the PIR.
- * Send a notification event to deliver the virtual interrupt
- * unless the vCPU is the currently running vCPU, i.e. the
- * event is being sent from a fastpath VM-Exit handler, in
- * which case the PIR will be synced to the vIRR before
- * re-entering the guest.
- *
- * When the target is not the running vCPU, the following
- * possibilities emerge:
- *
- * Case 1: vCPU stays in non-root mode. Sending a notification
- * event posts the interrupt to the vCPU.
- *
- * Case 2: vCPU exits to root mode and is still runnable. The
- * PIR will be synced to the vIRR before re-entering the guest.
- * Sending a notification event is ok as the host IRQ handler
- * will ignore the spurious event.
- *
- * Case 3: vCPU exits to root mode and is blocked. vcpu_block()
- * has already synced PIR to vIRR and never blocks the vCPU if
- * the vIRR is not empty. Therefore, a blocked vCPU here does
- * not wait for any requested interrupts in PIR, and sending a
- * notification event also results in a benign, spurious event.
- */
+ if (vcpu->arch.xfd_no_write_intercept)
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, MSR_TYPE_RW);
+
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
+ !to_vmx(vcpu)->spec_ctrl);
+
+ if (kvm_cpu_cap_has(X86_FEATURE_XFD))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD));
+
+ if (cpu_feature_enabled(X86_FEATURE_IBPB))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
+ !guest_has_pred_cmd_msr(vcpu));
+
+ if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
- if (vcpu != kvm_get_running_vcpu())
- __apic_send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
- return;
- }
-#endif
/*
- * The vCPU isn't in the guest; wake the vCPU in case it is blocking,
- * otherwise do nothing as KVM will grab the highest priority pending
- * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest().
+ * x2APIC and LBR MSR intercepts are modified on-demand and cannot be
+ * filtered by userspace.
*/
- kvm_vcpu_wake_up(vcpu);
}
static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
@@ -4289,7 +4169,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
*/
static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vcpu_vt *vt = to_vt(vcpu);
int r;
r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
@@ -4300,20 +4180,7 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
if (!vcpu->arch.apic->apicv_active)
return -1;
- if (pi_test_and_set_pir(vector, &vmx->pi_desc))
- return 0;
-
- /* If a previous notification has sent the IPI, nothing to do. */
- if (pi_test_and_set_on(&vmx->pi_desc))
- return 0;
-
- /*
- * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*()
- * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is
- * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a
- * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE.
- */
- kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR);
+ __vmx_deliver_posted_interrupt(vcpu, &vt->pi_desc, vector);
return 0;
}
@@ -4780,7 +4647,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
vmcs_write16(GUEST_INTR_STATUS, 0);
vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
- vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
+ vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->vt.pi_desc)));
}
if (vmx_can_use_ipiv(&vmx->vcpu)) {
@@ -4852,7 +4719,8 @@ static void init_vmcs(struct vcpu_vmx *vmx)
vmcs_write32(GUEST_SYSENTER_CS, 0);
vmcs_writel(GUEST_SYSENTER_ESP, 0);
vmcs_writel(GUEST_SYSENTER_EIP, 0);
- vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+
+ vmx_guest_debugctl_write(&vmx->vcpu, 0);
if (cpu_has_vmx_tpr_shadow()) {
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
@@ -4893,8 +4761,8 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
* Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
* or POSTED_INTR_WAKEUP_VECTOR.
*/
- vmx->pi_desc.nv = POSTED_INTR_VECTOR;
- __pi_set_sn(&vmx->pi_desc);
+ vmx->vt.pi_desc.nv = POSTED_INTR_VECTOR;
+ __pi_set_sn(&vmx->vt.pi_desc);
}
void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -5668,12 +5536,6 @@ void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
set_debugreg(DR6_RESERVED, 6);
}
-void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
-{
- lockdep_assert_irqs_disabled();
- set_debugreg(vcpu->arch.dr6, 6);
-}
-
void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
{
vmcs_writel(GUEST_DR7, val);
@@ -5811,11 +5673,8 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
static int handle_ept_violation(struct kvm_vcpu *vcpu)
{
- unsigned long exit_qualification;
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
gpa_t gpa;
- u64 error_code;
-
- exit_qualification = vmx_get_exit_qual(vcpu);
/*
* EPT violation happened while executing iret from NMI,
@@ -5831,23 +5690,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
trace_kvm_page_fault(vcpu, gpa, exit_qualification);
- /* Is it a read fault? */
- error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
- ? PFERR_USER_MASK : 0;
- /* Is it a write fault? */
- error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
- ? PFERR_WRITE_MASK : 0;
- /* Is it a fetch fault? */
- error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
- ? PFERR_FETCH_MASK : 0;
- /* ept page table entry is present? */
- error_code |= (exit_qualification & EPT_VIOLATION_PROT_MASK)
- ? PFERR_PRESENT_MASK : 0;
-
- if (error_code & EPT_VIOLATION_GVA_IS_VALID)
- error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ?
- PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
-
/*
* Check that the GPA doesn't exceed physical memory limits, as that is
* a guest page fault. We have to emulate the instruction here, because
@@ -5859,7 +5701,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa)))
return kvm_emulate_instruction(vcpu, 0);
- return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
+ return __vmx_handle_ept_violation(vcpu, gpa, exit_qualification);
}
static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
@@ -5904,7 +5746,7 @@ static bool vmx_unhandleable_emulation_required(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (!vmx->emulation_required)
+ if (!vmx->vt.emulation_required)
return false;
/*
@@ -5936,7 +5778,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
intr_window_requested = exec_controls_get(vmx) &
CPU_BASED_INTR_WINDOW_EXITING;
- while (vmx->emulation_required && count-- != 0) {
+ while (vmx->vt.emulation_required && count-- != 0) {
if (intr_window_requested && !vmx_interrupt_blocked(vcpu))
return handle_interrupt_window(&vmx->vcpu);
@@ -6131,7 +5973,7 @@ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
* VM-Exits. Unconditionally set the flag here and leave the handling to
* vmx_handle_exit().
*/
- to_vmx(vcpu)->exit_reason.bus_lock_detected = true;
+ to_vt(vcpu)->exit_reason.bus_lock_detected = true;
return 1;
}
@@ -6229,9 +6071,9 @@ void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- *reason = vmx->exit_reason.full;
+ *reason = vmx->vt.exit_reason.full;
*info1 = vmx_get_exit_qual(vcpu);
- if (!(vmx->exit_reason.failed_vmentry)) {
+ if (!(vmx->vt.exit_reason.failed_vmentry)) {
*info2 = vmx->idt_vectoring_info;
*intr_info = vmx_get_intr_info(vcpu);
if (is_exception_with_error_code(*intr_info))
@@ -6527,7 +6369,7 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- union vmx_exit_reason exit_reason = vmx->exit_reason;
+ union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
u32 vectoring_info = vmx->idt_vectoring_info;
u16 exit_handler_index;
@@ -6583,7 +6425,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
* the least awful solution for the userspace case without
* risking false positives.
*/
- if (vmx->emulation_required) {
+ if (vmx->vt.emulation_required) {
nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
return 1;
}
@@ -6593,7 +6435,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
}
/* If guest state is invalid, start emulating. L2 is handled above. */
- if (vmx->emulation_required)
+ if (vmx->vt.emulation_required)
return handle_invalid_guest_state(vcpu);
if (exit_reason.failed_vmentry) {
@@ -6693,7 +6535,7 @@ int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
* Exit to user space when bus lock detected to inform that there is
* a bus lock in guest.
*/
- if (to_vmx(vcpu)->exit_reason.bus_lock_detected) {
+ if (vmx_get_exit_reason(vcpu).bus_lock_detected) {
if (ret > 0)
vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
@@ -6972,22 +6814,22 @@ static void vmx_set_rvi(int vector)
int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vcpu_vt *vt = to_vt(vcpu);
int max_irr;
bool got_posted_interrupt;
if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
return -EIO;
- if (pi_test_on(&vmx->pi_desc)) {
- pi_clear_on(&vmx->pi_desc);
+ if (pi_test_on(&vt->pi_desc)) {
+ pi_clear_on(&vt->pi_desc);
/*
* IOMMU can write to PID.ON, so the barrier matters even on UP.
* But on x86 this is just a compiler barrier anyway.
*/
smp_mb__after_atomic();
got_posted_interrupt =
- kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
+ kvm_apic_update_irr(vcpu, vt->pi_desc.pir, &max_irr);
} else {
max_irr = kvm_lapic_find_highest_irr(vcpu);
got_posted_interrupt = false;
@@ -7027,14 +6869,6 @@ void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
}
-void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- pi_clear_on(&vmx->pi_desc);
- memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
-}
-
void vmx_do_interrupt_irqoff(unsigned long entry);
void vmx_do_nmi_irqoff(void);
@@ -7091,14 +6925,12 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu,
void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- if (vmx->emulation_required)
+ if (to_vt(vcpu)->emulation_required)
return;
- if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
+ if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXTERNAL_INTERRUPT)
handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu));
- else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
+ else if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXCEPTION_NMI)
handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu));
}
@@ -7333,10 +7165,10 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu,
* the fastpath even, all other exits must use the slow path.
*/
if (is_guest_mode(vcpu) &&
- to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_PREEMPTION_TIMER)
+ vmx_get_exit_reason(vcpu).basic != EXIT_REASON_PREEMPTION_TIMER)
return EXIT_FASTPATH_NONE;
- switch (to_vmx(vcpu)->exit_reason.basic) {
+ switch (vmx_get_exit_reason(vcpu).basic) {
case EXIT_REASON_MSR_WRITE:
return handle_fastpath_set_msr_irqoff(vcpu);
case EXIT_REASON_PREEMPTION_TIMER:
@@ -7348,6 +7180,20 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu,
}
}
+noinstr void vmx_handle_nmi(struct kvm_vcpu *vcpu)
+{
+ if ((u16)vmx_get_exit_reason(vcpu).basic != EXIT_REASON_EXCEPTION_NMI ||
+ !is_nmi(vmx_get_intr_info(vcpu)))
+ return;
+
+ kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
+ if (cpu_feature_enabled(X86_FEATURE_FRED))
+ fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
+ else
+ vmx_do_nmi_irqoff();
+ kvm_after_interrupt(vcpu);
+}
+
static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
unsigned int flags)
{
@@ -7368,8 +7214,8 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
if (static_branch_unlikely(&vmx_l1d_should_flush))
vmx_l1d_flush(vcpu);
else if (static_branch_unlikely(&cpu_buf_vm_clear) &&
- kvm_arch_has_assigned_device(vcpu->kvm))
- mds_clear_cpu_buffers();
+ (flags & VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO))
+ x86_clear_cpu_buffers();
vmx_disable_fb_clear(vmx);
@@ -7387,30 +7233,23 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
vmx_enable_fb_clear(vmx);
if (unlikely(vmx->fail)) {
- vmx->exit_reason.full = 0xdead;
+ vmx->vt.exit_reason.full = 0xdead;
goto out;
}
- vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
- if (likely(!vmx->exit_reason.failed_vmentry))
+ vmx->vt.exit_reason.full = vmcs_read32(VM_EXIT_REASON);
+ if (likely(!vmx_get_exit_reason(vcpu).failed_vmentry))
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
- if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI &&
- is_nmi(vmx_get_intr_info(vcpu))) {
- kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
- if (cpu_feature_enabled(X86_FEATURE_FRED))
- fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
- else
- vmx_do_nmi_irqoff();
- kvm_after_interrupt(vcpu);
- }
+ vmx_handle_nmi(vcpu);
out:
guest_state_exit_irqoff();
}
-fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
{
+ bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long cr3, cr4;
@@ -7424,15 +7263,15 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
* start emulation until we arrive back to a valid state. Synthesize a
* consistency check VM-Exit due to invalid guest state and bail.
*/
- if (unlikely(vmx->emulation_required)) {
+ if (unlikely(vmx->vt.emulation_required)) {
vmx->fail = 0;
- vmx->exit_reason.full = EXIT_REASON_INVALID_STATE;
- vmx->exit_reason.failed_vmentry = 1;
+ vmx->vt.exit_reason.full = EXIT_REASON_INVALID_STATE;
+ vmx->vt.exit_reason.failed_vmentry = 1;
kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1);
- vmx->exit_qualification = ENTRY_FAIL_DEFAULT;
+ vmx->vt.exit_qualification = ENTRY_FAIL_DEFAULT;
kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2);
- vmx->exit_intr_info = 0;
+ vmx->vt.exit_intr_info = 0;
return EXIT_FASTPATH_NONE;
}
@@ -7455,6 +7294,12 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
vcpu->arch.regs_dirty = 0;
+ if (run_flags & KVM_RUN_LOAD_GUEST_DR6)
+ set_debugreg(vcpu->arch.dr6, 6);
+
+ if (run_flags & KVM_RUN_LOAD_DEBUGCTL)
+ vmx_reload_guest_debugctl(vcpu);
+
/*
* Refresh vmcs.HOST_CR3 if necessary. This must be done immediately
* prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
@@ -7535,7 +7380,7 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
* checking.
*/
if (vmx->nested.nested_run_pending &&
- !vmx->exit_reason.failed_vmentry)
+ !vmx_get_exit_reason(vcpu).failed_vmentry)
++vcpu->stat.nested_run;
vmx->nested.nested_run_pending = 0;
@@ -7544,12 +7389,12 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
if (unlikely(vmx->fail))
return EXIT_FASTPATH_NONE;
- if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
+ if (unlikely((u16)vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY))
kvm_machine_check();
trace_kvm_exit(vcpu, KVM_ISA_VMX);
- if (unlikely(vmx->exit_reason.failed_vmentry))
+ if (unlikely(vmx_get_exit_reason(vcpu).failed_vmentry))
return EXIT_FASTPATH_NONE;
vmx->loaded_vmcs->launched = 1;
@@ -7581,7 +7426,7 @@ int vmx_vcpu_create(struct kvm_vcpu *vcpu)
BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
vmx = to_vmx(vcpu);
- INIT_LIST_HEAD(&vmx->pi_wakeup_list);
+ INIT_LIST_HEAD(&vmx->vt.pi_wakeup_list);
err = -ENOMEM;
@@ -7629,26 +7474,6 @@ int vmx_vcpu_create(struct kvm_vcpu *vcpu)
evmcs->hv_enlightenments_control.msr_bitmap = 1;
}
- /* The MSR bitmap starts with all ones */
- bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
- bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
-
- vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
-#ifdef CONFIG_X86_64
- vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
-#endif
- vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
- if (kvm_cstate_in_guest(vcpu->kvm)) {
- vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
- vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
- vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
- vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
- }
-
vmx->loaded_vmcs = &vmx->vmcs01;
if (cpu_need_virtualize_apic_accesses(vcpu)) {
@@ -7679,7 +7504,7 @@ int vmx_vcpu_create(struct kvm_vcpu *vcpu)
if (vmx_can_use_ipiv(vcpu))
WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id],
- __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID);
+ __pa(&vmx->vt.pi_desc) | PID_TABLE_ENTRY_VALID);
return 0;
@@ -7698,7 +7523,7 @@ free_vpid:
int vmx_vm_init(struct kvm *kvm)
{
if (!ple_gap)
- kvm->arch.pause_in_guest = true;
+ kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE);
if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
switch (l1tf_mitigation) {
@@ -7724,9 +7549,23 @@ int vmx_vm_init(struct kvm *kvm)
break;
}
}
+
+ if (enable_pml)
+ kvm->arch.cpu_dirty_log_size = PML_LOG_NR_ENTRIES;
return 0;
}
+static inline bool vmx_ignore_guest_pat(struct kvm *kvm)
+{
+ /*
+ * Non-coherent DMA devices need the guest to flush CPU properly.
+ * In that case it is not possible to map all guest RAM as WB, so
+ * always trust guest PAT.
+ */
+ return !kvm_arch_has_noncoherent_dma(kvm) &&
+ kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT);
+}
+
u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
/*
@@ -7736,13 +7575,8 @@ u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
if (is_mmio)
return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
- /*
- * Force WB and ignore guest PAT if the VM does NOT have a non-coherent
- * device attached. Letting the guest control memory types on Intel
- * CPUs may result in unexpected behavior, and so KVM's ABI is to trust
- * the guest to behave only as a last resort.
- */
- if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
+ /* Force WB if ignoring guest PAT */
+ if (vmx_ignore_guest_pat(vcpu->kvm))
return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT);
@@ -7926,18 +7760,6 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
}
}
- if (kvm_cpu_cap_has(X86_FEATURE_XFD))
- vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
- !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD));
-
- if (boot_cpu_has(X86_FEATURE_IBPB))
- vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
- !guest_has_pred_cmd_msr(vcpu));
-
- if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
- vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
- !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
-
set_cr4_guest_host_mask(vmx);
vmx_write_encls_bitmap(vcpu, NULL);
@@ -7953,6 +7775,9 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vmx->msr_ia32_feature_control_valid_bits &=
~FEAT_CTL_SGX_LC_ENABLED;
+ /* Recalc MSR interception to account for feature changes. */
+ vmx_recalc_msr_intercepts(vcpu);
+
/* Refresh #PF interception to account for MAXPHYADDR changes. */
vmx_update_exception_bitmap(vcpu);
}
@@ -8604,6 +8429,8 @@ __init int vmx_hardware_setup(void)
if (enable_ept)
kvm_mmu_set_ept_masks(enable_ept_ad_bits,
cpu_has_vmx_ept_execute_only());
+ else
+ vt_x86_ops.get_mt_mask = NULL;
/*
* Setup shadow_me_value/shadow_me_mask to include MKTME KeyID
@@ -8621,9 +8448,6 @@ __init int vmx_hardware_setup(void)
if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
enable_pml = 0;
- if (!enable_pml)
- vt_x86_ops.cpu_dirty_log_size = 0;
-
if (!cpu_has_vmx_preemption_timer())
enable_preemption_timer = false;
@@ -8681,6 +8505,27 @@ __init int vmx_hardware_setup(void)
kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
+ /*
+ * On Intel CPUs that lack self-snoop feature, letting the guest control
+ * memory types may result in unexpected behavior. So always ignore guest
+ * PAT on those CPUs and map VM as writeback, not allowing userspace to
+ * disable the quirk.
+ *
+ * On certain Intel CPUs (e.g. SPR, ICX), though self-snoop feature is
+ * supported, UC is slow enough to cause issues with some older guests (e.g.
+ * an old version of bochs driver uses ioremap() instead of ioremap_wc() to
+ * map the video RAM, causing wayland desktop to fail to get started
+ * correctly). To avoid breaking those older guests that rely on KVM to force
+ * memory type to WB, provide KVM_X86_QUIRK_IGNORE_GUEST_PAT to preserve the
+ * safer (for performance) default behavior.
+ *
+ * On top of this, non-coherent DMA devices need the guest to flush CPU
+ * caches properly. This also requires honoring guest PAT, and is forced
+ * independent of the quirk in vmx_ignore_guest_pat().
+ */
+ if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
+ kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;
+ kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;
return r;
}
@@ -8694,26 +8539,21 @@ static void vmx_cleanup_l1d_flush(void)
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
}
-static void __vmx_exit(void)
+void vmx_exit(void)
{
allow_smaller_maxphyaddr = false;
vmx_cleanup_l1d_flush();
-}
-static void __exit vmx_exit(void)
-{
- kvm_exit();
- __vmx_exit();
kvm_x86_vendor_exit();
-
}
-module_exit(vmx_exit);
-static int __init vmx_init(void)
+int __init vmx_init(void)
{
int r, cpu;
+ KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_vmx);
+
if (!kvm_is_vmx_supported())
return -EOPNOTSUPP;
@@ -8754,21 +8594,9 @@ static int __init vmx_init(void)
if (!enable_ept)
allow_smaller_maxphyaddr = true;
- /*
- * Common KVM initialization _must_ come last, after this, /dev/kvm is
- * exposed to userspace!
- */
- r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx),
- THIS_MODULE);
- if (r)
- goto err_kvm_init;
-
return 0;
-err_kvm_init:
- __vmx_exit();
err_l1d_flush:
kvm_x86_vendor_exit();
return r;
}
-module_init(vmx_init);
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 951e44dc9d0e..d3389baf3ab3 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -11,13 +11,13 @@
#include "capabilities.h"
#include "../kvm_cache_regs.h"
+#include "pmu_intel.h"
#include "vmcs.h"
#include "vmx_ops.h"
#include "../cpuid.h"
#include "run_flags.h"
#include "../mmu.h"
-
-#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
+#include "common.h"
#ifdef CONFIG_X86_64
#define MAX_NR_USER_RETURN_MSRS 7
@@ -67,47 +67,6 @@ struct pt_desc {
struct pt_ctx guest;
};
-union vmx_exit_reason {
- struct {
- u32 basic : 16;
- u32 reserved16 : 1;
- u32 reserved17 : 1;
- u32 reserved18 : 1;
- u32 reserved19 : 1;
- u32 reserved20 : 1;
- u32 reserved21 : 1;
- u32 reserved22 : 1;
- u32 reserved23 : 1;
- u32 reserved24 : 1;
- u32 reserved25 : 1;
- u32 bus_lock_detected : 1;
- u32 enclave_mode : 1;
- u32 smi_pending_mtf : 1;
- u32 smi_from_vmx_root : 1;
- u32 reserved30 : 1;
- u32 failed_vmentry : 1;
- };
- u32 full;
-};
-
-struct lbr_desc {
- /* Basic info about guest LBR records. */
- struct x86_pmu_lbr records;
-
- /*
- * Emulate LBR feature via passthrough LBR registers when the
- * per-vcpu guest LBR event is scheduled on the current pcpu.
- *
- * The records may be inaccurate if the host reclaims the LBR.
- */
- struct perf_event *event;
-
- /* True if LBRs are marked as not intercepted in the MSR bitmap */
- bool msr_passthrough;
-};
-
-extern struct x86_pmu_lbr vmx_lbr_caps;
-
/*
* The nested_vmx structure is part of vcpu_vmx, and holds information we need
* for correct emulation of VMX (i.e., nested VMX) on this vcpu.
@@ -248,20 +207,10 @@ struct nested_vmx {
struct vcpu_vmx {
struct kvm_vcpu vcpu;
+ struct vcpu_vt vt;
u8 fail;
u8 x2apic_msr_bitmap_mode;
- /*
- * If true, host state has been stored in vmx->loaded_vmcs for
- * the CPU registers that only need to be switched when transitioning
- * to/from the kernel, and the registers have been loaded with guest
- * values. If false, host state is loaded in the CPU registers
- * and vmx->loaded_vmcs->host_state is invalid.
- */
- bool guest_state_loaded;
-
- unsigned long exit_qualification;
- u32 exit_intr_info;
u32 idt_vectoring_info;
ulong rflags;
@@ -274,7 +223,6 @@ struct vcpu_vmx {
struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS];
bool guest_uret_msrs_loaded;
#ifdef CONFIG_X86_64
- u64 msr_host_kernel_gs_base;
u64 msr_guest_kernel_gs_base;
#endif
@@ -313,15 +261,6 @@ struct vcpu_vmx {
} seg[8];
} segment_cache;
int vpid;
- bool emulation_required;
-
- union vmx_exit_reason exit_reason;
-
- /* Posted interrupt descriptor */
- struct pi_desc pi_desc;
-
- /* Used if this vCPU is waiting for PI notification wakeup. */
- struct list_head pi_wakeup_list;
/* Support for a guest hypervisor (nested VMX) */
struct nested_vmx nested;
@@ -355,13 +294,6 @@ struct vcpu_vmx {
struct pt_desc pt_desc;
struct lbr_desc lbr_desc;
- /* Save desired MSR intercept (read: pass-through) state */
-#define MAX_POSSIBLE_PASSTHROUGH_MSRS 16
- struct {
- DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
- DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
- } shadow_msr_intercept;
-
/* ve_info must be page aligned. */
struct vmx_ve_information *ve_info;
};
@@ -376,8 +308,44 @@ struct kvm_vmx {
u64 *pid_table;
};
-void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
- struct loaded_vmcs *buddy);
+static __always_inline struct vcpu_vt *to_vt(struct kvm_vcpu *vcpu)
+{
+ return &(container_of(vcpu, struct vcpu_vmx, vcpu)->vt);
+}
+
+static __always_inline struct kvm_vcpu *vt_to_vcpu(struct vcpu_vt *vt)
+{
+ return &(container_of(vt, struct vcpu_vmx, vt)->vcpu);
+}
+
+static __always_inline union vmx_exit_reason vmx_get_exit_reason(struct kvm_vcpu *vcpu)
+{
+ return to_vt(vcpu)->exit_reason;
+}
+
+static __always_inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vt *vt = to_vt(vcpu);
+
+ if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1) &&
+ !WARN_ON_ONCE(is_td_vcpu(vcpu)))
+ vt->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+ return vt->exit_qualification;
+}
+
+static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vt *vt = to_vt(vcpu);
+
+ if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2) &&
+ !WARN_ON_ONCE(is_td_vcpu(vcpu)))
+ vt->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+ return vt->exit_intr_info;
+}
+
+void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu);
int allocate_vpid(void);
void free_vpid(int vpid);
void vmx_set_constant_host_state(struct vcpu_vmx *vmx);
@@ -418,24 +386,54 @@ bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs,
int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr);
void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu);
-void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type);
-void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type);
+void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set);
+
+static inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
+ u32 msr, int type)
+{
+ vmx_set_intercept_for_msr(vcpu, msr, type, false);
+}
+
+static inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
+ u32 msr, int type)
+{
+ vmx_set_intercept_for_msr(vcpu, msr, type, true);
+}
u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu);
u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu);
gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags);
-static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr,
- int type, bool value)
+void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
+
+u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated);
+bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated);
+
+#define VMX_HOST_OWNED_DEBUGCTL_BITS (DEBUGCTLMSR_FREEZE_IN_SMM)
+
+static inline void vmx_guest_debugctl_write(struct kvm_vcpu *vcpu, u64 val)
+{
+ WARN_ON_ONCE(val & VMX_HOST_OWNED_DEBUGCTL_BITS);
+
+ val |= vcpu->arch.host_debugctl & VMX_HOST_OWNED_DEBUGCTL_BITS;
+ vmcs_write64(GUEST_IA32_DEBUGCTL, val);
+}
+
+static inline u64 vmx_guest_debugctl_read(void)
{
- if (value)
- vmx_enable_intercept_for_msr(vcpu, msr, type);
- else
- vmx_disable_intercept_for_msr(vcpu, msr, type);
+ return vmcs_read64(GUEST_IA32_DEBUGCTL) & ~VMX_HOST_OWNED_DEBUGCTL_BITS;
}
-void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
+static inline void vmx_reload_guest_debugctl(struct kvm_vcpu *vcpu)
+{
+ u64 val = vmcs_read64(GUEST_IA32_DEBUGCTL);
+
+ if (!((val ^ vcpu->arch.host_debugctl) & VMX_HOST_OWNED_DEBUGCTL_BITS))
+ return;
+
+ vmx_guest_debugctl_write(vcpu, val & ~VMX_HOST_OWNED_DEBUGCTL_BITS);
+}
/*
* Note, early Intel manuals have the write-low and read-high bitmap offsets
@@ -662,45 +660,10 @@ static __always_inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
return container_of(vcpu, struct vcpu_vmx, vcpu);
}
-static inline struct lbr_desc *vcpu_to_lbr_desc(struct kvm_vcpu *vcpu)
-{
- return &to_vmx(vcpu)->lbr_desc;
-}
-
-static inline struct x86_pmu_lbr *vcpu_to_lbr_records(struct kvm_vcpu *vcpu)
-{
- return &vcpu_to_lbr_desc(vcpu)->records;
-}
-
-static inline bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu)
-{
- return !!vcpu_to_lbr_records(vcpu)->nr;
-}
-
void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu);
int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu);
void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu);
-static __always_inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1))
- vmx->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-
- return vmx->exit_qualification;
-}
-
-static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2))
- vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-
- return vmx->exit_intr_info;
-}
-
struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags);
void free_vmcs(struct vmcs *vmcs);
int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
@@ -758,4 +721,7 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
vmx->segment_cache.bitmask = 0;
}
+int vmx_init(void);
+void vmx_exit(void);
+
#endif /* __KVM_X86_VMX_H */
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index 430773a5ef8e..2b3424f638db 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -21,7 +21,7 @@ void vmx_vm_destroy(struct kvm *kvm);
int vmx_vcpu_precreate(struct kvm *kvm);
int vmx_vcpu_create(struct kvm_vcpu *vcpu);
int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu);
-fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit);
+fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags);
void vmx_vcpu_free(struct kvm_vcpu *vcpu);
void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
@@ -46,18 +46,18 @@ int vmx_check_intercept(struct kvm_vcpu *vcpu,
bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu);
void vmx_migrate_timers(struct kvm_vcpu *vcpu);
void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
-void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu);
void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr);
int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu);
void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
int trig_mode, int vector);
void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu);
bool vmx_has_emulated_msr(struct kvm *kvm, u32 index);
-void vmx_msr_filter_changed(struct kvm_vcpu *vcpu);
+void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu);
void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
int vmx_get_feature_msr(u32 msr, u64 *data);
int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+#define vmx_complete_emulated_msr kvm_complete_insn_gp
u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg);
void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
@@ -121,4 +121,39 @@ void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu);
#endif
void vmx_setup_mce(struct kvm_vcpu *vcpu);
+#ifdef CONFIG_KVM_INTEL_TDX
+void tdx_disable_virtualization_cpu(void);
+int tdx_vm_init(struct kvm *kvm);
+void tdx_mmu_release_hkid(struct kvm *kvm);
+void tdx_vm_destroy(struct kvm *kvm);
+int tdx_vm_ioctl(struct kvm *kvm, void __user *argp);
+
+int tdx_vcpu_create(struct kvm_vcpu *vcpu);
+void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
+void tdx_vcpu_free(struct kvm_vcpu *vcpu);
+void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu);
+fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags);
+void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
+void tdx_vcpu_put(struct kvm_vcpu *vcpu);
+int tdx_handle_exit(struct kvm_vcpu *vcpu,
+ enum exit_fastpath_completion fastpath);
+
+void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector);
+void tdx_inject_nmi(struct kvm_vcpu *vcpu);
+void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code);
+bool tdx_has_emulated_msr(u32 index);
+int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
+int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
+
+int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
+
+void tdx_flush_tlb_current(struct kvm_vcpu *vcpu);
+void tdx_flush_tlb_all(struct kvm_vcpu *vcpu);
+void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
+int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn);
+#endif
+
#endif /* __KVM_X86_VMX_X86_OPS_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5bdb5b854924..a1c49bc681c4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -90,7 +90,6 @@
#include "trace.h"
#define MAX_IO_MSRS 256
-#define KVM_MAX_MCE_BANKS 32
/*
* Note, kvm_caps fields should *never* have default values, all fields must be
@@ -227,6 +226,12 @@ EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
bool __read_mostly enable_apicv = true;
EXPORT_SYMBOL_GPL(enable_apicv);
+bool __read_mostly enable_ipiv = true;
+EXPORT_SYMBOL_GPL(enable_ipiv);
+
+bool __read_mostly enable_device_posted_irqs = true;
+EXPORT_SYMBOL_GPL(enable_device_posted_irqs);
+
const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
KVM_GENERIC_VM_STATS(),
STATS_DESC_COUNTER(VM, mmu_shadow_zapped),
@@ -636,6 +641,15 @@ static void kvm_user_return_msr_cpu_online(void)
}
}
+static void kvm_user_return_register_notifier(struct kvm_user_return_msrs *msrs)
+{
+ if (!msrs->registered) {
+ msrs->urn.on_user_return = kvm_on_user_return;
+ user_return_notifier_register(&msrs->urn);
+ msrs->registered = true;
+ }
+}
+
int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
{
struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
@@ -649,15 +663,20 @@ int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
return 1;
msrs->values[slot].curr = value;
- if (!msrs->registered) {
- msrs->urn.on_user_return = kvm_on_user_return;
- user_return_notifier_register(&msrs->urn);
- msrs->registered = true;
- }
+ kvm_user_return_register_notifier(msrs);
return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_user_return_msr);
+void kvm_user_return_msr_update_cache(unsigned int slot, u64 value)
+{
+ struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
+
+ msrs->values[slot].curr = value;
+ kvm_user_return_register_notifier(msrs);
+}
+EXPORT_SYMBOL_GPL(kvm_user_return_msr_update_cache);
+
static void drop_user_return_notifiers(void)
{
struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
@@ -3242,9 +3261,11 @@ int kvm_guest_time_update(struct kvm_vcpu *v)
/* With all the info we got, fill in the values */
- if (kvm_caps.has_tsc_control)
+ if (kvm_caps.has_tsc_control) {
tgt_tsc_khz = kvm_scale_tsc(tgt_tsc_khz,
v->arch.l1_tsc_scaling_ratio);
+ tgt_tsc_khz = tgt_tsc_khz ? : 1;
+ }
if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
@@ -4561,6 +4582,9 @@ static u64 kvm_get_allowed_disable_exits(void)
{
u64 r = KVM_X86_DISABLE_EXITS_PAUSE;
+ if (boot_cpu_has(X86_FEATURE_APERFMPERF))
+ r |= KVM_X86_DISABLE_EXITS_APERFMPERF;
+
if (!mitigate_smt_rsb) {
r |= KVM_X86_DISABLE_EXITS_HLT |
KVM_X86_DISABLE_EXITS_CSTATE;
@@ -4616,17 +4640,20 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_EXT_CPUID:
case KVM_CAP_EXT_EMUL_CPUID:
case KVM_CAP_CLOCKSOURCE:
+#ifdef CONFIG_KVM_IOAPIC
case KVM_CAP_PIT:
+ case KVM_CAP_PIT2:
+ case KVM_CAP_PIT_STATE2:
+ case KVM_CAP_REINJECT_CONTROL:
+#endif
case KVM_CAP_NOP_IO_DELAY:
case KVM_CAP_MP_STATE:
case KVM_CAP_SYNC_MMU:
case KVM_CAP_USER_NMI:
- case KVM_CAP_REINJECT_CONTROL:
case KVM_CAP_IRQ_INJECT_STATUS:
case KVM_CAP_IOEVENTFD:
case KVM_CAP_IOEVENTFD_NO_LENGTH:
- case KVM_CAP_PIT2:
- case KVM_CAP_PIT_STATE2:
+
case KVM_CAP_SET_IDENTITY_MAP_ADDR:
case KVM_CAP_VCPU_EVENTS:
#ifdef CONFIG_KVM_HYPERV
@@ -4739,6 +4766,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
break;
case KVM_CAP_MAX_VCPUS:
r = KVM_MAX_VCPUS;
+ if (kvm)
+ r = kvm->max_vcpus;
break;
case KVM_CAP_MAX_VCPU_ID:
r = KVM_MAX_VCPU_IDS;
@@ -4794,7 +4823,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = enable_pmu ? KVM_CAP_PMU_VALID_MASK : 0;
break;
case KVM_CAP_DISABLE_QUIRKS2:
- r = KVM_X86_VALID_QUIRKS;
+ r = kvm_caps.supported_quirks;
break;
case KVM_CAP_X86_NOTIFY_VMEXIT:
r = kvm_caps.has_notify_vmexit;
@@ -4965,16 +4994,13 @@ out:
return r;
}
-static void wbinvd_ipi(void *garbage)
-{
- wbinvd();
-}
-
static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
{
return kvm_arch_has_noncoherent_dma(vcpu->kvm);
}
+static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu);
+
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -4991,12 +5017,24 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (kvm_x86_call(has_wbinvd_exit)())
cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
- smp_call_function_single(vcpu->cpu,
- wbinvd_ipi, NULL, 1);
+ wbinvd_on_cpu(vcpu->cpu);
}
kvm_x86_call(vcpu_load)(vcpu, cpu);
+ if (vcpu != per_cpu(last_vcpu, cpu)) {
+ /*
+ * Flush the branch predictor when switching vCPUs on the same
+ * physical CPU, as each vCPU needs its own branch prediction
+ * domain. No IBPB is needed when switching between L1 and L2
+ * on the same vCPU unless IBRS is advertised to the vCPU; that
+ * is handled on the nested VM-Exit path.
+ */
+ if (static_branch_likely(&switch_vcpu_ibpb))
+ indirect_branch_prediction_barrier();
+ per_cpu(last_vcpu, cpu) = vcpu;
+ }
+
/* Save host pkru register if supported */
vcpu->arch.host_pkru = read_pkru();
@@ -5117,6 +5155,9 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
+ if (vcpu->arch.apic->guest_apic_protected)
+ return -EINVAL;
+
kvm_x86_call(sync_pir_to_irr)(vcpu);
return kvm_apic_get_state(vcpu, s);
@@ -5127,6 +5168,9 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
{
int r;
+ if (vcpu->arch.apic->guest_apic_protected)
+ return -EINVAL;
+
r = kvm_apic_set_state(vcpu, s);
if (r)
return r;
@@ -5448,12 +5492,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
(events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
return -EINVAL;
- /* INITs are latched while in SMM */
- if (events->flags & KVM_VCPUEVENT_VALID_SMM &&
- (events->smi.smm || events->smi.pending) &&
- vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
- return -EINVAL;
-
process_nmi(vcpu);
/*
@@ -6147,6 +6185,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
u32 user_tsc_khz;
r = -EINVAL;
+
+ if (vcpu->arch.guest_tsc_protected)
+ goto out;
+
user_tsc_khz = (u32)arg;
if (kvm_caps.has_tsc_control &&
@@ -6304,6 +6346,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
case KVM_SET_DEVICE_ATTR:
r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp);
break;
+ case KVM_MEMORY_ENCRYPT_OP:
+ r = -ENOTTY;
+ if (!kvm_x86_ops.vcpu_mem_enc_ioctl)
+ goto out;
+ r = kvm_x86_ops.vcpu_mem_enc_ioctl(vcpu, argp);
+ break;
default:
r = -EINVAL;
}
@@ -6350,135 +6398,6 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
return 0;
}
-static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
-{
- struct kvm_pic *pic = kvm->arch.vpic;
- int r;
-
- r = 0;
- switch (chip->chip_id) {
- case KVM_IRQCHIP_PIC_MASTER:
- memcpy(&chip->chip.pic, &pic->pics[0],
- sizeof(struct kvm_pic_state));
- break;
- case KVM_IRQCHIP_PIC_SLAVE:
- memcpy(&chip->chip.pic, &pic->pics[1],
- sizeof(struct kvm_pic_state));
- break;
- case KVM_IRQCHIP_IOAPIC:
- kvm_get_ioapic(kvm, &chip->chip.ioapic);
- break;
- default:
- r = -EINVAL;
- break;
- }
- return r;
-}
-
-static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
-{
- struct kvm_pic *pic = kvm->arch.vpic;
- int r;
-
- r = 0;
- switch (chip->chip_id) {
- case KVM_IRQCHIP_PIC_MASTER:
- spin_lock(&pic->lock);
- memcpy(&pic->pics[0], &chip->chip.pic,
- sizeof(struct kvm_pic_state));
- spin_unlock(&pic->lock);
- break;
- case KVM_IRQCHIP_PIC_SLAVE:
- spin_lock(&pic->lock);
- memcpy(&pic->pics[1], &chip->chip.pic,
- sizeof(struct kvm_pic_state));
- spin_unlock(&pic->lock);
- break;
- case KVM_IRQCHIP_IOAPIC:
- kvm_set_ioapic(kvm, &chip->chip.ioapic);
- break;
- default:
- r = -EINVAL;
- break;
- }
- kvm_pic_update_irq(pic);
- return r;
-}
-
-static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
-{
- struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
-
- BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
-
- mutex_lock(&kps->lock);
- memcpy(ps, &kps->channels, sizeof(*ps));
- mutex_unlock(&kps->lock);
- return 0;
-}
-
-static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
-{
- int i;
- struct kvm_pit *pit = kvm->arch.vpit;
-
- mutex_lock(&pit->pit_state.lock);
- memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
- for (i = 0; i < 3; i++)
- kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
- mutex_unlock(&pit->pit_state.lock);
- return 0;
-}
-
-static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
-{
- mutex_lock(&kvm->arch.vpit->pit_state.lock);
- memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
- sizeof(ps->channels));
- ps->flags = kvm->arch.vpit->pit_state.flags;
- mutex_unlock(&kvm->arch.vpit->pit_state.lock);
- memset(&ps->reserved, 0, sizeof(ps->reserved));
- return 0;
-}
-
-static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
-{
- int start = 0;
- int i;
- u32 prev_legacy, cur_legacy;
- struct kvm_pit *pit = kvm->arch.vpit;
-
- mutex_lock(&pit->pit_state.lock);
- prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
- cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
- if (!prev_legacy && cur_legacy)
- start = 1;
- memcpy(&pit->pit_state.channels, &ps->channels,
- sizeof(pit->pit_state.channels));
- pit->pit_state.flags = ps->flags;
- for (i = 0; i < 3; i++)
- kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
- start && i == 0);
- mutex_unlock(&pit->pit_state.lock);
- return 0;
-}
-
-static int kvm_vm_ioctl_reinject(struct kvm *kvm,
- struct kvm_reinject_control *control)
-{
- struct kvm_pit *pit = kvm->arch.vpit;
-
- /* pit->pit_state.lock was overloaded to prevent userspace from getting
- * an inconsistent state after running multiple KVM_REINJECT_CONTROL
- * ioctls in parallel. Use a separate lock if that ioctl isn't rare.
- */
- mutex_lock(&pit->pit_state.lock);
- kvm_pit_set_reinject(pit, control->pit_reinject);
- mutex_unlock(&pit->pit_state.lock);
-
- return 0;
-}
-
void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
@@ -6491,25 +6410,13 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
struct kvm_vcpu *vcpu;
unsigned long i;
- if (!kvm_x86_ops.cpu_dirty_log_size)
+ if (!kvm->arch.cpu_dirty_log_size)
return;
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_vcpu_kick(vcpu);
}
-int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
- bool line_status)
-{
- if (!irqchip_in_kernel(kvm))
- return -ENXIO;
-
- irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
- irq_event->irq, irq_event->level,
- line_status);
- return 0;
-}
-
int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
struct kvm_enable_cap *cap)
{
@@ -6521,11 +6428,11 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
switch (cap->cap) {
case KVM_CAP_DISABLE_QUIRKS2:
r = -EINVAL;
- if (cap->args[0] & ~KVM_X86_VALID_QUIRKS)
+ if (cap->args[0] & ~kvm_caps.supported_quirks)
break;
fallthrough;
case KVM_CAP_DISABLE_QUIRKS:
- kvm->arch.disabled_quirks = cap->args[0];
+ kvm->arch.disabled_quirks |= cap->args[0] & kvm_caps.supported_quirks;
r = 0;
break;
case KVM_CAP_SPLIT_IRQCHIP: {
@@ -6574,17 +6481,11 @@ split_irqchip_unlock:
if (!mitigate_smt_rsb && boot_cpu_has_bug(X86_BUG_SMT_RSB) &&
cpu_smt_possible() &&
- (cap->args[0] & ~KVM_X86_DISABLE_EXITS_PAUSE))
+ (cap->args[0] & ~(KVM_X86_DISABLE_EXITS_PAUSE |
+ KVM_X86_DISABLE_EXITS_APERFMPERF)))
pr_warn_once(SMT_RSB_MSG);
- if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
- kvm->arch.pause_in_guest = true;
- if (cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT)
- kvm->arch.mwait_in_guest = true;
- if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT)
- kvm->arch.hlt_in_guest = true;
- if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE)
- kvm->arch.cstate_in_guest = true;
+ kvm_disable_exits(kvm, cap->args[0]);
r = 0;
disable_exits_unlock:
mutex_unlock(&kvm->lock);
@@ -7021,9 +6922,11 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
struct kvm *kvm = filp->private_data;
void __user *argp = (void __user *)arg;
int r = -ENOTTY;
+
+#ifdef CONFIG_KVM_IOAPIC
/*
* This union makes it completely explicit to gcc-3.x
- * that these two variables' stack usage should be
+ * that these three variables' stack usage should be
* combined, not added together.
*/
union {
@@ -7031,6 +6934,7 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
struct kvm_pit_state2 ps2;
struct kvm_pit_config pit_config;
} u;
+#endif
switch (ioctl) {
case KVM_SET_TSS_ADDR:
@@ -7054,6 +6958,7 @@ set_identity_unlock:
case KVM_SET_NR_MMU_PAGES:
r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
break;
+#ifdef CONFIG_KVM_IOAPIC
case KVM_CREATE_IRQCHIP: {
mutex_lock(&kvm->lock);
@@ -7075,7 +6980,7 @@ set_identity_unlock:
goto create_irqchip_unlock;
}
- r = kvm_setup_default_irq_routing(kvm);
+ r = kvm_setup_default_ioapic_and_pic_routing(kvm);
if (r) {
kvm_ioapic_destroy(kvm);
kvm_pic_destroy(kvm);
@@ -7123,7 +7028,7 @@ set_identity_unlock:
}
r = -ENXIO;
- if (!irqchip_kernel(kvm))
+ if (!irqchip_full(kvm))
goto get_irqchip_out;
r = kvm_vm_ioctl_get_irqchip(kvm, chip);
if (r)
@@ -7147,7 +7052,7 @@ set_identity_unlock:
}
r = -ENXIO;
- if (!irqchip_kernel(kvm))
+ if (!irqchip_full(kvm))
goto set_irqchip_out;
r = kvm_vm_ioctl_set_irqchip(kvm, chip);
set_irqchip_out:
@@ -7220,6 +7125,7 @@ set_pit2_out:
r = kvm_vm_ioctl_reinject(kvm, &control);
break;
}
+#endif
case KVM_SET_BOOT_CPU_ID:
r = 0;
mutex_lock(&kvm->lock);
@@ -7290,23 +7196,25 @@ set_pit2_out:
if (user_tsc_khz == 0)
user_tsc_khz = tsc_khz;
- WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz);
- r = 0;
-
+ mutex_lock(&kvm->lock);
+ if (!kvm->created_vcpus) {
+ WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz);
+ r = 0;
+ }
+ mutex_unlock(&kvm->lock);
goto out;
}
case KVM_GET_TSC_KHZ: {
r = READ_ONCE(kvm->arch.default_tsc_khz);
goto out;
}
- case KVM_MEMORY_ENCRYPT_OP: {
+ case KVM_MEMORY_ENCRYPT_OP:
r = -ENOTTY;
if (!kvm_x86_ops.mem_enc_ioctl)
goto out;
r = kvm_x86_call(mem_enc_ioctl)(kvm, argp);
break;
- }
case KVM_MEMORY_ENCRYPT_REG_REGION: {
struct kvm_enc_region region;
@@ -8000,7 +7908,7 @@ static int emulator_read_write(struct x86_emulate_ctxt *ctxt,
return rc;
if (!vcpu->mmio_nr_fragments)
- return rc;
+ return X86EMUL_CONTINUE;
gpa = vcpu->mmio_fragments[0].gpa;
@@ -8245,8 +8153,7 @@ static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
int cpu = get_cpu();
cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
- on_each_cpu_mask(vcpu->arch.wbinvd_dirty_mask,
- wbinvd_ipi, NULL, 1);
+ wbinvd_on_cpus_mask(vcpu->arch.wbinvd_dirty_mask);
put_cpu();
cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
} else
@@ -9338,7 +9245,7 @@ static int complete_fast_pio_out(struct kvm_vcpu *vcpu)
{
vcpu->arch.pio.count = 0;
- if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip)))
+ if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip)))
return 1;
return kvm_skip_emulated_instruction(vcpu);
@@ -9363,7 +9270,7 @@ static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
complete_fast_pio_out_port_0x7e;
kvm_skip_emulated_instruction(vcpu);
} else {
- vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
+ vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu);
vcpu->arch.complete_userspace_io = complete_fast_pio_out;
}
return 0;
@@ -9376,7 +9283,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
/* We should only ever be called with arch.pio.count equal to 1 */
BUG_ON(vcpu->arch.pio.count != 1);
- if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) {
+ if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))) {
vcpu->arch.pio.count = 0;
return 1;
}
@@ -9405,7 +9312,7 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
return ret;
}
- vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
+ vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu);
vcpu->arch.complete_userspace_io = complete_fast_pio_in;
return 0;
@@ -9771,6 +9678,8 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
kvm_host.xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
kvm_caps.supported_xcr0 = kvm_host.xcr0 & KVM_SUPPORTED_XCR0;
}
+ kvm_caps.supported_quirks = KVM_X86_VALID_QUIRKS;
+ kvm_caps.inapplicable_quirks = KVM_X86_CONDITIONAL_QUIRKS;
rdmsrq_safe(MSR_EFER, &kvm_host.efer);
@@ -9786,6 +9695,9 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
if (r != 0)
goto out_mmu_exit;
+ enable_device_posted_irqs &= enable_apicv &&
+ irq_remapping_cap(IRQ_POSTING_CAP);
+
kvm_ops_update(ops);
for_each_online_cpu(cpu) {
@@ -9815,6 +9727,10 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled)
kvm_caps.supported_vm_types |= BIT(KVM_X86_SW_PROTECTED_VM);
+ /* KVM always ignores guest PAT for shadow paging. */
+ if (!tdp_enabled)
+ kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;
+
if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
kvm_caps.supported_xss = 0;
@@ -10023,13 +9939,16 @@ static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
return kvm_skip_emulated_instruction(vcpu);
}
-int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
- unsigned long a0, unsigned long a1,
- unsigned long a2, unsigned long a3,
- int op_64_bit, int cpl,
+int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, int cpl,
int (*complete_hypercall)(struct kvm_vcpu *))
{
unsigned long ret;
+ unsigned long nr = kvm_rax_read(vcpu);
+ unsigned long a0 = kvm_rbx_read(vcpu);
+ unsigned long a1 = kvm_rcx_read(vcpu);
+ unsigned long a2 = kvm_rdx_read(vcpu);
+ unsigned long a3 = kvm_rsi_read(vcpu);
+ int op_64_bit = is_64_bit_hypercall(vcpu);
++vcpu->stat.hypercalls;
@@ -10132,9 +10051,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
if (kvm_hv_hypercall_enabled(vcpu))
return kvm_hv_hypercall(vcpu);
- return __kvm_emulate_hypercall(vcpu, rax, rbx, rcx, rdx, rsi,
- is_64_bit_hypercall(vcpu),
- kvm_x86_call(get_cpl)(vcpu),
+ return __kvm_emulate_hypercall(vcpu, kvm_x86_call(get_cpl)(vcpu),
complete_hypercall_exit);
}
EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
@@ -10664,13 +10581,16 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
return;
bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256);
+ vcpu->arch.highest_stale_pending_ioapic_eoi = -1;
kvm_x86_call(sync_pir_to_irr)(vcpu);
if (irqchip_split(vcpu->kvm))
kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
+#ifdef CONFIG_KVM_IOAPIC
else if (ioapic_in_kernel(vcpu->kvm))
kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
+#endif
if (is_guest_mode(vcpu))
vcpu->arch.load_eoi_exitmap_pending = true;
@@ -10724,6 +10644,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
dm_request_for_irq_injection(vcpu) &&
kvm_cpu_accept_dm_intr(vcpu);
fastpath_t exit_fastpath;
+ u64 run_flags, debug_ctl;
bool req_immediate_exit = false;
@@ -10871,8 +10792,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_vcpu_update_apicv(vcpu);
if (kvm_check_request(KVM_REQ_APF_READY, vcpu))
kvm_check_async_pf_completion(vcpu);
+
+ /*
+ * Recalc MSR intercepts as userspace may want to intercept
+ * accesses to MSRs that KVM would otherwise pass through to
+ * the guest.
+ */
if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu))
- kvm_x86_call(msr_filter_changed)(vcpu);
+ kvm_x86_call(recalc_msr_intercepts)(vcpu);
if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu))
kvm_x86_call(update_cpu_dirty_logging)(vcpu);
@@ -10968,8 +10895,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
goto cancel_injection;
}
- if (req_immediate_exit)
+ run_flags = 0;
+ if (req_immediate_exit) {
+ run_flags |= KVM_RUN_FORCE_IMMEDIATE_EXIT;
kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
fpregs_assert_state_consistent();
if (test_thread_flag(TIF_NEED_FPU_LOAD))
@@ -10978,20 +10908,31 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (vcpu->arch.guest_fpu.xfd_err)
wrmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
- if (unlikely(vcpu->arch.switch_db_regs)) {
- set_debugreg(0, 7);
+ if (unlikely(vcpu->arch.switch_db_regs &&
+ !(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH))) {
+ set_debugreg(DR7_FIXED_1, 7);
set_debugreg(vcpu->arch.eff_db[0], 0);
set_debugreg(vcpu->arch.eff_db[1], 1);
set_debugreg(vcpu->arch.eff_db[2], 2);
set_debugreg(vcpu->arch.eff_db[3], 3);
/* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
- kvm_x86_call(set_dr6)(vcpu, vcpu->arch.dr6);
+ run_flags |= KVM_RUN_LOAD_GUEST_DR6;
} else if (unlikely(hw_breakpoint_active())) {
- set_debugreg(0, 7);
+ set_debugreg(DR7_FIXED_1, 7);
}
- vcpu->arch.host_debugctl = get_debugctlmsr();
+ /*
+ * Refresh the host DEBUGCTL snapshot after disabling IRQs, as DEBUGCTL
+ * can be modified in IRQ context, e.g. via SMP function calls. Inform
+ * vendor code if any host-owned bits were changed, e.g. so that the
+ * value loaded into hardware while running the guest can be updated.
+ */
+ debug_ctl = get_debugctlmsr();
+ if ((debug_ctl ^ vcpu->arch.host_debugctl) & kvm_x86_ops.HOST_OWNED_DEBUGCTL &&
+ !vcpu->arch.guest_state_protected)
+ run_flags |= KVM_RUN_LOAD_DEBUGCTL;
+ vcpu->arch.host_debugctl = debug_ctl;
guest_timing_enter_irqoff();
@@ -11005,8 +10946,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
(kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));
- exit_fastpath = kvm_x86_call(vcpu_run)(vcpu,
- req_immediate_exit);
+ exit_fastpath = kvm_x86_call(vcpu_run)(vcpu, run_flags);
if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
break;
@@ -11018,6 +10958,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
break;
}
+ run_flags = 0;
+
/* Note, VM-Exits that go down the "slow" path are accounted below. */
++vcpu->stat.exits;
}
@@ -11030,6 +10972,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
*/
if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
+ WARN_ON(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH);
kvm_x86_call(sync_dirty_debug_regs)(vcpu);
kvm_update_dr0123(vcpu);
kvm_update_dr7(vcpu);
@@ -11134,7 +11077,7 @@ static bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
!vcpu->arch.apf.halted);
}
-static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
+bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
{
if (!list_empty_careful(&vcpu->async_pf.done))
return true;
@@ -11143,9 +11086,6 @@ static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
kvm_apic_init_sipi_allowed(vcpu))
return true;
- if (vcpu->arch.pv.pv_unhalted)
- return true;
-
if (kvm_is_exception_pending(vcpu))
return true;
@@ -11183,10 +11123,12 @@ static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
return false;
}
+EXPORT_SYMBOL_GPL(kvm_vcpu_has_events);
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
- return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
+ return kvm_vcpu_running(vcpu) || vcpu->arch.pv.pv_unhalted ||
+ kvm_vcpu_has_events(vcpu);
}
/* Called within kvm->srcu read side. */
@@ -11320,7 +11262,7 @@ static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason)
*/
++vcpu->stat.halt_exits;
if (lapic_in_kernel(vcpu)) {
- if (kvm_vcpu_has_events(vcpu))
+ if (kvm_vcpu_has_events(vcpu) || vcpu->arch.pv.pv_unhalted)
state = KVM_MP_STATE_RUNNABLE;
kvm_set_mp_state(vcpu, state);
return 1;
@@ -11491,6 +11433,28 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
trace_kvm_fpu(0);
}
+static int kvm_x86_vcpu_pre_run(struct kvm_vcpu *vcpu)
+{
+ /*
+ * SIPI_RECEIVED is obsolete; KVM leaves the vCPU in Wait-For-SIPI and
+ * tracks the pending SIPI separately. SIPI_RECEIVED is still accepted
+ * by KVM_SET_VCPU_EVENTS for backwards compatibility, but should be
+ * converted to INIT_RECEIVED.
+ */
+ if (WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED))
+ return -EINVAL;
+
+ /*
+ * Disallow running the vCPU if userspace forced it into an impossible
+ * MP_STATE, e.g. if the vCPU is in WFS but SIPI is blocked.
+ */
+ if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED &&
+ !kvm_apic_init_sipi_allowed(vcpu))
+ return -EINVAL;
+
+ return kvm_x86_call(vcpu_pre_run)(vcpu);
+}
+
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
struct kvm_queued_exception *ex = &vcpu->arch.exception;
@@ -11593,7 +11557,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
goto out;
}
- r = kvm_x86_call(vcpu_pre_run)(vcpu);
+ r = kvm_x86_vcpu_pre_run(vcpu);
if (r <= 0)
goto out;
@@ -11837,21 +11801,16 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
}
/*
- * Pending INITs are reported using KVM_SET_VCPU_EVENTS, disallow
- * forcing the guest into INIT/SIPI if those events are supposed to be
- * blocked. KVM prioritizes SMI over INIT, so reject INIT/SIPI state
- * if an SMI is pending as well.
+ * SIPI_RECEIVED is obsolete and no longer used internally; KVM instead
+ * leaves the vCPU in INIT_RECIEVED (Wait-For-SIPI) and pends the SIPI.
+ * Translate SIPI_RECEIVED as appropriate for backwards compatibility.
*/
- if ((!kvm_apic_init_sipi_allowed(vcpu) || vcpu->arch.smi_pending) &&
- (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
- mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
- goto out;
-
if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
- kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED);
+ mp_state->mp_state = KVM_MP_STATE_INIT_RECEIVED;
set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
- } else
- kvm_set_mp_state(vcpu, mp_state->mp_state);
+ }
+
+ kvm_set_mp_state(vcpu, mp_state->mp_state);
kvm_make_request(KVM_REQ_EVENT, vcpu);
ret = 0;
@@ -12388,13 +12347,16 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
- int idx;
+ int idx, cpu;
kvm_clear_async_pf_completion_queue(vcpu);
kvm_mmu_unload(vcpu);
kvmclock_reset(vcpu);
+ for_each_possible_cpu(cpu)
+ cmpxchg(per_cpu_ptr(&last_vcpu, cpu), vcpu, NULL);
+
kvm_x86_call(vcpu_free)(vcpu);
kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
@@ -12694,6 +12656,7 @@ bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
{
return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;
}
+EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp);
bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
{
@@ -12723,26 +12686,22 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
/* Decided by the vendor code for other VM types. */
kvm->arch.pre_fault_allowed =
type == KVM_X86_DEFAULT_VM || type == KVM_X86_SW_PROTECTED_VM;
+ kvm->arch.disabled_quirks = kvm_caps.inapplicable_quirks & kvm_caps.supported_quirks;
ret = kvm_page_track_init(kvm);
if (ret)
goto out;
- kvm_mmu_init_vm(kvm);
+ ret = kvm_mmu_init_vm(kvm);
+ if (ret)
+ goto out_cleanup_page_track;
ret = kvm_x86_call(vm_init)(kvm);
if (ret)
goto out_uninit_mmu;
- INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
atomic_set(&kvm->arch.noncoherent_dma_count, 0);
- /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
- set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
- /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
- set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
- &kvm->arch.irq_sources_bitmap);
-
raw_spin_lock_init(&kvm->arch.tsc_write_lock);
mutex_init(&kvm->arch.apic_map_lock);
seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock);
@@ -12781,6 +12740,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
out_uninit_mmu:
kvm_mmu_uninit_vm(kvm);
+out_cleanup_page_track:
kvm_page_track_cleanup(kvm);
out:
return ret;
@@ -12873,9 +12833,12 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm)
cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
+#ifdef CONFIG_KVM_IOAPIC
kvm_free_pit(kvm);
+#endif
kvm_mmu_pre_destroy_vm(kvm);
+ static_call_cond(kvm_x86_vm_pre_destroy)(kvm);
}
void kvm_arch_destroy_vm(struct kvm *kvm)
@@ -12896,8 +12859,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
}
kvm_destroy_vcpus(kvm);
kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
+#ifdef CONFIG_KVM_IOAPIC
kvm_pic_destroy(kvm);
kvm_ioapic_destroy(kvm);
+#endif
kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
kvm_mmu_uninit_vm(kvm);
@@ -13073,7 +13038,7 @@ static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable)
{
int nr_slots;
- if (!kvm_x86_ops.cpu_dirty_log_size)
+ if (!kvm->arch.cpu_dirty_log_size)
return;
nr_slots = atomic_read(&kvm->nr_memslots_dirty_logging);
@@ -13145,7 +13110,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
if (READ_ONCE(eager_page_split))
kvm_mmu_slot_try_split_huge_pages(kvm, new, PG_LEVEL_4K);
- if (kvm_x86_ops.cpu_dirty_log_size) {
+ if (kvm->arch.cpu_dirty_log_size) {
kvm_mmu_slot_leaf_clear_dirty(kvm, new);
kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M);
} else {
@@ -13507,25 +13472,6 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
return kvm_lapic_enabled(vcpu) && apf_pageready_slot_free(vcpu);
}
-void kvm_arch_start_assignment(struct kvm *kvm)
-{
- if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1)
- kvm_x86_call(pi_start_assignment)(kvm);
-}
-EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
-
-void kvm_arch_end_assignment(struct kvm *kvm)
-{
- atomic_dec(&kvm->arch.assigned_device_count);
-}
-EXPORT_SYMBOL_GPL(kvm_arch_end_assignment);
-
-bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm)
-{
- return raw_atomic_read(&kvm->arch.assigned_device_count);
-}
-EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
-
static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm)
{
/*
@@ -13534,8 +13480,10 @@ static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm)
* due to toggling the "ignore PAT" bit. Zap all SPTEs when the first
* (or last) non-coherent device is (un)registered to so that new SPTEs
* with the correct "ignore guest PAT" setting are created.
+ *
+ * If KVM always honors guest PAT, however, there is nothing to do.
*/
- if (kvm_mmu_may_ignore_guest_pat())
+ if (kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT))
kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL));
}
@@ -13559,77 +13507,6 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
}
EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
-int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
- struct irq_bypass_producer *prod)
-{
- struct kvm_kernel_irqfd *irqfd =
- container_of(cons, struct kvm_kernel_irqfd, consumer);
- struct kvm *kvm = irqfd->kvm;
- int ret;
-
- kvm_arch_start_assignment(irqfd->kvm);
-
- spin_lock_irq(&kvm->irqfds.lock);
- irqfd->producer = prod;
-
- ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
- prod->irq, irqfd->gsi, 1);
- if (ret)
- kvm_arch_end_assignment(irqfd->kvm);
-
- spin_unlock_irq(&kvm->irqfds.lock);
-
-
- return ret;
-}
-
-void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
- struct irq_bypass_producer *prod)
-{
- int ret;
- struct kvm_kernel_irqfd *irqfd =
- container_of(cons, struct kvm_kernel_irqfd, consumer);
- struct kvm *kvm = irqfd->kvm;
-
- WARN_ON(irqfd->producer != prod);
-
- /*
- * When producer of consumer is unregistered, we change back to
- * remapped mode, so we can re-use the current implementation
- * when the irq is masked/disabled or the consumer side (KVM
- * int this case doesn't want to receive the interrupts.
- */
- spin_lock_irq(&kvm->irqfds.lock);
- irqfd->producer = NULL;
-
- ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
- prod->irq, irqfd->gsi, 0);
- if (ret)
- printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
- " fails: %d\n", irqfd->consumer.token, ret);
-
- spin_unlock_irq(&kvm->irqfds.lock);
-
-
- kvm_arch_end_assignment(irqfd->kvm);
-}
-
-int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set)
-{
- return kvm_x86_call(pi_update_irte)(kvm, host_irq, guest_irq, set);
-}
-
-bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
- struct kvm_kernel_irq_routing_entry *new)
-{
- if (old->type != KVM_IRQ_ROUTING_MSI ||
- new->type != KVM_IRQ_ROUTING_MSI)
- return true;
-
- return !!memcmp(&old->msi, &new->msi, sizeof(new->msi));
-}
-
bool kvm_vector_hashing_enabled(void)
{
return vector_hashing;
@@ -14012,6 +13889,7 @@ EXPORT_SYMBOL_GPL(kvm_sev_es_string_io);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_entry);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_mmio);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
@@ -14028,7 +13906,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 9dc32a409076..bcfd9b719ada 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -10,6 +10,8 @@
#include "kvm_emulate.h"
#include "cpuid.h"
+#define KVM_MAX_MCE_BANKS 32
+
struct kvm_caps {
/* control of guest tsc rate supported? */
bool has_tsc_control;
@@ -32,6 +34,9 @@ struct kvm_caps {
u64 supported_xcr0;
u64 supported_xss;
u64 supported_perf_cap;
+
+ u64 supported_quirks;
+ u64 inapplicable_quirks;
};
struct kvm_host_values {
@@ -50,6 +55,28 @@ struct kvm_host_values {
void kvm_spurious_fault(void);
+#define SIZE_OF_MEMSLOTS_HASHTABLE \
+ (sizeof(((struct kvm_memslots *)0)->id_hash) * 2 * KVM_MAX_NR_ADDRESS_SPACES)
+
+/* Sanity check the size of the memslot hash tables. */
+static_assert(SIZE_OF_MEMSLOTS_HASHTABLE ==
+ (1024 * (1 + IS_ENABLED(CONFIG_X86_64)) * (1 + IS_ENABLED(CONFIG_KVM_SMM))));
+
+/*
+ * Assert that "struct kvm_{svm,vmx,tdx}" is an order-0 or order-1 allocation.
+ * Spilling over to an order-2 allocation isn't fundamentally problematic, but
+ * isn't expected to happen in the foreseeable future (O(years)). Assert that
+ * the size is an order-0 allocation when ignoring the memslot hash tables, to
+ * help detect and debug unexpected size increases.
+ */
+#define KVM_SANITY_CHECK_VM_STRUCT_SIZE(x) \
+do { \
+ BUILD_BUG_ON(get_order(sizeof(struct x) - SIZE_OF_MEMSLOTS_HASHTABLE) && \
+ !IS_ENABLED(CONFIG_DEBUG_KERNEL) && !IS_ENABLED(CONFIG_KASAN)); \
+ BUILD_BUG_ON(get_order(sizeof(struct x)) > 1 && \
+ !IS_ENABLED(CONFIG_DEBUG_KERNEL) && !IS_ENABLED(CONFIG_KASAN)); \
+} while (0)
+
#define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \
({ \
bool failed = (consistency_check); \
@@ -116,6 +143,24 @@ static inline void kvm_leave_nested(struct kvm_vcpu *vcpu)
kvm_x86_ops.nested_ops->leave_nested(vcpu);
}
+/*
+ * If IBRS is advertised to the vCPU, KVM must flush the indirect branch
+ * predictors when transitioning from L2 to L1, as L1 expects hardware (KVM in
+ * this case) to provide separate predictor modes. Bare metal isolates the host
+ * from the guest, but doesn't isolate different guests from one another (in
+ * this case L1 and L2). The exception is if bare metal supports same mode IBRS,
+ * which offers protection within the same mode, and hence protects L1 from L2.
+ */
+static inline void kvm_nested_vmexit_handle_ibrs(struct kvm_vcpu *vcpu)
+{
+ if (cpu_feature_enabled(X86_FEATURE_AMD_IBRS_SAME_MODE))
+ return;
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
+ guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_IBRS))
+ indirect_branch_prediction_barrier();
+}
+
static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu)
{
return vcpu->arch.last_vmentry_cpu != -1;
@@ -476,24 +521,34 @@ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
__rem; \
})
+static inline void kvm_disable_exits(struct kvm *kvm, u64 mask)
+{
+ kvm->arch.disabled_exits |= mask;
+}
+
static inline bool kvm_mwait_in_guest(struct kvm *kvm)
{
- return kvm->arch.mwait_in_guest;
+ return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_MWAIT;
}
static inline bool kvm_hlt_in_guest(struct kvm *kvm)
{
- return kvm->arch.hlt_in_guest;
+ return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_HLT;
}
static inline bool kvm_pause_in_guest(struct kvm *kvm)
{
- return kvm->arch.pause_in_guest;
+ return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_PAUSE;
}
static inline bool kvm_cstate_in_guest(struct kvm *kvm)
{
- return kvm->arch.cstate_in_guest;
+ return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_CSTATE;
+}
+
+static inline bool kvm_aperfmperf_in_guest(struct kvm *kvm)
+{
+ return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_APERFMPERF;
}
static inline bool kvm_notify_vmexit_enabled(struct kvm *kvm)
@@ -629,25 +684,17 @@ static inline bool user_exit_on_hypercall(struct kvm *kvm, unsigned long hc_nr)
return kvm->arch.hypercall_exit_enabled & BIT(hc_nr);
}
-int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
- unsigned long a0, unsigned long a1,
- unsigned long a2, unsigned long a3,
- int op_64_bit, int cpl,
+int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, int cpl,
int (*complete_hypercall)(struct kvm_vcpu *));
-#define __kvm_emulate_hypercall(_vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl, complete_hypercall) \
-({ \
- int __ret; \
- \
- __ret = ____kvm_emulate_hypercall(_vcpu, \
- kvm_##nr##_read(_vcpu), kvm_##a0##_read(_vcpu), \
- kvm_##a1##_read(_vcpu), kvm_##a2##_read(_vcpu), \
- kvm_##a3##_read(_vcpu), op_64_bit, cpl, \
- complete_hypercall); \
- \
- if (__ret > 0) \
- __ret = complete_hypercall(_vcpu); \
- __ret; \
+#define __kvm_emulate_hypercall(_vcpu, cpl, complete_hypercall) \
+({ \
+ int __ret; \
+ __ret = ____kvm_emulate_hypercall(_vcpu, cpl, complete_hypercall); \
+ \
+ if (__ret > 0) \
+ __ret = complete_hypercall(_vcpu); \
+ __ret; \
})
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 38b33cdd4232..d6b2a665b499 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -1526,7 +1526,7 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
if (kvm_read_guest_virt(vcpu, (gva_t)sched_poll.ports, ports,
sched_poll.nr_ports * sizeof(*ports), &e)) {
*r = -EFAULT;
- return true;
+ goto out;
}
for (i = 0; i < sched_poll.nr_ports; i++) {
@@ -1571,7 +1571,8 @@ out:
static void cancel_evtchn_poll(struct timer_list *t)
{
- struct kvm_vcpu *vcpu = from_timer(vcpu, t, arch.xen.poll_timer);
+ struct kvm_vcpu *vcpu = timer_container_of(vcpu, t,
+ arch.xen.poll_timer);
kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
kvm_vcpu_kick(vcpu);
@@ -1970,8 +1971,19 @@ int kvm_xen_setup_evtchn(struct kvm *kvm,
{
struct kvm_vcpu *vcpu;
- if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm))
- return -EINVAL;
+ /*
+ * Don't check for the port being within range of max_evtchn_port().
+ * Userspace can configure what ever targets it likes; events just won't
+ * be delivered if/while the target is invalid, just like userspace can
+ * configure MSIs which target non-existent APICs.
+ *
+ * This allow on Live Migration and Live Update, the IRQ routing table
+ * can be restored *independently* of other things like creating vCPUs,
+ * without imposing an ordering dependency on userspace. In this
+ * particular case, the problematic ordering would be with setting the
+ * Xen 'long mode' flag, which changes max_evtchn_port() to allow 4096
+ * instead of 1024 event channels.
+ */
/* We only support 2 level event channels for now */
if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
diff --git a/arch/x86/lib/.gitignore b/arch/x86/lib/.gitignore
index 8ae0f93ecbfd..ec2131c9fd20 100644
--- a/arch/x86/lib/.gitignore
+++ b/arch/x86/lib/.gitignore
@@ -1,2 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
+
+# This now-removed directory used to contain generated files.
+/crypto/
+
inat-tables.c
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 4fa5c4e1ba8a..2dba7f83ef97 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -3,8 +3,6 @@
# Makefile for x86 specific library files.
#
-obj-y += crypto/
-
# Produces uninteresting flaky coverage.
KCOV_INSTRUMENT_delay.o := n
@@ -40,16 +38,6 @@ lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
lib-$(CONFIG_MITIGATION_RETPOLINE) += retpoline.o
-obj-$(CONFIG_CRC32_ARCH) += crc32-x86.o
-crc32-x86-y := crc32.o crc32-pclmul.o
-crc32-x86-$(CONFIG_64BIT) += crc32c-3way.o
-
-obj-$(CONFIG_CRC64_ARCH) += crc64-x86.o
-crc64-x86-y := crc64.o crc64-pclmul.o
-
-obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-x86.o
-crc-t10dif-x86-y := crc-t10dif.o crc16-msb-pclmul.o
-
obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
obj-y += iomem.o
diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c
index 7af743bd3b13..c5c60d07308c 100644
--- a/arch/x86/lib/cache-smp.c
+++ b/arch/x86/lib/cache-smp.c
@@ -14,9 +14,31 @@ void wbinvd_on_cpu(int cpu)
}
EXPORT_SYMBOL(wbinvd_on_cpu);
-int wbinvd_on_all_cpus(void)
+void wbinvd_on_all_cpus(void)
{
on_each_cpu(__wbinvd, NULL, 1);
- return 0;
}
EXPORT_SYMBOL(wbinvd_on_all_cpus);
+
+void wbinvd_on_cpus_mask(struct cpumask *cpus)
+{
+ on_each_cpu_mask(cpus, __wbinvd, NULL, 1);
+}
+EXPORT_SYMBOL_GPL(wbinvd_on_cpus_mask);
+
+static void __wbnoinvd(void *dummy)
+{
+ wbnoinvd();
+}
+
+void wbnoinvd_on_all_cpus(void)
+{
+ on_each_cpu(__wbnoinvd, NULL, 1);
+}
+EXPORT_SYMBOL_GPL(wbnoinvd_on_all_cpus);
+
+void wbnoinvd_on_cpus_mask(struct cpumask *cpus)
+{
+ on_each_cpu_mask(cpus, __wbnoinvd, NULL, 1);
+}
+EXPORT_SYMBOL_GPL(wbnoinvd_on_cpus_mask);
diff --git a/arch/x86/lib/crc-pclmul-consts.h b/arch/x86/lib/crc-pclmul-consts.h
deleted file mode 100644
index fcc63c064333..000000000000
--- a/arch/x86/lib/crc-pclmul-consts.h
+++ /dev/null
@@ -1,195 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * CRC constants generated by:
- *
- * ./scripts/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320,crc64_msb_0x42f0e1eba9ea3693,crc64_lsb_0x9a6c9329ac4bc9b5
- *
- * Do not edit manually.
- */
-
-/*
- * CRC folding constants generated for most-significant-bit-first CRC-16 using
- * G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
- */
-static const struct {
- u8 bswap_mask[16];
- u64 fold_across_2048_bits_consts[2];
- u64 fold_across_1024_bits_consts[2];
- u64 fold_across_512_bits_consts[2];
- u64 fold_across_256_bits_consts[2];
- u64 fold_across_128_bits_consts[2];
- u8 shuf_table[48];
- u64 barrett_reduction_consts[2];
-} crc16_msb_0x8bb7_consts ____cacheline_aligned __maybe_unused = {
- .bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
- .fold_across_2048_bits_consts = {
- 0xdccf000000000000, /* LO64_TERMS: (x^2000 mod G) * x^48 */
- 0x4b0b000000000000, /* HI64_TERMS: (x^2064 mod G) * x^48 */
- },
- .fold_across_1024_bits_consts = {
- 0x9d9d000000000000, /* LO64_TERMS: (x^976 mod G) * x^48 */
- 0x7cf5000000000000, /* HI64_TERMS: (x^1040 mod G) * x^48 */
- },
- .fold_across_512_bits_consts = {
- 0x044c000000000000, /* LO64_TERMS: (x^464 mod G) * x^48 */
- 0xe658000000000000, /* HI64_TERMS: (x^528 mod G) * x^48 */
- },
- .fold_across_256_bits_consts = {
- 0x6ee3000000000000, /* LO64_TERMS: (x^208 mod G) * x^48 */
- 0xe7b5000000000000, /* HI64_TERMS: (x^272 mod G) * x^48 */
- },
- .fold_across_128_bits_consts = {
- 0x2d56000000000000, /* LO64_TERMS: (x^80 mod G) * x^48 */
- 0x06df000000000000, /* HI64_TERMS: (x^144 mod G) * x^48 */
- },
- .shuf_table = {
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- },
- .barrett_reduction_consts = {
- 0x8bb7000000000000, /* LO64_TERMS: (G - x^16) * x^48 */
- 0xf65a57f81d33a48a, /* HI64_TERMS: (floor(x^79 / G) * x) - x^64 */
- },
-};
-
-/*
- * CRC folding constants generated for least-significant-bit-first CRC-32 using
- * G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 +
- * x^5 + x^4 + x^2 + x^1 + x^0
- */
-static const struct {
- u64 fold_across_2048_bits_consts[2];
- u64 fold_across_1024_bits_consts[2];
- u64 fold_across_512_bits_consts[2];
- u64 fold_across_256_bits_consts[2];
- u64 fold_across_128_bits_consts[2];
- u8 shuf_table[48];
- u64 barrett_reduction_consts[2];
-} crc32_lsb_0xedb88320_consts ____cacheline_aligned __maybe_unused = {
- .fold_across_2048_bits_consts = {
- 0x00000000ce3371cb, /* HI64_TERMS: (x^2079 mod G) * x^32 */
- 0x00000000e95c1271, /* LO64_TERMS: (x^2015 mod G) * x^32 */
- },
- .fold_across_1024_bits_consts = {
- 0x0000000033fff533, /* HI64_TERMS: (x^1055 mod G) * x^32 */
- 0x00000000910eeec1, /* LO64_TERMS: (x^991 mod G) * x^32 */
- },
- .fold_across_512_bits_consts = {
- 0x000000008f352d95, /* HI64_TERMS: (x^543 mod G) * x^32 */
- 0x000000001d9513d7, /* LO64_TERMS: (x^479 mod G) * x^32 */
- },
- .fold_across_256_bits_consts = {
- 0x00000000f1da05aa, /* HI64_TERMS: (x^287 mod G) * x^32 */
- 0x0000000081256527, /* LO64_TERMS: (x^223 mod G) * x^32 */
- },
- .fold_across_128_bits_consts = {
- 0x00000000ae689191, /* HI64_TERMS: (x^159 mod G) * x^32 */
- 0x00000000ccaa009e, /* LO64_TERMS: (x^95 mod G) * x^32 */
- },
- .shuf_table = {
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- },
- .barrett_reduction_consts = {
- 0xb4e5b025f7011641, /* HI64_TERMS: floor(x^95 / G) */
- 0x00000001db710640, /* LO64_TERMS: (G - x^32) * x^31 */
- },
-};
-
-/*
- * CRC folding constants generated for most-significant-bit-first CRC-64 using
- * G(x) = x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
- * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
- * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
- * x^7 + x^4 + x^1 + x^0
- */
-static const struct {
- u8 bswap_mask[16];
- u64 fold_across_2048_bits_consts[2];
- u64 fold_across_1024_bits_consts[2];
- u64 fold_across_512_bits_consts[2];
- u64 fold_across_256_bits_consts[2];
- u64 fold_across_128_bits_consts[2];
- u8 shuf_table[48];
- u64 barrett_reduction_consts[2];
-} crc64_msb_0x42f0e1eba9ea3693_consts ____cacheline_aligned __maybe_unused = {
- .bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
- .fold_across_2048_bits_consts = {
- 0x7f52691a60ddc70d, /* LO64_TERMS: (x^2048 mod G) * x^0 */
- 0x7036b0389f6a0c82, /* HI64_TERMS: (x^2112 mod G) * x^0 */
- },
- .fold_across_1024_bits_consts = {
- 0x05cf79dea9ac37d6, /* LO64_TERMS: (x^1024 mod G) * x^0 */
- 0x001067e571d7d5c2, /* HI64_TERMS: (x^1088 mod G) * x^0 */
- },
- .fold_across_512_bits_consts = {
- 0x5f6843ca540df020, /* LO64_TERMS: (x^512 mod G) * x^0 */
- 0xddf4b6981205b83f, /* HI64_TERMS: (x^576 mod G) * x^0 */
- },
- .fold_across_256_bits_consts = {
- 0x571bee0a227ef92b, /* LO64_TERMS: (x^256 mod G) * x^0 */
- 0x44bef2a201b5200c, /* HI64_TERMS: (x^320 mod G) * x^0 */
- },
- .fold_across_128_bits_consts = {
- 0x05f5c3c7eb52fab6, /* LO64_TERMS: (x^128 mod G) * x^0 */
- 0x4eb938a7d257740e, /* HI64_TERMS: (x^192 mod G) * x^0 */
- },
- .shuf_table = {
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- },
- .barrett_reduction_consts = {
- 0x42f0e1eba9ea3693, /* LO64_TERMS: (G - x^64) * x^0 */
- 0x578d29d06cc4f872, /* HI64_TERMS: (floor(x^127 / G) * x) - x^64 */
- },
-};
-
-/*
- * CRC folding constants generated for least-significant-bit-first CRC-64 using
- * G(x) = x^64 + x^63 + x^61 + x^59 + x^58 + x^56 + x^55 + x^52 + x^49 + x^48 +
- * x^47 + x^46 + x^44 + x^41 + x^37 + x^36 + x^34 + x^32 + x^31 + x^28 +
- * x^26 + x^23 + x^22 + x^19 + x^16 + x^13 + x^12 + x^10 + x^9 + x^6 +
- * x^4 + x^3 + x^0
- */
-static const struct {
- u64 fold_across_2048_bits_consts[2];
- u64 fold_across_1024_bits_consts[2];
- u64 fold_across_512_bits_consts[2];
- u64 fold_across_256_bits_consts[2];
- u64 fold_across_128_bits_consts[2];
- u8 shuf_table[48];
- u64 barrett_reduction_consts[2];
-} crc64_lsb_0x9a6c9329ac4bc9b5_consts ____cacheline_aligned __maybe_unused = {
- .fold_across_2048_bits_consts = {
- 0x37ccd3e14069cabc, /* HI64_TERMS: (x^2111 mod G) * x^0 */
- 0xa043808c0f782663, /* LO64_TERMS: (x^2047 mod G) * x^0 */
- },
- .fold_across_1024_bits_consts = {
- 0xa1ca681e733f9c40, /* HI64_TERMS: (x^1087 mod G) * x^0 */
- 0x5f852fb61e8d92dc, /* LO64_TERMS: (x^1023 mod G) * x^0 */
- },
- .fold_across_512_bits_consts = {
- 0x0c32cdb31e18a84a, /* HI64_TERMS: (x^575 mod G) * x^0 */
- 0x62242240ace5045a, /* LO64_TERMS: (x^511 mod G) * x^0 */
- },
- .fold_across_256_bits_consts = {
- 0xb0bc2e589204f500, /* HI64_TERMS: (x^319 mod G) * x^0 */
- 0xe1e0bb9d45d7a44c, /* LO64_TERMS: (x^255 mod G) * x^0 */
- },
- .fold_across_128_bits_consts = {
- 0xeadc41fd2ba3d420, /* HI64_TERMS: (x^191 mod G) * x^0 */
- 0x21e9761e252621ac, /* LO64_TERMS: (x^127 mod G) * x^0 */
- },
- .shuf_table = {
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- },
- .barrett_reduction_consts = {
- 0x27ecfa329aef9f77, /* HI64_TERMS: floor(x^127 / G) */
- 0x34d926535897936a, /* LO64_TERMS: (G - x^64 - x^0) / x */
- },
-};
diff --git a/arch/x86/lib/crc-pclmul-template.S b/arch/x86/lib/crc-pclmul-template.S
deleted file mode 100644
index ae0b6144c503..000000000000
--- a/arch/x86/lib/crc-pclmul-template.S
+++ /dev/null
@@ -1,582 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-//
-// Template to generate [V]PCLMULQDQ-based CRC functions for x86
-//
-// Copyright 2025 Google LLC
-//
-// Author: Eric Biggers <ebiggers@google.com>
-
-#include <linux/linkage.h>
-#include <linux/objtool.h>
-
-// Offsets within the generated constants table
-.set OFFSETOF_BSWAP_MASK, -5*16 // msb-first CRCs only
-.set OFFSETOF_FOLD_ACROSS_2048_BITS_CONSTS, -4*16 // must precede next
-.set OFFSETOF_FOLD_ACROSS_1024_BITS_CONSTS, -3*16 // must precede next
-.set OFFSETOF_FOLD_ACROSS_512_BITS_CONSTS, -2*16 // must precede next
-.set OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS, -1*16 // must precede next
-.set OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS, 0*16 // must be 0
-.set OFFSETOF_SHUF_TABLE, 1*16
-.set OFFSETOF_BARRETT_REDUCTION_CONSTS, 4*16
-
-// Emit a VEX (or EVEX) coded instruction if allowed, or emulate it using the
-// corresponding non-VEX instruction plus any needed moves. The supported
-// instruction formats are:
-//
-// - Two-arg [src, dst], where the non-VEX format is the same.
-// - Three-arg [src1, src2, dst] where the non-VEX format is
-// [src1, src2_and_dst]. If src2 != dst, then src1 must != dst too.
-//
-// \insn gives the instruction without a "v" prefix and including any immediate
-// argument if needed to make the instruction follow one of the above formats.
-// If \unaligned_mem_tmp is given, then the emitted non-VEX code moves \arg1 to
-// it first; this is needed when \arg1 is an unaligned mem operand.
-.macro _cond_vex insn:req, arg1:req, arg2:req, arg3, unaligned_mem_tmp
-.if AVX_LEVEL == 0
- // VEX not allowed. Emulate it.
- .ifnb \arg3 // Three-arg [src1, src2, dst]
- .ifc "\arg2", "\arg3" // src2 == dst?
- .ifnb \unaligned_mem_tmp
- movdqu \arg1, \unaligned_mem_tmp
- \insn \unaligned_mem_tmp, \arg3
- .else
- \insn \arg1, \arg3
- .endif
- .else // src2 != dst
- .ifc "\arg1", "\arg3"
- .error "Can't have src1 == dst when src2 != dst"
- .endif
- .ifnb \unaligned_mem_tmp
- movdqu \arg1, \unaligned_mem_tmp
- movdqa \arg2, \arg3
- \insn \unaligned_mem_tmp, \arg3
- .else
- movdqa \arg2, \arg3
- \insn \arg1, \arg3
- .endif
- .endif
- .else // Two-arg [src, dst]
- .ifnb \unaligned_mem_tmp
- movdqu \arg1, \unaligned_mem_tmp
- \insn \unaligned_mem_tmp, \arg2
- .else
- \insn \arg1, \arg2
- .endif
- .endif
-.else
- // VEX is allowed. Emit the desired instruction directly.
- .ifnb \arg3
- v\insn \arg1, \arg2, \arg3
- .else
- v\insn \arg1, \arg2
- .endif
-.endif
-.endm
-
-// Broadcast an aligned 128-bit mem operand to all 128-bit lanes of a vector
-// register of length VL.
-.macro _vbroadcast src, dst
-.if VL == 16
- _cond_vex movdqa, \src, \dst
-.elseif VL == 32
- vbroadcasti128 \src, \dst
-.else
- vbroadcasti32x4 \src, \dst
-.endif
-.endm
-
-// Load \vl bytes from the unaligned mem operand \src into \dst, and if the CRC
-// is msb-first use \bswap_mask to reflect the bytes within each 128-bit lane.
-.macro _load_data vl, src, bswap_mask, dst
-.if \vl < 64
- _cond_vex movdqu, "\src", \dst
-.else
- vmovdqu8 \src, \dst
-.endif
-.if !LSB_CRC
- _cond_vex pshufb, \bswap_mask, \dst, \dst
-.endif
-.endm
-
-.macro _prepare_v0 vl, v0, v1, bswap_mask
-.if LSB_CRC
- .if \vl < 64
- _cond_vex pxor, (BUF), \v0, \v0, unaligned_mem_tmp=\v1
- .else
- vpxorq (BUF), \v0, \v0
- .endif
-.else
- _load_data \vl, (BUF), \bswap_mask, \v1
- .if \vl < 64
- _cond_vex pxor, \v1, \v0, \v0
- .else
- vpxorq \v1, \v0, \v0
- .endif
-.endif
-.endm
-
-// The x^0..x^63 terms, i.e. poly128 mod x^64, i.e. the physically low qword for
-// msb-first order or the physically high qword for lsb-first order
-#define LO64_TERMS 0
-
-// The x^64..x^127 terms, i.e. floor(poly128 / x^64), i.e. the physically high
-// qword for msb-first order or the physically low qword for lsb-first order
-#define HI64_TERMS 1
-
-// Multiply the given \src1_terms of each 128-bit lane of \src1 by the given
-// \src2_terms of each 128-bit lane of \src2, and write the result(s) to \dst.
-.macro _pclmulqdq src1, src1_terms, src2, src2_terms, dst
- _cond_vex "pclmulqdq $((\src1_terms ^ LSB_CRC) << 4) ^ (\src2_terms ^ LSB_CRC),", \
- \src1, \src2, \dst
-.endm
-
-// Fold \acc into \data and store the result back into \acc. \data can be an
-// unaligned mem operand if using VEX is allowed and the CRC is lsb-first so no
-// byte-reflection is needed; otherwise it must be a vector register. \consts
-// is a vector register containing the needed fold constants, and \tmp is a
-// temporary vector register. All arguments must be the same length.
-.macro _fold_vec acc, data, consts, tmp
- _pclmulqdq \consts, HI64_TERMS, \acc, HI64_TERMS, \tmp
- _pclmulqdq \consts, LO64_TERMS, \acc, LO64_TERMS, \acc
-.if AVX_LEVEL <= 2
- _cond_vex pxor, \data, \tmp, \tmp
- _cond_vex pxor, \tmp, \acc, \acc
-.else
- vpternlogq $0x96, \data, \tmp, \acc
-.endif
-.endm
-
-// Fold \acc into \data and store the result back into \acc. \data is an
-// unaligned mem operand, \consts is a vector register containing the needed
-// fold constants, \bswap_mask is a vector register containing the
-// byte-reflection table if the CRC is msb-first, and \tmp1 and \tmp2 are
-// temporary vector registers. All arguments must have length \vl.
-.macro _fold_vec_mem vl, acc, data, consts, bswap_mask, tmp1, tmp2
-.if AVX_LEVEL == 0 || !LSB_CRC
- _load_data \vl, \data, \bswap_mask, \tmp1
- _fold_vec \acc, \tmp1, \consts, \tmp2
-.else
- _fold_vec \acc, \data, \consts, \tmp1
-.endif
-.endm
-
-// Load the constants for folding across 2**i vectors of length VL at a time
-// into all 128-bit lanes of the vector register CONSTS.
-.macro _load_vec_folding_consts i
- _vbroadcast OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS+(4-LOG2_VL-\i)*16(CONSTS_PTR), \
- CONSTS
-.endm
-
-// Given vector registers \v0 and \v1 of length \vl, fold \v0 into \v1 and store
-// the result back into \v0. If the remaining length mod \vl is nonzero, also
-// fold \vl data bytes from BUF. For both operations the fold distance is \vl.
-// \consts must be a register of length \vl containing the fold constants.
-.macro _fold_vec_final vl, v0, v1, consts, bswap_mask, tmp1, tmp2
- _fold_vec \v0, \v1, \consts, \tmp1
- test $\vl, LEN8
- jz .Lfold_vec_final_done\@
- _fold_vec_mem \vl, \v0, (BUF), \consts, \bswap_mask, \tmp1, \tmp2
- add $\vl, BUF
-.Lfold_vec_final_done\@:
-.endm
-
-// This macro generates the body of a CRC function with the following prototype:
-//
-// crc_t crc_func(crc_t crc, const u8 *buf, size_t len, const void *consts);
-//
-// |crc| is the initial CRC, and crc_t is a data type wide enough to hold it.
-// |buf| is the data to checksum. |len| is the data length in bytes, which must
-// be at least 16. |consts| is a pointer to the fold_across_128_bits_consts
-// field of the constants struct that was generated for the chosen CRC variant.
-//
-// Moving onto the macro parameters, \n is the number of bits in the CRC, e.g.
-// 32 for a CRC-32. Currently the supported values are 8, 16, 32, and 64. If
-// the file is compiled in i386 mode, then the maximum supported value is 32.
-//
-// \lsb_crc is 1 if the CRC processes the least significant bit of each byte
-// first, i.e. maps bit0 to x^7, bit1 to x^6, ..., bit7 to x^0. \lsb_crc is 0
-// if the CRC processes the most significant bit of each byte first, i.e. maps
-// bit0 to x^0, bit1 to x^1, bit7 to x^7.
-//
-// \vl is the maximum length of vector register to use in bytes: 16, 32, or 64.
-//
-// \avx_level is the level of AVX support to use: 0 for SSE only, 2 for AVX2, or
-// 512 for AVX512.
-//
-// If \vl == 16 && \avx_level == 0, the generated code requires:
-// PCLMULQDQ && SSE4.1. (Note: all known CPUs with PCLMULQDQ also have SSE4.1.)
-//
-// If \vl == 32 && \avx_level == 2, the generated code requires:
-// VPCLMULQDQ && AVX2.
-//
-// If \vl == 64 && \avx_level == 512, the generated code requires:
-// VPCLMULQDQ && AVX512BW && AVX512VL.
-//
-// Other \vl and \avx_level combinations are either not supported or not useful.
-.macro _crc_pclmul n, lsb_crc, vl, avx_level
- .set LSB_CRC, \lsb_crc
- .set VL, \vl
- .set AVX_LEVEL, \avx_level
-
- // Define aliases for the xmm, ymm, or zmm registers according to VL.
-.irp i, 0,1,2,3,4,5,6,7
- .if VL == 16
- .set V\i, %xmm\i
- .set LOG2_VL, 4
- .elseif VL == 32
- .set V\i, %ymm\i
- .set LOG2_VL, 5
- .elseif VL == 64
- .set V\i, %zmm\i
- .set LOG2_VL, 6
- .else
- .error "Unsupported vector length"
- .endif
-.endr
- // Define aliases for the function parameters.
- // Note: when crc_t is shorter than u32, zero-extension to 32 bits is
- // guaranteed by the ABI. Zero-extension to 64 bits is *not* guaranteed
- // when crc_t is shorter than u64.
-#ifdef __x86_64__
-.if \n <= 32
- .set CRC, %edi
-.else
- .set CRC, %rdi
-.endif
- .set BUF, %rsi
- .set LEN, %rdx
- .set LEN32, %edx
- .set LEN8, %dl
- .set CONSTS_PTR, %rcx
-#else
- // 32-bit support, assuming -mregparm=3 and not including support for
- // CRC-64 (which would use both eax and edx to pass the crc parameter).
- .set CRC, %eax
- .set BUF, %edx
- .set LEN, %ecx
- .set LEN32, %ecx
- .set LEN8, %cl
- .set CONSTS_PTR, %ebx // Passed on stack
-#endif
-
- // Define aliases for some local variables. V0-V5 are used without
- // aliases (for accumulators, data, temporary values, etc). Staying
- // within the first 8 vector registers keeps the code 32-bit SSE
- // compatible and reduces the size of 64-bit SSE code slightly.
- .set BSWAP_MASK, V6
- .set BSWAP_MASK_YMM, %ymm6
- .set BSWAP_MASK_XMM, %xmm6
- .set CONSTS, V7
- .set CONSTS_YMM, %ymm7
- .set CONSTS_XMM, %xmm7
-
- // Use ANNOTATE_NOENDBR to suppress an objtool warning, since the
- // functions generated by this macro are called only by static_call.
- ANNOTATE_NOENDBR
-
-#ifdef __i386__
- push CONSTS_PTR
- mov 8(%esp), CONSTS_PTR
-#endif
-
- // Create a 128-bit vector that contains the initial CRC in the end
- // representing the high-order polynomial coefficients, and the rest 0.
- // If the CRC is msb-first, also load the byte-reflection table.
-.if \n <= 32
- _cond_vex movd, CRC, %xmm0
-.else
- _cond_vex movq, CRC, %xmm0
-.endif
-.if !LSB_CRC
- _cond_vex pslldq, $(128-\n)/8, %xmm0, %xmm0
- _vbroadcast OFFSETOF_BSWAP_MASK(CONSTS_PTR), BSWAP_MASK
-.endif
-
- // Load the first vector of data and XOR the initial CRC into the
- // appropriate end of the first 128-bit lane of data. If LEN < VL, then
- // use a short vector and jump ahead to the final reduction. (LEN >= 16
- // is guaranteed here but not necessarily LEN >= VL.)
-.if VL >= 32
- cmp $VL, LEN
- jae .Lat_least_1vec\@
- .if VL == 64
- cmp $32, LEN32
- jb .Lless_than_32bytes\@
- _prepare_v0 32, %ymm0, %ymm1, BSWAP_MASK_YMM
- add $32, BUF
- jmp .Lreduce_256bits_to_128bits\@
-.Lless_than_32bytes\@:
- .endif
- _prepare_v0 16, %xmm0, %xmm1, BSWAP_MASK_XMM
- add $16, BUF
- vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
- jmp .Lcheck_for_partial_block\@
-.Lat_least_1vec\@:
-.endif
- _prepare_v0 VL, V0, V1, BSWAP_MASK
-
- // Handle VL <= LEN < 4*VL.
- cmp $4*VL-1, LEN
- ja .Lat_least_4vecs\@
- add $VL, BUF
- // If VL <= LEN < 2*VL, then jump ahead to the reduction from 1 vector.
- // If VL==16 then load fold_across_128_bits_consts first, as the final
- // reduction depends on it and it won't be loaded anywhere else.
- cmp $2*VL-1, LEN32
-.if VL == 16
- _cond_vex movdqa, OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
-.endif
- jbe .Lreduce_1vec_to_128bits\@
- // Otherwise 2*VL <= LEN < 4*VL. Load one more vector and jump ahead to
- // the reduction from 2 vectors.
- _load_data VL, (BUF), BSWAP_MASK, V1
- add $VL, BUF
- jmp .Lreduce_2vecs_to_1\@
-
-.Lat_least_4vecs\@:
- // Load 3 more vectors of data.
- _load_data VL, 1*VL(BUF), BSWAP_MASK, V1
- _load_data VL, 2*VL(BUF), BSWAP_MASK, V2
- _load_data VL, 3*VL(BUF), BSWAP_MASK, V3
- sub $-4*VL, BUF // Shorter than 'add 4*VL' when VL=32
- add $-4*VL, LEN // Shorter than 'sub 4*VL' when VL=32
-
- // Main loop: while LEN >= 4*VL, fold the 4 vectors V0-V3 into the next
- // 4 vectors of data and write the result back to V0-V3.
- cmp $4*VL-1, LEN // Shorter than 'cmp 4*VL' when VL=32
- jbe .Lreduce_4vecs_to_2\@
- _load_vec_folding_consts 2
-.Lfold_4vecs_loop\@:
- _fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
- _fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
- _fold_vec_mem VL, V2, 2*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
- _fold_vec_mem VL, V3, 3*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
- sub $-4*VL, BUF
- add $-4*VL, LEN
- cmp $4*VL-1, LEN
- ja .Lfold_4vecs_loop\@
-
- // Fold V0,V1 into V2,V3 and write the result back to V0,V1. Then fold
- // two more vectors of data from BUF, if at least that much remains.
-.Lreduce_4vecs_to_2\@:
- _load_vec_folding_consts 1
- _fold_vec V0, V2, CONSTS, V4
- _fold_vec V1, V3, CONSTS, V4
- test $2*VL, LEN8
- jz .Lreduce_2vecs_to_1\@
- _fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
- _fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
- sub $-2*VL, BUF
-
- // Fold V0 into V1 and write the result back to V0. Then fold one more
- // vector of data from BUF, if at least that much remains.
-.Lreduce_2vecs_to_1\@:
- _load_vec_folding_consts 0
- _fold_vec_final VL, V0, V1, CONSTS, BSWAP_MASK, V4, V5
-
-.Lreduce_1vec_to_128bits\@:
-.if VL == 64
- // Reduce 512-bit %zmm0 to 256-bit %ymm0. Then fold 256 more bits of
- // data from BUF, if at least that much remains.
- vbroadcasti128 OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS(CONSTS_PTR), CONSTS_YMM
- vextracti64x4 $1, %zmm0, %ymm1
- _fold_vec_final 32, %ymm0, %ymm1, CONSTS_YMM, BSWAP_MASK_YMM, %ymm4, %ymm5
-.Lreduce_256bits_to_128bits\@:
-.endif
-.if VL >= 32
- // Reduce 256-bit %ymm0 to 128-bit %xmm0. Then fold 128 more bits of
- // data from BUF, if at least that much remains.
- vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
- vextracti128 $1, %ymm0, %xmm1
- _fold_vec_final 16, %xmm0, %xmm1, CONSTS_XMM, BSWAP_MASK_XMM, %xmm4, %xmm5
-.Lcheck_for_partial_block\@:
-.endif
- and $15, LEN32
- jz .Lreduce_128bits_to_crc\@
-
- // 1 <= LEN <= 15 data bytes remain in BUF. The polynomial is now
- // A*(x^(8*LEN)) + B, where A is the 128-bit polynomial stored in %xmm0
- // and B is the polynomial of the remaining LEN data bytes. To reduce
- // this to 128 bits without needing fold constants for each possible
- // LEN, rearrange this expression into C1*(x^128) + C2, where
- // C1 = floor(A / x^(128 - 8*LEN)) and C2 = A*x^(8*LEN) + B mod x^128.
- // Then fold C1 into C2, which is just another fold across 128 bits.
-
-.if !LSB_CRC || AVX_LEVEL == 0
- // Load the last 16 data bytes. Note that originally LEN was >= 16.
- _load_data 16, "-16(BUF,LEN)", BSWAP_MASK_XMM, %xmm2
-.endif // Else will use vpblendvb mem operand later.
-.if !LSB_CRC
- neg LEN // Needed for indexing shuf_table
-.endif
-
- // tmp = A*x^(8*LEN) mod x^128
- // lsb: pshufb by [LEN, LEN+1, ..., 15, -1, -1, ..., -1]
- // i.e. right-shift by LEN bytes.
- // msb: pshufb by [-1, -1, ..., -1, 0, 1, ..., 15-LEN]
- // i.e. left-shift by LEN bytes.
- _cond_vex movdqu, "OFFSETOF_SHUF_TABLE+16(CONSTS_PTR,LEN)", %xmm3
- _cond_vex pshufb, %xmm3, %xmm0, %xmm1
-
- // C1 = floor(A / x^(128 - 8*LEN))
- // lsb: pshufb by [-1, -1, ..., -1, 0, 1, ..., LEN-1]
- // i.e. left-shift by 16-LEN bytes.
- // msb: pshufb by [16-LEN, 16-LEN+1, ..., 15, -1, -1, ..., -1]
- // i.e. right-shift by 16-LEN bytes.
- _cond_vex pshufb, "OFFSETOF_SHUF_TABLE+32*!LSB_CRC(CONSTS_PTR,LEN)", \
- %xmm0, %xmm0, unaligned_mem_tmp=%xmm4
-
- // C2 = tmp + B. This is just a blend of tmp with the last 16 data
- // bytes (reflected if msb-first). The blend mask is the shuffle table
- // that was used to create tmp. 0 selects tmp, and 1 last16databytes.
-.if AVX_LEVEL == 0
- movdqa %xmm0, %xmm4
- movdqa %xmm3, %xmm0
- pblendvb %xmm2, %xmm1 // uses %xmm0 as implicit operand
- movdqa %xmm4, %xmm0
-.elseif LSB_CRC
- vpblendvb %xmm3, -16(BUF,LEN), %xmm1, %xmm1
-.else
- vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
-.endif
-
- // Fold C1 into C2 and store the 128-bit result in %xmm0.
- _fold_vec %xmm0, %xmm1, CONSTS_XMM, %xmm4
-
-.Lreduce_128bits_to_crc\@:
- // Compute the CRC as %xmm0 * x^n mod G. Here %xmm0 means the 128-bit
- // polynomial stored in %xmm0 (using either lsb-first or msb-first bit
- // order according to LSB_CRC), and G is the CRC's generator polynomial.
-
- // First, multiply %xmm0 by x^n and reduce the result to 64+n bits:
- //
- // t0 := (x^(64+n) mod G) * floor(%xmm0 / x^64) +
- // x^n * (%xmm0 mod x^64)
- //
- // Store t0 * x^(64-n) in %xmm0. I.e., actually do:
- //
- // %xmm0 := ((x^(64+n) mod G) * x^(64-n)) * floor(%xmm0 / x^64) +
- // x^64 * (%xmm0 mod x^64)
- //
- // The extra unreduced factor of x^(64-n) makes floor(t0 / x^n) aligned
- // to the HI64_TERMS of %xmm0 so that the next pclmulqdq can easily
- // select it. The 64-bit constant (x^(64+n) mod G) * x^(64-n) in the
- // msb-first case, or (x^(63+n) mod G) * x^(64-n) in the lsb-first case
- // (considering the extra factor of x that gets implicitly introduced by
- // each pclmulqdq when using lsb-first order), is identical to the
- // constant that was used earlier for folding the LO64_TERMS across 128
- // bits. Thus it's already available in LO64_TERMS of CONSTS_XMM.
- _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm0, HI64_TERMS, %xmm1
-.if LSB_CRC
- _cond_vex psrldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64)
-.else
- _cond_vex pslldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64)
-.endif
- _cond_vex pxor, %xmm1, %xmm0, %xmm0
- // The HI64_TERMS of %xmm0 now contain floor(t0 / x^n).
- // The LO64_TERMS of %xmm0 now contain (t0 mod x^n) * x^(64-n).
-
- // First step of Barrett reduction: Compute floor(t0 / G). This is the
- // polynomial by which G needs to be multiplied to cancel out the x^n
- // and higher terms of t0, i.e. to reduce t0 mod G. First do:
- //
- // t1 := floor(x^(63+n) / G) * x * floor(t0 / x^n)
- //
- // Then the desired value floor(t0 / G) is floor(t1 / x^64). The 63 in
- // x^(63+n) is the maximum degree of floor(t0 / x^n) and thus the lowest
- // value that makes enough precision be carried through the calculation.
- //
- // The '* x' makes it so the result is floor(t1 / x^64) rather than
- // floor(t1 / x^63), making it qword-aligned in HI64_TERMS so that it
- // can be extracted much more easily in the next step. In the lsb-first
- // case the '* x' happens implicitly. In the msb-first case it must be
- // done explicitly; floor(x^(63+n) / G) * x is a 65-bit constant, so the
- // constant passed to pclmulqdq is (floor(x^(63+n) / G) * x) - x^64, and
- // the multiplication by the x^64 term is handled using a pxor. The
- // pxor causes the low 64 terms of t1 to be wrong, but they are unused.
- _cond_vex movdqa, OFFSETOF_BARRETT_REDUCTION_CONSTS(CONSTS_PTR), CONSTS_XMM
- _pclmulqdq CONSTS_XMM, HI64_TERMS, %xmm0, HI64_TERMS, %xmm1
-.if !LSB_CRC
- _cond_vex pxor, %xmm0, %xmm1, %xmm1 // += x^64 * floor(t0 / x^n)
-.endif
- // The HI64_TERMS of %xmm1 now contain floor(t1 / x^64) = floor(t0 / G).
-
- // Second step of Barrett reduction: Cancel out the x^n and higher terms
- // of t0 by subtracting the needed multiple of G. This gives the CRC:
- //
- // crc := t0 - (G * floor(t0 / G))
- //
- // But %xmm0 contains t0 * x^(64-n), so it's more convenient to do:
- //
- // crc := ((t0 * x^(64-n)) - ((G * x^(64-n)) * floor(t0 / G))) / x^(64-n)
- //
- // Furthermore, since the resulting CRC is n-bit, if mod x^n is
- // explicitly applied to it then the x^n term of G makes no difference
- // in the result and can be omitted. This helps keep the constant
- // multiplier in 64 bits in most cases. This gives the following:
- //
- // %xmm0 := %xmm0 - (((G - x^n) * x^(64-n)) * floor(t0 / G))
- // crc := (%xmm0 / x^(64-n)) mod x^n
- //
- // In the lsb-first case, each pclmulqdq implicitly introduces
- // an extra factor of x, so in that case the constant that needs to be
- // passed to pclmulqdq is actually '(G - x^n) * x^(63-n)' when n <= 63.
- // For lsb-first CRCs where n=64, the extra factor of x cannot be as
- // easily avoided. In that case, instead pass '(G - x^n - x^0) / x' to
- // pclmulqdq and handle the x^0 term (i.e. 1) separately. (All CRC
- // polynomials have nonzero x^n and x^0 terms.) It works out as: the
- // CRC has be XORed with the physically low qword of %xmm1, representing
- // floor(t0 / G). The most efficient way to do that is to move it to
- // the physically high qword and use a ternlog to combine the two XORs.
-.if LSB_CRC && \n == 64
- _cond_vex punpcklqdq, %xmm1, %xmm2, %xmm2
- _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1
- .if AVX_LEVEL <= 2
- _cond_vex pxor, %xmm2, %xmm0, %xmm0
- _cond_vex pxor, %xmm1, %xmm0, %xmm0
- .else
- vpternlogq $0x96, %xmm2, %xmm1, %xmm0
- .endif
- _cond_vex "pextrq $1,", %xmm0, %rax // (%xmm0 / x^0) mod x^64
-.else
- _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1
- _cond_vex pxor, %xmm1, %xmm0, %xmm0
- .if \n == 8
- _cond_vex "pextrb $7 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^56) mod x^8
- .elseif \n == 16
- _cond_vex "pextrw $3 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^48) mod x^16
- .elseif \n == 32
- _cond_vex "pextrd $1 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^32) mod x^32
- .else // \n == 64 && !LSB_CRC
- _cond_vex movq, %xmm0, %rax // (%xmm0 / x^0) mod x^64
- .endif
-.endif
-
-.if VL > 16
- vzeroupper // Needed when ymm or zmm registers may have been used.
-.endif
-#ifdef __i386__
- pop CONSTS_PTR
-#endif
- RET
-.endm
-
-#ifdef CONFIG_AS_VPCLMULQDQ
-#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb) \
-SYM_FUNC_START(prefix##_pclmul_sse); \
- _crc_pclmul n=bits, lsb_crc=lsb, vl=16, avx_level=0; \
-SYM_FUNC_END(prefix##_pclmul_sse); \
- \
-SYM_FUNC_START(prefix##_vpclmul_avx2); \
- _crc_pclmul n=bits, lsb_crc=lsb, vl=32, avx_level=2; \
-SYM_FUNC_END(prefix##_vpclmul_avx2); \
- \
-SYM_FUNC_START(prefix##_vpclmul_avx512); \
- _crc_pclmul n=bits, lsb_crc=lsb, vl=64, avx_level=512; \
-SYM_FUNC_END(prefix##_vpclmul_avx512);
-#else
-#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb) \
-SYM_FUNC_START(prefix##_pclmul_sse); \
- _crc_pclmul n=bits, lsb_crc=lsb, vl=16, avx_level=0; \
-SYM_FUNC_END(prefix##_pclmul_sse);
-#endif // !CONFIG_AS_VPCLMULQDQ
diff --git a/arch/x86/lib/crc-pclmul-template.h b/arch/x86/lib/crc-pclmul-template.h
deleted file mode 100644
index c5b3bfe11d8d..000000000000
--- a/arch/x86/lib/crc-pclmul-template.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Macros for accessing the [V]PCLMULQDQ-based CRC functions that are
- * instantiated by crc-pclmul-template.S
- *
- * Copyright 2025 Google LLC
- *
- * Author: Eric Biggers <ebiggers@google.com>
- */
-#ifndef _CRC_PCLMUL_TEMPLATE_H
-#define _CRC_PCLMUL_TEMPLATE_H
-
-#include <asm/cpufeatures.h>
-#include <asm/simd.h>
-#include <crypto/internal/simd.h>
-#include <linux/static_call.h>
-#include "crc-pclmul-consts.h"
-
-#define DECLARE_CRC_PCLMUL_FUNCS(prefix, crc_t) \
-crc_t prefix##_pclmul_sse(crc_t crc, const u8 *p, size_t len, \
- const void *consts_ptr); \
-crc_t prefix##_vpclmul_avx2(crc_t crc, const u8 *p, size_t len, \
- const void *consts_ptr); \
-crc_t prefix##_vpclmul_avx512(crc_t crc, const u8 *p, size_t len, \
- const void *consts_ptr); \
-DEFINE_STATIC_CALL(prefix##_pclmul, prefix##_pclmul_sse)
-
-#define INIT_CRC_PCLMUL(prefix) \
-do { \
- if (IS_ENABLED(CONFIG_AS_VPCLMULQDQ) && \
- boot_cpu_has(X86_FEATURE_VPCLMULQDQ) && \
- boot_cpu_has(X86_FEATURE_AVX2) && \
- cpu_has_xfeatures(XFEATURE_MASK_YMM, NULL)) { \
- if (boot_cpu_has(X86_FEATURE_AVX512BW) && \
- boot_cpu_has(X86_FEATURE_AVX512VL) && \
- !boot_cpu_has(X86_FEATURE_PREFER_YMM) && \
- cpu_has_xfeatures(XFEATURE_MASK_AVX512, NULL)) { \
- static_call_update(prefix##_pclmul, \
- prefix##_vpclmul_avx512); \
- } else { \
- static_call_update(prefix##_pclmul, \
- prefix##_vpclmul_avx2); \
- } \
- } \
-} while (0)
-
-/*
- * Call a [V]PCLMULQDQ optimized CRC function if the data length is at least 16
- * bytes, the CPU has PCLMULQDQ support, and the current context may use SIMD.
- *
- * 16 bytes is the minimum length supported by the [V]PCLMULQDQ functions.
- * There is overhead associated with kernel_fpu_begin() and kernel_fpu_end(),
- * varying by CPU and factors such as which parts of the "FPU" state userspace
- * has touched, which could result in a larger cutoff being better. Indeed, a
- * larger cutoff is usually better for a *single* message. However, the
- * overhead of the FPU section gets amortized if multiple FPU sections get
- * executed before returning to userspace, since the XSAVE and XRSTOR occur only
- * once. Considering that and the fact that the [V]PCLMULQDQ code is lighter on
- * the dcache than the table-based code is, a 16-byte cutoff seems to work well.
- */
-#define CRC_PCLMUL(crc, p, len, prefix, consts, have_pclmulqdq) \
-do { \
- if ((len) >= 16 && static_branch_likely(&(have_pclmulqdq)) && \
- crypto_simd_usable()) { \
- const void *consts_ptr; \
- \
- consts_ptr = (consts).fold_across_128_bits_consts; \
- kernel_fpu_begin(); \
- crc = static_call(prefix##_pclmul)((crc), (p), (len), \
- consts_ptr); \
- kernel_fpu_end(); \
- return crc; \
- } \
-} while (0)
-
-#endif /* _CRC_PCLMUL_TEMPLATE_H */
diff --git a/arch/x86/lib/crc-t10dif.c b/arch/x86/lib/crc-t10dif.c
deleted file mode 100644
index db7ce59c31ac..000000000000
--- a/arch/x86/lib/crc-t10dif.c
+++ /dev/null
@@ -1,40 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * CRC-T10DIF using [V]PCLMULQDQ instructions
- *
- * Copyright 2024 Google LLC
- */
-
-#include <linux/crc-t10dif.h>
-#include <linux/module.h>
-#include "crc-pclmul-template.h"
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
-
-DECLARE_CRC_PCLMUL_FUNCS(crc16_msb, u16);
-
-u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len)
-{
- CRC_PCLMUL(crc, p, len, crc16_msb, crc16_msb_0x8bb7_consts,
- have_pclmulqdq);
- return crc_t10dif_generic(crc, p, len);
-}
-EXPORT_SYMBOL(crc_t10dif_arch);
-
-static int __init crc_t10dif_x86_init(void)
-{
- if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
- static_branch_enable(&have_pclmulqdq);
- INIT_CRC_PCLMUL(crc16_msb);
- }
- return 0;
-}
-subsys_initcall(crc_t10dif_x86_init);
-
-static void __exit crc_t10dif_x86_exit(void)
-{
-}
-module_exit(crc_t10dif_x86_exit);
-
-MODULE_DESCRIPTION("CRC-T10DIF using [V]PCLMULQDQ instructions");
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/lib/crc16-msb-pclmul.S b/arch/x86/lib/crc16-msb-pclmul.S
deleted file mode 100644
index e9fe248093a8..000000000000
--- a/arch/x86/lib/crc16-msb-pclmul.S
+++ /dev/null
@@ -1,6 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-// Copyright 2025 Google LLC
-
-#include "crc-pclmul-template.S"
-
-DEFINE_CRC_PCLMUL_FUNCS(crc16_msb, /* bits= */ 16, /* lsb= */ 0)
diff --git a/arch/x86/lib/crc32-pclmul.S b/arch/x86/lib/crc32-pclmul.S
deleted file mode 100644
index f20f40fb0172..000000000000
--- a/arch/x86/lib/crc32-pclmul.S
+++ /dev/null
@@ -1,6 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-// Copyright 2025 Google LLC
-
-#include "crc-pclmul-template.S"
-
-DEFINE_CRC_PCLMUL_FUNCS(crc32_lsb, /* bits= */ 32, /* lsb= */ 1)
diff --git a/arch/x86/lib/crc32.c b/arch/x86/lib/crc32.c
deleted file mode 100644
index d09343e2cea9..000000000000
--- a/arch/x86/lib/crc32.c
+++ /dev/null
@@ -1,111 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * x86-optimized CRC32 functions
- *
- * Copyright (C) 2008 Intel Corporation
- * Copyright 2012 Xyratex Technology Limited
- * Copyright 2024 Google LLC
- */
-
-#include <linux/crc32.h>
-#include <linux/module.h>
-#include "crc-pclmul-template.h"
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
-
-DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);
-
-u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
-{
- CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts,
- have_pclmulqdq);
- return crc32_le_base(crc, p, len);
-}
-EXPORT_SYMBOL(crc32_le_arch);
-
-#ifdef CONFIG_X86_64
-#define CRC32_INST "crc32q %1, %q0"
-#else
-#define CRC32_INST "crc32l %1, %0"
-#endif
-
-/*
- * Use carryless multiply version of crc32c when buffer size is >= 512 to
- * account for FPU state save/restore overhead.
- */
-#define CRC32C_PCLMUL_BREAKEVEN 512
-
-asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
-
-u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
-{
- size_t num_longs;
-
- if (!static_branch_likely(&have_crc32))
- return crc32c_base(crc, p, len);
-
- if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN &&
- static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) {
- kernel_fpu_begin();
- crc = crc32c_x86_3way(crc, p, len);
- kernel_fpu_end();
- return crc;
- }
-
- for (num_longs = len / sizeof(unsigned long);
- num_longs != 0; num_longs--, p += sizeof(unsigned long))
- asm(CRC32_INST : "+r" (crc) : ASM_INPUT_RM (*(unsigned long *)p));
-
- if (sizeof(unsigned long) > 4 && (len & 4)) {
- asm("crc32l %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u32 *)p));
- p += 4;
- }
- if (len & 2) {
- asm("crc32w %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u16 *)p));
- p += 2;
- }
- if (len & 1)
- asm("crc32b %1, %0" : "+r" (crc) : ASM_INPUT_RM (*p));
-
- return crc;
-}
-EXPORT_SYMBOL(crc32c_arch);
-
-u32 crc32_be_arch(u32 crc, const u8 *p, size_t len)
-{
- return crc32_be_base(crc, p, len);
-}
-EXPORT_SYMBOL(crc32_be_arch);
-
-static int __init crc32_x86_init(void)
-{
- if (boot_cpu_has(X86_FEATURE_XMM4_2))
- static_branch_enable(&have_crc32);
- if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
- static_branch_enable(&have_pclmulqdq);
- INIT_CRC_PCLMUL(crc32_lsb);
- }
- return 0;
-}
-subsys_initcall(crc32_x86_init);
-
-static void __exit crc32_x86_exit(void)
-{
-}
-module_exit(crc32_x86_exit);
-
-u32 crc32_optimizations(void)
-{
- u32 optimizations = 0;
-
- if (static_key_enabled(&have_crc32))
- optimizations |= CRC32C_OPTIMIZATION;
- if (static_key_enabled(&have_pclmulqdq))
- optimizations |= CRC32_LE_OPTIMIZATION;
- return optimizations;
-}
-EXPORT_SYMBOL(crc32_optimizations);
-
-MODULE_DESCRIPTION("x86-optimized CRC32 functions");
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/lib/crc32c-3way.S b/arch/x86/lib/crc32c-3way.S
deleted file mode 100644
index 9b8770503bbc..000000000000
--- a/arch/x86/lib/crc32c-3way.S
+++ /dev/null
@@ -1,360 +0,0 @@
-/*
- * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64)
- *
- * The white papers on CRC32C calculations with PCLMULQDQ instruction can be
- * downloaded from:
- * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
- * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
- *
- * Copyright (C) 2012 Intel Corporation.
- * Copyright 2024 Google LLC
- *
- * Authors:
- * Wajdi Feghali <wajdi.k.feghali@intel.com>
- * James Guilford <james.guilford@intel.com>
- * David Cote <david.m.cote@intel.com>
- * Tim Chen <tim.c.chen@linux.intel.com>
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/linkage.h>
-
-## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
-
-# Define threshold below which buffers are considered "small" and routed to
-# regular CRC code that does not interleave the CRC instructions.
-#define SMALL_SIZE 200
-
-# u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
-
-.text
-SYM_FUNC_START(crc32c_x86_3way)
-#define crc0 %edi
-#define crc0_q %rdi
-#define bufp %rsi
-#define bufp_d %esi
-#define len %rdx
-#define len_dw %edx
-#define n_misaligned %ecx /* overlaps chunk_bytes! */
-#define n_misaligned_q %rcx
-#define chunk_bytes %ecx /* overlaps n_misaligned! */
-#define chunk_bytes_q %rcx
-#define crc1 %r8
-#define crc2 %r9
-
- cmp $SMALL_SIZE, len
- jb .Lsmall
-
- ################################################################
- ## 1) ALIGN:
- ################################################################
- mov bufp_d, n_misaligned
- neg n_misaligned
- and $7, n_misaligned # calculate the misalignment amount of
- # the address
- je .Laligned # Skip if aligned
-
- # Process 1 <= n_misaligned <= 7 bytes individually in order to align
- # the remaining data to an 8-byte boundary.
-.Ldo_align:
- movq (bufp), %rax
- add n_misaligned_q, bufp
- sub n_misaligned_q, len
-.Lalign_loop:
- crc32b %al, crc0 # compute crc32 of 1-byte
- shr $8, %rax # get next byte
- dec n_misaligned
- jne .Lalign_loop
-.Laligned:
-
- ################################################################
- ## 2) PROCESS BLOCK:
- ################################################################
-
- cmp $128*24, len
- jae .Lfull_block
-
-.Lpartial_block:
- # Compute floor(len / 24) to get num qwords to process from each lane.
- imul $2731, len_dw, %eax # 2731 = ceil(2^16 / 24)
- shr $16, %eax
- jmp .Lcrc_3lanes
-
-.Lfull_block:
- # Processing 128 qwords from each lane.
- mov $128, %eax
-
- ################################################################
- ## 3) CRC each of three lanes:
- ################################################################
-
-.Lcrc_3lanes:
- xor crc1,crc1
- xor crc2,crc2
- mov %eax, chunk_bytes
- shl $3, chunk_bytes # num bytes to process from each lane
- sub $5, %eax # 4 for 4x_loop, 1 for special last iter
- jl .Lcrc_3lanes_4x_done
-
- # Unroll the loop by a factor of 4 to reduce the overhead of the loop
- # bookkeeping instructions, which can compete with crc32q for the ALUs.
-.Lcrc_3lanes_4x_loop:
- crc32q (bufp), crc0_q
- crc32q (bufp,chunk_bytes_q), crc1
- crc32q (bufp,chunk_bytes_q,2), crc2
- crc32q 8(bufp), crc0_q
- crc32q 8(bufp,chunk_bytes_q), crc1
- crc32q 8(bufp,chunk_bytes_q,2), crc2
- crc32q 16(bufp), crc0_q
- crc32q 16(bufp,chunk_bytes_q), crc1
- crc32q 16(bufp,chunk_bytes_q,2), crc2
- crc32q 24(bufp), crc0_q
- crc32q 24(bufp,chunk_bytes_q), crc1
- crc32q 24(bufp,chunk_bytes_q,2), crc2
- add $32, bufp
- sub $4, %eax
- jge .Lcrc_3lanes_4x_loop
-
-.Lcrc_3lanes_4x_done:
- add $4, %eax
- jz .Lcrc_3lanes_last_qword
-
-.Lcrc_3lanes_1x_loop:
- crc32q (bufp), crc0_q
- crc32q (bufp,chunk_bytes_q), crc1
- crc32q (bufp,chunk_bytes_q,2), crc2
- add $8, bufp
- dec %eax
- jnz .Lcrc_3lanes_1x_loop
-
-.Lcrc_3lanes_last_qword:
- crc32q (bufp), crc0_q
- crc32q (bufp,chunk_bytes_q), crc1
-# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet
-
- ################################################################
- ## 4) Combine three results:
- ################################################################
-
- lea (K_table-8)(%rip), %rax # first entry is for idx 1
- pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2
- lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3
- sub %rax, len # len -= chunk_bytes * 3
-
- movq crc0_q, %xmm1 # CRC for block 1
- pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2
-
- movq crc1, %xmm2 # CRC for block 2
- pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1
-
- pxor %xmm2,%xmm1
- movq %xmm1, %rax
- xor (bufp,chunk_bytes_q,2), %rax
- mov crc2, crc0_q
- crc32 %rax, crc0_q
- lea 8(bufp,chunk_bytes_q,2), bufp
-
- ################################################################
- ## 5) If more blocks remain, goto (2):
- ################################################################
-
- cmp $128*24, len
- jae .Lfull_block
- cmp $SMALL_SIZE, len
- jae .Lpartial_block
-
- #######################################################################
- ## 6) Process any remainder without interleaving:
- #######################################################################
-.Lsmall:
- test len_dw, len_dw
- jz .Ldone
- mov len_dw, %eax
- shr $3, %eax
- jz .Ldo_dword
-.Ldo_qwords:
- crc32q (bufp), crc0_q
- add $8, bufp
- dec %eax
- jnz .Ldo_qwords
-.Ldo_dword:
- test $4, len_dw
- jz .Ldo_word
- crc32l (bufp), crc0
- add $4, bufp
-.Ldo_word:
- test $2, len_dw
- jz .Ldo_byte
- crc32w (bufp), crc0
- add $2, bufp
-.Ldo_byte:
- test $1, len_dw
- jz .Ldone
- crc32b (bufp), crc0
-.Ldone:
- mov crc0, %eax
- RET
-SYM_FUNC_END(crc32c_x86_3way)
-
-.section .rodata, "a", @progbits
- ################################################################
- ## PCLMULQDQ tables
- ## Table is 128 entries x 2 words (8 bytes) each
- ################################################################
-.align 8
-K_table:
- .long 0x493c7d27, 0x00000001
- .long 0xba4fc28e, 0x493c7d27
- .long 0xddc0152b, 0xf20c0dfe
- .long 0x9e4addf8, 0xba4fc28e
- .long 0x39d3b296, 0x3da6d0cb
- .long 0x0715ce53, 0xddc0152b
- .long 0x47db8317, 0x1c291d04
- .long 0x0d3b6092, 0x9e4addf8
- .long 0xc96cfdc0, 0x740eef02
- .long 0x878a92a7, 0x39d3b296
- .long 0xdaece73e, 0x083a6eec
- .long 0xab7aff2a, 0x0715ce53
- .long 0x2162d385, 0xc49f4f67
- .long 0x83348832, 0x47db8317
- .long 0x299847d5, 0x2ad91c30
- .long 0xb9e02b86, 0x0d3b6092
- .long 0x18b33a4e, 0x6992cea2
- .long 0xb6dd949b, 0xc96cfdc0
- .long 0x78d9ccb7, 0x7e908048
- .long 0xbac2fd7b, 0x878a92a7
- .long 0xa60ce07b, 0x1b3d8f29
- .long 0xce7f39f4, 0xdaece73e
- .long 0x61d82e56, 0xf1d0f55e
- .long 0xd270f1a2, 0xab7aff2a
- .long 0xc619809d, 0xa87ab8a8
- .long 0x2b3cac5d, 0x2162d385
- .long 0x65863b64, 0x8462d800
- .long 0x1b03397f, 0x83348832
- .long 0xebb883bd, 0x71d111a8
- .long 0xb3e32c28, 0x299847d5
- .long 0x064f7f26, 0xffd852c6
- .long 0xdd7e3b0c, 0xb9e02b86
- .long 0xf285651c, 0xdcb17aa4
- .long 0x10746f3c, 0x18b33a4e
- .long 0xc7a68855, 0xf37c5aee
- .long 0x271d9844, 0xb6dd949b
- .long 0x8e766a0c, 0x6051d5a2
- .long 0x93a5f730, 0x78d9ccb7
- .long 0x6cb08e5c, 0x18b0d4ff
- .long 0x6b749fb2, 0xbac2fd7b
- .long 0x1393e203, 0x21f3d99c
- .long 0xcec3662e, 0xa60ce07b
- .long 0x96c515bb, 0x8f158014
- .long 0xe6fc4e6a, 0xce7f39f4
- .long 0x8227bb8a, 0xa00457f7
- .long 0xb0cd4768, 0x61d82e56
- .long 0x39c7ff35, 0x8d6d2c43
- .long 0xd7a4825c, 0xd270f1a2
- .long 0x0ab3844b, 0x00ac29cf
- .long 0x0167d312, 0xc619809d
- .long 0xf6076544, 0xe9adf796
- .long 0x26f6a60a, 0x2b3cac5d
- .long 0xa741c1bf, 0x96638b34
- .long 0x98d8d9cb, 0x65863b64
- .long 0x49c3cc9c, 0xe0e9f351
- .long 0x68bce87a, 0x1b03397f
- .long 0x57a3d037, 0x9af01f2d
- .long 0x6956fc3b, 0xebb883bd
- .long 0x42d98888, 0x2cff42cf
- .long 0x3771e98f, 0xb3e32c28
- .long 0xb42ae3d9, 0x88f25a3a
- .long 0x2178513a, 0x064f7f26
- .long 0xe0ac139e, 0x4e36f0b0
- .long 0x170076fa, 0xdd7e3b0c
- .long 0x444dd413, 0xbd6f81f8
- .long 0x6f345e45, 0xf285651c
- .long 0x41d17b64, 0x91c9bd4b
- .long 0xff0dba97, 0x10746f3c
- .long 0xa2b73df1, 0x885f087b
- .long 0xf872e54c, 0xc7a68855
- .long 0x1e41e9fc, 0x4c144932
- .long 0x86d8e4d2, 0x271d9844
- .long 0x651bd98b, 0x52148f02
- .long 0x5bb8f1bc, 0x8e766a0c
- .long 0xa90fd27a, 0xa3c6f37a
- .long 0xb3af077a, 0x93a5f730
- .long 0x4984d782, 0xd7c0557f
- .long 0xca6ef3ac, 0x6cb08e5c
- .long 0x234e0b26, 0x63ded06a
- .long 0xdd66cbbb, 0x6b749fb2
- .long 0x4597456a, 0x4d56973c
- .long 0xe9e28eb4, 0x1393e203
- .long 0x7b3ff57a, 0x9669c9df
- .long 0xc9c8b782, 0xcec3662e
- .long 0x3f70cc6f, 0xe417f38a
- .long 0x93e106a4, 0x96c515bb
- .long 0x62ec6c6d, 0x4b9e0f71
- .long 0xd813b325, 0xe6fc4e6a
- .long 0x0df04680, 0xd104b8fc
- .long 0x2342001e, 0x8227bb8a
- .long 0x0a2a8d7e, 0x5b397730
- .long 0x6d9a4957, 0xb0cd4768
- .long 0xe8b6368b, 0xe78eb416
- .long 0xd2c3ed1a, 0x39c7ff35
- .long 0x995a5724, 0x61ff0e01
- .long 0x9ef68d35, 0xd7a4825c
- .long 0x0c139b31, 0x8d96551c
- .long 0xf2271e60, 0x0ab3844b
- .long 0x0b0bf8ca, 0x0bf80dd2
- .long 0x2664fd8b, 0x0167d312
- .long 0xed64812d, 0x8821abed
- .long 0x02ee03b2, 0xf6076544
- .long 0x8604ae0f, 0x6a45d2b2
- .long 0x363bd6b3, 0x26f6a60a
- .long 0x135c83fd, 0xd8d26619
- .long 0x5fabe670, 0xa741c1bf
- .long 0x35ec3279, 0xde87806c
- .long 0x00bcf5f6, 0x98d8d9cb
- .long 0x8ae00689, 0x14338754
- .long 0x17f27698, 0x49c3cc9c
- .long 0x58ca5f00, 0x5bd2011f
- .long 0xaa7c7ad5, 0x68bce87a
- .long 0xb5cfca28, 0xdd07448e
- .long 0xded288f8, 0x57a3d037
- .long 0x59f229bc, 0xdde8f5b9
- .long 0x6d390dec, 0x6956fc3b
- .long 0x37170390, 0xa3e3e02c
- .long 0x6353c1cc, 0x42d98888
- .long 0xc4584f5c, 0xd73c7bea
- .long 0xf48642e9, 0x3771e98f
- .long 0x531377e2, 0x80ff0093
- .long 0xdd35bc8d, 0xb42ae3d9
- .long 0xb25b29f2, 0x8fe4c34d
- .long 0x9a5ede41, 0x2178513a
- .long 0xa563905d, 0xdf99fc11
- .long 0x45cddf4e, 0xe0ac139e
- .long 0xacfa3103, 0x6c23e841
- .long 0xa51b6135, 0x170076fa
diff --git a/arch/x86/lib/crc64-pclmul.S b/arch/x86/lib/crc64-pclmul.S
deleted file mode 100644
index 4173051b5197..000000000000
--- a/arch/x86/lib/crc64-pclmul.S
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-// Copyright 2025 Google LLC
-
-#include "crc-pclmul-template.S"
-
-DEFINE_CRC_PCLMUL_FUNCS(crc64_msb, /* bits= */ 64, /* lsb= */ 0)
-DEFINE_CRC_PCLMUL_FUNCS(crc64_lsb, /* bits= */ 64, /* lsb= */ 1)
diff --git a/arch/x86/lib/crc64.c b/arch/x86/lib/crc64.c
deleted file mode 100644
index 351a09f5813e..000000000000
--- a/arch/x86/lib/crc64.c
+++ /dev/null
@@ -1,50 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * CRC64 using [V]PCLMULQDQ instructions
- *
- * Copyright 2025 Google LLC
- */
-
-#include <linux/crc64.h>
-#include <linux/module.h>
-#include "crc-pclmul-template.h"
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
-
-DECLARE_CRC_PCLMUL_FUNCS(crc64_msb, u64);
-DECLARE_CRC_PCLMUL_FUNCS(crc64_lsb, u64);
-
-u64 crc64_be_arch(u64 crc, const u8 *p, size_t len)
-{
- CRC_PCLMUL(crc, p, len, crc64_msb, crc64_msb_0x42f0e1eba9ea3693_consts,
- have_pclmulqdq);
- return crc64_be_generic(crc, p, len);
-}
-EXPORT_SYMBOL_GPL(crc64_be_arch);
-
-u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len)
-{
- CRC_PCLMUL(crc, p, len, crc64_lsb, crc64_lsb_0x9a6c9329ac4bc9b5_consts,
- have_pclmulqdq);
- return crc64_nvme_generic(crc, p, len);
-}
-EXPORT_SYMBOL_GPL(crc64_nvme_arch);
-
-static int __init crc64_x86_init(void)
-{
- if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
- static_branch_enable(&have_pclmulqdq);
- INIT_CRC_PCLMUL(crc64_msb);
- INIT_CRC_PCLMUL(crc64_lsb);
- }
- return 0;
-}
-subsys_initcall(crc64_x86_init);
-
-static void __exit crc64_x86_exit(void)
-{
-}
-module_exit(crc64_x86_exit);
-
-MODULE_DESCRIPTION("CRC64 using [V]PCLMULQDQ instructions");
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/lib/crypto/.gitignore b/arch/x86/lib/crypto/.gitignore
deleted file mode 100644
index 580c839bb177..000000000000
--- a/arch/x86/lib/crypto/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-poly1305-x86_64-cryptogams.S
diff --git a/arch/x86/lib/crypto/Kconfig b/arch/x86/lib/crypto/Kconfig
deleted file mode 100644
index 5e94cdee492c..000000000000
--- a/arch/x86/lib/crypto/Kconfig
+++ /dev/null
@@ -1,34 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-config CRYPTO_BLAKE2S_X86
- bool "Hash functions: BLAKE2s (SSSE3/AVX-512)"
- depends on 64BIT
- select CRYPTO_LIB_BLAKE2S_GENERIC
- select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
- help
- BLAKE2s cryptographic hash function (RFC 7693)
-
- Architecture: x86_64 using:
- - SSSE3 (Supplemental SSE3)
- - AVX-512 (Advanced Vector Extensions-512)
-
-config CRYPTO_CHACHA20_X86_64
- tristate
- depends on 64BIT
- default CRYPTO_LIB_CHACHA
- select CRYPTO_LIB_CHACHA_GENERIC
- select CRYPTO_ARCH_HAVE_LIB_CHACHA
-
-config CRYPTO_POLY1305_X86_64
- tristate
- depends on 64BIT
- default CRYPTO_LIB_POLY1305
- select CRYPTO_ARCH_HAVE_LIB_POLY1305
-
-config CRYPTO_SHA256_X86_64
- tristate
- depends on 64BIT
- default CRYPTO_LIB_SHA256
- select CRYPTO_ARCH_HAVE_LIB_SHA256
- select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD
- select CRYPTO_LIB_SHA256_GENERIC
diff --git a/arch/x86/lib/crypto/Makefile b/arch/x86/lib/crypto/Makefile
deleted file mode 100644
index abceca3d31c0..000000000000
--- a/arch/x86/lib/crypto/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += libblake2s-x86_64.o
-libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o
-
-obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o
-chacha-x86_64-y := chacha-avx2-x86_64.o chacha-ssse3-x86_64.o chacha-avx512vl-x86_64.o chacha_glue.o
-
-obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o
-poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o
-targets += poly1305-x86_64-cryptogams.S
-
-obj-$(CONFIG_CRYPTO_SHA256_X86_64) += sha256-x86_64.o
-sha256-x86_64-y := sha256.o sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256-ni-asm.o
-
-quiet_cmd_perlasm = PERLASM $@
- cmd_perlasm = $(PERL) $< > $@
-
-$(obj)/%.S: $(src)/%.pl FORCE
- $(call if_changed,perlasm)
diff --git a/arch/x86/lib/crypto/blake2s-core.S b/arch/x86/lib/crypto/blake2s-core.S
deleted file mode 100644
index ac1c845445a4..000000000000
--- a/arch/x86/lib/crypto/blake2s-core.S
+++ /dev/null
@@ -1,252 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR MIT */
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
- */
-
-#include <linux/linkage.h>
-
-.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
-.align 32
-IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667
- .octa 0x5BE0CD191F83D9AB9B05688C510E527F
-.section .rodata.cst16.ROT16, "aM", @progbits, 16
-.align 16
-ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302
-.section .rodata.cst16.ROR328, "aM", @progbits, 16
-.align 16
-ROR328: .octa 0x0C0F0E0D080B0A090407060500030201
-.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
-.align 64
-SIGMA:
-.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
-.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7
-.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1
-.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0
-.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8
-.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14
-.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2
-.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6
-.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4
-.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12
-.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
-.align 64
-SIGMA2:
-.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
-.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
-.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
-.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5
-.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12
-.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9
-.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0
-.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10
-.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
-.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
-
-.text
-SYM_FUNC_START(blake2s_compress_ssse3)
- testq %rdx,%rdx
- je .Lendofloop
- movdqu (%rdi),%xmm0
- movdqu 0x10(%rdi),%xmm1
- movdqa ROT16(%rip),%xmm12
- movdqa ROR328(%rip),%xmm13
- movdqu 0x20(%rdi),%xmm14
- movq %rcx,%xmm15
- leaq SIGMA+0xa0(%rip),%r8
- jmp .Lbeginofloop
- .align 32
-.Lbeginofloop:
- movdqa %xmm0,%xmm10
- movdqa %xmm1,%xmm11
- paddq %xmm15,%xmm14
- movdqa IV(%rip),%xmm2
- movdqa %xmm14,%xmm3
- pxor IV+0x10(%rip),%xmm3
- leaq SIGMA(%rip),%rcx
-.Lroundloop:
- movzbl (%rcx),%eax
- movd (%rsi,%rax,4),%xmm4
- movzbl 0x1(%rcx),%eax
- movd (%rsi,%rax,4),%xmm5
- movzbl 0x2(%rcx),%eax
- movd (%rsi,%rax,4),%xmm6
- movzbl 0x3(%rcx),%eax
- movd (%rsi,%rax,4),%xmm7
- punpckldq %xmm5,%xmm4
- punpckldq %xmm7,%xmm6
- punpcklqdq %xmm6,%xmm4
- paddd %xmm4,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm12,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0xc,%xmm1
- pslld $0x14,%xmm8
- por %xmm8,%xmm1
- movzbl 0x4(%rcx),%eax
- movd (%rsi,%rax,4),%xmm5
- movzbl 0x5(%rcx),%eax
- movd (%rsi,%rax,4),%xmm6
- movzbl 0x6(%rcx),%eax
- movd (%rsi,%rax,4),%xmm7
- movzbl 0x7(%rcx),%eax
- movd (%rsi,%rax,4),%xmm4
- punpckldq %xmm6,%xmm5
- punpckldq %xmm4,%xmm7
- punpcklqdq %xmm7,%xmm5
- paddd %xmm5,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm13,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0x7,%xmm1
- pslld $0x19,%xmm8
- por %xmm8,%xmm1
- pshufd $0x93,%xmm0,%xmm0
- pshufd $0x4e,%xmm3,%xmm3
- pshufd $0x39,%xmm2,%xmm2
- movzbl 0x8(%rcx),%eax
- movd (%rsi,%rax,4),%xmm6
- movzbl 0x9(%rcx),%eax
- movd (%rsi,%rax,4),%xmm7
- movzbl 0xa(%rcx),%eax
- movd (%rsi,%rax,4),%xmm4
- movzbl 0xb(%rcx),%eax
- movd (%rsi,%rax,4),%xmm5
- punpckldq %xmm7,%xmm6
- punpckldq %xmm5,%xmm4
- punpcklqdq %xmm4,%xmm6
- paddd %xmm6,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm12,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0xc,%xmm1
- pslld $0x14,%xmm8
- por %xmm8,%xmm1
- movzbl 0xc(%rcx),%eax
- movd (%rsi,%rax,4),%xmm7
- movzbl 0xd(%rcx),%eax
- movd (%rsi,%rax,4),%xmm4
- movzbl 0xe(%rcx),%eax
- movd (%rsi,%rax,4),%xmm5
- movzbl 0xf(%rcx),%eax
- movd (%rsi,%rax,4),%xmm6
- punpckldq %xmm4,%xmm7
- punpckldq %xmm6,%xmm5
- punpcklqdq %xmm5,%xmm7
- paddd %xmm7,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm13,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0x7,%xmm1
- pslld $0x19,%xmm8
- por %xmm8,%xmm1
- pshufd $0x39,%xmm0,%xmm0
- pshufd $0x4e,%xmm3,%xmm3
- pshufd $0x93,%xmm2,%xmm2
- addq $0x10,%rcx
- cmpq %r8,%rcx
- jnz .Lroundloop
- pxor %xmm2,%xmm0
- pxor %xmm3,%xmm1
- pxor %xmm10,%xmm0
- pxor %xmm11,%xmm1
- addq $0x40,%rsi
- decq %rdx
- jnz .Lbeginofloop
- movdqu %xmm0,(%rdi)
- movdqu %xmm1,0x10(%rdi)
- movdqu %xmm14,0x20(%rdi)
-.Lendofloop:
- RET
-SYM_FUNC_END(blake2s_compress_ssse3)
-
-SYM_FUNC_START(blake2s_compress_avx512)
- vmovdqu (%rdi),%xmm0
- vmovdqu 0x10(%rdi),%xmm1
- vmovdqu 0x20(%rdi),%xmm4
- vmovq %rcx,%xmm5
- vmovdqa IV(%rip),%xmm14
- vmovdqa IV+16(%rip),%xmm15
- jmp .Lblake2s_compress_avx512_mainloop
-.align 32
-.Lblake2s_compress_avx512_mainloop:
- vmovdqa %xmm0,%xmm10
- vmovdqa %xmm1,%xmm11
- vpaddq %xmm5,%xmm4,%xmm4
- vmovdqa %xmm14,%xmm2
- vpxor %xmm15,%xmm4,%xmm3
- vmovdqu (%rsi),%ymm6
- vmovdqu 0x20(%rsi),%ymm7
- addq $0x40,%rsi
- leaq SIGMA2(%rip),%rax
- movb $0xa,%cl
-.Lblake2s_compress_avx512_roundloop:
- addq $0x40,%rax
- vmovdqa -0x40(%rax),%ymm8
- vmovdqa -0x20(%rax),%ymm9
- vpermi2d %ymm7,%ymm6,%ymm8
- vpermi2d %ymm7,%ymm6,%ymm9
- vmovdqa %ymm8,%ymm6
- vmovdqa %ymm9,%ymm7
- vpaddd %xmm8,%xmm0,%xmm0
- vpaddd %xmm1,%xmm0,%xmm0
- vpxor %xmm0,%xmm3,%xmm3
- vprord $0x10,%xmm3,%xmm3
- vpaddd %xmm3,%xmm2,%xmm2
- vpxor %xmm2,%xmm1,%xmm1
- vprord $0xc,%xmm1,%xmm1
- vextracti128 $0x1,%ymm8,%xmm8
- vpaddd %xmm8,%xmm0,%xmm0
- vpaddd %xmm1,%xmm0,%xmm0
- vpxor %xmm0,%xmm3,%xmm3
- vprord $0x8,%xmm3,%xmm3
- vpaddd %xmm3,%xmm2,%xmm2
- vpxor %xmm2,%xmm1,%xmm1
- vprord $0x7,%xmm1,%xmm1
- vpshufd $0x93,%xmm0,%xmm0
- vpshufd $0x4e,%xmm3,%xmm3
- vpshufd $0x39,%xmm2,%xmm2
- vpaddd %xmm9,%xmm0,%xmm0
- vpaddd %xmm1,%xmm0,%xmm0
- vpxor %xmm0,%xmm3,%xmm3
- vprord $0x10,%xmm3,%xmm3
- vpaddd %xmm3,%xmm2,%xmm2
- vpxor %xmm2,%xmm1,%xmm1
- vprord $0xc,%xmm1,%xmm1
- vextracti128 $0x1,%ymm9,%xmm9
- vpaddd %xmm9,%xmm0,%xmm0
- vpaddd %xmm1,%xmm0,%xmm0
- vpxor %xmm0,%xmm3,%xmm3
- vprord $0x8,%xmm3,%xmm3
- vpaddd %xmm3,%xmm2,%xmm2
- vpxor %xmm2,%xmm1,%xmm1
- vprord $0x7,%xmm1,%xmm1
- vpshufd $0x39,%xmm0,%xmm0
- vpshufd $0x4e,%xmm3,%xmm3
- vpshufd $0x93,%xmm2,%xmm2
- decb %cl
- jne .Lblake2s_compress_avx512_roundloop
- vpxor %xmm10,%xmm0,%xmm0
- vpxor %xmm11,%xmm1,%xmm1
- vpxor %xmm2,%xmm0,%xmm0
- vpxor %xmm3,%xmm1,%xmm1
- decq %rdx
- jne .Lblake2s_compress_avx512_mainloop
- vmovdqu %xmm0,(%rdi)
- vmovdqu %xmm1,0x10(%rdi)
- vmovdqu %xmm4,0x20(%rdi)
- vzeroupper
- RET
-SYM_FUNC_END(blake2s_compress_avx512)
diff --git a/arch/x86/lib/crypto/blake2s-glue.c b/arch/x86/lib/crypto/blake2s-glue.c
deleted file mode 100644
index adc296cd17c9..000000000000
--- a/arch/x86/lib/crypto/blake2s-glue.c
+++ /dev/null
@@ -1,70 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#include <asm/cpufeature.h>
-#include <asm/fpu/api.h>
-#include <asm/processor.h>
-#include <asm/simd.h>
-#include <crypto/internal/blake2s.h>
-#include <linux/init.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/sizes.h>
-
-asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
- const u8 *block, const size_t nblocks,
- const u32 inc);
-asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
- const u8 *block, const size_t nblocks,
- const u32 inc);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
-
-void blake2s_compress(struct blake2s_state *state, const u8 *block,
- size_t nblocks, const u32 inc)
-{
- /* SIMD disables preemption, so relax after processing each page. */
- BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
-
- if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) {
- blake2s_compress_generic(state, block, nblocks, inc);
- return;
- }
-
- do {
- const size_t blocks = min_t(size_t, nblocks,
- SZ_4K / BLAKE2S_BLOCK_SIZE);
-
- kernel_fpu_begin();
- if (static_branch_likely(&blake2s_use_avx512))
- blake2s_compress_avx512(state, block, blocks, inc);
- else
- blake2s_compress_ssse3(state, block, blocks, inc);
- kernel_fpu_end();
-
- nblocks -= blocks;
- block += blocks * BLAKE2S_BLOCK_SIZE;
- } while (nblocks);
-}
-EXPORT_SYMBOL(blake2s_compress);
-
-static int __init blake2s_mod_init(void)
-{
- if (boot_cpu_has(X86_FEATURE_SSSE3))
- static_branch_enable(&blake2s_use_ssse3);
-
- if (boot_cpu_has(X86_FEATURE_AVX) &&
- boot_cpu_has(X86_FEATURE_AVX2) &&
- boot_cpu_has(X86_FEATURE_AVX512F) &&
- boot_cpu_has(X86_FEATURE_AVX512VL) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
- XFEATURE_MASK_AVX512, NULL))
- static_branch_enable(&blake2s_use_avx512);
-
- return 0;
-}
-
-subsys_initcall(blake2s_mod_init);
diff --git a/arch/x86/lib/crypto/chacha-avx2-x86_64.S b/arch/x86/lib/crypto/chacha-avx2-x86_64.S
deleted file mode 100644
index f3d8fc018249..000000000000
--- a/arch/x86/lib/crypto/chacha-avx2-x86_64.S
+++ /dev/null
@@ -1,1021 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
- *
- * Copyright (C) 2015 Martin Willi
- */
-
-#include <linux/linkage.h>
-
-.section .rodata.cst32.ROT8, "aM", @progbits, 32
-.align 32
-ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
- .octa 0x0e0d0c0f0a09080b0605040702010003
-
-.section .rodata.cst32.ROT16, "aM", @progbits, 32
-.align 32
-ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
- .octa 0x0d0c0f0e09080b0a0504070601000302
-
-.section .rodata.cst32.CTRINC, "aM", @progbits, 32
-.align 32
-CTRINC: .octa 0x00000003000000020000000100000000
- .octa 0x00000007000000060000000500000004
-
-.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
-.align 32
-CTR2BL: .octa 0x00000000000000000000000000000000
- .octa 0x00000000000000000000000000000001
-
-.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
-.align 32
-CTR4BL: .octa 0x00000000000000000000000000000002
- .octa 0x00000000000000000000000000000003
-
-.text
-
-SYM_FUNC_START(chacha_2block_xor_avx2)
- # %rdi: Input state matrix, s
- # %rsi: up to 2 data blocks output, o
- # %rdx: up to 2 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
-
- # This function encrypts two ChaCha blocks by loading the state
- # matrix twice across four AVX registers. It performs matrix operations
- # on four words in each matrix in parallel, but requires shuffling to
- # rearrange the words after each round.
-
- vzeroupper
-
- # x0..3[0-2] = s0..3
- vbroadcasti128 0x00(%rdi),%ymm0
- vbroadcasti128 0x10(%rdi),%ymm1
- vbroadcasti128 0x20(%rdi),%ymm2
- vbroadcasti128 0x30(%rdi),%ymm3
-
- vpaddd CTR2BL(%rip),%ymm3,%ymm3
-
- vmovdqa %ymm0,%ymm8
- vmovdqa %ymm1,%ymm9
- vmovdqa %ymm2,%ymm10
- vmovdqa %ymm3,%ymm11
-
- vmovdqa ROT8(%rip),%ymm4
- vmovdqa ROT16(%rip),%ymm5
-
- mov %rcx,%rax
-
-.Ldoubleround:
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vpshufb %ymm5,%ymm3,%ymm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vmovdqa %ymm1,%ymm6
- vpslld $12,%ymm6,%ymm6
- vpsrld $20,%ymm1,%ymm1
- vpor %ymm6,%ymm1,%ymm1
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vpshufb %ymm4,%ymm3,%ymm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vmovdqa %ymm1,%ymm7
- vpslld $7,%ymm7,%ymm7
- vpsrld $25,%ymm1,%ymm1
- vpor %ymm7,%ymm1,%ymm1
-
- # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
- vpshufd $0x39,%ymm1,%ymm1
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vpshufd $0x4e,%ymm2,%ymm2
- # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
- vpshufd $0x93,%ymm3,%ymm3
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vpshufb %ymm5,%ymm3,%ymm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vmovdqa %ymm1,%ymm6
- vpslld $12,%ymm6,%ymm6
- vpsrld $20,%ymm1,%ymm1
- vpor %ymm6,%ymm1,%ymm1
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vpshufb %ymm4,%ymm3,%ymm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vmovdqa %ymm1,%ymm7
- vpslld $7,%ymm7,%ymm7
- vpsrld $25,%ymm1,%ymm1
- vpor %ymm7,%ymm1,%ymm1
-
- # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
- vpshufd $0x93,%ymm1,%ymm1
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vpshufd $0x4e,%ymm2,%ymm2
- # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
- vpshufd $0x39,%ymm3,%ymm3
-
- sub $2,%r8d
- jnz .Ldoubleround
-
- # o0 = i0 ^ (x0 + s0)
- vpaddd %ymm8,%ymm0,%ymm7
- cmp $0x10,%rax
- jl .Lxorpart2
- vpxor 0x00(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x00(%rsi)
- vextracti128 $1,%ymm7,%xmm0
- # o1 = i1 ^ (x1 + s1)
- vpaddd %ymm9,%ymm1,%ymm7
- cmp $0x20,%rax
- jl .Lxorpart2
- vpxor 0x10(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x10(%rsi)
- vextracti128 $1,%ymm7,%xmm1
- # o2 = i2 ^ (x2 + s2)
- vpaddd %ymm10,%ymm2,%ymm7
- cmp $0x30,%rax
- jl .Lxorpart2
- vpxor 0x20(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x20(%rsi)
- vextracti128 $1,%ymm7,%xmm2
- # o3 = i3 ^ (x3 + s3)
- vpaddd %ymm11,%ymm3,%ymm7
- cmp $0x40,%rax
- jl .Lxorpart2
- vpxor 0x30(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x30(%rsi)
- vextracti128 $1,%ymm7,%xmm3
-
- # xor and write second block
- vmovdqa %xmm0,%xmm7
- cmp $0x50,%rax
- jl .Lxorpart2
- vpxor 0x40(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x40(%rsi)
-
- vmovdqa %xmm1,%xmm7
- cmp $0x60,%rax
- jl .Lxorpart2
- vpxor 0x50(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x50(%rsi)
-
- vmovdqa %xmm2,%xmm7
- cmp $0x70,%rax
- jl .Lxorpart2
- vpxor 0x60(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x60(%rsi)
-
- vmovdqa %xmm3,%xmm7
- cmp $0x80,%rax
- jl .Lxorpart2
- vpxor 0x70(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x70(%rsi)
-
-.Ldone2:
- vzeroupper
- RET
-
-.Lxorpart2:
- # xor remaining bytes from partial register into output
- mov %rax,%r9
- and $0x0f,%r9
- jz .Ldone2
- and $~0x0f,%rax
-
- mov %rsi,%r11
-
- lea 8(%rsp),%r10
- sub $0x10,%rsp
- and $~31,%rsp
-
- lea (%rdx,%rax),%rsi
- mov %rsp,%rdi
- mov %r9,%rcx
- rep movsb
-
- vpxor 0x00(%rsp),%xmm7,%xmm7
- vmovdqa %xmm7,0x00(%rsp)
-
- mov %rsp,%rsi
- lea (%r11,%rax),%rdi
- mov %r9,%rcx
- rep movsb
-
- lea -8(%r10),%rsp
- jmp .Ldone2
-
-SYM_FUNC_END(chacha_2block_xor_avx2)
-
-SYM_FUNC_START(chacha_4block_xor_avx2)
- # %rdi: Input state matrix, s
- # %rsi: up to 4 data blocks output, o
- # %rdx: up to 4 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
-
- # This function encrypts four ChaCha blocks by loading the state
- # matrix four times across eight AVX registers. It performs matrix
- # operations on four words in two matrices in parallel, sequentially
- # to the operations on the four words of the other two matrices. The
- # required word shuffling has a rather high latency, we can do the
- # arithmetic on two matrix-pairs without much slowdown.
-
- vzeroupper
-
- # x0..3[0-4] = s0..3
- vbroadcasti128 0x00(%rdi),%ymm0
- vbroadcasti128 0x10(%rdi),%ymm1
- vbroadcasti128 0x20(%rdi),%ymm2
- vbroadcasti128 0x30(%rdi),%ymm3
-
- vmovdqa %ymm0,%ymm4
- vmovdqa %ymm1,%ymm5
- vmovdqa %ymm2,%ymm6
- vmovdqa %ymm3,%ymm7
-
- vpaddd CTR2BL(%rip),%ymm3,%ymm3
- vpaddd CTR4BL(%rip),%ymm7,%ymm7
-
- vmovdqa %ymm0,%ymm11
- vmovdqa %ymm1,%ymm12
- vmovdqa %ymm2,%ymm13
- vmovdqa %ymm3,%ymm14
- vmovdqa %ymm7,%ymm15
-
- vmovdqa ROT8(%rip),%ymm8
- vmovdqa ROT16(%rip),%ymm9
-
- mov %rcx,%rax
-
-.Ldoubleround4:
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vpshufb %ymm9,%ymm3,%ymm3
-
- vpaddd %ymm5,%ymm4,%ymm4
- vpxor %ymm4,%ymm7,%ymm7
- vpshufb %ymm9,%ymm7,%ymm7
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vmovdqa %ymm1,%ymm10
- vpslld $12,%ymm10,%ymm10
- vpsrld $20,%ymm1,%ymm1
- vpor %ymm10,%ymm1,%ymm1
-
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm6,%ymm5,%ymm5
- vmovdqa %ymm5,%ymm10
- vpslld $12,%ymm10,%ymm10
- vpsrld $20,%ymm5,%ymm5
- vpor %ymm10,%ymm5,%ymm5
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vpshufb %ymm8,%ymm3,%ymm3
-
- vpaddd %ymm5,%ymm4,%ymm4
- vpxor %ymm4,%ymm7,%ymm7
- vpshufb %ymm8,%ymm7,%ymm7
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vmovdqa %ymm1,%ymm10
- vpslld $7,%ymm10,%ymm10
- vpsrld $25,%ymm1,%ymm1
- vpor %ymm10,%ymm1,%ymm1
-
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm6,%ymm5,%ymm5
- vmovdqa %ymm5,%ymm10
- vpslld $7,%ymm10,%ymm10
- vpsrld $25,%ymm5,%ymm5
- vpor %ymm10,%ymm5,%ymm5
-
- # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
- vpshufd $0x39,%ymm1,%ymm1
- vpshufd $0x39,%ymm5,%ymm5
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vpshufd $0x4e,%ymm2,%ymm2
- vpshufd $0x4e,%ymm6,%ymm6
- # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
- vpshufd $0x93,%ymm3,%ymm3
- vpshufd $0x93,%ymm7,%ymm7
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vpshufb %ymm9,%ymm3,%ymm3
-
- vpaddd %ymm5,%ymm4,%ymm4
- vpxor %ymm4,%ymm7,%ymm7
- vpshufb %ymm9,%ymm7,%ymm7
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vmovdqa %ymm1,%ymm10
- vpslld $12,%ymm10,%ymm10
- vpsrld $20,%ymm1,%ymm1
- vpor %ymm10,%ymm1,%ymm1
-
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm6,%ymm5,%ymm5
- vmovdqa %ymm5,%ymm10
- vpslld $12,%ymm10,%ymm10
- vpsrld $20,%ymm5,%ymm5
- vpor %ymm10,%ymm5,%ymm5
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vpshufb %ymm8,%ymm3,%ymm3
-
- vpaddd %ymm5,%ymm4,%ymm4
- vpxor %ymm4,%ymm7,%ymm7
- vpshufb %ymm8,%ymm7,%ymm7
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vmovdqa %ymm1,%ymm10
- vpslld $7,%ymm10,%ymm10
- vpsrld $25,%ymm1,%ymm1
- vpor %ymm10,%ymm1,%ymm1
-
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm6,%ymm5,%ymm5
- vmovdqa %ymm5,%ymm10
- vpslld $7,%ymm10,%ymm10
- vpsrld $25,%ymm5,%ymm5
- vpor %ymm10,%ymm5,%ymm5
-
- # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
- vpshufd $0x93,%ymm1,%ymm1
- vpshufd $0x93,%ymm5,%ymm5
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vpshufd $0x4e,%ymm2,%ymm2
- vpshufd $0x4e,%ymm6,%ymm6
- # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
- vpshufd $0x39,%ymm3,%ymm3
- vpshufd $0x39,%ymm7,%ymm7
-
- sub $2,%r8d
- jnz .Ldoubleround4
-
- # o0 = i0 ^ (x0 + s0), first block
- vpaddd %ymm11,%ymm0,%ymm10
- cmp $0x10,%rax
- jl .Lxorpart4
- vpxor 0x00(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x00(%rsi)
- vextracti128 $1,%ymm10,%xmm0
- # o1 = i1 ^ (x1 + s1), first block
- vpaddd %ymm12,%ymm1,%ymm10
- cmp $0x20,%rax
- jl .Lxorpart4
- vpxor 0x10(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x10(%rsi)
- vextracti128 $1,%ymm10,%xmm1
- # o2 = i2 ^ (x2 + s2), first block
- vpaddd %ymm13,%ymm2,%ymm10
- cmp $0x30,%rax
- jl .Lxorpart4
- vpxor 0x20(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x20(%rsi)
- vextracti128 $1,%ymm10,%xmm2
- # o3 = i3 ^ (x3 + s3), first block
- vpaddd %ymm14,%ymm3,%ymm10
- cmp $0x40,%rax
- jl .Lxorpart4
- vpxor 0x30(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x30(%rsi)
- vextracti128 $1,%ymm10,%xmm3
-
- # xor and write second block
- vmovdqa %xmm0,%xmm10
- cmp $0x50,%rax
- jl .Lxorpart4
- vpxor 0x40(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x40(%rsi)
-
- vmovdqa %xmm1,%xmm10
- cmp $0x60,%rax
- jl .Lxorpart4
- vpxor 0x50(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x50(%rsi)
-
- vmovdqa %xmm2,%xmm10
- cmp $0x70,%rax
- jl .Lxorpart4
- vpxor 0x60(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x60(%rsi)
-
- vmovdqa %xmm3,%xmm10
- cmp $0x80,%rax
- jl .Lxorpart4
- vpxor 0x70(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x70(%rsi)
-
- # o0 = i0 ^ (x0 + s0), third block
- vpaddd %ymm11,%ymm4,%ymm10
- cmp $0x90,%rax
- jl .Lxorpart4
- vpxor 0x80(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x80(%rsi)
- vextracti128 $1,%ymm10,%xmm4
- # o1 = i1 ^ (x1 + s1), third block
- vpaddd %ymm12,%ymm5,%ymm10
- cmp $0xa0,%rax
- jl .Lxorpart4
- vpxor 0x90(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x90(%rsi)
- vextracti128 $1,%ymm10,%xmm5
- # o2 = i2 ^ (x2 + s2), third block
- vpaddd %ymm13,%ymm6,%ymm10
- cmp $0xb0,%rax
- jl .Lxorpart4
- vpxor 0xa0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xa0(%rsi)
- vextracti128 $1,%ymm10,%xmm6
- # o3 = i3 ^ (x3 + s3), third block
- vpaddd %ymm15,%ymm7,%ymm10
- cmp $0xc0,%rax
- jl .Lxorpart4
- vpxor 0xb0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xb0(%rsi)
- vextracti128 $1,%ymm10,%xmm7
-
- # xor and write fourth block
- vmovdqa %xmm4,%xmm10
- cmp $0xd0,%rax
- jl .Lxorpart4
- vpxor 0xc0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xc0(%rsi)
-
- vmovdqa %xmm5,%xmm10
- cmp $0xe0,%rax
- jl .Lxorpart4
- vpxor 0xd0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xd0(%rsi)
-
- vmovdqa %xmm6,%xmm10
- cmp $0xf0,%rax
- jl .Lxorpart4
- vpxor 0xe0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xe0(%rsi)
-
- vmovdqa %xmm7,%xmm10
- cmp $0x100,%rax
- jl .Lxorpart4
- vpxor 0xf0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xf0(%rsi)
-
-.Ldone4:
- vzeroupper
- RET
-
-.Lxorpart4:
- # xor remaining bytes from partial register into output
- mov %rax,%r9
- and $0x0f,%r9
- jz .Ldone4
- and $~0x0f,%rax
-
- mov %rsi,%r11
-
- lea 8(%rsp),%r10
- sub $0x10,%rsp
- and $~31,%rsp
-
- lea (%rdx,%rax),%rsi
- mov %rsp,%rdi
- mov %r9,%rcx
- rep movsb
-
- vpxor 0x00(%rsp),%xmm10,%xmm10
- vmovdqa %xmm10,0x00(%rsp)
-
- mov %rsp,%rsi
- lea (%r11,%rax),%rdi
- mov %r9,%rcx
- rep movsb
-
- lea -8(%r10),%rsp
- jmp .Ldone4
-
-SYM_FUNC_END(chacha_4block_xor_avx2)
-
-SYM_FUNC_START(chacha_8block_xor_avx2)
- # %rdi: Input state matrix, s
- # %rsi: up to 8 data blocks output, o
- # %rdx: up to 8 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
-
- # This function encrypts eight consecutive ChaCha blocks by loading
- # the state matrix in AVX registers eight times. As we need some
- # scratch registers, we save the first four registers on the stack. The
- # algorithm performs each operation on the corresponding word of each
- # state matrix, hence requires no word shuffling. For final XORing step
- # we transpose the matrix by interleaving 32-, 64- and then 128-bit
- # words, which allows us to do XOR in AVX registers. 8/16-bit word
- # rotation is done with the slightly better performing byte shuffling,
- # 7/12-bit word rotation uses traditional shift+OR.
-
- vzeroupper
- # 4 * 32 byte stack, 32-byte aligned
- lea 8(%rsp),%r10
- and $~31, %rsp
- sub $0x80, %rsp
- mov %rcx,%rax
-
- # x0..15[0-7] = s[0..15]
- vpbroadcastd 0x00(%rdi),%ymm0
- vpbroadcastd 0x04(%rdi),%ymm1
- vpbroadcastd 0x08(%rdi),%ymm2
- vpbroadcastd 0x0c(%rdi),%ymm3
- vpbroadcastd 0x10(%rdi),%ymm4
- vpbroadcastd 0x14(%rdi),%ymm5
- vpbroadcastd 0x18(%rdi),%ymm6
- vpbroadcastd 0x1c(%rdi),%ymm7
- vpbroadcastd 0x20(%rdi),%ymm8
- vpbroadcastd 0x24(%rdi),%ymm9
- vpbroadcastd 0x28(%rdi),%ymm10
- vpbroadcastd 0x2c(%rdi),%ymm11
- vpbroadcastd 0x30(%rdi),%ymm12
- vpbroadcastd 0x34(%rdi),%ymm13
- vpbroadcastd 0x38(%rdi),%ymm14
- vpbroadcastd 0x3c(%rdi),%ymm15
- # x0..3 on stack
- vmovdqa %ymm0,0x00(%rsp)
- vmovdqa %ymm1,0x20(%rsp)
- vmovdqa %ymm2,0x40(%rsp)
- vmovdqa %ymm3,0x60(%rsp)
-
- vmovdqa CTRINC(%rip),%ymm1
- vmovdqa ROT8(%rip),%ymm2
- vmovdqa ROT16(%rip),%ymm3
-
- # x12 += counter values 0-3
- vpaddd %ymm1,%ymm12,%ymm12
-
-.Ldoubleround8:
- # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
- vpaddd 0x00(%rsp),%ymm4,%ymm0
- vmovdqa %ymm0,0x00(%rsp)
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm3,%ymm12,%ymm12
- # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
- vpaddd 0x20(%rsp),%ymm5,%ymm0
- vmovdqa %ymm0,0x20(%rsp)
- vpxor %ymm0,%ymm13,%ymm13
- vpshufb %ymm3,%ymm13,%ymm13
- # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
- vpaddd 0x40(%rsp),%ymm6,%ymm0
- vmovdqa %ymm0,0x40(%rsp)
- vpxor %ymm0,%ymm14,%ymm14
- vpshufb %ymm3,%ymm14,%ymm14
- # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
- vpaddd 0x60(%rsp),%ymm7,%ymm0
- vmovdqa %ymm0,0x60(%rsp)
- vpxor %ymm0,%ymm15,%ymm15
- vpshufb %ymm3,%ymm15,%ymm15
-
- # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpslld $12,%ymm4,%ymm0
- vpsrld $20,%ymm4,%ymm4
- vpor %ymm0,%ymm4,%ymm4
- # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpslld $12,%ymm5,%ymm0
- vpsrld $20,%ymm5,%ymm5
- vpor %ymm0,%ymm5,%ymm5
- # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
- vpaddd %ymm14,%ymm10,%ymm10
- vpxor %ymm10,%ymm6,%ymm6
- vpslld $12,%ymm6,%ymm0
- vpsrld $20,%ymm6,%ymm6
- vpor %ymm0,%ymm6,%ymm6
- # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
- vpaddd %ymm15,%ymm11,%ymm11
- vpxor %ymm11,%ymm7,%ymm7
- vpslld $12,%ymm7,%ymm0
- vpsrld $20,%ymm7,%ymm7
- vpor %ymm0,%ymm7,%ymm7
-
- # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
- vpaddd 0x00(%rsp),%ymm4,%ymm0
- vmovdqa %ymm0,0x00(%rsp)
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm2,%ymm12,%ymm12
- # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
- vpaddd 0x20(%rsp),%ymm5,%ymm0
- vmovdqa %ymm0,0x20(%rsp)
- vpxor %ymm0,%ymm13,%ymm13
- vpshufb %ymm2,%ymm13,%ymm13
- # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
- vpaddd 0x40(%rsp),%ymm6,%ymm0
- vmovdqa %ymm0,0x40(%rsp)
- vpxor %ymm0,%ymm14,%ymm14
- vpshufb %ymm2,%ymm14,%ymm14
- # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
- vpaddd 0x60(%rsp),%ymm7,%ymm0
- vmovdqa %ymm0,0x60(%rsp)
- vpxor %ymm0,%ymm15,%ymm15
- vpshufb %ymm2,%ymm15,%ymm15
-
- # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpslld $7,%ymm4,%ymm0
- vpsrld $25,%ymm4,%ymm4
- vpor %ymm0,%ymm4,%ymm4
- # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpslld $7,%ymm5,%ymm0
- vpsrld $25,%ymm5,%ymm5
- vpor %ymm0,%ymm5,%ymm5
- # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
- vpaddd %ymm14,%ymm10,%ymm10
- vpxor %ymm10,%ymm6,%ymm6
- vpslld $7,%ymm6,%ymm0
- vpsrld $25,%ymm6,%ymm6
- vpor %ymm0,%ymm6,%ymm6
- # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
- vpaddd %ymm15,%ymm11,%ymm11
- vpxor %ymm11,%ymm7,%ymm7
- vpslld $7,%ymm7,%ymm0
- vpsrld $25,%ymm7,%ymm7
- vpor %ymm0,%ymm7,%ymm7
-
- # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
- vpaddd 0x00(%rsp),%ymm5,%ymm0
- vmovdqa %ymm0,0x00(%rsp)
- vpxor %ymm0,%ymm15,%ymm15
- vpshufb %ymm3,%ymm15,%ymm15
- # x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0
- vpaddd 0x20(%rsp),%ymm6,%ymm0
- vmovdqa %ymm0,0x20(%rsp)
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm3,%ymm12,%ymm12
- # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
- vpaddd 0x40(%rsp),%ymm7,%ymm0
- vmovdqa %ymm0,0x40(%rsp)
- vpxor %ymm0,%ymm13,%ymm13
- vpshufb %ymm3,%ymm13,%ymm13
- # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
- vpaddd 0x60(%rsp),%ymm4,%ymm0
- vmovdqa %ymm0,0x60(%rsp)
- vpxor %ymm0,%ymm14,%ymm14
- vpshufb %ymm3,%ymm14,%ymm14
-
- # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
- vpaddd %ymm15,%ymm10,%ymm10
- vpxor %ymm10,%ymm5,%ymm5
- vpslld $12,%ymm5,%ymm0
- vpsrld $20,%ymm5,%ymm5
- vpor %ymm0,%ymm5,%ymm5
- # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
- vpaddd %ymm12,%ymm11,%ymm11
- vpxor %ymm11,%ymm6,%ymm6
- vpslld $12,%ymm6,%ymm0
- vpsrld $20,%ymm6,%ymm6
- vpor %ymm0,%ymm6,%ymm6
- # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
- vpaddd %ymm13,%ymm8,%ymm8
- vpxor %ymm8,%ymm7,%ymm7
- vpslld $12,%ymm7,%ymm0
- vpsrld $20,%ymm7,%ymm7
- vpor %ymm0,%ymm7,%ymm7
- # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
- vpaddd %ymm14,%ymm9,%ymm9
- vpxor %ymm9,%ymm4,%ymm4
- vpslld $12,%ymm4,%ymm0
- vpsrld $20,%ymm4,%ymm4
- vpor %ymm0,%ymm4,%ymm4
-
- # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
- vpaddd 0x00(%rsp),%ymm5,%ymm0
- vmovdqa %ymm0,0x00(%rsp)
- vpxor %ymm0,%ymm15,%ymm15
- vpshufb %ymm2,%ymm15,%ymm15
- # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
- vpaddd 0x20(%rsp),%ymm6,%ymm0
- vmovdqa %ymm0,0x20(%rsp)
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm2,%ymm12,%ymm12
- # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
- vpaddd 0x40(%rsp),%ymm7,%ymm0
- vmovdqa %ymm0,0x40(%rsp)
- vpxor %ymm0,%ymm13,%ymm13
- vpshufb %ymm2,%ymm13,%ymm13
- # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
- vpaddd 0x60(%rsp),%ymm4,%ymm0
- vmovdqa %ymm0,0x60(%rsp)
- vpxor %ymm0,%ymm14,%ymm14
- vpshufb %ymm2,%ymm14,%ymm14
-
- # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
- vpaddd %ymm15,%ymm10,%ymm10
- vpxor %ymm10,%ymm5,%ymm5
- vpslld $7,%ymm5,%ymm0
- vpsrld $25,%ymm5,%ymm5
- vpor %ymm0,%ymm5,%ymm5
- # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
- vpaddd %ymm12,%ymm11,%ymm11
- vpxor %ymm11,%ymm6,%ymm6
- vpslld $7,%ymm6,%ymm0
- vpsrld $25,%ymm6,%ymm6
- vpor %ymm0,%ymm6,%ymm6
- # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
- vpaddd %ymm13,%ymm8,%ymm8
- vpxor %ymm8,%ymm7,%ymm7
- vpslld $7,%ymm7,%ymm0
- vpsrld $25,%ymm7,%ymm7
- vpor %ymm0,%ymm7,%ymm7
- # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
- vpaddd %ymm14,%ymm9,%ymm9
- vpxor %ymm9,%ymm4,%ymm4
- vpslld $7,%ymm4,%ymm0
- vpsrld $25,%ymm4,%ymm4
- vpor %ymm0,%ymm4,%ymm4
-
- sub $2,%r8d
- jnz .Ldoubleround8
-
- # x0..15[0-3] += s[0..15]
- vpbroadcastd 0x00(%rdi),%ymm0
- vpaddd 0x00(%rsp),%ymm0,%ymm0
- vmovdqa %ymm0,0x00(%rsp)
- vpbroadcastd 0x04(%rdi),%ymm0
- vpaddd 0x20(%rsp),%ymm0,%ymm0
- vmovdqa %ymm0,0x20(%rsp)
- vpbroadcastd 0x08(%rdi),%ymm0
- vpaddd 0x40(%rsp),%ymm0,%ymm0
- vmovdqa %ymm0,0x40(%rsp)
- vpbroadcastd 0x0c(%rdi),%ymm0
- vpaddd 0x60(%rsp),%ymm0,%ymm0
- vmovdqa %ymm0,0x60(%rsp)
- vpbroadcastd 0x10(%rdi),%ymm0
- vpaddd %ymm0,%ymm4,%ymm4
- vpbroadcastd 0x14(%rdi),%ymm0
- vpaddd %ymm0,%ymm5,%ymm5
- vpbroadcastd 0x18(%rdi),%ymm0
- vpaddd %ymm0,%ymm6,%ymm6
- vpbroadcastd 0x1c(%rdi),%ymm0
- vpaddd %ymm0,%ymm7,%ymm7
- vpbroadcastd 0x20(%rdi),%ymm0
- vpaddd %ymm0,%ymm8,%ymm8
- vpbroadcastd 0x24(%rdi),%ymm0
- vpaddd %ymm0,%ymm9,%ymm9
- vpbroadcastd 0x28(%rdi),%ymm0
- vpaddd %ymm0,%ymm10,%ymm10
- vpbroadcastd 0x2c(%rdi),%ymm0
- vpaddd %ymm0,%ymm11,%ymm11
- vpbroadcastd 0x30(%rdi),%ymm0
- vpaddd %ymm0,%ymm12,%ymm12
- vpbroadcastd 0x34(%rdi),%ymm0
- vpaddd %ymm0,%ymm13,%ymm13
- vpbroadcastd 0x38(%rdi),%ymm0
- vpaddd %ymm0,%ymm14,%ymm14
- vpbroadcastd 0x3c(%rdi),%ymm0
- vpaddd %ymm0,%ymm15,%ymm15
-
- # x12 += counter values 0-3
- vpaddd %ymm1,%ymm12,%ymm12
-
- # interleave 32-bit words in state n, n+1
- vmovdqa 0x00(%rsp),%ymm0
- vmovdqa 0x20(%rsp),%ymm1
- vpunpckldq %ymm1,%ymm0,%ymm2
- vpunpckhdq %ymm1,%ymm0,%ymm1
- vmovdqa %ymm2,0x00(%rsp)
- vmovdqa %ymm1,0x20(%rsp)
- vmovdqa 0x40(%rsp),%ymm0
- vmovdqa 0x60(%rsp),%ymm1
- vpunpckldq %ymm1,%ymm0,%ymm2
- vpunpckhdq %ymm1,%ymm0,%ymm1
- vmovdqa %ymm2,0x40(%rsp)
- vmovdqa %ymm1,0x60(%rsp)
- vmovdqa %ymm4,%ymm0
- vpunpckldq %ymm5,%ymm0,%ymm4
- vpunpckhdq %ymm5,%ymm0,%ymm5
- vmovdqa %ymm6,%ymm0
- vpunpckldq %ymm7,%ymm0,%ymm6
- vpunpckhdq %ymm7,%ymm0,%ymm7
- vmovdqa %ymm8,%ymm0
- vpunpckldq %ymm9,%ymm0,%ymm8
- vpunpckhdq %ymm9,%ymm0,%ymm9
- vmovdqa %ymm10,%ymm0
- vpunpckldq %ymm11,%ymm0,%ymm10
- vpunpckhdq %ymm11,%ymm0,%ymm11
- vmovdqa %ymm12,%ymm0
- vpunpckldq %ymm13,%ymm0,%ymm12
- vpunpckhdq %ymm13,%ymm0,%ymm13
- vmovdqa %ymm14,%ymm0
- vpunpckldq %ymm15,%ymm0,%ymm14
- vpunpckhdq %ymm15,%ymm0,%ymm15
-
- # interleave 64-bit words in state n, n+2
- vmovdqa 0x00(%rsp),%ymm0
- vmovdqa 0x40(%rsp),%ymm2
- vpunpcklqdq %ymm2,%ymm0,%ymm1
- vpunpckhqdq %ymm2,%ymm0,%ymm2
- vmovdqa %ymm1,0x00(%rsp)
- vmovdqa %ymm2,0x40(%rsp)
- vmovdqa 0x20(%rsp),%ymm0
- vmovdqa 0x60(%rsp),%ymm2
- vpunpcklqdq %ymm2,%ymm0,%ymm1
- vpunpckhqdq %ymm2,%ymm0,%ymm2
- vmovdqa %ymm1,0x20(%rsp)
- vmovdqa %ymm2,0x60(%rsp)
- vmovdqa %ymm4,%ymm0
- vpunpcklqdq %ymm6,%ymm0,%ymm4
- vpunpckhqdq %ymm6,%ymm0,%ymm6
- vmovdqa %ymm5,%ymm0
- vpunpcklqdq %ymm7,%ymm0,%ymm5
- vpunpckhqdq %ymm7,%ymm0,%ymm7
- vmovdqa %ymm8,%ymm0
- vpunpcklqdq %ymm10,%ymm0,%ymm8
- vpunpckhqdq %ymm10,%ymm0,%ymm10
- vmovdqa %ymm9,%ymm0
- vpunpcklqdq %ymm11,%ymm0,%ymm9
- vpunpckhqdq %ymm11,%ymm0,%ymm11
- vmovdqa %ymm12,%ymm0
- vpunpcklqdq %ymm14,%ymm0,%ymm12
- vpunpckhqdq %ymm14,%ymm0,%ymm14
- vmovdqa %ymm13,%ymm0
- vpunpcklqdq %ymm15,%ymm0,%ymm13
- vpunpckhqdq %ymm15,%ymm0,%ymm15
-
- # interleave 128-bit words in state n, n+4
- # xor/write first four blocks
- vmovdqa 0x00(%rsp),%ymm1
- vperm2i128 $0x20,%ymm4,%ymm1,%ymm0
- cmp $0x0020,%rax
- jl .Lxorpart8
- vpxor 0x0000(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0000(%rsi)
- vperm2i128 $0x31,%ymm4,%ymm1,%ymm4
-
- vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
- cmp $0x0040,%rax
- jl .Lxorpart8
- vpxor 0x0020(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0020(%rsi)
- vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
-
- vmovdqa 0x40(%rsp),%ymm1
- vperm2i128 $0x20,%ymm6,%ymm1,%ymm0
- cmp $0x0060,%rax
- jl .Lxorpart8
- vpxor 0x0040(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0040(%rsi)
- vperm2i128 $0x31,%ymm6,%ymm1,%ymm6
-
- vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
- cmp $0x0080,%rax
- jl .Lxorpart8
- vpxor 0x0060(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0060(%rsi)
- vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
-
- vmovdqa 0x20(%rsp),%ymm1
- vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
- cmp $0x00a0,%rax
- jl .Lxorpart8
- vpxor 0x0080(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0080(%rsi)
- vperm2i128 $0x31,%ymm5,%ymm1,%ymm5
-
- vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
- cmp $0x00c0,%rax
- jl .Lxorpart8
- vpxor 0x00a0(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x00a0(%rsi)
- vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
-
- vmovdqa 0x60(%rsp),%ymm1
- vperm2i128 $0x20,%ymm7,%ymm1,%ymm0
- cmp $0x00e0,%rax
- jl .Lxorpart8
- vpxor 0x00c0(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x00c0(%rsi)
- vperm2i128 $0x31,%ymm7,%ymm1,%ymm7
-
- vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
- cmp $0x0100,%rax
- jl .Lxorpart8
- vpxor 0x00e0(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x00e0(%rsi)
- vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
-
- # xor remaining blocks, write to output
- vmovdqa %ymm4,%ymm0
- cmp $0x0120,%rax
- jl .Lxorpart8
- vpxor 0x0100(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0100(%rsi)
-
- vmovdqa %ymm12,%ymm0
- cmp $0x0140,%rax
- jl .Lxorpart8
- vpxor 0x0120(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0120(%rsi)
-
- vmovdqa %ymm6,%ymm0
- cmp $0x0160,%rax
- jl .Lxorpart8
- vpxor 0x0140(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0140(%rsi)
-
- vmovdqa %ymm14,%ymm0
- cmp $0x0180,%rax
- jl .Lxorpart8
- vpxor 0x0160(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0160(%rsi)
-
- vmovdqa %ymm5,%ymm0
- cmp $0x01a0,%rax
- jl .Lxorpart8
- vpxor 0x0180(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0180(%rsi)
-
- vmovdqa %ymm13,%ymm0
- cmp $0x01c0,%rax
- jl .Lxorpart8
- vpxor 0x01a0(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x01a0(%rsi)
-
- vmovdqa %ymm7,%ymm0
- cmp $0x01e0,%rax
- jl .Lxorpart8
- vpxor 0x01c0(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x01c0(%rsi)
-
- vmovdqa %ymm15,%ymm0
- cmp $0x0200,%rax
- jl .Lxorpart8
- vpxor 0x01e0(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x01e0(%rsi)
-
-.Ldone8:
- vzeroupper
- lea -8(%r10),%rsp
- RET
-
-.Lxorpart8:
- # xor remaining bytes from partial register into output
- mov %rax,%r9
- and $0x1f,%r9
- jz .Ldone8
- and $~0x1f,%rax
-
- mov %rsi,%r11
-
- lea (%rdx,%rax),%rsi
- mov %rsp,%rdi
- mov %r9,%rcx
- rep movsb
-
- vpxor 0x00(%rsp),%ymm0,%ymm0
- vmovdqa %ymm0,0x00(%rsp)
-
- mov %rsp,%rsi
- lea (%r11,%rax),%rdi
- mov %r9,%rcx
- rep movsb
-
- jmp .Ldone8
-
-SYM_FUNC_END(chacha_8block_xor_avx2)
diff --git a/arch/x86/lib/crypto/chacha-avx512vl-x86_64.S b/arch/x86/lib/crypto/chacha-avx512vl-x86_64.S
deleted file mode 100644
index 259383e1ad44..000000000000
--- a/arch/x86/lib/crypto/chacha-avx512vl-x86_64.S
+++ /dev/null
@@ -1,836 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0+ */
-/*
- * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions
- *
- * Copyright (C) 2018 Martin Willi
- */
-
-#include <linux/linkage.h>
-
-.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
-.align 32
-CTR2BL: .octa 0x00000000000000000000000000000000
- .octa 0x00000000000000000000000000000001
-
-.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
-.align 32
-CTR4BL: .octa 0x00000000000000000000000000000002
- .octa 0x00000000000000000000000000000003
-
-.section .rodata.cst32.CTR8BL, "aM", @progbits, 32
-.align 32
-CTR8BL: .octa 0x00000003000000020000000100000000
- .octa 0x00000007000000060000000500000004
-
-.text
-
-SYM_FUNC_START(chacha_2block_xor_avx512vl)
- # %rdi: Input state matrix, s
- # %rsi: up to 2 data blocks output, o
- # %rdx: up to 2 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
-
- # This function encrypts two ChaCha blocks by loading the state
- # matrix twice across four AVX registers. It performs matrix operations
- # on four words in each matrix in parallel, but requires shuffling to
- # rearrange the words after each round.
-
- vzeroupper
-
- # x0..3[0-2] = s0..3
- vbroadcasti128 0x00(%rdi),%ymm0
- vbroadcasti128 0x10(%rdi),%ymm1
- vbroadcasti128 0x20(%rdi),%ymm2
- vbroadcasti128 0x30(%rdi),%ymm3
-
- vpaddd CTR2BL(%rip),%ymm3,%ymm3
-
- vmovdqa %ymm0,%ymm8
- vmovdqa %ymm1,%ymm9
- vmovdqa %ymm2,%ymm10
- vmovdqa %ymm3,%ymm11
-
-.Ldoubleround:
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxord %ymm0,%ymm3,%ymm3
- vprold $16,%ymm3,%ymm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxord %ymm2,%ymm1,%ymm1
- vprold $12,%ymm1,%ymm1
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxord %ymm0,%ymm3,%ymm3
- vprold $8,%ymm3,%ymm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxord %ymm2,%ymm1,%ymm1
- vprold $7,%ymm1,%ymm1
-
- # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
- vpshufd $0x39,%ymm1,%ymm1
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vpshufd $0x4e,%ymm2,%ymm2
- # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
- vpshufd $0x93,%ymm3,%ymm3
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxord %ymm0,%ymm3,%ymm3
- vprold $16,%ymm3,%ymm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxord %ymm2,%ymm1,%ymm1
- vprold $12,%ymm1,%ymm1
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxord %ymm0,%ymm3,%ymm3
- vprold $8,%ymm3,%ymm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxord %ymm2,%ymm1,%ymm1
- vprold $7,%ymm1,%ymm1
-
- # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
- vpshufd $0x93,%ymm1,%ymm1
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vpshufd $0x4e,%ymm2,%ymm2
- # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
- vpshufd $0x39,%ymm3,%ymm3
-
- sub $2,%r8d
- jnz .Ldoubleround
-
- # o0 = i0 ^ (x0 + s0)
- vpaddd %ymm8,%ymm0,%ymm7
- cmp $0x10,%rcx
- jl .Lxorpart2
- vpxord 0x00(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x00(%rsi)
- vextracti128 $1,%ymm7,%xmm0
- # o1 = i1 ^ (x1 + s1)
- vpaddd %ymm9,%ymm1,%ymm7
- cmp $0x20,%rcx
- jl .Lxorpart2
- vpxord 0x10(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x10(%rsi)
- vextracti128 $1,%ymm7,%xmm1
- # o2 = i2 ^ (x2 + s2)
- vpaddd %ymm10,%ymm2,%ymm7
- cmp $0x30,%rcx
- jl .Lxorpart2
- vpxord 0x20(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x20(%rsi)
- vextracti128 $1,%ymm7,%xmm2
- # o3 = i3 ^ (x3 + s3)
- vpaddd %ymm11,%ymm3,%ymm7
- cmp $0x40,%rcx
- jl .Lxorpart2
- vpxord 0x30(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x30(%rsi)
- vextracti128 $1,%ymm7,%xmm3
-
- # xor and write second block
- vmovdqa %xmm0,%xmm7
- cmp $0x50,%rcx
- jl .Lxorpart2
- vpxord 0x40(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x40(%rsi)
-
- vmovdqa %xmm1,%xmm7
- cmp $0x60,%rcx
- jl .Lxorpart2
- vpxord 0x50(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x50(%rsi)
-
- vmovdqa %xmm2,%xmm7
- cmp $0x70,%rcx
- jl .Lxorpart2
- vpxord 0x60(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x60(%rsi)
-
- vmovdqa %xmm3,%xmm7
- cmp $0x80,%rcx
- jl .Lxorpart2
- vpxord 0x70(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x70(%rsi)
-
-.Ldone2:
- vzeroupper
- RET
-
-.Lxorpart2:
- # xor remaining bytes from partial register into output
- mov %rcx,%rax
- and $0xf,%rcx
- jz .Ldone2
- mov %rax,%r9
- and $~0xf,%r9
-
- mov $1,%rax
- shld %cl,%rax,%rax
- sub $1,%rax
- kmovq %rax,%k1
-
- vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
- vpxord %xmm7,%xmm1,%xmm1
- vmovdqu8 %xmm1,(%rsi,%r9){%k1}
-
- jmp .Ldone2
-
-SYM_FUNC_END(chacha_2block_xor_avx512vl)
-
-SYM_FUNC_START(chacha_4block_xor_avx512vl)
- # %rdi: Input state matrix, s
- # %rsi: up to 4 data blocks output, o
- # %rdx: up to 4 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
-
- # This function encrypts four ChaCha blocks by loading the state
- # matrix four times across eight AVX registers. It performs matrix
- # operations on four words in two matrices in parallel, sequentially
- # to the operations on the four words of the other two matrices. The
- # required word shuffling has a rather high latency, we can do the
- # arithmetic on two matrix-pairs without much slowdown.
-
- vzeroupper
-
- # x0..3[0-4] = s0..3
- vbroadcasti128 0x00(%rdi),%ymm0
- vbroadcasti128 0x10(%rdi),%ymm1
- vbroadcasti128 0x20(%rdi),%ymm2
- vbroadcasti128 0x30(%rdi),%ymm3
-
- vmovdqa %ymm0,%ymm4
- vmovdqa %ymm1,%ymm5
- vmovdqa %ymm2,%ymm6
- vmovdqa %ymm3,%ymm7
-
- vpaddd CTR2BL(%rip),%ymm3,%ymm3
- vpaddd CTR4BL(%rip),%ymm7,%ymm7
-
- vmovdqa %ymm0,%ymm11
- vmovdqa %ymm1,%ymm12
- vmovdqa %ymm2,%ymm13
- vmovdqa %ymm3,%ymm14
- vmovdqa %ymm7,%ymm15
-
-.Ldoubleround4:
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxord %ymm0,%ymm3,%ymm3
- vprold $16,%ymm3,%ymm3
-
- vpaddd %ymm5,%ymm4,%ymm4
- vpxord %ymm4,%ymm7,%ymm7
- vprold $16,%ymm7,%ymm7
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxord %ymm2,%ymm1,%ymm1
- vprold $12,%ymm1,%ymm1
-
- vpaddd %ymm7,%ymm6,%ymm6
- vpxord %ymm6,%ymm5,%ymm5
- vprold $12,%ymm5,%ymm5
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxord %ymm0,%ymm3,%ymm3
- vprold $8,%ymm3,%ymm3
-
- vpaddd %ymm5,%ymm4,%ymm4
- vpxord %ymm4,%ymm7,%ymm7
- vprold $8,%ymm7,%ymm7
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxord %ymm2,%ymm1,%ymm1
- vprold $7,%ymm1,%ymm1
-
- vpaddd %ymm7,%ymm6,%ymm6
- vpxord %ymm6,%ymm5,%ymm5
- vprold $7,%ymm5,%ymm5
-
- # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
- vpshufd $0x39,%ymm1,%ymm1
- vpshufd $0x39,%ymm5,%ymm5
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vpshufd $0x4e,%ymm2,%ymm2
- vpshufd $0x4e,%ymm6,%ymm6
- # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
- vpshufd $0x93,%ymm3,%ymm3
- vpshufd $0x93,%ymm7,%ymm7
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxord %ymm0,%ymm3,%ymm3
- vprold $16,%ymm3,%ymm3
-
- vpaddd %ymm5,%ymm4,%ymm4
- vpxord %ymm4,%ymm7,%ymm7
- vprold $16,%ymm7,%ymm7
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxord %ymm2,%ymm1,%ymm1
- vprold $12,%ymm1,%ymm1
-
- vpaddd %ymm7,%ymm6,%ymm6
- vpxord %ymm6,%ymm5,%ymm5
- vprold $12,%ymm5,%ymm5
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxord %ymm0,%ymm3,%ymm3
- vprold $8,%ymm3,%ymm3
-
- vpaddd %ymm5,%ymm4,%ymm4
- vpxord %ymm4,%ymm7,%ymm7
- vprold $8,%ymm7,%ymm7
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxord %ymm2,%ymm1,%ymm1
- vprold $7,%ymm1,%ymm1
-
- vpaddd %ymm7,%ymm6,%ymm6
- vpxord %ymm6,%ymm5,%ymm5
- vprold $7,%ymm5,%ymm5
-
- # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
- vpshufd $0x93,%ymm1,%ymm1
- vpshufd $0x93,%ymm5,%ymm5
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vpshufd $0x4e,%ymm2,%ymm2
- vpshufd $0x4e,%ymm6,%ymm6
- # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
- vpshufd $0x39,%ymm3,%ymm3
- vpshufd $0x39,%ymm7,%ymm7
-
- sub $2,%r8d
- jnz .Ldoubleround4
-
- # o0 = i0 ^ (x0 + s0), first block
- vpaddd %ymm11,%ymm0,%ymm10
- cmp $0x10,%rcx
- jl .Lxorpart4
- vpxord 0x00(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x00(%rsi)
- vextracti128 $1,%ymm10,%xmm0
- # o1 = i1 ^ (x1 + s1), first block
- vpaddd %ymm12,%ymm1,%ymm10
- cmp $0x20,%rcx
- jl .Lxorpart4
- vpxord 0x10(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x10(%rsi)
- vextracti128 $1,%ymm10,%xmm1
- # o2 = i2 ^ (x2 + s2), first block
- vpaddd %ymm13,%ymm2,%ymm10
- cmp $0x30,%rcx
- jl .Lxorpart4
- vpxord 0x20(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x20(%rsi)
- vextracti128 $1,%ymm10,%xmm2
- # o3 = i3 ^ (x3 + s3), first block
- vpaddd %ymm14,%ymm3,%ymm10
- cmp $0x40,%rcx
- jl .Lxorpart4
- vpxord 0x30(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x30(%rsi)
- vextracti128 $1,%ymm10,%xmm3
-
- # xor and write second block
- vmovdqa %xmm0,%xmm10
- cmp $0x50,%rcx
- jl .Lxorpart4
- vpxord 0x40(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x40(%rsi)
-
- vmovdqa %xmm1,%xmm10
- cmp $0x60,%rcx
- jl .Lxorpart4
- vpxord 0x50(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x50(%rsi)
-
- vmovdqa %xmm2,%xmm10
- cmp $0x70,%rcx
- jl .Lxorpart4
- vpxord 0x60(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x60(%rsi)
-
- vmovdqa %xmm3,%xmm10
- cmp $0x80,%rcx
- jl .Lxorpart4
- vpxord 0x70(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x70(%rsi)
-
- # o0 = i0 ^ (x0 + s0), third block
- vpaddd %ymm11,%ymm4,%ymm10
- cmp $0x90,%rcx
- jl .Lxorpart4
- vpxord 0x80(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x80(%rsi)
- vextracti128 $1,%ymm10,%xmm4
- # o1 = i1 ^ (x1 + s1), third block
- vpaddd %ymm12,%ymm5,%ymm10
- cmp $0xa0,%rcx
- jl .Lxorpart4
- vpxord 0x90(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x90(%rsi)
- vextracti128 $1,%ymm10,%xmm5
- # o2 = i2 ^ (x2 + s2), third block
- vpaddd %ymm13,%ymm6,%ymm10
- cmp $0xb0,%rcx
- jl .Lxorpart4
- vpxord 0xa0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xa0(%rsi)
- vextracti128 $1,%ymm10,%xmm6
- # o3 = i3 ^ (x3 + s3), third block
- vpaddd %ymm15,%ymm7,%ymm10
- cmp $0xc0,%rcx
- jl .Lxorpart4
- vpxord 0xb0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xb0(%rsi)
- vextracti128 $1,%ymm10,%xmm7
-
- # xor and write fourth block
- vmovdqa %xmm4,%xmm10
- cmp $0xd0,%rcx
- jl .Lxorpart4
- vpxord 0xc0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xc0(%rsi)
-
- vmovdqa %xmm5,%xmm10
- cmp $0xe0,%rcx
- jl .Lxorpart4
- vpxord 0xd0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xd0(%rsi)
-
- vmovdqa %xmm6,%xmm10
- cmp $0xf0,%rcx
- jl .Lxorpart4
- vpxord 0xe0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xe0(%rsi)
-
- vmovdqa %xmm7,%xmm10
- cmp $0x100,%rcx
- jl .Lxorpart4
- vpxord 0xf0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xf0(%rsi)
-
-.Ldone4:
- vzeroupper
- RET
-
-.Lxorpart4:
- # xor remaining bytes from partial register into output
- mov %rcx,%rax
- and $0xf,%rcx
- jz .Ldone4
- mov %rax,%r9
- and $~0xf,%r9
-
- mov $1,%rax
- shld %cl,%rax,%rax
- sub $1,%rax
- kmovq %rax,%k1
-
- vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
- vpxord %xmm10,%xmm1,%xmm1
- vmovdqu8 %xmm1,(%rsi,%r9){%k1}
-
- jmp .Ldone4
-
-SYM_FUNC_END(chacha_4block_xor_avx512vl)
-
-SYM_FUNC_START(chacha_8block_xor_avx512vl)
- # %rdi: Input state matrix, s
- # %rsi: up to 8 data blocks output, o
- # %rdx: up to 8 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
-
- # This function encrypts eight consecutive ChaCha blocks by loading
- # the state matrix in AVX registers eight times. Compared to AVX2, this
- # mostly benefits from the new rotate instructions in VL and the
- # additional registers.
-
- vzeroupper
-
- # x0..15[0-7] = s[0..15]
- vpbroadcastd 0x00(%rdi),%ymm0
- vpbroadcastd 0x04(%rdi),%ymm1
- vpbroadcastd 0x08(%rdi),%ymm2
- vpbroadcastd 0x0c(%rdi),%ymm3
- vpbroadcastd 0x10(%rdi),%ymm4
- vpbroadcastd 0x14(%rdi),%ymm5
- vpbroadcastd 0x18(%rdi),%ymm6
- vpbroadcastd 0x1c(%rdi),%ymm7
- vpbroadcastd 0x20(%rdi),%ymm8
- vpbroadcastd 0x24(%rdi),%ymm9
- vpbroadcastd 0x28(%rdi),%ymm10
- vpbroadcastd 0x2c(%rdi),%ymm11
- vpbroadcastd 0x30(%rdi),%ymm12
- vpbroadcastd 0x34(%rdi),%ymm13
- vpbroadcastd 0x38(%rdi),%ymm14
- vpbroadcastd 0x3c(%rdi),%ymm15
-
- # x12 += counter values 0-3
- vpaddd CTR8BL(%rip),%ymm12,%ymm12
-
- vmovdqa64 %ymm0,%ymm16
- vmovdqa64 %ymm1,%ymm17
- vmovdqa64 %ymm2,%ymm18
- vmovdqa64 %ymm3,%ymm19
- vmovdqa64 %ymm4,%ymm20
- vmovdqa64 %ymm5,%ymm21
- vmovdqa64 %ymm6,%ymm22
- vmovdqa64 %ymm7,%ymm23
- vmovdqa64 %ymm8,%ymm24
- vmovdqa64 %ymm9,%ymm25
- vmovdqa64 %ymm10,%ymm26
- vmovdqa64 %ymm11,%ymm27
- vmovdqa64 %ymm12,%ymm28
- vmovdqa64 %ymm13,%ymm29
- vmovdqa64 %ymm14,%ymm30
- vmovdqa64 %ymm15,%ymm31
-
-.Ldoubleround8:
- # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
- vpaddd %ymm0,%ymm4,%ymm0
- vpxord %ymm0,%ymm12,%ymm12
- vprold $16,%ymm12,%ymm12
- # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
- vpaddd %ymm1,%ymm5,%ymm1
- vpxord %ymm1,%ymm13,%ymm13
- vprold $16,%ymm13,%ymm13
- # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
- vpaddd %ymm2,%ymm6,%ymm2
- vpxord %ymm2,%ymm14,%ymm14
- vprold $16,%ymm14,%ymm14
- # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
- vpaddd %ymm3,%ymm7,%ymm3
- vpxord %ymm3,%ymm15,%ymm15
- vprold $16,%ymm15,%ymm15
-
- # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
- vpaddd %ymm12,%ymm8,%ymm8
- vpxord %ymm8,%ymm4,%ymm4
- vprold $12,%ymm4,%ymm4
- # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
- vpaddd %ymm13,%ymm9,%ymm9
- vpxord %ymm9,%ymm5,%ymm5
- vprold $12,%ymm5,%ymm5
- # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
- vpaddd %ymm14,%ymm10,%ymm10
- vpxord %ymm10,%ymm6,%ymm6
- vprold $12,%ymm6,%ymm6
- # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
- vpaddd %ymm15,%ymm11,%ymm11
- vpxord %ymm11,%ymm7,%ymm7
- vprold $12,%ymm7,%ymm7
-
- # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
- vpaddd %ymm0,%ymm4,%ymm0
- vpxord %ymm0,%ymm12,%ymm12
- vprold $8,%ymm12,%ymm12
- # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
- vpaddd %ymm1,%ymm5,%ymm1
- vpxord %ymm1,%ymm13,%ymm13
- vprold $8,%ymm13,%ymm13
- # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
- vpaddd %ymm2,%ymm6,%ymm2
- vpxord %ymm2,%ymm14,%ymm14
- vprold $8,%ymm14,%ymm14
- # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
- vpaddd %ymm3,%ymm7,%ymm3
- vpxord %ymm3,%ymm15,%ymm15
- vprold $8,%ymm15,%ymm15
-
- # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
- vpaddd %ymm12,%ymm8,%ymm8
- vpxord %ymm8,%ymm4,%ymm4
- vprold $7,%ymm4,%ymm4
- # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
- vpaddd %ymm13,%ymm9,%ymm9
- vpxord %ymm9,%ymm5,%ymm5
- vprold $7,%ymm5,%ymm5
- # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
- vpaddd %ymm14,%ymm10,%ymm10
- vpxord %ymm10,%ymm6,%ymm6
- vprold $7,%ymm6,%ymm6
- # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
- vpaddd %ymm15,%ymm11,%ymm11
- vpxord %ymm11,%ymm7,%ymm7
- vprold $7,%ymm7,%ymm7
-
- # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
- vpaddd %ymm0,%ymm5,%ymm0
- vpxord %ymm0,%ymm15,%ymm15
- vprold $16,%ymm15,%ymm15
- # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
- vpaddd %ymm1,%ymm6,%ymm1
- vpxord %ymm1,%ymm12,%ymm12
- vprold $16,%ymm12,%ymm12
- # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
- vpaddd %ymm2,%ymm7,%ymm2
- vpxord %ymm2,%ymm13,%ymm13
- vprold $16,%ymm13,%ymm13
- # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
- vpaddd %ymm3,%ymm4,%ymm3
- vpxord %ymm3,%ymm14,%ymm14
- vprold $16,%ymm14,%ymm14
-
- # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
- vpaddd %ymm15,%ymm10,%ymm10
- vpxord %ymm10,%ymm5,%ymm5
- vprold $12,%ymm5,%ymm5
- # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
- vpaddd %ymm12,%ymm11,%ymm11
- vpxord %ymm11,%ymm6,%ymm6
- vprold $12,%ymm6,%ymm6
- # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
- vpaddd %ymm13,%ymm8,%ymm8
- vpxord %ymm8,%ymm7,%ymm7
- vprold $12,%ymm7,%ymm7
- # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
- vpaddd %ymm14,%ymm9,%ymm9
- vpxord %ymm9,%ymm4,%ymm4
- vprold $12,%ymm4,%ymm4
-
- # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
- vpaddd %ymm0,%ymm5,%ymm0
- vpxord %ymm0,%ymm15,%ymm15
- vprold $8,%ymm15,%ymm15
- # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
- vpaddd %ymm1,%ymm6,%ymm1
- vpxord %ymm1,%ymm12,%ymm12
- vprold $8,%ymm12,%ymm12
- # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
- vpaddd %ymm2,%ymm7,%ymm2
- vpxord %ymm2,%ymm13,%ymm13
- vprold $8,%ymm13,%ymm13
- # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
- vpaddd %ymm3,%ymm4,%ymm3
- vpxord %ymm3,%ymm14,%ymm14
- vprold $8,%ymm14,%ymm14
-
- # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
- vpaddd %ymm15,%ymm10,%ymm10
- vpxord %ymm10,%ymm5,%ymm5
- vprold $7,%ymm5,%ymm5
- # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
- vpaddd %ymm12,%ymm11,%ymm11
- vpxord %ymm11,%ymm6,%ymm6
- vprold $7,%ymm6,%ymm6
- # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
- vpaddd %ymm13,%ymm8,%ymm8
- vpxord %ymm8,%ymm7,%ymm7
- vprold $7,%ymm7,%ymm7
- # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
- vpaddd %ymm14,%ymm9,%ymm9
- vpxord %ymm9,%ymm4,%ymm4
- vprold $7,%ymm4,%ymm4
-
- sub $2,%r8d
- jnz .Ldoubleround8
-
- # x0..15[0-3] += s[0..15]
- vpaddd %ymm16,%ymm0,%ymm0
- vpaddd %ymm17,%ymm1,%ymm1
- vpaddd %ymm18,%ymm2,%ymm2
- vpaddd %ymm19,%ymm3,%ymm3
- vpaddd %ymm20,%ymm4,%ymm4
- vpaddd %ymm21,%ymm5,%ymm5
- vpaddd %ymm22,%ymm6,%ymm6
- vpaddd %ymm23,%ymm7,%ymm7
- vpaddd %ymm24,%ymm8,%ymm8
- vpaddd %ymm25,%ymm9,%ymm9
- vpaddd %ymm26,%ymm10,%ymm10
- vpaddd %ymm27,%ymm11,%ymm11
- vpaddd %ymm28,%ymm12,%ymm12
- vpaddd %ymm29,%ymm13,%ymm13
- vpaddd %ymm30,%ymm14,%ymm14
- vpaddd %ymm31,%ymm15,%ymm15
-
- # interleave 32-bit words in state n, n+1
- vpunpckldq %ymm1,%ymm0,%ymm16
- vpunpckhdq %ymm1,%ymm0,%ymm17
- vpunpckldq %ymm3,%ymm2,%ymm18
- vpunpckhdq %ymm3,%ymm2,%ymm19
- vpunpckldq %ymm5,%ymm4,%ymm20
- vpunpckhdq %ymm5,%ymm4,%ymm21
- vpunpckldq %ymm7,%ymm6,%ymm22
- vpunpckhdq %ymm7,%ymm6,%ymm23
- vpunpckldq %ymm9,%ymm8,%ymm24
- vpunpckhdq %ymm9,%ymm8,%ymm25
- vpunpckldq %ymm11,%ymm10,%ymm26
- vpunpckhdq %ymm11,%ymm10,%ymm27
- vpunpckldq %ymm13,%ymm12,%ymm28
- vpunpckhdq %ymm13,%ymm12,%ymm29
- vpunpckldq %ymm15,%ymm14,%ymm30
- vpunpckhdq %ymm15,%ymm14,%ymm31
-
- # interleave 64-bit words in state n, n+2
- vpunpcklqdq %ymm18,%ymm16,%ymm0
- vpunpcklqdq %ymm19,%ymm17,%ymm1
- vpunpckhqdq %ymm18,%ymm16,%ymm2
- vpunpckhqdq %ymm19,%ymm17,%ymm3
- vpunpcklqdq %ymm22,%ymm20,%ymm4
- vpunpcklqdq %ymm23,%ymm21,%ymm5
- vpunpckhqdq %ymm22,%ymm20,%ymm6
- vpunpckhqdq %ymm23,%ymm21,%ymm7
- vpunpcklqdq %ymm26,%ymm24,%ymm8
- vpunpcklqdq %ymm27,%ymm25,%ymm9
- vpunpckhqdq %ymm26,%ymm24,%ymm10
- vpunpckhqdq %ymm27,%ymm25,%ymm11
- vpunpcklqdq %ymm30,%ymm28,%ymm12
- vpunpcklqdq %ymm31,%ymm29,%ymm13
- vpunpckhqdq %ymm30,%ymm28,%ymm14
- vpunpckhqdq %ymm31,%ymm29,%ymm15
-
- # interleave 128-bit words in state n, n+4
- # xor/write first four blocks
- vmovdqa64 %ymm0,%ymm16
- vperm2i128 $0x20,%ymm4,%ymm0,%ymm0
- cmp $0x0020,%rcx
- jl .Lxorpart8
- vpxord 0x0000(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0000(%rsi)
- vmovdqa64 %ymm16,%ymm0
- vperm2i128 $0x31,%ymm4,%ymm0,%ymm4
-
- vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
- cmp $0x0040,%rcx
- jl .Lxorpart8
- vpxord 0x0020(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0020(%rsi)
- vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
-
- vperm2i128 $0x20,%ymm6,%ymm2,%ymm0
- cmp $0x0060,%rcx
- jl .Lxorpart8
- vpxord 0x0040(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0040(%rsi)
- vperm2i128 $0x31,%ymm6,%ymm2,%ymm6
-
- vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
- cmp $0x0080,%rcx
- jl .Lxorpart8
- vpxord 0x0060(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0060(%rsi)
- vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
-
- vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
- cmp $0x00a0,%rcx
- jl .Lxorpart8
- vpxord 0x0080(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0080(%rsi)
- vperm2i128 $0x31,%ymm5,%ymm1,%ymm5
-
- vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
- cmp $0x00c0,%rcx
- jl .Lxorpart8
- vpxord 0x00a0(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x00a0(%rsi)
- vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
-
- vperm2i128 $0x20,%ymm7,%ymm3,%ymm0
- cmp $0x00e0,%rcx
- jl .Lxorpart8
- vpxord 0x00c0(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x00c0(%rsi)
- vperm2i128 $0x31,%ymm7,%ymm3,%ymm7
-
- vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
- cmp $0x0100,%rcx
- jl .Lxorpart8
- vpxord 0x00e0(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x00e0(%rsi)
- vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
-
- # xor remaining blocks, write to output
- vmovdqa64 %ymm4,%ymm0
- cmp $0x0120,%rcx
- jl .Lxorpart8
- vpxord 0x0100(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0100(%rsi)
-
- vmovdqa64 %ymm12,%ymm0
- cmp $0x0140,%rcx
- jl .Lxorpart8
- vpxord 0x0120(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0120(%rsi)
-
- vmovdqa64 %ymm6,%ymm0
- cmp $0x0160,%rcx
- jl .Lxorpart8
- vpxord 0x0140(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0140(%rsi)
-
- vmovdqa64 %ymm14,%ymm0
- cmp $0x0180,%rcx
- jl .Lxorpart8
- vpxord 0x0160(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0160(%rsi)
-
- vmovdqa64 %ymm5,%ymm0
- cmp $0x01a0,%rcx
- jl .Lxorpart8
- vpxord 0x0180(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0180(%rsi)
-
- vmovdqa64 %ymm13,%ymm0
- cmp $0x01c0,%rcx
- jl .Lxorpart8
- vpxord 0x01a0(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x01a0(%rsi)
-
- vmovdqa64 %ymm7,%ymm0
- cmp $0x01e0,%rcx
- jl .Lxorpart8
- vpxord 0x01c0(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x01c0(%rsi)
-
- vmovdqa64 %ymm15,%ymm0
- cmp $0x0200,%rcx
- jl .Lxorpart8
- vpxord 0x01e0(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x01e0(%rsi)
-
-.Ldone8:
- vzeroupper
- RET
-
-.Lxorpart8:
- # xor remaining bytes from partial register into output
- mov %rcx,%rax
- and $0x1f,%rcx
- jz .Ldone8
- mov %rax,%r9
- and $~0x1f,%r9
-
- mov $1,%rax
- shld %cl,%rax,%rax
- sub $1,%rax
- kmovq %rax,%k1
-
- vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z}
- vpxord %ymm0,%ymm1,%ymm1
- vmovdqu8 %ymm1,(%rsi,%r9){%k1}
-
- jmp .Ldone8
-
-SYM_FUNC_END(chacha_8block_xor_avx512vl)
diff --git a/arch/x86/lib/crypto/chacha-ssse3-x86_64.S b/arch/x86/lib/crypto/chacha-ssse3-x86_64.S
deleted file mode 100644
index 7111949cd5b9..000000000000
--- a/arch/x86/lib/crypto/chacha-ssse3-x86_64.S
+++ /dev/null
@@ -1,791 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
- *
- * Copyright (C) 2015 Martin Willi
- */
-
-#include <linux/linkage.h>
-#include <asm/frame.h>
-
-.section .rodata.cst16.ROT8, "aM", @progbits, 16
-.align 16
-ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
-.section .rodata.cst16.ROT16, "aM", @progbits, 16
-.align 16
-ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
-.section .rodata.cst16.CTRINC, "aM", @progbits, 16
-.align 16
-CTRINC: .octa 0x00000003000000020000000100000000
-
-.text
-
-/*
- * chacha_permute - permute one block
- *
- * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This
- * function performs matrix operations on four words in parallel, but requires
- * shuffling to rearrange the words after each round. 8/16-bit word rotation is
- * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
- * rotation uses traditional shift+OR.
- *
- * The round count is given in %r8d.
- *
- * Clobbers: %r8d, %xmm4-%xmm7
- */
-SYM_FUNC_START_LOCAL(chacha_permute)
-
- movdqa ROT8(%rip),%xmm4
- movdqa ROT16(%rip),%xmm5
-
-.Ldoubleround:
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm5,%xmm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm6
- pslld $12,%xmm6
- psrld $20,%xmm1
- por %xmm6,%xmm1
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm4,%xmm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm7
- pslld $7,%xmm7
- psrld $25,%xmm1
- por %xmm7,%xmm1
-
- # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
- pshufd $0x39,%xmm1,%xmm1
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- pshufd $0x4e,%xmm2,%xmm2
- # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
- pshufd $0x93,%xmm3,%xmm3
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm5,%xmm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm6
- pslld $12,%xmm6
- psrld $20,%xmm1
- por %xmm6,%xmm1
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm4,%xmm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm7
- pslld $7,%xmm7
- psrld $25,%xmm1
- por %xmm7,%xmm1
-
- # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
- pshufd $0x93,%xmm1,%xmm1
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- pshufd $0x4e,%xmm2,%xmm2
- # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
- pshufd $0x39,%xmm3,%xmm3
-
- sub $2,%r8d
- jnz .Ldoubleround
-
- RET
-SYM_FUNC_END(chacha_permute)
-
-SYM_FUNC_START(chacha_block_xor_ssse3)
- # %rdi: Input state matrix, s
- # %rsi: up to 1 data block output, o
- # %rdx: up to 1 data block input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
- FRAME_BEGIN
-
- # x0..3 = s0..3
- movdqu 0x00(%rdi),%xmm0
- movdqu 0x10(%rdi),%xmm1
- movdqu 0x20(%rdi),%xmm2
- movdqu 0x30(%rdi),%xmm3
- movdqa %xmm0,%xmm8
- movdqa %xmm1,%xmm9
- movdqa %xmm2,%xmm10
- movdqa %xmm3,%xmm11
-
- mov %rcx,%rax
- call chacha_permute
-
- # o0 = i0 ^ (x0 + s0)
- paddd %xmm8,%xmm0
- cmp $0x10,%rax
- jl .Lxorpart
- movdqu 0x00(%rdx),%xmm4
- pxor %xmm4,%xmm0
- movdqu %xmm0,0x00(%rsi)
- # o1 = i1 ^ (x1 + s1)
- paddd %xmm9,%xmm1
- movdqa %xmm1,%xmm0
- cmp $0x20,%rax
- jl .Lxorpart
- movdqu 0x10(%rdx),%xmm0
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x10(%rsi)
- # o2 = i2 ^ (x2 + s2)
- paddd %xmm10,%xmm2
- movdqa %xmm2,%xmm0
- cmp $0x30,%rax
- jl .Lxorpart
- movdqu 0x20(%rdx),%xmm0
- pxor %xmm2,%xmm0
- movdqu %xmm0,0x20(%rsi)
- # o3 = i3 ^ (x3 + s3)
- paddd %xmm11,%xmm3
- movdqa %xmm3,%xmm0
- cmp $0x40,%rax
- jl .Lxorpart
- movdqu 0x30(%rdx),%xmm0
- pxor %xmm3,%xmm0
- movdqu %xmm0,0x30(%rsi)
-
-.Ldone:
- FRAME_END
- RET
-
-.Lxorpart:
- # xor remaining bytes from partial register into output
- mov %rax,%r9
- and $0x0f,%r9
- jz .Ldone
- and $~0x0f,%rax
-
- mov %rsi,%r11
-
- lea 8(%rsp),%r10
- sub $0x10,%rsp
- and $~31,%rsp
-
- lea (%rdx,%rax),%rsi
- mov %rsp,%rdi
- mov %r9,%rcx
- rep movsb
-
- pxor 0x00(%rsp),%xmm0
- movdqa %xmm0,0x00(%rsp)
-
- mov %rsp,%rsi
- lea (%r11,%rax),%rdi
- mov %r9,%rcx
- rep movsb
-
- lea -8(%r10),%rsp
- jmp .Ldone
-
-SYM_FUNC_END(chacha_block_xor_ssse3)
-
-SYM_FUNC_START(hchacha_block_ssse3)
- # %rdi: Input state matrix, s
- # %rsi: output (8 32-bit words)
- # %edx: nrounds
- FRAME_BEGIN
-
- movdqu 0x00(%rdi),%xmm0
- movdqu 0x10(%rdi),%xmm1
- movdqu 0x20(%rdi),%xmm2
- movdqu 0x30(%rdi),%xmm3
-
- mov %edx,%r8d
- call chacha_permute
-
- movdqu %xmm0,0x00(%rsi)
- movdqu %xmm3,0x10(%rsi)
-
- FRAME_END
- RET
-SYM_FUNC_END(hchacha_block_ssse3)
-
-SYM_FUNC_START(chacha_4block_xor_ssse3)
- # %rdi: Input state matrix, s
- # %rsi: up to 4 data blocks output, o
- # %rdx: up to 4 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
-
- # This function encrypts four consecutive ChaCha blocks by loading the
- # the state matrix in SSE registers four times. As we need some scratch
- # registers, we save the first four registers on the stack. The
- # algorithm performs each operation on the corresponding word of each
- # state matrix, hence requires no word shuffling. For final XORing step
- # we transpose the matrix by interleaving 32- and then 64-bit words,
- # which allows us to do XOR in SSE registers. 8/16-bit word rotation is
- # done with the slightly better performing SSSE3 byte shuffling,
- # 7/12-bit word rotation uses traditional shift+OR.
-
- lea 8(%rsp),%r10
- sub $0x80,%rsp
- and $~63,%rsp
- mov %rcx,%rax
-
- # x0..15[0-3] = s0..3[0..3]
- movq 0x00(%rdi),%xmm1
- pshufd $0x00,%xmm1,%xmm0
- pshufd $0x55,%xmm1,%xmm1
- movq 0x08(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- movq 0x10(%rdi),%xmm5
- pshufd $0x00,%xmm5,%xmm4
- pshufd $0x55,%xmm5,%xmm5
- movq 0x18(%rdi),%xmm7
- pshufd $0x00,%xmm7,%xmm6
- pshufd $0x55,%xmm7,%xmm7
- movq 0x20(%rdi),%xmm9
- pshufd $0x00,%xmm9,%xmm8
- pshufd $0x55,%xmm9,%xmm9
- movq 0x28(%rdi),%xmm11
- pshufd $0x00,%xmm11,%xmm10
- pshufd $0x55,%xmm11,%xmm11
- movq 0x30(%rdi),%xmm13
- pshufd $0x00,%xmm13,%xmm12
- pshufd $0x55,%xmm13,%xmm13
- movq 0x38(%rdi),%xmm15
- pshufd $0x00,%xmm15,%xmm14
- pshufd $0x55,%xmm15,%xmm15
- # x0..3 on stack
- movdqa %xmm0,0x00(%rsp)
- movdqa %xmm1,0x10(%rsp)
- movdqa %xmm2,0x20(%rsp)
- movdqa %xmm3,0x30(%rsp)
-
- movdqa CTRINC(%rip),%xmm1
- movdqa ROT8(%rip),%xmm2
- movdqa ROT16(%rip),%xmm3
-
- # x12 += counter values 0-3
- paddd %xmm1,%xmm12
-
-.Ldoubleround4:
- # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
- movdqa 0x00(%rsp),%xmm0
- paddd %xmm4,%xmm0
- movdqa %xmm0,0x00(%rsp)
- pxor %xmm0,%xmm12
- pshufb %xmm3,%xmm12
- # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
- movdqa 0x10(%rsp),%xmm0
- paddd %xmm5,%xmm0
- movdqa %xmm0,0x10(%rsp)
- pxor %xmm0,%xmm13
- pshufb %xmm3,%xmm13
- # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
- movdqa 0x20(%rsp),%xmm0
- paddd %xmm6,%xmm0
- movdqa %xmm0,0x20(%rsp)
- pxor %xmm0,%xmm14
- pshufb %xmm3,%xmm14
- # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
- movdqa 0x30(%rsp),%xmm0
- paddd %xmm7,%xmm0
- movdqa %xmm0,0x30(%rsp)
- pxor %xmm0,%xmm15
- pshufb %xmm3,%xmm15
-
- # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm0
- pslld $12,%xmm0
- psrld $20,%xmm4
- por %xmm0,%xmm4
- # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm0
- pslld $12,%xmm0
- psrld $20,%xmm5
- por %xmm0,%xmm5
- # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
- paddd %xmm14,%xmm10
- pxor %xmm10,%xmm6
- movdqa %xmm6,%xmm0
- pslld $12,%xmm0
- psrld $20,%xmm6
- por %xmm0,%xmm6
- # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
- paddd %xmm15,%xmm11
- pxor %xmm11,%xmm7
- movdqa %xmm7,%xmm0
- pslld $12,%xmm0
- psrld $20,%xmm7
- por %xmm0,%xmm7
-
- # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
- movdqa 0x00(%rsp),%xmm0
- paddd %xmm4,%xmm0
- movdqa %xmm0,0x00(%rsp)
- pxor %xmm0,%xmm12
- pshufb %xmm2,%xmm12
- # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
- movdqa 0x10(%rsp),%xmm0
- paddd %xmm5,%xmm0
- movdqa %xmm0,0x10(%rsp)
- pxor %xmm0,%xmm13
- pshufb %xmm2,%xmm13
- # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
- movdqa 0x20(%rsp),%xmm0
- paddd %xmm6,%xmm0
- movdqa %xmm0,0x20(%rsp)
- pxor %xmm0,%xmm14
- pshufb %xmm2,%xmm14
- # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
- movdqa 0x30(%rsp),%xmm0
- paddd %xmm7,%xmm0
- movdqa %xmm0,0x30(%rsp)
- pxor %xmm0,%xmm15
- pshufb %xmm2,%xmm15
-
- # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm0
- pslld $7,%xmm0
- psrld $25,%xmm4
- por %xmm0,%xmm4
- # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm0
- pslld $7,%xmm0
- psrld $25,%xmm5
- por %xmm0,%xmm5
- # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
- paddd %xmm14,%xmm10
- pxor %xmm10,%xmm6
- movdqa %xmm6,%xmm0
- pslld $7,%xmm0
- psrld $25,%xmm6
- por %xmm0,%xmm6
- # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
- paddd %xmm15,%xmm11
- pxor %xmm11,%xmm7
- movdqa %xmm7,%xmm0
- pslld $7,%xmm0
- psrld $25,%xmm7
- por %xmm0,%xmm7
-
- # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
- movdqa 0x00(%rsp),%xmm0
- paddd %xmm5,%xmm0
- movdqa %xmm0,0x00(%rsp)
- pxor %xmm0,%xmm15
- pshufb %xmm3,%xmm15
- # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
- movdqa 0x10(%rsp),%xmm0
- paddd %xmm6,%xmm0
- movdqa %xmm0,0x10(%rsp)
- pxor %xmm0,%xmm12
- pshufb %xmm3,%xmm12
- # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
- movdqa 0x20(%rsp),%xmm0
- paddd %xmm7,%xmm0
- movdqa %xmm0,0x20(%rsp)
- pxor %xmm0,%xmm13
- pshufb %xmm3,%xmm13
- # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
- movdqa 0x30(%rsp),%xmm0
- paddd %xmm4,%xmm0
- movdqa %xmm0,0x30(%rsp)
- pxor %xmm0,%xmm14
- pshufb %xmm3,%xmm14
-
- # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
- paddd %xmm15,%xmm10
- pxor %xmm10,%xmm5
- movdqa %xmm5,%xmm0
- pslld $12,%xmm0
- psrld $20,%xmm5
- por %xmm0,%xmm5
- # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
- paddd %xmm12,%xmm11
- pxor %xmm11,%xmm6
- movdqa %xmm6,%xmm0
- pslld $12,%xmm0
- psrld $20,%xmm6
- por %xmm0,%xmm6
- # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
- paddd %xmm13,%xmm8
- pxor %xmm8,%xmm7
- movdqa %xmm7,%xmm0
- pslld $12,%xmm0
- psrld $20,%xmm7
- por %xmm0,%xmm7
- # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
- paddd %xmm14,%xmm9
- pxor %xmm9,%xmm4
- movdqa %xmm4,%xmm0
- pslld $12,%xmm0
- psrld $20,%xmm4
- por %xmm0,%xmm4
-
- # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
- movdqa 0x00(%rsp),%xmm0
- paddd %xmm5,%xmm0
- movdqa %xmm0,0x00(%rsp)
- pxor %xmm0,%xmm15
- pshufb %xmm2,%xmm15
- # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
- movdqa 0x10(%rsp),%xmm0
- paddd %xmm6,%xmm0
- movdqa %xmm0,0x10(%rsp)
- pxor %xmm0,%xmm12
- pshufb %xmm2,%xmm12
- # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
- movdqa 0x20(%rsp),%xmm0
- paddd %xmm7,%xmm0
- movdqa %xmm0,0x20(%rsp)
- pxor %xmm0,%xmm13
- pshufb %xmm2,%xmm13
- # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
- movdqa 0x30(%rsp),%xmm0
- paddd %xmm4,%xmm0
- movdqa %xmm0,0x30(%rsp)
- pxor %xmm0,%xmm14
- pshufb %xmm2,%xmm14
-
- # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
- paddd %xmm15,%xmm10
- pxor %xmm10,%xmm5
- movdqa %xmm5,%xmm0
- pslld $7,%xmm0
- psrld $25,%xmm5
- por %xmm0,%xmm5
- # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
- paddd %xmm12,%xmm11
- pxor %xmm11,%xmm6
- movdqa %xmm6,%xmm0
- pslld $7,%xmm0
- psrld $25,%xmm6
- por %xmm0,%xmm6
- # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
- paddd %xmm13,%xmm8
- pxor %xmm8,%xmm7
- movdqa %xmm7,%xmm0
- pslld $7,%xmm0
- psrld $25,%xmm7
- por %xmm0,%xmm7
- # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
- paddd %xmm14,%xmm9
- pxor %xmm9,%xmm4
- movdqa %xmm4,%xmm0
- pslld $7,%xmm0
- psrld $25,%xmm4
- por %xmm0,%xmm4
-
- sub $2,%r8d
- jnz .Ldoubleround4
-
- # x0[0-3] += s0[0]
- # x1[0-3] += s0[1]
- movq 0x00(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- paddd 0x00(%rsp),%xmm2
- movdqa %xmm2,0x00(%rsp)
- paddd 0x10(%rsp),%xmm3
- movdqa %xmm3,0x10(%rsp)
- # x2[0-3] += s0[2]
- # x3[0-3] += s0[3]
- movq 0x08(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- paddd 0x20(%rsp),%xmm2
- movdqa %xmm2,0x20(%rsp)
- paddd 0x30(%rsp),%xmm3
- movdqa %xmm3,0x30(%rsp)
-
- # x4[0-3] += s1[0]
- # x5[0-3] += s1[1]
- movq 0x10(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- paddd %xmm2,%xmm4
- paddd %xmm3,%xmm5
- # x6[0-3] += s1[2]
- # x7[0-3] += s1[3]
- movq 0x18(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- paddd %xmm2,%xmm6
- paddd %xmm3,%xmm7
-
- # x8[0-3] += s2[0]
- # x9[0-3] += s2[1]
- movq 0x20(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- paddd %xmm2,%xmm8
- paddd %xmm3,%xmm9
- # x10[0-3] += s2[2]
- # x11[0-3] += s2[3]
- movq 0x28(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- paddd %xmm2,%xmm10
- paddd %xmm3,%xmm11
-
- # x12[0-3] += s3[0]
- # x13[0-3] += s3[1]
- movq 0x30(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- paddd %xmm2,%xmm12
- paddd %xmm3,%xmm13
- # x14[0-3] += s3[2]
- # x15[0-3] += s3[3]
- movq 0x38(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- paddd %xmm2,%xmm14
- paddd %xmm3,%xmm15
-
- # x12 += counter values 0-3
- paddd %xmm1,%xmm12
-
- # interleave 32-bit words in state n, n+1
- movdqa 0x00(%rsp),%xmm0
- movdqa 0x10(%rsp),%xmm1
- movdqa %xmm0,%xmm2
- punpckldq %xmm1,%xmm2
- punpckhdq %xmm1,%xmm0
- movdqa %xmm2,0x00(%rsp)
- movdqa %xmm0,0x10(%rsp)
- movdqa 0x20(%rsp),%xmm0
- movdqa 0x30(%rsp),%xmm1
- movdqa %xmm0,%xmm2
- punpckldq %xmm1,%xmm2
- punpckhdq %xmm1,%xmm0
- movdqa %xmm2,0x20(%rsp)
- movdqa %xmm0,0x30(%rsp)
- movdqa %xmm4,%xmm0
- punpckldq %xmm5,%xmm4
- punpckhdq %xmm5,%xmm0
- movdqa %xmm0,%xmm5
- movdqa %xmm6,%xmm0
- punpckldq %xmm7,%xmm6
- punpckhdq %xmm7,%xmm0
- movdqa %xmm0,%xmm7
- movdqa %xmm8,%xmm0
- punpckldq %xmm9,%xmm8
- punpckhdq %xmm9,%xmm0
- movdqa %xmm0,%xmm9
- movdqa %xmm10,%xmm0
- punpckldq %xmm11,%xmm10
- punpckhdq %xmm11,%xmm0
- movdqa %xmm0,%xmm11
- movdqa %xmm12,%xmm0
- punpckldq %xmm13,%xmm12
- punpckhdq %xmm13,%xmm0
- movdqa %xmm0,%xmm13
- movdqa %xmm14,%xmm0
- punpckldq %xmm15,%xmm14
- punpckhdq %xmm15,%xmm0
- movdqa %xmm0,%xmm15
-
- # interleave 64-bit words in state n, n+2
- movdqa 0x00(%rsp),%xmm0
- movdqa 0x20(%rsp),%xmm1
- movdqa %xmm0,%xmm2
- punpcklqdq %xmm1,%xmm2
- punpckhqdq %xmm1,%xmm0
- movdqa %xmm2,0x00(%rsp)
- movdqa %xmm0,0x20(%rsp)
- movdqa 0x10(%rsp),%xmm0
- movdqa 0x30(%rsp),%xmm1
- movdqa %xmm0,%xmm2
- punpcklqdq %xmm1,%xmm2
- punpckhqdq %xmm1,%xmm0
- movdqa %xmm2,0x10(%rsp)
- movdqa %xmm0,0x30(%rsp)
- movdqa %xmm4,%xmm0
- punpcklqdq %xmm6,%xmm4
- punpckhqdq %xmm6,%xmm0
- movdqa %xmm0,%xmm6
- movdqa %xmm5,%xmm0
- punpcklqdq %xmm7,%xmm5
- punpckhqdq %xmm7,%xmm0
- movdqa %xmm0,%xmm7
- movdqa %xmm8,%xmm0
- punpcklqdq %xmm10,%xmm8
- punpckhqdq %xmm10,%xmm0
- movdqa %xmm0,%xmm10
- movdqa %xmm9,%xmm0
- punpcklqdq %xmm11,%xmm9
- punpckhqdq %xmm11,%xmm0
- movdqa %xmm0,%xmm11
- movdqa %xmm12,%xmm0
- punpcklqdq %xmm14,%xmm12
- punpckhqdq %xmm14,%xmm0
- movdqa %xmm0,%xmm14
- movdqa %xmm13,%xmm0
- punpcklqdq %xmm15,%xmm13
- punpckhqdq %xmm15,%xmm0
- movdqa %xmm0,%xmm15
-
- # xor with corresponding input, write to output
- movdqa 0x00(%rsp),%xmm0
- cmp $0x10,%rax
- jl .Lxorpart4
- movdqu 0x00(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x00(%rsi)
-
- movdqu %xmm4,%xmm0
- cmp $0x20,%rax
- jl .Lxorpart4
- movdqu 0x10(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x10(%rsi)
-
- movdqu %xmm8,%xmm0
- cmp $0x30,%rax
- jl .Lxorpart4
- movdqu 0x20(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x20(%rsi)
-
- movdqu %xmm12,%xmm0
- cmp $0x40,%rax
- jl .Lxorpart4
- movdqu 0x30(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x30(%rsi)
-
- movdqa 0x20(%rsp),%xmm0
- cmp $0x50,%rax
- jl .Lxorpart4
- movdqu 0x40(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x40(%rsi)
-
- movdqu %xmm6,%xmm0
- cmp $0x60,%rax
- jl .Lxorpart4
- movdqu 0x50(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x50(%rsi)
-
- movdqu %xmm10,%xmm0
- cmp $0x70,%rax
- jl .Lxorpart4
- movdqu 0x60(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x60(%rsi)
-
- movdqu %xmm14,%xmm0
- cmp $0x80,%rax
- jl .Lxorpart4
- movdqu 0x70(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x70(%rsi)
-
- movdqa 0x10(%rsp),%xmm0
- cmp $0x90,%rax
- jl .Lxorpart4
- movdqu 0x80(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x80(%rsi)
-
- movdqu %xmm5,%xmm0
- cmp $0xa0,%rax
- jl .Lxorpart4
- movdqu 0x90(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x90(%rsi)
-
- movdqu %xmm9,%xmm0
- cmp $0xb0,%rax
- jl .Lxorpart4
- movdqu 0xa0(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0xa0(%rsi)
-
- movdqu %xmm13,%xmm0
- cmp $0xc0,%rax
- jl .Lxorpart4
- movdqu 0xb0(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0xb0(%rsi)
-
- movdqa 0x30(%rsp),%xmm0
- cmp $0xd0,%rax
- jl .Lxorpart4
- movdqu 0xc0(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0xc0(%rsi)
-
- movdqu %xmm7,%xmm0
- cmp $0xe0,%rax
- jl .Lxorpart4
- movdqu 0xd0(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0xd0(%rsi)
-
- movdqu %xmm11,%xmm0
- cmp $0xf0,%rax
- jl .Lxorpart4
- movdqu 0xe0(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0xe0(%rsi)
-
- movdqu %xmm15,%xmm0
- cmp $0x100,%rax
- jl .Lxorpart4
- movdqu 0xf0(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0xf0(%rsi)
-
-.Ldone4:
- lea -8(%r10),%rsp
- RET
-
-.Lxorpart4:
- # xor remaining bytes from partial register into output
- mov %rax,%r9
- and $0x0f,%r9
- jz .Ldone4
- and $~0x0f,%rax
-
- mov %rsi,%r11
-
- lea (%rdx,%rax),%rsi
- mov %rsp,%rdi
- mov %r9,%rcx
- rep movsb
-
- pxor 0x00(%rsp),%xmm0
- movdqa %xmm0,0x00(%rsp)
-
- mov %rsp,%rsi
- lea (%r11,%rax),%rdi
- mov %r9,%rcx
- rep movsb
-
- jmp .Ldone4
-
-SYM_FUNC_END(chacha_4block_xor_ssse3)
diff --git a/arch/x86/lib/crypto/chacha_glue.c b/arch/x86/lib/crypto/chacha_glue.c
deleted file mode 100644
index 10b2c945f541..000000000000
--- a/arch/x86/lib/crypto/chacha_glue.c
+++ /dev/null
@@ -1,196 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * ChaCha and HChaCha functions (x86_64 optimized)
- *
- * Copyright (C) 2015 Martin Willi
- */
-
-#include <asm/simd.h>
-#include <crypto/chacha.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/sizes.h>
-
-asmlinkage void chacha_block_xor_ssse3(const struct chacha_state *state,
- u8 *dst, const u8 *src,
- unsigned int len, int nrounds);
-asmlinkage void chacha_4block_xor_ssse3(const struct chacha_state *state,
- u8 *dst, const u8 *src,
- unsigned int len, int nrounds);
-asmlinkage void hchacha_block_ssse3(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds);
-
-asmlinkage void chacha_2block_xor_avx2(const struct chacha_state *state,
- u8 *dst, const u8 *src,
- unsigned int len, int nrounds);
-asmlinkage void chacha_4block_xor_avx2(const struct chacha_state *state,
- u8 *dst, const u8 *src,
- unsigned int len, int nrounds);
-asmlinkage void chacha_8block_xor_avx2(const struct chacha_state *state,
- u8 *dst, const u8 *src,
- unsigned int len, int nrounds);
-
-asmlinkage void chacha_2block_xor_avx512vl(const struct chacha_state *state,
- u8 *dst, const u8 *src,
- unsigned int len, int nrounds);
-asmlinkage void chacha_4block_xor_avx512vl(const struct chacha_state *state,
- u8 *dst, const u8 *src,
- unsigned int len, int nrounds);
-asmlinkage void chacha_8block_xor_avx512vl(const struct chacha_state *state,
- u8 *dst, const u8 *src,
- unsigned int len, int nrounds);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl);
-
-static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
-{
- len = min(len, maxblocks * CHACHA_BLOCK_SIZE);
- return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
-}
-
-static void chacha_dosimd(struct chacha_state *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
-{
- if (static_branch_likely(&chacha_use_avx512vl)) {
- while (bytes >= CHACHA_BLOCK_SIZE * 8) {
- chacha_8block_xor_avx512vl(state, dst, src, bytes,
- nrounds);
- bytes -= CHACHA_BLOCK_SIZE * 8;
- src += CHACHA_BLOCK_SIZE * 8;
- dst += CHACHA_BLOCK_SIZE * 8;
- state->x[12] += 8;
- }
- if (bytes > CHACHA_BLOCK_SIZE * 4) {
- chacha_8block_xor_avx512vl(state, dst, src, bytes,
- nrounds);
- state->x[12] += chacha_advance(bytes, 8);
- return;
- }
- if (bytes > CHACHA_BLOCK_SIZE * 2) {
- chacha_4block_xor_avx512vl(state, dst, src, bytes,
- nrounds);
- state->x[12] += chacha_advance(bytes, 4);
- return;
- }
- if (bytes) {
- chacha_2block_xor_avx512vl(state, dst, src, bytes,
- nrounds);
- state->x[12] += chacha_advance(bytes, 2);
- return;
- }
- }
-
- if (static_branch_likely(&chacha_use_avx2)) {
- while (bytes >= CHACHA_BLOCK_SIZE * 8) {
- chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
- bytes -= CHACHA_BLOCK_SIZE * 8;
- src += CHACHA_BLOCK_SIZE * 8;
- dst += CHACHA_BLOCK_SIZE * 8;
- state->x[12] += 8;
- }
- if (bytes > CHACHA_BLOCK_SIZE * 4) {
- chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
- state->x[12] += chacha_advance(bytes, 8);
- return;
- }
- if (bytes > CHACHA_BLOCK_SIZE * 2) {
- chacha_4block_xor_avx2(state, dst, src, bytes, nrounds);
- state->x[12] += chacha_advance(bytes, 4);
- return;
- }
- if (bytes > CHACHA_BLOCK_SIZE) {
- chacha_2block_xor_avx2(state, dst, src, bytes, nrounds);
- state->x[12] += chacha_advance(bytes, 2);
- return;
- }
- }
-
- while (bytes >= CHACHA_BLOCK_SIZE * 4) {
- chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
- bytes -= CHACHA_BLOCK_SIZE * 4;
- src += CHACHA_BLOCK_SIZE * 4;
- dst += CHACHA_BLOCK_SIZE * 4;
- state->x[12] += 4;
- }
- if (bytes > CHACHA_BLOCK_SIZE) {
- chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
- state->x[12] += chacha_advance(bytes, 4);
- return;
- }
- if (bytes) {
- chacha_block_xor_ssse3(state, dst, src, bytes, nrounds);
- state->x[12]++;
- }
-}
-
-void hchacha_block_arch(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds)
-{
- if (!static_branch_likely(&chacha_use_simd)) {
- hchacha_block_generic(state, out, nrounds);
- } else {
- kernel_fpu_begin();
- hchacha_block_ssse3(state, out, nrounds);
- kernel_fpu_end();
- }
-}
-EXPORT_SYMBOL(hchacha_block_arch);
-
-void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
-{
- if (!static_branch_likely(&chacha_use_simd) ||
- bytes <= CHACHA_BLOCK_SIZE)
- return chacha_crypt_generic(state, dst, src, bytes, nrounds);
-
- do {
- unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
-
- kernel_fpu_begin();
- chacha_dosimd(state, dst, src, todo, nrounds);
- kernel_fpu_end();
-
- bytes -= todo;
- src += todo;
- dst += todo;
- } while (bytes);
-}
-EXPORT_SYMBOL(chacha_crypt_arch);
-
-bool chacha_is_arch_optimized(void)
-{
- return static_key_enabled(&chacha_use_simd);
-}
-EXPORT_SYMBOL(chacha_is_arch_optimized);
-
-static int __init chacha_simd_mod_init(void)
-{
- if (!boot_cpu_has(X86_FEATURE_SSSE3))
- return 0;
-
- static_branch_enable(&chacha_use_simd);
-
- if (boot_cpu_has(X86_FEATURE_AVX) &&
- boot_cpu_has(X86_FEATURE_AVX2) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
- static_branch_enable(&chacha_use_avx2);
-
- if (boot_cpu_has(X86_FEATURE_AVX512VL) &&
- boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */
- static_branch_enable(&chacha_use_avx512vl);
- }
- return 0;
-}
-subsys_initcall(chacha_simd_mod_init);
-
-static void __exit chacha_simd_mod_exit(void)
-{
-}
-module_exit(chacha_simd_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
-MODULE_DESCRIPTION("ChaCha and HChaCha functions (x86_64 optimized)");
diff --git a/arch/x86/lib/crypto/poly1305-x86_64-cryptogams.pl b/arch/x86/lib/crypto/poly1305-x86_64-cryptogams.pl
deleted file mode 100644
index 501827254fed..000000000000
--- a/arch/x86/lib/crypto/poly1305-x86_64-cryptogams.pl
+++ /dev/null
@@ -1,4253 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
-#
-# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
-# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
-# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
-#
-# This code is taken from the OpenSSL project but the author, Andy Polyakov,
-# has relicensed it under the licenses specified in the SPDX header above.
-# The original headers, including the original license headers, are
-# included below for completeness.
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# This module implements Poly1305 hash for x86_64.
-#
-# March 2015
-#
-# Initial release.
-#
-# December 2016
-#
-# Add AVX512F+VL+BW code path.
-#
-# November 2017
-#
-# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
-# executed even on Knights Landing. Trigger for modification was
-# observation that AVX512 code paths can negatively affect overall
-# Skylake-X system performance. Since we are likely to suppress
-# AVX512F capability flag [at least on Skylake-X], conversion serves
-# as kind of "investment protection". Note that next *lake processor,
-# Cannonlake, has AVX512IFMA code path to execute...
-#
-# Numbers are cycles per processed byte with poly1305_blocks alone,
-# measured with rdtsc at fixed clock frequency.
-#
-# IALU/gcc-4.8(*) AVX(**) AVX2 AVX-512
-# P4 4.46/+120% -
-# Core 2 2.41/+90% -
-# Westmere 1.88/+120% -
-# Sandy Bridge 1.39/+140% 1.10
-# Haswell 1.14/+175% 1.11 0.65
-# Skylake[-X] 1.13/+120% 0.96 0.51 [0.35]
-# Silvermont 2.83/+95% -
-# Knights L 3.60/? 1.65 1.10 0.41(***)
-# Goldmont 1.70/+180% -
-# VIA Nano 1.82/+150% -
-# Sledgehammer 1.38/+160% -
-# Bulldozer 2.30/+130% 0.97
-# Ryzen 1.15/+200% 1.08 1.18
-#
-# (*) improvement coefficients relative to clang are more modest and
-# are ~50% on most processors, in both cases we are comparing to
-# __int128 code;
-# (**) SSE2 implementation was attempted, but among non-AVX processors
-# it was faster than integer-only code only on older Intel P4 and
-# Core processors, 50-30%, less newer processor is, but slower on
-# contemporary ones, for example almost 2x slower on Atom, and as
-# former are naturally disappearing, SSE2 is deemed unnecessary;
-# (***) strangely enough performance seems to vary from core to core,
-# listed result is best case;
-
-$flavour = shift;
-$output = shift;
-if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
-
-$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
-$kernel=0; $kernel=1 if (!$flavour && !$output);
-
-if (!$kernel) {
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
- die "can't locate x86_64-xlate.pl";
-
- open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
- *STDOUT=*OUT;
-
- if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
- =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
- $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
- }
-
- if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
- `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
- $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
- $avx += 1 if ($1==2.11 && $2>=8);
- }
-
- if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
- `ml64 2>&1` =~ /Version ([0-9]+)\./) {
- $avx = ($1>=10) + ($1>=11);
- }
-
- if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
- $avx = ($2>=3.0) + ($2>3.0);
- }
-} else {
- $avx = 4; # The kernel uses ifdefs for this.
-}
-
-sub declare_function() {
- my ($name, $align, $nargs) = @_;
- if($kernel) {
- $code .= "SYM_FUNC_START($name)\n";
- $code .= ".L$name:\n";
- } else {
- $code .= ".globl $name\n";
- $code .= ".type $name,\@function,$nargs\n";
- $code .= ".align $align\n";
- $code .= "$name:\n";
- }
-}
-
-sub declare_typed_function() {
- my ($name, $align, $nargs) = @_;
- if($kernel) {
- $code .= "SYM_TYPED_FUNC_START($name)\n";
- $code .= ".L$name:\n";
- } else {
- $code .= ".globl $name\n";
- $code .= ".type $name,\@function,$nargs\n";
- $code .= ".align $align\n";
- $code .= "$name:\n";
- }
-}
-
-sub end_function() {
- my ($name) = @_;
- if($kernel) {
- $code .= "SYM_FUNC_END($name)\n";
- } else {
- $code .= ".size $name,.-$name\n";
- }
-}
-
-$code.=<<___ if $kernel;
-#include <linux/cfi_types.h>
-___
-
-if ($avx) {
-$code.=<<___ if $kernel;
-.section .rodata
-___
-$code.=<<___;
-.align 64
-.Lconst:
-.Lmask24:
-.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
-.L129:
-.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
-.Lmask26:
-.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
-.Lpermd_avx2:
-.long 2,2,2,3,2,0,2,1
-.Lpermd_avx512:
-.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
-
-.L2_44_inp_permd:
-.long 0,1,1,2,2,3,7,7
-.L2_44_inp_shift:
-.quad 0,12,24,64
-.L2_44_mask:
-.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
-.L2_44_shift_rgt:
-.quad 44,44,42,64
-.L2_44_shift_lft:
-.quad 8,8,10,64
-
-.align 64
-.Lx_mask44:
-.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
-.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
-.Lx_mask42:
-.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
-.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
-___
-}
-$code.=<<___ if (!$kernel);
-.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
-.align 16
-___
-
-my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
-my ($mac,$nonce)=($inp,$len); # *_emit arguments
-my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
-my ($h0,$h1,$h2)=("%r14","%rbx","%r10");
-
-sub poly1305_iteration {
-# input: copy of $r1 in %rax, $h0-$h2, $r0-$r1
-# output: $h0-$h2 *= $r0-$r1
-$code.=<<___;
- mulq $h0 # h0*r1
- mov %rax,$d2
- mov $r0,%rax
- mov %rdx,$d3
-
- mulq $h0 # h0*r0
- mov %rax,$h0 # future $h0
- mov $r0,%rax
- mov %rdx,$d1
-
- mulq $h1 # h1*r0
- add %rax,$d2
- mov $s1,%rax
- adc %rdx,$d3
-
- mulq $h1 # h1*s1
- mov $h2,$h1 # borrow $h1
- add %rax,$h0
- adc %rdx,$d1
-
- imulq $s1,$h1 # h2*s1
- add $h1,$d2
- mov $d1,$h1
- adc \$0,$d3
-
- imulq $r0,$h2 # h2*r0
- add $d2,$h1
- mov \$-4,%rax # mask value
- adc $h2,$d3
-
- and $d3,%rax # last reduction step
- mov $d3,$h2
- shr \$2,$d3
- and \$3,$h2
- add $d3,%rax
- add %rax,$h0
- adc \$0,$h1
- adc \$0,$h2
-___
-}
-
-########################################################################
-# Layout of opaque area is following.
-#
-# unsigned __int64 h[3]; # current hash value base 2^64
-# unsigned __int64 r[2]; # key value base 2^64
-
-$code.=<<___;
-.text
-___
-$code.=<<___ if (!$kernel);
-.extern OPENSSL_ia32cap_P
-
-.globl poly1305_block_init_arch
-.hidden poly1305_block_init_arch
-.globl poly1305_blocks_x86_64
-.hidden poly1305_blocks_x86_64
-.globl poly1305_emit_x86_64
-.hidden poly1305_emit_x86_64
-___
-&declare_typed_function("poly1305_block_init_arch", 32, 3);
-$code.=<<___;
- xor %eax,%eax
- mov %rax,0($ctx) # initialize hash value
- mov %rax,8($ctx)
- mov %rax,16($ctx)
-
- test $inp,$inp
- je .Lno_key
-___
-$code.=<<___ if (!$kernel);
- lea poly1305_blocks_x86_64(%rip),%r10
- lea poly1305_emit_x86_64(%rip),%r11
-___
-$code.=<<___ if (!$kernel && $avx);
- mov OPENSSL_ia32cap_P+4(%rip),%r9
- lea poly1305_blocks_avx(%rip),%rax
- lea poly1305_emit_avx(%rip),%rcx
- bt \$`60-32`,%r9 # AVX?
- cmovc %rax,%r10
- cmovc %rcx,%r11
-___
-$code.=<<___ if (!$kernel && $avx>1);
- lea poly1305_blocks_avx2(%rip),%rax
- bt \$`5+32`,%r9 # AVX2?
- cmovc %rax,%r10
-___
-$code.=<<___ if (!$kernel && $avx>3);
- mov \$`(1<<31|1<<21|1<<16)`,%rax
- shr \$32,%r9
- and %rax,%r9
- cmp %rax,%r9
- je .Linit_base2_44
-___
-$code.=<<___;
- mov \$0x0ffffffc0fffffff,%rax
- mov \$0x0ffffffc0ffffffc,%rcx
- and 0($inp),%rax
- and 8($inp),%rcx
- mov %rax,24($ctx)
- mov %rcx,32($ctx)
-___
-$code.=<<___ if (!$kernel && $flavour !~ /elf32/);
- mov %r10,0(%rdx)
- mov %r11,8(%rdx)
-___
-$code.=<<___ if (!$kernel && $flavour =~ /elf32/);
- mov %r10d,0(%rdx)
- mov %r11d,4(%rdx)
-___
-$code.=<<___;
- mov \$1,%eax
-.Lno_key:
- RET
-___
-&end_function("poly1305_block_init_arch");
-
-&declare_function("poly1305_blocks_x86_64", 32, 4);
-$code.=<<___;
-.cfi_startproc
-.Lblocks:
- shr \$4,$len
- jz .Lno_data # too short
-
- push %rbx
-.cfi_push %rbx
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
- push %r14
-.cfi_push %r14
- push %r15
-.cfi_push %r15
- push $ctx
-.cfi_push $ctx
-.Lblocks_body:
-
- mov $len,%r15 # reassign $len
-
- mov 24($ctx),$r0 # load r
- mov 32($ctx),$s1
-
- mov 0($ctx),$h0 # load hash value
- mov 8($ctx),$h1
- mov 16($ctx),$h2
-
- mov $s1,$r1
- shr \$2,$s1
- mov $r1,%rax
- add $r1,$s1 # s1 = r1 + (r1 >> 2)
- jmp .Loop
-
-.align 32
-.Loop:
- add 0($inp),$h0 # accumulate input
- adc 8($inp),$h1
- lea 16($inp),$inp
- adc $padbit,$h2
-___
-
- &poly1305_iteration();
-
-$code.=<<___;
- mov $r1,%rax
- dec %r15 # len-=16
- jnz .Loop
-
- mov 0(%rsp),$ctx
-.cfi_restore $ctx
-
- mov $h0,0($ctx) # store hash value
- mov $h1,8($ctx)
- mov $h2,16($ctx)
-
- mov 8(%rsp),%r15
-.cfi_restore %r15
- mov 16(%rsp),%r14
-.cfi_restore %r14
- mov 24(%rsp),%r13
-.cfi_restore %r13
- mov 32(%rsp),%r12
-.cfi_restore %r12
- mov 40(%rsp),%rbx
-.cfi_restore %rbx
- lea 48(%rsp),%rsp
-.cfi_adjust_cfa_offset -48
-.Lno_data:
-.Lblocks_epilogue:
- RET
-.cfi_endproc
-___
-&end_function("poly1305_blocks_x86_64");
-
-&declare_function("poly1305_emit_x86_64", 32, 3);
-$code.=<<___;
-.Lemit:
- mov 0($ctx),%r8 # load hash value
- mov 8($ctx),%r9
- mov 16($ctx),%r10
-
- mov %r8,%rax
- add \$5,%r8 # compare to modulus
- mov %r9,%rcx
- adc \$0,%r9
- adc \$0,%r10
- shr \$2,%r10 # did 130-bit value overflow?
- cmovnz %r8,%rax
- cmovnz %r9,%rcx
-
- add 0($nonce),%rax # accumulate nonce
- adc 8($nonce),%rcx
- mov %rax,0($mac) # write result
- mov %rcx,8($mac)
-
- RET
-___
-&end_function("poly1305_emit_x86_64");
-if ($avx) {
-
-########################################################################
-# Layout of opaque area is following.
-#
-# unsigned __int32 h[5]; # current hash value base 2^26
-# unsigned __int32 is_base2_26;
-# unsigned __int64 r[2]; # key value base 2^64
-# unsigned __int64 pad;
-# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
-#
-# where r^n are base 2^26 digits of degrees of multiplier key. There are
-# 5 digits, but last four are interleaved with multiples of 5, totalling
-# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
-
-my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
- map("%xmm$_",(0..15));
-
-$code.=<<___;
-.type __poly1305_block,\@abi-omnipotent
-.align 32
-__poly1305_block:
- push $ctx
-___
- &poly1305_iteration();
-$code.=<<___;
- pop $ctx
- RET
-.size __poly1305_block,.-__poly1305_block
-
-.type __poly1305_init_avx,\@abi-omnipotent
-.align 32
-__poly1305_init_avx:
- push %rbp
- mov %rsp,%rbp
- mov $r0,$h0
- mov $r1,$h1
- xor $h2,$h2
-
- lea 48+64($ctx),$ctx # size optimization
-
- mov $r1,%rax
- call __poly1305_block # r^2
-
- mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26
- mov \$0x3ffffff,%edx
- mov $h0,$d1
- and $h0#d,%eax
- mov $r0,$d2
- and $r0#d,%edx
- mov %eax,`16*0+0-64`($ctx)
- shr \$26,$d1
- mov %edx,`16*0+4-64`($ctx)
- shr \$26,$d2
-
- mov \$0x3ffffff,%eax
- mov \$0x3ffffff,%edx
- and $d1#d,%eax
- and $d2#d,%edx
- mov %eax,`16*1+0-64`($ctx)
- lea (%rax,%rax,4),%eax # *5
- mov %edx,`16*1+4-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- mov %eax,`16*2+0-64`($ctx)
- shr \$26,$d1
- mov %edx,`16*2+4-64`($ctx)
- shr \$26,$d2
-
- mov $h1,%rax
- mov $r1,%rdx
- shl \$12,%rax
- shl \$12,%rdx
- or $d1,%rax
- or $d2,%rdx
- and \$0x3ffffff,%eax
- and \$0x3ffffff,%edx
- mov %eax,`16*3+0-64`($ctx)
- lea (%rax,%rax,4),%eax # *5
- mov %edx,`16*3+4-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- mov %eax,`16*4+0-64`($ctx)
- mov $h1,$d1
- mov %edx,`16*4+4-64`($ctx)
- mov $r1,$d2
-
- mov \$0x3ffffff,%eax
- mov \$0x3ffffff,%edx
- shr \$14,$d1
- shr \$14,$d2
- and $d1#d,%eax
- and $d2#d,%edx
- mov %eax,`16*5+0-64`($ctx)
- lea (%rax,%rax,4),%eax # *5
- mov %edx,`16*5+4-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- mov %eax,`16*6+0-64`($ctx)
- shr \$26,$d1
- mov %edx,`16*6+4-64`($ctx)
- shr \$26,$d2
-
- mov $h2,%rax
- shl \$24,%rax
- or %rax,$d1
- mov $d1#d,`16*7+0-64`($ctx)
- lea ($d1,$d1,4),$d1 # *5
- mov $d2#d,`16*7+4-64`($ctx)
- lea ($d2,$d2,4),$d2 # *5
- mov $d1#d,`16*8+0-64`($ctx)
- mov $d2#d,`16*8+4-64`($ctx)
-
- mov $r1,%rax
- call __poly1305_block # r^3
-
- mov \$0x3ffffff,%eax # save r^3 base 2^26
- mov $h0,$d1
- and $h0#d,%eax
- shr \$26,$d1
- mov %eax,`16*0+12-64`($ctx)
-
- mov \$0x3ffffff,%edx
- and $d1#d,%edx
- mov %edx,`16*1+12-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- shr \$26,$d1
- mov %edx,`16*2+12-64`($ctx)
-
- mov $h1,%rax
- shl \$12,%rax
- or $d1,%rax
- and \$0x3ffffff,%eax
- mov %eax,`16*3+12-64`($ctx)
- lea (%rax,%rax,4),%eax # *5
- mov $h1,$d1
- mov %eax,`16*4+12-64`($ctx)
-
- mov \$0x3ffffff,%edx
- shr \$14,$d1
- and $d1#d,%edx
- mov %edx,`16*5+12-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- shr \$26,$d1
- mov %edx,`16*6+12-64`($ctx)
-
- mov $h2,%rax
- shl \$24,%rax
- or %rax,$d1
- mov $d1#d,`16*7+12-64`($ctx)
- lea ($d1,$d1,4),$d1 # *5
- mov $d1#d,`16*8+12-64`($ctx)
-
- mov $r1,%rax
- call __poly1305_block # r^4
-
- mov \$0x3ffffff,%eax # save r^4 base 2^26
- mov $h0,$d1
- and $h0#d,%eax
- shr \$26,$d1
- mov %eax,`16*0+8-64`($ctx)
-
- mov \$0x3ffffff,%edx
- and $d1#d,%edx
- mov %edx,`16*1+8-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- shr \$26,$d1
- mov %edx,`16*2+8-64`($ctx)
-
- mov $h1,%rax
- shl \$12,%rax
- or $d1,%rax
- and \$0x3ffffff,%eax
- mov %eax,`16*3+8-64`($ctx)
- lea (%rax,%rax,4),%eax # *5
- mov $h1,$d1
- mov %eax,`16*4+8-64`($ctx)
-
- mov \$0x3ffffff,%edx
- shr \$14,$d1
- and $d1#d,%edx
- mov %edx,`16*5+8-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- shr \$26,$d1
- mov %edx,`16*6+8-64`($ctx)
-
- mov $h2,%rax
- shl \$24,%rax
- or %rax,$d1
- mov $d1#d,`16*7+8-64`($ctx)
- lea ($d1,$d1,4),$d1 # *5
- mov $d1#d,`16*8+8-64`($ctx)
-
- lea -48-64($ctx),$ctx # size [de-]optimization
- pop %rbp
- RET
-.size __poly1305_init_avx,.-__poly1305_init_avx
-___
-
-&declare_function("poly1305_blocks_avx", 32, 4);
-$code.=<<___;
-.cfi_startproc
- mov 20($ctx),%r8d # is_base2_26
- cmp \$128,$len
- jae .Lblocks_avx
- test %r8d,%r8d
- jz .Lblocks
-
-.Lblocks_avx:
- and \$-16,$len
- jz .Lno_data_avx
-
- vzeroupper
-
- test %r8d,%r8d
- jz .Lbase2_64_avx
-
- test \$31,$len
- jz .Leven_avx
-
- push %rbp
-.cfi_push %rbp
- mov %rsp,%rbp
- push %rbx
-.cfi_push %rbx
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
- push %r14
-.cfi_push %r14
- push %r15
-.cfi_push %r15
-.Lblocks_avx_body:
-
- mov $len,%r15 # reassign $len
-
- mov 0($ctx),$d1 # load hash value
- mov 8($ctx),$d2
- mov 16($ctx),$h2#d
-
- mov 24($ctx),$r0 # load r
- mov 32($ctx),$s1
-
- ################################# base 2^26 -> base 2^64
- mov $d1#d,$h0#d
- and \$`-1*(1<<31)`,$d1
- mov $d2,$r1 # borrow $r1
- mov $d2#d,$h1#d
- and \$`-1*(1<<31)`,$d2
-
- shr \$6,$d1
- shl \$52,$r1
- add $d1,$h0
- shr \$12,$h1
- shr \$18,$d2
- add $r1,$h0
- adc $d2,$h1
-
- mov $h2,$d1
- shl \$40,$d1
- shr \$24,$h2
- add $d1,$h1
- adc \$0,$h2 # can be partially reduced...
-
- mov \$-4,$d2 # ... so reduce
- mov $h2,$d1
- and $h2,$d2
- shr \$2,$d1
- and \$3,$h2
- add $d2,$d1 # =*5
- add $d1,$h0
- adc \$0,$h1
- adc \$0,$h2
-
- mov $s1,$r1
- mov $s1,%rax
- shr \$2,$s1
- add $r1,$s1 # s1 = r1 + (r1 >> 2)
-
- add 0($inp),$h0 # accumulate input
- adc 8($inp),$h1
- lea 16($inp),$inp
- adc $padbit,$h2
-
- call __poly1305_block
-
- test $padbit,$padbit # if $padbit is zero,
- jz .Lstore_base2_64_avx # store hash in base 2^64 format
-
- ################################# base 2^64 -> base 2^26
- mov $h0,%rax
- mov $h0,%rdx
- shr \$52,$h0
- mov $h1,$r0
- mov $h1,$r1
- shr \$26,%rdx
- and \$0x3ffffff,%rax # h[0]
- shl \$12,$r0
- and \$0x3ffffff,%rdx # h[1]
- shr \$14,$h1
- or $r0,$h0
- shl \$24,$h2
- and \$0x3ffffff,$h0 # h[2]
- shr \$40,$r1
- and \$0x3ffffff,$h1 # h[3]
- or $r1,$h2 # h[4]
-
- sub \$16,%r15
- jz .Lstore_base2_26_avx
-
- vmovd %rax#d,$H0
- vmovd %rdx#d,$H1
- vmovd $h0#d,$H2
- vmovd $h1#d,$H3
- vmovd $h2#d,$H4
- jmp .Lproceed_avx
-
-.align 32
-.Lstore_base2_64_avx:
- mov $h0,0($ctx)
- mov $h1,8($ctx)
- mov $h2,16($ctx) # note that is_base2_26 is zeroed
- jmp .Ldone_avx
-
-.align 16
-.Lstore_base2_26_avx:
- mov %rax#d,0($ctx) # store hash value base 2^26
- mov %rdx#d,4($ctx)
- mov $h0#d,8($ctx)
- mov $h1#d,12($ctx)
- mov $h2#d,16($ctx)
-.align 16
-.Ldone_avx:
- pop %r15
-.cfi_restore %r15
- pop %r14
-.cfi_restore %r14
- pop %r13
-.cfi_restore %r13
- pop %r12
-.cfi_restore %r12
- pop %rbx
-.cfi_restore %rbx
- pop %rbp
-.cfi_restore %rbp
-.Lno_data_avx:
-.Lblocks_avx_epilogue:
- RET
-.cfi_endproc
-
-.align 32
-.Lbase2_64_avx:
-.cfi_startproc
- push %rbp
-.cfi_push %rbp
- mov %rsp,%rbp
- push %rbx
-.cfi_push %rbx
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
- push %r14
-.cfi_push %r14
- push %r15
-.cfi_push %r15
-.Lbase2_64_avx_body:
-
- mov $len,%r15 # reassign $len
-
- mov 24($ctx),$r0 # load r
- mov 32($ctx),$s1
-
- mov 0($ctx),$h0 # load hash value
- mov 8($ctx),$h1
- mov 16($ctx),$h2#d
-
- mov $s1,$r1
- mov $s1,%rax
- shr \$2,$s1
- add $r1,$s1 # s1 = r1 + (r1 >> 2)
-
- test \$31,$len
- jz .Linit_avx
-
- add 0($inp),$h0 # accumulate input
- adc 8($inp),$h1
- lea 16($inp),$inp
- adc $padbit,$h2
- sub \$16,%r15
-
- call __poly1305_block
-
-.Linit_avx:
- ################################# base 2^64 -> base 2^26
- mov $h0,%rax
- mov $h0,%rdx
- shr \$52,$h0
- mov $h1,$d1
- mov $h1,$d2
- shr \$26,%rdx
- and \$0x3ffffff,%rax # h[0]
- shl \$12,$d1
- and \$0x3ffffff,%rdx # h[1]
- shr \$14,$h1
- or $d1,$h0
- shl \$24,$h2
- and \$0x3ffffff,$h0 # h[2]
- shr \$40,$d2
- and \$0x3ffffff,$h1 # h[3]
- or $d2,$h2 # h[4]
-
- vmovd %rax#d,$H0
- vmovd %rdx#d,$H1
- vmovd $h0#d,$H2
- vmovd $h1#d,$H3
- vmovd $h2#d,$H4
- movl \$1,20($ctx) # set is_base2_26
-
- call __poly1305_init_avx
-
-.Lproceed_avx:
- mov %r15,$len
- pop %r15
-.cfi_restore %r15
- pop %r14
-.cfi_restore %r14
- pop %r13
-.cfi_restore %r13
- pop %r12
-.cfi_restore %r12
- pop %rbx
-.cfi_restore %rbx
- pop %rbp
-.cfi_restore %rbp
-.Lbase2_64_avx_epilogue:
- jmp .Ldo_avx
-.cfi_endproc
-
-.align 32
-.Leven_avx:
-.cfi_startproc
- vmovd 4*0($ctx),$H0 # load hash value
- vmovd 4*1($ctx),$H1
- vmovd 4*2($ctx),$H2
- vmovd 4*3($ctx),$H3
- vmovd 4*4($ctx),$H4
-
-.Ldo_avx:
-___
-$code.=<<___ if (!$win64);
- lea 8(%rsp),%r10
-.cfi_def_cfa_register %r10
- and \$-32,%rsp
- sub \$-8,%rsp
- lea -0x58(%rsp),%r11
- sub \$0x178,%rsp
-___
-$code.=<<___ if ($win64);
- lea -0xf8(%rsp),%r11
- sub \$0x218,%rsp
- vmovdqa %xmm6,0x50(%r11)
- vmovdqa %xmm7,0x60(%r11)
- vmovdqa %xmm8,0x70(%r11)
- vmovdqa %xmm9,0x80(%r11)
- vmovdqa %xmm10,0x90(%r11)
- vmovdqa %xmm11,0xa0(%r11)
- vmovdqa %xmm12,0xb0(%r11)
- vmovdqa %xmm13,0xc0(%r11)
- vmovdqa %xmm14,0xd0(%r11)
- vmovdqa %xmm15,0xe0(%r11)
-.Ldo_avx_body:
-___
-$code.=<<___;
- sub \$64,$len
- lea -32($inp),%rax
- cmovc %rax,$inp
-
- vmovdqu `16*3`($ctx),$D4 # preload r0^2
- lea `16*3+64`($ctx),$ctx # size optimization
- lea .Lconst(%rip),%rcx
-
- ################################################################
- # load input
- vmovdqu 16*2($inp),$T0
- vmovdqu 16*3($inp),$T1
- vmovdqa 64(%rcx),$MASK # .Lmask26
-
- vpsrldq \$6,$T0,$T2 # splat input
- vpsrldq \$6,$T1,$T3
- vpunpckhqdq $T1,$T0,$T4 # 4
- vpunpcklqdq $T1,$T0,$T0 # 0:1
- vpunpcklqdq $T3,$T2,$T3 # 2:3
-
- vpsrlq \$40,$T4,$T4 # 4
- vpsrlq \$26,$T0,$T1
- vpand $MASK,$T0,$T0 # 0
- vpsrlq \$4,$T3,$T2
- vpand $MASK,$T1,$T1 # 1
- vpsrlq \$30,$T3,$T3
- vpand $MASK,$T2,$T2 # 2
- vpand $MASK,$T3,$T3 # 3
- vpor 32(%rcx),$T4,$T4 # padbit, yes, always
-
- jbe .Lskip_loop_avx
-
- # expand and copy pre-calculated table to stack
- vmovdqu `16*1-64`($ctx),$D1
- vmovdqu `16*2-64`($ctx),$D2
- vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434
- vpshufd \$0x44,$D4,$D0 # xx12 -> 1212
- vmovdqa $D3,-0x90(%r11)
- vmovdqa $D0,0x00(%rsp)
- vpshufd \$0xEE,$D1,$D4
- vmovdqu `16*3-64`($ctx),$D0
- vpshufd \$0x44,$D1,$D1
- vmovdqa $D4,-0x80(%r11)
- vmovdqa $D1,0x10(%rsp)
- vpshufd \$0xEE,$D2,$D3
- vmovdqu `16*4-64`($ctx),$D1
- vpshufd \$0x44,$D2,$D2
- vmovdqa $D3,-0x70(%r11)
- vmovdqa $D2,0x20(%rsp)
- vpshufd \$0xEE,$D0,$D4
- vmovdqu `16*5-64`($ctx),$D2
- vpshufd \$0x44,$D0,$D0
- vmovdqa $D4,-0x60(%r11)
- vmovdqa $D0,0x30(%rsp)
- vpshufd \$0xEE,$D1,$D3
- vmovdqu `16*6-64`($ctx),$D0
- vpshufd \$0x44,$D1,$D1
- vmovdqa $D3,-0x50(%r11)
- vmovdqa $D1,0x40(%rsp)
- vpshufd \$0xEE,$D2,$D4
- vmovdqu `16*7-64`($ctx),$D1
- vpshufd \$0x44,$D2,$D2
- vmovdqa $D4,-0x40(%r11)
- vmovdqa $D2,0x50(%rsp)
- vpshufd \$0xEE,$D0,$D3
- vmovdqu `16*8-64`($ctx),$D2
- vpshufd \$0x44,$D0,$D0
- vmovdqa $D3,-0x30(%r11)
- vmovdqa $D0,0x60(%rsp)
- vpshufd \$0xEE,$D1,$D4
- vpshufd \$0x44,$D1,$D1
- vmovdqa $D4,-0x20(%r11)
- vmovdqa $D1,0x70(%rsp)
- vpshufd \$0xEE,$D2,$D3
- vmovdqa 0x00(%rsp),$D4 # preload r0^2
- vpshufd \$0x44,$D2,$D2
- vmovdqa $D3,-0x10(%r11)
- vmovdqa $D2,0x80(%rsp)
-
- jmp .Loop_avx
-
-.align 32
-.Loop_avx:
- ################################################################
- # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
- # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
- # \___________________/
- # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
- # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
- # \___________________/ \____________________/
- #
- # Note that we start with inp[2:3]*r^2. This is because it
- # doesn't depend on reduction in previous iteration.
- ################################################################
- # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
- #
- # though note that $Tx and $Hx are "reversed" in this section,
- # and $D4 is preloaded with r0^2...
-
- vpmuludq $T0,$D4,$D0 # d0 = h0*r0
- vpmuludq $T1,$D4,$D1 # d1 = h1*r0
- vmovdqa $H2,0x20(%r11) # offload hash
- vpmuludq $T2,$D4,$D2 # d3 = h2*r0
- vmovdqa 0x10(%rsp),$H2 # r1^2
- vpmuludq $T3,$D4,$D3 # d3 = h3*r0
- vpmuludq $T4,$D4,$D4 # d4 = h4*r0
-
- vmovdqa $H0,0x00(%r11) #
- vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1
- vmovdqa $H1,0x10(%r11) #
- vpmuludq $T3,$H2,$H1 # h3*r1
- vpaddq $H0,$D0,$D0 # d0 += h4*s1
- vpaddq $H1,$D4,$D4 # d4 += h3*r1
- vmovdqa $H3,0x30(%r11) #
- vpmuludq $T2,$H2,$H0 # h2*r1
- vpmuludq $T1,$H2,$H1 # h1*r1
- vpaddq $H0,$D3,$D3 # d3 += h2*r1
- vmovdqa 0x30(%rsp),$H3 # r2^2
- vpaddq $H1,$D2,$D2 # d2 += h1*r1
- vmovdqa $H4,0x40(%r11) #
- vpmuludq $T0,$H2,$H2 # h0*r1
- vpmuludq $T2,$H3,$H0 # h2*r2
- vpaddq $H2,$D1,$D1 # d1 += h0*r1
-
- vmovdqa 0x40(%rsp),$H4 # s2^2
- vpaddq $H0,$D4,$D4 # d4 += h2*r2
- vpmuludq $T1,$H3,$H1 # h1*r2
- vpmuludq $T0,$H3,$H3 # h0*r2
- vpaddq $H1,$D3,$D3 # d3 += h1*r2
- vmovdqa 0x50(%rsp),$H2 # r3^2
- vpaddq $H3,$D2,$D2 # d2 += h0*r2
- vpmuludq $T4,$H4,$H0 # h4*s2
- vpmuludq $T3,$H4,$H4 # h3*s2
- vpaddq $H0,$D1,$D1 # d1 += h4*s2
- vmovdqa 0x60(%rsp),$H3 # s3^2
- vpaddq $H4,$D0,$D0 # d0 += h3*s2
-
- vmovdqa 0x80(%rsp),$H4 # s4^2
- vpmuludq $T1,$H2,$H1 # h1*r3
- vpmuludq $T0,$H2,$H2 # h0*r3
- vpaddq $H1,$D4,$D4 # d4 += h1*r3
- vpaddq $H2,$D3,$D3 # d3 += h0*r3
- vpmuludq $T4,$H3,$H0 # h4*s3
- vpmuludq $T3,$H3,$H1 # h3*s3
- vpaddq $H0,$D2,$D2 # d2 += h4*s3
- vmovdqu 16*0($inp),$H0 # load input
- vpaddq $H1,$D1,$D1 # d1 += h3*s3
- vpmuludq $T2,$H3,$H3 # h2*s3
- vpmuludq $T2,$H4,$T2 # h2*s4
- vpaddq $H3,$D0,$D0 # d0 += h2*s3
-
- vmovdqu 16*1($inp),$H1 #
- vpaddq $T2,$D1,$D1 # d1 += h2*s4
- vpmuludq $T3,$H4,$T3 # h3*s4
- vpmuludq $T4,$H4,$T4 # h4*s4
- vpsrldq \$6,$H0,$H2 # splat input
- vpaddq $T3,$D2,$D2 # d2 += h3*s4
- vpaddq $T4,$D3,$D3 # d3 += h4*s4
- vpsrldq \$6,$H1,$H3 #
- vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4
- vpmuludq $T1,$H4,$T0 # h1*s4
- vpunpckhqdq $H1,$H0,$H4 # 4
- vpaddq $T4,$D4,$D4 # d4 += h0*r4
- vmovdqa -0x90(%r11),$T4 # r0^4
- vpaddq $T0,$D0,$D0 # d0 += h1*s4
-
- vpunpcklqdq $H1,$H0,$H0 # 0:1
- vpunpcklqdq $H3,$H2,$H3 # 2:3
-
- #vpsrlq \$40,$H4,$H4 # 4
- vpsrldq \$`40/8`,$H4,$H4 # 4
- vpsrlq \$26,$H0,$H1
- vpand $MASK,$H0,$H0 # 0
- vpsrlq \$4,$H3,$H2
- vpand $MASK,$H1,$H1 # 1
- vpand 0(%rcx),$H4,$H4 # .Lmask24
- vpsrlq \$30,$H3,$H3
- vpand $MASK,$H2,$H2 # 2
- vpand $MASK,$H3,$H3 # 3
- vpor 32(%rcx),$H4,$H4 # padbit, yes, always
-
- vpaddq 0x00(%r11),$H0,$H0 # add hash value
- vpaddq 0x10(%r11),$H1,$H1
- vpaddq 0x20(%r11),$H2,$H2
- vpaddq 0x30(%r11),$H3,$H3
- vpaddq 0x40(%r11),$H4,$H4
-
- lea 16*2($inp),%rax
- lea 16*4($inp),$inp
- sub \$64,$len
- cmovc %rax,$inp
-
- ################################################################
- # Now we accumulate (inp[0:1]+hash)*r^4
- ################################################################
- # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
- vpmuludq $H0,$T4,$T0 # h0*r0
- vpmuludq $H1,$T4,$T1 # h1*r0
- vpaddq $T0,$D0,$D0
- vpaddq $T1,$D1,$D1
- vmovdqa -0x80(%r11),$T2 # r1^4
- vpmuludq $H2,$T4,$T0 # h2*r0
- vpmuludq $H3,$T4,$T1 # h3*r0
- vpaddq $T0,$D2,$D2
- vpaddq $T1,$D3,$D3
- vpmuludq $H4,$T4,$T4 # h4*r0
- vpmuludq -0x70(%r11),$H4,$T0 # h4*s1
- vpaddq $T4,$D4,$D4
-
- vpaddq $T0,$D0,$D0 # d0 += h4*s1
- vpmuludq $H2,$T2,$T1 # h2*r1
- vpmuludq $H3,$T2,$T0 # h3*r1
- vpaddq $T1,$D3,$D3 # d3 += h2*r1
- vmovdqa -0x60(%r11),$T3 # r2^4
- vpaddq $T0,$D4,$D4 # d4 += h3*r1
- vpmuludq $H1,$T2,$T1 # h1*r1
- vpmuludq $H0,$T2,$T2 # h0*r1
- vpaddq $T1,$D2,$D2 # d2 += h1*r1
- vpaddq $T2,$D1,$D1 # d1 += h0*r1
-
- vmovdqa -0x50(%r11),$T4 # s2^4
- vpmuludq $H2,$T3,$T0 # h2*r2
- vpmuludq $H1,$T3,$T1 # h1*r2
- vpaddq $T0,$D4,$D4 # d4 += h2*r2
- vpaddq $T1,$D3,$D3 # d3 += h1*r2
- vmovdqa -0x40(%r11),$T2 # r3^4
- vpmuludq $H0,$T3,$T3 # h0*r2
- vpmuludq $H4,$T4,$T0 # h4*s2
- vpaddq $T3,$D2,$D2 # d2 += h0*r2
- vpaddq $T0,$D1,$D1 # d1 += h4*s2
- vmovdqa -0x30(%r11),$T3 # s3^4
- vpmuludq $H3,$T4,$T4 # h3*s2
- vpmuludq $H1,$T2,$T1 # h1*r3
- vpaddq $T4,$D0,$D0 # d0 += h3*s2
-
- vmovdqa -0x10(%r11),$T4 # s4^4
- vpaddq $T1,$D4,$D4 # d4 += h1*r3
- vpmuludq $H0,$T2,$T2 # h0*r3
- vpmuludq $H4,$T3,$T0 # h4*s3
- vpaddq $T2,$D3,$D3 # d3 += h0*r3
- vpaddq $T0,$D2,$D2 # d2 += h4*s3
- vmovdqu 16*2($inp),$T0 # load input
- vpmuludq $H3,$T3,$T2 # h3*s3
- vpmuludq $H2,$T3,$T3 # h2*s3
- vpaddq $T2,$D1,$D1 # d1 += h3*s3
- vmovdqu 16*3($inp),$T1 #
- vpaddq $T3,$D0,$D0 # d0 += h2*s3
-
- vpmuludq $H2,$T4,$H2 # h2*s4
- vpmuludq $H3,$T4,$H3 # h3*s4
- vpsrldq \$6,$T0,$T2 # splat input
- vpaddq $H2,$D1,$D1 # d1 += h2*s4
- vpmuludq $H4,$T4,$H4 # h4*s4
- vpsrldq \$6,$T1,$T3 #
- vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
- vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
- vpmuludq -0x20(%r11),$H0,$H4 # h0*r4
- vpmuludq $H1,$T4,$H0
- vpunpckhqdq $T1,$T0,$T4 # 4
- vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
- vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
-
- vpunpcklqdq $T1,$T0,$T0 # 0:1
- vpunpcklqdq $T3,$T2,$T3 # 2:3
-
- #vpsrlq \$40,$T4,$T4 # 4
- vpsrldq \$`40/8`,$T4,$T4 # 4
- vpsrlq \$26,$T0,$T1
- vmovdqa 0x00(%rsp),$D4 # preload r0^2
- vpand $MASK,$T0,$T0 # 0
- vpsrlq \$4,$T3,$T2
- vpand $MASK,$T1,$T1 # 1
- vpand 0(%rcx),$T4,$T4 # .Lmask24
- vpsrlq \$30,$T3,$T3
- vpand $MASK,$T2,$T2 # 2
- vpand $MASK,$T3,$T3 # 3
- vpor 32(%rcx),$T4,$T4 # padbit, yes, always
-
- ################################################################
- # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
- # and P. Schwabe
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$D1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H4,$D0
- vpand $MASK,$H4,$H4
-
- vpsrlq \$26,$H1,$D1
- vpand $MASK,$H1,$H1
- vpaddq $D1,$H2,$H2 # h1 -> h2
-
- vpaddq $D0,$H0,$H0
- vpsllq \$2,$D0,$D0
- vpaddq $D0,$H0,$H0 # h4 -> h0
-
- vpsrlq \$26,$H2,$D2
- vpand $MASK,$H2,$H2
- vpaddq $D2,$H3,$H3 # h2 -> h3
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- ja .Loop_avx
-
-.Lskip_loop_avx:
- ################################################################
- # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
-
- vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2
- add \$32,$len
- jnz .Long_tail_avx
-
- vpaddq $H2,$T2,$T2
- vpaddq $H0,$T0,$T0
- vpaddq $H1,$T1,$T1
- vpaddq $H3,$T3,$T3
- vpaddq $H4,$T4,$T4
-
-.Long_tail_avx:
- vmovdqa $H2,0x20(%r11)
- vmovdqa $H0,0x00(%r11)
- vmovdqa $H1,0x10(%r11)
- vmovdqa $H3,0x30(%r11)
- vmovdqa $H4,0x40(%r11)
-
- # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
- vpmuludq $T2,$D4,$D2 # d2 = h2*r0
- vpmuludq $T0,$D4,$D0 # d0 = h0*r0
- vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n
- vpmuludq $T1,$D4,$D1 # d1 = h1*r0
- vpmuludq $T3,$D4,$D3 # d3 = h3*r0
- vpmuludq $T4,$D4,$D4 # d4 = h4*r0
-
- vpmuludq $T3,$H2,$H0 # h3*r1
- vpaddq $H0,$D4,$D4 # d4 += h3*r1
- vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n
- vpmuludq $T2,$H2,$H1 # h2*r1
- vpaddq $H1,$D3,$D3 # d3 += h2*r1
- vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n
- vpmuludq $T1,$H2,$H0 # h1*r1
- vpaddq $H0,$D2,$D2 # d2 += h1*r1
- vpmuludq $T0,$H2,$H2 # h0*r1
- vpaddq $H2,$D1,$D1 # d1 += h0*r1
- vpmuludq $T4,$H3,$H3 # h4*s1
- vpaddq $H3,$D0,$D0 # d0 += h4*s1
-
- vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n
- vpmuludq $T2,$H4,$H1 # h2*r2
- vpaddq $H1,$D4,$D4 # d4 += h2*r2
- vpmuludq $T1,$H4,$H0 # h1*r2
- vpaddq $H0,$D3,$D3 # d3 += h1*r2
- vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n
- vpmuludq $T0,$H4,$H4 # h0*r2
- vpaddq $H4,$D2,$D2 # d2 += h0*r2
- vpmuludq $T4,$H2,$H1 # h4*s2
- vpaddq $H1,$D1,$D1 # d1 += h4*s2
- vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n
- vpmuludq $T3,$H2,$H2 # h3*s2
- vpaddq $H2,$D0,$D0 # d0 += h3*s2
-
- vpmuludq $T1,$H3,$H0 # h1*r3
- vpaddq $H0,$D4,$D4 # d4 += h1*r3
- vpmuludq $T0,$H3,$H3 # h0*r3
- vpaddq $H3,$D3,$D3 # d3 += h0*r3
- vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n
- vpmuludq $T4,$H4,$H1 # h4*s3
- vpaddq $H1,$D2,$D2 # d2 += h4*s3
- vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n
- vpmuludq $T3,$H4,$H0 # h3*s3
- vpaddq $H0,$D1,$D1 # d1 += h3*s3
- vpmuludq $T2,$H4,$H4 # h2*s3
- vpaddq $H4,$D0,$D0 # d0 += h2*s3
-
- vpmuludq $T0,$H2,$H2 # h0*r4
- vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4
- vpmuludq $T4,$H3,$H1 # h4*s4
- vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4
- vpmuludq $T3,$H3,$H0 # h3*s4
- vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4
- vpmuludq $T2,$H3,$H1 # h2*s4
- vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4
- vpmuludq $T1,$H3,$H3 # h1*s4
- vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4
-
- jz .Lshort_tail_avx
-
- vmovdqu 16*0($inp),$H0 # load input
- vmovdqu 16*1($inp),$H1
-
- vpsrldq \$6,$H0,$H2 # splat input
- vpsrldq \$6,$H1,$H3
- vpunpckhqdq $H1,$H0,$H4 # 4
- vpunpcklqdq $H1,$H0,$H0 # 0:1
- vpunpcklqdq $H3,$H2,$H3 # 2:3
-
- vpsrlq \$40,$H4,$H4 # 4
- vpsrlq \$26,$H0,$H1
- vpand $MASK,$H0,$H0 # 0
- vpsrlq \$4,$H3,$H2
- vpand $MASK,$H1,$H1 # 1
- vpsrlq \$30,$H3,$H3
- vpand $MASK,$H2,$H2 # 2
- vpand $MASK,$H3,$H3 # 3
- vpor 32(%rcx),$H4,$H4 # padbit, yes, always
-
- vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4
- vpaddq 0x00(%r11),$H0,$H0
- vpaddq 0x10(%r11),$H1,$H1
- vpaddq 0x20(%r11),$H2,$H2
- vpaddq 0x30(%r11),$H3,$H3
- vpaddq 0x40(%r11),$H4,$H4
-
- ################################################################
- # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
-
- vpmuludq $H0,$T4,$T0 # h0*r0
- vpaddq $T0,$D0,$D0 # d0 += h0*r0
- vpmuludq $H1,$T4,$T1 # h1*r0
- vpaddq $T1,$D1,$D1 # d1 += h1*r0
- vpmuludq $H2,$T4,$T0 # h2*r0
- vpaddq $T0,$D2,$D2 # d2 += h2*r0
- vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n
- vpmuludq $H3,$T4,$T1 # h3*r0
- vpaddq $T1,$D3,$D3 # d3 += h3*r0
- vpmuludq $H4,$T4,$T4 # h4*r0
- vpaddq $T4,$D4,$D4 # d4 += h4*r0
-
- vpmuludq $H3,$T2,$T0 # h3*r1
- vpaddq $T0,$D4,$D4 # d4 += h3*r1
- vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1
- vpmuludq $H2,$T2,$T1 # h2*r1
- vpaddq $T1,$D3,$D3 # d3 += h2*r1
- vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2
- vpmuludq $H1,$T2,$T0 # h1*r1
- vpaddq $T0,$D2,$D2 # d2 += h1*r1
- vpmuludq $H0,$T2,$T2 # h0*r1
- vpaddq $T2,$D1,$D1 # d1 += h0*r1
- vpmuludq $H4,$T3,$T3 # h4*s1
- vpaddq $T3,$D0,$D0 # d0 += h4*s1
-
- vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2
- vpmuludq $H2,$T4,$T1 # h2*r2
- vpaddq $T1,$D4,$D4 # d4 += h2*r2
- vpmuludq $H1,$T4,$T0 # h1*r2
- vpaddq $T0,$D3,$D3 # d3 += h1*r2
- vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3
- vpmuludq $H0,$T4,$T4 # h0*r2
- vpaddq $T4,$D2,$D2 # d2 += h0*r2
- vpmuludq $H4,$T2,$T1 # h4*s2
- vpaddq $T1,$D1,$D1 # d1 += h4*s2
- vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3
- vpmuludq $H3,$T2,$T2 # h3*s2
- vpaddq $T2,$D0,$D0 # d0 += h3*s2
-
- vpmuludq $H1,$T3,$T0 # h1*r3
- vpaddq $T0,$D4,$D4 # d4 += h1*r3
- vpmuludq $H0,$T3,$T3 # h0*r3
- vpaddq $T3,$D3,$D3 # d3 += h0*r3
- vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4
- vpmuludq $H4,$T4,$T1 # h4*s3
- vpaddq $T1,$D2,$D2 # d2 += h4*s3
- vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4
- vpmuludq $H3,$T4,$T0 # h3*s3
- vpaddq $T0,$D1,$D1 # d1 += h3*s3
- vpmuludq $H2,$T4,$T4 # h2*s3
- vpaddq $T4,$D0,$D0 # d0 += h2*s3
-
- vpmuludq $H0,$T2,$T2 # h0*r4
- vpaddq $T2,$D4,$D4 # d4 += h0*r4
- vpmuludq $H4,$T3,$T1 # h4*s4
- vpaddq $T1,$D3,$D3 # d3 += h4*s4
- vpmuludq $H3,$T3,$T0 # h3*s4
- vpaddq $T0,$D2,$D2 # d2 += h3*s4
- vpmuludq $H2,$T3,$T1 # h2*s4
- vpaddq $T1,$D1,$D1 # d1 += h2*s4
- vpmuludq $H1,$T3,$T3 # h1*s4
- vpaddq $T3,$D0,$D0 # d0 += h1*s4
-
-.Lshort_tail_avx:
- ################################################################
- # horizontal addition
-
- vpsrldq \$8,$D4,$T4
- vpsrldq \$8,$D3,$T3
- vpsrldq \$8,$D1,$T1
- vpsrldq \$8,$D0,$T0
- vpsrldq \$8,$D2,$T2
- vpaddq $T3,$D3,$D3
- vpaddq $T4,$D4,$D4
- vpaddq $T0,$D0,$D0
- vpaddq $T1,$D1,$D1
- vpaddq $T2,$D2,$D2
-
- ################################################################
- # lazy reduction
-
- vpsrlq \$26,$D3,$H3
- vpand $MASK,$D3,$D3
- vpaddq $H3,$D4,$D4 # h3 -> h4
-
- vpsrlq \$26,$D0,$H0
- vpand $MASK,$D0,$D0
- vpaddq $H0,$D1,$D1 # h0 -> h1
-
- vpsrlq \$26,$D4,$H4
- vpand $MASK,$D4,$D4
-
- vpsrlq \$26,$D1,$H1
- vpand $MASK,$D1,$D1
- vpaddq $H1,$D2,$D2 # h1 -> h2
-
- vpaddq $H4,$D0,$D0
- vpsllq \$2,$H4,$H4
- vpaddq $H4,$D0,$D0 # h4 -> h0
-
- vpsrlq \$26,$D2,$H2
- vpand $MASK,$D2,$D2
- vpaddq $H2,$D3,$D3 # h2 -> h3
-
- vpsrlq \$26,$D0,$H0
- vpand $MASK,$D0,$D0
- vpaddq $H0,$D1,$D1 # h0 -> h1
-
- vpsrlq \$26,$D3,$H3
- vpand $MASK,$D3,$D3
- vpaddq $H3,$D4,$D4 # h3 -> h4
-
- vmovd $D0,`4*0-48-64`($ctx) # save partially reduced
- vmovd $D1,`4*1-48-64`($ctx)
- vmovd $D2,`4*2-48-64`($ctx)
- vmovd $D3,`4*3-48-64`($ctx)
- vmovd $D4,`4*4-48-64`($ctx)
-___
-$code.=<<___ if ($win64);
- vmovdqa 0x50(%r11),%xmm6
- vmovdqa 0x60(%r11),%xmm7
- vmovdqa 0x70(%r11),%xmm8
- vmovdqa 0x80(%r11),%xmm9
- vmovdqa 0x90(%r11),%xmm10
- vmovdqa 0xa0(%r11),%xmm11
- vmovdqa 0xb0(%r11),%xmm12
- vmovdqa 0xc0(%r11),%xmm13
- vmovdqa 0xd0(%r11),%xmm14
- vmovdqa 0xe0(%r11),%xmm15
- lea 0xf8(%r11),%rsp
-.Ldo_avx_epilogue:
-___
-$code.=<<___ if (!$win64);
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-___
-$code.=<<___;
- vzeroupper
- RET
-.cfi_endproc
-___
-&end_function("poly1305_blocks_avx");
-
-&declare_function("poly1305_emit_avx", 32, 3);
-$code.=<<___;
- cmpl \$0,20($ctx) # is_base2_26?
- je .Lemit
-
- mov 0($ctx),%eax # load hash value base 2^26
- mov 4($ctx),%ecx
- mov 8($ctx),%r8d
- mov 12($ctx),%r11d
- mov 16($ctx),%r10d
-
- shl \$26,%rcx # base 2^26 -> base 2^64
- mov %r8,%r9
- shl \$52,%r8
- add %rcx,%rax
- shr \$12,%r9
- add %rax,%r8 # h0
- adc \$0,%r9
-
- shl \$14,%r11
- mov %r10,%rax
- shr \$24,%r10
- add %r11,%r9
- shl \$40,%rax
- add %rax,%r9 # h1
- adc \$0,%r10 # h2
-
- mov %r10,%rax # could be partially reduced, so reduce
- mov %r10,%rcx
- and \$3,%r10
- shr \$2,%rax
- and \$-4,%rcx
- add %rcx,%rax
- add %rax,%r8
- adc \$0,%r9
- adc \$0,%r10
-
- mov %r8,%rax
- add \$5,%r8 # compare to modulus
- mov %r9,%rcx
- adc \$0,%r9
- adc \$0,%r10
- shr \$2,%r10 # did 130-bit value overflow?
- cmovnz %r8,%rax
- cmovnz %r9,%rcx
-
- add 0($nonce),%rax # accumulate nonce
- adc 8($nonce),%rcx
- mov %rax,0($mac) # write result
- mov %rcx,8($mac)
-
- RET
-___
-&end_function("poly1305_emit_avx");
-
-if ($avx>1) {
-
-my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
- map("%ymm$_",(0..15));
-my $S4=$MASK;
-
-sub poly1305_blocks_avxN {
- my ($avx512) = @_;
- my $suffix = $avx512 ? "_avx512" : "";
-$code.=<<___;
-.cfi_startproc
- mov 20($ctx),%r8d # is_base2_26
- cmp \$128,$len
- jae .Lblocks_avx2$suffix
- test %r8d,%r8d
- jz .Lblocks
-
-.Lblocks_avx2$suffix:
- and \$-16,$len
- jz .Lno_data_avx2$suffix
-
- vzeroupper
-
- test %r8d,%r8d
- jz .Lbase2_64_avx2$suffix
-
- test \$63,$len
- jz .Leven_avx2$suffix
-
- push %rbp
-.cfi_push %rbp
- mov %rsp,%rbp
- push %rbx
-.cfi_push %rbx
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
- push %r14
-.cfi_push %r14
- push %r15
-.cfi_push %r15
-.Lblocks_avx2_body$suffix:
-
- mov $len,%r15 # reassign $len
-
- mov 0($ctx),$d1 # load hash value
- mov 8($ctx),$d2
- mov 16($ctx),$h2#d
-
- mov 24($ctx),$r0 # load r
- mov 32($ctx),$s1
-
- ################################# base 2^26 -> base 2^64
- mov $d1#d,$h0#d
- and \$`-1*(1<<31)`,$d1
- mov $d2,$r1 # borrow $r1
- mov $d2#d,$h1#d
- and \$`-1*(1<<31)`,$d2
-
- shr \$6,$d1
- shl \$52,$r1
- add $d1,$h0
- shr \$12,$h1
- shr \$18,$d2
- add $r1,$h0
- adc $d2,$h1
-
- mov $h2,$d1
- shl \$40,$d1
- shr \$24,$h2
- add $d1,$h1
- adc \$0,$h2 # can be partially reduced...
-
- mov \$-4,$d2 # ... so reduce
- mov $h2,$d1
- and $h2,$d2
- shr \$2,$d1
- and \$3,$h2
- add $d2,$d1 # =*5
- add $d1,$h0
- adc \$0,$h1
- adc \$0,$h2
-
- mov $s1,$r1
- mov $s1,%rax
- shr \$2,$s1
- add $r1,$s1 # s1 = r1 + (r1 >> 2)
-
-.Lbase2_26_pre_avx2$suffix:
- add 0($inp),$h0 # accumulate input
- adc 8($inp),$h1
- lea 16($inp),$inp
- adc $padbit,$h2
- sub \$16,%r15
-
- call __poly1305_block
- mov $r1,%rax
-
- test \$63,%r15
- jnz .Lbase2_26_pre_avx2$suffix
-
- test $padbit,$padbit # if $padbit is zero,
- jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format
-
- ################################# base 2^64 -> base 2^26
- mov $h0,%rax
- mov $h0,%rdx
- shr \$52,$h0
- mov $h1,$r0
- mov $h1,$r1
- shr \$26,%rdx
- and \$0x3ffffff,%rax # h[0]
- shl \$12,$r0
- and \$0x3ffffff,%rdx # h[1]
- shr \$14,$h1
- or $r0,$h0
- shl \$24,$h2
- and \$0x3ffffff,$h0 # h[2]
- shr \$40,$r1
- and \$0x3ffffff,$h1 # h[3]
- or $r1,$h2 # h[4]
-
- test %r15,%r15
- jz .Lstore_base2_26_avx2$suffix
-
- vmovd %rax#d,%x#$H0
- vmovd %rdx#d,%x#$H1
- vmovd $h0#d,%x#$H2
- vmovd $h1#d,%x#$H3
- vmovd $h2#d,%x#$H4
- jmp .Lproceed_avx2$suffix
-
-.align 32
-.Lstore_base2_64_avx2$suffix:
- mov $h0,0($ctx)
- mov $h1,8($ctx)
- mov $h2,16($ctx) # note that is_base2_26 is zeroed
- jmp .Ldone_avx2$suffix
-
-.align 16
-.Lstore_base2_26_avx2$suffix:
- mov %rax#d,0($ctx) # store hash value base 2^26
- mov %rdx#d,4($ctx)
- mov $h0#d,8($ctx)
- mov $h1#d,12($ctx)
- mov $h2#d,16($ctx)
-.align 16
-.Ldone_avx2$suffix:
- pop %r15
-.cfi_restore %r15
- pop %r14
-.cfi_restore %r14
- pop %r13
-.cfi_restore %r13
- pop %r12
-.cfi_restore %r12
- pop %rbx
-.cfi_restore %rbx
- pop %rbp
-.cfi_restore %rbp
-.Lno_data_avx2$suffix:
-.Lblocks_avx2_epilogue$suffix:
- RET
-.cfi_endproc
-
-.align 32
-.Lbase2_64_avx2$suffix:
-.cfi_startproc
- push %rbp
-.cfi_push %rbp
- mov %rsp,%rbp
- push %rbx
-.cfi_push %rbx
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
- push %r14
-.cfi_push %r14
- push %r15
-.cfi_push %r15
-.Lbase2_64_avx2_body$suffix:
-
- mov $len,%r15 # reassign $len
-
- mov 24($ctx),$r0 # load r
- mov 32($ctx),$s1
-
- mov 0($ctx),$h0 # load hash value
- mov 8($ctx),$h1
- mov 16($ctx),$h2#d
-
- mov $s1,$r1
- mov $s1,%rax
- shr \$2,$s1
- add $r1,$s1 # s1 = r1 + (r1 >> 2)
-
- test \$63,$len
- jz .Linit_avx2$suffix
-
-.Lbase2_64_pre_avx2$suffix:
- add 0($inp),$h0 # accumulate input
- adc 8($inp),$h1
- lea 16($inp),$inp
- adc $padbit,$h2
- sub \$16,%r15
-
- call __poly1305_block
- mov $r1,%rax
-
- test \$63,%r15
- jnz .Lbase2_64_pre_avx2$suffix
-
-.Linit_avx2$suffix:
- ################################# base 2^64 -> base 2^26
- mov $h0,%rax
- mov $h0,%rdx
- shr \$52,$h0
- mov $h1,$d1
- mov $h1,$d2
- shr \$26,%rdx
- and \$0x3ffffff,%rax # h[0]
- shl \$12,$d1
- and \$0x3ffffff,%rdx # h[1]
- shr \$14,$h1
- or $d1,$h0
- shl \$24,$h2
- and \$0x3ffffff,$h0 # h[2]
- shr \$40,$d2
- and \$0x3ffffff,$h1 # h[3]
- or $d2,$h2 # h[4]
-
- vmovd %rax#d,%x#$H0
- vmovd %rdx#d,%x#$H1
- vmovd $h0#d,%x#$H2
- vmovd $h1#d,%x#$H3
- vmovd $h2#d,%x#$H4
- movl \$1,20($ctx) # set is_base2_26
-
- call __poly1305_init_avx
-
-.Lproceed_avx2$suffix:
- mov %r15,$len # restore $len
-___
-$code.=<<___ if (!$kernel);
- mov OPENSSL_ia32cap_P+8(%rip),%r9d
- mov \$`(1<<31|1<<30|1<<16)`,%r11d
-___
-$code.=<<___;
- pop %r15
-.cfi_restore %r15
- pop %r14
-.cfi_restore %r14
- pop %r13
-.cfi_restore %r13
- pop %r12
-.cfi_restore %r12
- pop %rbx
-.cfi_restore %rbx
- pop %rbp
-.cfi_restore %rbp
-.Lbase2_64_avx2_epilogue$suffix:
- jmp .Ldo_avx2$suffix
-.cfi_endproc
-
-.align 32
-.Leven_avx2$suffix:
-.cfi_startproc
-___
-$code.=<<___ if (!$kernel);
- mov OPENSSL_ia32cap_P+8(%rip),%r9d
-___
-$code.=<<___;
- vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26
- vmovd 4*1($ctx),%x#$H1
- vmovd 4*2($ctx),%x#$H2
- vmovd 4*3($ctx),%x#$H3
- vmovd 4*4($ctx),%x#$H4
-
-.Ldo_avx2$suffix:
-___
-$code.=<<___ if (!$kernel && $avx>2);
- cmp \$512,$len
- jb .Lskip_avx512
- and %r11d,%r9d
- test \$`1<<16`,%r9d # check for AVX512F
- jnz .Lblocks_avx512
-.Lskip_avx512$suffix:
-___
-$code.=<<___ if ($avx > 2 && $avx512 && $kernel);
- cmp \$512,$len
- jae .Lblocks_avx512
-___
-$code.=<<___ if (!$win64);
- lea 8(%rsp),%r10
-.cfi_def_cfa_register %r10
- sub \$0x128,%rsp
-___
-$code.=<<___ if ($win64);
- lea 8(%rsp),%r10
- sub \$0x1c8,%rsp
- vmovdqa %xmm6,-0xb0(%r10)
- vmovdqa %xmm7,-0xa0(%r10)
- vmovdqa %xmm8,-0x90(%r10)
- vmovdqa %xmm9,-0x80(%r10)
- vmovdqa %xmm10,-0x70(%r10)
- vmovdqa %xmm11,-0x60(%r10)
- vmovdqa %xmm12,-0x50(%r10)
- vmovdqa %xmm13,-0x40(%r10)
- vmovdqa %xmm14,-0x30(%r10)
- vmovdqa %xmm15,-0x20(%r10)
-.Ldo_avx2_body$suffix:
-___
-$code.=<<___;
- lea .Lconst(%rip),%rcx
- lea 48+64($ctx),$ctx # size optimization
- vmovdqa 96(%rcx),$T0 # .Lpermd_avx2
-
- # expand and copy pre-calculated table to stack
- vmovdqu `16*0-64`($ctx),%x#$T2
- and \$-512,%rsp
- vmovdqu `16*1-64`($ctx),%x#$T3
- vmovdqu `16*2-64`($ctx),%x#$T4
- vmovdqu `16*3-64`($ctx),%x#$D0
- vmovdqu `16*4-64`($ctx),%x#$D1
- vmovdqu `16*5-64`($ctx),%x#$D2
- lea 0x90(%rsp),%rax # size optimization
- vmovdqu `16*6-64`($ctx),%x#$D3
- vpermd $T2,$T0,$T2 # 00003412 -> 14243444
- vmovdqu `16*7-64`($ctx),%x#$D4
- vpermd $T3,$T0,$T3
- vmovdqu `16*8-64`($ctx),%x#$MASK
- vpermd $T4,$T0,$T4
- vmovdqa $T2,0x00(%rsp)
- vpermd $D0,$T0,$D0
- vmovdqa $T3,0x20-0x90(%rax)
- vpermd $D1,$T0,$D1
- vmovdqa $T4,0x40-0x90(%rax)
- vpermd $D2,$T0,$D2
- vmovdqa $D0,0x60-0x90(%rax)
- vpermd $D3,$T0,$D3
- vmovdqa $D1,0x80-0x90(%rax)
- vpermd $D4,$T0,$D4
- vmovdqa $D2,0xa0-0x90(%rax)
- vpermd $MASK,$T0,$MASK
- vmovdqa $D3,0xc0-0x90(%rax)
- vmovdqa $D4,0xe0-0x90(%rax)
- vmovdqa $MASK,0x100-0x90(%rax)
- vmovdqa 64(%rcx),$MASK # .Lmask26
-
- ################################################################
- # load input
- vmovdqu 16*0($inp),%x#$T0
- vmovdqu 16*1($inp),%x#$T1
- vinserti128 \$1,16*2($inp),$T0,$T0
- vinserti128 \$1,16*3($inp),$T1,$T1
- lea 16*4($inp),$inp
-
- vpsrldq \$6,$T0,$T2 # splat input
- vpsrldq \$6,$T1,$T3
- vpunpckhqdq $T1,$T0,$T4 # 4
- vpunpcklqdq $T3,$T2,$T2 # 2:3
- vpunpcklqdq $T1,$T0,$T0 # 0:1
-
- vpsrlq \$30,$T2,$T3
- vpsrlq \$4,$T2,$T2
- vpsrlq \$26,$T0,$T1
- vpsrlq \$40,$T4,$T4 # 4
- vpand $MASK,$T2,$T2 # 2
- vpand $MASK,$T0,$T0 # 0
- vpand $MASK,$T1,$T1 # 1
- vpand $MASK,$T3,$T3 # 3
- vpor 32(%rcx),$T4,$T4 # padbit, yes, always
-
- vpaddq $H2,$T2,$H2 # accumulate input
- sub \$64,$len
- jz .Ltail_avx2$suffix
- jmp .Loop_avx2$suffix
-
-.align 32
-.Loop_avx2$suffix:
- ################################################################
- # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
- # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
- # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
- # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
- # \________/\__________/
- ################################################################
- #vpaddq $H2,$T2,$H2 # accumulate input
- vpaddq $H0,$T0,$H0
- vmovdqa `32*0`(%rsp),$T0 # r0^4
- vpaddq $H1,$T1,$H1
- vmovdqa `32*1`(%rsp),$T1 # r1^4
- vpaddq $H3,$T3,$H3
- vmovdqa `32*3`(%rsp),$T2 # r2^4
- vpaddq $H4,$T4,$H4
- vmovdqa `32*6-0x90`(%rax),$T3 # s3^4
- vmovdqa `32*8-0x90`(%rax),$S4 # s4^4
-
- # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
- #
- # however, as h2 is "chronologically" first one available pull
- # corresponding operations up, so it's
- #
- # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4
- # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
- # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4
-
- vpmuludq $H2,$T0,$D2 # d2 = h2*r0
- vpmuludq $H2,$T1,$D3 # d3 = h2*r1
- vpmuludq $H2,$T2,$D4 # d4 = h2*r2
- vpmuludq $H2,$T3,$D0 # d0 = h2*s3
- vpmuludq $H2,$S4,$D1 # d1 = h2*s4
-
- vpmuludq $H0,$T1,$T4 # h0*r1
- vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp
- vpaddq $T4,$D1,$D1 # d1 += h0*r1
- vpaddq $H2,$D2,$D2 # d2 += h1*r1
- vpmuludq $H3,$T1,$T4 # h3*r1
- vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1
- vpaddq $T4,$D4,$D4 # d4 += h3*r1
- vpaddq $H2,$D0,$D0 # d0 += h4*s1
- vmovdqa `32*4-0x90`(%rax),$T1 # s2
-
- vpmuludq $H0,$T0,$T4 # h0*r0
- vpmuludq $H1,$T0,$H2 # h1*r0
- vpaddq $T4,$D0,$D0 # d0 += h0*r0
- vpaddq $H2,$D1,$D1 # d1 += h1*r0
- vpmuludq $H3,$T0,$T4 # h3*r0
- vpmuludq $H4,$T0,$H2 # h4*r0
- vmovdqu 16*0($inp),%x#$T0 # load input
- vpaddq $T4,$D3,$D3 # d3 += h3*r0
- vpaddq $H2,$D4,$D4 # d4 += h4*r0
- vinserti128 \$1,16*2($inp),$T0,$T0
-
- vpmuludq $H3,$T1,$T4 # h3*s2
- vpmuludq $H4,$T1,$H2 # h4*s2
- vmovdqu 16*1($inp),%x#$T1
- vpaddq $T4,$D0,$D0 # d0 += h3*s2
- vpaddq $H2,$D1,$D1 # d1 += h4*s2
- vmovdqa `32*5-0x90`(%rax),$H2 # r3
- vpmuludq $H1,$T2,$T4 # h1*r2
- vpmuludq $H0,$T2,$T2 # h0*r2
- vpaddq $T4,$D3,$D3 # d3 += h1*r2
- vpaddq $T2,$D2,$D2 # d2 += h0*r2
- vinserti128 \$1,16*3($inp),$T1,$T1
- lea 16*4($inp),$inp
-
- vpmuludq $H1,$H2,$T4 # h1*r3
- vpmuludq $H0,$H2,$H2 # h0*r3
- vpsrldq \$6,$T0,$T2 # splat input
- vpaddq $T4,$D4,$D4 # d4 += h1*r3
- vpaddq $H2,$D3,$D3 # d3 += h0*r3
- vpmuludq $H3,$T3,$T4 # h3*s3
- vpmuludq $H4,$T3,$H2 # h4*s3
- vpsrldq \$6,$T1,$T3
- vpaddq $T4,$D1,$D1 # d1 += h3*s3
- vpaddq $H2,$D2,$D2 # d2 += h4*s3
- vpunpckhqdq $T1,$T0,$T4 # 4
-
- vpmuludq $H3,$S4,$H3 # h3*s4
- vpmuludq $H4,$S4,$H4 # h4*s4
- vpunpcklqdq $T1,$T0,$T0 # 0:1
- vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
- vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
- vpunpcklqdq $T3,$T2,$T3 # 2:3
- vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4
- vpmuludq $H1,$S4,$H0 # h1*s4
- vmovdqa 64(%rcx),$MASK # .Lmask26
- vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
- vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
-
- ################################################################
- # lazy reduction (interleaved with tail of input splat)
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$D1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H4,$D4
- vpand $MASK,$H4,$H4
-
- vpsrlq \$4,$T3,$T2
-
- vpsrlq \$26,$H1,$D1
- vpand $MASK,$H1,$H1
- vpaddq $D1,$H2,$H2 # h1 -> h2
-
- vpaddq $D4,$H0,$H0
- vpsllq \$2,$D4,$D4
- vpaddq $D4,$H0,$H0 # h4 -> h0
-
- vpand $MASK,$T2,$T2 # 2
- vpsrlq \$26,$T0,$T1
-
- vpsrlq \$26,$H2,$D2
- vpand $MASK,$H2,$H2
- vpaddq $D2,$H3,$H3 # h2 -> h3
-
- vpaddq $T2,$H2,$H2 # modulo-scheduled
- vpsrlq \$30,$T3,$T3
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpsrlq \$40,$T4,$T4 # 4
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vpand $MASK,$T0,$T0 # 0
- vpand $MASK,$T1,$T1 # 1
- vpand $MASK,$T3,$T3 # 3
- vpor 32(%rcx),$T4,$T4 # padbit, yes, always
-
- sub \$64,$len
- jnz .Loop_avx2$suffix
-
- .byte 0x66,0x90
-.Ltail_avx2$suffix:
- ################################################################
- # while above multiplications were by r^4 in all lanes, in last
- # iteration we multiply least significant lane by r^4 and most
- # significant one by r, so copy of above except that references
- # to the precomputed table are displaced by 4...
-
- #vpaddq $H2,$T2,$H2 # accumulate input
- vpaddq $H0,$T0,$H0
- vmovdqu `32*0+4`(%rsp),$T0 # r0^4
- vpaddq $H1,$T1,$H1
- vmovdqu `32*1+4`(%rsp),$T1 # r1^4
- vpaddq $H3,$T3,$H3
- vmovdqu `32*3+4`(%rsp),$T2 # r2^4
- vpaddq $H4,$T4,$H4
- vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4
- vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4
-
- vpmuludq $H2,$T0,$D2 # d2 = h2*r0
- vpmuludq $H2,$T1,$D3 # d3 = h2*r1
- vpmuludq $H2,$T2,$D4 # d4 = h2*r2
- vpmuludq $H2,$T3,$D0 # d0 = h2*s3
- vpmuludq $H2,$S4,$D1 # d1 = h2*s4
-
- vpmuludq $H0,$T1,$T4 # h0*r1
- vpmuludq $H1,$T1,$H2 # h1*r1
- vpaddq $T4,$D1,$D1 # d1 += h0*r1
- vpaddq $H2,$D2,$D2 # d2 += h1*r1
- vpmuludq $H3,$T1,$T4 # h3*r1
- vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1
- vpaddq $T4,$D4,$D4 # d4 += h3*r1
- vpaddq $H2,$D0,$D0 # d0 += h4*s1
-
- vpmuludq $H0,$T0,$T4 # h0*r0
- vpmuludq $H1,$T0,$H2 # h1*r0
- vpaddq $T4,$D0,$D0 # d0 += h0*r0
- vmovdqu `32*4+4-0x90`(%rax),$T1 # s2
- vpaddq $H2,$D1,$D1 # d1 += h1*r0
- vpmuludq $H3,$T0,$T4 # h3*r0
- vpmuludq $H4,$T0,$H2 # h4*r0
- vpaddq $T4,$D3,$D3 # d3 += h3*r0
- vpaddq $H2,$D4,$D4 # d4 += h4*r0
-
- vpmuludq $H3,$T1,$T4 # h3*s2
- vpmuludq $H4,$T1,$H2 # h4*s2
- vpaddq $T4,$D0,$D0 # d0 += h3*s2
- vpaddq $H2,$D1,$D1 # d1 += h4*s2
- vmovdqu `32*5+4-0x90`(%rax),$H2 # r3
- vpmuludq $H1,$T2,$T4 # h1*r2
- vpmuludq $H0,$T2,$T2 # h0*r2
- vpaddq $T4,$D3,$D3 # d3 += h1*r2
- vpaddq $T2,$D2,$D2 # d2 += h0*r2
-
- vpmuludq $H1,$H2,$T4 # h1*r3
- vpmuludq $H0,$H2,$H2 # h0*r3
- vpaddq $T4,$D4,$D4 # d4 += h1*r3
- vpaddq $H2,$D3,$D3 # d3 += h0*r3
- vpmuludq $H3,$T3,$T4 # h3*s3
- vpmuludq $H4,$T3,$H2 # h4*s3
- vpaddq $T4,$D1,$D1 # d1 += h3*s3
- vpaddq $H2,$D2,$D2 # d2 += h4*s3
-
- vpmuludq $H3,$S4,$H3 # h3*s4
- vpmuludq $H4,$S4,$H4 # h4*s4
- vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
- vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
- vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4
- vpmuludq $H1,$S4,$H0 # h1*s4
- vmovdqa 64(%rcx),$MASK # .Lmask26
- vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
- vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
-
- ################################################################
- # horizontal addition
-
- vpsrldq \$8,$D1,$T1
- vpsrldq \$8,$H2,$T2
- vpsrldq \$8,$H3,$T3
- vpsrldq \$8,$H4,$T4
- vpsrldq \$8,$H0,$T0
- vpaddq $T1,$D1,$D1
- vpaddq $T2,$H2,$H2
- vpaddq $T3,$H3,$H3
- vpaddq $T4,$H4,$H4
- vpaddq $T0,$H0,$H0
-
- vpermq \$0x2,$H3,$T3
- vpermq \$0x2,$H4,$T4
- vpermq \$0x2,$H0,$T0
- vpermq \$0x2,$D1,$T1
- vpermq \$0x2,$H2,$T2
- vpaddq $T3,$H3,$H3
- vpaddq $T4,$H4,$H4
- vpaddq $T0,$H0,$H0
- vpaddq $T1,$D1,$D1
- vpaddq $T2,$H2,$H2
-
- ################################################################
- # lazy reduction
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$D1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H4,$D4
- vpand $MASK,$H4,$H4
-
- vpsrlq \$26,$H1,$D1
- vpand $MASK,$H1,$H1
- vpaddq $D1,$H2,$H2 # h1 -> h2
-
- vpaddq $D4,$H0,$H0
- vpsllq \$2,$D4,$D4
- vpaddq $D4,$H0,$H0 # h4 -> h0
-
- vpsrlq \$26,$H2,$D2
- vpand $MASK,$H2,$H2
- vpaddq $D2,$H3,$H3 # h2 -> h3
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
- vmovd %x#$H1,`4*1-48-64`($ctx)
- vmovd %x#$H2,`4*2-48-64`($ctx)
- vmovd %x#$H3,`4*3-48-64`($ctx)
- vmovd %x#$H4,`4*4-48-64`($ctx)
-___
-$code.=<<___ if ($win64);
- vmovdqa -0xb0(%r10),%xmm6
- vmovdqa -0xa0(%r10),%xmm7
- vmovdqa -0x90(%r10),%xmm8
- vmovdqa -0x80(%r10),%xmm9
- vmovdqa -0x70(%r10),%xmm10
- vmovdqa -0x60(%r10),%xmm11
- vmovdqa -0x50(%r10),%xmm12
- vmovdqa -0x40(%r10),%xmm13
- vmovdqa -0x30(%r10),%xmm14
- vmovdqa -0x20(%r10),%xmm15
- lea -8(%r10),%rsp
-.Ldo_avx2_epilogue$suffix:
-___
-$code.=<<___ if (!$win64);
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-___
-$code.=<<___;
- vzeroupper
- RET
-.cfi_endproc
-___
-if($avx > 2 && $avx512) {
-my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
-my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
-my $PADBIT="%zmm30";
-
-map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain
-map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
-map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
-map(s/%y/%z/,($MASK));
-
-$code.=<<___;
-.cfi_startproc
-.Lblocks_avx512:
- mov \$15,%eax
- kmovw %eax,%k2
-___
-$code.=<<___ if (!$win64);
- lea 8(%rsp),%r10
-.cfi_def_cfa_register %r10
- sub \$0x128,%rsp
-___
-$code.=<<___ if ($win64);
- lea 8(%rsp),%r10
- sub \$0x1c8,%rsp
- vmovdqa %xmm6,-0xb0(%r10)
- vmovdqa %xmm7,-0xa0(%r10)
- vmovdqa %xmm8,-0x90(%r10)
- vmovdqa %xmm9,-0x80(%r10)
- vmovdqa %xmm10,-0x70(%r10)
- vmovdqa %xmm11,-0x60(%r10)
- vmovdqa %xmm12,-0x50(%r10)
- vmovdqa %xmm13,-0x40(%r10)
- vmovdqa %xmm14,-0x30(%r10)
- vmovdqa %xmm15,-0x20(%r10)
-.Ldo_avx512_body:
-___
-$code.=<<___;
- lea .Lconst(%rip),%rcx
- lea 48+64($ctx),$ctx # size optimization
- vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2
-
- # expand pre-calculated table
- vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0}
- and \$-512,%rsp
- vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1}
- mov \$0x20,%rax
- vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1}
- vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2}
- vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2}
- vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3}
- vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3}
- vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4}
- vmovdqu `16*8-64`($ctx),%x#$T4 # ... ${S4}
- vpermd $D0,$T2,$R0 # 00003412 -> 14243444
- vpbroadcastq 64(%rcx),$MASK # .Lmask26
- vpermd $D1,$T2,$R1
- vpermd $T0,$T2,$S1
- vpermd $D2,$T2,$R2
- vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0
- vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304
- vpermd $T1,$T2,$S2
- vmovdqu64 $R1,0x00(%rsp,%rax){%k2}
- vpsrlq \$32,$R1,$T1
- vpermd $D3,$T2,$R3
- vmovdqa64 $S1,0x40(%rsp){%k2}
- vpermd $T3,$T2,$S3
- vpermd $D4,$T2,$R4
- vmovdqu64 $R2,0x40(%rsp,%rax){%k2}
- vpermd $T4,$T2,$S4
- vmovdqa64 $S2,0x80(%rsp){%k2}
- vmovdqu64 $R3,0x80(%rsp,%rax){%k2}
- vmovdqa64 $S3,0xc0(%rsp){%k2}
- vmovdqu64 $R4,0xc0(%rsp,%rax){%k2}
- vmovdqa64 $S4,0x100(%rsp){%k2}
-
- ################################################################
- # calculate 5th through 8th powers of the key
- #
- # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
- # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
- # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3
- # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4
- # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0
-
- vpmuludq $T0,$R0,$D0 # d0 = r0'*r0
- vpmuludq $T0,$R1,$D1 # d1 = r0'*r1
- vpmuludq $T0,$R2,$D2 # d2 = r0'*r2
- vpmuludq $T0,$R3,$D3 # d3 = r0'*r3
- vpmuludq $T0,$R4,$D4 # d4 = r0'*r4
- vpsrlq \$32,$R2,$T2
-
- vpmuludq $T1,$S4,$M0
- vpmuludq $T1,$R0,$M1
- vpmuludq $T1,$R1,$M2
- vpmuludq $T1,$R2,$M3
- vpmuludq $T1,$R3,$M4
- vpsrlq \$32,$R3,$T3
- vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4
- vpaddq $M1,$D1,$D1 # d1 += r1'*r0
- vpaddq $M2,$D2,$D2 # d2 += r1'*r1
- vpaddq $M3,$D3,$D3 # d3 += r1'*r2
- vpaddq $M4,$D4,$D4 # d4 += r1'*r3
-
- vpmuludq $T2,$S3,$M0
- vpmuludq $T2,$S4,$M1
- vpmuludq $T2,$R1,$M3
- vpmuludq $T2,$R2,$M4
- vpmuludq $T2,$R0,$M2
- vpsrlq \$32,$R4,$T4
- vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3
- vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4
- vpaddq $M3,$D3,$D3 # d3 += r2'*r1
- vpaddq $M4,$D4,$D4 # d4 += r2'*r2
- vpaddq $M2,$D2,$D2 # d2 += r2'*r0
-
- vpmuludq $T3,$S2,$M0
- vpmuludq $T3,$R0,$M3
- vpmuludq $T3,$R1,$M4
- vpmuludq $T3,$S3,$M1
- vpmuludq $T3,$S4,$M2
- vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2
- vpaddq $M3,$D3,$D3 # d3 += r3'*r0
- vpaddq $M4,$D4,$D4 # d4 += r3'*r1
- vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3
- vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4
-
- vpmuludq $T4,$S4,$M3
- vpmuludq $T4,$R0,$M4
- vpmuludq $T4,$S1,$M0
- vpmuludq $T4,$S2,$M1
- vpmuludq $T4,$S3,$M2
- vpaddq $M3,$D3,$D3 # d3 += r2'*5*r4
- vpaddq $M4,$D4,$D4 # d4 += r2'*r0
- vpaddq $M0,$D0,$D0 # d0 += r2'*5*r1
- vpaddq $M1,$D1,$D1 # d1 += r2'*5*r2
- vpaddq $M2,$D2,$D2 # d2 += r2'*5*r3
-
- ################################################################
- # load input
- vmovdqu64 16*0($inp),%z#$T3
- vmovdqu64 16*4($inp),%z#$T4
- lea 16*8($inp),$inp
-
- ################################################################
- # lazy reduction
-
- vpsrlq \$26,$D3,$M3
- vpandq $MASK,$D3,$D3
- vpaddq $M3,$D4,$D4 # d3 -> d4
-
- vpsrlq \$26,$D0,$M0
- vpandq $MASK,$D0,$D0
- vpaddq $M0,$D1,$D1 # d0 -> d1
-
- vpsrlq \$26,$D4,$M4
- vpandq $MASK,$D4,$D4
-
- vpsrlq \$26,$D1,$M1
- vpandq $MASK,$D1,$D1
- vpaddq $M1,$D2,$D2 # d1 -> d2
-
- vpaddq $M4,$D0,$D0
- vpsllq \$2,$M4,$M4
- vpaddq $M4,$D0,$D0 # d4 -> d0
-
- vpsrlq \$26,$D2,$M2
- vpandq $MASK,$D2,$D2
- vpaddq $M2,$D3,$D3 # d2 -> d3
-
- vpsrlq \$26,$D0,$M0
- vpandq $MASK,$D0,$D0
- vpaddq $M0,$D1,$D1 # d0 -> d1
-
- vpsrlq \$26,$D3,$M3
- vpandq $MASK,$D3,$D3
- vpaddq $M3,$D4,$D4 # d3 -> d4
-
- ################################################################
- # at this point we have 14243444 in $R0-$S4 and 05060708 in
- # $D0-$D4, ...
-
- vpunpcklqdq $T4,$T3,$T0 # transpose input
- vpunpckhqdq $T4,$T3,$T4
-
- # ... since input 64-bit lanes are ordered as 73625140, we could
- # "vperm" it to 76543210 (here and in each loop iteration), *or*
- # we could just flow along, hence the goal for $R0-$S4 is
- # 1858286838784888 ...
-
- vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512:
- mov \$0x7777,%eax
- kmovw %eax,%k1
-
- vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4---
- vpermd $R1,$M0,$R1
- vpermd $R2,$M0,$R2
- vpermd $R3,$M0,$R3
- vpermd $R4,$M0,$R4
-
- vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888
- vpermd $D1,$M0,${R1}{%k1}
- vpermd $D2,$M0,${R2}{%k1}
- vpermd $D3,$M0,${R3}{%k1}
- vpermd $D4,$M0,${R4}{%k1}
-
- vpslld \$2,$R1,$S1 # *5
- vpslld \$2,$R2,$S2
- vpslld \$2,$R3,$S3
- vpslld \$2,$R4,$S4
- vpaddd $R1,$S1,$S1
- vpaddd $R2,$S2,$S2
- vpaddd $R3,$S3,$S3
- vpaddd $R4,$S4,$S4
-
- vpbroadcastq 32(%rcx),$PADBIT # .L129
-
- vpsrlq \$52,$T0,$T2 # splat input
- vpsllq \$12,$T4,$T3
- vporq $T3,$T2,$T2
- vpsrlq \$26,$T0,$T1
- vpsrlq \$14,$T4,$T3
- vpsrlq \$40,$T4,$T4 # 4
- vpandq $MASK,$T2,$T2 # 2
- vpandq $MASK,$T0,$T0 # 0
- #vpandq $MASK,$T1,$T1 # 1
- #vpandq $MASK,$T3,$T3 # 3
- #vporq $PADBIT,$T4,$T4 # padbit, yes, always
-
- vpaddq $H2,$T2,$H2 # accumulate input
- sub \$192,$len
- jbe .Ltail_avx512
- jmp .Loop_avx512
-
-.align 32
-.Loop_avx512:
- ################################################################
- # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
- # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
- # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
- # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
- # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
- # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
- # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
- # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
- # \________/\___________/
- ################################################################
- #vpaddq $H2,$T2,$H2 # accumulate input
-
- # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
- #
- # however, as h2 is "chronologically" first one available pull
- # corresponding operations up, so it's
- #
- # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4
- # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0
- # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1
- # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2
- # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3
-
- vpmuludq $H2,$R1,$D3 # d3 = h2*r1
- vpaddq $H0,$T0,$H0
- vpmuludq $H2,$R2,$D4 # d4 = h2*r2
- vpandq $MASK,$T1,$T1 # 1
- vpmuludq $H2,$S3,$D0 # d0 = h2*s3
- vpandq $MASK,$T3,$T3 # 3
- vpmuludq $H2,$S4,$D1 # d1 = h2*s4
- vporq $PADBIT,$T4,$T4 # padbit, yes, always
- vpmuludq $H2,$R0,$D2 # d2 = h2*r0
- vpaddq $H1,$T1,$H1 # accumulate input
- vpaddq $H3,$T3,$H3
- vpaddq $H4,$T4,$H4
-
- vmovdqu64 16*0($inp),$T3 # load input
- vmovdqu64 16*4($inp),$T4
- lea 16*8($inp),$inp
- vpmuludq $H0,$R3,$M3
- vpmuludq $H0,$R4,$M4
- vpmuludq $H0,$R0,$M0
- vpmuludq $H0,$R1,$M1
- vpaddq $M3,$D3,$D3 # d3 += h0*r3
- vpaddq $M4,$D4,$D4 # d4 += h0*r4
- vpaddq $M0,$D0,$D0 # d0 += h0*r0
- vpaddq $M1,$D1,$D1 # d1 += h0*r1
-
- vpmuludq $H1,$R2,$M3
- vpmuludq $H1,$R3,$M4
- vpmuludq $H1,$S4,$M0
- vpmuludq $H0,$R2,$M2
- vpaddq $M3,$D3,$D3 # d3 += h1*r2
- vpaddq $M4,$D4,$D4 # d4 += h1*r3
- vpaddq $M0,$D0,$D0 # d0 += h1*s4
- vpaddq $M2,$D2,$D2 # d2 += h0*r2
-
- vpunpcklqdq $T4,$T3,$T0 # transpose input
- vpunpckhqdq $T4,$T3,$T4
-
- vpmuludq $H3,$R0,$M3
- vpmuludq $H3,$R1,$M4
- vpmuludq $H1,$R0,$M1
- vpmuludq $H1,$R1,$M2
- vpaddq $M3,$D3,$D3 # d3 += h3*r0
- vpaddq $M4,$D4,$D4 # d4 += h3*r1
- vpaddq $M1,$D1,$D1 # d1 += h1*r0
- vpaddq $M2,$D2,$D2 # d2 += h1*r1
-
- vpmuludq $H4,$S4,$M3
- vpmuludq $H4,$R0,$M4
- vpmuludq $H3,$S2,$M0
- vpmuludq $H3,$S3,$M1
- vpaddq $M3,$D3,$D3 # d3 += h4*s4
- vpmuludq $H3,$S4,$M2
- vpaddq $M4,$D4,$D4 # d4 += h4*r0
- vpaddq $M0,$D0,$D0 # d0 += h3*s2
- vpaddq $M1,$D1,$D1 # d1 += h3*s3
- vpaddq $M2,$D2,$D2 # d2 += h3*s4
-
- vpmuludq $H4,$S1,$M0
- vpmuludq $H4,$S2,$M1
- vpmuludq $H4,$S3,$M2
- vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
- vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2
- vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3
-
- ################################################################
- # lazy reduction (interleaved with input splat)
-
- vpsrlq \$52,$T0,$T2 # splat input
- vpsllq \$12,$T4,$T3
-
- vpsrlq \$26,$D3,$H3
- vpandq $MASK,$D3,$D3
- vpaddq $H3,$D4,$H4 # h3 -> h4
-
- vporq $T3,$T2,$T2
-
- vpsrlq \$26,$H0,$D0
- vpandq $MASK,$H0,$H0
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpandq $MASK,$T2,$T2 # 2
-
- vpsrlq \$26,$H4,$D4
- vpandq $MASK,$H4,$H4
-
- vpsrlq \$26,$H1,$D1
- vpandq $MASK,$H1,$H1
- vpaddq $D1,$H2,$H2 # h1 -> h2
-
- vpaddq $D4,$H0,$H0
- vpsllq \$2,$D4,$D4
- vpaddq $D4,$H0,$H0 # h4 -> h0
-
- vpaddq $T2,$H2,$H2 # modulo-scheduled
- vpsrlq \$26,$T0,$T1
-
- vpsrlq \$26,$H2,$D2
- vpandq $MASK,$H2,$H2
- vpaddq $D2,$D3,$H3 # h2 -> h3
-
- vpsrlq \$14,$T4,$T3
-
- vpsrlq \$26,$H0,$D0
- vpandq $MASK,$H0,$H0
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpsrlq \$40,$T4,$T4 # 4
-
- vpsrlq \$26,$H3,$D3
- vpandq $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vpandq $MASK,$T0,$T0 # 0
- #vpandq $MASK,$T1,$T1 # 1
- #vpandq $MASK,$T3,$T3 # 3
- #vporq $PADBIT,$T4,$T4 # padbit, yes, always
-
- sub \$128,$len
- ja .Loop_avx512
-
-.Ltail_avx512:
- ################################################################
- # while above multiplications were by r^8 in all lanes, in last
- # iteration we multiply least significant lane by r^8 and most
- # significant one by r, that's why table gets shifted...
-
- vpsrlq \$32,$R0,$R0 # 0105020603070408
- vpsrlq \$32,$R1,$R1
- vpsrlq \$32,$R2,$R2
- vpsrlq \$32,$S3,$S3
- vpsrlq \$32,$S4,$S4
- vpsrlq \$32,$R3,$R3
- vpsrlq \$32,$R4,$R4
- vpsrlq \$32,$S1,$S1
- vpsrlq \$32,$S2,$S2
-
- ################################################################
- # load either next or last 64 byte of input
- lea ($inp,$len),$inp
-
- #vpaddq $H2,$T2,$H2 # accumulate input
- vpaddq $H0,$T0,$H0
-
- vpmuludq $H2,$R1,$D3 # d3 = h2*r1
- vpmuludq $H2,$R2,$D4 # d4 = h2*r2
- vpmuludq $H2,$S3,$D0 # d0 = h2*s3
- vpandq $MASK,$T1,$T1 # 1
- vpmuludq $H2,$S4,$D1 # d1 = h2*s4
- vpandq $MASK,$T3,$T3 # 3
- vpmuludq $H2,$R0,$D2 # d2 = h2*r0
- vporq $PADBIT,$T4,$T4 # padbit, yes, always
- vpaddq $H1,$T1,$H1 # accumulate input
- vpaddq $H3,$T3,$H3
- vpaddq $H4,$T4,$H4
-
- vmovdqu 16*0($inp),%x#$T0
- vpmuludq $H0,$R3,$M3
- vpmuludq $H0,$R4,$M4
- vpmuludq $H0,$R0,$M0
- vpmuludq $H0,$R1,$M1
- vpaddq $M3,$D3,$D3 # d3 += h0*r3
- vpaddq $M4,$D4,$D4 # d4 += h0*r4
- vpaddq $M0,$D0,$D0 # d0 += h0*r0
- vpaddq $M1,$D1,$D1 # d1 += h0*r1
-
- vmovdqu 16*1($inp),%x#$T1
- vpmuludq $H1,$R2,$M3
- vpmuludq $H1,$R3,$M4
- vpmuludq $H1,$S4,$M0
- vpmuludq $H0,$R2,$M2
- vpaddq $M3,$D3,$D3 # d3 += h1*r2
- vpaddq $M4,$D4,$D4 # d4 += h1*r3
- vpaddq $M0,$D0,$D0 # d0 += h1*s4
- vpaddq $M2,$D2,$D2 # d2 += h0*r2
-
- vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0
- vpmuludq $H3,$R0,$M3
- vpmuludq $H3,$R1,$M4
- vpmuludq $H1,$R0,$M1
- vpmuludq $H1,$R1,$M2
- vpaddq $M3,$D3,$D3 # d3 += h3*r0
- vpaddq $M4,$D4,$D4 # d4 += h3*r1
- vpaddq $M1,$D1,$D1 # d1 += h1*r0
- vpaddq $M2,$D2,$D2 # d2 += h1*r1
-
- vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1
- vpmuludq $H4,$S4,$M3
- vpmuludq $H4,$R0,$M4
- vpmuludq $H3,$S2,$M0
- vpmuludq $H3,$S3,$M1
- vpmuludq $H3,$S4,$M2
- vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4
- vpaddq $M4,$D4,$D4 # d4 += h4*r0
- vpaddq $M0,$D0,$D0 # d0 += h3*s2
- vpaddq $M1,$D1,$D1 # d1 += h3*s3
- vpaddq $M2,$D2,$D2 # d2 += h3*s4
-
- vpmuludq $H4,$S1,$M0
- vpmuludq $H4,$S2,$M1
- vpmuludq $H4,$S3,$M2
- vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
- vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2
- vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3
-
- ################################################################
- # horizontal addition
-
- mov \$1,%eax
- vpermq \$0xb1,$H3,$D3
- vpermq \$0xb1,$D4,$H4
- vpermq \$0xb1,$H0,$D0
- vpermq \$0xb1,$H1,$D1
- vpermq \$0xb1,$H2,$D2
- vpaddq $D3,$H3,$H3
- vpaddq $D4,$H4,$H4
- vpaddq $D0,$H0,$H0
- vpaddq $D1,$H1,$H1
- vpaddq $D2,$H2,$H2
-
- kmovw %eax,%k3
- vpermq \$0x2,$H3,$D3
- vpermq \$0x2,$H4,$D4
- vpermq \$0x2,$H0,$D0
- vpermq \$0x2,$H1,$D1
- vpermq \$0x2,$H2,$D2
- vpaddq $D3,$H3,$H3
- vpaddq $D4,$H4,$H4
- vpaddq $D0,$H0,$H0
- vpaddq $D1,$H1,$H1
- vpaddq $D2,$H2,$H2
-
- vextracti64x4 \$0x1,$H3,%y#$D3
- vextracti64x4 \$0x1,$H4,%y#$D4
- vextracti64x4 \$0x1,$H0,%y#$D0
- vextracti64x4 \$0x1,$H1,%y#$D1
- vextracti64x4 \$0x1,$H2,%y#$D2
- vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case
- vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2
- vpaddq $D0,$H0,${H0}{%k3}{z}
- vpaddq $D1,$H1,${H1}{%k3}{z}
- vpaddq $D2,$H2,${H2}{%k3}{z}
-___
-map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
-map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
-$code.=<<___;
- ################################################################
- # lazy reduction (interleaved with input splat)
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpsrldq \$6,$T0,$T2 # splat input
- vpsrldq \$6,$T1,$T3
- vpunpckhqdq $T1,$T0,$T4 # 4
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpunpcklqdq $T3,$T2,$T2 # 2:3
- vpunpcklqdq $T1,$T0,$T0 # 0:1
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H4,$D4
- vpand $MASK,$H4,$H4
-
- vpsrlq \$26,$H1,$D1
- vpand $MASK,$H1,$H1
- vpsrlq \$30,$T2,$T3
- vpsrlq \$4,$T2,$T2
- vpaddq $D1,$H2,$H2 # h1 -> h2
-
- vpaddq $D4,$H0,$H0
- vpsllq \$2,$D4,$D4
- vpsrlq \$26,$T0,$T1
- vpsrlq \$40,$T4,$T4 # 4
- vpaddq $D4,$H0,$H0 # h4 -> h0
-
- vpsrlq \$26,$H2,$D2
- vpand $MASK,$H2,$H2
- vpand $MASK,$T2,$T2 # 2
- vpand $MASK,$T0,$T0 # 0
- vpaddq $D2,$H3,$H3 # h2 -> h3
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2
- vpand $MASK,$T1,$T1 # 1
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpand $MASK,$T3,$T3 # 3
- vpor 32(%rcx),$T4,$T4 # padbit, yes, always
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2
- add \$64,$len
- jnz .Ltail_avx2$suffix
-
- vpsubq $T2,$H2,$H2 # undo input accumulation
- vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
- vmovd %x#$H1,`4*1-48-64`($ctx)
- vmovd %x#$H2,`4*2-48-64`($ctx)
- vmovd %x#$H3,`4*3-48-64`($ctx)
- vmovd %x#$H4,`4*4-48-64`($ctx)
- vzeroall
-___
-$code.=<<___ if ($win64);
- movdqa -0xb0(%r10),%xmm6
- movdqa -0xa0(%r10),%xmm7
- movdqa -0x90(%r10),%xmm8
- movdqa -0x80(%r10),%xmm9
- movdqa -0x70(%r10),%xmm10
- movdqa -0x60(%r10),%xmm11
- movdqa -0x50(%r10),%xmm12
- movdqa -0x40(%r10),%xmm13
- movdqa -0x30(%r10),%xmm14
- movdqa -0x20(%r10),%xmm15
- lea -8(%r10),%rsp
-.Ldo_avx512_epilogue:
-___
-$code.=<<___ if (!$win64);
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-___
-$code.=<<___;
- RET
-.cfi_endproc
-___
-
-}
-
-}
-
-&declare_function("poly1305_blocks_avx2", 32, 4);
-poly1305_blocks_avxN(0);
-&end_function("poly1305_blocks_avx2");
-
-#######################################################################
-if ($avx>2) {
-# On entry we have input length divisible by 64. But since inner loop
-# processes 128 bytes per iteration, cases when length is not divisible
-# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this
-# reason stack layout is kept identical to poly1305_blocks_avx2. If not
-# for this tail, we wouldn't have to even allocate stack frame...
-
-&declare_function("poly1305_blocks_avx512", 32, 4);
-poly1305_blocks_avxN(1);
-&end_function("poly1305_blocks_avx512");
-
-if (!$kernel && $avx>3) {
-########################################################################
-# VPMADD52 version using 2^44 radix.
-#
-# One can argue that base 2^52 would be more natural. Well, even though
-# some operations would be more natural, one has to recognize couple of
-# things. Base 2^52 doesn't provide advantage over base 2^44 if you look
-# at amount of multiply-n-accumulate operations. Secondly, it makes it
-# impossible to pre-compute multiples of 5 [referred to as s[]/sN in
-# reference implementations], which means that more such operations
-# would have to be performed in inner loop, which in turn makes critical
-# path longer. In other words, even though base 2^44 reduction might
-# look less elegant, overall critical path is actually shorter...
-
-########################################################################
-# Layout of opaque area is following.
-#
-# unsigned __int64 h[3]; # current hash value base 2^44
-# unsigned __int64 s[2]; # key value*20 base 2^44
-# unsigned __int64 r[3]; # key value base 2^44
-# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
-# # r^n positions reflect
-# # placement in register, not
-# # memory, R[3] is R[1]*20
-
-$code.=<<___;
-.type poly1305_init_base2_44,\@function,3
-.align 32
-poly1305_init_base2_44:
- xor %eax,%eax
- mov %rax,0($ctx) # initialize hash value
- mov %rax,8($ctx)
- mov %rax,16($ctx)
-
-.Linit_base2_44:
- lea poly1305_blocks_vpmadd52(%rip),%r10
- lea poly1305_emit_base2_44(%rip),%r11
-
- mov \$0x0ffffffc0fffffff,%rax
- mov \$0x0ffffffc0ffffffc,%rcx
- and 0($inp),%rax
- mov \$0x00000fffffffffff,%r8
- and 8($inp),%rcx
- mov \$0x00000fffffffffff,%r9
- and %rax,%r8
- shrd \$44,%rcx,%rax
- mov %r8,40($ctx) # r0
- and %r9,%rax
- shr \$24,%rcx
- mov %rax,48($ctx) # r1
- lea (%rax,%rax,4),%rax # *5
- mov %rcx,56($ctx) # r2
- shl \$2,%rax # magic <<2
- lea (%rcx,%rcx,4),%rcx # *5
- shl \$2,%rcx # magic <<2
- mov %rax,24($ctx) # s1
- mov %rcx,32($ctx) # s2
- movq \$-1,64($ctx) # write impossible value
-___
-$code.=<<___ if ($flavour !~ /elf32/);
- mov %r10,0(%rdx)
- mov %r11,8(%rdx)
-___
-$code.=<<___ if ($flavour =~ /elf32/);
- mov %r10d,0(%rdx)
- mov %r11d,4(%rdx)
-___
-$code.=<<___;
- mov \$1,%eax
- RET
-.size poly1305_init_base2_44,.-poly1305_init_base2_44
-___
-{
-my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
-my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
-my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
-
-$code.=<<___;
-.type poly1305_blocks_vpmadd52,\@function,4
-.align 32
-poly1305_blocks_vpmadd52:
- shr \$4,$len
- jz .Lno_data_vpmadd52 # too short
-
- shl \$40,$padbit
- mov 64($ctx),%r8 # peek on power of the key
-
- # if powers of the key are not calculated yet, process up to 3
- # blocks with this single-block subroutine, otherwise ensure that
- # length is divisible by 2 blocks and pass the rest down to next
- # subroutine...
-
- mov \$3,%rax
- mov \$1,%r10
- cmp \$4,$len # is input long
- cmovae %r10,%rax
- test %r8,%r8 # is power value impossible?
- cmovns %r10,%rax
-
- and $len,%rax # is input of favourable length?
- jz .Lblocks_vpmadd52_4x
-
- sub %rax,$len
- mov \$7,%r10d
- mov \$1,%r11d
- kmovw %r10d,%k7
- lea .L2_44_inp_permd(%rip),%r10
- kmovw %r11d,%k1
-
- vmovq $padbit,%x#$PAD
- vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd
- vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift
- vpermq \$0xcf,$PAD,$PAD
- vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask
-
- vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value
- vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys
- vmovdqu64 32($ctx),${r1r0s2}{%k7}{z}
- vmovdqu64 24($ctx),${r0s2s1}{%k7}{z}
-
- vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt
- vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft
-
- jmp .Loop_vpmadd52
-
-.align 32
-.Loop_vpmadd52:
- vmovdqu32 0($inp),%x#$T0 # load input as ----3210
- lea 16($inp),$inp
-
- vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110
- vpsrlvq $inp_shift,$T0,$T0
- vpandq $reduc_mask,$T0,$T0
- vporq $PAD,$T0,$T0
-
- vpaddq $T0,$Dlo,$Dlo # accumulate input
-
- vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value
- vpermq \$0b01010101,$Dlo,${H1}{%k7}{z}
- vpermq \$0b10101010,$Dlo,${H2}{%k7}{z}
-
- vpxord $Dlo,$Dlo,$Dlo
- vpxord $Dhi,$Dhi,$Dhi
-
- vpmadd52luq $r2r1r0,$H0,$Dlo
- vpmadd52huq $r2r1r0,$H0,$Dhi
-
- vpmadd52luq $r1r0s2,$H1,$Dlo
- vpmadd52huq $r1r0s2,$H1,$Dhi
-
- vpmadd52luq $r0s2s1,$H2,$Dlo
- vpmadd52huq $r0s2s1,$H2,$Dhi
-
- vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword
- vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword
- vpandq $reduc_mask,$Dlo,$Dlo
-
- vpaddq $T0,$Dhi,$Dhi
-
- vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword
-
- vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-)
-
- vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word
- vpandq $reduc_mask,$Dlo,$Dlo
-
- vpermq \$0b10010011,$T0,$T0
-
- vpaddq $T0,$Dlo,$Dlo
-
- vpermq \$0b10010011,$Dlo,${T0}{%k1}{z}
-
- vpaddq $T0,$Dlo,$Dlo
- vpsllq \$2,$T0,$T0
-
- vpaddq $T0,$Dlo,$Dlo
-
- dec %rax # len-=16
- jnz .Loop_vpmadd52
-
- vmovdqu64 $Dlo,0($ctx){%k7} # store hash value
-
- test $len,$len
- jnz .Lblocks_vpmadd52_4x
-
-.Lno_data_vpmadd52:
- RET
-.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
-___
-}
-{
-########################################################################
-# As implied by its name 4x subroutine processes 4 blocks in parallel
-# (but handles even 4*n+2 blocks lengths). It takes up to 4th key power
-# and is handled in 256-bit %ymm registers.
-
-my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
-my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
-my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
-
-$code.=<<___;
-.type poly1305_blocks_vpmadd52_4x,\@function,4
-.align 32
-poly1305_blocks_vpmadd52_4x:
- shr \$4,$len
- jz .Lno_data_vpmadd52_4x # too short
-
- shl \$40,$padbit
- mov 64($ctx),%r8 # peek on power of the key
-
-.Lblocks_vpmadd52_4x:
- vpbroadcastq $padbit,$PAD
-
- vmovdqa64 .Lx_mask44(%rip),$mask44
- mov \$5,%eax
- vmovdqa64 .Lx_mask42(%rip),$mask42
- kmovw %eax,%k1 # used in 2x path
-
- test %r8,%r8 # is power value impossible?
- js .Linit_vpmadd52 # if it is, then init R[4]
-
- vmovq 0($ctx),%x#$H0 # load current hash value
- vmovq 8($ctx),%x#$H1
- vmovq 16($ctx),%x#$H2
-
- test \$3,$len # is length 4*n+2?
- jnz .Lblocks_vpmadd52_2x_do
-
-.Lblocks_vpmadd52_4x_do:
- vpbroadcastq 64($ctx),$R0 # load 4th power of the key
- vpbroadcastq 96($ctx),$R1
- vpbroadcastq 128($ctx),$R2
- vpbroadcastq 160($ctx),$S1
-
-.Lblocks_vpmadd52_4x_key_loaded:
- vpsllq \$2,$R2,$S2 # S2 = R2*5*4
- vpaddq $R2,$S2,$S2
- vpsllq \$2,$S2,$S2
-
- test \$7,$len # is len 8*n?
- jz .Lblocks_vpmadd52_8x
-
- vmovdqu64 16*0($inp),$T2 # load data
- vmovdqu64 16*2($inp),$T3
- lea 16*4($inp),$inp
-
- vpunpcklqdq $T3,$T2,$T1 # transpose data
- vpunpckhqdq $T3,$T2,$T3
-
- # at this point 64-bit lanes are ordered as 3-1-2-0
-
- vpsrlq \$24,$T3,$T2 # splat the data
- vporq $PAD,$T2,$T2
- vpaddq $T2,$H2,$H2 # accumulate input
- vpandq $mask44,$T1,$T0
- vpsrlq \$44,$T1,$T1
- vpsllq \$20,$T3,$T3
- vporq $T3,$T1,$T1
- vpandq $mask44,$T1,$T1
-
- sub \$4,$len
- jz .Ltail_vpmadd52_4x
- jmp .Loop_vpmadd52_4x
- ud2
-
-.align 32
-.Linit_vpmadd52:
- vmovq 24($ctx),%x#$S1 # load key
- vmovq 56($ctx),%x#$H2
- vmovq 32($ctx),%x#$S2
- vmovq 40($ctx),%x#$R0
- vmovq 48($ctx),%x#$R1
-
- vmovdqa $R0,$H0
- vmovdqa $R1,$H1
- vmovdqa $H2,$R2
-
- mov \$2,%eax
-
-.Lmul_init_vpmadd52:
- vpxorq $D0lo,$D0lo,$D0lo
- vpmadd52luq $H2,$S1,$D0lo
- vpxorq $D0hi,$D0hi,$D0hi
- vpmadd52huq $H2,$S1,$D0hi
- vpxorq $D1lo,$D1lo,$D1lo
- vpmadd52luq $H2,$S2,$D1lo
- vpxorq $D1hi,$D1hi,$D1hi
- vpmadd52huq $H2,$S2,$D1hi
- vpxorq $D2lo,$D2lo,$D2lo
- vpmadd52luq $H2,$R0,$D2lo
- vpxorq $D2hi,$D2hi,$D2hi
- vpmadd52huq $H2,$R0,$D2hi
-
- vpmadd52luq $H0,$R0,$D0lo
- vpmadd52huq $H0,$R0,$D0hi
- vpmadd52luq $H0,$R1,$D1lo
- vpmadd52huq $H0,$R1,$D1hi
- vpmadd52luq $H0,$R2,$D2lo
- vpmadd52huq $H0,$R2,$D2hi
-
- vpmadd52luq $H1,$S2,$D0lo
- vpmadd52huq $H1,$S2,$D0hi
- vpmadd52luq $H1,$R0,$D1lo
- vpmadd52huq $H1,$R0,$D1hi
- vpmadd52luq $H1,$R1,$D2lo
- vpmadd52huq $H1,$R1,$D2hi
-
- ################################################################
- # partial reduction
- vpsrlq \$44,$D0lo,$tmp
- vpsllq \$8,$D0hi,$D0hi
- vpandq $mask44,$D0lo,$H0
- vpaddq $tmp,$D0hi,$D0hi
-
- vpaddq $D0hi,$D1lo,$D1lo
-
- vpsrlq \$44,$D1lo,$tmp
- vpsllq \$8,$D1hi,$D1hi
- vpandq $mask44,$D1lo,$H1
- vpaddq $tmp,$D1hi,$D1hi
-
- vpaddq $D1hi,$D2lo,$D2lo
-
- vpsrlq \$42,$D2lo,$tmp
- vpsllq \$10,$D2hi,$D2hi
- vpandq $mask42,$D2lo,$H2
- vpaddq $tmp,$D2hi,$D2hi
-
- vpaddq $D2hi,$H0,$H0
- vpsllq \$2,$D2hi,$D2hi
-
- vpaddq $D2hi,$H0,$H0
-
- vpsrlq \$44,$H0,$tmp # additional step
- vpandq $mask44,$H0,$H0
-
- vpaddq $tmp,$H1,$H1
-
- dec %eax
- jz .Ldone_init_vpmadd52
-
- vpunpcklqdq $R1,$H1,$R1 # 1,2
- vpbroadcastq %x#$H1,%x#$H1 # 2,2
- vpunpcklqdq $R2,$H2,$R2
- vpbroadcastq %x#$H2,%x#$H2
- vpunpcklqdq $R0,$H0,$R0
- vpbroadcastq %x#$H0,%x#$H0
-
- vpsllq \$2,$R1,$S1 # S1 = R1*5*4
- vpsllq \$2,$R2,$S2 # S2 = R2*5*4
- vpaddq $R1,$S1,$S1
- vpaddq $R2,$S2,$S2
- vpsllq \$2,$S1,$S1
- vpsllq \$2,$S2,$S2
-
- jmp .Lmul_init_vpmadd52
- ud2
-
-.align 32
-.Ldone_init_vpmadd52:
- vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4
- vinserti128 \$1,%x#$R2,$H2,$R2
- vinserti128 \$1,%x#$R0,$H0,$R0
-
- vpermq \$0b11011000,$R1,$R1 # 1,3,2,4
- vpermq \$0b11011000,$R2,$R2
- vpermq \$0b11011000,$R0,$R0
-
- vpsllq \$2,$R1,$S1 # S1 = R1*5*4
- vpaddq $R1,$S1,$S1
- vpsllq \$2,$S1,$S1
-
- vmovq 0($ctx),%x#$H0 # load current hash value
- vmovq 8($ctx),%x#$H1
- vmovq 16($ctx),%x#$H2
-
- test \$3,$len # is length 4*n+2?
- jnz .Ldone_init_vpmadd52_2x
-
- vmovdqu64 $R0,64($ctx) # save key powers
- vpbroadcastq %x#$R0,$R0 # broadcast 4th power
- vmovdqu64 $R1,96($ctx)
- vpbroadcastq %x#$R1,$R1
- vmovdqu64 $R2,128($ctx)
- vpbroadcastq %x#$R2,$R2
- vmovdqu64 $S1,160($ctx)
- vpbroadcastq %x#$S1,$S1
-
- jmp .Lblocks_vpmadd52_4x_key_loaded
- ud2
-
-.align 32
-.Ldone_init_vpmadd52_2x:
- vmovdqu64 $R0,64($ctx) # save key powers
- vpsrldq \$8,$R0,$R0 # 0-1-0-2
- vmovdqu64 $R1,96($ctx)
- vpsrldq \$8,$R1,$R1
- vmovdqu64 $R2,128($ctx)
- vpsrldq \$8,$R2,$R2
- vmovdqu64 $S1,160($ctx)
- vpsrldq \$8,$S1,$S1
- jmp .Lblocks_vpmadd52_2x_key_loaded
- ud2
-
-.align 32
-.Lblocks_vpmadd52_2x_do:
- vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
- vmovdqu64 160+8($ctx),${S1}{%k1}{z}
- vmovdqu64 64+8($ctx),${R0}{%k1}{z}
- vmovdqu64 96+8($ctx),${R1}{%k1}{z}
-
-.Lblocks_vpmadd52_2x_key_loaded:
- vmovdqu64 16*0($inp),$T2 # load data
- vpxorq $T3,$T3,$T3
- lea 16*2($inp),$inp
-
- vpunpcklqdq $T3,$T2,$T1 # transpose data
- vpunpckhqdq $T3,$T2,$T3
-
- # at this point 64-bit lanes are ordered as x-1-x-0
-
- vpsrlq \$24,$T3,$T2 # splat the data
- vporq $PAD,$T2,$T2
- vpaddq $T2,$H2,$H2 # accumulate input
- vpandq $mask44,$T1,$T0
- vpsrlq \$44,$T1,$T1
- vpsllq \$20,$T3,$T3
- vporq $T3,$T1,$T1
- vpandq $mask44,$T1,$T1
-
- jmp .Ltail_vpmadd52_2x
- ud2
-
-.align 32
-.Loop_vpmadd52_4x:
- #vpaddq $T2,$H2,$H2 # accumulate input
- vpaddq $T0,$H0,$H0
- vpaddq $T1,$H1,$H1
-
- vpxorq $D0lo,$D0lo,$D0lo
- vpmadd52luq $H2,$S1,$D0lo
- vpxorq $D0hi,$D0hi,$D0hi
- vpmadd52huq $H2,$S1,$D0hi
- vpxorq $D1lo,$D1lo,$D1lo
- vpmadd52luq $H2,$S2,$D1lo
- vpxorq $D1hi,$D1hi,$D1hi
- vpmadd52huq $H2,$S2,$D1hi
- vpxorq $D2lo,$D2lo,$D2lo
- vpmadd52luq $H2,$R0,$D2lo
- vpxorq $D2hi,$D2hi,$D2hi
- vpmadd52huq $H2,$R0,$D2hi
-
- vmovdqu64 16*0($inp),$T2 # load data
- vmovdqu64 16*2($inp),$T3
- lea 16*4($inp),$inp
- vpmadd52luq $H0,$R0,$D0lo
- vpmadd52huq $H0,$R0,$D0hi
- vpmadd52luq $H0,$R1,$D1lo
- vpmadd52huq $H0,$R1,$D1hi
- vpmadd52luq $H0,$R2,$D2lo
- vpmadd52huq $H0,$R2,$D2hi
-
- vpunpcklqdq $T3,$T2,$T1 # transpose data
- vpunpckhqdq $T3,$T2,$T3
- vpmadd52luq $H1,$S2,$D0lo
- vpmadd52huq $H1,$S2,$D0hi
- vpmadd52luq $H1,$R0,$D1lo
- vpmadd52huq $H1,$R0,$D1hi
- vpmadd52luq $H1,$R1,$D2lo
- vpmadd52huq $H1,$R1,$D2hi
-
- ################################################################
- # partial reduction (interleaved with data splat)
- vpsrlq \$44,$D0lo,$tmp
- vpsllq \$8,$D0hi,$D0hi
- vpandq $mask44,$D0lo,$H0
- vpaddq $tmp,$D0hi,$D0hi
-
- vpsrlq \$24,$T3,$T2
- vporq $PAD,$T2,$T2
- vpaddq $D0hi,$D1lo,$D1lo
-
- vpsrlq \$44,$D1lo,$tmp
- vpsllq \$8,$D1hi,$D1hi
- vpandq $mask44,$D1lo,$H1
- vpaddq $tmp,$D1hi,$D1hi
-
- vpandq $mask44,$T1,$T0
- vpsrlq \$44,$T1,$T1
- vpsllq \$20,$T3,$T3
- vpaddq $D1hi,$D2lo,$D2lo
-
- vpsrlq \$42,$D2lo,$tmp
- vpsllq \$10,$D2hi,$D2hi
- vpandq $mask42,$D2lo,$H2
- vpaddq $tmp,$D2hi,$D2hi
-
- vpaddq $T2,$H2,$H2 # accumulate input
- vpaddq $D2hi,$H0,$H0
- vpsllq \$2,$D2hi,$D2hi
-
- vpaddq $D2hi,$H0,$H0
- vporq $T3,$T1,$T1
- vpandq $mask44,$T1,$T1
-
- vpsrlq \$44,$H0,$tmp # additional step
- vpandq $mask44,$H0,$H0
-
- vpaddq $tmp,$H1,$H1
-
- sub \$4,$len # len-=64
- jnz .Loop_vpmadd52_4x
-
-.Ltail_vpmadd52_4x:
- vmovdqu64 128($ctx),$R2 # load all key powers
- vmovdqu64 160($ctx),$S1
- vmovdqu64 64($ctx),$R0
- vmovdqu64 96($ctx),$R1
-
-.Ltail_vpmadd52_2x:
- vpsllq \$2,$R2,$S2 # S2 = R2*5*4
- vpaddq $R2,$S2,$S2
- vpsllq \$2,$S2,$S2
-
- #vpaddq $T2,$H2,$H2 # accumulate input
- vpaddq $T0,$H0,$H0
- vpaddq $T1,$H1,$H1
-
- vpxorq $D0lo,$D0lo,$D0lo
- vpmadd52luq $H2,$S1,$D0lo
- vpxorq $D0hi,$D0hi,$D0hi
- vpmadd52huq $H2,$S1,$D0hi
- vpxorq $D1lo,$D1lo,$D1lo
- vpmadd52luq $H2,$S2,$D1lo
- vpxorq $D1hi,$D1hi,$D1hi
- vpmadd52huq $H2,$S2,$D1hi
- vpxorq $D2lo,$D2lo,$D2lo
- vpmadd52luq $H2,$R0,$D2lo
- vpxorq $D2hi,$D2hi,$D2hi
- vpmadd52huq $H2,$R0,$D2hi
-
- vpmadd52luq $H0,$R0,$D0lo
- vpmadd52huq $H0,$R0,$D0hi
- vpmadd52luq $H0,$R1,$D1lo
- vpmadd52huq $H0,$R1,$D1hi
- vpmadd52luq $H0,$R2,$D2lo
- vpmadd52huq $H0,$R2,$D2hi
-
- vpmadd52luq $H1,$S2,$D0lo
- vpmadd52huq $H1,$S2,$D0hi
- vpmadd52luq $H1,$R0,$D1lo
- vpmadd52huq $H1,$R0,$D1hi
- vpmadd52luq $H1,$R1,$D2lo
- vpmadd52huq $H1,$R1,$D2hi
-
- ################################################################
- # horizontal addition
-
- mov \$1,%eax
- kmovw %eax,%k1
- vpsrldq \$8,$D0lo,$T0
- vpsrldq \$8,$D0hi,$H0
- vpsrldq \$8,$D1lo,$T1
- vpsrldq \$8,$D1hi,$H1
- vpaddq $T0,$D0lo,$D0lo
- vpaddq $H0,$D0hi,$D0hi
- vpsrldq \$8,$D2lo,$T2
- vpsrldq \$8,$D2hi,$H2
- vpaddq $T1,$D1lo,$D1lo
- vpaddq $H1,$D1hi,$D1hi
- vpermq \$0x2,$D0lo,$T0
- vpermq \$0x2,$D0hi,$H0
- vpaddq $T2,$D2lo,$D2lo
- vpaddq $H2,$D2hi,$D2hi
-
- vpermq \$0x2,$D1lo,$T1
- vpermq \$0x2,$D1hi,$H1
- vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
- vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
- vpermq \$0x2,$D2lo,$T2
- vpermq \$0x2,$D2hi,$H2
- vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
- vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
- vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
- vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
-
- ################################################################
- # partial reduction
- vpsrlq \$44,$D0lo,$tmp
- vpsllq \$8,$D0hi,$D0hi
- vpandq $mask44,$D0lo,$H0
- vpaddq $tmp,$D0hi,$D0hi
-
- vpaddq $D0hi,$D1lo,$D1lo
-
- vpsrlq \$44,$D1lo,$tmp
- vpsllq \$8,$D1hi,$D1hi
- vpandq $mask44,$D1lo,$H1
- vpaddq $tmp,$D1hi,$D1hi
-
- vpaddq $D1hi,$D2lo,$D2lo
-
- vpsrlq \$42,$D2lo,$tmp
- vpsllq \$10,$D2hi,$D2hi
- vpandq $mask42,$D2lo,$H2
- vpaddq $tmp,$D2hi,$D2hi
-
- vpaddq $D2hi,$H0,$H0
- vpsllq \$2,$D2hi,$D2hi
-
- vpaddq $D2hi,$H0,$H0
-
- vpsrlq \$44,$H0,$tmp # additional step
- vpandq $mask44,$H0,$H0
-
- vpaddq $tmp,$H1,$H1
- # at this point $len is
- # either 4*n+2 or 0...
- sub \$2,$len # len-=32
- ja .Lblocks_vpmadd52_4x_do
-
- vmovq %x#$H0,0($ctx)
- vmovq %x#$H1,8($ctx)
- vmovq %x#$H2,16($ctx)
- vzeroall
-
-.Lno_data_vpmadd52_4x:
- RET
-.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
-___
-}
-{
-########################################################################
-# As implied by its name 8x subroutine processes 8 blocks in parallel...
-# This is intermediate version, as it's used only in cases when input
-# length is either 8*n, 8*n+1 or 8*n+2...
-
-my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
-my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
-my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
-my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));
-
-$code.=<<___;
-.type poly1305_blocks_vpmadd52_8x,\@function,4
-.align 32
-poly1305_blocks_vpmadd52_8x:
- shr \$4,$len
- jz .Lno_data_vpmadd52_8x # too short
-
- shl \$40,$padbit
- mov 64($ctx),%r8 # peek on power of the key
-
- vmovdqa64 .Lx_mask44(%rip),$mask44
- vmovdqa64 .Lx_mask42(%rip),$mask42
-
- test %r8,%r8 # is power value impossible?
- js .Linit_vpmadd52 # if it is, then init R[4]
-
- vmovq 0($ctx),%x#$H0 # load current hash value
- vmovq 8($ctx),%x#$H1
- vmovq 16($ctx),%x#$H2
-
-.Lblocks_vpmadd52_8x:
- ################################################################
- # fist we calculate more key powers
-
- vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers
- vmovdqu64 160($ctx),$S1
- vmovdqu64 64($ctx),$R0
- vmovdqu64 96($ctx),$R1
-
- vpsllq \$2,$R2,$S2 # S2 = R2*5*4
- vpaddq $R2,$S2,$S2
- vpsllq \$2,$S2,$S2
-
- vpbroadcastq %x#$R2,$RR2 # broadcast 4th power
- vpbroadcastq %x#$R0,$RR0
- vpbroadcastq %x#$R1,$RR1
-
- vpxorq $D0lo,$D0lo,$D0lo
- vpmadd52luq $RR2,$S1,$D0lo
- vpxorq $D0hi,$D0hi,$D0hi
- vpmadd52huq $RR2,$S1,$D0hi
- vpxorq $D1lo,$D1lo,$D1lo
- vpmadd52luq $RR2,$S2,$D1lo
- vpxorq $D1hi,$D1hi,$D1hi
- vpmadd52huq $RR2,$S2,$D1hi
- vpxorq $D2lo,$D2lo,$D2lo
- vpmadd52luq $RR2,$R0,$D2lo
- vpxorq $D2hi,$D2hi,$D2hi
- vpmadd52huq $RR2,$R0,$D2hi
-
- vpmadd52luq $RR0,$R0,$D0lo
- vpmadd52huq $RR0,$R0,$D0hi
- vpmadd52luq $RR0,$R1,$D1lo
- vpmadd52huq $RR0,$R1,$D1hi
- vpmadd52luq $RR0,$R2,$D2lo
- vpmadd52huq $RR0,$R2,$D2hi
-
- vpmadd52luq $RR1,$S2,$D0lo
- vpmadd52huq $RR1,$S2,$D0hi
- vpmadd52luq $RR1,$R0,$D1lo
- vpmadd52huq $RR1,$R0,$D1hi
- vpmadd52luq $RR1,$R1,$D2lo
- vpmadd52huq $RR1,$R1,$D2hi
-
- ################################################################
- # partial reduction
- vpsrlq \$44,$D0lo,$tmp
- vpsllq \$8,$D0hi,$D0hi
- vpandq $mask44,$D0lo,$RR0
- vpaddq $tmp,$D0hi,$D0hi
-
- vpaddq $D0hi,$D1lo,$D1lo
-
- vpsrlq \$44,$D1lo,$tmp
- vpsllq \$8,$D1hi,$D1hi
- vpandq $mask44,$D1lo,$RR1
- vpaddq $tmp,$D1hi,$D1hi
-
- vpaddq $D1hi,$D2lo,$D2lo
-
- vpsrlq \$42,$D2lo,$tmp
- vpsllq \$10,$D2hi,$D2hi
- vpandq $mask42,$D2lo,$RR2
- vpaddq $tmp,$D2hi,$D2hi
-
- vpaddq $D2hi,$RR0,$RR0
- vpsllq \$2,$D2hi,$D2hi
-
- vpaddq $D2hi,$RR0,$RR0
-
- vpsrlq \$44,$RR0,$tmp # additional step
- vpandq $mask44,$RR0,$RR0
-
- vpaddq $tmp,$RR1,$RR1
-
- ################################################################
- # At this point Rx holds 1324 powers, RRx - 5768, and the goal
- # is 15263748, which reflects how data is loaded...
-
- vpunpcklqdq $R2,$RR2,$T2 # 3748
- vpunpckhqdq $R2,$RR2,$R2 # 1526
- vpunpcklqdq $R0,$RR0,$T0
- vpunpckhqdq $R0,$RR0,$R0
- vpunpcklqdq $R1,$RR1,$T1
- vpunpckhqdq $R1,$RR1,$R1
-___
-######## switch to %zmm
-map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
-map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
-map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
-map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);
-
-$code.=<<___;
- vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748
- vshufi64x2 \$0x44,$R0,$T0,$RR0
- vshufi64x2 \$0x44,$R1,$T1,$RR1
-
- vmovdqu64 16*0($inp),$T2 # load data
- vmovdqu64 16*4($inp),$T3
- lea 16*8($inp),$inp
-
- vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4
- vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4
- vpaddq $RR2,$SS2,$SS2
- vpaddq $RR1,$SS1,$SS1
- vpsllq \$2,$SS2,$SS2
- vpsllq \$2,$SS1,$SS1
-
- vpbroadcastq $padbit,$PAD
- vpbroadcastq %x#$mask44,$mask44
- vpbroadcastq %x#$mask42,$mask42
-
- vpbroadcastq %x#$SS1,$S1 # broadcast 8th power
- vpbroadcastq %x#$SS2,$S2
- vpbroadcastq %x#$RR0,$R0
- vpbroadcastq %x#$RR1,$R1
- vpbroadcastq %x#$RR2,$R2
-
- vpunpcklqdq $T3,$T2,$T1 # transpose data
- vpunpckhqdq $T3,$T2,$T3
-
- # at this point 64-bit lanes are ordered as 73625140
-
- vpsrlq \$24,$T3,$T2 # splat the data
- vporq $PAD,$T2,$T2
- vpaddq $T2,$H2,$H2 # accumulate input
- vpandq $mask44,$T1,$T0
- vpsrlq \$44,$T1,$T1
- vpsllq \$20,$T3,$T3
- vporq $T3,$T1,$T1
- vpandq $mask44,$T1,$T1
-
- sub \$8,$len
- jz .Ltail_vpmadd52_8x
- jmp .Loop_vpmadd52_8x
-
-.align 32
-.Loop_vpmadd52_8x:
- #vpaddq $T2,$H2,$H2 # accumulate input
- vpaddq $T0,$H0,$H0
- vpaddq $T1,$H1,$H1
-
- vpxorq $D0lo,$D0lo,$D0lo
- vpmadd52luq $H2,$S1,$D0lo
- vpxorq $D0hi,$D0hi,$D0hi
- vpmadd52huq $H2,$S1,$D0hi
- vpxorq $D1lo,$D1lo,$D1lo
- vpmadd52luq $H2,$S2,$D1lo
- vpxorq $D1hi,$D1hi,$D1hi
- vpmadd52huq $H2,$S2,$D1hi
- vpxorq $D2lo,$D2lo,$D2lo
- vpmadd52luq $H2,$R0,$D2lo
- vpxorq $D2hi,$D2hi,$D2hi
- vpmadd52huq $H2,$R0,$D2hi
-
- vmovdqu64 16*0($inp),$T2 # load data
- vmovdqu64 16*4($inp),$T3
- lea 16*8($inp),$inp
- vpmadd52luq $H0,$R0,$D0lo
- vpmadd52huq $H0,$R0,$D0hi
- vpmadd52luq $H0,$R1,$D1lo
- vpmadd52huq $H0,$R1,$D1hi
- vpmadd52luq $H0,$R2,$D2lo
- vpmadd52huq $H0,$R2,$D2hi
-
- vpunpcklqdq $T3,$T2,$T1 # transpose data
- vpunpckhqdq $T3,$T2,$T3
- vpmadd52luq $H1,$S2,$D0lo
- vpmadd52huq $H1,$S2,$D0hi
- vpmadd52luq $H1,$R0,$D1lo
- vpmadd52huq $H1,$R0,$D1hi
- vpmadd52luq $H1,$R1,$D2lo
- vpmadd52huq $H1,$R1,$D2hi
-
- ################################################################
- # partial reduction (interleaved with data splat)
- vpsrlq \$44,$D0lo,$tmp
- vpsllq \$8,$D0hi,$D0hi
- vpandq $mask44,$D0lo,$H0
- vpaddq $tmp,$D0hi,$D0hi
-
- vpsrlq \$24,$T3,$T2
- vporq $PAD,$T2,$T2
- vpaddq $D0hi,$D1lo,$D1lo
-
- vpsrlq \$44,$D1lo,$tmp
- vpsllq \$8,$D1hi,$D1hi
- vpandq $mask44,$D1lo,$H1
- vpaddq $tmp,$D1hi,$D1hi
-
- vpandq $mask44,$T1,$T0
- vpsrlq \$44,$T1,$T1
- vpsllq \$20,$T3,$T3
- vpaddq $D1hi,$D2lo,$D2lo
-
- vpsrlq \$42,$D2lo,$tmp
- vpsllq \$10,$D2hi,$D2hi
- vpandq $mask42,$D2lo,$H2
- vpaddq $tmp,$D2hi,$D2hi
-
- vpaddq $T2,$H2,$H2 # accumulate input
- vpaddq $D2hi,$H0,$H0
- vpsllq \$2,$D2hi,$D2hi
-
- vpaddq $D2hi,$H0,$H0
- vporq $T3,$T1,$T1
- vpandq $mask44,$T1,$T1
-
- vpsrlq \$44,$H0,$tmp # additional step
- vpandq $mask44,$H0,$H0
-
- vpaddq $tmp,$H1,$H1
-
- sub \$8,$len # len-=128
- jnz .Loop_vpmadd52_8x
-
-.Ltail_vpmadd52_8x:
- #vpaddq $T2,$H2,$H2 # accumulate input
- vpaddq $T0,$H0,$H0
- vpaddq $T1,$H1,$H1
-
- vpxorq $D0lo,$D0lo,$D0lo
- vpmadd52luq $H2,$SS1,$D0lo
- vpxorq $D0hi,$D0hi,$D0hi
- vpmadd52huq $H2,$SS1,$D0hi
- vpxorq $D1lo,$D1lo,$D1lo
- vpmadd52luq $H2,$SS2,$D1lo
- vpxorq $D1hi,$D1hi,$D1hi
- vpmadd52huq $H2,$SS2,$D1hi
- vpxorq $D2lo,$D2lo,$D2lo
- vpmadd52luq $H2,$RR0,$D2lo
- vpxorq $D2hi,$D2hi,$D2hi
- vpmadd52huq $H2,$RR0,$D2hi
-
- vpmadd52luq $H0,$RR0,$D0lo
- vpmadd52huq $H0,$RR0,$D0hi
- vpmadd52luq $H0,$RR1,$D1lo
- vpmadd52huq $H0,$RR1,$D1hi
- vpmadd52luq $H0,$RR2,$D2lo
- vpmadd52huq $H0,$RR2,$D2hi
-
- vpmadd52luq $H1,$SS2,$D0lo
- vpmadd52huq $H1,$SS2,$D0hi
- vpmadd52luq $H1,$RR0,$D1lo
- vpmadd52huq $H1,$RR0,$D1hi
- vpmadd52luq $H1,$RR1,$D2lo
- vpmadd52huq $H1,$RR1,$D2hi
-
- ################################################################
- # horizontal addition
-
- mov \$1,%eax
- kmovw %eax,%k1
- vpsrldq \$8,$D0lo,$T0
- vpsrldq \$8,$D0hi,$H0
- vpsrldq \$8,$D1lo,$T1
- vpsrldq \$8,$D1hi,$H1
- vpaddq $T0,$D0lo,$D0lo
- vpaddq $H0,$D0hi,$D0hi
- vpsrldq \$8,$D2lo,$T2
- vpsrldq \$8,$D2hi,$H2
- vpaddq $T1,$D1lo,$D1lo
- vpaddq $H1,$D1hi,$D1hi
- vpermq \$0x2,$D0lo,$T0
- vpermq \$0x2,$D0hi,$H0
- vpaddq $T2,$D2lo,$D2lo
- vpaddq $H2,$D2hi,$D2hi
-
- vpermq \$0x2,$D1lo,$T1
- vpermq \$0x2,$D1hi,$H1
- vpaddq $T0,$D0lo,$D0lo
- vpaddq $H0,$D0hi,$D0hi
- vpermq \$0x2,$D2lo,$T2
- vpermq \$0x2,$D2hi,$H2
- vpaddq $T1,$D1lo,$D1lo
- vpaddq $H1,$D1hi,$D1hi
- vextracti64x4 \$1,$D0lo,%y#$T0
- vextracti64x4 \$1,$D0hi,%y#$H0
- vpaddq $T2,$D2lo,$D2lo
- vpaddq $H2,$D2hi,$D2hi
-
- vextracti64x4 \$1,$D1lo,%y#$T1
- vextracti64x4 \$1,$D1hi,%y#$H1
- vextracti64x4 \$1,$D2lo,%y#$T2
- vextracti64x4 \$1,$D2hi,%y#$H2
-___
-######## switch back to %ymm
-map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
-map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
-map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
-
-$code.=<<___;
- vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
- vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
- vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
- vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
- vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
- vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
-
- ################################################################
- # partial reduction
- vpsrlq \$44,$D0lo,$tmp
- vpsllq \$8,$D0hi,$D0hi
- vpandq $mask44,$D0lo,$H0
- vpaddq $tmp,$D0hi,$D0hi
-
- vpaddq $D0hi,$D1lo,$D1lo
-
- vpsrlq \$44,$D1lo,$tmp
- vpsllq \$8,$D1hi,$D1hi
- vpandq $mask44,$D1lo,$H1
- vpaddq $tmp,$D1hi,$D1hi
-
- vpaddq $D1hi,$D2lo,$D2lo
-
- vpsrlq \$42,$D2lo,$tmp
- vpsllq \$10,$D2hi,$D2hi
- vpandq $mask42,$D2lo,$H2
- vpaddq $tmp,$D2hi,$D2hi
-
- vpaddq $D2hi,$H0,$H0
- vpsllq \$2,$D2hi,$D2hi
-
- vpaddq $D2hi,$H0,$H0
-
- vpsrlq \$44,$H0,$tmp # additional step
- vpandq $mask44,$H0,$H0
-
- vpaddq $tmp,$H1,$H1
-
- ################################################################
-
- vmovq %x#$H0,0($ctx)
- vmovq %x#$H1,8($ctx)
- vmovq %x#$H2,16($ctx)
- vzeroall
-
-.Lno_data_vpmadd52_8x:
- RET
-.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
-___
-}
-$code.=<<___;
-.type poly1305_emit_base2_44,\@function,3
-.align 32
-poly1305_emit_base2_44:
- mov 0($ctx),%r8 # load hash value
- mov 8($ctx),%r9
- mov 16($ctx),%r10
-
- mov %r9,%rax
- shr \$20,%r9
- shl \$44,%rax
- mov %r10,%rcx
- shr \$40,%r10
- shl \$24,%rcx
-
- add %rax,%r8
- adc %rcx,%r9
- adc \$0,%r10
-
- mov %r8,%rax
- add \$5,%r8 # compare to modulus
- mov %r9,%rcx
- adc \$0,%r9
- adc \$0,%r10
- shr \$2,%r10 # did 130-bit value overflow?
- cmovnz %r8,%rax
- cmovnz %r9,%rcx
-
- add 0($nonce),%rax # accumulate nonce
- adc 8($nonce),%rcx
- mov %rax,0($mac) # write result
- mov %rcx,8($mac)
-
- RET
-.size poly1305_emit_base2_44,.-poly1305_emit_base2_44
-___
-} } }
-}
-
-if (!$kernel)
-{ # chacha20-poly1305 helpers
-my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
- ("%rdi","%rsi","%rdx","%rcx"); # Unix order
-$code.=<<___;
-.globl xor128_encrypt_n_pad
-.type xor128_encrypt_n_pad,\@abi-omnipotent
-.align 16
-xor128_encrypt_n_pad:
- sub $otp,$inp
- sub $otp,$out
- mov $len,%r10 # put len aside
- shr \$4,$len # len / 16
- jz .Ltail_enc
- nop
-.Loop_enc_xmm:
- movdqu ($inp,$otp),%xmm0
- pxor ($otp),%xmm0
- movdqu %xmm0,($out,$otp)
- movdqa %xmm0,($otp)
- lea 16($otp),$otp
- dec $len
- jnz .Loop_enc_xmm
-
- and \$15,%r10 # len % 16
- jz .Ldone_enc
-
-.Ltail_enc:
- mov \$16,$len
- sub %r10,$len
- xor %eax,%eax
-.Loop_enc_byte:
- mov ($inp,$otp),%al
- xor ($otp),%al
- mov %al,($out,$otp)
- mov %al,($otp)
- lea 1($otp),$otp
- dec %r10
- jnz .Loop_enc_byte
-
- xor %eax,%eax
-.Loop_enc_pad:
- mov %al,($otp)
- lea 1($otp),$otp
- dec $len
- jnz .Loop_enc_pad
-
-.Ldone_enc:
- mov $otp,%rax
- RET
-.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
-
-.globl xor128_decrypt_n_pad
-.type xor128_decrypt_n_pad,\@abi-omnipotent
-.align 16
-xor128_decrypt_n_pad:
- sub $otp,$inp
- sub $otp,$out
- mov $len,%r10 # put len aside
- shr \$4,$len # len / 16
- jz .Ltail_dec
- nop
-.Loop_dec_xmm:
- movdqu ($inp,$otp),%xmm0
- movdqa ($otp),%xmm1
- pxor %xmm0,%xmm1
- movdqu %xmm1,($out,$otp)
- movdqa %xmm0,($otp)
- lea 16($otp),$otp
- dec $len
- jnz .Loop_dec_xmm
-
- pxor %xmm1,%xmm1
- and \$15,%r10 # len % 16
- jz .Ldone_dec
-
-.Ltail_dec:
- mov \$16,$len
- sub %r10,$len
- xor %eax,%eax
- xor %r11d,%r11d
-.Loop_dec_byte:
- mov ($inp,$otp),%r11b
- mov ($otp),%al
- xor %r11b,%al
- mov %al,($out,$otp)
- mov %r11b,($otp)
- lea 1($otp),$otp
- dec %r10
- jnz .Loop_dec_byte
-
- xor %eax,%eax
-.Loop_dec_pad:
- mov %al,($otp)
- lea 1($otp),$otp
- dec $len
- jnz .Loop_dec_pad
-
-.Ldone_dec:
- mov $otp,%rax
- RET
-.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
-___
-}
-
-# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
-# CONTEXT *context,DISPATCHER_CONTEXT *disp)
-if ($win64) {
-$rec="%rcx";
-$frame="%rdx";
-$context="%r8";
-$disp="%r9";
-
-$code.=<<___;
-.extern __imp_RtlVirtualUnwind
-.type se_handler,\@abi-omnipotent
-.align 16
-se_handler:
- push %rsi
- push %rdi
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- pushfq
- sub \$64,%rsp
-
- mov 120($context),%rax # pull context->Rax
- mov 248($context),%rbx # pull context->Rip
-
- mov 8($disp),%rsi # disp->ImageBase
- mov 56($disp),%r11 # disp->HandlerData
-
- mov 0(%r11),%r10d # HandlerData[0]
- lea (%rsi,%r10),%r10 # prologue label
- cmp %r10,%rbx # context->Rip<.Lprologue
- jb .Lcommon_seh_tail
-
- mov 152($context),%rax # pull context->Rsp
-
- mov 4(%r11),%r10d # HandlerData[1]
- lea (%rsi,%r10),%r10 # epilogue label
- cmp %r10,%rbx # context->Rip>=.Lepilogue
- jae .Lcommon_seh_tail
-
- lea 48(%rax),%rax
-
- mov -8(%rax),%rbx
- mov -16(%rax),%rbp
- mov -24(%rax),%r12
- mov -32(%rax),%r13
- mov -40(%rax),%r14
- mov -48(%rax),%r15
- mov %rbx,144($context) # restore context->Rbx
- mov %rbp,160($context) # restore context->Rbp
- mov %r12,216($context) # restore context->R12
- mov %r13,224($context) # restore context->R13
- mov %r14,232($context) # restore context->R14
- mov %r15,240($context) # restore context->R14
-
- jmp .Lcommon_seh_tail
-.size se_handler,.-se_handler
-
-.type avx_handler,\@abi-omnipotent
-.align 16
-avx_handler:
- push %rsi
- push %rdi
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- pushfq
- sub \$64,%rsp
-
- mov 120($context),%rax # pull context->Rax
- mov 248($context),%rbx # pull context->Rip
-
- mov 8($disp),%rsi # disp->ImageBase
- mov 56($disp),%r11 # disp->HandlerData
-
- mov 0(%r11),%r10d # HandlerData[0]
- lea (%rsi,%r10),%r10 # prologue label
- cmp %r10,%rbx # context->Rip<prologue label
- jb .Lcommon_seh_tail
-
- mov 152($context),%rax # pull context->Rsp
-
- mov 4(%r11),%r10d # HandlerData[1]
- lea (%rsi,%r10),%r10 # epilogue label
- cmp %r10,%rbx # context->Rip>=epilogue label
- jae .Lcommon_seh_tail
-
- mov 208($context),%rax # pull context->R11
-
- lea 0x50(%rax),%rsi
- lea 0xf8(%rax),%rax
- lea 512($context),%rdi # &context.Xmm6
- mov \$20,%ecx
- .long 0xa548f3fc # cld; rep movsq
-
-.Lcommon_seh_tail:
- mov 8(%rax),%rdi
- mov 16(%rax),%rsi
- mov %rax,152($context) # restore context->Rsp
- mov %rsi,168($context) # restore context->Rsi
- mov %rdi,176($context) # restore context->Rdi
-
- mov 40($disp),%rdi # disp->ContextRecord
- mov $context,%rsi # context
- mov \$154,%ecx # sizeof(CONTEXT)
- .long 0xa548f3fc # cld; rep movsq
-
- mov $disp,%rsi
- xor %ecx,%ecx # arg1, UNW_FLAG_NHANDLER
- mov 8(%rsi),%rdx # arg2, disp->ImageBase
- mov 0(%rsi),%r8 # arg3, disp->ControlPc
- mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
- mov 40(%rsi),%r10 # disp->ContextRecord
- lea 56(%rsi),%r11 # &disp->HandlerData
- lea 24(%rsi),%r12 # &disp->EstablisherFrame
- mov %r10,32(%rsp) # arg5
- mov %r11,40(%rsp) # arg6
- mov %r12,48(%rsp) # arg7
- mov %rcx,56(%rsp) # arg8, (NULL)
- call *__imp_RtlVirtualUnwind(%rip)
-
- mov \$1,%eax # ExceptionContinueSearch
- add \$64,%rsp
- popfq
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- pop %rdi
- pop %rsi
- RET
-.size avx_handler,.-avx_handler
-
-.section .pdata
-.align 4
- .rva .LSEH_begin_poly1305_block_init_arch
- .rva .LSEH_end_poly1305_block_init_arch
- .rva .LSEH_info_poly1305_block_init_arch
-
- .rva .LSEH_begin_poly1305_blocks_x86_64
- .rva .LSEH_end_poly1305_blocks_x86_64
- .rva .LSEH_info_poly1305_blocks_x86_64
-
- .rva .LSEH_begin_poly1305_emit_x86_64
- .rva .LSEH_end_poly1305_emit_x86_64
- .rva .LSEH_info_poly1305_emit_x86_64
-___
-$code.=<<___ if ($avx);
- .rva .LSEH_begin_poly1305_blocks_avx
- .rva .Lbase2_64_avx
- .rva .LSEH_info_poly1305_blocks_avx_1
-
- .rva .Lbase2_64_avx
- .rva .Leven_avx
- .rva .LSEH_info_poly1305_blocks_avx_2
-
- .rva .Leven_avx
- .rva .LSEH_end_poly1305_blocks_avx
- .rva .LSEH_info_poly1305_blocks_avx_3
-
- .rva .LSEH_begin_poly1305_emit_avx
- .rva .LSEH_end_poly1305_emit_avx
- .rva .LSEH_info_poly1305_emit_avx
-___
-$code.=<<___ if ($avx>1);
- .rva .LSEH_begin_poly1305_blocks_avx2
- .rva .Lbase2_64_avx2
- .rva .LSEH_info_poly1305_blocks_avx2_1
-
- .rva .Lbase2_64_avx2
- .rva .Leven_avx2
- .rva .LSEH_info_poly1305_blocks_avx2_2
-
- .rva .Leven_avx2
- .rva .LSEH_end_poly1305_blocks_avx2
- .rva .LSEH_info_poly1305_blocks_avx2_3
-___
-$code.=<<___ if ($avx>2);
- .rva .LSEH_begin_poly1305_blocks_avx512
- .rva .LSEH_end_poly1305_blocks_avx512
- .rva .LSEH_info_poly1305_blocks_avx512
-___
-$code.=<<___;
-.section .xdata
-.align 8
-.LSEH_info_poly1305_block_init_arch:
- .byte 9,0,0,0
- .rva se_handler
- .rva .LSEH_begin_poly1305_block_init_arch,.LSEH_begin_poly1305_block_init_arch
-
-.LSEH_info_poly1305_blocks_x86_64:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lblocks_body,.Lblocks_epilogue
-
-.LSEH_info_poly1305_emit_x86_64:
- .byte 9,0,0,0
- .rva se_handler
- .rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64
-___
-$code.=<<___ if ($avx);
-.LSEH_info_poly1305_blocks_avx_1:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[]
-
-.LSEH_info_poly1305_blocks_avx_2:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[]
-
-.LSEH_info_poly1305_blocks_avx_3:
- .byte 9,0,0,0
- .rva avx_handler
- .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[]
-
-.LSEH_info_poly1305_emit_avx:
- .byte 9,0,0,0
- .rva se_handler
- .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
-___
-$code.=<<___ if ($avx>1);
-.LSEH_info_poly1305_blocks_avx2_1:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[]
-
-.LSEH_info_poly1305_blocks_avx2_2:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[]
-
-.LSEH_info_poly1305_blocks_avx2_3:
- .byte 9,0,0,0
- .rva avx_handler
- .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[]
-___
-$code.=<<___ if ($avx>2);
-.LSEH_info_poly1305_blocks_avx512:
- .byte 9,0,0,0
- .rva avx_handler
- .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[]
-___
-}
-
-open SELF,$0;
-while(<SELF>) {
- next if (/^#!/);
- last if (!s/^#/\/\// and !/^$/);
- print;
-}
-close SELF;
-
-foreach (split('\n',$code)) {
- s/\`([^\`]*)\`/eval($1)/ge;
- s/%r([a-z]+)#d/%e$1/g;
- s/%r([0-9]+)#d/%r$1d/g;
- s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;
-
- if ($kernel) {
- s/(^\.type.*),[0-9]+$/\1/;
- s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/;
- next if /^\.cfi.*/;
- }
-
- print $_,"\n";
-}
-close STDOUT;
diff --git a/arch/x86/lib/crypto/poly1305_glue.c b/arch/x86/lib/crypto/poly1305_glue.c
deleted file mode 100644
index b7e78a583e07..000000000000
--- a/arch/x86/lib/crypto/poly1305_glue.c
+++ /dev/null
@@ -1,129 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#include <asm/cpu_device_id.h>
-#include <asm/fpu/api.h>
-#include <crypto/internal/poly1305.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/sizes.h>
-#include <linux/unaligned.h>
-
-struct poly1305_arch_internal {
- union {
- struct {
- u32 h[5];
- u32 is_base2_26;
- };
- u64 hs[3];
- };
- u64 r[2];
- u64 pad;
- struct { u32 r2, r1, r4, r3; } rn[9];
-};
-
-asmlinkage void poly1305_block_init_arch(
- struct poly1305_block_state *state,
- const u8 raw_key[POLY1305_BLOCK_SIZE]);
-EXPORT_SYMBOL_GPL(poly1305_block_init_arch);
-asmlinkage void poly1305_blocks_x86_64(struct poly1305_arch_internal *ctx,
- const u8 *inp,
- const size_t len, const u32 padbit);
-asmlinkage void poly1305_emit_x86_64(const struct poly1305_state *ctx,
- u8 mac[POLY1305_DIGEST_SIZE],
- const u32 nonce[4]);
-asmlinkage void poly1305_emit_avx(const struct poly1305_state *ctx,
- u8 mac[POLY1305_DIGEST_SIZE],
- const u32 nonce[4]);
-asmlinkage void poly1305_blocks_avx(struct poly1305_arch_internal *ctx,
- const u8 *inp, const size_t len,
- const u32 padbit);
-asmlinkage void poly1305_blocks_avx2(struct poly1305_arch_internal *ctx,
- const u8 *inp, const size_t len,
- const u32 padbit);
-asmlinkage void poly1305_blocks_avx512(struct poly1305_arch_internal *ctx,
- const u8 *inp,
- const size_t len, const u32 padbit);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512);
-
-void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *inp,
- unsigned int len, u32 padbit)
-{
- struct poly1305_arch_internal *ctx =
- container_of(&state->h.h, struct poly1305_arch_internal, h);
-
- /* SIMD disables preemption, so relax after processing each page. */
- BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE ||
- SZ_4K % POLY1305_BLOCK_SIZE);
-
- if (!static_branch_likely(&poly1305_use_avx)) {
- poly1305_blocks_x86_64(ctx, inp, len, padbit);
- return;
- }
-
- do {
- const unsigned int bytes = min(len, SZ_4K);
-
- kernel_fpu_begin();
- if (static_branch_likely(&poly1305_use_avx512))
- poly1305_blocks_avx512(ctx, inp, bytes, padbit);
- else if (static_branch_likely(&poly1305_use_avx2))
- poly1305_blocks_avx2(ctx, inp, bytes, padbit);
- else
- poly1305_blocks_avx(ctx, inp, bytes, padbit);
- kernel_fpu_end();
-
- len -= bytes;
- inp += bytes;
- } while (len);
-}
-EXPORT_SYMBOL_GPL(poly1305_blocks_arch);
-
-void poly1305_emit_arch(const struct poly1305_state *ctx,
- u8 mac[POLY1305_DIGEST_SIZE], const u32 nonce[4])
-{
- if (!static_branch_likely(&poly1305_use_avx))
- poly1305_emit_x86_64(ctx, mac, nonce);
- else
- poly1305_emit_avx(ctx, mac, nonce);
-}
-EXPORT_SYMBOL_GPL(poly1305_emit_arch);
-
-bool poly1305_is_arch_optimized(void)
-{
- return static_key_enabled(&poly1305_use_avx);
-}
-EXPORT_SYMBOL(poly1305_is_arch_optimized);
-
-static int __init poly1305_simd_mod_init(void)
-{
- if (boot_cpu_has(X86_FEATURE_AVX) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
- static_branch_enable(&poly1305_use_avx);
- if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
- static_branch_enable(&poly1305_use_avx2);
- if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) &&
- boot_cpu_has(X86_FEATURE_AVX512F) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) &&
- /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */
- boot_cpu_data.x86_vfm != INTEL_SKYLAKE_X)
- static_branch_enable(&poly1305_use_avx512);
- return 0;
-}
-subsys_initcall(poly1305_simd_mod_init);
-
-static void __exit poly1305_simd_mod_exit(void)
-{
-}
-module_exit(poly1305_simd_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
-MODULE_DESCRIPTION("Poly1305 authenticator");
diff --git a/arch/x86/lib/crypto/sha256-avx-asm.S b/arch/x86/lib/crypto/sha256-avx-asm.S
deleted file mode 100644
index 0d7b2c3e45d9..000000000000
--- a/arch/x86/lib/crypto/sha256-avx-asm.S
+++ /dev/null
@@ -1,499 +0,0 @@
-########################################################################
-# Implement fast SHA-256 with AVX1 instructions. (x86_64)
-#
-# Copyright (C) 2013 Intel Corporation.
-#
-# Authors:
-# James Guilford <james.guilford@intel.com>
-# Kirk Yap <kirk.s.yap@intel.com>
-# Tim Chen <tim.c.chen@linux.intel.com>
-#
-# This software is available to you under a choice of one of two
-# licenses. You may choose to be licensed under the terms of the GNU
-# General Public License (GPL) Version 2, available from the file
-# COPYING in the main directory of this source tree, or the
-# OpenIB.org BSD license below:
-#
-# Redistribution and use in source and binary forms, with or
-# without modification, are permitted provided that the following
-# conditions are met:
-#
-# - Redistributions of source code must retain the above
-# copyright notice, this list of conditions and the following
-# disclaimer.
-#
-# - Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials
-# provided with the distribution.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-########################################################################
-#
-# This code is described in an Intel White-Paper:
-# "Fast SHA-256 Implementations on Intel Architecture Processors"
-#
-# To find it, surf to http://www.intel.com/p/en_US/embedded
-# and search for that title.
-#
-########################################################################
-# This code schedules 1 block at a time, with 4 lanes per block
-########################################################################
-
-#include <linux/linkage.h>
-#include <linux/objtool.h>
-
-## assume buffers not aligned
-#define VMOVDQ vmovdqu
-
-################################ Define Macros
-
-# addm [mem], reg
-# Add reg to mem using reg-mem add and store
-.macro addm p1 p2
- add \p1, \p2
- mov \p2, \p1
-.endm
-
-
-.macro MY_ROR p1 p2
- shld $(32-(\p1)), \p2, \p2
-.endm
-
-################################
-
-# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
-# Load xmm with mem and byte swap each dword
-.macro COPY_XMM_AND_BSWAP p1 p2 p3
- VMOVDQ \p2, \p1
- vpshufb \p3, \p1, \p1
-.endm
-
-################################
-
-X0 = %xmm4
-X1 = %xmm5
-X2 = %xmm6
-X3 = %xmm7
-
-XTMP0 = %xmm0
-XTMP1 = %xmm1
-XTMP2 = %xmm2
-XTMP3 = %xmm3
-XTMP4 = %xmm8
-XFER = %xmm9
-XTMP5 = %xmm11
-
-SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA
-SHUF_DC00 = %xmm12 # shuffle xDxC -> DC00
-BYTE_FLIP_MASK = %xmm13
-
-NUM_BLKS = %rdx # 3rd arg
-INP = %rsi # 2nd arg
-CTX = %rdi # 1st arg
-
-SRND = %rsi # clobbers INP
-c = %ecx
-d = %r8d
-e = %edx
-TBL = %r12
-a = %eax
-b = %ebx
-
-f = %r9d
-g = %r10d
-h = %r11d
-
-y0 = %r13d
-y1 = %r14d
-y2 = %r15d
-
-
-_INP_END_SIZE = 8
-_INP_SIZE = 8
-_XFER_SIZE = 16
-_XMM_SAVE_SIZE = 0
-
-_INP_END = 0
-_INP = _INP_END + _INP_END_SIZE
-_XFER = _INP + _INP_SIZE
-_XMM_SAVE = _XFER + _XFER_SIZE
-STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
-
-# rotate_Xs
-# Rotate values of symbols X0...X3
-.macro rotate_Xs
-X_ = X0
-X0 = X1
-X1 = X2
-X2 = X3
-X3 = X_
-.endm
-
-# ROTATE_ARGS
-# Rotate values of symbols a...h
-.macro ROTATE_ARGS
-TMP_ = h
-h = g
-g = f
-f = e
-e = d
-d = c
-c = b
-b = a
-a = TMP_
-.endm
-
-.macro FOUR_ROUNDS_AND_SCHED
- ## compute s0 four at a time and s1 two at a time
- ## compute W[-16] + W[-7] 4 at a time
-
- mov e, y0 # y0 = e
- MY_ROR (25-11), y0 # y0 = e >> (25-11)
- mov a, y1 # y1 = a
- vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
- MY_ROR (22-13), y1 # y1 = a >> (22-13)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- mov f, y2 # y2 = f
- MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- xor a, y1 # y1 = a ^ (a >> (22-13)
- xor g, y2 # y2 = f^g
- vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- ## compute s0
- vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- add y0, y2 # y2 = S1 + CH
- add _XFER(%rsp), y2 # y2 = k + w + S1 + CH
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- vpsrld $7, XTMP1, XTMP2
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- vpslld $(32-7), XTMP1, XTMP3
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- vpor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- ROTATE_ARGS
- mov e, y0 # y0 = e
- mov a, y1 # y1 = a
- MY_ROR (25-11), y0 # y0 = e >> (25-11)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- mov f, y2 # y2 = f
- MY_ROR (22-13), y1 # y1 = a >> (22-13)
- vpsrld $18, XTMP1, XTMP2 #
- xor a, y1 # y1 = a ^ (a >> (22-13)
- MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- xor g, y2 # y2 = f^g
- vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
- MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
- vpslld $(32-18), XTMP1, XTMP1
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- vpxor XTMP1, XTMP3, XTMP3 #
- add y0, y2 # y2 = S1 + CH
- add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
- MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- vpxor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- ## compute low s1
- vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- ROTATE_ARGS
- mov e, y0 # y0 = e
- mov a, y1 # y1 = a
- MY_ROR (25-11), y0 # y0 = e >> (25-11)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- MY_ROR (22-13), y1 # y1 = a >> (22-13)
- mov f, y2 # y2 = f
- xor a, y1 # y1 = a ^ (a >> (22-13)
- MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
- xor g, y2 # y2 = f^g
- vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xBxA}
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xBxA}
- MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
- vpxor XTMP3, XTMP2, XTMP2 #
- add y0, y2 # y2 = S1 + CH
- MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
- vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA}
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- ## compute high s1
- vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- ROTATE_ARGS
- mov e, y0 # y0 = e
- MY_ROR (25-11), y0 # y0 = e >> (25-11)
- mov a, y1 # y1 = a
- MY_ROR (22-13), y1 # y1 = a >> (22-13)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- mov f, y2 # y2 = f
- MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
- xor a, y1 # y1 = a ^ (a >> (22-13)
- xor g, y2 # y2 = f^g
- vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xDxC}
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xDxC}
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- vpxor XTMP3, XTMP2, XTMP2
- MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- add y0, y2 # y2 = S1 + CH
- add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
- vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC}
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]}
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- ROTATE_ARGS
- rotate_Xs
-.endm
-
-## input is [rsp + _XFER + %1 * 4]
-.macro DO_ROUND round
- mov e, y0 # y0 = e
- MY_ROR (25-11), y0 # y0 = e >> (25-11)
- mov a, y1 # y1 = a
- xor e, y0 # y0 = e ^ (e >> (25-11))
- MY_ROR (22-13), y1 # y1 = a >> (22-13)
- mov f, y2 # y2 = f
- xor a, y1 # y1 = a ^ (a >> (22-13)
- MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- xor g, y2 # y2 = f^g
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- and e, y2 # y2 = (f^g)&e
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- add y0, y2 # y2 = S1 + CH
- MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- offset = \round * 4 + _XFER #
- add offset(%rsp), y2 # y2 = k + w + S1 + CH
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- ROTATE_ARGS
-.endm
-
-########################################################################
-## void sha256_transform_avx(u32 state[SHA256_STATE_WORDS],
-## const u8 *data, size_t nblocks);
-########################################################################
-.text
-SYM_FUNC_START(sha256_transform_avx)
- ANNOTATE_NOENDBR # since this is called only via static_call
-
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- pushq %rbp
- movq %rsp, %rbp
-
- subq $STACK_SIZE, %rsp # allocate stack space
- and $~15, %rsp # align stack pointer
-
- shl $6, NUM_BLKS # convert to bytes
- jz .Ldone_hash
- add INP, NUM_BLKS # pointer to end of data
- mov NUM_BLKS, _INP_END(%rsp)
-
- ## load initial digest
- mov 4*0(CTX), a
- mov 4*1(CTX), b
- mov 4*2(CTX), c
- mov 4*3(CTX), d
- mov 4*4(CTX), e
- mov 4*5(CTX), f
- mov 4*6(CTX), g
- mov 4*7(CTX), h
-
- vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
- vmovdqa _SHUF_00BA(%rip), SHUF_00BA
- vmovdqa _SHUF_DC00(%rip), SHUF_DC00
-.Lloop0:
- lea K256(%rip), TBL
-
- ## byte swap first 16 dwords
- COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK
-
- mov INP, _INP(%rsp)
-
- ## schedule 48 input dwords, by doing 3 rounds of 16 each
- mov $3, SRND
-.align 16
-.Lloop1:
- vpaddd (TBL), X0, XFER
- vmovdqa XFER, _XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- vpaddd 1*16(TBL), X0, XFER
- vmovdqa XFER, _XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- vpaddd 2*16(TBL), X0, XFER
- vmovdqa XFER, _XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- vpaddd 3*16(TBL), X0, XFER
- vmovdqa XFER, _XFER(%rsp)
- add $4*16, TBL
- FOUR_ROUNDS_AND_SCHED
-
- sub $1, SRND
- jne .Lloop1
-
- mov $2, SRND
-.Lloop2:
- vpaddd (TBL), X0, XFER
- vmovdqa XFER, _XFER(%rsp)
- DO_ROUND 0
- DO_ROUND 1
- DO_ROUND 2
- DO_ROUND 3
-
- vpaddd 1*16(TBL), X1, XFER
- vmovdqa XFER, _XFER(%rsp)
- add $2*16, TBL
- DO_ROUND 0
- DO_ROUND 1
- DO_ROUND 2
- DO_ROUND 3
-
- vmovdqa X2, X0
- vmovdqa X3, X1
-
- sub $1, SRND
- jne .Lloop2
-
- addm (4*0)(CTX),a
- addm (4*1)(CTX),b
- addm (4*2)(CTX),c
- addm (4*3)(CTX),d
- addm (4*4)(CTX),e
- addm (4*5)(CTX),f
- addm (4*6)(CTX),g
- addm (4*7)(CTX),h
-
- mov _INP(%rsp), INP
- add $64, INP
- cmp _INP_END(%rsp), INP
- jne .Lloop0
-
-.Ldone_hash:
-
- mov %rbp, %rsp
- popq %rbp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- RET
-SYM_FUNC_END(sha256_transform_avx)
-
-.section .rodata.cst256.K256, "aM", @progbits, 256
-.align 64
-K256:
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-
-.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
-.align 16
-PSHUFFLE_BYTE_FLIP_MASK:
- .octa 0x0c0d0e0f08090a0b0405060700010203
-
-.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
-.align 16
-# shuffle xBxA -> 00BA
-_SHUF_00BA:
- .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
-
-.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
-.align 16
-# shuffle xDxC -> DC00
-_SHUF_DC00:
- .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
diff --git a/arch/x86/lib/crypto/sha256-avx2-asm.S b/arch/x86/lib/crypto/sha256-avx2-asm.S
deleted file mode 100644
index 25d3380321ec..000000000000
--- a/arch/x86/lib/crypto/sha256-avx2-asm.S
+++ /dev/null
@@ -1,774 +0,0 @@
-########################################################################
-# Implement fast SHA-256 with AVX2 instructions. (x86_64)
-#
-# Copyright (C) 2013 Intel Corporation.
-#
-# Authors:
-# James Guilford <james.guilford@intel.com>
-# Kirk Yap <kirk.s.yap@intel.com>
-# Tim Chen <tim.c.chen@linux.intel.com>
-#
-# This software is available to you under a choice of one of two
-# licenses. You may choose to be licensed under the terms of the GNU
-# General Public License (GPL) Version 2, available from the file
-# COPYING in the main directory of this source tree, or the
-# OpenIB.org BSD license below:
-#
-# Redistribution and use in source and binary forms, with or
-# without modification, are permitted provided that the following
-# conditions are met:
-#
-# - Redistributions of source code must retain the above
-# copyright notice, this list of conditions and the following
-# disclaimer.
-#
-# - Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials
-# provided with the distribution.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-########################################################################
-#
-# This code is described in an Intel White-Paper:
-# "Fast SHA-256 Implementations on Intel Architecture Processors"
-#
-# To find it, surf to http://www.intel.com/p/en_US/embedded
-# and search for that title.
-#
-########################################################################
-# This code schedules 2 blocks at a time, with 4 lanes per block
-########################################################################
-
-#include <linux/linkage.h>
-#include <linux/objtool.h>
-
-## assume buffers not aligned
-#define VMOVDQ vmovdqu
-
-################################ Define Macros
-
-# addm [mem], reg
-# Add reg to mem using reg-mem add and store
-.macro addm p1 p2
- add \p1, \p2
- mov \p2, \p1
-.endm
-
-################################
-
-X0 = %ymm4
-X1 = %ymm5
-X2 = %ymm6
-X3 = %ymm7
-
-# XMM versions of above
-XWORD0 = %xmm4
-XWORD1 = %xmm5
-XWORD2 = %xmm6
-XWORD3 = %xmm7
-
-XTMP0 = %ymm0
-XTMP1 = %ymm1
-XTMP2 = %ymm2
-XTMP3 = %ymm3
-XTMP4 = %ymm8
-XFER = %ymm9
-XTMP5 = %ymm11
-
-SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA
-SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00
-BYTE_FLIP_MASK = %ymm13
-
-X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK
-
-NUM_BLKS = %rdx # 3rd arg
-INP = %rsi # 2nd arg
-CTX = %rdi # 1st arg
-c = %ecx
-d = %r8d
-e = %edx # clobbers NUM_BLKS
-y3 = %esi # clobbers INP
-
-SRND = CTX # SRND is same register as CTX
-
-a = %eax
-b = %ebx
-f = %r9d
-g = %r10d
-h = %r11d
-old_h = %r11d
-
-T1 = %r12d
-y0 = %r13d
-y1 = %r14d
-y2 = %r15d
-
-
-_XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round
-_XMM_SAVE_SIZE = 0
-_INP_END_SIZE = 8
-_INP_SIZE = 8
-_CTX_SIZE = 8
-
-_XFER = 0
-_XMM_SAVE = _XFER + _XFER_SIZE
-_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE
-_INP = _INP_END + _INP_END_SIZE
-_CTX = _INP + _INP_SIZE
-STACK_SIZE = _CTX + _CTX_SIZE
-
-# rotate_Xs
-# Rotate values of symbols X0...X3
-.macro rotate_Xs
- X_ = X0
- X0 = X1
- X1 = X2
- X2 = X3
- X3 = X_
-.endm
-
-# ROTATE_ARGS
-# Rotate values of symbols a...h
-.macro ROTATE_ARGS
- old_h = h
- TMP_ = h
- h = g
- g = f
- f = e
- e = d
- d = c
- c = b
- b = a
- a = TMP_
-.endm
-
-.macro FOUR_ROUNDS_AND_SCHED disp
-################################### RND N + 0 ############################
-
- mov a, y3 # y3 = a # MAJA
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
-
- addl \disp(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
- vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
- mov f, y2 # y2 = f # CH
- rorx $13, a, T1 # T1 = a >> 13 # S0B
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- xor g, y2 # y2 = f^g # CH
- vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]# y1 = (e >> 6)# S1
- rorx $6, e, y1 # y1 = (e >> 6) # S1
-
- and e, y2 # y2 = (f^g)&e # CH
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- add h, d # d = k + w + h + d # --
-
- and b, y3 # y3 = (a|c)&b # MAJA
- vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- rorx $2, a, T1 # T1 = (a >> 2) # S0
-
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- vpsrld $7, XTMP1, XTMP2
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
-
- add y0, y2 # y2 = S1 + CH # --
- vpslld $(32-7), XTMP1, XTMP3
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
- vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7
-
- vpsrld $18, XTMP1, XTMP2
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- add y3, h # h = t1 + S0 + MAJ # --
-
-
- ROTATE_ARGS
-
-################################### RND N + 1 ############################
-
- mov a, y3 # y3 = a # MAJA
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- offset = \disp + 1*4
- addl offset(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
-
- vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
- mov f, y2 # y2 = f # CH
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- xor g, y2 # y2 = f^g # CH
-
-
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- and e, y2 # y2 = (f^g)&e # CH
- add h, d # d = k + w + h + d # --
-
- vpslld $(32-18), XTMP1, XTMP1
- and b, y3 # y3 = (a|c)&b # MAJA
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
-
- vpxor XTMP1, XTMP3, XTMP3
- rorx $2, a, T1 # T1 = (a >> 2) # S0
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
-
- vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
- vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0
- vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- add y3, h # h = t1 + S0 + MAJ # --
-
- vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
-
-
- ROTATE_ARGS
-
-################################### RND N + 2 ############################
-
- mov a, y3 # y3 = a # MAJA
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- offset = \disp + 2*4
- addl offset(%rsp, SRND), h # h = k + w + h # --
-
- vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- or c, y3 # y3 = a|c # MAJA
- mov f, y2 # y2 = f # CH
- xor g, y2 # y2 = f^g # CH
-
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
- and e, y2 # y2 = (f^g)&e # CH
-
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- vpxor XTMP3, XTMP2, XTMP2
- add h, d # d = k + w + h + d # --
- and b, y3 # y3 = (a|c)&b # MAJA
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA}
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
-
- vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- rorx $2, a ,T1 # T1 = (a >> 2) # S0
- vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
- vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
-
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1,h # h = k + w + h + S0 # --
- add y2,d # d = k + w + h + d + S1 + CH = d + t1 # --
- add y2,h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
-
- add y3,h # h = t1 + S0 + MAJ # --
-
-
- ROTATE_ARGS
-
-################################### RND N + 3 ############################
-
- mov a, y3 # y3 = a # MAJA
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- offset = \disp + 3*4
- addl offset(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
-
- vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
- mov f, y2 # y2 = f # CH
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- xor g, y2 # y2 = f^g # CH
-
-
- vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add h, d # d = k + w + h + d # --
- and b, y3 # y3 = (a|c)&b # MAJA
-
- vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
-
- vpxor XTMP3, XTMP2, XTMP2
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- add y0, y2 # y2 = S1 + CH # --
-
- vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC}
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- rorx $2, a, T1 # T1 = (a >> 2) # S0
- vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
-
- vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]}
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
-
- add y1, h # h = k + w + h + S0 # --
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- add y3, h # h = t1 + S0 + MAJ # --
-
- ROTATE_ARGS
- rotate_Xs
-.endm
-
-.macro DO_4ROUNDS disp
-################################### RND N + 0 ###########################
-
- mov f, y2 # y2 = f # CH
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- and e, y2 # y2 = (f^g)&e # CH
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- rorx $2, a, T1 # T1 = (a >> 2) # S0
- addl \disp(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- ROTATE_ARGS
-
-################################### RND N + 1 ###########################
-
- add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- mov f, y2 # y2 = f # CH
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add y3, old_h # h = t1 + S0 + MAJ # --
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- rorx $2, a, T1 # T1 = (a >> 2) # S0
- offset = 4*1 + \disp
- addl offset(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- ROTATE_ARGS
-
-################################### RND N + 2 ##############################
-
- add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- mov f, y2 # y2 = f # CH
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add y3, old_h # h = t1 + S0 + MAJ # --
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- rorx $2, a, T1 # T1 = (a >> 2) # S0
- offset = 4*2 + \disp
- addl offset(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- ROTATE_ARGS
-
-################################### RND N + 3 ###########################
-
- add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- mov f, y2 # y2 = f # CH
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add y3, old_h # h = t1 + S0 + MAJ # --
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- rorx $2, a, T1 # T1 = (a >> 2) # S0
- offset = 4*3 + \disp
- addl offset(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
-
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
-
- add y3, h # h = t1 + S0 + MAJ # --
-
- ROTATE_ARGS
-
-.endm
-
-########################################################################
-## void sha256_transform_rorx(u32 state[SHA256_STATE_WORDS],
-## const u8 *data, size_t nblocks);
-########################################################################
-.text
-SYM_FUNC_START(sha256_transform_rorx)
- ANNOTATE_NOENDBR # since this is called only via static_call
-
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
-
- push %rbp
- mov %rsp, %rbp
-
- subq $STACK_SIZE, %rsp
- and $-32, %rsp # align rsp to 32 byte boundary
-
- shl $6, NUM_BLKS # convert to bytes
- jz .Ldone_hash
- lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
- mov NUM_BLKS, _INP_END(%rsp)
-
- cmp NUM_BLKS, INP
- je .Lonly_one_block
-
- ## load initial digest
- mov (CTX), a
- mov 4*1(CTX), b
- mov 4*2(CTX), c
- mov 4*3(CTX), d
- mov 4*4(CTX), e
- mov 4*5(CTX), f
- mov 4*6(CTX), g
- mov 4*7(CTX), h
-
- vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
- vmovdqa _SHUF_00BA(%rip), SHUF_00BA
- vmovdqa _SHUF_DC00(%rip), SHUF_DC00
-
- mov CTX, _CTX(%rsp)
-
-.Lloop0:
- ## Load first 16 dwords from two blocks
- VMOVDQ 0*32(INP),XTMP0
- VMOVDQ 1*32(INP),XTMP1
- VMOVDQ 2*32(INP),XTMP2
- VMOVDQ 3*32(INP),XTMP3
-
- ## byte swap data
- vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0
- vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1
- vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2
- vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3
-
- ## transpose data into high/low halves
- vperm2i128 $0x20, XTMP2, XTMP0, X0
- vperm2i128 $0x31, XTMP2, XTMP0, X1
- vperm2i128 $0x20, XTMP3, XTMP1, X2
- vperm2i128 $0x31, XTMP3, XTMP1, X3
-
-.Llast_block_enter:
- add $64, INP
- mov INP, _INP(%rsp)
-
- ## schedule 48 input dwords, by doing 3 rounds of 12 each
- xor SRND, SRND
-
-.align 16
-.Lloop1:
- leaq K256+0*32(%rip), INP ## reuse INP as scratch reg
- vpaddd (INP, SRND), X0, XFER
- vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
- FOUR_ROUNDS_AND_SCHED (_XFER + 0*32)
-
- leaq K256+1*32(%rip), INP
- vpaddd (INP, SRND), X0, XFER
- vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
- FOUR_ROUNDS_AND_SCHED (_XFER + 1*32)
-
- leaq K256+2*32(%rip), INP
- vpaddd (INP, SRND), X0, XFER
- vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
- FOUR_ROUNDS_AND_SCHED (_XFER + 2*32)
-
- leaq K256+3*32(%rip), INP
- vpaddd (INP, SRND), X0, XFER
- vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
- FOUR_ROUNDS_AND_SCHED (_XFER + 3*32)
-
- add $4*32, SRND
- cmp $3*4*32, SRND
- jb .Lloop1
-
-.Lloop2:
- ## Do last 16 rounds with no scheduling
- leaq K256+0*32(%rip), INP
- vpaddd (INP, SRND), X0, XFER
- vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
- DO_4ROUNDS (_XFER + 0*32)
-
- leaq K256+1*32(%rip), INP
- vpaddd (INP, SRND), X1, XFER
- vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
- DO_4ROUNDS (_XFER + 1*32)
- add $2*32, SRND
-
- vmovdqa X2, X0
- vmovdqa X3, X1
-
- cmp $4*4*32, SRND
- jb .Lloop2
-
- mov _CTX(%rsp), CTX
- mov _INP(%rsp), INP
-
- addm (4*0)(CTX),a
- addm (4*1)(CTX),b
- addm (4*2)(CTX),c
- addm (4*3)(CTX),d
- addm (4*4)(CTX),e
- addm (4*5)(CTX),f
- addm (4*6)(CTX),g
- addm (4*7)(CTX),h
-
- cmp _INP_END(%rsp), INP
- ja .Ldone_hash
-
- #### Do second block using previously scheduled results
- xor SRND, SRND
-.align 16
-.Lloop3:
- DO_4ROUNDS (_XFER + 0*32 + 16)
- DO_4ROUNDS (_XFER + 1*32 + 16)
- add $2*32, SRND
- cmp $4*4*32, SRND
- jb .Lloop3
-
- mov _CTX(%rsp), CTX
- mov _INP(%rsp), INP
- add $64, INP
-
- addm (4*0)(CTX),a
- addm (4*1)(CTX),b
- addm (4*2)(CTX),c
- addm (4*3)(CTX),d
- addm (4*4)(CTX),e
- addm (4*5)(CTX),f
- addm (4*6)(CTX),g
- addm (4*7)(CTX),h
-
- cmp _INP_END(%rsp), INP
- jb .Lloop0
- ja .Ldone_hash
-
-.Ldo_last_block:
- VMOVDQ 0*16(INP),XWORD0
- VMOVDQ 1*16(INP),XWORD1
- VMOVDQ 2*16(INP),XWORD2
- VMOVDQ 3*16(INP),XWORD3
-
- vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0
- vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1
- vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2
- vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3
-
- jmp .Llast_block_enter
-
-.Lonly_one_block:
-
- ## load initial digest
- mov (4*0)(CTX),a
- mov (4*1)(CTX),b
- mov (4*2)(CTX),c
- mov (4*3)(CTX),d
- mov (4*4)(CTX),e
- mov (4*5)(CTX),f
- mov (4*6)(CTX),g
- mov (4*7)(CTX),h
-
- vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
- vmovdqa _SHUF_00BA(%rip), SHUF_00BA
- vmovdqa _SHUF_DC00(%rip), SHUF_DC00
-
- mov CTX, _CTX(%rsp)
- jmp .Ldo_last_block
-
-.Ldone_hash:
-
- mov %rbp, %rsp
- pop %rbp
-
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- vzeroupper
- RET
-SYM_FUNC_END(sha256_transform_rorx)
-
-.section .rodata.cst512.K256, "aM", @progbits, 512
-.align 64
-K256:
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-
-.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
-.align 32
-PSHUFFLE_BYTE_FLIP_MASK:
- .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
-
-# shuffle xBxA -> 00BA
-.section .rodata.cst32._SHUF_00BA, "aM", @progbits, 32
-.align 32
-_SHUF_00BA:
- .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
-
-# shuffle xDxC -> DC00
-.section .rodata.cst32._SHUF_DC00, "aM", @progbits, 32
-.align 32
-_SHUF_DC00:
- .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
diff --git a/arch/x86/lib/crypto/sha256-ni-asm.S b/arch/x86/lib/crypto/sha256-ni-asm.S
deleted file mode 100644
index d3548206cf3d..000000000000
--- a/arch/x86/lib/crypto/sha256-ni-asm.S
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
- * Intel SHA Extensions optimized implementation of a SHA-256 update function
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * Sean Gulley <sean.m.gulley@intel.com>
- * Tim Chen <tim.c.chen@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2015 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/linkage.h>
-#include <linux/objtool.h>
-
-#define STATE_PTR %rdi /* 1st arg */
-#define DATA_PTR %rsi /* 2nd arg */
-#define NUM_BLKS %rdx /* 3rd arg */
-
-#define SHA256CONSTANTS %rax
-
-#define MSG %xmm0 /* sha256rnds2 implicit operand */
-#define STATE0 %xmm1
-#define STATE1 %xmm2
-#define MSG0 %xmm3
-#define MSG1 %xmm4
-#define MSG2 %xmm5
-#define MSG3 %xmm6
-#define TMP %xmm7
-
-#define SHUF_MASK %xmm8
-
-#define ABEF_SAVE %xmm9
-#define CDGH_SAVE %xmm10
-
-.macro do_4rounds i, m0, m1, m2, m3
-.if \i < 16
- movdqu \i*4(DATA_PTR), \m0
- pshufb SHUF_MASK, \m0
-.endif
- movdqa (\i-32)*4(SHA256CONSTANTS), MSG
- paddd \m0, MSG
- sha256rnds2 STATE0, STATE1
-.if \i >= 12 && \i < 60
- movdqa \m0, TMP
- palignr $4, \m3, TMP
- paddd TMP, \m1
- sha256msg2 \m0, \m1
-.endif
- punpckhqdq MSG, MSG
- sha256rnds2 STATE1, STATE0
-.if \i >= 4 && \i < 52
- sha256msg1 \m0, \m3
-.endif
-.endm
-
-/*
- * Intel SHA Extensions optimized implementation of a SHA-256 block function
- *
- * This function takes a pointer to the current SHA-256 state, a pointer to the
- * input data, and the number of 64-byte blocks to process. Once all blocks
- * have been processed, the state is updated with the new state. This function
- * only processes complete blocks. State initialization, buffering of partial
- * blocks, and digest finalization is expected to be handled elsewhere.
- *
- * void sha256_ni_transform(u32 state[SHA256_STATE_WORDS],
- * const u8 *data, size_t nblocks);
- */
-.text
-SYM_FUNC_START(sha256_ni_transform)
- ANNOTATE_NOENDBR # since this is called only via static_call
-
- shl $6, NUM_BLKS /* convert to bytes */
- jz .Ldone_hash
- add DATA_PTR, NUM_BLKS /* pointer to end of data */
-
- /*
- * load initial hash values
- * Need to reorder these appropriately
- * DCBA, HGFE -> ABEF, CDGH
- */
- movdqu 0*16(STATE_PTR), STATE0 /* DCBA */
- movdqu 1*16(STATE_PTR), STATE1 /* HGFE */
-
- movdqa STATE0, TMP
- punpcklqdq STATE1, STATE0 /* FEBA */
- punpckhqdq TMP, STATE1 /* DCHG */
- pshufd $0x1B, STATE0, STATE0 /* ABEF */
- pshufd $0xB1, STATE1, STATE1 /* CDGH */
-
- movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
- lea K256+32*4(%rip), SHA256CONSTANTS
-
-.Lloop0:
- /* Save hash values for addition after rounds */
- movdqa STATE0, ABEF_SAVE
- movdqa STATE1, CDGH_SAVE
-
-.irp i, 0, 16, 32, 48
- do_4rounds (\i + 0), MSG0, MSG1, MSG2, MSG3
- do_4rounds (\i + 4), MSG1, MSG2, MSG3, MSG0
- do_4rounds (\i + 8), MSG2, MSG3, MSG0, MSG1
- do_4rounds (\i + 12), MSG3, MSG0, MSG1, MSG2
-.endr
-
- /* Add current hash values with previously saved */
- paddd ABEF_SAVE, STATE0
- paddd CDGH_SAVE, STATE1
-
- /* Increment data pointer and loop if more to process */
- add $64, DATA_PTR
- cmp NUM_BLKS, DATA_PTR
- jne .Lloop0
-
- /* Write hash values back in the correct order */
- movdqa STATE0, TMP
- punpcklqdq STATE1, STATE0 /* GHEF */
- punpckhqdq TMP, STATE1 /* ABCD */
- pshufd $0xB1, STATE0, STATE0 /* HGFE */
- pshufd $0x1B, STATE1, STATE1 /* DCBA */
-
- movdqu STATE1, 0*16(STATE_PTR)
- movdqu STATE0, 1*16(STATE_PTR)
-
-.Ldone_hash:
-
- RET
-SYM_FUNC_END(sha256_ni_transform)
-
-.section .rodata.cst256.K256, "aM", @progbits, 256
-.align 64
-K256:
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-
-.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
-.align 16
-PSHUFFLE_BYTE_FLIP_MASK:
- .octa 0x0c0d0e0f08090a0b0405060700010203
diff --git a/arch/x86/lib/crypto/sha256-ssse3-asm.S b/arch/x86/lib/crypto/sha256-ssse3-asm.S
deleted file mode 100644
index 7f24a4cdcb25..000000000000
--- a/arch/x86/lib/crypto/sha256-ssse3-asm.S
+++ /dev/null
@@ -1,511 +0,0 @@
-########################################################################
-# Implement fast SHA-256 with SSSE3 instructions. (x86_64)
-#
-# Copyright (C) 2013 Intel Corporation.
-#
-# Authors:
-# James Guilford <james.guilford@intel.com>
-# Kirk Yap <kirk.s.yap@intel.com>
-# Tim Chen <tim.c.chen@linux.intel.com>
-#
-# This software is available to you under a choice of one of two
-# licenses. You may choose to be licensed under the terms of the GNU
-# General Public License (GPL) Version 2, available from the file
-# COPYING in the main directory of this source tree, or the
-# OpenIB.org BSD license below:
-#
-# Redistribution and use in source and binary forms, with or
-# without modification, are permitted provided that the following
-# conditions are met:
-#
-# - Redistributions of source code must retain the above
-# copyright notice, this list of conditions and the following
-# disclaimer.
-#
-# - Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials
-# provided with the distribution.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-########################################################################
-#
-# This code is described in an Intel White-Paper:
-# "Fast SHA-256 Implementations on Intel Architecture Processors"
-#
-# To find it, surf to http://www.intel.com/p/en_US/embedded
-# and search for that title.
-#
-########################################################################
-
-#include <linux/linkage.h>
-#include <linux/objtool.h>
-
-## assume buffers not aligned
-#define MOVDQ movdqu
-
-################################ Define Macros
-
-# addm [mem], reg
-# Add reg to mem using reg-mem add and store
-.macro addm p1 p2
- add \p1, \p2
- mov \p2, \p1
-.endm
-
-################################
-
-# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
-# Load xmm with mem and byte swap each dword
-.macro COPY_XMM_AND_BSWAP p1 p2 p3
- MOVDQ \p2, \p1
- pshufb \p3, \p1
-.endm
-
-################################
-
-X0 = %xmm4
-X1 = %xmm5
-X2 = %xmm6
-X3 = %xmm7
-
-XTMP0 = %xmm0
-XTMP1 = %xmm1
-XTMP2 = %xmm2
-XTMP3 = %xmm3
-XTMP4 = %xmm8
-XFER = %xmm9
-
-SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA
-SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00
-BYTE_FLIP_MASK = %xmm12
-
-NUM_BLKS = %rdx # 3rd arg
-INP = %rsi # 2nd arg
-CTX = %rdi # 1st arg
-
-SRND = %rsi # clobbers INP
-c = %ecx
-d = %r8d
-e = %edx
-TBL = %r12
-a = %eax
-b = %ebx
-
-f = %r9d
-g = %r10d
-h = %r11d
-
-y0 = %r13d
-y1 = %r14d
-y2 = %r15d
-
-
-
-_INP_END_SIZE = 8
-_INP_SIZE = 8
-_XFER_SIZE = 16
-_XMM_SAVE_SIZE = 0
-
-_INP_END = 0
-_INP = _INP_END + _INP_END_SIZE
-_XFER = _INP + _INP_SIZE
-_XMM_SAVE = _XFER + _XFER_SIZE
-STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
-
-# rotate_Xs
-# Rotate values of symbols X0...X3
-.macro rotate_Xs
-X_ = X0
-X0 = X1
-X1 = X2
-X2 = X3
-X3 = X_
-.endm
-
-# ROTATE_ARGS
-# Rotate values of symbols a...h
-.macro ROTATE_ARGS
-TMP_ = h
-h = g
-g = f
-f = e
-e = d
-d = c
-c = b
-b = a
-a = TMP_
-.endm
-
-.macro FOUR_ROUNDS_AND_SCHED
- ## compute s0 four at a time and s1 two at a time
- ## compute W[-16] + W[-7] 4 at a time
- movdqa X3, XTMP0
- mov e, y0 # y0 = e
- ror $(25-11), y0 # y0 = e >> (25-11)
- mov a, y1 # y1 = a
- palignr $4, X2, XTMP0 # XTMP0 = W[-7]
- ror $(22-13), y1 # y1 = a >> (22-13)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- mov f, y2 # y2 = f
- ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- movdqa X1, XTMP1
- xor a, y1 # y1 = a ^ (a >> (22-13)
- xor g, y2 # y2 = f^g
- paddd X0, XTMP0 # XTMP0 = W[-7] + W[-16]
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- ## compute s0
- palignr $4, X0, XTMP1 # XTMP1 = W[-15]
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- movdqa XTMP1, XTMP2 # XTMP2 = W[-15]
- ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- add y0, y2 # y2 = S1 + CH
- add _XFER(%rsp) , y2 # y2 = k + w + S1 + CH
- movdqa XTMP1, XTMP3 # XTMP3 = W[-15]
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- pslld $(32-7), XTMP1 #
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- psrld $7, XTMP2 #
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- por XTMP2, XTMP1 # XTMP1 = W[-15] ror 7
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- #
- ROTATE_ARGS #
- movdqa XTMP3, XTMP2 # XTMP2 = W[-15]
- mov e, y0 # y0 = e
- mov a, y1 # y1 = a
- movdqa XTMP3, XTMP4 # XTMP4 = W[-15]
- ror $(25-11), y0 # y0 = e >> (25-11)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- mov f, y2 # y2 = f
- ror $(22-13), y1 # y1 = a >> (22-13)
- pslld $(32-18), XTMP3 #
- xor a, y1 # y1 = a ^ (a >> (22-13)
- ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- xor g, y2 # y2 = f^g
- psrld $18, XTMP2 #
- ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
- pxor XTMP3, XTMP1
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- psrld $3, XTMP4 # XTMP4 = W[-15] >> 3
- add y0, y2 # y2 = S1 + CH
- add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
- ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- pxor XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- pxor XTMP4, XTMP1 # XTMP1 = s0
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- ## compute low s1
- pshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- paddd XTMP1, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
-
- ROTATE_ARGS
- movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {BBAA}
- mov e, y0 # y0 = e
- mov a, y1 # y1 = a
- ror $(25-11), y0 # y0 = e >> (25-11)
- movdqa XTMP2, XTMP4 # XTMP4 = W[-2] {BBAA}
- xor e, y0 # y0 = e ^ (e >> (25-11))
- ror $(22-13), y1 # y1 = a >> (22-13)
- mov f, y2 # y2 = f
- xor a, y1 # y1 = a ^ (a >> (22-13)
- ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
- xor g, y2 # y2 = f^g
- psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- psrld $10, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
- ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
- pxor XTMP3, XTMP2
- add y0, y2 # y2 = S1 + CH
- ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
- pxor XTMP2, XTMP4 # XTMP4 = s1 {xBxA}
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- pshufb SHUF_00BA, XTMP4 # XTMP4 = s1 {00BA}
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- paddd XTMP4, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- ## compute high s1
- pshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {BBAA}
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- #
- ROTATE_ARGS #
- movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {DDCC}
- mov e, y0 # y0 = e
- ror $(25-11), y0 # y0 = e >> (25-11)
- mov a, y1 # y1 = a
- movdqa XTMP2, X0 # X0 = W[-2] {DDCC}
- ror $(22-13), y1 # y1 = a >> (22-13)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- mov f, y2 # y2 = f
- ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
- xor a, y1 # y1 = a ^ (a >> (22-13)
- xor g, y2 # y2 = f^g
- psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25
- and e, y2 # y2 = (f^g)&e
- ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- psrld $10, X0 # X0 = W[-2] >> 10 {DDCC}
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22
- ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>2
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- pxor XTMP3, XTMP2 #
- ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>2
- add y0, y2 # y2 = S1 + CH
- add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
- pxor XTMP2, X0 # X0 = s1 {xDxC}
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- pshufb SHUF_DC00, X0 # X0 = s1 {DC00}
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- paddd XTMP0, X0 # X0 = {W[3], W[2], W[1], W[0]}
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
-
- ROTATE_ARGS
- rotate_Xs
-.endm
-
-## input is [rsp + _XFER + %1 * 4]
-.macro DO_ROUND round
- mov e, y0 # y0 = e
- ror $(25-11), y0 # y0 = e >> (25-11)
- mov a, y1 # y1 = a
- xor e, y0 # y0 = e ^ (e >> (25-11))
- ror $(22-13), y1 # y1 = a >> (22-13)
- mov f, y2 # y2 = f
- xor a, y1 # y1 = a ^ (a >> (22-13)
- ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- xor g, y2 # y2 = f^g
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- and e, y2 # y2 = (f^g)&e
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- add y0, y2 # y2 = S1 + CH
- ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- offset = \round * 4 + _XFER
- add offset(%rsp), y2 # y2 = k + w + S1 + CH
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- ROTATE_ARGS
-.endm
-
-########################################################################
-## void sha256_transform_ssse3(u32 state[SHA256_STATE_WORDS],
-## const u8 *data, size_t nblocks);
-########################################################################
-.text
-SYM_FUNC_START(sha256_transform_ssse3)
- ANNOTATE_NOENDBR # since this is called only via static_call
-
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- pushq %rbp
- mov %rsp, %rbp
-
- subq $STACK_SIZE, %rsp
- and $~15, %rsp
-
- shl $6, NUM_BLKS # convert to bytes
- jz .Ldone_hash
- add INP, NUM_BLKS
- mov NUM_BLKS, _INP_END(%rsp) # pointer to end of data
-
- ## load initial digest
- mov 4*0(CTX), a
- mov 4*1(CTX), b
- mov 4*2(CTX), c
- mov 4*3(CTX), d
- mov 4*4(CTX), e
- mov 4*5(CTX), f
- mov 4*6(CTX), g
- mov 4*7(CTX), h
-
- movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
- movdqa _SHUF_00BA(%rip), SHUF_00BA
- movdqa _SHUF_DC00(%rip), SHUF_DC00
-
-.Lloop0:
- lea K256(%rip), TBL
-
- ## byte swap first 16 dwords
- COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK
-
- mov INP, _INP(%rsp)
-
- ## schedule 48 input dwords, by doing 3 rounds of 16 each
- mov $3, SRND
-.align 16
-.Lloop1:
- movdqa (TBL), XFER
- paddd X0, XFER
- movdqa XFER, _XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- movdqa 1*16(TBL), XFER
- paddd X0, XFER
- movdqa XFER, _XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- movdqa 2*16(TBL), XFER
- paddd X0, XFER
- movdqa XFER, _XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- movdqa 3*16(TBL), XFER
- paddd X0, XFER
- movdqa XFER, _XFER(%rsp)
- add $4*16, TBL
- FOUR_ROUNDS_AND_SCHED
-
- sub $1, SRND
- jne .Lloop1
-
- mov $2, SRND
-.Lloop2:
- paddd (TBL), X0
- movdqa X0, _XFER(%rsp)
- DO_ROUND 0
- DO_ROUND 1
- DO_ROUND 2
- DO_ROUND 3
- paddd 1*16(TBL), X1
- movdqa X1, _XFER(%rsp)
- add $2*16, TBL
- DO_ROUND 0
- DO_ROUND 1
- DO_ROUND 2
- DO_ROUND 3
-
- movdqa X2, X0
- movdqa X3, X1
-
- sub $1, SRND
- jne .Lloop2
-
- addm (4*0)(CTX),a
- addm (4*1)(CTX),b
- addm (4*2)(CTX),c
- addm (4*3)(CTX),d
- addm (4*4)(CTX),e
- addm (4*5)(CTX),f
- addm (4*6)(CTX),g
- addm (4*7)(CTX),h
-
- mov _INP(%rsp), INP
- add $64, INP
- cmp _INP_END(%rsp), INP
- jne .Lloop0
-
-.Ldone_hash:
-
- mov %rbp, %rsp
- popq %rbp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
-
- RET
-SYM_FUNC_END(sha256_transform_ssse3)
-
-.section .rodata.cst256.K256, "aM", @progbits, 256
-.align 64
-K256:
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-
-.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
-.align 16
-PSHUFFLE_BYTE_FLIP_MASK:
- .octa 0x0c0d0e0f08090a0b0405060700010203
-
-.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
-.align 16
-# shuffle xBxA -> 00BA
-_SHUF_00BA:
- .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
-
-.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
-.align 16
-# shuffle xDxC -> DC00
-_SHUF_DC00:
- .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
diff --git a/arch/x86/lib/crypto/sha256.c b/arch/x86/lib/crypto/sha256.c
deleted file mode 100644
index 80380f8fdcee..000000000000
--- a/arch/x86/lib/crypto/sha256.c
+++ /dev/null
@@ -1,80 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SHA-256 optimized for x86_64
- *
- * Copyright 2025 Google LLC
- */
-#include <asm/fpu/api.h>
-#include <crypto/internal/sha2.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/static_call.h>
-
-asmlinkage void sha256_transform_ssse3(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks);
-asmlinkage void sha256_transform_avx(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks);
-asmlinkage void sha256_transform_rorx(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks);
-asmlinkage void sha256_ni_transform(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_x86);
-
-DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_transform_ssse3);
-
-void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks)
-{
- if (static_branch_likely(&have_sha256_x86)) {
- kernel_fpu_begin();
- static_call(sha256_blocks_x86)(state, data, nblocks);
- kernel_fpu_end();
- } else {
- sha256_blocks_generic(state, data, nblocks);
- }
-}
-EXPORT_SYMBOL_GPL(sha256_blocks_simd);
-
-void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks)
-{
- sha256_blocks_generic(state, data, nblocks);
-}
-EXPORT_SYMBOL_GPL(sha256_blocks_arch);
-
-bool sha256_is_arch_optimized(void)
-{
- return static_key_enabled(&have_sha256_x86);
-}
-EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
-
-static int __init sha256_x86_mod_init(void)
-{
- if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
- static_call_update(sha256_blocks_x86, sha256_ni_transform);
- } else if (cpu_has_xfeatures(XFEATURE_MASK_SSE |
- XFEATURE_MASK_YMM, NULL) &&
- boot_cpu_has(X86_FEATURE_AVX)) {
- if (boot_cpu_has(X86_FEATURE_AVX2) &&
- boot_cpu_has(X86_FEATURE_BMI2))
- static_call_update(sha256_blocks_x86,
- sha256_transform_rorx);
- else
- static_call_update(sha256_blocks_x86,
- sha256_transform_avx);
- } else if (!boot_cpu_has(X86_FEATURE_SSSE3)) {
- return 0;
- }
- static_branch_enable(&have_sha256_x86);
- return 0;
-}
-subsys_initcall(sha256_x86_mod_init);
-
-static void __exit sha256_x86_mod_exit(void)
-{
-}
-module_exit(sha256_x86_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA-256 optimized for x86_64");
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 89079ea73e65..a4700ef6eb64 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -266,6 +266,32 @@ static void effective_prot(struct ptdump_state *pt_st, int level, u64 val)
st->prot_levels[level] = effective;
}
+static void effective_prot_pte(struct ptdump_state *st, pte_t pte)
+{
+ effective_prot(st, 4, pte_val(pte));
+}
+
+static void effective_prot_pmd(struct ptdump_state *st, pmd_t pmd)
+{
+ effective_prot(st, 3, pmd_val(pmd));
+}
+
+static void effective_prot_pud(struct ptdump_state *st, pud_t pud)
+{
+ effective_prot(st, 2, pud_val(pud));
+}
+
+static void effective_prot_p4d(struct ptdump_state *st, p4d_t p4d)
+{
+ effective_prot(st, 1, p4d_val(p4d));
+}
+
+static void effective_prot_pgd(struct ptdump_state *st, pgd_t pgd)
+{
+ effective_prot(st, 0, pgd_val(pgd));
+}
+
+
/*
* This function gets called on a break in a continuous series
* of PTE entries; the next one is different so we need to
@@ -362,6 +388,38 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
}
}
+static void note_page_pte(struct ptdump_state *pt_st, unsigned long addr, pte_t pte)
+{
+ note_page(pt_st, addr, 4, pte_val(pte));
+}
+
+static void note_page_pmd(struct ptdump_state *pt_st, unsigned long addr, pmd_t pmd)
+{
+ note_page(pt_st, addr, 3, pmd_val(pmd));
+}
+
+static void note_page_pud(struct ptdump_state *pt_st, unsigned long addr, pud_t pud)
+{
+ note_page(pt_st, addr, 2, pud_val(pud));
+}
+
+static void note_page_p4d(struct ptdump_state *pt_st, unsigned long addr, p4d_t p4d)
+{
+ note_page(pt_st, addr, 1, p4d_val(p4d));
+}
+
+static void note_page_pgd(struct ptdump_state *pt_st, unsigned long addr, pgd_t pgd)
+{
+ note_page(pt_st, addr, 0, pgd_val(pgd));
+}
+
+static void note_page_flush(struct ptdump_state *pt_st)
+{
+ pte_t pte_zero = {0};
+
+ note_page(pt_st, 0, -1, pte_val(pte_zero));
+}
+
bool ptdump_walk_pgd_level_core(struct seq_file *m,
struct mm_struct *mm, pgd_t *pgd,
bool checkwx, bool dmesg)
@@ -378,8 +436,17 @@ bool ptdump_walk_pgd_level_core(struct seq_file *m,
struct pg_state st = {
.ptdump = {
- .note_page = note_page,
- .effective_prot = effective_prot,
+ .note_page_pte = note_page_pte,
+ .note_page_pmd = note_page_pmd,
+ .note_page_pud = note_page_pud,
+ .note_page_p4d = note_page_p4d,
+ .note_page_pgd = note_page_pgd,
+ .note_page_flush = note_page_flush,
+ .effective_prot_pte = effective_prot_pte,
+ .effective_prot_pmd = effective_prot_pmd,
+ .effective_prot_pud = effective_prot_pud,
+ .effective_prot_p4d = effective_prot_p4d,
+ .effective_prot_pgd = effective_prot_pgd,
.range = ptdump_ranges
},
.level = -1,
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index bf8dab18be97..2fdc1f1f5adb 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -122,13 +122,12 @@ static bool ex_handler_sgx(const struct exception_table_entry *fixup,
static bool ex_handler_fprestore(const struct exception_table_entry *fixup,
struct pt_regs *regs)
{
- regs->ip = ex_fixup_addr(fixup);
-
WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.",
(void *)instruction_pointer(regs));
fpu_reset_from_exception_fixup();
- return true;
+
+ return ex_handler_default(fixup, regs);
}
/*
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 607d6a2e66e2..8a34fff6ab2b 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -30,7 +30,6 @@
#include <linux/initrd.h>
#include <linux/cpumask.h>
#include <linux/gfp.h>
-#include <linux/execmem.h>
#include <asm/asm.h>
#include <asm/bios_ebda.h>
@@ -749,8 +748,6 @@ void mark_rodata_ro(void)
pr_info("Write protecting kernel text and read-only data: %luk\n",
size >> 10);
- execmem_cache_make_ro();
-
kernel_set_to_readonly = 1;
#ifdef CONFIG_CPA_DEBUG
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 66330fe4e18c..76e33bd7c556 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -34,7 +34,6 @@
#include <linux/gfp.h>
#include <linux/kcore.h>
#include <linux/bootmem_info.h>
-#include <linux/execmem.h>
#include <asm/processor.h>
#include <asm/bios_ebda.h>
@@ -806,7 +805,7 @@ kernel_physical_mapping_change(unsigned long paddr_start,
}
#ifndef CONFIG_NUMA
-static inline void x86_numa_init(void)
+static __always_inline void x86_numa_init(void)
{
memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0);
}
@@ -1392,8 +1391,6 @@ void mark_rodata_ro(void)
(end - start) >> 10);
set_memory_ro(start, (end - start) >> PAGE_SHIFT);
- execmem_cache_make_ro();
-
kernel_set_to_readonly = 1;
/*
@@ -1467,16 +1464,21 @@ static unsigned long probe_memory_block_size(void)
}
/*
- * Use max block size to minimize overhead on bare metal, where
- * alignment for memory hotplug isn't a concern.
+ * When hotplug alignment is not a concern, maximize blocksize
+ * to minimize overhead. Otherwise, align to the lesser of advice
+ * alignment and end of memory alignment.
*/
- if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ bz = memory_block_advised_max_size();
+ if (!bz) {
bz = MAX_BLOCK_SIZE;
- goto done;
+ if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
+ goto done;
+ } else {
+ bz = max(min(bz, MAX_BLOCK_SIZE), MIN_MEMORY_BLOCK_SIZE);
}
/* Find the largest allowed block size that aligns to memory end */
- for (bz = MAX_BLOCK_SIZE; bz > MIN_MEMORY_BLOCK_SIZE; bz >>= 1) {
+ for (; bz > MIN_MEMORY_BLOCK_SIZE; bz >>= 1) {
if (IS_ALIGNED(boot_mem_end, bz))
break;
}
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 331e101bf801..12c8180ca1ba 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -71,7 +71,7 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size,
static unsigned int __ioremap_check_ram(struct resource *res)
{
unsigned long start_pfn, stop_pfn;
- unsigned long i;
+ unsigned long pfn;
if ((res->flags & IORESOURCE_SYSTEM_RAM) != IORESOURCE_SYSTEM_RAM)
return 0;
@@ -79,9 +79,8 @@ static unsigned int __ioremap_check_ram(struct resource *res)
start_pfn = (res->start + PAGE_SIZE - 1) >> PAGE_SHIFT;
stop_pfn = (res->end + 1) >> PAGE_SHIFT;
if (stop_pfn > start_pfn) {
- for (i = 0; i < (stop_pfn - start_pfn); ++i)
- if (pfn_valid(start_pfn + i) &&
- !PageReserved(pfn_to_page(start_pfn + i)))
+ for_each_valid_pfn(pfn, start_pfn, stop_pfn)
+ if (!PageReserved(pfn_to_page(pfn)))
return IORES_MAP_SYSTEM_RAM;
}
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index c97b527c66fe..2e7923844afe 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -775,6 +775,12 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
return vma_prot;
}
+static inline void pgprot_set_cachemode(pgprot_t *prot, enum page_cache_mode pcm)
+{
+ *prot = __pgprot((pgprot_val(*prot) & ~_PAGE_CACHE_MASK) |
+ cachemode2protval(pcm));
+}
+
int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
unsigned long size, pgprot_t *vma_prot)
{
@@ -789,8 +795,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
if (file->f_flags & O_DSYNC)
pcm = _PAGE_CACHE_MODE_UC_MINUS;
- *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
- cachemode2protval(pcm));
+ pgprot_set_cachemode(vma_prot, pcm);
return 1;
}
@@ -831,8 +836,7 @@ int memtype_kernel_map_sync(u64 base, unsigned long size,
* Reserved non RAM regions only and after successful memtype_reserve,
* this func also keeps identity mapping (if any) in sync with this new prot.
*/
-static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
- int strict_prot)
+static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot)
{
int is_ram = 0;
int ret;
@@ -858,9 +862,7 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
(unsigned long long)paddr,
(unsigned long long)(paddr + size - 1),
cattr_name(pcm));
- *vma_prot = __pgprot((pgprot_val(*vma_prot) &
- (~_PAGE_CACHE_MASK)) |
- cachemode2protval(pcm));
+ pgprot_set_cachemode(vma_prot, pcm);
}
return 0;
}
@@ -870,8 +872,7 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
return ret;
if (pcm != want_pcm) {
- if (strict_prot ||
- !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
+ if (!is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
memtype_free(paddr, paddr + size);
pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n",
current->comm, current->pid,
@@ -881,13 +882,7 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
cattr_name(pcm));
return -EINVAL;
}
- /*
- * We allow returning different type than the one requested in
- * non strict case.
- */
- *vma_prot = __pgprot((pgprot_val(*vma_prot) &
- (~_PAGE_CACHE_MASK)) |
- cachemode2protval(pcm));
+ pgprot_set_cachemode(vma_prot, pcm);
}
if (memtype_kernel_map_sync(paddr, size, pcm) < 0) {
@@ -910,124 +905,14 @@ static void free_pfn_range(u64 paddr, unsigned long size)
memtype_free(paddr, paddr + size);
}
-static int follow_phys(struct vm_area_struct *vma, unsigned long *prot,
- resource_size_t *phys)
-{
- struct follow_pfnmap_args args = { .vma = vma, .address = vma->vm_start };
-
- if (follow_pfnmap_start(&args))
- return -EINVAL;
-
- /* Never return PFNs of anon folios in COW mappings. */
- if (!args.special) {
- follow_pfnmap_end(&args);
- return -EINVAL;
- }
-
- *prot = pgprot_val(args.pgprot);
- *phys = (resource_size_t)args.pfn << PAGE_SHIFT;
- follow_pfnmap_end(&args);
- return 0;
-}
-
-static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr,
- pgprot_t *pgprot)
-{
- unsigned long prot;
-
- VM_WARN_ON_ONCE(!(vma->vm_flags & VM_PAT));
-
- /*
- * We need the starting PFN and cachemode used for track_pfn_remap()
- * that covered the whole VMA. For most mappings, we can obtain that
- * information from the page tables. For COW mappings, we might now
- * suddenly have anon folios mapped and follow_phys() will fail.
- *
- * Fallback to using vma->vm_pgoff, see remap_pfn_range_notrack(), to
- * detect the PFN. If we need the cachemode as well, we're out of luck
- * for now and have to fail fork().
- */
- if (!follow_phys(vma, &prot, paddr)) {
- if (pgprot)
- *pgprot = __pgprot(prot);
- return 0;
- }
- if (is_cow_mapping(vma->vm_flags)) {
- if (pgprot)
- return -EINVAL;
- *paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
- return 0;
- }
- WARN_ON_ONCE(1);
- return -EINVAL;
-}
-
-int track_pfn_copy(struct vm_area_struct *dst_vma,
- struct vm_area_struct *src_vma, unsigned long *pfn)
-{
- const unsigned long vma_size = src_vma->vm_end - src_vma->vm_start;
- resource_size_t paddr;
- pgprot_t pgprot;
- int rc;
-
- if (!(src_vma->vm_flags & VM_PAT))
- return 0;
-
- /*
- * Duplicate the PAT information for the dst VMA based on the src
- * VMA.
- */
- if (get_pat_info(src_vma, &paddr, &pgprot))
- return -EINVAL;
- rc = reserve_pfn_range(paddr, vma_size, &pgprot, 1);
- if (rc)
- return rc;
-
- /* Reservation for the destination VMA succeeded. */
- vm_flags_set(dst_vma, VM_PAT);
- *pfn = PHYS_PFN(paddr);
- return 0;
-}
-
-void untrack_pfn_copy(struct vm_area_struct *dst_vma, unsigned long pfn)
-{
- untrack_pfn(dst_vma, pfn, dst_vma->vm_end - dst_vma->vm_start, true);
- /*
- * Reservation was freed, any copied page tables will get cleaned
- * up later, but without getting PAT involved again.
- */
-}
-
-/*
- * prot is passed in as a parameter for the new mapping. If the vma has
- * a linear pfn mapping for the entire range, or no vma is provided,
- * reserve the entire pfn + size range with single reserve_pfn_range
- * call.
- */
-int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
- unsigned long pfn, unsigned long addr, unsigned long size)
+int pfnmap_setup_cachemode(unsigned long pfn, unsigned long size, pgprot_t *prot)
{
resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
enum page_cache_mode pcm;
- /* reserve the whole chunk starting from paddr */
- if (!vma || (addr == vma->vm_start
- && size == (vma->vm_end - vma->vm_start))) {
- int ret;
-
- ret = reserve_pfn_range(paddr, size, prot, 0);
- if (ret == 0 && vma)
- vm_flags_set(vma, VM_PAT);
- return ret;
- }
-
if (!pat_enabled())
return 0;
- /*
- * For anything smaller than the vma size we set prot based on the
- * lookup.
- */
pcm = lookup_memtype(paddr);
/* Check memtype for the remaining pages */
@@ -1038,70 +923,35 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
return -EINVAL;
}
- *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
- cachemode2protval(pcm));
-
+ pgprot_set_cachemode(prot, pcm);
return 0;
}
-void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn)
+int pfnmap_track(unsigned long pfn, unsigned long size, pgprot_t *prot)
{
- enum page_cache_mode pcm;
+ const resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
- if (!pat_enabled())
- return;
-
- /* Set prot based on lookup */
- pcm = lookup_memtype(pfn_t_to_phys(pfn));
- *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
- cachemode2protval(pcm));
+ return reserve_pfn_range(paddr, size, prot);
}
-/*
- * untrack_pfn is called while unmapping a pfnmap for a region.
- * untrack can be called for a specific region indicated by pfn and size or
- * can be for the entire vma (in which case pfn, size are zero).
- */
-void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
- unsigned long size, bool mm_wr_locked)
+void pfnmap_untrack(unsigned long pfn, unsigned long size)
{
- resource_size_t paddr;
-
- if (vma && !(vma->vm_flags & VM_PAT))
- return;
+ const resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
- /* free the chunk starting from pfn or the whole chunk */
- paddr = (resource_size_t)pfn << PAGE_SHIFT;
- if (!paddr && !size) {
- if (get_pat_info(vma, &paddr, NULL))
- return;
- size = vma->vm_end - vma->vm_start;
- }
free_pfn_range(paddr, size);
- if (vma) {
- if (mm_wr_locked)
- vm_flags_clear(vma, VM_PAT);
- else
- __vm_flags_mod(vma, 0, VM_PAT);
- }
-}
-
-void untrack_pfn_clear(struct vm_area_struct *vma)
-{
- vm_flags_clear(vma, VM_PAT);
}
pgprot_t pgprot_writecombine(pgprot_t prot)
{
- return __pgprot(pgprot_val(prot) |
- cachemode2protval(_PAGE_CACHE_MODE_WC));
+ pgprot_set_cachemode(&prot, _PAGE_CACHE_MODE_WC);
+ return prot;
}
EXPORT_SYMBOL_GPL(pgprot_writecombine);
pgprot_t pgprot_writethrough(pgprot_t prot)
{
- return __pgprot(pgprot_val(prot) |
- cachemode2protval(_PAGE_CACHE_MODE_WT));
+ pgprot_set_cachemode(&prot, _PAGE_CACHE_MODE_WT);
+ return prot;
}
EXPORT_SYMBOL_GPL(pgprot_writethrough);
diff --git a/arch/x86/mm/pat/memtype_interval.c b/arch/x86/mm/pat/memtype_interval.c
index 645613d59942..e5844ed1311e 100644
--- a/arch/x86/mm/pat/memtype_interval.c
+++ b/arch/x86/mm/pat/memtype_interval.c
@@ -49,32 +49,6 @@ INTERVAL_TREE_DEFINE(struct memtype, rb, u64, subtree_max_end,
static struct rb_root_cached memtype_rbroot = RB_ROOT_CACHED;
-enum {
- MEMTYPE_EXACT_MATCH = 0,
- MEMTYPE_END_MATCH = 1
-};
-
-static struct memtype *memtype_match(u64 start, u64 end, int match_type)
-{
- struct memtype *entry_match;
-
- entry_match = interval_iter_first(&memtype_rbroot, start, end-1);
-
- while (entry_match != NULL && entry_match->start < end) {
- if ((match_type == MEMTYPE_EXACT_MATCH) &&
- (entry_match->start == start) && (entry_match->end == end))
- return entry_match;
-
- if ((match_type == MEMTYPE_END_MATCH) &&
- (entry_match->start < start) && (entry_match->end == end))
- return entry_match;
-
- entry_match = interval_iter_next(entry_match, start, end-1);
- }
-
- return NULL; /* Returns NULL if there is no match */
-}
-
static int memtype_check_conflict(u64 start, u64 end,
enum page_cache_mode reqtype,
enum page_cache_mode *newtype)
@@ -130,35 +104,16 @@ int memtype_check_insert(struct memtype *entry_new, enum page_cache_mode *ret_ty
struct memtype *memtype_erase(u64 start, u64 end)
{
- struct memtype *entry_old;
-
- /*
- * Since the memtype_rbroot tree allows overlapping ranges,
- * memtype_erase() checks with EXACT_MATCH first, i.e. free
- * a whole node for the munmap case. If no such entry is found,
- * it then checks with END_MATCH, i.e. shrink the size of a node
- * from the end for the mremap case.
- */
- entry_old = memtype_match(start, end, MEMTYPE_EXACT_MATCH);
- if (!entry_old) {
- entry_old = memtype_match(start, end, MEMTYPE_END_MATCH);
- if (!entry_old)
- return ERR_PTR(-EINVAL);
+ struct memtype *entry = interval_iter_first(&memtype_rbroot, start, end - 1);
+
+ while (entry && entry->start < end) {
+ if (entry->start == start && entry->end == end) {
+ interval_remove(entry, &memtype_rbroot);
+ return entry;
+ }
+ entry = interval_iter_next(entry, start, end - 1);
}
-
- if (entry_old->start == start) {
- /* munmap: erase this node */
- interval_remove(entry_old, &memtype_rbroot);
- } else {
- /* mremap: update the end value of this node */
- interval_remove(entry_old, &memtype_rbroot);
- entry_old->end = start;
- interval_insert(entry_old, &memtype_rbroot);
-
- return NULL;
- }
-
- return entry_old;
+ return ERR_PTR(-EINVAL);
}
struct memtype *memtype_lookup(u64 addr)
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 30ab4aced761..8834c76f91c9 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -1257,6 +1257,9 @@ static int collapse_pmd_page(pmd_t *pmd, unsigned long addr,
pgprot_t pgprot;
int i = 0;
+ if (!cpu_feature_enabled(X86_FEATURE_PSE))
+ return 0;
+
addr &= PMD_MASK;
pte = pte_offset_kernel(pmd, addr);
first = *pte;
@@ -2148,6 +2151,19 @@ static inline int cpa_clear_pages_array(struct page **pages, int numpages,
CPA_PAGES_ARRAY, pages);
}
+/*
+ * __set_memory_prot is an internal helper for callers that have been passed
+ * a pgprot_t value from upper layers and a reservation has already been taken.
+ * If you want to set the pgprot to a specific page protocol, use the
+ * set_memory_xx() functions.
+ */
+int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot)
+{
+ return change_page_attr_set_clr(&addr, numpages, prot,
+ __pgprot(~pgprot_val(prot)), 0, 0,
+ NULL);
+}
+
int _set_memory_uc(unsigned long addr, int numpages)
{
/*
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 62777ba4de1a..ddf248c3ee7d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -189,7 +189,7 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
if (!ptdesc)
failed = true;
- if (ptdesc && !pagetable_pmd_ctor(ptdesc)) {
+ if (ptdesc && !pagetable_pmd_ctor(mm, ptdesc)) {
pagetable_free(ptdesc);
ptdesc = NULL;
failed = true;
@@ -751,14 +751,13 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr)
for (i = 0; i < PTRS_PER_PMD; i++) {
if (!pmd_none(pmd_sv[i])) {
pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
- free_page((unsigned long)pte);
+ pte_free_kernel(&init_mm, pte);
}
}
free_page((unsigned long)pmd_sv);
- pagetable_dtor(virt_to_ptdesc(pmd));
- free_page((unsigned long)pmd);
+ pmd_free(&init_mm, pmd);
return 1;
}
@@ -781,7 +780,7 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
/* INVLPG to clear all paging-structure caches */
flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
- free_page((unsigned long)pte);
+ pte_free_kernel(&init_mm, pte);
return 1;
}
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 190299834011..b10d4d131dce 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -38,6 +38,7 @@
#include <asm/desc.h>
#include <asm/sections.h>
#include <asm/set_memory.h>
+#include <asm/bugs.h>
#undef pr_fmt
#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
@@ -84,7 +85,8 @@ void __init pti_check_boottime_disable(void)
return;
}
- if (cpu_mitigations_off())
+ if (pti_mode == PTI_AUTO &&
+ !cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL))
pti_mode = PTI_FORCE_OFF;
if (pti_mode == PTI_FORCE_OFF) {
pti_print_if_insecure("disabled on command line.");
@@ -98,6 +100,11 @@ void __init pti_check_boottime_disable(void)
return;
setup_force_cpu_cap(X86_FEATURE_PTI);
+
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
+ pr_debug("PTI enabled, disabling INVLPGB\n");
+ setup_clear_cpu_cap(X86_FEATURE_INVLPGB);
+ }
}
static int __init pti_parse_cmdline(char *arg)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 15672cb926fc..7e3fca164620 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -3501,13 +3501,6 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func
return emit_bpf_dispatcher(&prog, 0, num_funcs - 1, funcs, image, buf);
}
-static const char *bpf_get_prog_name(struct bpf_prog *prog)
-{
- if (prog->aux->ksym.prog)
- return prog->aux->ksym.name;
- return prog->aux->name;
-}
-
static void priv_stack_init_guard(void __percpu *priv_stack_ptr, int alloc_size)
{
int cpu, underflow_idx = (alloc_size - PRIV_STACK_GUARD_SZ) >> 3;
@@ -3531,7 +3524,7 @@ static void priv_stack_check_guard(void __percpu *priv_stack_ptr, int alloc_size
if (stack_ptr[0] != PRIV_STACK_GUARD_VAL ||
stack_ptr[underflow_idx] != PRIV_STACK_GUARD_VAL) {
pr_err("BPF private stack overflow/underflow detected for prog %sx\n",
- bpf_get_prog_name(prog));
+ bpf_jit_get_prog_name(prog));
break;
}
}
@@ -3845,7 +3838,6 @@ void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp
}
return;
#endif
- WARN(1, "verification of programs using bpf_throw should have failed\n");
}
void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index 4933fb337983..c1efd5b0d198 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -8,13 +8,13 @@ obj-$(CONFIG_PCI_OLPC) += olpc.o
obj-$(CONFIG_PCI_XEN) += xen.o
obj-y += fixup.o
-obj-$(CONFIG_X86_INTEL_CE) += ce4100.o
obj-$(CONFIG_ACPI) += acpi.o
obj-y += legacy.o irq.o
-obj-$(CONFIG_X86_NUMACHIP) += numachip.o
+obj-$(CONFIG_X86_INTEL_CE) += ce4100.o
+obj-$(CONFIG_X86_INTEL_MID) += intel_mid.o
-obj-$(CONFIG_X86_INTEL_MID) += intel_mid_pci.o
+obj-$(CONFIG_X86_NUMACHIP) += numachip.o
obj-y += common.o early.o
obj-y += bus_numa.o
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 36336299596b..e7e71490bd25 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -970,13 +970,13 @@ static void amd_rp_pme_suspend(struct pci_dev *dev)
struct pci_dev *rp;
/*
- * PM_SUSPEND_ON means we're doing runtime suspend, which means
+ * If system suspend is not in progress, we're doing runtime suspend, so
* amd-pmc will not be involved so PMEs during D3 work as advertised.
*
* The PMEs *do* work if amd-pmc doesn't put the SoC in the hardware
* sleep state, but we assume amd-pmc is always present.
*/
- if (pm_suspend_target_state == PM_SUSPEND_ON)
+ if (!pm_suspend_in_progress())
return;
rp = pcie_find_root_port(dev);
diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid.c
index b433b1753016..b433b1753016 100644
--- a/arch/x86/pci/intel_mid_pci.c
+++ b/arch/x86/pci/intel_mid.c
diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c
index f8126821a94d..aaa7017416f7 100644
--- a/arch/x86/platform/ce4100/ce4100.c
+++ b/arch/x86/platform/ce4100/ce4100.c
@@ -5,19 +5,12 @@
* (C) Copyright 2010 Intel Corporation
*/
#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/irq.h>
#include <linux/reboot.h>
-#include <linux/serial_reg.h>
-#include <linux/serial_8250.h>
#include <asm/ce4100.h>
#include <asm/prom.h>
#include <asm/setup.h>
-#include <asm/i8259.h>
#include <asm/io.h>
-#include <asm/io_apic.h>
-#include <asm/emergency-restart.h>
/*
* The CE4100 platform has an internal 8051 Microcontroller which is
@@ -31,94 +24,6 @@ static void ce4100_power_off(void)
outb(0x4, 0xcf9);
}
-#ifdef CONFIG_SERIAL_8250
-
-static unsigned int mem_serial_in(struct uart_port *p, int offset)
-{
- offset = offset << p->regshift;
- return readl(p->membase + offset);
-}
-
-/*
- * The UART Tx interrupts are not set under some conditions and therefore serial
- * transmission hangs. This is a silicon issue and has not been root caused. The
- * workaround for this silicon issue checks UART_LSR_THRE bit and UART_LSR_TEMT
- * bit of LSR register in interrupt handler to see whether at least one of these
- * two bits is set, if so then process the transmit request. If this workaround
- * is not applied, then the serial transmission may hang. This workaround is for
- * errata number 9 in Errata - B step.
-*/
-
-static unsigned int ce4100_mem_serial_in(struct uart_port *p, int offset)
-{
- unsigned int ret, ier, lsr;
-
- if (offset == UART_IIR) {
- offset = offset << p->regshift;
- ret = readl(p->membase + offset);
- if (ret & UART_IIR_NO_INT) {
- /* see if the TX interrupt should have really set */
- ier = mem_serial_in(p, UART_IER);
- /* see if the UART's XMIT interrupt is enabled */
- if (ier & UART_IER_THRI) {
- lsr = mem_serial_in(p, UART_LSR);
- /* now check to see if the UART should be
- generating an interrupt (but isn't) */
- if (lsr & (UART_LSR_THRE | UART_LSR_TEMT))
- ret &= ~UART_IIR_NO_INT;
- }
- }
- } else
- ret = mem_serial_in(p, offset);
- return ret;
-}
-
-static void ce4100_mem_serial_out(struct uart_port *p, int offset, int value)
-{
- offset = offset << p->regshift;
- writel(value, p->membase + offset);
-}
-
-static void ce4100_serial_fixup(int port, struct uart_port *up,
- u32 *capabilities)
-{
-#ifdef CONFIG_EARLY_PRINTK
- /*
- * Over ride the legacy port configuration that comes from
- * asm/serial.h. Using the ioport driver then switching to the
- * PCI memmaped driver hangs the IOAPIC
- */
- if (up->iotype != UPIO_MEM32) {
- up->uartclk = 14745600;
- up->mapbase = 0xdffe0200;
- set_fixmap_nocache(FIX_EARLYCON_MEM_BASE,
- up->mapbase & PAGE_MASK);
- up->membase =
- (void __iomem *)__fix_to_virt(FIX_EARLYCON_MEM_BASE);
- up->membase += up->mapbase & ~PAGE_MASK;
- up->mapbase += port * 0x100;
- up->membase += port * 0x100;
- up->iotype = UPIO_MEM32;
- up->regshift = 2;
- up->irq = 4;
- }
-#endif
- up->iobase = 0;
- up->serial_in = ce4100_mem_serial_in;
- up->serial_out = ce4100_mem_serial_out;
-
- *capabilities |= (1 << 12);
-}
-
-static __init void sdv_serial_fixup(void)
-{
- serial8250_set_isa_configurator(ce4100_serial_fixup);
-}
-
-#else
-static inline void sdv_serial_fixup(void) {};
-#endif
-
static void __init sdv_arch_setup(void)
{
sdv_serial_fixup();
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index e7e8f77f77f8..b4409df2105a 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -216,8 +216,8 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
* When SEV-ES is active, the GHCB as set by the kernel will be used
* by firmware. Create a 1:1 unencrypted mapping for each GHCB.
*/
- if (sev_es_efi_map_ghcbs(pgd)) {
- pr_err("Failed to create 1:1 mapping for the GHCBs!\n");
+ if (sev_es_efi_map_ghcbs_cas(pgd)) {
+ pr_err("Failed to create 1:1 mapping for the GHCBs and CAs!\n");
return 1;
}
diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c
index a7c23f2a58c9..a2294c1649f6 100644
--- a/arch/x86/power/hibernate.c
+++ b/arch/x86/power/hibernate.c
@@ -192,7 +192,8 @@ out:
int arch_resume_nosmt(void)
{
- int ret = 0;
+ int ret;
+
/*
* We reached this while coming out of hibernation. This means
* that SMT siblings are sleeping in hlt, as mwait is not safe
@@ -206,18 +207,10 @@ int arch_resume_nosmt(void)
* Called with hotplug disabled.
*/
cpu_hotplug_enable();
- if (cpu_smt_control == CPU_SMT_DISABLED ||
- cpu_smt_control == CPU_SMT_FORCE_DISABLED) {
- enum cpuhp_smt_control old = cpu_smt_control;
-
- ret = cpuhp_smt_enable();
- if (ret)
- goto out;
- ret = cpuhp_smt_disable(old);
- if (ret)
- goto out;
- }
-out:
+
+ ret = arch_cpu_rescan_dead_smt_siblings();
+
cpu_hotplug_disable();
+
return ret;
}
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index ebdfd7b84feb..e0a607a14e7e 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -35,7 +35,7 @@ targets += purgatory.ro purgatory.chk
PURGATORY_CFLAGS_REMOVE := -mcmodel=kernel
PURGATORY_CFLAGS := -mcmodel=small -ffreestanding -fno-zero-initialized-in-bss -g0
PURGATORY_CFLAGS += -fpic -fvisibility=hidden
-PURGATORY_CFLAGS += $(DISABLE_STACKLEAK_PLUGIN) -DDISABLE_BRANCH_PROFILING
+PURGATORY_CFLAGS += $(DISABLE_KSTACK_ERASE) -DDISABLE_BRANCH_PROFILING
PURGATORY_CFLAGS += -fno-stack-protector
# Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That
diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c
index aea47e793963..655139dd0532 100644
--- a/arch/x86/purgatory/purgatory.c
+++ b/arch/x86/purgatory/purgatory.c
@@ -25,7 +25,7 @@ static int verify_sha256_digest(void)
{
struct kexec_sha_region *ptr, *end;
u8 digest[SHA256_DIGEST_SIZE];
- struct sha256_state sctx;
+ struct sha256_ctx sctx;
sha256_init(&sctx);
end = purgatory_sha_regions + ARRAY_SIZE(purgatory_sha_regions);
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index ed5c63c0b4e5..88be32026768 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -66,6 +66,8 @@ void __init reserve_real_mode(void)
* setup_arch().
*/
memblock_reserve(0, SZ_1M);
+
+ memblock_clear_kho_scratch(0, SZ_1M);
}
static void __init sme_sev_setup_real_mode(struct trampoline_header *th)
diff --git a/arch/x86/tools/insn_decoder_test.c b/arch/x86/tools/insn_decoder_test.c
index 08cd913cbd4e..8bf15c4aefa9 100644
--- a/arch/x86/tools/insn_decoder_test.c
+++ b/arch/x86/tools/insn_decoder_test.c
@@ -167,7 +167,7 @@ int main(int argc, char **argv)
pr_warn("Decoded and checked %d instructions with %d "
"failures\n", insns, warnings);
else
- fprintf(stdout, "%s: success: Decoded and checked %d"
+ fprintf(stdout, " %s: success: Decoded and checked %d"
" instructions\n", prog, insns);
return 0;
}
diff --git a/arch/x86/tools/insn_sanity.c b/arch/x86/tools/insn_sanity.c
index 213f35f94feb..e743f0ea01ee 100644
--- a/arch/x86/tools/insn_sanity.c
+++ b/arch/x86/tools/insn_sanity.c
@@ -253,9 +253,9 @@ int main(int argc, char **argv)
}
fprintf((errors) ? stderr : stdout,
- "%s: %s: decoded and checked %d %s instructions with %d errors (seed:0x%x)\n",
+ " %s: %s: Decoded and checked %d %s instructions with %d errors (seed:0x%x)\n",
prog,
- (errors) ? "Failure" : "Success",
+ (errors) ? "failure" : "success",
insns,
(input_file) ? "given" : "random",
errors,
diff --git a/arch/x86/um/asm/checksum.h b/arch/x86/um/asm/checksum.h
index b07824500363..ddc144657efa 100644
--- a/arch/x86/um/asm/checksum.h
+++ b/arch/x86/um/asm/checksum.h
@@ -20,6 +20,9 @@
*/
extern __wsum csum_partial(const void *buff, int len, __wsum sum);
+/* Do not call this directly. Declared for export type visibility. */
+extern __visible __wsum csum_partial_copy_generic(const void *src, void *dst, int len);
+
/**
* csum_fold - Fold and invert a 32bit checksum.
* sum: 32bit unfolded sum
diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h
index 478710384b34..e222d2ae28fd 100644
--- a/arch/x86/um/asm/processor.h
+++ b/arch/x86/um/asm/processor.h
@@ -21,10 +21,10 @@
#include <asm/user.h>
-/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
-static __always_inline void rep_nop(void)
+/* PAUSE is a good thing to insert into busy-wait loops. */
+static __always_inline void native_pause(void)
{
- __asm__ __volatile__("rep;nop": : :"memory");
+ __asm__ __volatile__("pause": : :"memory");
}
static __always_inline void cpu_relax(void)
@@ -33,7 +33,7 @@ static __always_inline void cpu_relax(void)
time_travel_mode == TT_MODE_EXTERNAL)
time_travel_ndelay(1);
else
- rep_nop();
+ native_pause();
}
#define task_pt_regs(t) (&(t)->thread.regs)
diff --git a/arch/x86/um/asm/syscall.h b/arch/x86/um/asm/syscall.h
index 56a2f0913e3c..d6208d0fad51 100644
--- a/arch/x86/um/asm/syscall.h
+++ b/arch/x86/um/asm/syscall.h
@@ -9,6 +9,8 @@ typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long,
unsigned long, unsigned long,
unsigned long, unsigned long);
+extern const sys_call_ptr_t sys_call_table[];
+
static inline int syscall_get_arch(struct task_struct *task)
{
#ifdef CONFIG_X86_32
diff --git a/arch/x86/um/os-Linux/mcontext.c b/arch/x86/um/os-Linux/mcontext.c
index 37decaa74761..a21403df6663 100644
--- a/arch/x86/um/os-Linux/mcontext.c
+++ b/arch/x86/um/os-Linux/mcontext.c
@@ -1,7 +1,10 @@
// SPDX-License-Identifier: GPL-2.0
-#include <sys/ucontext.h>
#define __FRAME_OFFSETS
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <sys/ucontext.h>
#include <asm/ptrace.h>
+#include <asm/sigcontext.h>
#include <sysdep/ptrace.h>
#include <sysdep/mcontext.h>
#include <arch.h>
@@ -18,6 +21,10 @@ void get_regs_from_mc(struct uml_pt_regs *regs, mcontext_t *mc)
COPY2(UESP, ESP); /* sic */
COPY(EBX); COPY(EDX); COPY(ECX); COPY(EAX);
COPY(EIP); COPY_SEG_CPL3(CS); COPY(EFL); COPY_SEG_CPL3(SS);
+#undef COPY2
+#undef COPY
+#undef COPY_SEG
+#undef COPY_SEG_CPL3
#else
#define COPY2(X,Y) regs->gp[X/sizeof(unsigned long)] = mc->gregs[REG_##Y]
#define COPY(X) regs->gp[X/sizeof(unsigned long)] = mc->gregs[REG_##X]
@@ -29,6 +36,8 @@ void get_regs_from_mc(struct uml_pt_regs *regs, mcontext_t *mc)
COPY2(EFLAGS, EFL);
COPY2(CS, CSGSFS);
regs->gp[SS / sizeof(unsigned long)] = mc->gregs[REG_CSGSFS] >> 48;
+#undef COPY2
+#undef COPY
#endif
}
@@ -42,3 +51,210 @@ void mc_set_rip(void *_mc, void *target)
mc->gregs[REG_RIP] = (unsigned long)target;
#endif
}
+
+/* Same thing, but the copy macros are turned around. */
+void get_mc_from_regs(struct uml_pt_regs *regs, mcontext_t *mc, int single_stepping)
+{
+#ifdef __i386__
+#define COPY2(X,Y) mc->gregs[REG_##Y] = regs->gp[X]
+#define COPY(X) mc->gregs[REG_##X] = regs->gp[X]
+#define COPY_SEG(X) mc->gregs[REG_##X] = regs->gp[X] & 0xffff;
+#define COPY_SEG_CPL3(X) mc->gregs[REG_##X] = (regs->gp[X] & 0xffff) | 3;
+ COPY_SEG(GS); COPY_SEG(FS); COPY_SEG(ES); COPY_SEG(DS);
+ COPY(EDI); COPY(ESI); COPY(EBP);
+ COPY2(UESP, ESP); /* sic */
+ COPY(EBX); COPY(EDX); COPY(ECX); COPY(EAX);
+ COPY(EIP); COPY_SEG_CPL3(CS); COPY(EFL); COPY_SEG_CPL3(SS);
+#else
+#define COPY2(X,Y) mc->gregs[REG_##Y] = regs->gp[X/sizeof(unsigned long)]
+#define COPY(X) mc->gregs[REG_##X] = regs->gp[X/sizeof(unsigned long)]
+ COPY(R8); COPY(R9); COPY(R10); COPY(R11);
+ COPY(R12); COPY(R13); COPY(R14); COPY(R15);
+ COPY(RDI); COPY(RSI); COPY(RBP); COPY(RBX);
+ COPY(RDX); COPY(RAX); COPY(RCX); COPY(RSP);
+ COPY(RIP);
+ COPY2(EFLAGS, EFL);
+ mc->gregs[REG_CSGSFS] = mc->gregs[REG_CSGSFS] & 0xffffffffffffl;
+ mc->gregs[REG_CSGSFS] |= (regs->gp[SS / sizeof(unsigned long)] & 0xffff) << 48;
+#endif
+
+ if (single_stepping)
+ mc->gregs[REG_EFL] |= X86_EFLAGS_TF;
+ else
+ mc->gregs[REG_EFL] &= ~X86_EFLAGS_TF;
+}
+
+#ifdef CONFIG_X86_32
+struct _xstate_64 {
+ struct _fpstate_64 fpstate;
+ struct _header xstate_hdr;
+ struct _ymmh_state ymmh;
+ /* New processor state extensions go here: */
+};
+
+/* Not quite the right structures as these contain more information */
+int um_i387_from_fxsr(struct _fpstate_32 *i387,
+ const struct _fpstate_64 *fxsave);
+int um_fxsr_from_i387(struct _fpstate_64 *fxsave,
+ const struct _fpstate_32 *from);
+#else
+#define _xstate_64 _xstate
+#endif
+
+static struct _fpstate *get_fpstate(struct stub_data *data,
+ mcontext_t *mcontext,
+ int *fp_size)
+{
+ struct _fpstate *res;
+
+ /* Assume floating point registers are on the same page */
+ res = (void *)(((unsigned long)mcontext->fpregs &
+ (UM_KERN_PAGE_SIZE - 1)) +
+ (unsigned long)&data->sigstack[0]);
+
+ if ((void *)res + sizeof(struct _fpstate) >
+ (void *)data->sigstack + sizeof(data->sigstack))
+ return NULL;
+
+ if (res->sw_reserved.magic1 != FP_XSTATE_MAGIC1) {
+ *fp_size = sizeof(struct _fpstate);
+ } else {
+ char *magic2_addr;
+
+ magic2_addr = (void *)res;
+ magic2_addr += res->sw_reserved.extended_size;
+ magic2_addr -= FP_XSTATE_MAGIC2_SIZE;
+
+ /* We still need to be within our stack */
+ if ((void *)magic2_addr >
+ (void *)data->sigstack + sizeof(data->sigstack))
+ return NULL;
+
+ /* If we do not read MAGIC2, then we did something wrong */
+ if (*(__u32 *)magic2_addr != FP_XSTATE_MAGIC2)
+ return NULL;
+
+ /* Remove MAGIC2 from the size, we do not save/restore it */
+ *fp_size = res->sw_reserved.extended_size -
+ FP_XSTATE_MAGIC2_SIZE;
+ }
+
+ return res;
+}
+
+int get_stub_state(struct uml_pt_regs *regs, struct stub_data *data,
+ unsigned long *fp_size_out)
+{
+ mcontext_t *mcontext;
+ struct _fpstate *fpstate_stub;
+ struct _xstate_64 *xstate_stub;
+ int fp_size, xstate_size;
+
+ /* mctx_offset is verified by wait_stub_done_seccomp */
+ mcontext = (void *)&data->sigstack[data->mctx_offset];
+
+ get_regs_from_mc(regs, mcontext);
+
+ fpstate_stub = get_fpstate(data, mcontext, &fp_size);
+ if (!fpstate_stub)
+ return -EINVAL;
+
+#ifdef CONFIG_X86_32
+ xstate_stub = (void *)&fpstate_stub->_fxsr_env;
+ xstate_size = fp_size - offsetof(struct _fpstate_32, _fxsr_env);
+#else
+ xstate_stub = (void *)fpstate_stub;
+ xstate_size = fp_size;
+#endif
+
+ if (fp_size_out)
+ *fp_size_out = xstate_size;
+
+ if (xstate_size > host_fp_size)
+ return -ENOSPC;
+
+ memcpy(&regs->fp, xstate_stub, xstate_size);
+
+ /* We do not need to read the x86_64 FS_BASE/GS_BASE registers as
+ * we do not permit userspace to set them directly.
+ */
+
+#ifdef CONFIG_X86_32
+ /* Read the i387 legacy FP registers */
+ if (um_fxsr_from_i387((void *)&regs->fp, fpstate_stub))
+ return -EINVAL;
+#endif
+
+ return 0;
+}
+
+/* Copied because we cannot include regset.h here. */
+struct task_struct;
+struct user_regset;
+struct membuf {
+ void *p;
+ size_t left;
+};
+
+int fpregs_legacy_get(struct task_struct *target,
+ const struct user_regset *regset,
+ struct membuf to);
+
+int set_stub_state(struct uml_pt_regs *regs, struct stub_data *data,
+ int single_stepping)
+{
+ mcontext_t *mcontext;
+ struct _fpstate *fpstate_stub;
+ struct _xstate_64 *xstate_stub;
+ int fp_size, xstate_size;
+
+ /* mctx_offset is verified by wait_stub_done_seccomp */
+ mcontext = (void *)&data->sigstack[data->mctx_offset];
+
+ if ((unsigned long)mcontext < (unsigned long)data->sigstack ||
+ (unsigned long)mcontext >
+ (unsigned long) data->sigstack +
+ sizeof(data->sigstack) - sizeof(*mcontext))
+ return -EINVAL;
+
+ get_mc_from_regs(regs, mcontext, single_stepping);
+
+ fpstate_stub = get_fpstate(data, mcontext, &fp_size);
+ if (!fpstate_stub)
+ return -EINVAL;
+
+#ifdef CONFIG_X86_32
+ xstate_stub = (void *)&fpstate_stub->_fxsr_env;
+ xstate_size = fp_size - offsetof(struct _fpstate_32, _fxsr_env);
+#else
+ xstate_stub = (void *)fpstate_stub;
+ xstate_size = fp_size;
+#endif
+
+ memcpy(xstate_stub, &regs->fp, xstate_size);
+
+#ifdef __i386__
+ /*
+ * On x86, the GDT entries are updated by arch_set_tls.
+ */
+
+ /* Store the i387 legacy FP registers which the host will use */
+ if (um_i387_from_fxsr(fpstate_stub, (void *)&regs->fp))
+ return -EINVAL;
+#else
+ /*
+ * On x86_64, we need to sync the FS_BASE/GS_BASE registers using the
+ * arch specific data.
+ */
+ if (data->arch_data.fs_base != regs->gp[FS_BASE / sizeof(unsigned long)]) {
+ data->arch_data.fs_base = regs->gp[FS_BASE / sizeof(unsigned long)];
+ data->arch_data.sync |= STUB_SYNC_FS_BASE;
+ }
+ if (data->arch_data.gs_base != regs->gp[GS_BASE / sizeof(unsigned long)]) {
+ data->arch_data.gs_base = regs->gp[GS_BASE / sizeof(unsigned long)];
+ data->arch_data.sync |= STUB_SYNC_GS_BASE;
+ }
+#endif
+
+ return 0;
+}
diff --git a/arch/x86/um/ptrace.c b/arch/x86/um/ptrace.c
index 57c504fd5626..2635ca2595a3 100644
--- a/arch/x86/um/ptrace.c
+++ b/arch/x86/um/ptrace.c
@@ -25,7 +25,8 @@ static inline unsigned short twd_i387_to_fxsr(unsigned short twd)
return tmp;
}
-static inline unsigned long twd_fxsr_to_i387(struct user_fxsr_struct *fxsave)
+static inline unsigned long
+twd_fxsr_to_i387(const struct user_fxsr_struct *fxsave)
{
struct _fpxreg *st = NULL;
unsigned long twd = (unsigned long) fxsave->twd;
@@ -69,12 +70,16 @@ static inline unsigned long twd_fxsr_to_i387(struct user_fxsr_struct *fxsave)
return ret;
}
-/* Get/set the old 32bit i387 registers (pre-FPX) */
-static int fpregs_legacy_get(struct task_struct *target,
- const struct user_regset *regset,
- struct membuf to)
+/*
+ * Get/set the old 32bit i387 registers (pre-FPX)
+ *
+ * We provide simple wrappers for mcontext.c, they are only defined locally
+ * because mcontext.c is userspace facing and needs to a different definition
+ * of the structures.
+ */
+static int _um_i387_from_fxsr(struct membuf to,
+ const struct user_fxsr_struct *fxsave)
{
- struct user_fxsr_struct *fxsave = (void *)target->thread.regs.regs.fp;
int i;
membuf_store(&to, (unsigned long)fxsave->cwd | 0xffff0000ul);
@@ -91,23 +96,36 @@ static int fpregs_legacy_get(struct task_struct *target,
return 0;
}
-static int fpregs_legacy_set(struct task_struct *target,
+int um_i387_from_fxsr(struct user_i387_struct *i387,
+ const struct user_fxsr_struct *fxsave);
+
+int um_i387_from_fxsr(struct user_i387_struct *i387,
+ const struct user_fxsr_struct *fxsave)
+{
+ struct membuf to = {
+ .p = i387,
+ .left = sizeof(*i387),
+ };
+
+ return _um_i387_from_fxsr(to, fxsave);
+}
+
+static int fpregs_legacy_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- const void *kbuf, const void __user *ubuf)
+ struct membuf to)
{
struct user_fxsr_struct *fxsave = (void *)target->thread.regs.regs.fp;
- const struct user_i387_struct *from;
- struct user_i387_struct buf;
- int i;
- if (ubuf) {
- if (copy_from_user(&buf, ubuf, sizeof(buf)))
- return -EFAULT;
- from = &buf;
- } else {
- from = kbuf;
- }
+ return _um_i387_from_fxsr(to, fxsave);
+}
+
+int um_fxsr_from_i387(struct user_fxsr_struct *fxsave,
+ const struct user_i387_struct *from);
+
+int um_fxsr_from_i387(struct user_fxsr_struct *fxsave,
+ const struct user_i387_struct *from)
+{
+ int i;
fxsave->cwd = (unsigned short)(from->cwd & 0xffff);
fxsave->swd = (unsigned short)(from->swd & 0xffff);
@@ -125,6 +143,26 @@ static int fpregs_legacy_set(struct task_struct *target,
return 0;
}
+
+static int fpregs_legacy_set(struct task_struct *target,
+ const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ const void *kbuf, const void __user *ubuf)
+{
+ struct user_fxsr_struct *fxsave = (void *)target->thread.regs.regs.fp;
+ const struct user_i387_struct *from;
+ struct user_i387_struct buf;
+
+ if (ubuf) {
+ if (copy_from_user(&buf, ubuf, sizeof(buf)))
+ return -EFAULT;
+ from = &buf;
+ } else {
+ from = kbuf;
+ }
+
+ return um_fxsr_from_i387(fxsave, from);
+}
#endif
static int genregs_get(struct task_struct *target,
@@ -198,7 +236,7 @@ static int generic_fpregs_set(struct task_struct *target,
static struct user_regset uml_regsets[] __ro_after_init = {
[REGSET_GENERAL] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = sizeof(struct user_regs_struct) / sizeof(long),
.size = sizeof(long),
.align = sizeof(long),
@@ -208,7 +246,7 @@ static struct user_regset uml_regsets[] __ro_after_init = {
#ifdef CONFIG_X86_32
/* Old FP registers, they are needed in signal frames */
[REGSET_FP_LEGACY] = {
- .core_note_type = NT_PRFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG),
.n = sizeof(struct user_i387_ia32_struct) / sizeof(long),
.size = sizeof(long),
.align = sizeof(long),
@@ -219,10 +257,10 @@ static struct user_regset uml_regsets[] __ro_after_init = {
#endif
[REGSET_FP] = {
#ifdef CONFIG_X86_32
- .core_note_type = NT_PRXFPREG,
+ USER_REGSET_NOTE_TYPE(PRXFPREG),
.n = sizeof(struct user32_fxsr_struct) / sizeof(long),
#else
- .core_note_type = NT_PRFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG),
.n = sizeof(struct user_i387_struct) / sizeof(long),
#endif
.size = sizeof(long),
@@ -232,7 +270,7 @@ static struct user_regset uml_regsets[] __ro_after_init = {
.set = generic_fpregs_set,
},
[REGSET_XSTATE] = {
- .core_note_type = NT_X86_XSTATE,
+ USER_REGSET_NOTE_TYPE(X86_XSTATE),
.size = sizeof(long),
.align = sizeof(long),
.active = generic_fpregs_active,
diff --git a/arch/x86/um/shared/sysdep/kernel-offsets.h b/arch/x86/um/shared/sysdep/kernel-offsets.h
index 48de3a71f845..6fd1ed400399 100644
--- a/arch/x86/um/shared/sysdep/kernel-offsets.h
+++ b/arch/x86/um/shared/sysdep/kernel-offsets.h
@@ -4,7 +4,9 @@
#include <linux/elf.h>
#include <linux/crypto.h>
#include <linux/kbuild.h>
+#include <linux/audit.h>
#include <asm/mman.h>
+#include <asm/seccomp.h>
/* workaround for a warning with -Wmissing-prototypes */
void foo(void);
diff --git a/arch/x86/um/shared/sysdep/mcontext.h b/arch/x86/um/shared/sysdep/mcontext.h
index b724c54da316..6fe490cc5b98 100644
--- a/arch/x86/um/shared/sysdep/mcontext.h
+++ b/arch/x86/um/shared/sysdep/mcontext.h
@@ -6,7 +6,16 @@
#ifndef __SYS_SIGCONTEXT_X86_H
#define __SYS_SIGCONTEXT_X86_H
+#include <stub-data.h>
+
extern void get_regs_from_mc(struct uml_pt_regs *, mcontext_t *);
+extern void get_mc_from_regs(struct uml_pt_regs *regs, mcontext_t *mc,
+ int single_stepping);
+
+extern int get_stub_state(struct uml_pt_regs *regs, struct stub_data *data,
+ unsigned long *fp_size_out);
+extern int set_stub_state(struct uml_pt_regs *regs, struct stub_data *data,
+ int single_stepping);
#ifdef __i386__
diff --git a/arch/x86/um/shared/sysdep/ptrace.h b/arch/x86/um/shared/sysdep/ptrace.h
index 8f7476ff6e95..572ea2d79131 100644
--- a/arch/x86/um/shared/sysdep/ptrace.h
+++ b/arch/x86/um/shared/sysdep/ptrace.h
@@ -44,18 +44,6 @@
#include "ptrace_64.h"
#endif
-struct syscall_args {
- unsigned long args[6];
-};
-
-#define SYSCALL_ARGS(r) ((struct syscall_args) \
- { .args = { UPT_SYSCALL_ARG1(r), \
- UPT_SYSCALL_ARG2(r), \
- UPT_SYSCALL_ARG3(r), \
- UPT_SYSCALL_ARG4(r), \
- UPT_SYSCALL_ARG5(r), \
- UPT_SYSCALL_ARG6(r) } } )
-
extern unsigned long host_fp_size;
struct uml_pt_regs {
diff --git a/arch/x86/um/shared/sysdep/stub-data.h b/arch/x86/um/shared/sysdep/stub-data.h
new file mode 100644
index 000000000000..82b1b7f8ac3d
--- /dev/null
+++ b/arch/x86/um/shared/sysdep/stub-data.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ARCH_STUB_DATA_H
+#define __ARCH_STUB_DATA_H
+
+#ifdef __i386__
+#include <generated/asm-offsets.h>
+#include <asm/ldt.h>
+
+struct stub_data_arch {
+ int sync;
+ struct user_desc tls[UM_KERN_GDT_ENTRY_TLS_ENTRIES];
+};
+#else
+#define STUB_SYNC_FS_BASE (1 << 0)
+#define STUB_SYNC_GS_BASE (1 << 1)
+struct stub_data_arch {
+ int sync;
+ unsigned long fs_base;
+ unsigned long gs_base;
+};
+#endif
+
+#endif /* __ARCH_STUB_DATA_H */
diff --git a/arch/x86/um/shared/sysdep/stub.h b/arch/x86/um/shared/sysdep/stub.h
index dc89f4423454..4fa58f5b4fca 100644
--- a/arch/x86/um/shared/sysdep/stub.h
+++ b/arch/x86/um/shared/sysdep/stub.h
@@ -13,3 +13,5 @@
extern void stub_segv_handler(int, siginfo_t *, void *);
extern void stub_syscall_handler(void);
+extern void stub_signal_interrupt(int, siginfo_t *, void *);
+extern void stub_signal_restorer(void);
diff --git a/arch/x86/um/shared/sysdep/stub_32.h b/arch/x86/um/shared/sysdep/stub_32.h
index 390988132c0a..df568fc3ceb4 100644
--- a/arch/x86/um/shared/sysdep/stub_32.h
+++ b/arch/x86/um/shared/sysdep/stub_32.h
@@ -131,4 +131,17 @@ static __always_inline void *get_stub_data(void)
"call *%%eax ;" \
:: "i" ((1 + STUB_DATA_PAGES) * UM_KERN_PAGE_SIZE), \
"i" (&fn))
+
+static __always_inline void
+stub_seccomp_restore_state(struct stub_data_arch *arch)
+{
+ for (int i = 0; i < sizeof(arch->tls) / sizeof(arch->tls[0]); i++) {
+ if (arch->sync & (1 << i))
+ stub_syscall1(__NR_set_thread_area,
+ (unsigned long) &arch->tls[i]);
+ }
+
+ arch->sync = 0;
+}
+
#endif
diff --git a/arch/x86/um/shared/sysdep/stub_64.h b/arch/x86/um/shared/sysdep/stub_64.h
index 294affbec742..9cfd31afa769 100644
--- a/arch/x86/um/shared/sysdep/stub_64.h
+++ b/arch/x86/um/shared/sysdep/stub_64.h
@@ -10,6 +10,7 @@
#include <sysdep/ptrace_user.h>
#include <generated/asm-offsets.h>
#include <linux/stddef.h>
+#include <asm/prctl.h>
#define STUB_MMAP_NR __NR_mmap
#define MMAP_OFFSET(o) (o)
@@ -134,4 +135,20 @@ static __always_inline void *get_stub_data(void)
"call *%%rax ;" \
:: "i" ((1 + STUB_DATA_PAGES) * UM_KERN_PAGE_SIZE), \
"i" (&fn))
+
+static __always_inline void
+stub_seccomp_restore_state(struct stub_data_arch *arch)
+{
+ /*
+ * We could use _writefsbase_u64/_writegsbase_u64 if the host reports
+ * support in the hwcaps (HWCAP2_FSGSBASE).
+ */
+ if (arch->sync & STUB_SYNC_FS_BASE)
+ stub_syscall2(__NR_arch_prctl, ARCH_SET_FS, arch->fs_base);
+ if (arch->sync & STUB_SYNC_GS_BASE)
+ stub_syscall2(__NR_arch_prctl, ARCH_SET_GS, arch->gs_base);
+
+ arch->sync = 0;
+}
+
#endif
diff --git a/arch/x86/um/shared/sysdep/syscalls.h b/arch/x86/um/shared/sysdep/syscalls.h
deleted file mode 100644
index b2060ac707f0..000000000000
--- a/arch/x86/um/shared/sysdep/syscalls.h
+++ /dev/null
@@ -1,6 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifdef __i386__
-#include "syscalls_32.h"
-#else
-#include "syscalls_64.h"
-#endif
diff --git a/arch/x86/um/shared/sysdep/syscalls_32.h b/arch/x86/um/shared/sysdep/syscalls_32.h
deleted file mode 100644
index f6e9f84397e7..000000000000
--- a/arch/x86/um/shared/sysdep/syscalls_32.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (C) 2000 - 2008 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
- */
-
-#include <asm/unistd.h>
-#include <sysdep/ptrace.h>
-
-typedef long syscall_handler_t(struct syscall_args);
-
-extern syscall_handler_t *sys_call_table[];
-
-#define EXECUTE_SYSCALL(syscall, regs) \
- ((*sys_call_table[syscall]))(SYSCALL_ARGS(&regs->regs))
diff --git a/arch/x86/um/shared/sysdep/syscalls_64.h b/arch/x86/um/shared/sysdep/syscalls_64.h
deleted file mode 100644
index b6b997225841..000000000000
--- a/arch/x86/um/shared/sysdep/syscalls_64.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright 2003 PathScale, Inc.
- *
- * Licensed under the GPL
- */
-
-#ifndef __SYSDEP_X86_64_SYSCALLS_H__
-#define __SYSDEP_X86_64_SYSCALLS_H__
-
-#include <linux/msg.h>
-#include <linux/shm.h>
-
-typedef long syscall_handler_t(long, long, long, long, long, long);
-
-extern syscall_handler_t *sys_call_table[];
-
-#define EXECUTE_SYSCALL(syscall, regs) \
- (((*sys_call_table[syscall]))(UPT_SYSCALL_ARG1(&regs->regs), \
- UPT_SYSCALL_ARG2(&regs->regs), \
- UPT_SYSCALL_ARG3(&regs->regs), \
- UPT_SYSCALL_ARG4(&regs->regs), \
- UPT_SYSCALL_ARG5(&regs->regs), \
- UPT_SYSCALL_ARG6(&regs->regs)))
-
-extern syscall_handler_t sys_modify_ldt;
-extern syscall_handler_t sys_arch_prctl;
-
-#endif
diff --git a/arch/x86/um/tls_32.c b/arch/x86/um/tls_32.c
index fbb129023080..1909c2e640b2 100644
--- a/arch/x86/um/tls_32.c
+++ b/arch/x86/um/tls_32.c
@@ -12,6 +12,7 @@
#include <skas.h>
#include <sysdep/tls.h>
#include <asm/desc.h>
+#include <stub-data.h>
/*
* If needed we can detect when it's uninitialized.
@@ -21,14 +22,25 @@
static int host_supports_tls = -1;
int host_gdt_entry_tls_min;
-static int do_set_thread_area(struct user_desc *info)
+static int do_set_thread_area(struct task_struct* task, struct user_desc *info)
{
int ret;
- u32 cpu;
- cpu = get_cpu();
- ret = os_set_thread_area(info, userspace_pid[cpu]);
- put_cpu();
+ if (info->entry_number < host_gdt_entry_tls_min ||
+ info->entry_number >= host_gdt_entry_tls_min + GDT_ENTRY_TLS_ENTRIES)
+ return -EINVAL;
+
+ if (using_seccomp) {
+ int idx = info->entry_number - host_gdt_entry_tls_min;
+ struct stub_data *data = (void *)task->mm->context.id.stack;
+
+ data->arch_data.tls[idx] = *info;
+ data->arch_data.sync |= BIT(idx);
+
+ return 0;
+ }
+
+ ret = os_set_thread_area(info, task->mm->context.id.pid);
if (ret)
printk(KERN_ERR "PTRACE_SET_THREAD_AREA failed, err = %d, "
@@ -97,7 +109,7 @@ static int load_TLS(int flags, struct task_struct *to)
if (!(flags & O_FORCE) && curr->flushed)
continue;
- ret = do_set_thread_area(&curr->tls);
+ ret = do_set_thread_area(current, &curr->tls);
if (ret)
goto out;
@@ -174,7 +186,7 @@ int arch_switch_tls(struct task_struct *to)
/*
* We have no need whatsoever to switch TLS for kernel threads; beyond
* that, that would also result in us calling os_set_thread_area with
- * userspace_pid[cpu] == 0, which gives an error.
+ * task->mm == NULL, which would cause a crash.
*/
if (likely(to->mm))
return load_TLS(O_FORCE, to);
@@ -275,7 +287,7 @@ SYSCALL_DEFINE1(set_thread_area, struct user_desc __user *, user_desc)
return -EFAULT;
}
- ret = do_set_thread_area(&info);
+ ret = do_set_thread_area(current, &info);
if (ret)
return ret;
return set_tls_entry(current, &info, idx, 1);
diff --git a/arch/x86/virt/vmx/tdx/seamcall.S b/arch/x86/virt/vmx/tdx/seamcall.S
index 5b1f2286aea9..6854c52c374b 100644
--- a/arch/x86/virt/vmx/tdx/seamcall.S
+++ b/arch/x86/virt/vmx/tdx/seamcall.S
@@ -41,6 +41,9 @@ SYM_FUNC_START(__seamcall_ret)
TDX_MODULE_CALL host=1 ret=1
SYM_FUNC_END(__seamcall_ret)
+/* KVM requires non-instrumentable __seamcall_saved_ret() for TDH.VP.ENTER */
+.section .noinstr.text, "ax"
+
/*
* __seamcall_saved_ret() - Host-side interface functions to SEAM software
* (the P-SEAMLDR or the TDX module), with saving output registers to the
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index 7fdb37387886..c7a9a087ccaf 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -5,6 +5,7 @@
* Intel Trusted Domain Extensions (TDX) support
*/
+#include "asm/page_types.h"
#define pr_fmt(fmt) "virt/tdx: " fmt
#include <linux/types.h>
@@ -27,6 +28,7 @@
#include <linux/log2.h>
#include <linux/acpi.h>
#include <linux/suspend.h>
+#include <linux/idr.h>
#include <asm/page.h>
#include <asm/special_insns.h>
#include <asm/msr-index.h>
@@ -42,6 +44,8 @@ static u32 tdx_global_keyid __ro_after_init;
static u32 tdx_guest_keyid_start __ro_after_init;
static u32 tdx_nr_guest_keyids __ro_after_init;
+static DEFINE_IDA(tdx_guest_keyid_pool);
+
static DEFINE_PER_CPU(bool, tdx_lp_initialized);
static struct tdmr_info_list tdx_tdmr_list;
@@ -52,6 +56,8 @@ static DEFINE_MUTEX(tdx_module_lock);
/* All TDX-usable memory regions. Protected by mem_hotplug_lock. */
static LIST_HEAD(tdx_memlist);
+static struct tdx_sys_info tdx_sysinfo;
+
typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args);
static inline void seamcall_err(u64 fn, u64 err, struct tdx_module_args *args)
@@ -69,8 +75,9 @@ static inline void seamcall_err_ret(u64 fn, u64 err,
args->r9, args->r10, args->r11);
}
-static inline int sc_retry_prerr(sc_func_t func, sc_err_func_t err_func,
- u64 fn, struct tdx_module_args *args)
+static __always_inline int sc_retry_prerr(sc_func_t func,
+ sc_err_func_t err_func,
+ u64 fn, struct tdx_module_args *args)
{
u64 sret = sc_retry(func, fn, args);
@@ -1060,15 +1067,14 @@ static int init_tdmrs(struct tdmr_info_list *tdmr_list)
static int init_tdx_module(void)
{
- struct tdx_sys_info sysinfo;
int ret;
- ret = get_tdx_sys_info(&sysinfo);
+ ret = get_tdx_sys_info(&tdx_sysinfo);
if (ret)
return ret;
/* Check whether the kernel can support this module */
- ret = check_features(&sysinfo);
+ ret = check_features(&tdx_sysinfo);
if (ret)
return ret;
@@ -1089,12 +1095,12 @@ static int init_tdx_module(void)
goto out_put_tdxmem;
/* Allocate enough space for constructing TDMRs */
- ret = alloc_tdmr_list(&tdx_tdmr_list, &sysinfo.tdmr);
+ ret = alloc_tdmr_list(&tdx_tdmr_list, &tdx_sysinfo.tdmr);
if (ret)
goto err_free_tdxmem;
/* Cover all TDX-usable memory regions in TDMRs */
- ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &sysinfo.tdmr);
+ ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &tdx_sysinfo.tdmr);
if (ret)
goto err_free_tdmrs;
@@ -1456,3 +1462,411 @@ void __init tdx_init(void)
check_tdx_erratum();
}
+
+const struct tdx_sys_info *tdx_get_sysinfo(void)
+{
+ const struct tdx_sys_info *p = NULL;
+
+ /* Make sure all fields in @tdx_sysinfo have been populated */
+ mutex_lock(&tdx_module_lock);
+ if (tdx_module_status == TDX_MODULE_INITIALIZED)
+ p = (const struct tdx_sys_info *)&tdx_sysinfo;
+ mutex_unlock(&tdx_module_lock);
+
+ return p;
+}
+EXPORT_SYMBOL_GPL(tdx_get_sysinfo);
+
+u32 tdx_get_nr_guest_keyids(void)
+{
+ return tdx_nr_guest_keyids;
+}
+EXPORT_SYMBOL_GPL(tdx_get_nr_guest_keyids);
+
+int tdx_guest_keyid_alloc(void)
+{
+ return ida_alloc_range(&tdx_guest_keyid_pool, tdx_guest_keyid_start,
+ tdx_guest_keyid_start + tdx_nr_guest_keyids - 1,
+ GFP_KERNEL);
+}
+EXPORT_SYMBOL_GPL(tdx_guest_keyid_alloc);
+
+void tdx_guest_keyid_free(unsigned int keyid)
+{
+ ida_free(&tdx_guest_keyid_pool, keyid);
+}
+EXPORT_SYMBOL_GPL(tdx_guest_keyid_free);
+
+static inline u64 tdx_tdr_pa(struct tdx_td *td)
+{
+ return page_to_phys(td->tdr_page);
+}
+
+static inline u64 tdx_tdvpr_pa(struct tdx_vp *td)
+{
+ return page_to_phys(td->tdvpr_page);
+}
+
+/*
+ * The TDX module exposes a CLFLUSH_BEFORE_ALLOC bit to specify whether
+ * a CLFLUSH of pages is required before handing them to the TDX module.
+ * Be conservative and make the code simpler by doing the CLFLUSH
+ * unconditionally.
+ */
+static void tdx_clflush_page(struct page *page)
+{
+ clflush_cache_range(page_to_virt(page), PAGE_SIZE);
+}
+
+noinstr __flatten u64 tdh_vp_enter(struct tdx_vp *td, struct tdx_module_args *args)
+{
+ args->rcx = tdx_tdvpr_pa(td);
+
+ return __seamcall_saved_ret(TDH_VP_ENTER, args);
+}
+EXPORT_SYMBOL_GPL(tdh_vp_enter);
+
+u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page)
+{
+ struct tdx_module_args args = {
+ .rcx = page_to_phys(tdcs_page),
+ .rdx = tdx_tdr_pa(td),
+ };
+
+ tdx_clflush_page(tdcs_page);
+ return seamcall(TDH_MNG_ADDCX, &args);
+}
+EXPORT_SYMBOL_GPL(tdh_mng_addcx);
+
+u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2)
+{
+ struct tdx_module_args args = {
+ .rcx = gpa,
+ .rdx = tdx_tdr_pa(td),
+ .r8 = page_to_phys(page),
+ .r9 = page_to_phys(source),
+ };
+ u64 ret;
+
+ tdx_clflush_page(page);
+ ret = seamcall_ret(TDH_MEM_PAGE_ADD, &args);
+
+ *ext_err1 = args.rcx;
+ *ext_err2 = args.rdx;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tdh_mem_page_add);
+
+u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2)
+{
+ struct tdx_module_args args = {
+ .rcx = gpa | level,
+ .rdx = tdx_tdr_pa(td),
+ .r8 = page_to_phys(page),
+ };
+ u64 ret;
+
+ tdx_clflush_page(page);
+ ret = seamcall_ret(TDH_MEM_SEPT_ADD, &args);
+
+ *ext_err1 = args.rcx;
+ *ext_err2 = args.rdx;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tdh_mem_sept_add);
+
+u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page)
+{
+ struct tdx_module_args args = {
+ .rcx = page_to_phys(tdcx_page),
+ .rdx = tdx_tdvpr_pa(vp),
+ };
+
+ tdx_clflush_page(tdcx_page);
+ return seamcall(TDH_VP_ADDCX, &args);
+}
+EXPORT_SYMBOL_GPL(tdh_vp_addcx);
+
+u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2)
+{
+ struct tdx_module_args args = {
+ .rcx = gpa | level,
+ .rdx = tdx_tdr_pa(td),
+ .r8 = page_to_phys(page),
+ };
+ u64 ret;
+
+ tdx_clflush_page(page);
+ ret = seamcall_ret(TDH_MEM_PAGE_AUG, &args);
+
+ *ext_err1 = args.rcx;
+ *ext_err2 = args.rdx;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tdh_mem_page_aug);
+
+u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u64 *ext_err2)
+{
+ struct tdx_module_args args = {
+ .rcx = gpa | level,
+ .rdx = tdx_tdr_pa(td),
+ };
+ u64 ret;
+
+ ret = seamcall_ret(TDH_MEM_RANGE_BLOCK, &args);
+
+ *ext_err1 = args.rcx;
+ *ext_err2 = args.rdx;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tdh_mem_range_block);
+
+u64 tdh_mng_key_config(struct tdx_td *td)
+{
+ struct tdx_module_args args = {
+ .rcx = tdx_tdr_pa(td),
+ };
+
+ return seamcall(TDH_MNG_KEY_CONFIG, &args);
+}
+EXPORT_SYMBOL_GPL(tdh_mng_key_config);
+
+u64 tdh_mng_create(struct tdx_td *td, u16 hkid)
+{
+ struct tdx_module_args args = {
+ .rcx = tdx_tdr_pa(td),
+ .rdx = hkid,
+ };
+
+ tdx_clflush_page(td->tdr_page);
+ return seamcall(TDH_MNG_CREATE, &args);
+}
+EXPORT_SYMBOL_GPL(tdh_mng_create);
+
+u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp)
+{
+ struct tdx_module_args args = {
+ .rcx = tdx_tdvpr_pa(vp),
+ .rdx = tdx_tdr_pa(td),
+ };
+
+ tdx_clflush_page(vp->tdvpr_page);
+ return seamcall(TDH_VP_CREATE, &args);
+}
+EXPORT_SYMBOL_GPL(tdh_vp_create);
+
+u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data)
+{
+ struct tdx_module_args args = {
+ .rcx = tdx_tdr_pa(td),
+ .rdx = field,
+ };
+ u64 ret;
+
+ ret = seamcall_ret(TDH_MNG_RD, &args);
+
+ /* R8: Content of the field, or 0 in case of error. */
+ *data = args.r8;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tdh_mng_rd);
+
+u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2)
+{
+ struct tdx_module_args args = {
+ .rcx = gpa,
+ .rdx = tdx_tdr_pa(td),
+ };
+ u64 ret;
+
+ ret = seamcall_ret(TDH_MR_EXTEND, &args);
+
+ *ext_err1 = args.rcx;
+ *ext_err2 = args.rdx;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tdh_mr_extend);
+
+u64 tdh_mr_finalize(struct tdx_td *td)
+{
+ struct tdx_module_args args = {
+ .rcx = tdx_tdr_pa(td),
+ };
+
+ return seamcall(TDH_MR_FINALIZE, &args);
+}
+EXPORT_SYMBOL_GPL(tdh_mr_finalize);
+
+u64 tdh_vp_flush(struct tdx_vp *vp)
+{
+ struct tdx_module_args args = {
+ .rcx = tdx_tdvpr_pa(vp),
+ };
+
+ return seamcall(TDH_VP_FLUSH, &args);
+}
+EXPORT_SYMBOL_GPL(tdh_vp_flush);
+
+u64 tdh_mng_vpflushdone(struct tdx_td *td)
+{
+ struct tdx_module_args args = {
+ .rcx = tdx_tdr_pa(td),
+ };
+
+ return seamcall(TDH_MNG_VPFLUSHDONE, &args);
+}
+EXPORT_SYMBOL_GPL(tdh_mng_vpflushdone);
+
+u64 tdh_mng_key_freeid(struct tdx_td *td)
+{
+ struct tdx_module_args args = {
+ .rcx = tdx_tdr_pa(td),
+ };
+
+ return seamcall(TDH_MNG_KEY_FREEID, &args);
+}
+EXPORT_SYMBOL_GPL(tdh_mng_key_freeid);
+
+u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err)
+{
+ struct tdx_module_args args = {
+ .rcx = tdx_tdr_pa(td),
+ .rdx = td_params,
+ };
+ u64 ret;
+
+ ret = seamcall_ret(TDH_MNG_INIT, &args);
+
+ *extended_err = args.rcx;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tdh_mng_init);
+
+u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data)
+{
+ struct tdx_module_args args = {
+ .rcx = tdx_tdvpr_pa(vp),
+ .rdx = field,
+ };
+ u64 ret;
+
+ ret = seamcall_ret(TDH_VP_RD, &args);
+
+ /* R8: Content of the field, or 0 in case of error. */
+ *data = args.r8;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tdh_vp_rd);
+
+u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask)
+{
+ struct tdx_module_args args = {
+ .rcx = tdx_tdvpr_pa(vp),
+ .rdx = field,
+ .r8 = data,
+ .r9 = mask,
+ };
+
+ return seamcall(TDH_VP_WR, &args);
+}
+EXPORT_SYMBOL_GPL(tdh_vp_wr);
+
+u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid)
+{
+ struct tdx_module_args args = {
+ .rcx = tdx_tdvpr_pa(vp),
+ .rdx = initial_rcx,
+ .r8 = x2apicid,
+ };
+
+ /* apicid requires version == 1. */
+ return seamcall(TDH_VP_INIT | (1ULL << TDX_VERSION_SHIFT), &args);
+}
+EXPORT_SYMBOL_GPL(tdh_vp_init);
+
+/*
+ * TDX ABI defines output operands as PT, OWNER and SIZE. These are TDX defined fomats.
+ * So despite the names, they must be interpted specially as described by the spec. Return
+ * them only for error reporting purposes.
+ */
+u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner, u64 *tdx_size)
+{
+ struct tdx_module_args args = {
+ .rcx = page_to_phys(page),
+ };
+ u64 ret;
+
+ ret = seamcall_ret(TDH_PHYMEM_PAGE_RECLAIM, &args);
+
+ *tdx_pt = args.rcx;
+ *tdx_owner = args.rdx;
+ *tdx_size = args.r8;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tdh_phymem_page_reclaim);
+
+u64 tdh_mem_track(struct tdx_td *td)
+{
+ struct tdx_module_args args = {
+ .rcx = tdx_tdr_pa(td),
+ };
+
+ return seamcall(TDH_MEM_TRACK, &args);
+}
+EXPORT_SYMBOL_GPL(tdh_mem_track);
+
+u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u64 *ext_err2)
+{
+ struct tdx_module_args args = {
+ .rcx = gpa | level,
+ .rdx = tdx_tdr_pa(td),
+ };
+ u64 ret;
+
+ ret = seamcall_ret(TDH_MEM_PAGE_REMOVE, &args);
+
+ *ext_err1 = args.rcx;
+ *ext_err2 = args.rdx;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tdh_mem_page_remove);
+
+u64 tdh_phymem_cache_wb(bool resume)
+{
+ struct tdx_module_args args = {
+ .rcx = resume ? 1 : 0,
+ };
+
+ return seamcall(TDH_PHYMEM_CACHE_WB, &args);
+}
+EXPORT_SYMBOL_GPL(tdh_phymem_cache_wb);
+
+u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td)
+{
+ struct tdx_module_args args = {};
+
+ args.rcx = mk_keyed_paddr(tdx_global_keyid, td->tdr_page);
+
+ return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
+}
+EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_tdr);
+
+u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page)
+{
+ struct tdx_module_args args = {};
+
+ args.rcx = mk_keyed_paddr(hkid, page);
+
+ return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
+}
+EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_hkid);
diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h
index 4e3d533cdd61..82bb82be8567 100644
--- a/arch/x86/virt/vmx/tdx/tdx.h
+++ b/arch/x86/virt/vmx/tdx/tdx.h
@@ -3,7 +3,6 @@
#define _X86_VIRT_TDX_H
#include <linux/bits.h>
-#include "tdx_global_metadata.h"
/*
* This file contains both macros and data structures defined by the TDX
@@ -15,13 +14,46 @@
/*
* TDX module SEAMCALL leaf functions
*/
-#define TDH_PHYMEM_PAGE_RDMD 24
-#define TDH_SYS_KEY_CONFIG 31
-#define TDH_SYS_INIT 33
-#define TDH_SYS_RD 34
-#define TDH_SYS_LP_INIT 35
-#define TDH_SYS_TDMR_INIT 36
-#define TDH_SYS_CONFIG 45
+#define TDH_VP_ENTER 0
+#define TDH_MNG_ADDCX 1
+#define TDH_MEM_PAGE_ADD 2
+#define TDH_MEM_SEPT_ADD 3
+#define TDH_VP_ADDCX 4
+#define TDH_MEM_PAGE_AUG 6
+#define TDH_MEM_RANGE_BLOCK 7
+#define TDH_MNG_KEY_CONFIG 8
+#define TDH_MNG_CREATE 9
+#define TDH_MNG_RD 11
+#define TDH_MR_EXTEND 16
+#define TDH_MR_FINALIZE 17
+#define TDH_VP_FLUSH 18
+#define TDH_MNG_VPFLUSHDONE 19
+#define TDH_VP_CREATE 10
+#define TDH_MNG_KEY_FREEID 20
+#define TDH_MNG_INIT 21
+#define TDH_VP_INIT 22
+#define TDH_PHYMEM_PAGE_RDMD 24
+#define TDH_VP_RD 26
+#define TDH_PHYMEM_PAGE_RECLAIM 28
+#define TDH_MEM_PAGE_REMOVE 29
+#define TDH_SYS_KEY_CONFIG 31
+#define TDH_SYS_INIT 33
+#define TDH_SYS_RD 34
+#define TDH_SYS_LP_INIT 35
+#define TDH_SYS_TDMR_INIT 36
+#define TDH_MEM_TRACK 38
+#define TDH_PHYMEM_CACHE_WB 40
+#define TDH_PHYMEM_PAGE_WBINVD 41
+#define TDH_VP_WR 43
+#define TDH_SYS_CONFIG 45
+
+/*
+ * SEAMCALL leaf:
+ *
+ * Bit 15:0 Leaf number
+ * Bit 23:16 Version number
+ */
+#define TDX_VERSION_SHIFT 16
/* TDX page types */
#define PT_NDA 0x0
diff --git a/arch/x86/virt/vmx/tdx/tdx_global_metadata.c b/arch/x86/virt/vmx/tdx/tdx_global_metadata.c
index 8027a24d1c6e..13ad2663488b 100644
--- a/arch/x86/virt/vmx/tdx/tdx_global_metadata.c
+++ b/arch/x86/virt/vmx/tdx/tdx_global_metadata.c
@@ -37,12 +37,62 @@ static int get_tdx_sys_info_tdmr(struct tdx_sys_info_tdmr *sysinfo_tdmr)
return ret;
}
+static int get_tdx_sys_info_td_ctrl(struct tdx_sys_info_td_ctrl *sysinfo_td_ctrl)
+{
+ int ret = 0;
+ u64 val;
+
+ if (!ret && !(ret = read_sys_metadata_field(0x9800000100000000, &val)))
+ sysinfo_td_ctrl->tdr_base_size = val;
+ if (!ret && !(ret = read_sys_metadata_field(0x9800000100000100, &val)))
+ sysinfo_td_ctrl->tdcs_base_size = val;
+ if (!ret && !(ret = read_sys_metadata_field(0x9800000100000200, &val)))
+ sysinfo_td_ctrl->tdvps_base_size = val;
+
+ return ret;
+}
+
+static int get_tdx_sys_info_td_conf(struct tdx_sys_info_td_conf *sysinfo_td_conf)
+{
+ int ret = 0;
+ u64 val;
+ int i, j;
+
+ if (!ret && !(ret = read_sys_metadata_field(0x1900000300000000, &val)))
+ sysinfo_td_conf->attributes_fixed0 = val;
+ if (!ret && !(ret = read_sys_metadata_field(0x1900000300000001, &val)))
+ sysinfo_td_conf->attributes_fixed1 = val;
+ if (!ret && !(ret = read_sys_metadata_field(0x1900000300000002, &val)))
+ sysinfo_td_conf->xfam_fixed0 = val;
+ if (!ret && !(ret = read_sys_metadata_field(0x1900000300000003, &val)))
+ sysinfo_td_conf->xfam_fixed1 = val;
+ if (!ret && !(ret = read_sys_metadata_field(0x9900000100000004, &val)))
+ sysinfo_td_conf->num_cpuid_config = val;
+ if (!ret && !(ret = read_sys_metadata_field(0x9900000100000008, &val)))
+ sysinfo_td_conf->max_vcpus_per_td = val;
+ if (sysinfo_td_conf->num_cpuid_config > ARRAY_SIZE(sysinfo_td_conf->cpuid_config_leaves))
+ return -EINVAL;
+ for (i = 0; i < sysinfo_td_conf->num_cpuid_config; i++)
+ if (!ret && !(ret = read_sys_metadata_field(0x9900000300000400 + i, &val)))
+ sysinfo_td_conf->cpuid_config_leaves[i] = val;
+ if (sysinfo_td_conf->num_cpuid_config > ARRAY_SIZE(sysinfo_td_conf->cpuid_config_values))
+ return -EINVAL;
+ for (i = 0; i < sysinfo_td_conf->num_cpuid_config; i++)
+ for (j = 0; j < 2; j++)
+ if (!ret && !(ret = read_sys_metadata_field(0x9900000300000500 + i * 2 + j, &val)))
+ sysinfo_td_conf->cpuid_config_values[i][j] = val;
+
+ return ret;
+}
+
static int get_tdx_sys_info(struct tdx_sys_info *sysinfo)
{
int ret = 0;
ret = ret ?: get_tdx_sys_info_features(&sysinfo->features);
ret = ret ?: get_tdx_sys_info_tdmr(&sysinfo->tdmr);
+ ret = ret ?: get_tdx_sys_info_td_ctrl(&sysinfo->td_ctrl);
+ ret = ret ?: get_tdx_sys_info_td_conf(&sysinfo->td_conf);
return ret;
}